1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "classfile/symbolTable.hpp"
  32 #include "code/compiledIC.hpp"
  33 #include "code/debugInfoRec.hpp"
  34 #include "code/icBuffer.hpp"
  35 #include "code/nativeInst.hpp"
  36 #include "code/vtableStubs.hpp"
  37 #include "compiler/oopMap.hpp"
  38 #include "gc/shared/collectedHeap.hpp"
  39 #include "gc/shared/gcLocker.hpp"
  40 #include "gc/shared/barrierSet.hpp"
  41 #include "gc/shared/barrierSetAssembler.hpp"
  42 #include "interpreter/interpreter.hpp"
  43 #include "logging/log.hpp"
  44 #include "memory/resourceArea.hpp"
  45 #include "memory/universe.hpp"
  46 #include "oops/compiledICHolder.hpp"
  47 #include "oops/klass.inline.hpp"
  48 #include "oops/method.inline.hpp"
  49 #include "prims/methodHandles.hpp"
  50 #include "runtime/continuation.hpp"
  51 #include "runtime/continuationEntry.inline.hpp"
  52 #include "runtime/globals.hpp"
  53 #include "runtime/jniHandles.hpp"
  54 #include "runtime/safepointMechanism.hpp"
  55 #include "runtime/sharedRuntime.hpp"
  56 #include "runtime/signature.hpp"
  57 #include "runtime/stubRoutines.hpp"
  58 #include "runtime/vframeArray.hpp"
  59 #include "runtime/vm_version.hpp"
  60 #include "utilities/align.hpp"
  61 #include "utilities/formatBuffer.hpp"
  62 #include "vmreg_x86.inline.hpp"
  63 #ifdef COMPILER1
  64 #include "c1/c1_Runtime1.hpp"
  65 #endif
  66 #ifdef COMPILER2
  67 #include "opto/runtime.hpp"
  68 #endif
  69 #if INCLUDE_JVMCI
  70 #include "jvmci/jvmciJavaClasses.hpp"
  71 #endif
  72 
  73 #define __ masm->
  74 
  75 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  76 
  77 class SimpleRuntimeFrame {
  78 
  79   public:
  80 
  81   // Most of the runtime stubs have this simple frame layout.
  82   // This class exists to make the layout shared in one place.
  83   // Offsets are for compiler stack slots, which are jints.
  84   enum layout {
  85     // The frame sender code expects that rbp will be in the "natural" place and
  86     // will override any oopMap setting for it. We must therefore force the layout
  87     // so that it agrees with the frame sender code.
  88     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  89     rbp_off2,
  90     return_off, return_off2,
  91     framesize
  92   };
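       // For illustration (assuming frame::arg_reg_save_area_bytes == 0, as on
       // non-Windows targets), the enum above works out to rbp_off == 0,
       // return_off == 2 and framesize == 4 compiler slots, i.e. just the saved
       // rbp plus the return address.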
  93 };
  94 
  95 class RegisterSaver {
  96   // Capture info about frame layout.  Layout offsets are in jint
  97   // units because compiler frame slots are jints.
  98 #define XSAVE_AREA_BEGIN 160
  99 #define XSAVE_AREA_YMM_BEGIN 576
 100 #define XSAVE_AREA_OPMASK_BEGIN 1088
 101 #define XSAVE_AREA_ZMM_BEGIN 1152
 102 #define XSAVE_AREA_UPPERBANK 1664
 103 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 104 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 105 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 106 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 107 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
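     // For example, DEF_XMM_OFFS(0) above expands to roughly
     //   xmm0_off = xmm_off + (0)*16/BytesPerInt, xmm0H_off
     // so each register contributes a low-half *_off and a high-half *H_off
     // enumerator to the layout enum below.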
 108   enum layout {
 109     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 110     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 111     DEF_XMM_OFFS(0),
 112     DEF_XMM_OFFS(1),
 113     // 2..15 are implied in range usage
 114     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 115     DEF_YMM_OFFS(0),
 116     DEF_YMM_OFFS(1),
 117     // 2..15 are implied in range usage
 118     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 119     DEF_OPMASK_OFFS(0),
 120     DEF_OPMASK_OFFS(1),
 121     // 2..7 are implied in range usage
 122     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 123     DEF_ZMM_OFFS(0),
 124     DEF_ZMM_OFFS(1),
 125     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 126     DEF_ZMM_UPPER_OFFS(16),
 127     DEF_ZMM_UPPER_OFFS(17),
 128     // 18..31 are implied in range usage
 129     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 130     fpu_stateH_end,
 131     r15_off, r15H_off,
 132     r14_off, r14H_off,
 133     r13_off, r13H_off,
 134     r12_off, r12H_off,
 135     r11_off, r11H_off,
 136     r10_off, r10H_off,
 137     r9_off,  r9H_off,
 138     r8_off,  r8H_off,
 139     rdi_off, rdiH_off,
 140     rsi_off, rsiH_off,
 141     ignore_off, ignoreH_off,  // extra copy of rbp
 142     rsp_off, rspH_off,
 143     rbx_off, rbxH_off,
 144     rdx_off, rdxH_off,
 145     rcx_off, rcxH_off,
 146     rax_off, raxH_off,
 147     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 148     align_off, alignH_off,
 149     flags_off, flagsH_off,
 150     // The frame sender code expects that rbp will be in the "natural" place and
 151     // will override any oopMap setting for it. We must therefore force the layout
 152     // so that it agrees with the frame sender code.
 153     rbp_off, rbpH_off,        // copy of rbp we will restore
 154     return_off, returnH_off,  // slot for return address
 155     reg_save_size             // size in compiler stack slots
 156   };
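       // Note: the *_off values above are in 4-byte (jint) compiler slots; the
       // *_offset_in_bytes() accessors below multiply by BytesPerInt to turn
       // them into byte offsets within the save area.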
 157 
 158  public:
 159   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 160   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 161 
 162   // Offsets into the register save area
 163   // Used by deoptimization when it is managing result register
 164   // values on its own
 165 
 166   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 167   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 168   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 169   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 170   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 171 
 172   // During deoptimization only the result registers need to be restored,
 173   // all the other values have already been extracted.
 174   static void restore_result_registers(MacroAssembler* masm);
 175 };
 176 
 177 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 178   int off = 0;
 179   int num_xmm_regs = XMMRegister::available_xmm_registers();
 180 #if COMPILER2_OR_JVMCI
 181   if (save_wide_vectors && UseAVX == 0) {
 182     save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 183   }
 184   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 185 #else
 186   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 187 #endif
 188 
 189   // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated with this alignment
 190   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 191   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 192   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 193   // CodeBlob frame size is in words.
 194   int frame_size_in_words = frame_size_in_bytes / wordSize;
 195   *total_frame_words = frame_size_in_words;
 196 
 197   // Save registers, fpu state, and flags.
 198   // We assume caller has already pushed the return address onto the
 199   // stack, so rsp is 8-byte aligned here.
 200   // We push rbp twice in this sequence because we want the real rbp
 201   // to be under the return address, just like a normal enter would place it.
 202 
 203   __ enter();          // rsp becomes 16-byte aligned here
 204   __ push_CPU_state(); // Push a multiple of 16 bytes
 205 
 206   // push_CPU_state handles this on EVEX-enabled targets
 207   if (save_wide_vectors) {
 208     // Save upper half of YMM registers(0..15)
 209     int base_addr = XSAVE_AREA_YMM_BEGIN;
 210     for (int n = 0; n < 16; n++) {
 211       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 212     }
 213     if (VM_Version::supports_evex()) {
 214       // Save upper half of ZMM registers(0..15)
 215       base_addr = XSAVE_AREA_ZMM_BEGIN;
 216       for (int n = 0; n < 16; n++) {
 217         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 218       }
 219       // Save full ZMM registers(16..num_xmm_regs)
 220       base_addr = XSAVE_AREA_UPPERBANK;
 221       off = 0;
 222       int vector_len = Assembler::AVX_512bit;
 223       for (int n = 16; n < num_xmm_regs; n++) {
 224         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 225       }
 226 #if COMPILER2_OR_JVMCI
 227       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 228       off = 0;
 229       for(int n = 0; n < KRegister::number_of_registers; n++) {
 230         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 231       }
 232 #endif
 233     }
 234   } else {
 235     if (VM_Version::supports_evex()) {
 236       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 237       int base_addr = XSAVE_AREA_UPPERBANK;
 238       off = 0;
 239       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 240       for (int n = 16; n < num_xmm_regs; n++) {
 241         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 242       }
 243 #if COMPILER2_OR_JVMCI
 244       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 245       off = 0;
 246       for(int n = 0; n < KRegister::number_of_registers; n++) {
 247         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 248       }
 249 #endif
 250     }
 251   }
 252   __ vzeroupper();
 253   if (frame::arg_reg_save_area_bytes != 0) {
 254     // Allocate argument register save area
 255     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 256   }
 257 
 258   // Set an oopmap for the call site.  This oopmap will map all
 259   // oop-registers and debug-info registers as callee-saved.  This
 260   // will allow deoptimization at this safepoint to find all possible
 261   // debug-info recordings, as well as let GC find all oops.
 262 
 263   OopMapSet *oop_maps = new OopMapSet();
 264   OopMap* map = new OopMap(frame_size_in_slots, 0);
 265 
 266 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 267 
 268   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 271   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 272   // rbp location is known implicitly by the frame sender code, needs no oopmap,
 273   // and the location where rbp was saved is ignored
 274   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 283   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 284   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 285   // on EVEX-enabled targets it is included in the XSAVE area
 286   off = xmm0_off;
 287   int delta = xmm1_off - off;
 288   for (int n = 0; n < 16; n++) {
 289     XMMRegister xmm_name = as_XMMRegister(n);
 290     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 291     off += delta;
 292   }
 293   if (UseAVX > 2) {
 294     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 295     off = zmm16_off;
 296     delta = zmm17_off - off;
 297     for (int n = 16; n < num_xmm_regs; n++) {
 298       XMMRegister zmm_name = as_XMMRegister(n);
 299       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 300       off += delta;
 301     }
 302   }
 303 
 304 #if COMPILER2_OR_JVMCI
 305   if (save_wide_vectors) {
 306     // Save upper half of YMM registers(0..15)
 307     off = ymm0_off;
 308     delta = ymm1_off - ymm0_off;
 309     for (int n = 0; n < 16; n++) {
 310       XMMRegister ymm_name = as_XMMRegister(n);
 311       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 312       off += delta;
 313     }
 314     if (VM_Version::supports_evex()) {
 315       // Save upper half of ZMM registers(0..15)
 316       off = zmm0_off;
 317       delta = zmm1_off - zmm0_off;
 318       for (int n = 0; n < 16; n++) {
 319         XMMRegister zmm_name = as_XMMRegister(n);
 320         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 321         off += delta;
 322       }
 323     }
 324   }
 325 #endif // COMPILER2_OR_JVMCI
 326 
 327   // %%% These should all be a waste but we'll keep things as they were for now
 328   if (true) {
 329     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 333     // rbp location is known implicitly by the frame sender code, needs no oopmap
 334     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 343     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 344     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 345     // on EVEX-enabled targets it is included in the XSAVE area
 346     off = xmm0H_off;
 347     delta = xmm1H_off - off;
 348     for (int n = 0; n < 16; n++) {
 349       XMMRegister xmm_name = as_XMMRegister(n);
 350       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 351       off += delta;
 352     }
 353     if (UseAVX > 2) {
 354       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 355       off = zmm16H_off;
 356       delta = zmm17H_off - off;
 357       for (int n = 16; n < num_xmm_regs; n++) {
 358         XMMRegister zmm_name = as_XMMRegister(n);
 359         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 360         off += delta;
 361       }
 362     }
 363   }
 364 
 365   return map;
 366 }
 367 
 368 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 369   int num_xmm_regs = XMMRegister::available_xmm_registers();
 370   if (frame::arg_reg_save_area_bytes != 0) {
 371     // Pop arg register save area
 372     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 373   }
 374 
 375 #if COMPILER2_OR_JVMCI
 376   if (restore_wide_vectors) {
 377     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 378     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 379   }
 380 #else
 381   assert(!restore_wide_vectors, "vectors are generated only by C2");
 382 #endif
 383 
 384   __ vzeroupper();
 385 
 386   // On EVEX-enabled targets everything is handled by the FPU state restore in pop_CPU_state
 387   if (restore_wide_vectors) {
 388     // Restore upper half of YMM registers (0..15)
 389     int base_addr = XSAVE_AREA_YMM_BEGIN;
 390     for (int n = 0; n < 16; n++) {
 391       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 392     }
 393     if (VM_Version::supports_evex()) {
 394       // Restore upper half of ZMM registers (0..15)
 395       base_addr = XSAVE_AREA_ZMM_BEGIN;
 396       for (int n = 0; n < 16; n++) {
 397         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 398       }
 399       // Restore full ZMM registers(16..num_xmm_regs)
 400       base_addr = XSAVE_AREA_UPPERBANK;
 401       int vector_len = Assembler::AVX_512bit;
 402       int off = 0;
 403       for (int n = 16; n < num_xmm_regs; n++) {
 404         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 405       }
 406 #if COMPILER2_OR_JVMCI
 407       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 408       off = 0;
 409       for (int n = 0; n < KRegister::number_of_registers; n++) {
 410         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 411       }
 412 #endif
 413     }
 414   } else {
 415     if (VM_Version::supports_evex()) {
 416       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 417       int base_addr = XSAVE_AREA_UPPERBANK;
 418       int off = 0;
 419       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 420       for (int n = 16; n < num_xmm_regs; n++) {
 421         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 422       }
 423 #if COMPILER2_OR_JVMCI
 424       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 425       off = 0;
 426       for (int n = 0; n < KRegister::number_of_registers; n++) {
 427         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 428       }
 429 #endif
 430     }
 431   }
 432 
 433   // Recover CPU state
 434   __ pop_CPU_state();
 435   // Get the rbp described implicitly by the calling convention (no oopMap)
 436   __ pop(rbp);
 437 }
 438 
 439 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 440 
 441   // Just restore the result registers. Only used by deoptimization. By
 442   // now any callee-save register that needs to be restored to a c2
 443   // caller of the deoptee has been extracted into the vframeArray
 444   // and will be stuffed into the c2i adapter we create for later
 445   // restoration, so only result registers need to be restored here.
 446 
 447   // Restore fp result register
 448   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 449   // Restore integer result register
 450   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 451   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 452 
 453   // Pop all of the register save area off the stack except the return address
 454   __ addptr(rsp, return_offset_in_bytes());
 455 }
 456 
 457 // Is the vector's size (in bytes) bigger than the size saved by default?
 458 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
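     // For example, a 32-byte YMM or a 64-byte ZMM vector counts as "wide" and
     // needs the extra upper-half save/restore code in RegisterSaver above.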
 459 bool SharedRuntime::is_wide_vector(int size) {
 460   return size > 16;
 461 }
 462 
 463 // ---------------------------------------------------------------------------
 464 // Read the array of BasicTypes from a signature, and compute where the
 465 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 466 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 467 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 468 // as framesizes are fixed.
 469 // VMRegImpl::stack0 refers to the first slot 0(sp),
 470 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
 471 // Registers up to Register::number_of_registers are the 64-bit
 472 // integer registers.
 473 
 474 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 475 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 476 // units regardless of build. Of course for i486 there is no 64-bit build.
 477 
 478 // The Java calling convention is a "shifted" version of the C ABI.
 479 // By skipping the first C ABI register we can call non-static JNI methods
 480 // with small numbers of arguments without having to shuffle the arguments
 481 // at all. Since we control the Java ABI we ought to at least get some
 482 // advantage out of it.
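     // For example, under this convention a (hypothetical) signature of
     // (int, long, float, Object) would be assigned roughly as follows:
     //   int    -> j_rarg0
     //   long   -> j_rarg1 (its trailing T_VOID half gets no register)
     //   float  -> j_farg0
     //   Object -> j_rarg2
     // with stack slots used only once the j_rarg*/j_farg* registers run out.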
 483 
 484 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 485                                            VMRegPair *regs,
 486                                            int total_args_passed) {
 487 
 488   // Create the mapping between argument positions and
 489   // registers.
 490   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 491     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 492   };
 493   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 494     j_farg0, j_farg1, j_farg2, j_farg3,
 495     j_farg4, j_farg5, j_farg6, j_farg7
 496   };
 497 
 498 
 499   uint int_args = 0;
 500   uint fp_args = 0;
 501   uint stk_args = 0; // inc by 2 each time
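       // Note: each stack-passed argument below is given a full 8-byte word (two
       // 4-byte VMReg slots) regardless of its size, hence the increments of 2.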
 502 
 503   for (int i = 0; i < total_args_passed; i++) {
 504     switch (sig_bt[i]) {
 505     case T_BOOLEAN:
 506     case T_CHAR:
 507     case T_BYTE:
 508     case T_SHORT:
 509     case T_INT:
 510       if (int_args < Argument::n_int_register_parameters_j) {
 511         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 512       } else {
 513         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 514         stk_args += 2;
 515       }
 516       break;
 517     case T_VOID:
 518       // halves of T_LONG or T_DOUBLE
 519       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 520       regs[i].set_bad();
 521       break;
 522     case T_LONG:
 523       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 524       // fall through
 525     case T_OBJECT:
 526     case T_ARRAY:
 527     case T_ADDRESS:
 528     case T_PRIMITIVE_OBJECT:
 529       if (int_args < Argument::n_int_register_parameters_j) {
 530         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 531       } else {
 532         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 533         stk_args += 2;
 534       }
 535       break;
 536     case T_FLOAT:
 537       if (fp_args < Argument::n_float_register_parameters_j) {
 538         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 539       } else {
 540         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 541         stk_args += 2;
 542       }
 543       break;
 544     case T_DOUBLE:
 545       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 546       if (fp_args < Argument::n_float_register_parameters_j) {
 547         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 548       } else {
 549         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 550         stk_args += 2;
 551       }
 552       break;
 553     default:
 554       ShouldNotReachHere();
 555       break;
 556     }
 557   }
 558 
 559   return align_up(stk_args, 2);
 560 }
 561 
 562 // Same as java_calling_convention() but for multiple return
 563 // values. There's no way to store them on the stack so if we don't
 564 // have enough registers, multiple values can't be returned.
 565 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 566 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
 567 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 568                                           VMRegPair *regs,
 569                                           int total_args_passed) {
 570   // Create the mapping between argument positions and
 571   // registers.
 572   static const Register INT_ArgReg[java_return_convention_max_int] = {
 573     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 574   };
 575   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 576     j_farg0, j_farg1, j_farg2, j_farg3,
 577     j_farg4, j_farg5, j_farg6, j_farg7
 578   };
 579 
 580 
 581   uint int_args = 0;
 582   uint fp_args = 0;
 583 
 584   for (int i = 0; i < total_args_passed; i++) {
 585     switch (sig_bt[i]) {
 586     case T_BOOLEAN:
 587     case T_CHAR:
 588     case T_BYTE:
 589     case T_SHORT:
 590     case T_INT:
 591       if (int_args < Argument::n_int_register_parameters_j+1) {
 592         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 593         int_args++;
 594       } else {
 595         return -1;
 596       }
 597       break;
 598     case T_VOID:
 599       // halves of T_LONG or T_DOUBLE
 600       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 601       regs[i].set_bad();
 602       break;
 603     case T_LONG:
 604       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 605       // fall through
 606     case T_OBJECT:
 607     case T_PRIMITIVE_OBJECT:
 608     case T_ARRAY:
 609     case T_ADDRESS:
 610     case T_METADATA:
 611       if (int_args < Argument::n_int_register_parameters_j+1) {
 612         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 613         int_args++;
 614       } else {
 615         return -1;
 616       }
 617       break;
 618     case T_FLOAT:
 619       if (fp_args < Argument::n_float_register_parameters_j) {
 620         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 621         fp_args++;
 622       } else {
 623         return -1;
 624       }
 625       break;
 626     case T_DOUBLE:
 627       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 628       if (fp_args < Argument::n_float_register_parameters_j) {
 629         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 630         fp_args++;
 631       } else {
 632         return -1;
 633       }
 634       break;
 635     default:
 636       ShouldNotReachHere();
 637       break;
 638     }
 639   }
 640 
 641   return int_args + fp_args;
 642 }
 643 
 644 // Patch the caller's callsite with the entry to compiled code, if it exists.
 645 static void patch_callers_callsite(MacroAssembler *masm) {
 646   Label L;
 647   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 648   __ jcc(Assembler::equal, L);
 649 
 650   // Save the current stack pointer
 651   __ mov(r13, rsp);
 652   // Schedule the branch target address early.
 653   // Call into the VM to patch the caller, then jump to the compiled callee.
 654   // rax isn't live, so capture the return address while we easily can.
 655   __ movptr(rax, Address(rsp, 0));
 656 
 657   // align stack so push_CPU_state doesn't fault
 658   __ andptr(rsp, -(StackAlignmentInBytes));
 659   __ push_CPU_state();
 660   __ vzeroupper();
 661   // VM needs caller's callsite
 662   // VM needs target method
 663   // This needs to be a long call since we will relocate this adapter to
 664   // the codeBuffer and it may not reach
 665 
 666   // Allocate argument register save area
 667   if (frame::arg_reg_save_area_bytes != 0) {
 668     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 669   }
 670   __ mov(c_rarg0, rbx);
 671   __ mov(c_rarg1, rax);
 672   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 673 
 674   // De-allocate argument register save area
 675   if (frame::arg_reg_save_area_bytes != 0) {
 676     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 677   }
 678 
 679   __ vzeroupper();
 680   __ pop_CPU_state();
 681   // restore sp
 682   __ mov(rsp, r13);
 683   __ bind(L);
 684 }
 685 
 686 // For each inline type argument, sig includes the list of fields of
 687 // the inline type. This utility function computes the number of
 688 // arguments for the call if inline types are passed by reference (the
 689 // calling convention the interpreter expects).
 690 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 691   int total_args_passed = 0;
 692   if (InlineTypePassFieldsAsArgs) {
 693     for (int i = 0; i < sig_extended->length(); i++) {
 694       BasicType bt = sig_extended->at(i)._bt;
 695       if (bt == T_PRIMITIVE_OBJECT) {
 696         // In sig_extended, an inline type argument starts with:
 697         // T_PRIMITIVE_OBJECT, followed by the types of the fields of the
 698         // inline type, and T_VOID to mark the end of the inline
 699         // type. Inline types are flattened so, for instance, in the
 700         // case of an inline type with an int field and an inline type
 701         // field that itself has 2 fields, an int and a long:
 702         // T_PRIMITIVE_OBJECT T_INT T_PRIMITIVE_OBJECT T_INT T_LONG T_VOID (second
 703         // slot for the T_LONG) T_VOID (inner T_PRIMITIVE_OBJECT) T_VOID
 704         // (outer T_PRIMITIVE_OBJECT)
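             // Walking that example with the loop below: vt starts at 1 for the
             // outer T_PRIMITIVE_OBJECT, goes to 2 at the inner T_PRIMITIVE_OBJECT,
             // and drops back to 1 and then 0 at the last two T_VOID markers (the
             // T_VOID right after T_LONG is just the long's second slot and is not
             // counted), so the whole sequence counts as one interpreter argument.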
 705         total_args_passed++;
 706         int vt = 1;
 707         do {
 708           i++;
 709           BasicType bt = sig_extended->at(i)._bt;
 710           BasicType prev_bt = sig_extended->at(i-1)._bt;
 711           if (bt == T_PRIMITIVE_OBJECT) {
 712             vt++;
 713           } else if (bt == T_VOID &&
 714                      prev_bt != T_LONG &&
 715                      prev_bt != T_DOUBLE) {
 716             vt--;
 717           }
 718         } while (vt != 0);
 719       } else {
 720         total_args_passed++;
 721       }
 722     }
 723   } else {
 724     total_args_passed = sig_extended->length();
 725   }
 726   return total_args_passed;
 727 }
 728 
 729 
 730 static void gen_c2i_adapter_helper(MacroAssembler* masm,
 731                                    BasicType bt,
 732                                    BasicType prev_bt,
 733                                    size_t size_in_bytes,
 734                                    const VMRegPair& reg_pair,
 735                                    const Address& to,
 736                                    int extraspace,
 737                                    bool is_oop) {
 738   assert(bt != T_PRIMITIVE_OBJECT || !InlineTypePassFieldsAsArgs, "no inline type here");
 739   if (bt == T_VOID) {
 740     assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
 741     return;
 742   }
 743 
 744   // Say 4 args:
 745   // i   st_off
 746   // 0   32 T_LONG
 747   // 1   24 T_VOID
 748   // 2   16 T_OBJECT
 749   // 3    8 T_BOOL
 750   // -    0 return address
 751   //
 752   // However, to make things extra confusing: because we can fit a long/double in
 753   // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 754   // leaves one slot empty and only stores to a single slot. In this case the
 755   // slot that is occupied is the T_VOID slot. See, I said it was confusing.
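       // For the T_LONG at i == 0 above, for instance, the value is written at
       // st_off - 8 (the slot labelled T_VOID at offset 24); the slot at offset 32
       // is only filled with known junk in debug builds (see gen_c2i_adapter below).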
 756 
 757   bool wide = (size_in_bytes == wordSize);
 758   VMReg r_1 = reg_pair.first();
 759   VMReg r_2 = reg_pair.second();
 760   assert(r_2->is_valid() == wide, "invalid size");
 761   if (!r_1->is_valid()) {
 762     assert(!r_2->is_valid(), "must be invalid");
 763     return;
 764   }
 765 
 766   if (!r_1->is_XMMRegister()) {
 767     Register val = rax;
 768     if (r_1->is_stack()) {
 769       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 770       __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
 771     } else {
 772       val = r_1->as_Register();
 773     }
 774     assert_different_registers(to.base(), val, rscratch1);
 775     if (is_oop) {
 776       __ push(r13);
 777       __ push(rbx);
 778       __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
 779       __ pop(rbx);
 780       __ pop(r13);
 781     } else {
 782       __ store_sized_value(to, val, size_in_bytes);
 783     }
 784   } else {
 785     if (wide) {
 786       __ movdbl(to, r_1->as_XMMRegister());
 787     } else {
 788       __ movflt(to, r_1->as_XMMRegister());
 789     }
 790   }
 791 }
 792 
 793 static void gen_c2i_adapter(MacroAssembler *masm,
 794                             const GrowableArray<SigEntry>* sig_extended,
 795                             const VMRegPair *regs,
 796                             Label& skip_fixup,
 797                             address start,
 798                             OopMapSet* oop_maps,
 799                             int& frame_complete,
 800                             int& frame_size_in_words,
 801                             bool alloc_inline_receiver) {
 802   // Before we get into the guts of the C2I adapter, see if we should be here
 803   // at all.  We've come from compiled code and are attempting to jump to the
 804   // interpreter, which means the caller made a static call to get here
 805   // (vcalls always get a compiled target if there is one).  Check for a
 806   // compiled target.  If there is one, we need to patch the caller's call.
 807   patch_callers_callsite(masm);
 808 
 809   __ bind(skip_fixup);
 810 
 811   if (InlineTypePassFieldsAsArgs) {
 812     // Is there an inline type argument?
 813     bool has_inline_argument = false;
 814     for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
 815       has_inline_argument = (sig_extended->at(i)._bt == T_PRIMITIVE_OBJECT);
 816     }
 817     if (has_inline_argument) {
 818       // There is at least an inline type argument: we're coming from
 819       // compiled code so we have no buffers to back the inline types.
 820       // Allocate the buffers here with a runtime call.
 821       OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
 822 
 823       frame_complete = __ offset();
 824 
 825       __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
 826 
 827       __ mov(c_rarg0, r15_thread);
 828       __ mov(c_rarg1, rbx);
 829       __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
 830       __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
 831 
 832       oop_maps->add_gc_map((int)(__ pc() - start), map);
 833       __ reset_last_Java_frame(false);
 834 
 835       RegisterSaver::restore_live_registers(masm);
 836 
 837       Label no_exception;
 838       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
 839       __ jcc(Assembler::equal, no_exception);
 840 
 841       __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
 842       __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 843       __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 844 
 845       __ bind(no_exception);
 846 
 847       // We get an array of objects from the runtime call
 848       __ get_vm_result(rscratch2, r15_thread); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
 849       __ get_vm_result_2(rbx, r15_thread); // TODO: required to keep the callee Method live?
 850     }
 851   }
 852 
 853   int total_args_passed = compute_total_args_passed_int(sig_extended);
 854   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 855 
 856   // Since all args are passed on the stack, total_args_passed *
 857   // Interpreter::stackElementSize is the space we need.
 858   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 859 
 860   // stack is aligned, keep it that way
 861   // This is not currently needed or enforced by the interpreter, but
 862   // we might as well conform to the ABI.
 863   extraspace = align_up(extraspace, 2*wordSize);
 864 
 865   // set senderSP value
 866   __ lea(r13, Address(rsp, wordSize));
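       // r13 now points just past the return address, i.e. at the caller's
       // outgoing argument area, which the interpreter uses as the sender SP.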
 867 
 868 #ifdef ASSERT
 869   __ check_stack_alignment(r13, "sender stack not aligned");
 870 #endif
 871   if (extraspace > 0) {
 872     // Pop the return address
 873     __ pop(rax);
 874 
 875     __ subptr(rsp, extraspace);
 876 
 877     // Push the return address
 878     __ push(rax);
 879 
 880     // Account for the return address location, since we store it first rather
 881     // than holding it in a register across all the shuffling.
 882     extraspace += wordSize;
 883   }
 884 
 885 #ifdef ASSERT
 886   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 887 #endif
 888 
 889   // Now write the args into the outgoing interpreter space
 890 
 891   // next_arg_comp is the next argument from the compiler point of
 892   // view (inline type fields are passed in registers/on the stack). In
 893   // sig_extended, an inline type argument starts with: T_PRIMITIVE_OBJECT,
 894   // followed by the types of the fields of the inline type and T_VOID
 895   // to mark the end of the inline type. ignored counts the number of
 896   // T_PRIMITIVE_OBJECT/T_VOID. next_vt_arg is the next inline type argument:
 897   // used to get the buffer for that argument from the pool of buffers
 898   // we allocated above and want to pass to the
 899   // interpreter. next_arg_int is the next argument from the
 900   // interpreter point of view (inline types are passed by reference).
 901   for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
 902        next_arg_comp < sig_extended->length(); next_arg_comp++) {
 903     assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
 904     assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
 905     BasicType bt = sig_extended->at(next_arg_comp)._bt;
 906     int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
 907     if (!InlineTypePassFieldsAsArgs || bt != T_PRIMITIVE_OBJECT) {
 908       int next_off = st_off - Interpreter::stackElementSize;
 909       const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
 910       const VMRegPair reg_pair = regs[next_arg_comp-ignored];
 911       size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
 912       gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 913                              size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
 914       next_arg_int++;
 915 #ifdef ASSERT
 916       if (bt == T_LONG || bt == T_DOUBLE) {
 917         // Overwrite the unused slot with known junk
 918         __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 919         __ movptr(Address(rsp, st_off), rax);
 920       }
 921 #endif /* ASSERT */
 922     } else {
 923       ignored++;
 924       // get the buffer from the just allocated pool of buffers
 925       int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_PRIMITIVE_OBJECT);
 926       __ load_heap_oop(r14, Address(rscratch2, index));
 927       next_vt_arg++; next_arg_int++;
 928       int vt = 1;
 929       // write fields we get from compiled code in registers/stack
 930       // slots to the buffer: we know we are done with that inline type
 931       // argument when we hit the T_VOID that acts as an end of inline
 932       // type delimiter for this inline type. Inline types are flattened
 933       // so we might encounter embedded inline types. Each entry in
 934       // sig_extended contains a field offset in the buffer.
 935       Label L_null;
 936       do {
 937         next_arg_comp++;
 938         BasicType bt = sig_extended->at(next_arg_comp)._bt;
 939         BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
 940         if (bt == T_PRIMITIVE_OBJECT) {
 941           vt++;
 942           ignored++;
 943         } else if (bt == T_VOID &&
 944                    prev_bt != T_LONG &&
 945                    prev_bt != T_DOUBLE) {
 946           vt--;
 947           ignored++;
 948         } else {
 949           int off = sig_extended->at(next_arg_comp)._offset;
 950           if (off == -1) {
 951             // Nullable inline type argument, emit null check
 952             VMReg reg = regs[next_arg_comp-ignored].first();
 953             Label L_notNull;
 954             if (reg->is_stack()) {
 955               int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 956               __ testb(Address(rsp, ld_off), 1);
 957             } else {
 958               __ testb(reg->as_Register(), 1);
 959             }
 960             __ jcc(Assembler::notZero, L_notNull);
 961             __ movptr(Address(rsp, st_off), 0);
 962             __ jmp(L_null);
 963             __ bind(L_notNull);
 964             continue;
 965           }
 966           assert(off > 0, "offset in object should be positive");
 967           size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
 968           bool is_oop = is_reference_type(bt);
 969           gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 970                                  size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
 971         }
 972       } while (vt != 0);
 973       // pass the buffer to the interpreter
 974       __ movptr(Address(rsp, st_off), r14);
 975       __ bind(L_null);
 976     }
 977   }
 978 
 979   // Schedule the branch target address early.
 980   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 981   __ jmp(rcx);
 982 }
 983 
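     // range_check: branches to L_ok if pc_reg lies strictly between code_start and
     // code_end; otherwise it falls through (via L_fail) so the caller can emit its
     // own error handling, e.g. the __ stop() in gen_i2c_adapter below.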
 984 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 985                         address code_start, address code_end,
 986                         Label& L_ok) {
 987   Label L_fail;
 988   __ lea(temp_reg, ExternalAddress(code_start));
 989   __ cmpptr(pc_reg, temp_reg);
 990   __ jcc(Assembler::belowEqual, L_fail);
 991   __ lea(temp_reg, ExternalAddress(code_end));
 992   __ cmpptr(pc_reg, temp_reg);
 993   __ jcc(Assembler::below, L_ok);
 994   __ bind(L_fail);
 995 }
 996 
 997 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 998                                     int comp_args_on_stack,
 999                                     const GrowableArray<SigEntry>* sig,
1000                                     const VMRegPair *regs) {
1001 
1002   // Note: r13 contains the senderSP on entry. We must preserve it since
1003   // we may do an i2c -> c2i transition if we lose a race where compiled
1004   // code goes non-entrant while we get args ready.
1005   // In addition we use r13 to locate all the interpreter args because
1006   // we must align the stack to 16 bytes on an i2c entry; otherwise we
1007   // lose the alignment we expect in all compiled code, and the register
1008   // save code can segv when fxsave instructions find an improperly
1009   // aligned stack pointer.
1010 
1011   // Adapters can be frameless because they do not require the caller
1012   // to perform additional cleanup work, such as correcting the stack pointer.
1013   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
1014   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
1015   // even if a callee has modified the stack pointer.
1016   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
1017   // routinely repairs its caller's stack pointer (from sender_sp, which is set
1018   // up via the senderSP register).
1019   // In other words, if *either* the caller or callee is interpreted, we can
1020   // get the stack pointer repaired after a call.
1021   // This is why c2i and i2c adapters cannot be indefinitely composed.
1022   // In particular, if a c2i adapter were to somehow call an i2c adapter,
1023   // both caller and callee would be compiled methods, and neither would
1024   // clean up the stack pointer changes performed by the two adapters.
1025   // If this happens, control eventually transfers back to the compiled
1026   // caller, but with an uncorrected stack, causing delayed havoc.
1027 
1028   if (VerifyAdapterCalls &&
1029       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
1030     // So, let's test for cascading c2i/i2c adapters right now.
1031     //  assert(Interpreter::contains($return_addr) ||
1032     //         StubRoutines::contains($return_addr),
1033     //         "i2c adapter must return to an interpreter frame");
1034     __ block_comment("verify_i2c { ");
1035     // Pick up the return address
1036     __ movptr(rax, Address(rsp, 0));
1037     Label L_ok;
1038     if (Interpreter::code() != NULL)
1039       range_check(masm, rax, r11,
1040                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
1041                   L_ok);
1042     if (StubRoutines::code1() != NULL)
1043       range_check(masm, rax, r11,
1044                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
1045                   L_ok);
1046     if (StubRoutines::code2() != NULL)
1047       range_check(masm, rax, r11,
1048                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
1049                   L_ok);
1050     const char* msg = "i2c adapter must return to an interpreter frame";
1051     __ block_comment(msg);
1052     __ stop(msg);
1053     __ bind(L_ok);
1054     __ block_comment("} verify_i2ce ");
1055   }
1056 
1057   // Must preserve original SP for loading incoming arguments because
1058   // we need to align the outgoing SP for compiled code.
1059   __ movptr(r11, rsp);
1060 
1061   // Pick up the return address
1062   __ pop(rax);
1063 
1064   // Convert 4-byte c2 stack slots to words.
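       // For example, 5 outgoing 4-byte slots round up to 3 full 8-byte words.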
1065   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
1066 
1067   if (comp_args_on_stack) {
1068     __ subptr(rsp, comp_words_on_stack * wordSize);
1069   }
1070 
1071   // Ensure compiled code always sees stack at proper alignment
1072   __ andptr(rsp, -16);
1073 
1074   // Push the return address and misalign the stack so that the youngest frame
1075   // always sees the same layout it would see right after a call instruction.
1076   __ push(rax);
1077 
1078   // Put saved SP in another register
1079   const Register saved_sp = rax;
1080   __ movptr(saved_sp, r11);
1081 
1082   // Will jump to the compiled code just as if compiled code was doing it.
1083   // Pre-load the register-jump target early, to schedule it better.
1084   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1085 
1086 #if INCLUDE_JVMCI
1087   if (EnableJVMCI) {
1088     // check if this call should be routed towards a specific entry point
1089     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1090     Label no_alternative_target;
1091     __ jcc(Assembler::equal, no_alternative_target);
1092     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1093     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1094     __ bind(no_alternative_target);
1095   }
1096 #endif // INCLUDE_JVMCI
1097 
1098   int total_args_passed = sig->length();
1099 
1100   // Now generate the shuffle code.  Pick up all register args and move the
1101   // rest through the floating point stack top.
1102   for (int i = 0; i < total_args_passed; i++) {
1103     BasicType bt = sig->at(i)._bt;
1104     assert(bt != T_PRIMITIVE_OBJECT, "i2c adapter doesn't unpack inline type args");
1105     if (bt == T_VOID) {
1106       // Longs and doubles are passed in native word order, but misaligned
1107       // in the 32-bit build.
1108       BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1109       assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1110       continue;
1111     }
1112 
1113     // Pick up 0, 1 or 2 words from SP+offset.
1114 
1115     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1116             "scrambled load targets?");
1117     // Load in argument order going down.
1118     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1119     // Point to interpreter value (vs. tag)
1120     int next_off = ld_off - Interpreter::stackElementSize;
1121     //
1122     //
1123     //
1124     VMReg r_1 = regs[i].first();
1125     VMReg r_2 = regs[i].second();
1126     if (!r_1->is_valid()) {
1127       assert(!r_2->is_valid(), "");
1128       continue;
1129     }
1130     if (r_1->is_stack()) {
1131       // Convert stack slot to an SP offset (+ wordSize to account for return address )
1132       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1133 
1134       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
1135       // and if we end up going thru a c2i because of a miss, a reasonable value of r13
1136       // will be generated.
1137       if (!r_2->is_valid()) {
1138         // sign extend???
1139         __ movl(r13, Address(saved_sp, ld_off));
1140         __ movptr(Address(rsp, st_off), r13);
1141       } else {
1142         //
1143         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
1144         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
1145         // so we must adjust where to pick up the data to match the interpreter.
1146         //
1147         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
1148         // are accessed at negative offsets, so the LSW is at the LOW address.
1149 
1150         // ld_off is MSW so get LSW
1151         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1152                            next_off : ld_off;
1153         __ movq(r13, Address(saved_sp, offset));
1154         // st_off is LSW (i.e. reg.first())
1155         __ movq(Address(rsp, st_off), r13);
1156       }
1157     } else if (r_1->is_Register()) {  // Register argument
1158       Register r = r_1->as_Register();
1159       assert(r != rax, "must be different");
1160       if (r_2->is_valid()) {
1161         //
1162         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE;
1163         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
1164         // so we must adjust where to pick up the data to match the interpreter.
1165 
1166         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1167                            next_off : ld_off;
1168 
1169         // this can be a misaligned move
1170         __ movq(r, Address(saved_sp, offset));
1171       } else {
1172         // sign extend and use a full word?
1173         __ movl(r, Address(saved_sp, ld_off));
1174       }
1175     } else {
1176       if (!r_2->is_valid()) {
1177         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1178       } else {
1179         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1180       }
1181     }
1182   }
1183 
1184   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1185 
1186   // 6243940 We might end up in handle_wrong_method if
1187   // the callee is deoptimized as we race thru here. If that
1188   // happens we don't want to take a safepoint because the
1189   // caller frame will look interpreted and arguments are now
1190   // "compiled" so it is much better to make this transition
1191   // invisible to the stack walking code. Unfortunately if
1192   // we try and find the callee by normal means a safepoint
1193   // is possible. So we stash the desired callee in the thread
1194   // and the VM will find it there should this case occur.
1195 
1196   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1197 
1198   // Put the Method* where a c2i adapter would expect it, should we end up there.
1199   // This is only needed because c2 resolve stubs return the Method* as a result in
1200   // rax.
1201   __ mov(rax, rbx);
1202   __ jmp(r11);
1203 }
1204 
1205 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1206   Label ok;
1207 
1208   Register holder = rax;
1209   Register receiver = j_rarg0;
1210   Register temp = rbx;
1211 
1212   __ load_klass(temp, receiver, rscratch1);
1213   __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
1214   __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
1215   __ jcc(Assembler::equal, ok);
1216   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1217 
1218   __ bind(ok);
1219   // The method might have been compiled since the call site was patched to
1220   // interpreted; if that is the case, treat it as a miss so we can get
1221   // the call site corrected.
1222   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1223   __ jcc(Assembler::equal, skip_fixup);
1224   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1225 }
1226 
1227 // ---------------------------------------------------------------
1228 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1229                                                             int comp_args_on_stack,
1230                                                             const GrowableArray<SigEntry>* sig,
1231                                                             const VMRegPair* regs,
1232                                                             const GrowableArray<SigEntry>* sig_cc,
1233                                                             const VMRegPair* regs_cc,
1234                                                             const GrowableArray<SigEntry>* sig_cc_ro,
1235                                                             const VMRegPair* regs_cc_ro,
1236                                                             AdapterFingerPrint* fingerprint,
1237                                                             AdapterBlob*& new_adapter,
1238                                                             bool allocate_code_blob) {
1239   address i2c_entry = __ pc();
1240   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1241 
1242   // -------------------------------------------------------------------------
1243   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1244   // to the interpreter.  The args start out packed in the compiled layout.  They
1245   // need to be unpacked into the interpreter layout.  This will almost always
1246   // require some stack space.  We grow the current (compiled) stack, then repack
1247   // the args.  We finally end in a jump to the generic interpreter entry point.
1248   // On exit from the interpreter, the interpreter will restore our SP (lest the
1249   // compiled code, which relies solely on SP and not RBP, get sick).
1250 
1251   address c2i_unverified_entry = __ pc();
1252   Label skip_fixup;
1253 
1254   gen_inline_cache_check(masm, skip_fixup);
1255 
1256   OopMapSet* oop_maps = new OopMapSet();
1257   int frame_complete = CodeOffsets::frame_never_safe;
1258   int frame_size_in_words = 0;
1259 
1260   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1261   address c2i_inline_ro_entry = __ pc();
1262   if (regs_cc != regs_cc_ro) {
1263     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, false);
1264     skip_fixup.reset();
1265   }
1266 
1267   // Scalarized c2i adapter
1268   address c2i_entry = __ pc();
1269 
1270   // Class initialization barrier for static methods
1271   address c2i_no_clinit_check_entry = NULL;
1272   if (VM_Version::supports_fast_class_init_checks()) {
1273     Label L_skip_barrier;
1274     Register method = rbx;
1275 
1276     { // Bypass the barrier for non-static methods
1277       Register flags = rscratch1;
1278       __ movl(flags, Address(method, Method::access_flags_offset()));
1279       __ testl(flags, JVM_ACC_STATIC);
1280       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1281     }
1282 
1283     Register klass = rscratch1;
1284     __ load_method_holder(klass, method);
1285     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1286 
1287     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1288 
1289     __ bind(L_skip_barrier);
1290     c2i_no_clinit_check_entry = __ pc();
1291   }
1292 
1293   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1294   bs->c2i_entry_barrier(masm);
1295 
1296   gen_c2i_adapter(masm, sig_cc, regs_cc, skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, true);
1297 
1298   address c2i_unverified_inline_entry = c2i_unverified_entry;
1299 
1300   // Non-scalarized c2i adapter
1301   address c2i_inline_entry = c2i_entry;
1302   if (regs != regs_cc) {
1303     Label inline_entry_skip_fixup;
1304     c2i_unverified_inline_entry = __ pc();
1305     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1306 
1307     c2i_inline_entry = __ pc();
1308     gen_c2i_adapter(masm, sig, regs, inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, false);
1309   }
1310 
1311   __ flush();
1312 
1313   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1314   // the GC knows about the location of oop argument locations passed to the c2i adapter.
1315   if (allocate_code_blob) {
1316     bool caller_must_gc_arguments = (regs != regs_cc);
1317     new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1318   }
1319 
1320   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry, c2i_unverified_inline_entry, c2i_no_clinit_check_entry);
1321 }
1322 
1323 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1324                                          VMRegPair *regs,
1325                                          VMRegPair *regs2,
1326                                          int total_args_passed) {
1327   assert(regs2 == NULL, "not needed on x86");
1328 // We return the amount of VMRegImpl stack slots we need to reserve for all
1329 // the arguments NOT counting out_preserve_stack_slots.
1330 
1331 // NOTE: These arrays will have to change when c1 is ported
1332 #ifdef _WIN64
1333     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1334       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1335     };
1336     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1337       c_farg0, c_farg1, c_farg2, c_farg3
1338     };
1339 #else
1340     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1341       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1342     };
1343     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1344       c_farg0, c_farg1, c_farg2, c_farg3,
1345       c_farg4, c_farg5, c_farg6, c_farg7
1346     };
1347 #endif // _WIN64
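         // Illustrative example (it follows directly from the loop below): for a signature
         // (jint, jlong, jfloat, jdouble), on the System V ABI the args land in c_rarg0,
         // c_rarg1, c_farg0 and c_farg1 with no stack slots reserved, while on Win64 the
         // integer and FP counters advance together, so the args land in c_rarg0, c_rarg1,
         // c_farg2 and c_farg3 and stk_args ends at 8, matching the mandatory home space.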
1348 
1349 
1350     uint int_args = 0;
1351     uint fp_args = 0;
1352     uint stk_args = 0; // inc by 2 each time
1353 
1354     for (int i = 0; i < total_args_passed; i++) {
1355       switch (sig_bt[i]) {
1356       case T_BOOLEAN:
1357       case T_CHAR:
1358       case T_BYTE:
1359       case T_SHORT:
1360       case T_INT:
1361         if (int_args < Argument::n_int_register_parameters_c) {
1362           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1363 #ifdef _WIN64
1364           fp_args++;
1365           // Allocate slots for the callee to stuff register args on the stack.
1366           stk_args += 2;
1367 #endif
1368         } else {
1369           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1370           stk_args += 2;
1371         }
1372         break;
1373       case T_LONG:
1374         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1375         // fall through
1376       case T_OBJECT:
1377       case T_ARRAY:
1378       case T_PRIMITIVE_OBJECT:
1379       case T_ADDRESS:
1380       case T_METADATA:
1381         if (int_args < Argument::n_int_register_parameters_c) {
1382           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1383 #ifdef _WIN64
1384           fp_args++;
1385           stk_args += 2;
1386 #endif
1387         } else {
1388           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1389           stk_args += 2;
1390         }
1391         break;
1392       case T_FLOAT:
1393         if (fp_args < Argument::n_float_register_parameters_c) {
1394           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1395 #ifdef _WIN64
1396           int_args++;
1397           // Allocate slots for the callee to stuff register args on the stack.
1398           stk_args += 2;
1399 #endif
1400         } else {
1401           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1402           stk_args += 2;
1403         }
1404         break;
1405       case T_DOUBLE:
1406         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1407         if (fp_args < Argument::n_float_register_parameters_c) {
1408           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1409 #ifdef _WIN64
1410           int_args++;
1411           // Allocate slots for the callee to stuff register args on the stack.
1412           stk_args += 2;
1413 #endif
1414         } else {
1415           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1416           stk_args += 2;
1417         }
1418         break;
1419       case T_VOID: // Halves of longs and doubles
1420         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1421         regs[i].set_bad();
1422         break;
1423       default:
1424         ShouldNotReachHere();
1425         break;
1426       }
1427     }
1428 #ifdef _WIN64
1429   // The Windows ABI requires that we always allocate enough stack space
1430   // for 4 64-bit registers to be stored down.
1431   if (stk_args < 8) {
1432     stk_args = 8;
1433   }
1434 #endif // _WIN64
1435 
1436   return stk_args;
1437 }
1438 
1439 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1440                                              uint num_bits,
1441                                              uint total_args_passed) {
1442   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1443          "only certain vector sizes are supported for now");
1444 
1445   static const XMMRegister VEC_ArgReg[32] = {
1446      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1447      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1448     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1449     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1450   };
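       // VMRegImpl stack slots are 32 bits wide, so a vector of num_bits occupies
       // num_bits / 32 slots; the pair set below therefore spans vmreg .. vmreg->next(num_bits / 32 - 1).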
1451 
1452   uint stk_args = 0;
1453   uint fp_args = 0;
1454 
1455   for (uint i = 0; i < total_args_passed; i++) {
1456     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1457     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits == 256 ? 7 : 15));
1458     regs[i].set_pair(vmreg->next(next_val), vmreg);
1459   }
1460 
1461   return stk_args;
1462 }
1463 
1464 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1465   // We always ignore the frame_slots arg and just use the space just below the
1466   // frame pointer, which by this time is free to use.
1467   switch (ret_type) {
1468   case T_FLOAT:
1469     __ movflt(Address(rbp, -wordSize), xmm0);
1470     break;
1471   case T_DOUBLE:
1472     __ movdbl(Address(rbp, -wordSize), xmm0);
1473     break;
1474   case T_VOID:  break;
1475   default: {
1476     __ movptr(Address(rbp, -wordSize), rax);
1477     }
1478   }
1479 }
1480 
1481 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1482   // We always ignore the frame_slots arg and just use the space just below the
1483   // frame pointer, which by this time is free to use.
1484   switch (ret_type) {
1485   case T_FLOAT:
1486     __ movflt(xmm0, Address(rbp, -wordSize));
1487     break;
1488   case T_DOUBLE:
1489     __ movdbl(xmm0, Address(rbp, -wordSize));
1490     break;
1491   case T_VOID:  break;
1492   default: {
1493     __ movptr(rax, Address(rbp, -wordSize));
1494     }
1495   }
1496 }
1497 
1498 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1499     for ( int i = first_arg ; i < arg_count ; i++ ) {
1500       if (args[i].first()->is_Register()) {
1501         __ push(args[i].first()->as_Register());
1502       } else if (args[i].first()->is_XMMRegister()) {
1503         __ subptr(rsp, 2*wordSize);
1504         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1505       }
1506     }
1507 }
1508 
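     // restore_args is the mirror image of save_args: it walks the arguments in reverse
     // order and pops (or reloads) each value back into the register it was saved from.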
1509 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1510     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1511       if (args[i].first()->is_Register()) {
1512         __ pop(args[i].first()->as_Register());
1513       } else if (args[i].first()->is_XMMRegister()) {
1514         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1515         __ addptr(rsp, 2*wordSize);
1516       }
1517     }
1518 }
1519 
1520 static void verify_oop_args(MacroAssembler* masm,
1521                             const methodHandle& method,
1522                             const BasicType* sig_bt,
1523                             const VMRegPair* regs) {
1524   Register temp_reg = rbx;  // not part of any compiled calling seq
1525   if (VerifyOops) {
1526     for (int i = 0; i < method->size_of_parameters(); i++) {
1527       if (is_reference_type(sig_bt[i])) {
1528         VMReg r = regs[i].first();
1529         assert(r->is_valid(), "bad oop arg");
1530         if (r->is_stack()) {
1531           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1532           __ verify_oop(temp_reg);
1533         } else {
1534           __ verify_oop(r->as_Register());
1535         }
1536       }
1537     }
1538   }
1539 }
1540 
1541 static void check_continuation_enter_argument(VMReg actual_vmreg,
1542                                               Register expected_reg,
1543                                               const char* name) {
1544   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1545   assert(actual_vmreg->as_Register() == expected_reg,
1546          "%s is in unexpected register: %s instead of %s",
1547          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1548 }
1549 
1550 static void gen_continuation_enter(MacroAssembler* masm,
1551                                    const VMRegPair* regs,
1552                                    int& exception_offset,
1553                                    OopMapSet* oop_maps,
1554                                    int& frame_complete,
1555                                    int& stack_slots,
1556                                    int& interpreted_entry_offset,
1557                                    int& compiled_entry_offset) {
1558 
1559   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1560   int pos_cont_obj   = 0;
1561   int pos_is_cont    = 1;
1562   int pos_is_virtual = 2;
1563 
1564   // The platform-specific calling convention may present the arguments in various registers.
1565   // To simplify the rest of the code, we expect the arguments to reside in these known
1566   // registers, and we additionally check the placement here in case the calling convention
1567   // ever changes.
1568   Register reg_cont_obj   = c_rarg1;
1569   Register reg_is_cont    = c_rarg2;
1570   Register reg_is_virtual = c_rarg3;
1571 
1572   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1573   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1574   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1575 
1576   // Utility methods kill rax, make sure there are no collisions
1577   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1578 
1579   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1580                          relocInfo::static_call_type);
1581 
1582   address start = __ pc();
1583 
1584   Label L_thaw, L_exit;
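       // Overview of the code below: both the interpreted and the compiled entry build a
       // ContinuationEntry frame (enter + continuation_enter_setup + fill_continuation_entry).
       // If isContinue is set we jump to L_thaw, which calls the cont_thaw stub; otherwise we
       // make a patchable static call that gets resolved to Continuation.enter(). Both paths
       // join at L_exit, which tears the entry down and returns; the exception path at
       // exception_offset unwinds the frame and dispatches to the proper exception handler.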
1585 
1586   // i2i entry used at interp_only_mode only
1587   interpreted_entry_offset = __ pc() - start;
1588   {
1589 #ifdef ASSERT
1590     Label is_interp_only;
1591     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1592     __ jcc(Assembler::notEqual, is_interp_only);
1593     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1594     __ bind(is_interp_only);
1595 #endif
1596 
1597     __ pop(rax); // return address
1598     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1599     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1600     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1601     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1602     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1603     __ push(rax); // return address
1604     __ push_cont_fastpath();
1605 
1606     __ enter();
1607 
1608     stack_slots = 2; // will be adjusted in setup
1609     OopMap* map = __ continuation_enter_setup(stack_slots);
1610     // The frame is complete here, but we only record it for the compiled entry, so this frame would appear unsafe;
1611     // that's okay because at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1612 
1613     __ verify_oop(reg_cont_obj);
1614 
1615     __ fill_continuation_entry(reg_cont_obj, reg_is_virtual);
1616 
1617     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1618     __ testptr(reg_is_cont, reg_is_cont);
1619     __ jcc(Assembler::notZero, L_thaw);
1620 
1621     // --- Resolve path
1622 
1623     // Make sure the call is patchable
1624     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1625     // Emit stub for static call
1626     CodeBuffer* cbuf = masm->code_section()->outer();
1627     address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1628     if (stub == nullptr) {
1629       fatal("CodeCache is full at gen_continuation_enter");
1630     }
1631     __ call(resolve);
1632     oop_maps->add_gc_map(__ pc() - start, map);
1633     __ post_call_nop();
1634 
1635     __ jmp(L_exit);
1636   }
1637 
1638   // compiled entry
1639   __ align(CodeEntryAlignment);
1640   compiled_entry_offset = __ pc() - start;
1641   __ enter();
1642 
1643   stack_slots = 2; // will be adjusted in setup
1644   OopMap* map = __ continuation_enter_setup(stack_slots);
1645 
1646   // Frame is now completed as far as size and linkage.
1647   frame_complete = __ pc() - start;
1648 
1649   __ verify_oop(reg_cont_obj);
1650 
1651   __ fill_continuation_entry(reg_cont_obj, reg_is_virtual);
1652 
1653   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1654   __ testptr(reg_is_cont, reg_is_cont);
1655   __ jccb(Assembler::notZero, L_thaw);
1656 
1657   // --- call Continuation.enter(Continuation c, boolean isContinue)
1658 
1659   // Make sure the call is patchable
1660   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1661 
1662   // Emit stub for static call
1663   CodeBuffer* cbuf = masm->code_section()->outer();
1664   address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1665   if (stub == nullptr) {
1666     fatal("CodeCache is full at gen_continuation_enter");
1667   }
1668 
1669   // The call needs to be resolved. There's a special case for this in
1670   // SharedRuntime::find_callee_info_helper() which calls
1671   // LinkResolver::resolve_continuation_enter() which resolves the call to
1672   // Continuation.enter(Continuation c, boolean isContinue).
1673   __ call(resolve);
1674 
1675   oop_maps->add_gc_map(__ pc() - start, map);
1676   __ post_call_nop();
1677 
1678   __ jmpb(L_exit);
1679 
1680   // --- Thawing path
1681 
1682   __ bind(L_thaw);
1683 
1684   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1685 
1686   ContinuationEntry::_return_pc_offset = __ pc() - start;
1687   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1688   __ post_call_nop();
1689 
1690   // --- Normal exit (resolve/thawing)
1691 
1692   __ bind(L_exit);
1693 
1694   __ continuation_enter_cleanup();
1695   __ pop(rbp);
1696   __ ret(0);
1697 
1698   // --- Exception handling path
1699 
1700   exception_offset = __ pc() - start;
1701 
1702   __ continuation_enter_cleanup();
1703   __ pop(rbp);
1704 
1705   __ movptr(c_rarg0, r15_thread);
1706   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1707 
1708   // rax still holds the original exception oop, save it before the call
1709   __ push(rax);
1710 
1711   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1712   __ movptr(rbx, rax);
1713 
1714   // Continue at exception handler:
1715   //   rax: exception oop
1716   //   rbx: exception handler
1717   //   rdx: exception pc
1718   __ pop(rax);
1719   __ verify_oop(rax);
1720   __ pop(rdx);
1721   __ jmp(rbx);
1722 }
1723 
1724 static void gen_continuation_yield(MacroAssembler* masm,
1725                                    const VMRegPair* regs,
1726                                    OopMapSet* oop_maps,
1727                                    int& frame_complete,
1728                                    int& stack_slots,
1729                                    int& compiled_entry_offset) {
1730   enum layout {
1731     rbp_off,
1732     rbpH_off,
1733     return_off,
1734     return_off2,
1735     framesize // inclusive of return address
1736   };
1737   stack_slots = framesize / VMRegImpl::slots_per_word;
1738   assert(stack_slots == 2, "recheck layout");
1739 
1740   address start = __ pc();
1741   compiled_entry_offset = __ pc() - start;
1742   __ enter();
1743   address the_pc = __ pc();
1744 
1745   frame_complete = the_pc - start;
1746 
1747   // This nop must be exactly at the PC we push into the frame info.
1748   // We use this nop for fast CodeBlob lookup, so associate the OopMap
1749   // with it right away.
1750   __ post_call_nop();
1751   OopMap* map = new OopMap(framesize, 1);
1752   oop_maps->add_gc_map(frame_complete, map);
1753 
1754   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1755   __ movptr(c_rarg0, r15_thread);
1756   __ movptr(c_rarg1, rsp);
1757   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1758   __ reset_last_Java_frame(true);
1759 
1760   Label L_pinned;
1761 
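       // freeze_entry() reports its result in rax: zero means the freeze succeeded and we
       // return through the continuation entry, while a non-zero value (e.g. the continuation
       // is pinned) makes us simply return that code to our caller below.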
1762   __ testptr(rax, rax);
1763   __ jcc(Assembler::notZero, L_pinned);
1764 
1765   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1766   __ continuation_enter_cleanup();
1767   __ pop(rbp);
1768   __ ret(0);
1769 
1770   __ bind(L_pinned);
1771 
1772   // Pinned, return to caller
1773   __ leave();
1774   __ ret(0);
1775 }
1776 
1777 static void gen_special_dispatch(MacroAssembler* masm,
1778                                  const methodHandle& method,
1779                                  const BasicType* sig_bt,
1780                                  const VMRegPair* regs) {
1781   verify_oop_args(masm, method, sig_bt, regs);
1782   vmIntrinsics::ID iid = method->intrinsic_id();
1783 
1784   // Now write the args into the outgoing interpreter space
1785   bool     has_receiver   = false;
1786   Register receiver_reg   = noreg;
1787   int      member_arg_pos = -1;
1788   Register member_reg     = noreg;
1789   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1790   if (ref_kind != 0) {
1791     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1792     member_reg = rbx;  // known to be free at this point
1793     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1794   } else if (iid == vmIntrinsics::_invokeBasic) {
1795     has_receiver = true;
1796   } else if (iid == vmIntrinsics::_linkToNative) {
1797     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1798     member_reg = rbx;  // known to be free at this point
1799   } else {
1800     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1801   }
1802 
1803   if (member_reg != noreg) {
1804     // Load the member_arg into register, if necessary.
1805     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1806     VMReg r = regs[member_arg_pos].first();
1807     if (r->is_stack()) {
1808       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1809     } else {
1810       // no data motion is needed
1811       member_reg = r->as_Register();
1812     }
1813   }
1814 
1815   if (has_receiver) {
1816     // Make sure the receiver is loaded into a register.
1817     assert(method->size_of_parameters() > 0, "oob");
1818     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1819     VMReg r = regs[0].first();
1820     assert(r->is_valid(), "bad receiver arg");
1821     if (r->is_stack()) {
1822       // Porting note:  This assumes that compiled calling conventions always
1823       // pass the receiver oop in a register.  If this is not true on some
1824       // platform, pick a temp and load the receiver from stack.
1825       fatal("receiver always in a register");
1826       receiver_reg = j_rarg0;  // known to be free at this point
1827       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1828     } else {
1829       // no data motion is needed
1830       receiver_reg = r->as_Register();
1831     }
1832   }
1833 
1834   // Figure out which address we are really jumping to:
1835   MethodHandles::generate_method_handle_dispatch(masm, iid,
1836                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1837 }
1838 
1839 // ---------------------------------------------------------------------------
1840 // Generate a native wrapper for a given method.  The method takes arguments
1841 // in the Java compiled code convention, marshals them to the native
1842 // convention (handlizes oops, etc), transitions to native, makes the call,
1843 // returns to java state (possibly blocking), unhandlizes any result and
1844 // returns.
1845 //
1846 // Critical native functions are a shorthand for the use of
1847 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1848 // functions.  The wrapper is expected to unpack the arguments before
1849 // passing them to the callee. Critical native functions leave the state _in_Java,
1850 // since they cannot stop for GC.
1851 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1852 // block and the check for pending exceptions, since it's impossible for them
1853 // to be thrown.
1854 //
1855 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1856                                                 const methodHandle& method,
1857                                                 int compile_id,
1858                                                 BasicType* in_sig_bt,
1859                                                 VMRegPair* in_regs,
1860                                                 BasicType ret_type) {
1861   if (method->is_continuation_native_intrinsic()) {
1862     int exception_offset = -1;
1863     OopMapSet* oop_maps = new OopMapSet();
1864     int frame_complete = -1;
1865     int stack_slots = -1;
1866     int interpreted_entry_offset = -1;
1867     int vep_offset = -1;
1868     if (method->is_continuation_enter_intrinsic()) {
1869       gen_continuation_enter(masm,
1870                              in_regs,
1871                              exception_offset,
1872                              oop_maps,
1873                              frame_complete,
1874                              stack_slots,
1875                              interpreted_entry_offset,
1876                              vep_offset);
1877     } else if (method->is_continuation_yield_intrinsic()) {
1878       gen_continuation_yield(masm,
1879                              in_regs,
1880                              oop_maps,
1881                              frame_complete,
1882                              stack_slots,
1883                              vep_offset);
1884     } else {
1885       guarantee(false, "Unknown Continuation native intrinsic");
1886     }
1887 
1888 #ifdef ASSERT
1889     if (method->is_continuation_enter_intrinsic()) {
1890       assert(interpreted_entry_offset != -1, "Must be set");
1891       assert(exception_offset != -1,         "Must be set");
1892     } else {
1893       assert(interpreted_entry_offset == -1, "Must be unset");
1894       assert(exception_offset == -1,         "Must be unset");
1895     }
1896     assert(frame_complete != -1,    "Must be set");
1897     assert(stack_slots != -1,       "Must be set");
1898     assert(vep_offset != -1,        "Must be set");
1899 #endif
1900 
1901     __ flush();
1902     nmethod* nm = nmethod::new_native_nmethod(method,
1903                                               compile_id,
1904                                               masm->code(),
1905                                               vep_offset,
1906                                               frame_complete,
1907                                               stack_slots,
1908                                               in_ByteSize(-1),
1909                                               in_ByteSize(-1),
1910                                               oop_maps,
1911                                               exception_offset);
1912     if (method->is_continuation_enter_intrinsic()) {
1913       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1914     } else if (method->is_continuation_yield_intrinsic()) {
1915       _cont_doYield_stub = nm;
1916     }
1917     return nm;
1918   }
1919 
1920   if (method->is_method_handle_intrinsic()) {
1921     vmIntrinsics::ID iid = method->intrinsic_id();
1922     intptr_t start = (intptr_t)__ pc();
1923     int vep_offset = ((intptr_t)__ pc()) - start;
1924     gen_special_dispatch(masm,
1925                          method,
1926                          in_sig_bt,
1927                          in_regs);
1928     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1929     __ flush();
1930     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1931     return nmethod::new_native_nmethod(method,
1932                                        compile_id,
1933                                        masm->code(),
1934                                        vep_offset,
1935                                        frame_complete,
1936                                        stack_slots / VMRegImpl::slots_per_word,
1937                                        in_ByteSize(-1),
1938                                        in_ByteSize(-1),
1939                                        (OopMapSet*)NULL);
1940   }
1941   address native_func = method->native_function();
1942   assert(native_func != NULL, "must have function");
1943 
1944   // An OopMap for lock (and class if static)
1945   OopMapSet *oop_maps = new OopMapSet();
1946   intptr_t start = (intptr_t)__ pc();
1947 
1948   // We have received a description of where all the java args are located
1949   // on entry to the wrapper. We need to convert these args to where
1950   // the jni function will expect them. To figure out where they go
1951   // we convert the java signature to a C signature by inserting
1952   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1953 
1954   const int total_in_args = method->size_of_parameters();
1955   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1956 
1957   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1958   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1959   BasicType* in_elem_bt = NULL;
1960 
1961   int argc = 0;
1962   out_sig_bt[argc++] = T_ADDRESS;
1963   if (method->is_static()) {
1964     out_sig_bt[argc++] = T_OBJECT;
1965   }
1966 
1967   for (int i = 0; i < total_in_args ; i++ ) {
1968     out_sig_bt[argc++] = in_sig_bt[i];
1969   }
1970 
1971   // Now figure out where the args must be stored and how much stack space
1972   // they require.
1973   int out_arg_slots;
1974   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1975 
1976   // Compute framesize for the wrapper.  We need to handlize all oops in
1977   // incoming registers
1978 
1979   // Calculate the total number of stack slots we will need.
1980 
1981   // First count the abi requirement plus all of the outgoing args
1982   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1983 
1984   // Now the space for the inbound oop handle area
1985   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1986 
1987   int oop_handle_offset = stack_slots;
1988   stack_slots += total_save_slots;
1989 
1990   // Now any space we need for handlizing a klass if static method
1991 
1992   int klass_slot_offset = 0;
1993   int klass_offset = -1;
1994   int lock_slot_offset = 0;
1995   bool is_static = false;
1996 
1997   if (method->is_static()) {
1998     klass_slot_offset = stack_slots;
1999     stack_slots += VMRegImpl::slots_per_word;
2000     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2001     is_static = true;
2002   }
2003 
2004   // Plus a lock if needed
2005 
2006   if (method->is_synchronized()) {
2007     lock_slot_offset = stack_slots;
2008     stack_slots += VMRegImpl::slots_per_word;
2009   }
2010 
2011   // Now a place (+2) to save return values or temp during shuffling
2012   // + 4 for return address (which we own) and saved rbp
2013   stack_slots += 6;
2014 
2015   // Ok The space we have allocated will look like:
2016   //
2017   //
2018   // FP-> |                     |
2019   //      |---------------------|
2020   //      | 2 slots for moves   |
2021   //      |---------------------|
2022   //      | lock box (if sync)  |
2023   //      |---------------------| <- lock_slot_offset
2024   //      | klass (if static)   |
2025   //      |---------------------| <- klass_slot_offset
2026   //      | oopHandle area      |
2027   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2028   //      | outbound memory     |
2029   //      | based arguments     |
2030   //      |                     |
2031   //      |---------------------|
2032   //      |                     |
2033   // SP-> | out_preserved_slots |
2034   //
2035   //
2036 
2037 
2038   // Now compute the actual number of stack words we need, rounding to keep the
2039   // stack properly aligned.
2040   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
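       // (Stack slots are 4 bytes, so rounding the slot count up to StackAlignmentInSlots
       // keeps the frame a multiple of the ABI-required 16-byte stack alignment.)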
2041 
2042   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2043 
2044   // First thing make an ic check to see if we should even be here
2045 
2046   // We are free to use all registers as temps without saving them and
2047   // restoring them except rbp. rbp is the only callee save register
2048   // as far as the interpreter and the compiler(s) are concerned.
2049 
2050 
2051   const Register ic_reg = rax;
2052   const Register receiver = j_rarg0;
2053 
2054   Label hit;
2055   Label exception_pending;
2056 
2057   assert_different_registers(ic_reg, receiver, rscratch1, rscratch2);
2058   __ verify_oop(receiver);
2059   __ load_klass(rscratch1, receiver, rscratch2);
2060   __ cmpq(ic_reg, rscratch1);
2061   __ jcc(Assembler::equal, hit);
2062 
2063   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
2064 
2065   // Verified entry point must be aligned
2066   __ align(8);
2067 
2068   __ bind(hit);
2069 
2070   int vep_offset = ((intptr_t)__ pc()) - start;
2071 
2072   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2073     Label L_skip_barrier;
2074     Register klass = r10;
2075     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2076     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2077 
2078     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2079 
2080     __ bind(L_skip_barrier);
2081   }
2082 
2083 #ifdef COMPILER1
2084   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2085   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2086     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2087   }
2088 #endif // COMPILER1
2089 
2090   // The instruction at the verified entry point must be 5 bytes or longer
2091   // because it can be patched on the fly by make_non_entrant. The stack bang
2092   // instruction fits that requirement.
2093 
2094   // Generate stack overflow check
2095   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2096 
2097   // Generate a new frame for the wrapper.
2098   __ enter();
2099   // -2 because return address is already present and so is saved rbp
2100   __ subptr(rsp, stack_size - 2*wordSize);
2101 
2102   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2103   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2104   bs->nmethod_entry_barrier(masm, NULL /* slow_path */, NULL /* continuation */);
2105 
2106   // Frame is now completed as far as size and linkage.
2107   int frame_complete = ((intptr_t)__ pc()) - start;
2108 
2109   if (UseRTMLocking) {
2110     // Abort RTM transaction before calling JNI
2111     // because critical section will be large and will be
2112     // aborted anyway. Also nmethod could be deoptimized.
2113     __ xabort(0);
2114   }
2115 
2116 #ifdef ASSERT
2117   __ check_stack_alignment(rsp, "improperly aligned stack");
2118 #endif /* ASSERT */
2119 
2120 
2121   // We use r14 as the oop handle for the receiver/klass
2122   // It is callee save so it survives the call to native
2123 
2124   const Register oop_handle_reg = r14;
2125 
2126   //
2127   // We immediately shuffle the arguments so that for any vm call we have to
2128   // make from here on out (sync slow path, jvmti, etc.) we will have
2129   // captured the oops from our caller and have a valid oopMap for
2130   // them.
2131 
2132   // -----------------
2133   // The Grand Shuffle
2134 
2135   // The Java calling convention is either equal (linux) or denser (win64) than the
2136   // C calling convention. However, because of the jni_env argument the C calling
2137   // convention always has at least one more argument (and two for static) than Java.
2138   // Therefore if we move the args from java -> c backwards then we will never have
2139   // a register->register conflict and we don't have to build a dependency graph
2140   // and figure out how to break any cycles.
2141   //
2142 
2143   // Record esp-based slot for receiver on stack for non-static methods
2144   int receiver_offset = -1;
2145 
2146   // This is a trick. We double the stack slots so we can claim
2147   // the oops in the caller's frame. Since we are sure to have
2148   // more args than the caller, doubling is enough to make
2149   // sure we can capture all the incoming oop args from the
2150   // caller.
2151   //
2152   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2153 
2154   // Mark location of rbp (someday)
2155   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2156 
2157   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2158   // All inbound args are referenced based on rbp and all outbound args via rsp.
2159 
2160 
2161 #ifdef ASSERT
2162   bool reg_destroyed[Register::number_of_registers];
2163   bool freg_destroyed[XMMRegister::number_of_registers];
2164   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2165     reg_destroyed[r] = false;
2166   }
2167   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2168     freg_destroyed[f] = false;
2169   }
2170 
2171 #endif /* ASSERT */
2172 
2173   // For JNI natives the incoming and outgoing registers are offset upwards.
2174   GrowableArray<int> arg_order(2 * total_in_args);
2175 
2176   VMRegPair tmp_vmreg;
2177   tmp_vmreg.set2(rbx->as_VMReg());
2178 
2179   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2180     arg_order.push(i);
2181     arg_order.push(c_arg);
2182   }
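       // arg_order now holds (java_index, c_index) pairs from the last argument down to the
       // first, so the shuffle loop below processes the args backwards as described above.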
2183 
2184   int temploc = -1;
2185   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2186     int i = arg_order.at(ai);
2187     int c_arg = arg_order.at(ai + 1);
2188     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2189 #ifdef ASSERT
2190     if (in_regs[i].first()->is_Register()) {
2191       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2192     } else if (in_regs[i].first()->is_XMMRegister()) {
2193       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2194     }
2195     if (out_regs[c_arg].first()->is_Register()) {
2196       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2197     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2198       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2199     }
2200 #endif /* ASSERT */
2201     switch (in_sig_bt[i]) {
2202       case T_ARRAY:
2203       case T_PRIMITIVE_OBJECT:
2204       case T_OBJECT:
2205         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2206                     ((i == 0) && (!is_static)),
2207                     &receiver_offset);
2208         break;
2209       case T_VOID:
2210         break;
2211 
2212       case T_FLOAT:
2213         __ float_move(in_regs[i], out_regs[c_arg]);
2214         break;
2215 
2216       case T_DOUBLE:
2217         assert( i + 1 < total_in_args &&
2218                 in_sig_bt[i + 1] == T_VOID &&
2219                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2220         __ double_move(in_regs[i], out_regs[c_arg]);
2221         break;
2222 
2223       case T_LONG :
2224         __ long_move(in_regs[i], out_regs[c_arg]);
2225         break;
2226 
2227       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2228 
2229       default:
2230         __ move32_64(in_regs[i], out_regs[c_arg]);
2231     }
2232   }
2233 
2234   int c_arg;
2235 
2236   // Pre-load a static method's oop into r14.  Used both by locking code and
2237   // the normal JNI call code.
2238   // point c_arg at the first arg that is already loaded in case we
2239   // need to spill before we call out
2240   c_arg = total_c_args - total_in_args;
2241 
2242   if (method->is_static()) {
2243 
2244     //  load oop into a register
2245     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2246 
2247     // Now handlize the static class mirror; it's known not-null.
2248     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2249     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2250 
2251     // Now get the handle
2252     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2253     // store the klass handle as second argument
2254     __ movptr(c_rarg1, oop_handle_reg);
2255     // and protect the arg if we must spill
2256     c_arg--;
2257   }
2258 
2259   // Change state to native (we save the return address in the thread, since it might not
2260   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2261   // points into the right code segment. It does not have to be the correct return pc.
2262   // We use the same pc/oopMap repeatedly when we call out
2263 
2264   intptr_t the_pc = (intptr_t) __ pc();
2265   oop_maps->add_gc_map(the_pc - start, map);
2266 
2267   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2268 
2269 
2270   // We have all of the arguments set up at this point. We must not touch any of the
2271   // argument registers at this point (what if we save/restore them? there are no oop maps for them).
2272 
2273   {
2274     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2275     // protect the args we've loaded
2276     save_args(masm, total_c_args, c_arg, out_regs);
2277     __ mov_metadata(c_rarg1, method());
2278     __ call_VM_leaf(
2279       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2280       r15_thread, c_rarg1);
2281     restore_args(masm, total_c_args, c_arg, out_regs);
2282   }
2283 
2284   // RedefineClasses() tracing support for obsolete method entry
2285   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2286     // protect the args we've loaded
2287     save_args(masm, total_c_args, c_arg, out_regs);
2288     __ mov_metadata(c_rarg1, method());
2289     __ call_VM_leaf(
2290       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2291       r15_thread, c_rarg1);
2292     restore_args(masm, total_c_args, c_arg, out_regs);
2293   }
2294 
2295   // Lock a synchronized method
2296 
2297   // Register definitions used by locking and unlocking
2298 
2299   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2300   const Register obj_reg  = rbx;  // Will contain the oop
2301   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2302   const Register old_hdr  = r13;  // value of old header at unlock time
2303 
2304   Label slow_path_lock;
2305   Label lock_done;
2306 
2307   if (method->is_synchronized()) {
2308     Label count_mon;
2309 
2310     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2311 
2312     // Get the handle (the 2nd argument)
2313     __ mov(oop_handle_reg, c_rarg1);
2314 
2315     // Get address of the box
2316 
2317     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2318 
2319     // Load the oop from the handle
2320     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2321 
2322     if (!UseHeavyMonitors) {
2323 
2324       // Load immediate 1 into swap_reg %rax
2325       __ movl(swap_reg, 1);
2326 
2327       // Load (object->mark() | 1) into swap_reg %rax
2328       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2329       if (EnableValhalla) {
2330         // Mask inline_type bit such that we go to the slow path if object is an inline type
2331         __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place));
2332       }
2333 
2334       // Save (object->mark() | 1) into BasicLock's displaced header
2335       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2336 
2337       // src -> dest iff dest == rax else rax <- dest
2338       __ lock();
2339       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2340       __ jcc(Assembler::equal, count_mon);
2341 
2342       // Hmm should this move to the slow path code area???
2343 
2344       // Test if the oopMark is an obvious stack pointer, i.e.,
2345       //  1) (mark & 3) == 0, and
2346       //  2) rsp <= mark < mark + os::pagesize()
2347       // These 3 tests can be done by evaluating the following
2348       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2349       // assuming both stack pointer and pagesize have their
2350       // least significant 2 bits clear.
2351       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2352 
2353       __ subptr(swap_reg, rsp);
2354       __ andptr(swap_reg, 3 - os::vm_page_size());
2355 
2356       // Save the test result, for recursive case, the result is zero
2357       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2358       __ jcc(Assembler::notEqual, slow_path_lock);
2359     } else {
2360       __ jmp(slow_path_lock);
2361     }
2362     __ bind(count_mon);
2363     __ inc_held_monitor_count();
2364 
2365     // Slow path will re-enter here
2366     __ bind(lock_done);
2367   }
2368 
2369   // Finally just about ready to make the JNI call
2370 
2371   // get JNIEnv* which is first argument to native
2372   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
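       // (The JNIEnv* for this thread is simply the address of its jni_environment field,
       // so a lea is all that is needed to produce it.)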
2373 
2374   // Now set thread in native
2375   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2376 
2377   __ call(RuntimeAddress(native_func));
2378 
2379   // Verify or restore cpu control state after JNI call
2380   __ restore_cpu_control_state_after_jni(rscratch1);
2381 
2382   // Unpack native results.
2383   switch (ret_type) {
2384   case T_BOOLEAN: __ c2bool(rax);            break;
2385   case T_CHAR   : __ movzwl(rax, rax);      break;
2386   case T_BYTE   : __ sign_extend_byte (rax); break;
2387   case T_SHORT  : __ sign_extend_short(rax); break;
2388   case T_INT    : /* nothing to do */        break;
2389   case T_DOUBLE :
2390   case T_FLOAT  :
2391     // Result is in xmm0 we'll save as needed
2392     break;
2393   case T_ARRAY:                 // Really a handle
2394   case T_PRIMITIVE_OBJECT:           // Really a handle
2395   case T_OBJECT:                // Really a handle
2396       break; // can't de-handlize until after safepoint check
2397   case T_VOID: break;
2398   case T_LONG: break;
2399   default       : ShouldNotReachHere();
2400   }
2401 
2402   Label after_transition;
2403 
2404   // Switch thread to "native transition" state before reading the synchronization state.
2405   // This additional state is necessary because reading and testing the synchronization
2406   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2407   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2408   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2409   //     Thread A is resumed to finish this native method, but doesn't block here since it
2410   //     didn't see any synchronization in progress, and escapes.
2411   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2412 
2413   // Force this write out before the read below
2414   if (!UseSystemMemoryBarrier) {
2415     __ membar(Assembler::Membar_mask_bits(
2416               Assembler::LoadLoad | Assembler::LoadStore |
2417               Assembler::StoreLoad | Assembler::StoreStore));
2418   }
2419 
2420   // check for safepoint operation in progress and/or pending suspend requests
2421   {
2422     Label Continue;
2423     Label slow_path;
2424 
2425     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2426 
2427     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2428     __ jcc(Assembler::equal, Continue);
2429     __ bind(slow_path);
2430 
2431     // Don't use call_VM as it will see a possible pending exception and forward it
2432     // and never return here preventing us from clearing _last_native_pc down below.
2433     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2434     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2435     // by hand.
2436     //
2437     __ vzeroupper();
2438     save_native_result(masm, ret_type, stack_slots);
2439     __ mov(c_rarg0, r15_thread);
2440     __ mov(r12, rsp); // remember sp
2441     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2442     __ andptr(rsp, -16); // align stack as required by ABI
2443     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2444     __ mov(rsp, r12); // restore sp
2445     __ reinit_heapbase();
2446     // Restore any method result value
2447     restore_native_result(masm, ret_type, stack_slots);
2448     __ bind(Continue);
2449   }
2450 
2451   // change thread state
2452   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2453   __ bind(after_transition);
2454 
2455   Label reguard;
2456   Label reguard_done;
2457   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2458   __ jcc(Assembler::equal, reguard);
2459   __ bind(reguard_done);
2460 
2461   // native result if any is live
2462 
2463   // Unlock
2464   Label slow_path_unlock;
2465   Label unlock_done;
2466   if (method->is_synchronized()) {
2467 
2468     Label fast_done;
2469 
2470     // Get locked oop from the handle we passed to jni
2471     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2472 
2473     if (!UseHeavyMonitors) {
2474       Label not_recur;
2475       // Simple recursive lock?
2476       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2477       __ jcc(Assembler::notEqual, not_recur);
2478       __ dec_held_monitor_count();
2479       __ jmpb(fast_done);
2480       __ bind(not_recur);
2481     }
2482 
2483     // Must save rax if it is live now because cmpxchg must use it
2484     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2485       save_native_result(masm, ret_type, stack_slots);
2486     }
2487 
2488     if (!UseHeavyMonitors) {
2489       // get address of the stack lock
2490       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2491       //  get old displaced header
2492       __ movptr(old_hdr, Address(rax, 0));
2493 
2494       // Atomic swap old header if oop still contains the stack lock
2495       __ lock();
2496       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2497       __ jcc(Assembler::notEqual, slow_path_unlock);
2498       __ dec_held_monitor_count();
2499     } else {
2500       __ jmp(slow_path_unlock);
2501     }
2502 
2503     // slow path re-enters here
2504     __ bind(unlock_done);
2505     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2506       restore_native_result(masm, ret_type, stack_slots);
2507     }
2508 
2509     __ bind(fast_done);
2510   }
2511   {
2512     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2513     save_native_result(masm, ret_type, stack_slots);
2514     __ mov_metadata(c_rarg1, method());
2515     __ call_VM_leaf(
2516          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2517          r15_thread, c_rarg1);
2518     restore_native_result(masm, ret_type, stack_slots);
2519   }
2520 
2521   __ reset_last_Java_frame(false);
2522 
2523   // Unbox oop result, e.g. JNIHandles::resolve value.
2524   if (is_reference_type(ret_type)) {
2525     __ resolve_jobject(rax /* value */,
2526                        r15_thread /* thread */,
2527                        rcx /* tmp */);
2528   }
2529 
2530   if (CheckJNICalls) {
2531     // clear_pending_jni_exception_check
2532     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2533   }
2534 
2535   // reset handle block
2536   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2537   __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), NULL_WORD);
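       // Resetting the top of the active JNIHandleBlock ensures that local JNI handles
       // created during the native call are no longer considered live.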
2538 
2539   // pop our frame
2540 
2541   __ leave();
2542 
2543   // Any exception pending?
2544   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2545   __ jcc(Assembler::notEqual, exception_pending);
2546 
2547   // Return
2548 
2549   __ ret(0);
2550 
2551   // Unexpected paths are out of line and go here
2552 
2553   // forward the exception
2554   __ bind(exception_pending);
2555 
2556   // and forward the exception
2557   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2558 
2559   // Slow path locking & unlocking
2560   if (method->is_synchronized()) {
2561 
2562     // BEGIN Slow path lock
2563     __ bind(slow_path_lock);
2564 
2565     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2566     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2567 
2568     // protect the args we've loaded
2569     save_args(masm, total_c_args, c_arg, out_regs);
2570 
2571     __ mov(c_rarg0, obj_reg);
2572     __ mov(c_rarg1, lock_reg);
2573     __ mov(c_rarg2, r15_thread);
2574 
2575     // Not a leaf but we have last_Java_frame setup as we want
2576     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2577     restore_args(masm, total_c_args, c_arg, out_regs);
2578 
2579 #ifdef ASSERT
2580     { Label L;
2581     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2582     __ jcc(Assembler::equal, L);
2583     __ stop("no pending exception allowed on exit from monitorenter");
2584     __ bind(L);
2585     }
2586 #endif
2587     __ jmp(lock_done);
2588 
2589     // END Slow path lock
2590 
2591     // BEGIN Slow path unlock
2592     __ bind(slow_path_unlock);
2593 
2594     // If we haven't already saved the native result we must save it now as xmm registers
2595     // are still exposed.
2596     __ vzeroupper();
2597     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2598       save_native_result(masm, ret_type, stack_slots);
2599     }
2600 
2601     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2602 
2603     __ mov(c_rarg0, obj_reg);
2604     __ mov(c_rarg2, r15_thread);
2605     __ mov(r12, rsp); // remember sp
2606     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2607     __ andptr(rsp, -16); // align stack as required by ABI
2608 
2609     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2610     // NOTE that obj_reg == rbx currently
2611     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2612     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2613 
2614     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2615     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2616     __ mov(rsp, r12); // restore sp
2617     __ reinit_heapbase();
2618 #ifdef ASSERT
2619     {
2620       Label L;
2621       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2622       __ jcc(Assembler::equal, L);
2623       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2624       __ bind(L);
2625     }
2626 #endif /* ASSERT */
2627 
2628     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2629 
2630     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2631       restore_native_result(masm, ret_type, stack_slots);
2632     }
2633     __ jmp(unlock_done);
2634 
2635     // END Slow path unlock
2636 
2637   } // synchronized
2638 
2639   // SLOW PATH Reguard the stack if needed
2640 
2641   __ bind(reguard);
2642   __ vzeroupper();
2643   save_native_result(masm, ret_type, stack_slots);
2644   __ mov(r12, rsp); // remember sp
2645   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2646   __ andptr(rsp, -16); // align stack as required by ABI
2647   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2648   __ mov(rsp, r12); // restore sp
2649   __ reinit_heapbase();
2650   restore_native_result(masm, ret_type, stack_slots);
2651   // and continue
2652   __ jmp(reguard_done);
2653 
2654 
2655 
2656   __ flush();
2657 
2658   nmethod *nm = nmethod::new_native_nmethod(method,
2659                                             compile_id,
2660                                             masm->code(),
2661                                             vep_offset,
2662                                             frame_complete,
2663                                             stack_slots / VMRegImpl::slots_per_word,
2664                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2665                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2666                                             oop_maps);
2667 
2668   return nm;
2669 }
2670 
2671 // This function returns the adjustment size (in number of words) to a c2i adapter
2672 // activation for use during deoptimization
2673 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2674   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2675 }
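// Worked example (added commentary): with Interpreter::stackElementWords == 1
// on amd64, a callee with 2 parameters and 5 locals needs an adjustment of
// (5 - 2) * 1 == 3 words when its interpreter activation is rebuilt.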
2676 
2677 
2678 uint SharedRuntime::out_preserve_stack_slots() {
2679   return 0;
2680 }
2681 
2682 
2683 // Number of stack slots between incoming argument block and the start of
2684 // a new frame.  The PROLOG must add this many slots to the stack.  The
2685 // EPILOG must remove this many slots.  amd64 needs two slots for
2686 // return address.
2687 uint SharedRuntime::in_preserve_stack_slots() {
2688   return 4 + 2 * VerifyStackAtCalls;
2689 }
2690 
2691 //------------------------------generate_deopt_blob----------------------------
2692 void SharedRuntime::generate_deopt_blob() {
2693   // Allocate space for the code
2694   ResourceMark rm;
2695   // Setup code generation tools
2696   int pad = 0;
2697   if (UseAVX > 2) {
2698     pad += 1024;
2699   }
2700 #if INCLUDE_JVMCI
2701   if (EnableJVMCI) {
2702     pad += 512; // Increase the buffer size when compiling for JVMCI
2703   }
2704 #endif
2705   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2706   MacroAssembler* masm = new MacroAssembler(&buffer);
2707   int frame_size_in_words;
2708   OopMap* map = NULL;
2709   OopMapSet *oop_maps = new OopMapSet();
2710 
2711   // -------------
2712   // This code enters when returning to a de-optimized nmethod.  A return
2713   // address has been pushed on the stack, and return values are in
2714   // registers.
2715   // If we are doing a normal deopt then we were called from the patched
2716   // nmethod from the point we returned to the nmethod. So the return
2717   // address on the stack is wrong by NativeCall::instruction_size
2718   // We will adjust the value so it looks like we have the original return
2719   // address on the stack (like when we eagerly deoptimized).
2720   // In the case of an exception pending when deoptimizing, we enter
2721   // with a return address on the stack that points after the call we patched
2722   // into the exception handler. We have the following register state from,
2723   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2724   //    rax: exception oop
2725   //    rbx: exception handler
2726   //    rdx: throwing pc
2727   // So in this case we simply jam rdx into the useless return address and
2728   // the stack looks just like we want.
2729   //
2730   // At this point we need to de-opt.  We save the argument return
2731   // registers.  We call the first C routine, fetch_unroll_info().  This
2732   // routine captures the return values and returns a structure which
2733   // describes the current frame size and the sizes of all replacement frames.
2734   // The current frame is compiled code and may contain many inlined
2735   // functions, each with their own JVM state.  We pop the current frame, then
2736   // push all the new frames.  Then we call the C routine unpack_frames() to
2737   // populate these frames.  Finally unpack_frames() returns us the new target
2738   // address.  Notice that callee-save registers are BLOWN here; they have
2739   // already been captured in the vframeArray at the time the return PC was
2740   // patched.
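  // Hedged outline (illustration only, not real code) of the sequence the
  // blob below emits, using the names of the calls actually made further down:
  //
  //   save_live_registers();
  //   UnrollBlock* info = Deoptimization::fetch_unroll_info(thread, exec_mode);
  //   restore_result_registers();
  //   pop_frame(info->size_of_deoptimized_frame());
  //   for (each replacement frame)
  //     push_skeletal_interpreter_frame(size, pc);
  //   Deoptimization::unpack_frames(thread, exec_mode);  // fills in the frames
  //   return;                                            // resumes in the interpreter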
2741   address start = __ pc();
2742   Label cont;
2743 
2744   // Prolog for the non-exception case!
2745 
2746   // Save everything in sight.
2747   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2748 
2749   // Normal deoptimization.  Save exec mode for unpack_frames.
2750   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2751   __ jmp(cont);
2752 
2753   int reexecute_offset = __ pc() - start;
2754 #if INCLUDE_JVMCI && !defined(COMPILER1)
2755   if (EnableJVMCI && UseJVMCICompiler) {
2756     // JVMCI does not use this kind of deoptimization
2757     __ should_not_reach_here();
2758   }
2759 #endif
2760 
2761   // Reexecute case
2762   // The return address is the pc that describes what bci to re-execute at
2763 
2764   // No need to update map as each call to save_live_registers will produce an identical oopmap
2765   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2766 
2767   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2768   __ jmp(cont);
2769 
2770 #if INCLUDE_JVMCI
2771   Label after_fetch_unroll_info_call;
2772   int implicit_exception_uncommon_trap_offset = 0;
2773   int uncommon_trap_offset = 0;
2774 
2775   if (EnableJVMCI) {
2776     implicit_exception_uncommon_trap_offset = __ pc() - start;
2777 
2778     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2779     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2780 
2781     uncommon_trap_offset = __ pc() - start;
2782 
2783     // Save everything in sight.
2784     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2785     // fetch_unroll_info needs to call last_java_frame()
2786     __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
2787 
2788     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2789     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2790 
2791     __ movl(r14, Deoptimization::Unpack_reexecute);
2792     __ mov(c_rarg0, r15_thread);
2793     __ movl(c_rarg2, r14); // exec mode
2794     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2795     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2796 
2797     __ reset_last_Java_frame(false);
2798 
2799     __ jmp(after_fetch_unroll_info_call);
2800   } // EnableJVMCI
2801 #endif // INCLUDE_JVMCI
2802 
2803   int exception_offset = __ pc() - start;
2804 
2805   // Prolog for exception case
2806 
2807   // All registers are dead at this entry point, except for rax and
2808   // rdx, which contain the exception oop and exception pc
2809   // respectively.  Set them in TLS and fall thru to the
2810   // unpack_with_exception_in_tls entry point.
2811 
2812   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2813   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2814 
2815   int exception_in_tls_offset = __ pc() - start;
2816 
2817   // new implementation because exception oop is now passed in JavaThread
2818 
2819   // Prolog for exception case
2820   // All registers must be preserved because they might be used by LinearScan
2821   // Exception oop and throwing PC are passed in JavaThread
2822   // tos: stack at point of call to method that threw the exception (i.e. only
2823   // args are on the stack, no return address)
2824 
2825   // make room on stack for the return address
2826   // It will be patched later with the throwing pc. The correct value is not
2827   // available now because loading it from memory would destroy registers.
2828   __ push(0);
2829 
2830   // Save everything in sight.
2831   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2832 
2833   // Now it is safe to overwrite any register
2834 
2835   // Deopt during an exception.  Save exec mode for unpack_frames.
2836   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2837 
2838   // load throwing pc from JavaThread and patch it as the return address
2839   // of the current frame. Then clear the field in JavaThread
2840 
2841   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2842   __ movptr(Address(rbp, wordSize), rdx);
2843   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2844 
2845 #ifdef ASSERT
2846   // verify that there is really an exception oop in JavaThread
2847   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2848   __ verify_oop(rax);
2849 
2850   // verify that there is no pending exception
2851   Label no_pending_exception;
2852   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2853   __ testptr(rax, rax);
2854   __ jcc(Assembler::zero, no_pending_exception);
2855   __ stop("must not have pending exception here");
2856   __ bind(no_pending_exception);
2857 #endif
2858 
2859   __ bind(cont);
2860 
2861   // Call C code.  Need thread and this frame, but NOT official VM entry
2862   // crud.  We cannot block on this call, no GC can happen.
2863   //
2864   // UnrollBlock* fetch_unroll_info(JavaThread* thread, int exec_mode)
2865 
2866   // fetch_unroll_info needs to call last_java_frame().
2867 
2868   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
2869 #ifdef ASSERT
2870   { Label L;
2871     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2872     __ jcc(Assembler::equal, L);
2873     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2874     __ bind(L);
2875   }
2876 #endif // ASSERT
2877   __ mov(c_rarg0, r15_thread);
2878   __ movl(c_rarg1, r14); // exec_mode
2879   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2880 
2881   // Need to have an oopmap that tells fetch_unroll_info where to
2882   // find any register it might need.
2883   oop_maps->add_gc_map(__ pc() - start, map);
2884 
2885   __ reset_last_Java_frame(false);
2886 
2887 #if INCLUDE_JVMCI
2888   if (EnableJVMCI) {
2889     __ bind(after_fetch_unroll_info_call);
2890   }
2891 #endif
2892 
2893   // Load UnrollBlock* into rdi
2894   __ mov(rdi, rax);
2895 
2896   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2897   Label noException;
2898   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2899   __ jcc(Assembler::notEqual, noException);
2900   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2901   // QQQ this is useless; it was NULL above
2902   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2903   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2904   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2905 
2906   __ verify_oop(rax);
2907 
2908   // Overwrite the result registers with the exception results.
2909   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2910   // I think this is useless
2911   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2912 
2913   __ bind(noException);
2914 
2915   // Only register save data is on the stack.
2916   // Now restore the result registers.  Everything else is either dead
2917   // or captured in the vframeArray.
2918   RegisterSaver::restore_result_registers(masm);
2919 
2920   // All of the register save area has been popped off the stack. Only the
2921   // return address remains.
2922 
2923   // Pop all the frames we must move/replace.
2924   //
2925   // Frame picture (youngest to oldest)
2926   // 1: self-frame (no frame link)
2927   // 2: deopting frame  (no frame link)
2928   // 3: caller of deopting frame (could be compiled/interpreted).
2929   //
2930   // Note: by leaving the return address of self-frame on the stack
2931   // and using the size of frame 2 to adjust the stack
2932   // when we are done the return to frame 3 will still be on the stack.
2933 
2934   // Pop deoptimized frame
2935   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2936   __ addptr(rsp, rcx);
2937 
2938   // rsp should be pointing at the return address to the caller (3)
2939 
2940   // Pick up the initial fp we should save
2941   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2942   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2943 
2944 #ifdef ASSERT
2945   // Compilers generate code that bangs the stack by as much as the
2946   // interpreter would need. So this stack banging should never
2947   // trigger a fault. Verify that it does not on non-product builds.
2948   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2949   __ bang_stack_size(rbx, rcx);
2950 #endif
2951 
2952   // Load address of array of frame pcs into rcx
2953   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2954 
2955   // Trash the old pc
2956   __ addptr(rsp, wordSize);
2957 
2958   // Load address of array of frame sizes into rsi
2959   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2960 
2961   // Load counter into rdx
2962   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2963 
2964   // Now adjust the caller's stack to make up for the extra locals, but
2965   // record the original sp so that we can save it in the skeletal interpreter
2966   // frame; that way the stack walking of interpreter_sender will get the unextended sp
2967   // value and not the "real" sp value.
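  // Rough C-level sketch (illustration only) of the skeletal-frame loop that
  // follows; the sizes, pcs and counts come from the UnrollBlock loaded above:
  //
  //   sp -= caller_adjustment;                   // room for extra locals
  //   for (int k = 0; k < number_of_frames; k++) {
  //     push(frame_pcs[k]);                      // return address
  //     push(rbp); rbp = sp;                     // enter()
  //     sp -= frame_sizes[k] - 2 * wordSize;     // body of the skeletal frame
  //     rbp[interpreter_frame_last_sp_offset]   = NULL;       // fixed up later
  //     rbp[interpreter_frame_sender_sp_offset] = sender_sp;  // make it walkable
  //     sender_sp = sp;
  //   }
  //   push(frame_pcs[number_of_frames]);         // final return address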
2968 
2969   const Register sender_sp = r8;
2970 
2971   __ mov(sender_sp, rsp);
2972   __ movl(rbx, Address(rdi,
2973                        Deoptimization::UnrollBlock::
2974                        caller_adjustment_offset_in_bytes()));
2975   __ subptr(rsp, rbx);
2976 
2977   // Push interpreter frames in a loop
2978   Label loop;
2979   __ bind(loop);
2980   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2981   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
2982   __ pushptr(Address(rcx, 0));          // Save return address
2983   __ enter();                           // Save old & set new ebp
2984   __ subptr(rsp, rbx);                  // Prolog
2985   // This value is corrected by layout_activation_impl
2986   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2987   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2988   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2989   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2990   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2991   __ decrementl(rdx);                   // Decrement counter
2992   __ jcc(Assembler::notZero, loop);
2993   __ pushptr(Address(rcx, 0));          // Save final return address
2994 
2995   // Re-push self-frame
2996   __ enter();                           // Save old & set new ebp
2997 
2998   // Allocate a full sized register save area.
2999   // Return address and rbp are in place, so we allocate two less words.
3000   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3001 
3002   // Restore frame locals after moving the frame
3003   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3004   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3005 
3006   // Call C code.  Need thread but NOT official VM entry
3007   // crud.  We cannot block on this call, no GC can happen.  Call should
3008   // restore return values to their stack-slots with the new SP.
3009   //
3010   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3011 
3012   // Use rbp because the frames look interpreted now
3013   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3014   // Don't need the precise return PC here, just precise enough to point into this code blob.
3015   address the_pc = __ pc();
3016   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3017 
3018   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3019   __ mov(c_rarg0, r15_thread);
3020   __ movl(c_rarg1, r14); // second arg: exec_mode
3021   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3022   // Revert SP alignment after call since we're going to do some SP relative addressing below
3023   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3024 
3025   // Set an oopmap for the call site
3026   // Use the same PC we used for the last java frame
3027   oop_maps->add_gc_map(the_pc - start,
3028                        new OopMap( frame_size_in_words, 0 ));
3029 
3030   // Clear fp AND pc
3031   __ reset_last_Java_frame(true);
3032 
3033   // Collect return values
3034   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3035   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3036   // I think this is useless (throwing pc?)
3037   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3038 
3039   // Pop self-frame.
3040   __ leave();                           // Epilog
3041 
3042   // Jump to interpreter
3043   __ ret(0);
3044 
3045   // Make sure all code is generated
3046   masm->flush();
3047 
3048   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3049   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3050 #if INCLUDE_JVMCI
3051   if (EnableJVMCI) {
3052     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3053     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3054   }
3055 #endif
3056 }
3057 
3058 #ifdef COMPILER2
3059 //------------------------------generate_uncommon_trap_blob--------------------
3060 void SharedRuntime::generate_uncommon_trap_blob() {
3061   // Allocate space for the code
3062   ResourceMark rm;
3063   // Setup code generation tools
3064   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
3065   MacroAssembler* masm = new MacroAssembler(&buffer);
3066 
3067   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3068 
3069   address start = __ pc();
3070 
3071   if (UseRTMLocking) {
3072     // Abort RTM transaction before possible nmethod deoptimization.
3073     __ xabort(0);
3074   }
3075 
3076   // Push self-frame.  We get here with a return address on the
3077   // stack, so rsp is 8-byte aligned until we allocate our frame.
3078   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
3079 
3080   // No callee saved registers. rbp is assumed implicitly saved
3081   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3082 
3083   // The compiler left unloaded_class_index in j_rarg0; move it to where the
3084   // runtime expects it.
3085   __ movl(c_rarg1, j_rarg0);
3086 
3087   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
3088 
3089   // Call C code.  Need thread but NOT official VM entry
3090   // crud.  We cannot block on this call, no GC can happen.  Call should
3091   // capture callee-saved registers as well as return values.
3092   // Thread is in rdi already.
3093   //
3094   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
3095 
3096   __ mov(c_rarg0, r15_thread);
3097   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
3098   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3099 
3100   // Set an oopmap for the call site
3101   OopMapSet* oop_maps = new OopMapSet();
3102   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
3103 
3104   // location of rbp is known implicitly by the frame sender code
3105 
3106   oop_maps->add_gc_map(__ pc() - start, map);
3107 
3108   __ reset_last_Java_frame(false);
3109 
3110   // Load UnrollBlock* into rdi
3111   __ mov(rdi, rax);
3112 
3113 #ifdef ASSERT
3114   { Label L;
3115     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
3116               Deoptimization::Unpack_uncommon_trap);
3117     __ jcc(Assembler::equal, L);
3118     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
3119     __ bind(L);
3120   }
3121 #endif
3122 
3123   // Pop all the frames we must move/replace.
3124   //
3125   // Frame picture (youngest to oldest)
3126   // 1: self-frame (no frame link)
3127   // 2: deopting frame  (no frame link)
3128   // 3: caller of deopting frame (could be compiled/interpreted).
3129 
3130   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3131   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3132 
3133   // Pop deoptimized frame (int)
3134   __ movl(rcx, Address(rdi,
3135                        Deoptimization::UnrollBlock::
3136                        size_of_deoptimized_frame_offset_in_bytes()));
3137   __ addptr(rsp, rcx);
3138 
3139   // rsp should be pointing at the return address to the caller (3)
3140 
3141   // Pick up the initial fp we should save
3142   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3143   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
3144 
3145 #ifdef ASSERT
3146   // Compilers generate code that bangs the stack by as much as the
3147   // interpreter would need. So this stack banging should never
3148   // trigger a fault. Verify that it does not on non-product builds.
3149   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
3150   __ bang_stack_size(rbx, rcx);
3151 #endif
3152 
3153   // Load address of array of frame pcs into rcx (address*)
3154   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
3155 
3156   // Trash the return pc
3157   __ addptr(rsp, wordSize);
3158 
3159   // Load address of array of frame sizes into rsi (intptr_t*)
3160   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
3161 
3162   // Counter
3163   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
3164 
3165   // Now adjust the caller's stack to make up for the extra locals, but
3166   // record the original sp so that we can save it in the skeletal
3167   // interpreter frame; that way the stack walking of interpreter_sender
3168   // will get the unextended sp value and not the "real" sp value.
3169 
3170   const Register sender_sp = r8;
3171 
3172   __ mov(sender_sp, rsp);
3173   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
3174   __ subptr(rsp, rbx);
3175 
3176   // Push interpreter frames in a loop
3177   Label loop;
3178   __ bind(loop);
3179   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3180   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3181   __ pushptr(Address(rcx, 0));     // Save return address
3182   __ enter();                      // Save old & set new rbp
3183   __ subptr(rsp, rbx);             // Prolog
3184   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3185             sender_sp);            // Make it walkable
3186   // This value is corrected by layout_activation_impl
3187   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3188   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3189   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3190   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3191   __ decrementl(rdx);              // Decrement counter
3192   __ jcc(Assembler::notZero, loop);
3193   __ pushptr(Address(rcx, 0));     // Save final return address
3194 
3195   // Re-push self-frame
3196   __ enter();                 // Save old & set new rbp
3197   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3198                               // Prolog
3199 
3200   // Use rbp because the frames look interpreted now
3201   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3202   // Don't need the precise return PC here, just precise enough to point into this code blob.
3203   address the_pc = __ pc();
3204   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3205 
3206   // Call C code.  Need thread but NOT official VM entry
3207   // crud.  We cannot block on this call, no GC can happen.  Call should
3208   // restore return values to their stack-slots with the new SP.
3209   // Thread is in rdi already.
3210   //
3211   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3212 
3213   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3214   __ mov(c_rarg0, r15_thread);
3215   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3216   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3217 
3218   // Set an oopmap for the call site
3219   // Use the same PC we used for the last java frame
3220   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3221 
3222   // Clear fp AND pc
3223   __ reset_last_Java_frame(true);
3224 
3225   // Pop self-frame.
3226   __ leave();                 // Epilog
3227 
3228   // Jump to interpreter
3229   __ ret(0);
3230 
3231   // Make sure all code is generated
3232   masm->flush();
3233 
3234   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3235                                                  SimpleRuntimeFrame::framesize >> 1);
3236 }
3237 #endif // COMPILER2
3238 
3239 //------------------------------generate_handler_blob------
3240 //
3241 // Generate a special Compile2Runtime blob that saves all registers,
3242 // and sets up an oopmap.
3243 //
3244 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3245   assert(StubRoutines::forward_exception_entry() != NULL,
3246          "must be generated before");
3247 
3248   ResourceMark rm;
3249   OopMapSet *oop_maps = new OopMapSet();
3250   OopMap* map;
3251 
3252   // Allocate space for the code.  Setup code generation tools.
3253   CodeBuffer buffer("handler_blob", 2048, 1024);
3254   MacroAssembler* masm = new MacroAssembler(&buffer);
3255 
3256   address start   = __ pc();
3257   address call_pc = NULL;
3258   int frame_size_in_words;
3259   bool cause_return = (poll_type == POLL_AT_RETURN);
3260   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3261 
3262   if (UseRTMLocking) {
3263     // Abort RTM transaction before calling runtime
3264     // because critical section will be large and will be
3265     // aborted anyway. Also nmethod could be deoptimized.
3266     __ xabort(0);
3267   }
3268 
3269   // Make room for return address (or push it again)
3270   if (!cause_return) {
3271     __ push(rbx);
3272   }
3273 
3274   // Save registers, fpu state, and flags
3275   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3276 
3277   // The following is basically a call_VM.  However, we need the precise
3278   // address of the call in order to generate an oopmap. Hence, we do all the
3279   // work ourselves.
3280 
3281   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3282 
3283   // The return address must always be correct so that frame constructor never
3284   // sees an invalid pc.
3285 
3286   if (!cause_return) {
3287     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3288     // Additionally, rbx is a callee saved register and we can look at it later to determine
3289     // if someone changed the return address for us!
3290     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3291     __ movptr(Address(rbp, wordSize), rbx);
3292   }
3293 
3294   // Do the call
3295   __ mov(c_rarg0, r15_thread);
3296   __ call(RuntimeAddress(call_ptr));
3297 
3298   // Set an oopmap for the call site.  This oopmap will map all
3299   // oop-registers and debug-info registers as callee-saved.  This
3300   // will allow deoptimization at this safepoint to find all possible
3301   // debug-info recordings, as well as let GC find all oops.
3302 
3303   oop_maps->add_gc_map( __ pc() - start, map);
3304 
3305   Label noException;
3306 
3307   __ reset_last_Java_frame(false);
3308 
3309   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3310   __ jcc(Assembler::equal, noException);
3311 
3312   // Exception pending
3313 
3314   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3315 
3316   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3317 
3318   // No exception case
3319   __ bind(noException);
3320 
3321   Label no_adjust;
3322 #ifdef ASSERT
3323   Label bail;
3324 #endif
3325   if (!cause_return) {
3326     Label no_prefix, not_special;
3327 
3328     // If our stashed return pc was modified by the runtime we avoid touching it
3329     __ cmpptr(rbx, Address(rbp, wordSize));
3330     __ jccb(Assembler::notEqual, no_adjust);
3331 
3332     // Skip over the poll instruction.
3333     // See NativeInstruction::is_safepoint_poll()
3334     // Possible encodings:
3335     //      85 00       test   %eax,(%rax)
3336     //      85 01       test   %eax,(%rcx)
3337     //      85 02       test   %eax,(%rdx)
3338     //      85 03       test   %eax,(%rbx)
3339     //      85 06       test   %eax,(%rsi)
3340     //      85 07       test   %eax,(%rdi)
3341     //
3342     //   41 85 00       test   %eax,(%r8)
3343     //   41 85 01       test   %eax,(%r9)
3344     //   41 85 02       test   %eax,(%r10)
3345     //   41 85 03       test   %eax,(%r11)
3346     //   41 85 06       test   %eax,(%r14)
3347     //   41 85 07       test   %eax,(%r15)
3348     //
3349     //      85 04 24    test   %eax,(%rsp)
3350     //   41 85 04 24    test   %eax,(%r12)
3351     //      85 45 00    test   %eax,0x0(%rbp)
3352     //   41 85 45 00    test   %eax,0x0(%r13)
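    // Net effect, as a hedged C sketch: compute the poll length and step the
    // stashed return pc (held in rbx) past it:
    //
    //   if (pc[0] == 0x41) pc++;                       // optional REX.B prefix (r8-r15)
    //   int base  = pc[1] & 0x07;                      // modrm base register field
    //   int extra = (base == 4 || base == 5) ? 1 : 0;  // SIB byte or disp8 needed
    //   return_pc = pc + 2 + extra;                    // 0x85 opcode + modrm (+ extra)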
3353 
3354     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3355     __ jcc(Assembler::notEqual, no_prefix);
3356     __ addptr(rbx, 1);
3357     __ bind(no_prefix);
3358 #ifdef ASSERT
3359     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3360 #endif
3361     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3362     // r12/rsp 0x04
3363     // r13/rbp 0x05
3364     __ movzbq(rcx, Address(rbx, 1));
3365     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3366     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3367     __ cmpptr(rcx, 1);
3368     __ jcc(Assembler::above, not_special);
3369     __ addptr(rbx, 1);
3370     __ bind(not_special);
3371 #ifdef ASSERT
3372     // Verify the correct encoding of the poll we're about to skip.
3373     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3374     __ jcc(Assembler::notEqual, bail);
3375     // Mask out the modrm bits
3376     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3377     // rax encodes to 0, so if the bits are nonzero it's incorrect
3378     __ jcc(Assembler::notZero, bail);
3379 #endif
3380     // Adjust return pc forward to step over the safepoint poll instruction
3381     __ addptr(rbx, 2);
3382     __ movptr(Address(rbp, wordSize), rbx);
3383   }
3384 
3385   __ bind(no_adjust);
3386   // Normal exit, restore registers and exit.
3387   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3388   __ ret(0);
3389 
3390 #ifdef ASSERT
3391   __ bind(bail);
3392   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3393 #endif
3394 
3395   // Make sure all code is generated
3396   masm->flush();
3397 
3398   // Fill-out other meta info
3399   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3400 }
3401 
3402 //
3403 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3404 //
3405 // Generate a stub that calls into vm to find out the proper destination
3406 // of a java call. All the argument registers are live at this point
3407 // but since this is generic code we don't know what they are and the caller
3408 // must do any gc of the args.
3409 //
3410 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3411   assert(StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3412 
3413   // allocate space for the code
3414   ResourceMark rm;
3415 
3416   CodeBuffer buffer(name, 1200, 512);
3417   MacroAssembler* masm = new MacroAssembler(&buffer);
3418 
3419   int frame_size_in_words;
3420 
3421   OopMapSet *oop_maps = new OopMapSet();
3422   OopMap* map = NULL;
3423 
3424   int start = __ offset();
3425 
3426   // No need to save vector registers since they are caller-saved anyway.
3427   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3428 
3429   int frame_complete = __ offset();
3430 
3431   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
3432 
3433   __ mov(c_rarg0, r15_thread);
3434 
3435   __ call(RuntimeAddress(destination));
3436 
3437 
3438   // Set an oopmap for the call site.
3439   // We need this not only for callee-saved registers, but also for volatile
3440   // registers that the compiler might be keeping live across a safepoint.
3441 
3442   oop_maps->add_gc_map( __ offset() - start, map);
3443 
3444   // rax contains the address we are going to jump to, assuming no exception got installed
3445 
3446   // clear last_Java_sp
3447   __ reset_last_Java_frame(false);
3448   // check for pending exceptions
3449   Label pending;
3450   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3451   __ jcc(Assembler::notEqual, pending);
3452 
3453   // get the returned Method*
3454   __ get_vm_result_2(rbx, r15_thread);
3455   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3456 
3457   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3458 
3459   RegisterSaver::restore_live_registers(masm);
3460 
3461   // We are back to the original state on entry and ready to go.
3462 
3463   __ jmp(rax);
3464 
3465   // Pending exception after the safepoint
3466 
3467   __ bind(pending);
3468 
3469   RegisterSaver::restore_live_registers(masm);
3470 
3471   // exception pending => remove activation and forward to exception handler
3472 
3473   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3474 
3475   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3476   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3477 
3478   // -------------
3479   // make sure all code is generated
3480   masm->flush();
3481 
3482   // return the blob
3483   // frame size is in words
3484   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3485 }
3486 
3487 //------------------------------Montgomery multiplication------------------------
3488 //
3489 
3490 #ifndef _WINDOWS
3491 
3492 // Subtract 0:b from carry:a.  Return carry.
3493 static julong
3494 sub(julong a[], julong b[], julong carry, long len) {
3495   long long i = 0, cnt = len;
3496   julong tmp;
3497   asm volatile("clc; "
3498                "0: ; "
3499                "mov (%[b], %[i], 8), %[tmp]; "
3500                "sbb %[tmp], (%[a], %[i], 8); "
3501                "inc %[i]; dec %[cnt]; "
3502                "jne 0b; "
3503                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3504                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3505                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3506                : "memory");
3507   return tmp;
3508 }
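// For reference, a hedged plain-C equivalent of the inline-asm routine above
// (hypothetical sub_reference helper, not compiled in; the 128-bit type is
// used purely for illustration):
//
//   static julong sub_reference(julong a[], julong b[], julong carry, long len) {
//     unsigned __int128 borrow = 0;
//     for (long i = 0; i < len; i++) {
//       unsigned __int128 d = (unsigned __int128)a[i] - b[i] - (julong)borrow;
//       a[i] = (julong)d;
//       borrow = (d >> 64) & 1;       // 1 if the subtraction wrapped
//     }
//     return carry - (julong)borrow;  // mirrors the trailing "sbb $0"
//   }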
3509 
3510 // Multiply (unsigned) Long A by Long B, accumulating the double-
3511 // length result into the accumulator formed of T0, T1, and T2.
3512 #define MACC(A, B, T0, T1, T2)                                  \
3513 do {                                                            \
3514   unsigned long hi, lo;                                         \
3515   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3516            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3517            : "r"(A), "a"(B) : "cc");                            \
3518  } while(0)
3519 
3520 // As above, but add twice the double-length result into the
3521 // accumulator.
3522 #define MACC2(A, B, T0, T1, T2)                                 \
3523 do {                                                            \
3524   unsigned long hi, lo;                                         \
3525   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3526            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3527            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3528            : "r"(A), "a"(B) : "cc");                            \
3529  } while(0)
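// What the two macros compute, spelled out as a hedged C sketch with a
// 128-bit intermediate (hypothetical macc_reference helper; the real macros
// keep the triple-word accumulator in registers):
//
//   // (t2:t1:t0) += a * b   for MACC; MACC2 adds the product twice.
//   static void macc_reference(julong a, julong b,
//                              julong& t0, julong& t1, julong& t2) {
//     unsigned __int128 p = (unsigned __int128)a * b;
//     unsigned __int128 s = (unsigned __int128)t0 + (julong)p;
//     t0 = (julong)s;
//     s = (s >> 64) + (julong)(p >> 64) + t1;
//     t1 = (julong)s;
//     t2 += (julong)(s >> 64);
//   }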
3530 
3531 #else //_WINDOWS
3532 
3533 static julong
3534 sub(julong a[], julong b[], julong carry, long len) {
3535   long i;
3536   julong tmp;
3537   unsigned char c = 1;
3538   for (i = 0; i < len; i++) {
3539     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3540     a[i] = tmp;
3541   }
3542   c = _addcarry_u64(c, carry, ~0, &tmp);
3543   return tmp;
3544 }
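// The loop above relies on a - b == a + ~b + 1: seeding the chain with c = 1
// and feeding ~b[i] into _addcarry_u64 performs the same subtract-with-borrow
// as the inline-asm version, and the final _addcarry_u64(c, carry, ~0, &tmp)
// leaves carry minus the final borrow in tmp.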
3545 
3546 // Multiply (unsigned) Long A by Long B, accumulating the double-
3547 // length result into the accumulator formed of T0, T1, and T2.
3548 #define MACC(A, B, T0, T1, T2)                          \
3549 do {                                                    \
3550   julong hi, lo;                            \
3551   lo = _umul128(A, B, &hi);                             \
3552   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3553   c = _addcarry_u64(c, hi, T1, &T1);                    \
3554   _addcarry_u64(c, T2, 0, &T2);                         \
3555  } while(0)
3556 
3557 // As above, but add twice the double-length result into the
3558 // accumulator.
3559 #define MACC2(A, B, T0, T1, T2)                         \
3560 do {                                                    \
3561   julong hi, lo;                            \
3562   lo = _umul128(A, B, &hi);                             \
3563   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3564   c = _addcarry_u64(c, hi, T1, &T1);                    \
3565   _addcarry_u64(c, T2, 0, &T2);                         \
3566   c = _addcarry_u64(0, lo, T0, &T0);                    \
3567   c = _addcarry_u64(c, hi, T1, &T1);                    \
3568   _addcarry_u64(c, T2, 0, &T2);                         \
3569  } while(0)
3570 
3571 #endif //_WINDOWS
3572 
3573 // Fast Montgomery multiplication.  The derivation of the algorithm is
3574 // in "A Cryptographic Library for the Motorola DSP56000",
3575 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
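// The key step, spelled out: inv is -n[0]^-1 mod 2^64, so choosing
// m[i] = t0 * inv makes t0 + m[i]*n[0] == 0 (mod 2^64); adding m[i]*n[0]
// therefore clears the low accumulator word, which is then shifted out
// (t0 = t1; t1 = t2).  After all 2*len iterations the result is
// a * b * R^-1 (mod n), up to the trailing conditional subtractions of n,
// where R = 2^(64*len).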
3576 
3577 static void NOINLINE
3578 montgomery_multiply(julong a[], julong b[], julong n[],
3579                     julong m[], julong inv, int len) {
3580   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3581   int i;
3582 
3583   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3584 
3585   for (i = 0; i < len; i++) {
3586     int j;
3587     for (j = 0; j < i; j++) {
3588       MACC(a[j], b[i-j], t0, t1, t2);
3589       MACC(m[j], n[i-j], t0, t1, t2);
3590     }
3591     MACC(a[i], b[0], t0, t1, t2);
3592     m[i] = t0 * inv;
3593     MACC(m[i], n[0], t0, t1, t2);
3594 
3595     assert(t0 == 0, "broken Montgomery multiply");
3596 
3597     t0 = t1; t1 = t2; t2 = 0;
3598   }
3599 
3600   for (i = len; i < 2*len; i++) {
3601     int j;
3602     for (j = i-len+1; j < len; j++) {
3603       MACC(a[j], b[i-j], t0, t1, t2);
3604       MACC(m[j], n[i-j], t0, t1, t2);
3605     }
3606     m[i-len] = t0;
3607     t0 = t1; t1 = t2; t2 = 0;
3608   }
3609 
3610   while (t0)
3611     t0 = sub(m, n, t0, len);
3612 }
3613 
3614 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3615 // multiplies so it should be up to 25% faster than Montgomery
3616 // multiplication.  However, its loop control is more complex and it
3617 // may actually run slower on some machines.
3618 
3619 static void NOINLINE
3620 montgomery_square(julong a[], julong n[],
3621                   julong m[], julong inv, int len) {
3622   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3623   int i;
3624 
3625   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3626 
3627   for (i = 0; i < len; i++) {
3628     int j;
3629     int end = (i+1)/2;
3630     for (j = 0; j < end; j++) {
3631       MACC2(a[j], a[i-j], t0, t1, t2);
3632       MACC(m[j], n[i-j], t0, t1, t2);
3633     }
3634     if ((i & 1) == 0) {
3635       MACC(a[j], a[j], t0, t1, t2);
3636     }
3637     for (; j < i; j++) {
3638       MACC(m[j], n[i-j], t0, t1, t2);
3639     }
3640     m[i] = t0 * inv;
3641     MACC(m[i], n[0], t0, t1, t2);
3642 
3643     assert(t0 == 0, "broken Montgomery square");
3644 
3645     t0 = t1; t1 = t2; t2 = 0;
3646   }
3647 
3648   for (i = len; i < 2*len; i++) {
3649     int start = i-len+1;
3650     int end = start + (len - start)/2;
3651     int j;
3652     for (j = start; j < end; j++) {
3653       MACC2(a[j], a[i-j], t0, t1, t2);
3654       MACC(m[j], n[i-j], t0, t1, t2);
3655     }
3656     if ((i & 1) == 0) {
3657       MACC(a[j], a[j], t0, t1, t2);
3658     }
3659     for (; j < len; j++) {
3660       MACC(m[j], n[i-j], t0, t1, t2);
3661     }
3662     m[i-len] = t0;
3663     t0 = t1; t1 = t2; t2 = 0;
3664   }
3665 
3666   while (t0)
3667     t0 = sub(m, n, t0, len);
3668 }
3669 
3670 // Swap words in a longword.
3671 static julong swap(julong x) {
3672   return (x << 32) | (x >> 32);
3673 }
3674 
3675 // Copy len longwords from s to d, word-swapping as we go.  The
3676 // destination array is reversed.
3677 static void reverse_words(julong *s, julong *d, int len) {
3678   d += len;
3679   while(len-- > 0) {
3680     d--;
3681     *d = swap(*s);
3682     s++;
3683   }
3684 }
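// Example (hedged; assumes the incoming jint magnitude is most-significant
// word first, as in a BigInteger): the 128-bit value
// 0x00112233445566778899aabbccddeeff arrives as the jints
//   { 0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff }
// which a little-endian machine views as the julongs
//   s[0] = 0x4455667700112233, s[1] = 0xccddeeff8899aabb
// reverse_words(s, d, 2) then produces the little-endian julong layout the
// Montgomery routines above expect:
//   d[0] = 0x8899aabbccddeeff, d[1] = 0x0011223344556677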
3685 
3686 // The threshold at which squaring is advantageous was determined
3687 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3688 #define MONTGOMERY_SQUARING_THRESHOLD 64
3689 
3690 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3691                                         jint len, jlong inv,
3692                                         jint *m_ints) {
3693   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3694   int longwords = len/2;
3695 
3696   // Make very sure we don't use so much space that the stack might
3697   // overflow.  512 jints corresponds to a 16384-bit integer and
3698   // would use a total of 8K bytes of stack space here.
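  // Arithmetic check: 512 jints == 256 julongs, and four scratch arrays of
  // 256 julongs at 8 bytes each come to 256 * 8 * 4 == 8192 bytes.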
3699   int divisor = sizeof(julong) * 4;
3700   guarantee(longwords <= 8192 / divisor, "must be");
3701   int total_allocation = longwords * sizeof (julong) * 4;
3702   julong *scratch = (julong *)alloca(total_allocation);
3703 
3704   // Local scratch arrays
3705   julong
3706     *a = scratch + 0 * longwords,
3707     *b = scratch + 1 * longwords,
3708     *n = scratch + 2 * longwords,
3709     *m = scratch + 3 * longwords;
3710 
3711   reverse_words((julong *)a_ints, a, longwords);
3712   reverse_words((julong *)b_ints, b, longwords);
3713   reverse_words((julong *)n_ints, n, longwords);
3714 
3715   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3716 
3717   reverse_words(m, (julong *)m_ints, longwords);
3718 }
3719 
3720 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3721                                       jint len, jlong inv,
3722                                       jint *m_ints) {
3723   assert(len % 2 == 0, "array length in montgomery_square must be even");
3724   int longwords = len/2;
3725 
3726   // Make very sure we don't use so much space that the stack might
3727   // overflow.  512 jints corresponds to a 16384-bit integer and
3728   // would use a total of 6K bytes of stack space here.
3729   int divisor = sizeof(julong) * 3;
3730   guarantee(longwords <= (8192 / divisor), "must be");
3731   int total_allocation = longwords * sizeof (julong) * 3;
3732   julong *scratch = (julong *)alloca(total_allocation);
3733 
3734   // Local scratch arrays
3735   julong
3736     *a = scratch + 0 * longwords,
3737     *n = scratch + 1 * longwords,
3738     *m = scratch + 2 * longwords;
3739 
3740   reverse_words((julong *)a_ints, a, longwords);
3741   reverse_words((julong *)n_ints, n, longwords);
3742 
3743   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3744     ::montgomery_square(a, n, m, (julong)inv, longwords);
3745   } else {
3746     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3747   }
3748 
3749   reverse_words(m, (julong *)m_ints, longwords);
3750 }
3751 
3752 #ifdef COMPILER2
3753 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3754 //
3755 //------------------------------generate_exception_blob---------------------------
3756 // Creates the exception blob at the end.
3757 // Compiled code jumps into this blob from its exception handler
3758 // (see emit_exception_handler in the x86_64.ad file).
3759 //
3760 // Given an exception pc at a call we call into the runtime for the
3761 // handler in this method. This handler might merely restore state
3762 // (i.e. callee save registers) unwind the frame and jump to the
3763 // exception handler for the nmethod if there is no Java level handler
3764 // for the nmethod.
3765 //
3766 // This code is entered with a jmp.
3767 //
3768 // Arguments:
3769 //   rax: exception oop
3770 //   rdx: exception pc
3771 //
3772 // Results:
3773 //   rax: exception oop
3774 //   rdx: exception pc in caller or ???
3775 //   destination: exception handler of caller
3776 //
3777 // Note: the exception pc MUST be at a call (precise debug information)
3778 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3779 //
3780 
3781 void OptoRuntime::generate_exception_blob() {
3782   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3783   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3784   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3785 
3786   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3787 
3788   // Allocate space for the code
3789   ResourceMark rm;
3790   // Setup code generation tools
3791   CodeBuffer buffer("exception_blob", 2048, 1024);
3792   MacroAssembler* masm = new MacroAssembler(&buffer);
3793 
3794 
3795   address start = __ pc();
3796 
3797   // Exception pc is 'return address' for stack walker
3798   __ push(rdx);
3799   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3800 
3801   // Save callee-saved registers.  See x86_64.ad.
3802 
3803   // rbp is an implicitly saved callee saved register (i.e., the calling
3804   // convention will save/restore it in the prolog/epilog). Other than that
3805   // there are no callee save registers now that adapter frames are gone.
3806 
3807   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3808 
3809   // Store exception in Thread object. We cannot pass any arguments to the
3810   // handle_exception call, since we do not want to make any assumption
3811   // about the size of the frame where the exception happened in.
3812   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3813   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3814   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3815 
3816   // This call does all the hard work.  It checks if an exception handler
3817   // exists in the method.
3818   // If so, it returns the handler address.
3819   // If not, it prepares for stack-unwinding, restoring the callee-save
3820   // registers of the frame being removed.
3821   //
3822   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3823 
3824   // At a method handle call, the stack may not be properly aligned
3825   // when returning with an exception.
3826   address the_pc = __ pc();
3827   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3828   __ mov(c_rarg0, r15_thread);
3829   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3830   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3831 
3832   // Set an oopmap for the call site.  This oopmap will only be used if we
3833   // are unwinding the stack.  Hence, all locations will be dead.
3834   // Callee-saved registers will be the same as the frame above (i.e.,
3835   // handle_exception_stub), since they were restored when we got the
3836   // exception.
3837 
3838   OopMapSet* oop_maps = new OopMapSet();
3839 
3840   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3841 
3842   __ reset_last_Java_frame(false);
3843 
3844   // Restore callee-saved registers
3845 
3846   // rbp is an implicitly saved callee-saved register (i.e., the calling
3847 // convention will save/restore it in the prolog/epilog). Other than that
3848   // there are no callee save registers now that adapter frames are gone.
3849 
3850   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3851 
3852   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3853   __ pop(rdx);                  // No need for exception pc anymore
3854 
3855   // rax: exception handler
3856 
3857   // We have a handler in rax (could be deopt blob).
3858   __ mov(r8, rax);
3859 
3860   // Get the exception oop
3861   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3862   // Get the exception pc in case we are deoptimized
3863   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3864 #ifdef ASSERT
3865   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3866   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3867 #endif
3868   // Clear the exception oop so GC no longer processes it as a root.
3869   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3870 
3871   // rax: exception oop
3872   // r8:  exception handler
3873   // rdx: exception pc
3874   // Jump to handler
3875 
3876   __ jmp(r8);
3877 
3878   // Make sure all code is generated
3879   masm->flush();
3880 
3881   // Set exception blob
3882   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3883 }
3884 #endif // COMPILER2
3885 
3886 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
3887   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
3888   CodeBuffer buffer(buf);
3889   short buffer_locs[20];
3890   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
3891                                          sizeof(buffer_locs)/sizeof(relocInfo));
3892 
3893   MacroAssembler* masm = new MacroAssembler(&buffer);
3894 
3895   const Array<SigEntry>* sig_vk = vk->extended_sig();
3896   const Array<VMRegPair>* regs = vk->return_regs();
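  // Note on the loops below (added commentary, hedged): extended_sig() is the
  // flattened field signature, in which a T_LONG/T_DOUBLE entry is followed by
  // a T_VOID placeholder for its upper half and T_PRIMITIVE_OBJECT entries
  // only delimit nested types, so both are skipped; the register index j
  // starts at 1 because regs->at(0) describes the buffered value itself.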
3897 
3898   int pack_fields_jobject_off = __ offset();
3899   // Resolve pre-allocated buffer from JNI handle.
3900   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
3901   __ movptr(rax, Address(r13, 0));
3902   __ resolve_jobject(rax /* value */,
3903                      r15_thread /* thread */,
3904                      r12 /* tmp */);
3905   __ movptr(Address(r13, 0), rax);
3906 
3907   int pack_fields_off = __ offset();
3908 
3909   int j = 1;
3910   for (int i = 0; i < sig_vk->length(); i++) {
3911     BasicType bt = sig_vk->at(i)._bt;
3912     if (bt == T_PRIMITIVE_OBJECT) {
3913       continue;
3914     }
3915     if (bt == T_VOID) {
3916       if (sig_vk->at(i-1)._bt == T_LONG ||
3917           sig_vk->at(i-1)._bt == T_DOUBLE) {
3918         j++;
3919       }
3920       continue;
3921     }
3922     int off = sig_vk->at(i)._offset;
3923     assert(off > 0, "offset in object should be positive");
3924     VMRegPair pair = regs->at(j);
3925     VMReg r_1 = pair.first();
3926     VMReg r_2 = pair.second();
3927     Address to(rax, off);
3928     if (bt == T_FLOAT) {
3929       __ movflt(to, r_1->as_XMMRegister());
3930     } else if (bt == T_DOUBLE) {
3931       __ movdbl(to, r_1->as_XMMRegister());
3932     } else {
3933       Register val = r_1->as_Register();
3934       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
3935       if (is_reference_type(bt)) {
3936         __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
3937       } else {
3938         __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
3939       }
3940     }
3941     j++;
3942   }
3943   assert(j == regs->length(), "missed a field?");
3944 
3945   __ ret(0);
3946 
3947   int unpack_fields_off = __ offset();
3948 
3949   Label skip;
3950   __ testptr(rax, rax);
3951   __ jcc(Assembler::zero, skip);
3952 
3953   j = 1;
3954   for (int i = 0; i < sig_vk->length(); i++) {
3955     BasicType bt = sig_vk->at(i)._bt;
3956     if (bt == T_PRIMITIVE_OBJECT) {
3957       continue;
3958     }
3959     if (bt == T_VOID) {
3960       if (sig_vk->at(i-1)._bt == T_LONG ||
3961           sig_vk->at(i-1)._bt == T_DOUBLE) {
3962         j++;
3963       }
3964       continue;
3965     }
3966     int off = sig_vk->at(i)._offset;
3967     assert(off > 0, "offset in object should be positive");
3968     VMRegPair pair = regs->at(j);
3969     VMReg r_1 = pair.first();
3970     VMReg r_2 = pair.second();
3971     Address from(rax, off);
3972     if (bt == T_FLOAT) {
3973       __ movflt(r_1->as_XMMRegister(), from);
3974     } else if (bt == T_DOUBLE) {
3975       __ movdbl(r_1->as_XMMRegister(), from);
3976     } else if (bt == T_OBJECT || bt == T_ARRAY) {
3977       assert_different_registers(rax, r_1->as_Register());
3978       __ load_heap_oop(r_1->as_Register(), from);
3979     } else {
3980       assert(is_java_primitive(bt), "unexpected basic type");
3981       assert_different_registers(rax, r_1->as_Register());
3982       size_t size_in_bytes = type2aelembytes(bt);
3983       __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
3984     }
3985     j++;
3986   }
3987   assert(j == regs->length(), "missed a field?");
3988 
3989   __ bind(skip);
3990   __ ret(0);
3991 
3992   __ flush();
3993 
3994   return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
3995 }