1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "classfile/symbolTable.hpp"
  32 #include "code/compiledIC.hpp"
  33 #include "code/debugInfoRec.hpp"
  34 #include "code/icBuffer.hpp"
  35 #include "code/nativeInst.hpp"
  36 #include "code/vtableStubs.hpp"
  37 #include "compiler/oopMap.hpp"
  38 #include "gc/shared/collectedHeap.hpp"
  39 #include "gc/shared/gcLocker.hpp"
  40 #include "gc/shared/barrierSet.hpp"
  41 #include "gc/shared/barrierSetAssembler.hpp"
  42 #include "interpreter/interpreter.hpp"
  43 #include "logging/log.hpp"
  44 #include "memory/resourceArea.hpp"
  45 #include "memory/universe.hpp"
  46 #include "oops/compiledICHolder.hpp"
  47 #include "oops/klass.inline.hpp"
  48 #include "oops/method.inline.hpp"
  49 #include "prims/methodHandles.hpp"
  50 #include "runtime/continuation.hpp"
  51 #include "runtime/continuationEntry.inline.hpp"
  52 #include "runtime/globals.hpp"
  53 #include "runtime/jniHandles.hpp"
  54 #include "runtime/safepointMechanism.hpp"
  55 #include "runtime/sharedRuntime.hpp"
  56 #include "runtime/signature.hpp"
  57 #include "runtime/stubRoutines.hpp"
  58 #include "runtime/vframeArray.hpp"
  59 #include "runtime/vm_version.hpp"
  60 #include "utilities/align.hpp"
  61 #include "utilities/formatBuffer.hpp"
  62 #include "vmreg_x86.inline.hpp"
  63 #ifdef COMPILER1
  64 #include "c1/c1_Runtime1.hpp"
  65 #endif
  66 #ifdef COMPILER2
  67 #include "opto/runtime.hpp"
  68 #endif
  69 #if INCLUDE_JVMCI
  70 #include "jvmci/jvmciJavaClasses.hpp"
  71 #endif
  72 
  73 #define __ masm->
  74 
  75 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
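// With StackAlignmentInBytes == 16 and VMRegImpl::stack_slot_size == 4 (the
// values used on x86_64), StackAlignmentInSlots works out to 4.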
  76 
  77 class SimpleRuntimeFrame {
  78 
  79   public:
  80 
  81   // Most of the runtime stubs have this simple frame layout.
  82   // This class exists to make the layout shared in one place.
  83   // Offsets are for compiler stack slots, which are jints.
  84   enum layout {
  85     // The frame sender code expects that rbp will be in the "natural" place and
  86     // will override any oopMap setting for it. We must therefore force the layout
  87     // so that it agrees with the frame sender code.
  88     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  89     rbp_off2,
  90     return_off, return_off2,
  91     framesize
  92   };
  93 };
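// Illustrative slot assignment (not normative), assuming
// frame::arg_reg_save_area_bytes == 0 as on non-Windows x86_64:
//   rbp_off    = 0, rbp_off2    = 1   // saved rbp
//   return_off = 2, return_off2 = 3   // return address
//   framesize  = 4 slots (2 words)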
  94 
  95 class RegisterSaver {
  96   // Capture info about frame layout.  Layout offsets are in jint
  97   // units because compiler frame slots are jints.
  98 #define XSAVE_AREA_BEGIN 160
  99 #define XSAVE_AREA_YMM_BEGIN 576
 100 #define XSAVE_AREA_OPMASK_BEGIN 1088
 101 #define XSAVE_AREA_ZMM_BEGIN 1152
 102 #define XSAVE_AREA_UPPERBANK 1664
 103 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 104 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 105 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 106 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 107 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
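// For reference, DEF_XMM_OFFS(1) expands to
//   xmm1_off = xmm_off + (1)*16/BytesPerInt, xmm1H_off
// i.e. each XMM register spans four jint slots (16 bytes), and the H_off
// enumerator simply names the slot immediately after the corresponding _off slot.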
 108   enum layout {
 109     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 110     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 111     DEF_XMM_OFFS(0),
 112     DEF_XMM_OFFS(1),
 113     // 2..15 are implied in range usage
 114     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 115     DEF_YMM_OFFS(0),
 116     DEF_YMM_OFFS(1),
 117     // 2..15 are implied in range usage
 118     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 119     DEF_OPMASK_OFFS(0),
 120     DEF_OPMASK_OFFS(1),
 121     // 2..7 are implied in range usage
 122     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 123     DEF_ZMM_OFFS(0),
 124     DEF_ZMM_OFFS(1),
 125     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 126     DEF_ZMM_UPPER_OFFS(16),
 127     DEF_ZMM_UPPER_OFFS(17),
 128     // 18..31 are implied in range usage
 129     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 130     fpu_stateH_end,
 131     r15_off, r15H_off,
 132     r14_off, r14H_off,
 133     r13_off, r13H_off,
 134     r12_off, r12H_off,
 135     r11_off, r11H_off,
 136     r10_off, r10H_off,
 137     r9_off,  r9H_off,
 138     r8_off,  r8H_off,
 139     rdi_off, rdiH_off,
 140     rsi_off, rsiH_off,
 141     ignore_off, ignoreH_off,  // extra copy of rbp
 142     rsp_off, rspH_off,
 143     rbx_off, rbxH_off,
 144     rdx_off, rdxH_off,
 145     rcx_off, rcxH_off,
 146     rax_off, raxH_off,
 147     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 148     align_off, alignH_off,
 149     flags_off, flagsH_off,
 150     // The frame sender code expects that rbp will be in the "natural" place and
 151     // will override any oopMap setting for it. We must therefore force the layout
 152     // so that it agrees with the frame sender code.
 153     rbp_off, rbpH_off,        // copy of rbp we will restore
 154     return_off, returnH_off,  // slot for return address
 155     reg_save_size             // size in compiler stack slots
 156   };
 157 
 158  public:
 159   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 160   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 161 
 162   // Offsets into the register save area
 163   // Used by deoptimization when it is managing result register
 164   // values on its own
 165 
 166   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 167   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 168   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 169   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 170   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 171 
 172   // During deoptimization only the result registers need to be restored,
 173   // all the other values have already been extracted.
 174   static void restore_result_registers(MacroAssembler* masm);
 175 };
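// Typical pairing of the two entry points (see gen_c2i_adapter below for the
// real use when allocating inline type buffers):
//
//   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, false);
//   // ... emit the runtime call and add 'map' to the oop map set ...
//   RegisterSaver::restore_live_registers(masm);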
 176 
 177 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 178   int off = 0;
 179   int num_xmm_regs = XMMRegister::available_xmm_registers();
 180 #if COMPILER2_OR_JVMCI
 181   if (save_wide_vectors && UseAVX == 0) {
    save_wide_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 183   }
 184   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 185 #else
 186   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 187 #endif
 188 
  // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 190   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 191   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 192   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 193   // CodeBlob frame size is in words.
 194   int frame_size_in_words = frame_size_in_bytes / wordSize;
 195   *total_frame_words = frame_size_in_words;
 196 
 197   // Save registers, fpu state, and flags.
  // We assume the caller has already pushed the return address onto the
  // stack, so rsp is 8-byte aligned here.
  // We push rbp twice in this sequence because we want the real rbp
  // to be under the return address, like a normal enter.
 202 
 203   __ enter();          // rsp becomes 16-byte aligned here
 204   __ push_CPU_state(); // Push a multiple of 16 bytes
 205 
  // push_CPU_state handles this on EVEX enabled targets
 207   if (save_wide_vectors) {
 208     // Save upper half of YMM registers(0..15)
 209     int base_addr = XSAVE_AREA_YMM_BEGIN;
 210     for (int n = 0; n < 16; n++) {
 211       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 212     }
 213     if (VM_Version::supports_evex()) {
 214       // Save upper half of ZMM registers(0..15)
 215       base_addr = XSAVE_AREA_ZMM_BEGIN;
 216       for (int n = 0; n < 16; n++) {
 217         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 218       }
 219       // Save full ZMM registers(16..num_xmm_regs)
 220       base_addr = XSAVE_AREA_UPPERBANK;
 221       off = 0;
 222       int vector_len = Assembler::AVX_512bit;
 223       for (int n = 16; n < num_xmm_regs; n++) {
 224         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 225       }
 226 #if COMPILER2_OR_JVMCI
 227       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 228       off = 0;
 229       for(int n = 0; n < KRegister::number_of_registers; n++) {
 230         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 231       }
 232 #endif
 233     }
 234   } else {
 235     if (VM_Version::supports_evex()) {
 236       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 237       int base_addr = XSAVE_AREA_UPPERBANK;
 238       off = 0;
 239       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 240       for (int n = 16; n < num_xmm_regs; n++) {
 241         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 242       }
 243 #if COMPILER2_OR_JVMCI
 244       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 245       off = 0;
 246       for(int n = 0; n < KRegister::number_of_registers; n++) {
 247         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 248       }
 249 #endif
 250     }
 251   }
 252   __ vzeroupper();
 253   if (frame::arg_reg_save_area_bytes != 0) {
 254     // Allocate argument register save area
 255     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 256   }
 257 
 258   // Set an oopmap for the call site.  This oopmap will map all
 259   // oop-registers and debug-info registers as callee-saved.  This
 260   // will allow deoptimization at this safepoint to find all possible
 261   // debug-info recordings, as well as let GC find all oops.
 262 
 263   OopMapSet *oop_maps = new OopMapSet();
 264   OopMap* map = new OopMap(frame_size_in_slots, 0);
 265 
 266 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 267 
 268   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 271   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
  // rbp's location is known implicitly by the frame sender code, so it needs no
  // oopmap entry, and the location where rbp was saved is ignored
 274   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 283   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  // on EVEX enabled targets it is also included in the xsave area
 286   off = xmm0_off;
 287   int delta = xmm1_off - off;
 288   for (int n = 0; n < 16; n++) {
 289     XMMRegister xmm_name = as_XMMRegister(n);
 290     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 291     off += delta;
 292   }
 293   if (UseAVX > 2) {
 294     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 295     off = zmm16_off;
 296     delta = zmm17_off - off;
 297     for (int n = 16; n < num_xmm_regs; n++) {
 298       XMMRegister zmm_name = as_XMMRegister(n);
 299       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 300       off += delta;
 301     }
 302   }
 303 
 304 #if COMPILER2_OR_JVMCI
 305   if (save_wide_vectors) {
 306     // Save upper half of YMM registers(0..15)
 307     off = ymm0_off;
 308     delta = ymm1_off - ymm0_off;
 309     for (int n = 0; n < 16; n++) {
 310       XMMRegister ymm_name = as_XMMRegister(n);
 311       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 312       off += delta;
 313     }
 314     if (VM_Version::supports_evex()) {
 315       // Save upper half of ZMM registers(0..15)
 316       off = zmm0_off;
 317       delta = zmm1_off - zmm0_off;
 318       for (int n = 0; n < 16; n++) {
 319         XMMRegister zmm_name = as_XMMRegister(n);
 320         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 321         off += delta;
 322       }
 323     }
 324   }
 325 #endif // COMPILER2_OR_JVMCI
 326 
 327   // %%% These should all be a waste but we'll keep things as they were for now
 328   if (true) {
 329     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 333     // rbp location is known implicitly by the frame sender code, needs no oopmap
 334     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 343     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
    // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
    // on EVEX enabled targets it is also included in the xsave area
 346     off = xmm0H_off;
 347     delta = xmm1H_off - off;
 348     for (int n = 0; n < 16; n++) {
 349       XMMRegister xmm_name = as_XMMRegister(n);
 350       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 351       off += delta;
 352     }
 353     if (UseAVX > 2) {
 354       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 355       off = zmm16H_off;
 356       delta = zmm17H_off - off;
 357       for (int n = 16; n < num_xmm_regs; n++) {
 358         XMMRegister zmm_name = as_XMMRegister(n);
 359         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 360         off += delta;
 361       }
 362     }
 363   }
 364 
 365   return map;
 366 }
 367 
 368 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 369   int num_xmm_regs = XMMRegister::available_xmm_registers();
 370   if (frame::arg_reg_save_area_bytes != 0) {
 371     // Pop arg register save area
 372     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 373   }
 374 
 375 #if COMPILER2_OR_JVMCI
 376   if (restore_wide_vectors) {
 377     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 378     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 379   }
 380 #else
 381   assert(!restore_wide_vectors, "vectors are generated only by C2");
 382 #endif
 383 
 384   __ vzeroupper();
 385 
 386   // On EVEX enabled targets everything is handled in pop fpu state
 387   if (restore_wide_vectors) {
 388     // Restore upper half of YMM registers (0..15)
 389     int base_addr = XSAVE_AREA_YMM_BEGIN;
 390     for (int n = 0; n < 16; n++) {
 391       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 392     }
 393     if (VM_Version::supports_evex()) {
 394       // Restore upper half of ZMM registers (0..15)
 395       base_addr = XSAVE_AREA_ZMM_BEGIN;
 396       for (int n = 0; n < 16; n++) {
 397         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 398       }
 399       // Restore full ZMM registers(16..num_xmm_regs)
 400       base_addr = XSAVE_AREA_UPPERBANK;
 401       int vector_len = Assembler::AVX_512bit;
 402       int off = 0;
 403       for (int n = 16; n < num_xmm_regs; n++) {
 404         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 405       }
 406 #if COMPILER2_OR_JVMCI
 407       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 408       off = 0;
 409       for (int n = 0; n < KRegister::number_of_registers; n++) {
 410         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 411       }
 412 #endif
 413     }
 414   } else {
 415     if (VM_Version::supports_evex()) {
 416       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 417       int base_addr = XSAVE_AREA_UPPERBANK;
 418       int off = 0;
 419       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 420       for (int n = 16; n < num_xmm_regs; n++) {
 421         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 422       }
 423 #if COMPILER2_OR_JVMCI
 424       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 425       off = 0;
 426       for (int n = 0; n < KRegister::number_of_registers; n++) {
 427         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 428       }
 429 #endif
 430     }
 431   }
 432 
 433   // Recover CPU state
 434   __ pop_CPU_state();
 435   // Get the rbp described implicitly by the calling convention (no oopMap)
 436   __ pop(rbp);
 437 }
 438 
 439 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 440 
  // Just restore the result registers. Only used by deoptimization. By
  // now any callee-saved register that needs to be restored to a c2
  // caller of the deoptee has been extracted into the vframeArray
  // and will be stuffed into the c2i adapter we create for later
  // restoration, so only result registers need to be restored here.
 446 
 447   // Restore fp result register
 448   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 449   // Restore integer result register
 450   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 451   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 452 
  // Pop all of the register save area off the stack except the return address
 454   __ addptr(rsp, return_offset_in_bytes());
 455 }
 456 
// Is the vector's size (in bytes) bigger than the size saved by default?
// The 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 459 bool SharedRuntime::is_wide_vector(int size) {
 460   return size > 16;
 461 }
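// For example, is_wide_vector(32) (YMM-sized) and is_wide_vector(64)
// (ZMM-sized) return true, while is_wide_vector(16) returns false.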
 462 
 463 // ---------------------------------------------------------------------------
// Read the array of BasicTypes from a signature, and compute where the
// arguments should go.  Values in the VMRegPair regs array refer to 4-byte
// quantities.  Values less than VMRegImpl::stack0 are registers, those above
// refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
// as frame sizes are fixed.
// VMRegImpl::stack0 refers to the first slot 0(sp),
// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
// Registers up to Register::number_of_registers are the 64-bit
// integer registers.
 473 
// Note: the INPUTS in sig_bt are in units of Java argument words, which are
// either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
// units regardless of build. Of course for i486 there is no 64-bit build.
 477 
 478 // The Java calling convention is a "shifted" version of the C ABI.
 479 // By skipping the first C ABI register we can call non-static jni methods
 480 // with small numbers of arguments without having to shuffle the arguments
 481 // at all. Since we control the java ABI we ought to at least get some
 482 // advantage out of it.
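// Illustrative example (mirrors the code below, not normative): for a method
// taking (int, long, Object, double), sig_bt is
//   T_INT, T_LONG, T_VOID, T_OBJECT, T_DOUBLE, T_VOID
// and the convention assigns
//   regs[0].set1(j_rarg0)   // int
//   regs[1].set2(j_rarg1)   // long (its T_VOID half gets set_bad())
//   regs[3].set2(j_rarg2)   // Object
//   regs[4].set2(j_farg0)   // double (its T_VOID half gets set_bad())
// No stack slots are needed, so the function returns 0.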
 483 
 484 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 485                                            VMRegPair *regs,
 486                                            int total_args_passed) {
 487 
 488   // Create the mapping between argument positions and
 489   // registers.
 490   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 491     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 492   };
 493   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 494     j_farg0, j_farg1, j_farg2, j_farg3,
 495     j_farg4, j_farg5, j_farg6, j_farg7
 496   };
 497 
 498 
 499   uint int_args = 0;
 500   uint fp_args = 0;
 501   uint stk_args = 0; // inc by 2 each time
 502 
 503   for (int i = 0; i < total_args_passed; i++) {
 504     switch (sig_bt[i]) {
 505     case T_BOOLEAN:
 506     case T_CHAR:
 507     case T_BYTE:
 508     case T_SHORT:
 509     case T_INT:
 510       if (int_args < Argument::n_int_register_parameters_j) {
 511         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 512       } else {
 513         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 514         stk_args += 2;
 515       }
 516       break;
 517     case T_VOID:
 518       // halves of T_LONG or T_DOUBLE
 519       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 520       regs[i].set_bad();
 521       break;
 522     case T_LONG:
 523       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 524       // fall through
 525     case T_OBJECT:
 526     case T_ARRAY:
 527     case T_ADDRESS:
 528     case T_PRIMITIVE_OBJECT:
 529       if (int_args < Argument::n_int_register_parameters_j) {
 530         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 531       } else {
 532         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 533         stk_args += 2;
 534       }
 535       break;
 536     case T_FLOAT:
 537       if (fp_args < Argument::n_float_register_parameters_j) {
 538         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 539       } else {
 540         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 541         stk_args += 2;
 542       }
 543       break;
 544     case T_DOUBLE:
 545       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 546       if (fp_args < Argument::n_float_register_parameters_j) {
 547         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 548       } else {
 549         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 550         stk_args += 2;
 551       }
 552       break;
 553     default:
 554       ShouldNotReachHere();
 555       break;
 556     }
 557   }
 558 
 559   return align_up(stk_args, 2);
 560 }
 561 
 562 // Same as java_calling_convention() but for multiple return
 563 // values. There's no way to store them on the stack so if we don't
 564 // have enough registers, multiple values can't be returned.
 565 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 566 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
 567 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 568                                           VMRegPair *regs,
 569                                           int total_args_passed) {
 570   // Create the mapping between argument positions and
 571   // registers.
 572   static const Register INT_ArgReg[java_return_convention_max_int] = {
 573     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 574   };
 575   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 576     j_farg0, j_farg1, j_farg2, j_farg3,
 577     j_farg4, j_farg5, j_farg6, j_farg7
 578   };
 579 
 580 
 581   uint int_args = 0;
 582   uint fp_args = 0;
 583 
 584   for (int i = 0; i < total_args_passed; i++) {
 585     switch (sig_bt[i]) {
 586     case T_BOOLEAN:
 587     case T_CHAR:
 588     case T_BYTE:
 589     case T_SHORT:
 590     case T_INT:
 591       if (int_args < Argument::n_int_register_parameters_j+1) {
 592         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 593         int_args++;
 594       } else {
 595         return -1;
 596       }
 597       break;
 598     case T_VOID:
 599       // halves of T_LONG or T_DOUBLE
 600       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 601       regs[i].set_bad();
 602       break;
 603     case T_LONG:
 604       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 605       // fall through
 606     case T_OBJECT:
 607     case T_PRIMITIVE_OBJECT:
 608     case T_ARRAY:
 609     case T_ADDRESS:
 610     case T_METADATA:
 611       if (int_args < Argument::n_int_register_parameters_j+1) {
 612         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 613         int_args++;
 614       } else {
 615         return -1;
 616       }
 617       break;
 618     case T_FLOAT:
 619       if (fp_args < Argument::n_float_register_parameters_j) {
 620         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 621         fp_args++;
 622       } else {
 623         return -1;
 624       }
 625       break;
 626     case T_DOUBLE:
 627       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 628       if (fp_args < Argument::n_float_register_parameters_j) {
 629         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 630         fp_args++;
 631       } else {
 632         return -1;
 633       }
 634       break;
 635     default:
 636       ShouldNotReachHere();
 637       break;
 638     }
 639   }
 640 
 641   return int_args + fp_args;
 642 }
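// Illustrative example: for return values (long, Object), sig_bt is
// T_LONG, T_VOID, T_OBJECT; the two values land in INT_ArgReg[0] (rax) and
// INT_ArgReg[1] (j_rarg5), and the function returns int_args + fp_args == 2.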
 643 
// Patch the caller's callsite with the entry to compiled code if it exists.
 645 static void patch_callers_callsite(MacroAssembler *masm) {
 646   Label L;
 647   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 648   __ jcc(Assembler::equal, L);
 649 
 650   // Save the current stack pointer
 651   __ mov(r13, rsp);
 652   // Schedule the branch target address early.
 653   // Call into the VM to patch the caller, then jump to compiled callee
 654   // rax isn't live so capture return address while we easily can
 655   __ movptr(rax, Address(rsp, 0));
 656 
 657   // align stack so push_CPU_state doesn't fault
 658   __ andptr(rsp, -(StackAlignmentInBytes));
 659   __ push_CPU_state();
 660   __ vzeroupper();
 661   // VM needs caller's callsite
 662   // VM needs target method
 663   // This needs to be a long call since we will relocate this adapter to
 664   // the codeBuffer and it may not reach
 665 
 666   // Allocate argument register save area
 667   if (frame::arg_reg_save_area_bytes != 0) {
 668     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 669   }
 670   __ mov(c_rarg0, rbx);
 671   __ mov(c_rarg1, rax);
 672   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 673 
 674   // De-allocate argument register save area
 675   if (frame::arg_reg_save_area_bytes != 0) {
 676     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 677   }
 678 
 679   __ vzeroupper();
 680   __ pop_CPU_state();
 681   // restore sp
 682   __ mov(rsp, r13);
 683   __ bind(L);
 684 }
 685 
 686 // For each inline type argument, sig includes the list of fields of
 687 // the inline type. This utility function computes the number of
 688 // arguments for the call if inline types are passed by reference (the
 689 // calling convention the interpreter expects).
 690 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 691   int total_args_passed = 0;
 692   if (InlineTypePassFieldsAsArgs) {
 693     for (int i = 0; i < sig_extended->length(); i++) {
 694       BasicType bt = sig_extended->at(i)._bt;
 695       if (bt == T_PRIMITIVE_OBJECT) {
 696         // In sig_extended, an inline type argument starts with:
 697         // T_PRIMITIVE_OBJECT, followed by the types of the fields of the
 698         // inline type and T_VOID to mark the end of the value
 699         // type. Inline types are flattened so, for instance, in the
 700         // case of an inline type with an int field and an inline type
 701         // field that itself has 2 fields, an int and a long:
 702         // T_PRIMITIVE_OBJECT T_INT T_PRIMITIVE_OBJECT T_INT T_LONG T_VOID (second
 703         // slot for the T_LONG) T_VOID (inner T_PRIMITIVE_OBJECT) T_VOID
 704         // (outer T_PRIMITIVE_OBJECT)
 705         total_args_passed++;
 706         int vt = 1;
 707         do {
 708           i++;
 709           BasicType bt = sig_extended->at(i)._bt;
 710           BasicType prev_bt = sig_extended->at(i-1)._bt;
 711           if (bt == T_PRIMITIVE_OBJECT) {
 712             vt++;
 713           } else if (bt == T_VOID &&
 714                      prev_bt != T_LONG &&
 715                      prev_bt != T_DOUBLE) {
 716             vt--;
 717           }
 718         } while (vt != 0);
 719       } else {
 720         total_args_passed++;
 721       }
 722     }
 723   } else {
 724     total_args_passed = sig_extended->length();
 725   }
 726   return total_args_passed;
 727 }
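// For example, with InlineTypePassFieldsAsArgs enabled the eight-entry
// sequence shown above (T_PRIMITIVE_OBJECT T_INT T_PRIMITIVE_OBJECT T_INT
// T_LONG T_VOID T_VOID T_VOID) counts as a single interpreter argument;
// with it disabled the function simply returns sig_extended->length().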
 728 
 729 
 730 static void gen_c2i_adapter_helper(MacroAssembler* masm,
 731                                    BasicType bt,
 732                                    BasicType prev_bt,
 733                                    size_t size_in_bytes,
 734                                    const VMRegPair& reg_pair,
 735                                    const Address& to,
 736                                    int extraspace,
 737                                    bool is_oop) {
 738   assert(bt != T_PRIMITIVE_OBJECT || !InlineTypePassFieldsAsArgs, "no inline type here");
 739   if (bt == T_VOID) {
 740     assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
 741     return;
 742   }
 743 
 744   // Say 4 args:
 745   // i   st_off
 746   // 0   32 T_LONG
 747   // 1   24 T_VOID
 748   // 2   16 T_OBJECT
 749   // 3    8 T_BOOL
 750   // -    0 return address
 751   //
  // However, to make things extra confusing: because we can fit a long/double in
  // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
  // leaves one slot empty and only stores to a single slot. In this case the
  // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 756 
 757   bool wide = (size_in_bytes == wordSize);
 758   VMReg r_1 = reg_pair.first();
 759   VMReg r_2 = reg_pair.second();
 760   assert(r_2->is_valid() == wide, "invalid size");
 761   if (!r_1->is_valid()) {
 762     assert(!r_2->is_valid(), "must be invalid");
 763     return;
 764   }
 765 
 766   if (!r_1->is_XMMRegister()) {
 767     Register val = rax;
 768     if (r_1->is_stack()) {
 769       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 770       __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
 771     } else {
 772       val = r_1->as_Register();
 773     }
 774     assert_different_registers(to.base(), val, rscratch1);
 775     if (is_oop) {
 776       __ push(r13);
 777       __ push(rbx);
 778       __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
 779       __ pop(rbx);
 780       __ pop(r13);
 781     } else {
 782       __ store_sized_value(to, val, size_in_bytes);
 783     }
 784   } else {
 785     if (wide) {
 786       __ movdbl(to, r_1->as_XMMRegister());
 787     } else {
 788       __ movflt(to, r_1->as_XMMRegister());
 789     }
 790   }
 791 }
 792 
 793 static void gen_c2i_adapter(MacroAssembler *masm,
 794                             const GrowableArray<SigEntry>* sig_extended,
 795                             const VMRegPair *regs,
 796                             bool requires_clinit_barrier,
 797                             address& c2i_no_clinit_check_entry,
 798                             Label& skip_fixup,
 799                             address start,
 800                             OopMapSet* oop_maps,
 801                             int& frame_complete,
 802                             int& frame_size_in_words,
 803                             bool alloc_inline_receiver) {
 804   if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) {
 805     Label L_skip_barrier;
 806     Register method = rbx;
 807 
 808     { // Bypass the barrier for non-static methods
 809       Register flags = rscratch1;
 810       __ movl(flags, Address(method, Method::access_flags_offset()));
 811       __ testl(flags, JVM_ACC_STATIC);
 812       __ jcc(Assembler::zero, L_skip_barrier); // non-static
 813     }
 814 
 815     Register klass = rscratch1;
 816     __ load_method_holder(klass, method);
 817     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
 818 
 819     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
 820 
 821     __ bind(L_skip_barrier);
 822     c2i_no_clinit_check_entry = __ pc();
 823   }
 824 
 825   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 826   bs->c2i_entry_barrier(masm);
 827 
 828   // Before we get into the guts of the C2I adapter, see if we should be here
 829   // at all.  We've come from compiled code and are attempting to jump to the
 830   // interpreter, which means the caller made a static call to get here
 831   // (vcalls always get a compiled target if there is one).  Check for a
 832   // compiled target.  If there is one, we need to patch the caller's call.
 833   patch_callers_callsite(masm);
 834 
 835   __ bind(skip_fixup);
 836 
 837   if (InlineTypePassFieldsAsArgs) {
 838     // Is there an inline type argument?
 839     bool has_inline_argument = false;
 840     for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
 841       has_inline_argument = (sig_extended->at(i)._bt == T_PRIMITIVE_OBJECT);
 842     }
 843     if (has_inline_argument) {
 844       // There is at least an inline type argument: we're coming from
 845       // compiled code so we have no buffers to back the inline types.
 846       // Allocate the buffers here with a runtime call.
 847       OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
 848 
 849       frame_complete = __ offset();
 850 
 851       __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
 852 
 853       __ mov(c_rarg0, r15_thread);
 854       __ mov(c_rarg1, rbx);
 855       __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
 856       __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
 857 
 858       oop_maps->add_gc_map((int)(__ pc() - start), map);
 859       __ reset_last_Java_frame(false);
 860 
 861       RegisterSaver::restore_live_registers(masm);
 862 
 863       Label no_exception;
 864       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
 865       __ jcc(Assembler::equal, no_exception);
 866 
 867       __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
 868       __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 869       __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 870 
 871       __ bind(no_exception);
 872 
 873       // We get an array of objects from the runtime call
 874       __ get_vm_result(rscratch2, r15_thread); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
 875       __ get_vm_result_2(rbx, r15_thread); // TODO: required to keep the callee Method live?
 876     }
 877   }
 878 
 879   // Since all args are passed on the stack, total_args_passed *
 880   // Interpreter::stackElementSize is the space we need.
 881   int total_args_passed = compute_total_args_passed_int(sig_extended);
 882   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 883 
 884   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 885 
  // The stack is aligned; keep it that way.
 887   // This is not currently needed or enforced by the interpreter, but
 888   // we might as well conform to the ABI.
 889   extraspace = align_up(extraspace, 2*wordSize);
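  // For example, with Interpreter::stackElementSize == wordSize (8 bytes),
  // three interpreter arguments need 24 bytes, rounded up to 32 here.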
 890 
 891   // set senderSP value
 892   __ lea(r13, Address(rsp, wordSize));
 893 
 894 #ifdef ASSERT
 895   __ check_stack_alignment(r13, "sender stack not aligned");
 896 #endif
 897   if (extraspace > 0) {
 898     // Pop the return address
 899     __ pop(rax);
 900 
 901     __ subptr(rsp, extraspace);
 902 
 903     // Push the return address
 904     __ push(rax);
 905 
 906     // Account for the return address location since we store it first rather
 907     // than hold it in a register across all the shuffling
 908     extraspace += wordSize;
 909   }
 910 
 911 #ifdef ASSERT
 912   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 913 #endif
 914 
 915   // Now write the args into the outgoing interpreter space
 916 
 917   // next_arg_comp is the next argument from the compiler point of
 918   // view (inline type fields are passed in registers/on the stack). In
 919   // sig_extended, an inline type argument starts with: T_PRIMITIVE_OBJECT,
 920   // followed by the types of the fields of the inline type and T_VOID
 921   // to mark the end of the inline type. ignored counts the number of
 922   // T_PRIMITIVE_OBJECT/T_VOID. next_vt_arg is the next inline type argument:
 923   // used to get the buffer for that argument from the pool of buffers
 924   // we allocated above and want to pass to the
 925   // interpreter. next_arg_int is the next argument from the
 926   // interpreter point of view (inline types are passed by reference).
 927   for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
 928        next_arg_comp < sig_extended->length(); next_arg_comp++) {
 929     assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
 930     assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
 931     BasicType bt = sig_extended->at(next_arg_comp)._bt;
 932     int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
 933     if (!InlineTypePassFieldsAsArgs || bt != T_PRIMITIVE_OBJECT) {
 934       int next_off = st_off - Interpreter::stackElementSize;
 935       const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
 936       const VMRegPair reg_pair = regs[next_arg_comp-ignored];
 937       size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
 938       gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 939                              size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
 940       next_arg_int++;
 941 #ifdef ASSERT
 942       if (bt == T_LONG || bt == T_DOUBLE) {
 943         // Overwrite the unused slot with known junk
 944         __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 945         __ movptr(Address(rsp, st_off), rax);
 946       }
 947 #endif /* ASSERT */
 948     } else {
 949       ignored++;
 950       // get the buffer from the just allocated pool of buffers
 951       int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_PRIMITIVE_OBJECT);
 952       __ load_heap_oop(r14, Address(rscratch2, index));
 953       next_vt_arg++; next_arg_int++;
 954       int vt = 1;
 955       // write fields we get from compiled code in registers/stack
 956       // slots to the buffer: we know we are done with that inline type
 957       // argument when we hit the T_VOID that acts as an end of inline
 958       // type delimiter for this inline type. Inline types are flattened
 959       // so we might encounter embedded inline types. Each entry in
 960       // sig_extended contains a field offset in the buffer.
 961       Label L_null;
 962       do {
 963         next_arg_comp++;
 964         BasicType bt = sig_extended->at(next_arg_comp)._bt;
 965         BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
 966         if (bt == T_PRIMITIVE_OBJECT) {
 967           vt++;
 968           ignored++;
 969         } else if (bt == T_VOID &&
 970                    prev_bt != T_LONG &&
 971                    prev_bt != T_DOUBLE) {
 972           vt--;
 973           ignored++;
 974         } else {
 975           int off = sig_extended->at(next_arg_comp)._offset;
 976           if (off == -1) {
 977             // Nullable inline type argument, emit null check
 978             VMReg reg = regs[next_arg_comp-ignored].first();
 979             Label L_notNull;
 980             if (reg->is_stack()) {
 981               int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 982               __ testb(Address(rsp, ld_off), 1);
 983             } else {
 984               __ testb(reg->as_Register(), 1);
 985             }
 986             __ jcc(Assembler::notZero, L_notNull);
 987             __ movptr(Address(rsp, st_off), 0);
 988             __ jmp(L_null);
 989             __ bind(L_notNull);
 990             continue;
 991           }
 992           assert(off > 0, "offset in object should be positive");
 993           size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
 994           bool is_oop = is_reference_type(bt);
 995           gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 996                                  size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
 997         }
 998       } while (vt != 0);
 999       // pass the buffer to the interpreter
1000       __ movptr(Address(rsp, st_off), r14);
1001       __ bind(L_null);
1002     }
1003   }
1004 
1005   // Schedule the branch target address early.
1006   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
1007   __ jmp(rcx);
1008 }
1009 
1010 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
1011                         address code_start, address code_end,
1012                         Label& L_ok) {
1013   Label L_fail;
1014   __ lea(temp_reg, ExternalAddress(code_start));
1015   __ cmpptr(pc_reg, temp_reg);
1016   __ jcc(Assembler::belowEqual, L_fail);
1017   __ lea(temp_reg, ExternalAddress(code_end));
1018   __ cmpptr(pc_reg, temp_reg);
1019   __ jcc(Assembler::below, L_ok);
1020   __ bind(L_fail);
1021 }
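// In other words: control reaches L_ok iff code_start < pc_reg < code_end
// (unsigned comparison); otherwise it falls through at L_fail and the caller
// emits the error handling.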
1022 
1023 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
1024                                     int comp_args_on_stack,
1025                                     const GrowableArray<SigEntry>* sig,
1026                                     const VMRegPair *regs) {
1027 
1028   // Note: r13 contains the senderSP on entry. We must preserve it since
  // we may do an i2c -> c2i transition if we lose a race where compiled
1030   // code goes non-entrant while we get args ready.
1031   // In addition we use r13 to locate all the interpreter args as
1032   // we must align the stack to 16 bytes on an i2c entry else we
1033   // lose alignment we expect in all compiled code and register
1034   // save code can segv when fxsave instructions find improperly
1035   // aligned stack pointer.
1036 
1037   // Adapters can be frameless because they do not require the caller
1038   // to perform additional cleanup work, such as correcting the stack pointer.
1039   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
1040   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
1041   // even if a callee has modified the stack pointer.
1042   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
1043   // routinely repairs its caller's stack pointer (from sender_sp, which is set
1044   // up via the senderSP register).
1045   // In other words, if *either* the caller or callee is interpreted, we can
1046   // get the stack pointer repaired after a call.
1047   // This is why c2i and i2c adapters cannot be indefinitely composed.
1048   // In particular, if a c2i adapter were to somehow call an i2c adapter,
1049   // both caller and callee would be compiled methods, and neither would
1050   // clean up the stack pointer changes performed by the two adapters.
1051   // If this happens, control eventually transfers back to the compiled
1052   // caller, but with an uncorrected stack, causing delayed havoc.
1053 
1054   if (VerifyAdapterCalls &&
1055       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
1056     // So, let's test for cascading c2i/i2c adapters right now.
1057     //  assert(Interpreter::contains($return_addr) ||
1058     //         StubRoutines::contains($return_addr),
1059     //         "i2c adapter must return to an interpreter frame");
1060     __ block_comment("verify_i2c { ");
1061     // Pick up the return address
1062     __ movptr(rax, Address(rsp, 0));
1063     Label L_ok;
1064     if (Interpreter::code() != NULL)
1065       range_check(masm, rax, r11,
1066                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
1067                   L_ok);
1068     if (StubRoutines::code1() != NULL)
1069       range_check(masm, rax, r11,
1070                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
1071                   L_ok);
1072     if (StubRoutines::code2() != NULL)
1073       range_check(masm, rax, r11,
1074                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
1075                   L_ok);
1076     const char* msg = "i2c adapter must return to an interpreter frame";
1077     __ block_comment(msg);
1078     __ stop(msg);
1079     __ bind(L_ok);
    __ block_comment("} verify_i2c ");
1081   }
1082 
1083   // Must preserve original SP for loading incoming arguments because
1084   // we need to align the outgoing SP for compiled code.
1085   __ movptr(r11, rsp);
1086 
1087   // Pick up the return address
1088   __ pop(rax);
1089 
1090   // Convert 4-byte c2 stack slots to words.
1091   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
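  // For example, 5 outgoing stack slots (20 bytes) round up to 24 bytes, i.e. 3 words.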
1092 
1093   if (comp_args_on_stack) {
1094     __ subptr(rsp, comp_words_on_stack * wordSize);
1095   }
1096 
1097   // Ensure compiled code always sees stack at proper alignment
1098   __ andptr(rsp, -16);
1099 
  // Push the return address and misalign the stack so that the youngest frame sees
  // the same layout it would see right after a call instruction
1102   __ push(rax);
1103 
1104   // Put saved SP in another register
1105   const Register saved_sp = rax;
1106   __ movptr(saved_sp, r11);
1107 
1108   // Will jump to the compiled code just as if compiled code was doing it.
1109   // Pre-load the register-jump target early, to schedule it better.
1110   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1111 
1112 #if INCLUDE_JVMCI
1113   if (EnableJVMCI) {
1114     // check if this call should be routed towards a specific entry point
1115     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1116     Label no_alternative_target;
1117     __ jcc(Assembler::equal, no_alternative_target);
1118     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1119     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1120     __ bind(no_alternative_target);
1121   }
1122 #endif // INCLUDE_JVMCI
1123 
1124   int total_args_passed = sig->length();
1125 
1126   // Now generate the shuffle code.  Pick up all register args and move the
1127   // rest through the floating point stack top.
1128   for (int i = 0; i < total_args_passed; i++) {
1129     BasicType bt = sig->at(i)._bt;
1130     assert(bt != T_PRIMITIVE_OBJECT, "i2c adapter doesn't unpack inline type args");
1131     if (bt == T_VOID) {
1132       // Longs and doubles are passed in native word order, but misaligned
1133       // in the 32-bit build.
1134       BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1135       assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1136       continue;
1137     }
1138 
1139     // Pick up 0, 1 or 2 words from SP+offset.
1140 
1141     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1142             "scrambled load targets?");
1143     // Load in argument order going down.
1144     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1145     // Point to interpreter value (vs. tag)
1146     int next_off = ld_off - Interpreter::stackElementSize;
1147     //
1148     //
1149     //
1150     VMReg r_1 = regs[i].first();
1151     VMReg r_2 = regs[i].second();
1152     if (!r_1->is_valid()) {
1153       assert(!r_2->is_valid(), "");
1154       continue;
1155     }
1156     if (r_1->is_stack()) {
1157       // Convert stack slot to an SP offset (+ wordSize to account for return address )
1158       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1159 
1160       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
      // and if we end up going thru a c2i because of a miss, a reasonable value of r13
1162       // will be generated.
1163       if (!r_2->is_valid()) {
1164         // sign extend???
1165         __ movl(r13, Address(saved_sp, ld_off));
1166         __ movptr(Address(rsp, st_off), r13);
1167       } else {
1168         //
1169         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
1171         // So we must adjust where to pick up the data to match the interpreter.
1172         //
1173         // Interpreter local[n] == MSW, local[n+1] == LSW however locals
1174         // are accessed as negative so LSW is at LOW address
1175 
1176         // ld_off is MSW so get LSW
1177         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1178                            next_off : ld_off;
1179         __ movq(r13, Address(saved_sp, offset));
1180         // st_off is LSW (i.e. reg.first())
1181         __ movq(Address(rsp, st_off), r13);
1182       }
1183     } else if (r_1->is_Register()) {  // Register argument
1184       Register r = r_1->as_Register();
1185       assert(r != rax, "must be different");
1186       if (r_2->is_valid()) {
1187         //
1188         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
        // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
1190         // So we must adjust where to pick up the data to match the interpreter.
1191 
1192         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1193                            next_off : ld_off;
1194 
1195         // this can be a misaligned move
1196         __ movq(r, Address(saved_sp, offset));
1197       } else {
1198         // sign extend and use a full word?
1199         __ movl(r, Address(saved_sp, ld_off));
1200       }
1201     } else {
1202       if (!r_2->is_valid()) {
1203         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1204       } else {
1205         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1206       }
1207     }
1208   }
1209 
1210   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1211 
1212   // 6243940 We might end up in handle_wrong_method if
1213   // the callee is deoptimized as we race thru here. If that
1214   // happens we don't want to take a safepoint because the
1215   // caller frame will look interpreted and arguments are now
1216   // "compiled" so it is much better to make this transition
1217   // invisible to the stack walking code. Unfortunately if
1218   // we try and find the callee by normal means a safepoint
1219   // is possible. So we stash the desired callee in the thread
  // and the VM will find it there should this case occur.
1221 
1222   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1223 
  // Put the Method* where a c2i adapter would expect it, should we end up there.
  // This is only needed because c2 resolve stubs return the Method* as a result
  // in rax.
1227   __ mov(rax, rbx);
1228   __ jmp(r11);
1229 }
1230 
1231 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1232   Label ok;
1233 
1234   Register holder = rax;
1235   Register receiver = j_rarg0;
1236   Register temp = rbx;
1237 
1238   __ load_klass(temp, receiver, rscratch1);
1239   __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
1240   __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
1241   __ jcc(Assembler::equal, ok);
1242   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1243 
1244   __ bind(ok);
  // The method might have been compiled since the call site was patched to
  // interpreted; if that is the case, treat it as a miss so we can get
  // the call site corrected.
1248   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1249   __ jcc(Assembler::equal, skip_fixup);
1250   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1251 }
1252 
1253 // ---------------------------------------------------------------
1254 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1255                                                             int comp_args_on_stack,
1256                                                             const GrowableArray<SigEntry>* sig,
1257                                                             const VMRegPair* regs,
1258                                                             const GrowableArray<SigEntry>* sig_cc,
1259                                                             const VMRegPair* regs_cc,
1260                                                             const GrowableArray<SigEntry>* sig_cc_ro,
1261                                                             const VMRegPair* regs_cc_ro,
1262                                                             AdapterFingerPrint* fingerprint,
1263                                                             AdapterBlob*& new_adapter,
1264                                                             bool allocate_code_blob) {
1265   address i2c_entry = __ pc();
1266   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1267 
1268   // -------------------------------------------------------------------------
1269   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1270   // to the interpreter.  The args start out packed in the compiled layout.  They
1271   // need to be unpacked into the interpreter layout.  This will almost always
1272   // require some stack space.  We grow the current (compiled) stack, then repack
  // the args.  We finally end in a jump to the generic interpreter entry point.
1274   // On exit from the interpreter, the interpreter will restore our SP (lest the
1275   // compiled code, which relies solely on SP and not RBP, get sick).
1276 
1277   address c2i_unverified_entry        = __ pc();
1278   address c2i_unverified_inline_entry = __ pc();
1279   Label skip_fixup;
1280 
1281   gen_inline_cache_check(masm, skip_fixup);
1282 
1283   OopMapSet* oop_maps = new OopMapSet();
1284   int frame_complete = CodeOffsets::frame_never_safe;
1285   int frame_size_in_words = 0;
1286 
1287   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1288   address c2i_no_clinit_check_entry = NULL;
1289   address c2i_inline_ro_entry = __ pc();
1290   if (regs_cc != regs_cc_ro) {
1291     // No class init barrier needed because method is guaranteed to be non-static
1292     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, c2i_no_clinit_check_entry,
1293                     skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1294     skip_fixup.reset();
1295   }
1296 
1297   // Scalarized c2i adapter
1298   address c2i_entry        = __ pc();
1299   address c2i_inline_entry = __ pc();
1300   gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1301                   skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);
1302 
1303   // Non-scalarized c2i adapter
1304   if (regs != regs_cc) {
1305     c2i_unverified_inline_entry = __ pc();
1306     Label inline_entry_skip_fixup;
1307     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1308 
1309     c2i_inline_entry = __ pc();
1310     gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1311                     inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1312   }
1313 
1314   __ flush();
1315 
1316   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1317   // the GC knows about the location of oop argument locations passed to the c2i adapter.
1318   if (allocate_code_blob) {
1319     bool caller_must_gc_arguments = (regs != regs_cc);
1320     new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1321   }
1322 
1323   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry, c2i_unverified_inline_entry, c2i_no_clinit_check_entry);
1324 }
1325 
1326 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1327                                          VMRegPair *regs,
1328                                          VMRegPair *regs2,
1329                                          int total_args_passed) {
1330   assert(regs2 == NULL, "not needed on x86");
// We return the number of VMRegImpl stack slots we need to reserve for all
// the arguments, NOT counting out_preserve_stack_slots.
1333 
1334 // NOTE: These arrays will have to change when c1 is ported
1335 #ifdef _WIN64
1336     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1337       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1338     };
1339     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1340       c_farg0, c_farg1, c_farg2, c_farg3
1341     };
1342 #else
1343     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1344       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1345     };
1346     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1347       c_farg0, c_farg1, c_farg2, c_farg3,
1348       c_farg4, c_farg5, c_farg6, c_farg7
1349     };
1350 #endif // _WIN64
1351 
1352 
1353     uint int_args = 0;
1354     uint fp_args = 0;
1355     uint stk_args = 0; // inc by 2 each time
1356 
1357     for (int i = 0; i < total_args_passed; i++) {
1358       switch (sig_bt[i]) {
1359       case T_BOOLEAN:
1360       case T_CHAR:
1361       case T_BYTE:
1362       case T_SHORT:
1363       case T_INT:
1364         if (int_args < Argument::n_int_register_parameters_c) {
1365           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1366 #ifdef _WIN64
1367           fp_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1369           stk_args += 2;
1370 #endif
1371         } else {
1372           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1373           stk_args += 2;
1374         }
1375         break;
1376       case T_LONG:
1377         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1378         // fall through
1379       case T_OBJECT:
1380       case T_ARRAY:
1381       case T_PRIMITIVE_OBJECT:
1382       case T_ADDRESS:
1383       case T_METADATA:
1384         if (int_args < Argument::n_int_register_parameters_c) {
1385           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1386 #ifdef _WIN64
1387           fp_args++;
1388           stk_args += 2;
1389 #endif
1390         } else {
1391           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1392           stk_args += 2;
1393         }
1394         break;
1395       case T_FLOAT:
1396         if (fp_args < Argument::n_float_register_parameters_c) {
1397           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1398 #ifdef _WIN64
1399           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1401           stk_args += 2;
1402 #endif
1403         } else {
1404           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1405           stk_args += 2;
1406         }
1407         break;
1408       case T_DOUBLE:
1409         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1410         if (fp_args < Argument::n_float_register_parameters_c) {
1411           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1412 #ifdef _WIN64
1413           int_args++;
          // Allocate slots for the callee to stuff register args on the stack.
1415           stk_args += 2;
1416 #endif
1417         } else {
1418           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1419           stk_args += 2;
1420         }
1421         break;
1422       case T_VOID: // Halves of longs and doubles
1423         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1424         regs[i].set_bad();
1425         break;
1426       default:
1427         ShouldNotReachHere();
1428         break;
1429       }
1430     }
1431 #ifdef _WIN64
  // The Windows ABI requires that we always allocate enough stack space
  // for 4 64-bit registers to be stored down.
1434   if (stk_args < 8) {
1435     stk_args = 8;
1436   }
1437 #endif // _WIN64
1438 
1439   return stk_args;
1440 }
1441 
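// Vector arguments are passed entirely in XMM/YMM/ZMM registers (xmm0..xmm31); each
// argument occupies num_bits/32 consecutive 32-bit VMReg slots of its register, so no
// stack slots are consumed and this returns 0.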
1442 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1443                                              uint num_bits,
1444                                              uint total_args_passed) {
1445   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1446          "only certain vector sizes are supported for now");
1447 
1448   static const XMMRegister VEC_ArgReg[32] = {
1449      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1450      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1451     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1452     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1453   };
1454 
1455   uint stk_args = 0;
1456   uint fp_args = 0;
1457 
1458   for (uint i = 0; i < total_args_passed; i++) {
1459     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1460     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1461     regs[i].set_pair(vmreg->next(next_val), vmreg);
1462   }
1463 
1464   return stk_args;
1465 }
1466 
1467 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space below the frame pointer,
  // which by this time is free to use.
1470   switch (ret_type) {
1471   case T_FLOAT:
1472     __ movflt(Address(rbp, -wordSize), xmm0);
1473     break;
1474   case T_DOUBLE:
1475     __ movdbl(Address(rbp, -wordSize), xmm0);
1476     break;
1477   case T_VOID:  break;
1478   default: {
1479     __ movptr(Address(rbp, -wordSize), rax);
1480     }
1481   }
1482 }
1483 
1484 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
  // We always ignore the frame_slots arg and just use the space below the frame pointer,
  // which by this time is free to use.
1487   switch (ret_type) {
1488   case T_FLOAT:
1489     __ movflt(xmm0, Address(rbp, -wordSize));
1490     break;
1491   case T_DOUBLE:
1492     __ movdbl(xmm0, Address(rbp, -wordSize));
1493     break;
1494   case T_VOID:  break;
1495   default: {
1496     __ movptr(rax, Address(rbp, -wordSize));
1497     }
1498   }
1499 }
1500 
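// Spill any register-resident (GPR or XMM) outgoing arguments to the stack so that they
// survive a runtime call; restore_args below pops them back in the reverse order.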
1501 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1502     for ( int i = first_arg ; i < arg_count ; i++ ) {
1503       if (args[i].first()->is_Register()) {
1504         __ push(args[i].first()->as_Register());
1505       } else if (args[i].first()->is_XMMRegister()) {
1506         __ subptr(rsp, 2*wordSize);
1507         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1508       }
1509     }
1510 }
1511 
1512 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1513     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1514       if (args[i].first()->is_Register()) {
1515         __ pop(args[i].first()->as_Register());
1516       } else if (args[i].first()->is_XMMRegister()) {
1517         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1518         __ addptr(rsp, 2*wordSize);
1519       }
1520     }
1521 }
1522 
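// Under -XX:+VerifyOops, check every reference-typed argument, whether it lives in a
// register or in a stack slot.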
1523 static void verify_oop_args(MacroAssembler* masm,
1524                             const methodHandle& method,
1525                             const BasicType* sig_bt,
1526                             const VMRegPair* regs) {
1527   Register temp_reg = rbx;  // not part of any compiled calling seq
1528   if (VerifyOops) {
1529     for (int i = 0; i < method->size_of_parameters(); i++) {
1530       if (is_reference_type(sig_bt[i])) {
1531         VMReg r = regs[i].first();
1532         assert(r->is_valid(), "bad oop arg");
1533         if (r->is_stack()) {
1534           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1535           __ verify_oop(temp_reg);
1536         } else {
1537           __ verify_oop(r->as_Register());
1538         }
1539       }
1540     }
1541   }
1542 }
1543 
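// Debug-only sanity check: assert that an enterSpecial argument was passed in the
// register that the continuation-enter stub hard-codes below.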
1544 static void check_continuation_enter_argument(VMReg actual_vmreg,
1545                                               Register expected_reg,
1546                                               const char* name) {
1547   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1548   assert(actual_vmreg->as_Register() == expected_reg,
1549          "%s is in unexpected register: %s instead of %s",
1550          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1551 }
1552 
1553 
1554 //---------------------------- continuation_enter_setup ---------------------------
1555 //
1556 // Arguments:
1557 //   None.
1558 //
1559 // Results:
1560 //   rsp: pointer to blank ContinuationEntry
1561 //
1562 // Kills:
1563 //   rax
1564 //
1565 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1566   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1567   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1568   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1569 
1570   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1571   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1572 
1573   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1574   OopMap* map = new OopMap(frame_size, 0);
1575 
1576   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1577   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1578   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1579 
1580   return map;
1581 }
1582 
1583 //---------------------------- fill_continuation_entry ---------------------------
1584 //
1585 // Arguments:
1586 //   rsp: pointer to blank Continuation entry
1587 //   reg_cont_obj: pointer to the continuation
1588 //   reg_flags: flags
1589 //
1590 // Results:
1591 //   rsp: pointer to filled out ContinuationEntry
1592 //
1593 // Kills:
1594 //   rax
1595 //
1596 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1597   assert_different_registers(rax, reg_cont_obj, reg_flags);
1598 #ifdef ASSERT
1599   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1600 #endif
1601   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1602   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1603   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1604   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1605   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1606 
1607   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1608   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1609   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1610   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1611 
1612   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1613   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1614 }
1615 
1616 //---------------------------- continuation_enter_cleanup ---------------------------
1617 //
1618 // Arguments:
1619 //   rsp: pointer to the ContinuationEntry
1620 //
1621 // Results:
1622 //   rsp: pointer to the spilled rbp in the entry frame
1623 //
1624 // Kills:
1625 //   rbx
1626 //
static void continuation_enter_cleanup(MacroAssembler* masm) {
1628 #ifdef ASSERT
1629   Label L_good_sp;
1630   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1631   __ jcc(Assembler::equal, L_good_sp);
1632   __ stop("Incorrect rsp at continuation_enter_cleanup");
1633   __ bind(L_good_sp);
1634 #endif
1635 
1636   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1637   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1638   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1639   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1640 
1641   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1642   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1643   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1644 }
1645 
1646 static void gen_continuation_enter(MacroAssembler* masm,
1647                                    const VMRegPair* regs,
1648                                    int& exception_offset,
1649                                    OopMapSet* oop_maps,
1650                                    int& frame_complete,
1651                                    int& stack_slots,
1652                                    int& interpreted_entry_offset,
1653                                    int& compiled_entry_offset) {
1654 
1655   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1656   int pos_cont_obj   = 0;
1657   int pos_is_cont    = 1;
1658   int pos_is_virtual = 2;
1659 
  // The platform-specific calling convention may present the arguments in various registers.
  // To simplify the rest of the code, we expect the arguments to reside in these known
  // registers, and we additionally check the placement here in case the calling convention
  // ever changes.
1664   Register reg_cont_obj   = c_rarg1;
1665   Register reg_is_cont    = c_rarg2;
1666   Register reg_is_virtual = c_rarg3;
1667 
1668   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1669   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1670   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1671 
1672   // Utility methods kill rax, make sure there are no collisions
1673   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1674 
1675   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1676                          relocInfo::static_call_type);
1677 
1678   address start = __ pc();
1679 
1680   Label L_thaw, L_exit;
1681 
1682   // i2i entry used at interp_only_mode only
1683   interpreted_entry_offset = __ pc() - start;
1684   {
1685 #ifdef ASSERT
1686     Label is_interp_only;
1687     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1688     __ jcc(Assembler::notEqual, is_interp_only);
1689     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1690     __ bind(is_interp_only);
1691 #endif
1692 
1693     __ pop(rax); // return address
1694     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1695     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1696     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1697     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1698     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1699     __ push(rax); // return address
1700     __ push_cont_fastpath();
1701 
1702     __ enter();
1703 
1704     stack_slots = 2; // will be adjusted in setup
1705     OopMap* map = continuation_enter_setup(masm, stack_slots);
    // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
    // That is okay: at the very worst we miss an async sample, and we are in interp_only_mode anyway.
1708 
1709     __ verify_oop(reg_cont_obj);
1710 
1711     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1712 
1713     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1714     __ testptr(reg_is_cont, reg_is_cont);
1715     __ jcc(Assembler::notZero, L_thaw);
1716 
1717     // --- Resolve path
1718 
1719     // Make sure the call is patchable
1720     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1721     // Emit stub for static call
1722     CodeBuffer* cbuf = masm->code_section()->outer();
1723     address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1724     if (stub == nullptr) {
1725       fatal("CodeCache is full at gen_continuation_enter");
1726     }
1727     __ call(resolve);
1728     oop_maps->add_gc_map(__ pc() - start, map);
1729     __ post_call_nop();
1730 
1731     __ jmp(L_exit);
1732   }
1733 
1734   // compiled entry
1735   __ align(CodeEntryAlignment);
1736   compiled_entry_offset = __ pc() - start;
1737   __ enter();
1738 
1739   stack_slots = 2; // will be adjusted in setup
1740   OopMap* map = continuation_enter_setup(masm, stack_slots);
1741 
1742   // Frame is now completed as far as size and linkage.
1743   frame_complete = __ pc() - start;
1744 
1745   __ verify_oop(reg_cont_obj);
1746 
1747   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1748 
1749   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1750   __ testptr(reg_is_cont, reg_is_cont);
1751   __ jccb(Assembler::notZero, L_thaw);
1752 
1753   // --- call Continuation.enter(Continuation c, boolean isContinue)
1754 
1755   // Make sure the call is patchable
1756   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1757 
1758   // Emit stub for static call
1759   CodeBuffer* cbuf = masm->code_section()->outer();
1760   address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1761   if (stub == nullptr) {
1762     fatal("CodeCache is full at gen_continuation_enter");
1763   }
1764 
1765   // The call needs to be resolved. There's a special case for this in
1766   // SharedRuntime::find_callee_info_helper() which calls
1767   // LinkResolver::resolve_continuation_enter() which resolves the call to
1768   // Continuation.enter(Continuation c, boolean isContinue).
1769   __ call(resolve);
1770 
1771   oop_maps->add_gc_map(__ pc() - start, map);
1772   __ post_call_nop();
1773 
1774   __ jmpb(L_exit);
1775 
1776   // --- Thawing path
1777 
1778   __ bind(L_thaw);
1779 
1780   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1781 
1782   ContinuationEntry::_return_pc_offset = __ pc() - start;
1783   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1784   __ post_call_nop();
1785 
1786   // --- Normal exit (resolve/thawing)
1787 
1788   __ bind(L_exit);
1789 
1790   continuation_enter_cleanup(masm);
1791   __ pop(rbp);
1792   __ ret(0);
1793 
1794   // --- Exception handling path
1795 
1796   exception_offset = __ pc() - start;
1797 
1798   continuation_enter_cleanup(masm);
1799   __ pop(rbp);
1800 
1801   __ movptr(c_rarg0, r15_thread);
1802   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1803 
1804   // rax still holds the original exception oop, save it before the call
1805   __ push(rax);
1806 
1807   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1808   __ movptr(rbx, rax);
1809 
1810   // Continue at exception handler:
1811   //   rax: exception oop
1812   //   rbx: exception handler
1813   //   rdx: exception pc
1814   __ pop(rax);
1815   __ verify_oop(rax);
1816   __ pop(rdx);
1817   __ jmp(rbx);
1818 }
1819 
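//---------------------------- gen_continuation_yield ---------------------------
//
// Sets up a minimal two-slot frame and calls Continuation::freeze_entry(). If the
// freeze succeeds (rax == 0) we unwind the ContinuationEntry and return out of the
// enterSpecial frame; if the freeze fails (e.g. the continuation is pinned) we
// simply return to the yield caller.
//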
1820 static void gen_continuation_yield(MacroAssembler* masm,
1821                                    const VMRegPair* regs,
1822                                    OopMapSet* oop_maps,
1823                                    int& frame_complete,
1824                                    int& stack_slots,
1825                                    int& compiled_entry_offset) {
1826   enum layout {
1827     rbp_off,
1828     rbpH_off,
1829     return_off,
1830     return_off2,
1831     framesize // inclusive of return address
1832   };
  stack_slots = framesize / VMRegImpl::slots_per_word;
1834   assert(stack_slots == 2, "recheck layout");
1835 
1836   address start = __ pc();
1837   compiled_entry_offset = __ pc() - start;
1838   __ enter();
1839   address the_pc = __ pc();
1840 
1841   frame_complete = the_pc - start;
1842 
  // This nop must be exactly at the PC we push into the frame info.
  // We use this nop for fast CodeBlob lookup, so associate the OopMap
  // with it right away.
1846   __ post_call_nop();
1847   OopMap* map = new OopMap(framesize, 1);
1848   oop_maps->add_gc_map(frame_complete, map);
1849 
1850   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1851   __ movptr(c_rarg0, r15_thread);
1852   __ movptr(c_rarg1, rsp);
1853   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1854   __ reset_last_Java_frame(true);
1855 
1856   Label L_pinned;
1857 
1858   __ testptr(rax, rax);
1859   __ jcc(Assembler::notZero, L_pinned);
1860 
1861   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1862   continuation_enter_cleanup(masm);
1863   __ pop(rbp);
1864   __ ret(0);
1865 
1866   __ bind(L_pinned);
1867 
1868   // Pinned, return to caller
1869   __ leave();
1870   __ ret(0);
1871 }
1872 
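// Dispatch for method handle intrinsics: locate the receiver and/or the trailing
// MemberName / NativeEntryPoint argument in the compiled calling convention and hand
// off to MethodHandles::generate_method_handle_dispatch.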
1873 static void gen_special_dispatch(MacroAssembler* masm,
1874                                  const methodHandle& method,
1875                                  const BasicType* sig_bt,
1876                                  const VMRegPair* regs) {
1877   verify_oop_args(masm, method, sig_bt, regs);
1878   vmIntrinsics::ID iid = method->intrinsic_id();
1879 
1880   // Now write the args into the outgoing interpreter space
1881   bool     has_receiver   = false;
1882   Register receiver_reg   = noreg;
1883   int      member_arg_pos = -1;
1884   Register member_reg     = noreg;
1885   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1886   if (ref_kind != 0) {
1887     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1888     member_reg = rbx;  // known to be free at this point
1889     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1890   } else if (iid == vmIntrinsics::_invokeBasic) {
1891     has_receiver = true;
1892   } else if (iid == vmIntrinsics::_linkToNative) {
1893     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1894     member_reg = rbx;  // known to be free at this point
1895   } else {
1896     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1897   }
1898 
1899   if (member_reg != noreg) {
1900     // Load the member_arg into register, if necessary.
1901     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1902     VMReg r = regs[member_arg_pos].first();
1903     if (r->is_stack()) {
1904       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1905     } else {
1906       // no data motion is needed
1907       member_reg = r->as_Register();
1908     }
1909   }
1910 
1911   if (has_receiver) {
1912     // Make sure the receiver is loaded into a register.
1913     assert(method->size_of_parameters() > 0, "oob");
1914     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1915     VMReg r = regs[0].first();
1916     assert(r->is_valid(), "bad receiver arg");
1917     if (r->is_stack()) {
1918       // Porting note:  This assumes that compiled calling conventions always
1919       // pass the receiver oop in a register.  If this is not true on some
1920       // platform, pick a temp and load the receiver from stack.
1921       fatal("receiver always in a register");
1922       receiver_reg = j_rarg0;  // known to be free at this point
1923       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1924     } else {
1925       // no data motion is needed
1926       receiver_reg = r->as_Register();
1927     }
1928   }
1929 
1930   // Figure out which address we are really jumping to:
1931   MethodHandles::generate_method_handle_dispatch(masm, iid,
1932                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1933 }
1934 
1935 // ---------------------------------------------------------------------------
1936 // Generate a native wrapper for a given method.  The method takes arguments
1937 // in the Java compiled code convention, marshals them to the native
1938 // convention (handlizes oops, etc), transitions to native, makes the call,
1939 // returns to java state (possibly blocking), unhandlizes any result and
1940 // returns.
1941 //
// Critical native functions are a shorthand for the use of
// GetPrimitiveArrayCritical and disallow the use of any other JNI
// functions.  The wrapper is expected to unpack the arguments before
// passing them to the callee. Critical native functions leave the state _in_Java,
// since they cannot stop for GC.
// Some other parts of JNI setup are skipped, like the tear down of the JNI handle
// block and the check for pending exceptions, since it is impossible for them
// to be thrown.
1950 //
1951 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1952                                                 const methodHandle& method,
1953                                                 int compile_id,
1954                                                 BasicType* in_sig_bt,
1955                                                 VMRegPair* in_regs,
1956                                                 BasicType ret_type) {
1957   if (method->is_continuation_native_intrinsic()) {
1958     int exception_offset = -1;
1959     OopMapSet* oop_maps = new OopMapSet();
1960     int frame_complete = -1;
1961     int stack_slots = -1;
1962     int interpreted_entry_offset = -1;
1963     int vep_offset = -1;
1964     if (method->is_continuation_enter_intrinsic()) {
1965       gen_continuation_enter(masm,
1966                              in_regs,
1967                              exception_offset,
1968                              oop_maps,
1969                              frame_complete,
1970                              stack_slots,
1971                              interpreted_entry_offset,
1972                              vep_offset);
1973     } else if (method->is_continuation_yield_intrinsic()) {
1974       gen_continuation_yield(masm,
1975                              in_regs,
1976                              oop_maps,
1977                              frame_complete,
1978                              stack_slots,
1979                              vep_offset);
1980     } else {
1981       guarantee(false, "Unknown Continuation native intrinsic");
1982     }
1983 
1984 #ifdef ASSERT
1985     if (method->is_continuation_enter_intrinsic()) {
1986       assert(interpreted_entry_offset != -1, "Must be set");
1987       assert(exception_offset != -1,         "Must be set");
1988     } else {
1989       assert(interpreted_entry_offset == -1, "Must be unset");
1990       assert(exception_offset == -1,         "Must be unset");
1991     }
1992     assert(frame_complete != -1,    "Must be set");
1993     assert(stack_slots != -1,       "Must be set");
1994     assert(vep_offset != -1,        "Must be set");
1995 #endif
1996 
1997     __ flush();
1998     nmethod* nm = nmethod::new_native_nmethod(method,
1999                                               compile_id,
2000                                               masm->code(),
2001                                               vep_offset,
2002                                               frame_complete,
2003                                               stack_slots,
2004                                               in_ByteSize(-1),
2005                                               in_ByteSize(-1),
2006                                               oop_maps,
2007                                               exception_offset);
2008     if (method->is_continuation_enter_intrinsic()) {
2009       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2010     } else if (method->is_continuation_yield_intrinsic()) {
2011       _cont_doYield_stub = nm;
2012     }
2013     return nm;
2014   }
2015 
2016   if (method->is_method_handle_intrinsic()) {
2017     vmIntrinsics::ID iid = method->intrinsic_id();
2018     intptr_t start = (intptr_t)__ pc();
2019     int vep_offset = ((intptr_t)__ pc()) - start;
2020     gen_special_dispatch(masm,
2021                          method,
2022                          in_sig_bt,
2023                          in_regs);
2024     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2025     __ flush();
2026     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2027     return nmethod::new_native_nmethod(method,
2028                                        compile_id,
2029                                        masm->code(),
2030                                        vep_offset,
2031                                        frame_complete,
2032                                        stack_slots / VMRegImpl::slots_per_word,
2033                                        in_ByteSize(-1),
2034                                        in_ByteSize(-1),
2035                                        (OopMapSet*)NULL);
2036   }
2037   address native_func = method->native_function();
2038   assert(native_func != NULL, "must have function");
2039 
2040   // An OopMap for lock (and class if static)
2041   OopMapSet *oop_maps = new OopMapSet();
2042   intptr_t start = (intptr_t)__ pc();
2043 
  // We have received a description of where all the Java args are located
  // on entry to the wrapper. We need to convert these args to where
  // the JNI function will expect them. To figure out where they go
  // we convert the Java signature to a C signature by inserting
  // the hidden arguments as arg[0] and possibly arg[1] (static method).
2049 
2050   const int total_in_args = method->size_of_parameters();
2051   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2052 
2053   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2054   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2055   BasicType* in_elem_bt = NULL;
2056 
2057   int argc = 0;
2058   out_sig_bt[argc++] = T_ADDRESS;
2059   if (method->is_static()) {
2060     out_sig_bt[argc++] = T_OBJECT;
2061   }
2062 
2063   for (int i = 0; i < total_in_args ; i++ ) {
2064     out_sig_bt[argc++] = in_sig_bt[i];
2065   }
2066 
2067   // Now figure out where the args must be stored and how much stack space
2068   // they require.
2069   int out_arg_slots;
2070   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
2071 
2072   // Compute framesize for the wrapper.  We need to handlize all oops in
2073   // incoming registers
2074 
2075   // Calculate the total number of stack slots we will need.
2076 
2077   // First count the abi requirement plus all of the outgoing args
2078   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2079 
2080   // Now the space for the inbound oop handle area
2081   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
2082 
2083   int oop_handle_offset = stack_slots;
2084   stack_slots += total_save_slots;
2085 
2086   // Now any space we need for handlizing a klass if static method
2087 
2088   int klass_slot_offset = 0;
2089   int klass_offset = -1;
2090   int lock_slot_offset = 0;
2091   bool is_static = false;
2092 
2093   if (method->is_static()) {
2094     klass_slot_offset = stack_slots;
2095     stack_slots += VMRegImpl::slots_per_word;
2096     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2097     is_static = true;
2098   }
2099 
2100   // Plus a lock if needed
2101 
2102   if (method->is_synchronized()) {
2103     lock_slot_offset = stack_slots;
2104     stack_slots += VMRegImpl::slots_per_word;
2105   }
2106 
  // Now a place (+2 slots) to save return values or temps during shuffling,
  // plus 4 slots for the return address (which we own) and the saved rbp
2109   stack_slots += 6;
2110 
2111   // Ok The space we have allocated will look like:
2112   //
2113   //
2114   // FP-> |                     |
2115   //      |---------------------|
2116   //      | 2 slots for moves   |
2117   //      |---------------------|
2118   //      | lock box (if sync)  |
2119   //      |---------------------| <- lock_slot_offset
2120   //      | klass (if static)   |
2121   //      |---------------------| <- klass_slot_offset
2122   //      | oopHandle area      |
2123   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2124   //      | outbound memory     |
2125   //      | based arguments     |
2126   //      |                     |
2127   //      |---------------------|
2128   //      |                     |
2129   // SP-> | out_preserved_slots |
2130   //
2131   //
2132 
2133 
  // Now compute the actual number of stack words we need, rounding to keep
  // the stack properly aligned.
2136   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2137 
2138   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2139 
2140   // First thing make an ic check to see if we should even be here
2141 
  // We are free to use all registers as temps without saving and
  // restoring them, except for rbp. rbp is the only callee-saved register
  // as far as the interpreter and the compiler(s) are concerned.
2145 
2146 
2147   const Register ic_reg = rax;
2148   const Register receiver = j_rarg0;
2149 
2150   Label hit;
2151   Label exception_pending;
2152 
2153   assert_different_registers(ic_reg, receiver, rscratch1, rscratch2);
2154   __ verify_oop(receiver);
2155   __ load_klass(rscratch1, receiver, rscratch2);
2156   __ cmpq(ic_reg, rscratch1);
2157   __ jcc(Assembler::equal, hit);
2158 
2159   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
2160 
2161   // Verified entry point must be aligned
2162   __ align(8);
2163 
2164   __ bind(hit);
2165 
2166   int vep_offset = ((intptr_t)__ pc()) - start;
2167 
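  // Fast class initialization check: if the holder class still needs initialization,
  // take the slow path through the handle_wrong_method stub; otherwise skip the
  // barrier entirely.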
2168   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2169     Label L_skip_barrier;
2170     Register klass = r10;
2171     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2172     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2173 
2174     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2175 
2176     __ bind(L_skip_barrier);
2177   }
2178 
2179 #ifdef COMPILER1
2180   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2181   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2182     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2183   }
2184 #endif // COMPILER1
2185 
2186   // The instruction at the verified entry point must be 5 bytes or longer
2187   // because it can be patched on the fly by make_non_entrant. The stack bang
2188   // instruction fits that requirement.
2189 
2190   // Generate stack overflow check
2191   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2192 
2193   // Generate a new frame for the wrapper.
2194   __ enter();
2195   // -2 because return address is already present and so is saved rbp
2196   __ subptr(rsp, stack_size - 2*wordSize);
2197 
2198   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2199   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2200   bs->nmethod_entry_barrier(masm, NULL /* slow_path */, NULL /* continuation */);
2201 
2202   // Frame is now completed as far as size and linkage.
2203   int frame_complete = ((intptr_t)__ pc()) - start;
2204 
  if (UseRTMLocking) {
    // Abort RTM transaction before calling JNI
    // because critical section will be large and will be
    // aborted anyway. Also nmethod could be deoptimized.
    __ xabort(0);
  }
2211 
2212 #ifdef ASSERT
2213   __ check_stack_alignment(rsp, "improperly aligned stack");
2214 #endif /* ASSERT */
2215 
2216 
2217   // We use r14 as the oop handle for the receiver/klass
2218   // It is callee save so it survives the call to native
2219 
2220   const Register oop_handle_reg = r14;
2221 
2222   //
  // We immediately shuffle the arguments so that for any VM call we have to
  // make from here on out (sync slow path, jvmti, etc.) we will have
  // captured the oops from our caller and have a valid oopMap for
  // them.
2227 
2228   // -----------------
2229   // The Grand Shuffle
2230 
  // The Java calling convention is either equal (linux) or denser (win64) than the
  // C calling convention. However, because of the jni_env argument the C calling
  // convention always has at least one more argument (and two for static methods) than Java.
  // Therefore, if we move the args from Java -> C backwards then we will never have
  // a register->register conflict and we don't have to build a dependency graph
  // and figure out how to break any cycles.
2237   //
2238 
2239   // Record esp-based slot for receiver on stack for non-static methods
2240   int receiver_offset = -1;
2241 
  // This is a trick. We double the stack slots so we can claim
  // the oops in the caller's frame. Since we are sure to have
  // more args than the caller, doubling is enough to make
  // sure we can capture all the incoming oop args from the
  // caller.
2247   //
2248   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2249 
2250   // Mark location of rbp (someday)
2251   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2252 
2253   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2254   // All inbound args are referenced based on rbp and all outbound args via rsp.
2255 
2256 
2257 #ifdef ASSERT
2258   bool reg_destroyed[Register::number_of_registers];
2259   bool freg_destroyed[XMMRegister::number_of_registers];
2260   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2261     reg_destroyed[r] = false;
2262   }
2263   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2264     freg_destroyed[f] = false;
2265   }
2266 
2267 #endif /* ASSERT */
2268 
2269   // For JNI natives the incoming and outgoing registers are offset upwards.
2270   GrowableArray<int> arg_order(2 * total_in_args);
2271 
2272   VMRegPair tmp_vmreg;
2273   tmp_vmreg.set2(rbx->as_VMReg());
2274 
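  // Record (java_index, c_index) pairs in reverse order so that the move loop below
  // processes the last argument first and never overwrites an argument it still needs.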
2275   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2276     arg_order.push(i);
2277     arg_order.push(c_arg);
2278   }
2279 
2280   int temploc = -1;
2281   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2282     int i = arg_order.at(ai);
2283     int c_arg = arg_order.at(ai + 1);
2284     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2285 #ifdef ASSERT
2286     if (in_regs[i].first()->is_Register()) {
2287       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2288     } else if (in_regs[i].first()->is_XMMRegister()) {
2289       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2290     }
2291     if (out_regs[c_arg].first()->is_Register()) {
2292       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2293     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2294       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2295     }
2296 #endif /* ASSERT */
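    // Move one Java argument into its C position; object_move also handlizes oops and
    // records their stack locations in the oopMap.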
2297     switch (in_sig_bt[i]) {
2298       case T_ARRAY:
2299       case T_PRIMITIVE_OBJECT:
2300       case T_OBJECT:
2301         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2302                     ((i == 0) && (!is_static)),
2303                     &receiver_offset);
2304         break;
2305       case T_VOID:
2306         break;
2307 
2308       case T_FLOAT:
2309         __ float_move(in_regs[i], out_regs[c_arg]);
2310           break;
2311 
2312       case T_DOUBLE:
2313         assert( i + 1 < total_in_args &&
2314                 in_sig_bt[i + 1] == T_VOID &&
2315                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2316         __ double_move(in_regs[i], out_regs[c_arg]);
2317         break;
2318 
2319       case T_LONG :
2320         __ long_move(in_regs[i], out_regs[c_arg]);
2321         break;
2322 
2323       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2324 
2325       default:
2326         __ move32_64(in_regs[i], out_regs[c_arg]);
2327     }
2328   }
2329 
2330   int c_arg;
2331 
2332   // Pre-load a static method's oop into r14.  Used both by locking code and
2333   // the normal JNI call code.
2334   // point c_arg at the first arg that is already loaded in case we
2335   // need to spill before we call out
2336   c_arg = total_c_args - total_in_args;
2337 
2338   if (method->is_static()) {
2339 
2340     //  load oop into a register
2341     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2342 
    // Now handlize the static class mirror; it's known to be non-null.
2344     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2345     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2346 
2347     // Now get the handle
2348     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2349     // store the klass handle as second argument
2350     __ movptr(c_rarg1, oop_handle_reg);
2351     // and protect the arg if we must spill
2352     c_arg--;
2353   }
2354 
2355   // Change state to native (we save the return address in the thread, since it might not
2356   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2357   // points into the right code segment. It does not have to be the correct return pc.
2358   // We use the same pc/oopMap repeatedly when we call out
2359 
2360   intptr_t the_pc = (intptr_t) __ pc();
2361   oop_maps->add_gc_map(the_pc - start, map);
2362 
2363   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2364 
2365 
  // We have all of the arguments set up at this point. We must not touch any of the
  // argument registers from here on (if we were to save/restore them here, any oops
  // they contain would not be covered by the oopMap).
2368 
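  // DTrace method entry probe (guarded by DTraceMethodProbes); the outgoing arguments
  // are saved and restored around the VM leaf call.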
2369   {
2370     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2371     // protect the args we've loaded
2372     save_args(masm, total_c_args, c_arg, out_regs);
2373     __ mov_metadata(c_rarg1, method());
2374     __ call_VM_leaf(
2375       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2376       r15_thread, c_rarg1);
2377     restore_args(masm, total_c_args, c_arg, out_regs);
2378   }
2379 
2380   // RedefineClasses() tracing support for obsolete method entry
2381   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2382     // protect the args we've loaded
2383     save_args(masm, total_c_args, c_arg, out_regs);
2384     __ mov_metadata(c_rarg1, method());
2385     __ call_VM_leaf(
2386       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2387       r15_thread, c_rarg1);
2388     restore_args(masm, total_c_args, c_arg, out_regs);
2389   }
2390 
2391   // Lock a synchronized method
2392 
2393   // Register definitions used by locking and unlocking
2394 
2395   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2396   const Register obj_reg  = rbx;  // Will contain the oop
2397   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2398   const Register old_hdr  = r13;  // value of old header at unlock time
2399 
2400   Label slow_path_lock;
2401   Label lock_done;
2402 
2403   if (method->is_synchronized()) {
2404     Label count_mon;
2405 
2406     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2407 
2408     // Get the handle (the 2nd argument)
2409     __ mov(oop_handle_reg, c_rarg1);
2410 
2411     // Get address of the box
2412 
2413     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2414 
2415     // Load the oop from the handle
2416     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2417 
2418     if (!UseHeavyMonitors) {
2419 
2420       // Load immediate 1 into swap_reg %rax
2421       __ movl(swap_reg, 1);
2422 
2423       // Load (object->mark() | 1) into swap_reg %rax
2424       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2425       if (EnableValhalla) {
2426         // Mask inline_type bit such that we go to the slow path if object is an inline type
2427         __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place));
2428       }
2429 
2430       // Save (object->mark() | 1) into BasicLock's displaced header
2431       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2432 
2433       // src -> dest iff dest == rax else rax <- dest
2434       __ lock();
2435       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2436       __ jcc(Assembler::equal, count_mon);
2437 
2438       // Hmm should this move to the slow path code area???
2439 
2440       // Test if the oopMark is an obvious stack pointer, i.e.,
2441       //  1) (mark & 3) == 0, and
2442       //  2) rsp <= mark < mark + os::pagesize()
2443       // These 3 tests can be done by evaluating the following
2444       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2445       // assuming both stack pointer and pagesize have their
2446       // least significant 2 bits clear.
2447       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2448 
2449       __ subptr(swap_reg, rsp);
2450       __ andptr(swap_reg, 3 - os::vm_page_size());
2451 
2452       // Save the test result, for recursive case, the result is zero
2453       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2454       __ jcc(Assembler::notEqual, slow_path_lock);
2455     } else {
2456       __ jmp(slow_path_lock);
2457     }
2458     __ bind(count_mon);
2459     __ inc_held_monitor_count();
2460 
2461     // Slow path will re-enter here
2462     __ bind(lock_done);
2463   }
2464 
2465   // Finally just about ready to make the JNI call
2466 
2467   // get JNIEnv* which is first argument to native
2468   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2469 
2470   // Now set thread in native
2471   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2472 
2473   __ call(RuntimeAddress(native_func));
2474 
2475   // Verify or restore cpu control state after JNI call
2476   __ restore_cpu_control_state_after_jni(rscratch1);
2477 
2478   // Unpack native results.
2479   switch (ret_type) {
2480   case T_BOOLEAN: __ c2bool(rax);            break;
2481   case T_CHAR   : __ movzwl(rax, rax);      break;
2482   case T_BYTE   : __ sign_extend_byte (rax); break;
2483   case T_SHORT  : __ sign_extend_short(rax); break;
2484   case T_INT    : /* nothing to do */        break;
2485   case T_DOUBLE :
2486   case T_FLOAT  :
    // Result is in xmm0; we'll save it as needed
2488     break;
2489   case T_ARRAY:                 // Really a handle
2490   case T_PRIMITIVE_OBJECT:           // Really a handle
2491   case T_OBJECT:                // Really a handle
2492       break; // can't de-handlize until after safepoint check
2493   case T_VOID: break;
2494   case T_LONG: break;
2495   default       : ShouldNotReachHere();
2496   }
2497 
2498   Label after_transition;
2499 
2500   // Switch thread to "native transition" state before reading the synchronization state.
2501   // This additional state is necessary because reading and testing the synchronization
2502   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2503   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2504   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2505   //     Thread A is resumed to finish this native method, but doesn't block here since it
  //     didn't see any synchronization in progress, and escapes.
2507   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2508 
2509   // Force this write out before the read below
2510   if (!UseSystemMemoryBarrier) {
2511     __ membar(Assembler::Membar_mask_bits(
2512               Assembler::LoadLoad | Assembler::LoadStore |
2513               Assembler::StoreLoad | Assembler::StoreStore));
2514   }
2515 
2516   // check for safepoint operation in progress and/or pending suspend requests
2517   {
2518     Label Continue;
2519     Label slow_path;
2520 
2521     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2522 
2523     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2524     __ jcc(Assembler::equal, Continue);
2525     __ bind(slow_path);
2526 
    // Don't use call_VM, as it will see a possible pending exception and forward it,
    // never returning here and preventing us from clearing _last_native_pc down below.
    // We also can't use call_VM_leaf, as it will check to see if rsi & rdi are
2530     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2531     // by hand.
2532     //
2533     __ vzeroupper();
2534     save_native_result(masm, ret_type, stack_slots);
2535     __ mov(c_rarg0, r15_thread);
2536     __ mov(r12, rsp); // remember sp
2537     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2538     __ andptr(rsp, -16); // align stack as required by ABI
2539     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2540     __ mov(rsp, r12); // restore sp
2541     __ reinit_heapbase();
2542     // Restore any method result value
2543     restore_native_result(masm, ret_type, stack_slots);
2544     __ bind(Continue);
2545   }
2546 
2547   // change thread state
2548   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2549   __ bind(after_transition);
2550 
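  // If the stack guard yellow zone was disabled while we were in native code,
  // re-enable it via the out-of-line reguard path.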
2551   Label reguard;
2552   Label reguard_done;
2553   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2554   __ jcc(Assembler::equal, reguard);
2555   __ bind(reguard_done);
2556 
  // The native result, if any, is live here
2558 
2559   // Unlock
2560   Label slow_path_unlock;
2561   Label unlock_done;
2562   if (method->is_synchronized()) {
2563 
2564     Label fast_done;
2565 
2566     // Get locked oop from the handle we passed to jni
2567     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2568 
2569     if (!UseHeavyMonitors) {
2570       Label not_recur;
2571       // Simple recursive lock?
2572       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2573       __ jcc(Assembler::notEqual, not_recur);
2574       __ dec_held_monitor_count();
2575       __ jmpb(fast_done);
2576       __ bind(not_recur);
2577     }
2578 
2579     // Must save rax if it is live now because cmpxchg must use it
2580     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2581       save_native_result(masm, ret_type, stack_slots);
2582     }
2583 
2584     if (!UseHeavyMonitors) {
2585       // get address of the stack lock
2586       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2587       //  get old displaced header
2588       __ movptr(old_hdr, Address(rax, 0));
2589 
2590       // Atomic swap old header if oop still contains the stack lock
2591       __ lock();
2592       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2593       __ jcc(Assembler::notEqual, slow_path_unlock);
2594       __ dec_held_monitor_count();
2595     } else {
2596       __ jmp(slow_path_unlock);
2597     }
2598 
2599     // slow path re-enters here
2600     __ bind(unlock_done);
2601     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2602       restore_native_result(masm, ret_type, stack_slots);
2603     }
2604 
2605     __ bind(fast_done);
2606   }
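  // DTrace method exit probe; the native result is saved and restored around the call.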
2607   {
2608     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2609     save_native_result(masm, ret_type, stack_slots);
2610     __ mov_metadata(c_rarg1, method());
2611     __ call_VM_leaf(
2612          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2613          r15_thread, c_rarg1);
2614     restore_native_result(masm, ret_type, stack_slots);
2615   }
2616 
2617   __ reset_last_Java_frame(false);
2618 
2619   // Unbox oop result, e.g. JNIHandles::resolve value.
2620   if (is_reference_type(ret_type)) {
2621     __ resolve_jobject(rax /* value */,
2622                        r15_thread /* thread */,
2623                        rcx /* tmp */);
2624   }
2625 
2626   if (CheckJNICalls) {
2627     // clear_pending_jni_exception_check
2628     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2629   }
2630 
2631   // reset handle block
2632   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2633   __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), NULL_WORD);
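  // Clearing the top offset effectively frees all JNI handles allocated in
  // this block during the native call.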
2634 
2635   // pop our frame
2636 
2637   __ leave();
2638 
2639   // Any exception pending?
2640   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2641   __ jcc(Assembler::notEqual, exception_pending);
2642 
2643   // Return
2644 
2645   __ ret(0);
2646 
2647   // Unexpected paths are out of line and go here
2648 
  // Forward the exception
  __ bind(exception_pending);

2653   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2654 
2655   // Slow path locking & unlocking
2656   if (method->is_synchronized()) {
2657 
2658     // BEGIN Slow path lock
2659     __ bind(slow_path_lock);
2660 
    // We have last_Java_frame set up. No exceptions are pending, so do a vanilla call, not call_VM
2662     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2663 
2664     // protect the args we've loaded
2665     save_args(masm, total_c_args, c_arg, out_regs);
2666 
2667     __ mov(c_rarg0, obj_reg);
2668     __ mov(c_rarg1, lock_reg);
2669     __ mov(c_rarg2, r15_thread);
2670 
    // Not a leaf call, but we have last_Java_frame set up as we want it
2672     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2673     restore_args(masm, total_c_args, c_arg, out_regs);
2674 
2675 #ifdef ASSERT
2676     { Label L;
2677     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2678     __ jcc(Assembler::equal, L);
2679     __ stop("no pending exception allowed on exit from monitorenter");
2680     __ bind(L);
2681     }
2682 #endif
2683     __ jmp(lock_done);
2684 
2685     // END Slow path lock
2686 
2687     // BEGIN Slow path unlock
2688     __ bind(slow_path_unlock);
2689 
2690     // If we haven't already saved the native result we must save it now as xmm registers
2691     // are still exposed.
2692     __ vzeroupper();
    if (ret_type == T_FLOAT || ret_type == T_DOUBLE) {
2694       save_native_result(masm, ret_type, stack_slots);
2695     }
2696 
2697     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2698 
2699     __ mov(c_rarg0, obj_reg);
2700     __ mov(c_rarg2, r15_thread);
2701     __ mov(r12, rsp); // remember sp
2702     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2703     __ andptr(rsp, -16); // align stack as required by ABI
2704 
2705     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2706     // NOTE that obj_reg == rbx currently
2707     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2708     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2709 
2710     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2711     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2712     __ mov(rsp, r12); // restore sp
2713     __ reinit_heapbase();
2714 #ifdef ASSERT
2715     {
2716       Label L;
2717       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2718       __ jcc(Assembler::equal, L);
2719       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2720       __ bind(L);
2721     }
2722 #endif /* ASSERT */
2723 
2724     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2725 
    if (ret_type == T_FLOAT || ret_type == T_DOUBLE) {
2727       restore_native_result(masm, ret_type, stack_slots);
2728     }
2729     __ jmp(unlock_done);
2730 
2731     // END Slow path unlock
2732 
2733   } // synchronized
2734 
2735   // SLOW PATH Reguard the stack if needed
2736 
2737   __ bind(reguard);
2738   __ vzeroupper();
2739   save_native_result(masm, ret_type, stack_slots);
2740   __ mov(r12, rsp); // remember sp
2741   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2742   __ andptr(rsp, -16); // align stack as required by ABI
2743   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2744   __ mov(rsp, r12); // restore sp
2745   __ reinit_heapbase();
2746   restore_native_result(masm, ret_type, stack_slots);
2747   // and continue
2748   __ jmp(reguard_done);
2749 
2750 
2751 
2752   __ flush();
2753 
2754   nmethod *nm = nmethod::new_native_nmethod(method,
2755                                             compile_id,
2756                                             masm->code(),
2757                                             vep_offset,
2758                                             frame_complete,
2759                                             stack_slots / VMRegImpl::slots_per_word,
2760                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2761                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2762                                             oop_maps);
2763 
2764   return nm;
2765 }
2766 
// This function returns the adjustment size (in number of words) to a c2i adapter
// activation for use during deoptimization
2769 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2770   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2771 }
2772 
2773 
2774 uint SharedRuntime::out_preserve_stack_slots() {
2775   return 0;
2776 }
2777 
2778 
2779 // Number of stack slots between incoming argument block and the start of
2780 // a new frame.  The PROLOG must add this many slots to the stack.  The
2781 // EPILOG must remove this many slots.  amd64 needs two slots for
2782 // return address.
2783 uint SharedRuntime::in_preserve_stack_slots() {
2784   return 4 + 2 * VerifyStackAtCalls;
2785 }
2786 
2787 //------------------------------generate_deopt_blob----------------------------
2788 void SharedRuntime::generate_deopt_blob() {
2789   // Allocate space for the code
2790   ResourceMark rm;
2791   // Setup code generation tools
2792   int pad = 0;
2793   if (UseAVX > 2) {
2794     pad += 1024;
2795   }
2796 #if INCLUDE_JVMCI
2797   if (EnableJVMCI) {
2798     pad += 512; // Increase the buffer size when compiling for JVMCI
2799   }
2800 #endif
2801   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2802   MacroAssembler* masm = new MacroAssembler(&buffer);
2803   int frame_size_in_words;
2804   OopMap* map = NULL;
2805   OopMapSet *oop_maps = new OopMapSet();
2806 
2807   // -------------
2808   // This code enters when returning to a de-optimized nmethod.  A return
2809   // address has been pushed on the stack, and return values are in
2810   // registers.
2811   // If we are doing a normal deopt then we were called from the patched
2812   // nmethod from the point we returned to the nmethod. So the return
  // address on the stack is wrong by NativeCall::instruction_size.
2814   // We will adjust the value so it looks like we have the original return
2815   // address on the stack (like when we eagerly deoptimized).
2816   // In the case of an exception pending when deoptimizing, we enter
2817   // with a return address on the stack that points after the call we patched
2818   // into the exception handler. We have the following register state from,
2819   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2820   //    rax: exception oop
2821   //    rbx: exception handler
2822   //    rdx: throwing pc
2823   // So in this case we simply jam rdx into the useless return address and
2824   // the stack looks just like we want.
2825   //
2826   // At this point we need to de-opt.  We save the argument return
2827   // registers.  We call the first C routine, fetch_unroll_info().  This
2828   // routine captures the return values and returns a structure which
2829   // describes the current frame size and the sizes of all replacement frames.
2830   // The current frame is compiled code and may contain many inlined
2831   // functions, each with their own JVM state.  We pop the current frame, then
2832   // push all the new frames.  Then we call the C routine unpack_frames() to
2833   // populate these frames.  Finally unpack_frames() returns us the new target
2834   // address.  Notice that callee-save registers are BLOWN here; they have
2835   // already been captured in the vframeArray at the time the return PC was
2836   // patched.
2837   address start = __ pc();
2838   Label cont;
2839 
  // Prolog for the non-exception case!
2841 
2842   // Save everything in sight.
2843   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2844 
2845   // Normal deoptimization.  Save exec mode for unpack_frames.
2846   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2847   __ jmp(cont);
2848 
2849   int reexecute_offset = __ pc() - start;
2850 #if INCLUDE_JVMCI && !defined(COMPILER1)
2851   if (EnableJVMCI && UseJVMCICompiler) {
2852     // JVMCI does not use this kind of deoptimization
2853     __ should_not_reach_here();
2854   }
2855 #endif
2856 
2857   // Reexecute case
  // The return address is the pc that describes which bci to re-execute at
2859 
2860   // No need to update map as each call to save_live_registers will produce identical oopmap
2861   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2862 
2863   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2864   __ jmp(cont);
2865 
2866 #if INCLUDE_JVMCI
2867   Label after_fetch_unroll_info_call;
2868   int implicit_exception_uncommon_trap_offset = 0;
2869   int uncommon_trap_offset = 0;
2870 
2871   if (EnableJVMCI) {
2872     implicit_exception_uncommon_trap_offset = __ pc() - start;
2873 
2874     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2875     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2876 
2877     uncommon_trap_offset = __ pc() - start;
2878 
2879     // Save everything in sight.
2880     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2881     // fetch_unroll_info needs to call last_java_frame()
2882     __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
2883 
2884     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2885     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2886 
2887     __ movl(r14, Deoptimization::Unpack_reexecute);
2888     __ mov(c_rarg0, r15_thread);
2889     __ movl(c_rarg2, r14); // exec mode
2890     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2891     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2892 
2893     __ reset_last_Java_frame(false);
2894 
2895     __ jmp(after_fetch_unroll_info_call);
2896   } // EnableJVMCI
2897 #endif // INCLUDE_JVMCI
2898 
2899   int exception_offset = __ pc() - start;
2900 
2901   // Prolog for exception case
2902 
  // All registers are dead at this entry point, except for rax and
  // rdx, which contain the exception oop and exception pc
  // respectively.  Set them in TLS and fall thru to the
2906   // unpack_with_exception_in_tls entry point.
2907 
2908   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2909   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2910 
2911   int exception_in_tls_offset = __ pc() - start;
2912 
2913   // new implementation because exception oop is now passed in JavaThread
2914 
2915   // Prolog for exception case
2916   // All registers must be preserved because they might be used by LinearScan
  // Exception oop and throwing PC are passed in JavaThread
2918   // tos: stack at point of call to method that threw the exception (i.e. only
2919   // args are on the stack, no return address)
2920 
2921   // make room on stack for the return address
2922   // It will be patched later with the throwing pc. The correct value is not
2923   // available now because loading it from memory would destroy registers.
2924   __ push(0);
2925 
2926   // Save everything in sight.
2927   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2928 
2929   // Now it is safe to overwrite any register
2930 
2931   // Deopt during an exception.  Save exec mode for unpack_frames.
2932   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2933 
2934   // load throwing pc from JavaThread and patch it as the return address
2935   // of the current frame. Then clear the field in JavaThread
2936 
2937   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2938   __ movptr(Address(rbp, wordSize), rdx);
2939   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2940 
2941 #ifdef ASSERT
2942   // verify that there is really an exception oop in JavaThread
2943   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2944   __ verify_oop(rax);
2945 
2946   // verify that there is no pending exception
2947   Label no_pending_exception;
2948   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2949   __ testptr(rax, rax);
2950   __ jcc(Assembler::zero, no_pending_exception);
2951   __ stop("must not have pending exception here");
2952   __ bind(no_pending_exception);
2953 #endif
2954 
2955   __ bind(cont);
2956 
2957   // Call C code.  Need thread and this frame, but NOT official VM entry
2958   // crud.  We cannot block on this call, no GC can happen.
2959   //
2960   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2961 
2962   // fetch_unroll_info needs to call last_java_frame().
2963 
2964   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
2965 #ifdef ASSERT
2966   { Label L;
2967     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2968     __ jcc(Assembler::equal, L);
2969     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2970     __ bind(L);
2971   }
2972 #endif // ASSERT
2973   __ mov(c_rarg0, r15_thread);
2974   __ movl(c_rarg1, r14); // exec_mode
2975   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2976 
2977   // Need to have an oopmap that tells fetch_unroll_info where to
2978   // find any register it might need.
2979   oop_maps->add_gc_map(__ pc() - start, map);
2980 
2981   __ reset_last_Java_frame(false);
2982 
2983 #if INCLUDE_JVMCI
2984   if (EnableJVMCI) {
2985     __ bind(after_fetch_unroll_info_call);
2986   }
2987 #endif
2988 
2989   // Load UnrollBlock* into rdi
2990   __ mov(rdi, rax);
2991 
  // Reload the exec mode into r14 from the UnrollBlock; the runtime records
  // the authoritative unpack kind there.
  __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
  Label noException;
2994   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2995   __ jcc(Assembler::notEqual, noException);
2996   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
  // QQQ this is useless; it was set to NULL above
2998   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2999   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3000   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3001 
3002   __ verify_oop(rax);
3003 
3004   // Overwrite the result registers with the exception results.
3005   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3006   // I think this is useless
3007   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
3008 
3009   __ bind(noException);
3010 
3011   // Only register save data is on the stack.
3012   // Now restore the result registers.  Everything else is either dead
3013   // or captured in the vframeArray.
3014   RegisterSaver::restore_result_registers(masm);
3015 
  // All of the register save area has been popped off the stack. Only the
3017   // return address remains.
3018 
3019   // Pop all the frames we must move/replace.
3020   //
3021   // Frame picture (youngest to oldest)
3022   // 1: self-frame (no frame link)
3023   // 2: deopting frame  (no frame link)
3024   // 3: caller of deopting frame (could be compiled/interpreted).
3025   //
  // Note: by leaving the return address of the self-frame on the stack
  // and using the size of frame 2 to adjust the stack,
  // when we are done the return address to frame 3 will still be on the stack.
3029 
3030   // Pop deoptimized frame
3031   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
3032   __ addptr(rsp, rcx);
3033 
3034   // rsp should be pointing at the return address to the caller (3)
3035 
3036   // Pick up the initial fp we should save
3037   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3038   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
3039 
3040 #ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
3044   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
3045   __ bang_stack_size(rbx, rcx);
3046 #endif
3047 
3048   // Load address of array of frame pcs into rcx
3049   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
3050 
3051   // Trash the old pc
3052   __ addptr(rsp, wordSize);
3053 
3054   // Load address of array of frame sizes into rsi
3055   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
3056 
3057   // Load counter into rdx
3058   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
3059 
3060   // Now adjust the caller's stack to make up for the extra locals
3061   // but record the original sp so that we can save it in the skeletal interpreter
3062   // frame and the stack walking of interpreter_sender will get the unextended sp
3063   // value and not the "real" sp value.
3064 
3065   const Register sender_sp = r8;
3066 
3067   __ mov(sender_sp, rsp);
3068   __ movl(rbx, Address(rdi,
3069                        Deoptimization::UnrollBlock::
3070                        caller_adjustment_offset_in_bytes()));
3071   __ subptr(rsp, rbx);
3072 
3073   // Push interpreter frames in a loop
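  // Each iteration builds one skeletal interpreter frame: push the saved
  // return pc, save the caller's rbp (enter), then reserve the frame body.
  // The skeletal frames are filled in later when unpack_frames() lays out
  // the interpreter activations.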
3074   Label loop;
3075   __ bind(loop);
3076   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3077   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3078   __ pushptr(Address(rcx, 0));          // Save return address
3079   __ enter();                           // Save old & set new ebp
3080   __ subptr(rsp, rbx);                  // Prolog
3081   // This value is corrected by layout_activation_impl
3082   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3083   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3084   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3085   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3086   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3087   __ decrementl(rdx);                   // Decrement counter
3088   __ jcc(Assembler::notZero, loop);
3089   __ pushptr(Address(rcx, 0));          // Save final return address
3090 
3091   // Re-push self-frame
3092   __ enter();                           // Save old & set new ebp
3093 
3094   // Allocate a full sized register save area.
3095   // Return address and rbp are in place, so we allocate two less words.
3096   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3097 
3098   // Restore frame locals after moving the frame
3099   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3100   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3101 
3102   // Call C code.  Need thread but NOT official VM entry
3103   // crud.  We cannot block on this call, no GC can happen.  Call should
3104   // restore return values to their stack-slots with the new SP.
3105   //
3106   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3107 
3108   // Use rbp because the frames look interpreted now
3109   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3110   // Don't need the precise return PC here, just precise enough to point into this code blob.
3111   address the_pc = __ pc();
3112   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3113 
3114   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3115   __ mov(c_rarg0, r15_thread);
3116   __ movl(c_rarg1, r14); // second arg: exec_mode
3117   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3118   // Revert SP alignment after call since we're going to do some SP relative addressing below
3119   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3120 
3121   // Set an oopmap for the call site
3122   // Use the same PC we used for the last java frame
3123   oop_maps->add_gc_map(the_pc - start,
3124                        new OopMap( frame_size_in_words, 0 ));
3125 
3126   // Clear fp AND pc
3127   __ reset_last_Java_frame(true);
3128 
3129   // Collect return values
3130   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3131   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3132   // I think this is useless (throwing pc?)
3133   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3134 
3135   // Pop self-frame.
3136   __ leave();                           // Epilog
3137 
3138   // Jump to interpreter
3139   __ ret(0);
3140 
3141   // Make sure all code is generated
3142   masm->flush();
3143 
3144   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3145   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3146 #if INCLUDE_JVMCI
3147   if (EnableJVMCI) {
3148     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3149     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3150   }
3151 #endif
3152 }
3153 
3154 #ifdef COMPILER2
3155 //------------------------------generate_uncommon_trap_blob--------------------
3156 void SharedRuntime::generate_uncommon_trap_blob() {
3157   // Allocate space for the code
3158   ResourceMark rm;
3159   // Setup code generation tools
3160   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
3161   MacroAssembler* masm = new MacroAssembler(&buffer);
3162 
3163   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3164 
3165   address start = __ pc();
3166 
3167   if (UseRTMLocking) {
3168     // Abort RTM transaction before possible nmethod deoptimization.
3169     __ xabort(0);
3170   }
3171 
3172   // Push self-frame.  We get here with a return address on the
3173   // stack, so rsp is 8-byte aligned until we allocate our frame.
  __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
3175 
3176   // No callee saved registers. rbp is assumed implicitly saved
3177   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3178 
  // The compiler left unloaded_class_index in j_rarg0; move it to where the
  // runtime expects it.
3181   __ movl(c_rarg1, j_rarg0);
3182 
3183   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
3184 
3185   // Call C code.  Need thread but NOT official VM entry
3186   // crud.  We cannot block on this call, no GC can happen.  Call should
3187   // capture callee-saved registers as well as return values.
3188   // Thread is in rdi already.
3189   //
3190   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
3191 
3192   __ mov(c_rarg0, r15_thread);
3193   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
3194   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3195 
3196   // Set an oopmap for the call site
3197   OopMapSet* oop_maps = new OopMapSet();
3198   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
3199 
3200   // location of rbp is known implicitly by the frame sender code
3201 
3202   oop_maps->add_gc_map(__ pc() - start, map);
3203 
3204   __ reset_last_Java_frame(false);
3205 
3206   // Load UnrollBlock* into rdi
3207   __ mov(rdi, rax);
3208 
3209 #ifdef ASSERT
3210   { Label L;
3211     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
3212               Deoptimization::Unpack_uncommon_trap);
3213     __ jcc(Assembler::equal, L);
3214     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
3215     __ bind(L);
3216   }
3217 #endif
3218 
3219   // Pop all the frames we must move/replace.
3220   //
3221   // Frame picture (youngest to oldest)
3222   // 1: self-frame (no frame link)
3223   // 2: deopting frame  (no frame link)
3224   // 3: caller of deopting frame (could be compiled/interpreted).
3225 
3226   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3227   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3228 
3229   // Pop deoptimized frame (int)
3230   __ movl(rcx, Address(rdi,
3231                        Deoptimization::UnrollBlock::
3232                        size_of_deoptimized_frame_offset_in_bytes()));
3233   __ addptr(rsp, rcx);
3234 
3235   // rsp should be pointing at the return address to the caller (3)
3236 
3237   // Pick up the initial fp we should save
3238   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3239   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
3240 
3241 #ifdef ASSERT
  // Compilers generate code that bangs the stack by as much as the
  // interpreter would need. So this stack banging should never
  // trigger a fault. Verify that it does not on non-product builds.
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
3246   __ bang_stack_size(rbx, rcx);
3247 #endif
3248 
3249   // Load address of array of frame pcs into rcx (address*)
3250   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
3251 
3252   // Trash the return pc
3253   __ addptr(rsp, wordSize);
3254 
3255   // Load address of array of frame sizes into rsi (intptr_t*)
  __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
3257 
3258   // Counter
  __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
3260 
3261   // Now adjust the caller's stack to make up for the extra locals but
3262   // record the original sp so that we can save it in the skeletal
3263   // interpreter frame and the stack walking of interpreter_sender
3264   // will get the unextended sp value and not the "real" sp value.
3265 
3266   const Register sender_sp = r8;
3267 
3268   __ mov(sender_sp, rsp);
  __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
3270   __ subptr(rsp, rbx);
3271 
3272   // Push interpreter frames in a loop
3273   Label loop;
3274   __ bind(loop);
3275   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3276   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3277   __ pushptr(Address(rcx, 0));     // Save return address
3278   __ enter();                      // Save old & set new rbp
3279   __ subptr(rsp, rbx);             // Prolog
3280   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3281             sender_sp);            // Make it walkable
3282   // This value is corrected by layout_activation_impl
3283   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3284   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3285   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3286   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3287   __ decrementl(rdx);              // Decrement counter
3288   __ jcc(Assembler::notZero, loop);
3289   __ pushptr(Address(rcx, 0));     // Save final return address
3290 
3291   // Re-push self-frame
3292   __ enter();                 // Save old & set new rbp
3293   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3294                               // Prolog
3295 
3296   // Use rbp because the frames look interpreted now
3297   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3298   // Don't need the precise return PC here, just precise enough to point into this code blob.
3299   address the_pc = __ pc();
3300   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3301 
3302   // Call C code.  Need thread but NOT official VM entry
3303   // crud.  We cannot block on this call, no GC can happen.  Call should
3304   // restore return values to their stack-slots with the new SP.
3305   // Thread is in rdi already.
3306   //
3307   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3308 
3309   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3310   __ mov(c_rarg0, r15_thread);
3311   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3312   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3313 
3314   // Set an oopmap for the call site
3315   // Use the same PC we used for the last java frame
3316   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3317 
3318   // Clear fp AND pc
3319   __ reset_last_Java_frame(true);
3320 
3321   // Pop self-frame.
3322   __ leave();                 // Epilog
3323 
3324   // Jump to interpreter
3325   __ ret(0);
3326 
3327   // Make sure all code is generated
3328   masm->flush();
3329 
3330   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3331                                                  SimpleRuntimeFrame::framesize >> 1);
3332 }
3333 #endif // COMPILER2
3334 
3335 //------------------------------generate_handler_blob------
3336 //
3337 // Generate a special Compile2Runtime blob that saves all registers,
// and sets up an oopmap.
3339 //
3340 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3341   assert(StubRoutines::forward_exception_entry() != NULL,
3342          "must be generated before");
3343 
3344   ResourceMark rm;
3345   OopMapSet *oop_maps = new OopMapSet();
3346   OopMap* map;
3347 
3348   // Allocate space for the code.  Setup code generation tools.
3349   CodeBuffer buffer("handler_blob", 2048, 1024);
3350   MacroAssembler* masm = new MacroAssembler(&buffer);
3351 
3352   address start   = __ pc();
3353   address call_pc = NULL;
3354   int frame_size_in_words;
3355   bool cause_return = (poll_type == POLL_AT_RETURN);
3356   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3357 
3358   if (UseRTMLocking) {
3359     // Abort RTM transaction before calling runtime
3360     // because critical section will be large and will be
3361     // aborted anyway. Also nmethod could be deoptimized.
3362     __ xabort(0);
3363   }
3364 
3365   // Make room for return address (or push it again)
3366   if (!cause_return) {
3367     __ push(rbx);
3368   }
3369 
3370   // Save registers, fpu state, and flags
3371   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3372 
3373   // The following is basically a call_VM.  However, we need the precise
3374   // address of the call in order to generate an oopmap. Hence, we do all the
3375   // work ourselves.
3376 
3377   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3378 
3379   // The return address must always be correct so that frame constructor never
3380   // sees an invalid pc.
3381 
3382   if (!cause_return) {
3383     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3384     // Additionally, rbx is a callee saved register and we can look at it later to determine
3385     // if someone changed the return address for us!
3386     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3387     __ movptr(Address(rbp, wordSize), rbx);
3388   }
3389 
3390   // Do the call
3391   __ mov(c_rarg0, r15_thread);
3392   __ call(RuntimeAddress(call_ptr));
3393 
3394   // Set an oopmap for the call site.  This oopmap will map all
3395   // oop-registers and debug-info registers as callee-saved.  This
3396   // will allow deoptimization at this safepoint to find all possible
3397   // debug-info recordings, as well as let GC find all oops.
3398 
3399   oop_maps->add_gc_map( __ pc() - start, map);
3400 
3401   Label noException;
3402 
3403   __ reset_last_Java_frame(false);
3404 
3405   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3406   __ jcc(Assembler::equal, noException);
3407 
3408   // Exception pending
3409 
3410   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3411 
3412   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3413 
3414   // No exception case
3415   __ bind(noException);
3416 
3417   Label no_adjust;
3418 #ifdef ASSERT
3419   Label bail;
3420 #endif
3421   if (!cause_return) {
3422     Label no_prefix, not_special;
3423 
3424     // If our stashed return pc was modified by the runtime we avoid touching it
3425     __ cmpptr(rbx, Address(rbp, wordSize));
3426     __ jccb(Assembler::notEqual, no_adjust);
3427 
3428     // Skip over the poll instruction.
3429     // See NativeInstruction::is_safepoint_poll()
3430     // Possible encodings:
3431     //      85 00       test   %eax,(%rax)
3432     //      85 01       test   %eax,(%rcx)
3433     //      85 02       test   %eax,(%rdx)
3434     //      85 03       test   %eax,(%rbx)
3435     //      85 06       test   %eax,(%rsi)
3436     //      85 07       test   %eax,(%rdi)
3437     //
3438     //   41 85 00       test   %eax,(%r8)
3439     //   41 85 01       test   %eax,(%r9)
3440     //   41 85 02       test   %eax,(%r10)
3441     //   41 85 03       test   %eax,(%r11)
3442     //   41 85 06       test   %eax,(%r14)
3443     //   41 85 07       test   %eax,(%r15)
3444     //
3445     //      85 04 24    test   %eax,(%rsp)
3446     //   41 85 04 24    test   %eax,(%r12)
3447     //      85 45 00    test   %eax,0x0(%rbp)
3448     //   41 85 45 00    test   %eax,0x0(%r13)
3449 
3450     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3451     __ jcc(Assembler::notEqual, no_prefix);
3452     __ addptr(rbx, 1);
3453     __ bind(no_prefix);
3454 #ifdef ASSERT
3455     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3456 #endif
3457     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3458     // r12/rsp 0x04
3459     // r13/rbp 0x05
3460     __ movzbq(rcx, Address(rbx, 1));
3461     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3462     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3463     __ cmpptr(rcx, 1);
3464     __ jcc(Assembler::above, not_special);
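    // The poll instruction has an extra SIB (rsp/r12) or disp8 (rbp/r13)
    // byte, so skip one more byte.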
3465     __ addptr(rbx, 1);
3466     __ bind(not_special);
3467 #ifdef ASSERT
3468     // Verify the correct encoding of the poll we're about to skip.
3469     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3470     __ jcc(Assembler::notEqual, bail);
3471     // Mask out the modrm bits
3472     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3473     // rax encodes to 0, so if the bits are nonzero it's incorrect
3474     __ jcc(Assembler::notZero, bail);
3475 #endif
3476     // Adjust return pc forward to step over the safepoint poll instruction
3477     __ addptr(rbx, 2);
3478     __ movptr(Address(rbp, wordSize), rbx);
3479   }
3480 
3481   __ bind(no_adjust);
3482   // Normal exit, restore registers and exit.
3483   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3484   __ ret(0);
3485 
3486 #ifdef ASSERT
3487   __ bind(bail);
3488   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3489 #endif
3490 
3491   // Make sure all code is generated
3492   masm->flush();
3493 
3494   // Fill-out other meta info
3495   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3496 }
3497 
3498 //
// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3500 //
// Generate a stub that calls into the VM to find out the proper destination
// of a Java call. All the argument registers are live at this point,
// but since this is generic code we don't know what they are and the caller
// must do any gc of the args.
3505 //
3506 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3507   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3508 
3509   // allocate space for the code
3510   ResourceMark rm;
3511 
3512   CodeBuffer buffer(name, 1200, 512);
3513   MacroAssembler* masm = new MacroAssembler(&buffer);
3514 
3515   int frame_size_in_words;
3516 
3517   OopMapSet *oop_maps = new OopMapSet();
3518   OopMap* map = NULL;
3519 
3520   int start = __ offset();
3521 
3522   // No need to save vector registers since they are caller-saved anyway.
3523   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3524 
3525   int frame_complete = __ offset();
3526 
3527   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
3528 
3529   __ mov(c_rarg0, r15_thread);
3530 
3531   __ call(RuntimeAddress(destination));
3532 
3533 
3534   // Set an oopmap for the call site.
3535   // We need this not only for callee-saved registers, but also for volatile
3536   // registers that the compiler might be keeping live across a safepoint.
3537 
3538   oop_maps->add_gc_map( __ offset() - start, map);
3539 
3540   // rax contains the address we are going to jump to assuming no exception got installed
3541 
3542   // clear last_Java_sp
3543   __ reset_last_Java_frame(false);
3544   // check for pending exceptions
3545   Label pending;
3546   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3547   __ jcc(Assembler::notEqual, pending);
3548 
3549   // get the returned Method*
3550   __ get_vm_result_2(rbx, r15_thread);
3551   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3552 
3553   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
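  // Both values were stored into the register save area so that
  // restore_live_registers() reloads them: rax will hold the destination
  // address to jump to and rbx the resolved Method*.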
3554 
3555   RegisterSaver::restore_live_registers(masm);
3556 
3557   // We are back to the original state on entry and ready to go.
3558 
3559   __ jmp(rax);
3560 
3561   // Pending exception after the safepoint
3562 
3563   __ bind(pending);
3564 
3565   RegisterSaver::restore_live_registers(masm);
3566 
3567   // exception pending => remove activation and forward to exception handler
3568 
3569   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3570 
3571   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3572   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3573 
3574   // -------------
3575   // make sure all code is generated
3576   masm->flush();
3577 
  // return the blob
  // frame_size_words or bytes??
3580   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3581 }
3582 
3583 //------------------------------Montgomery multiplication------------------------
3584 //
3585 
3586 #ifndef _WINDOWS
3587 
3588 // Subtract 0:b from carry:a.  Return carry.
3589 static julong
3590 sub(julong a[], julong b[], julong carry, long len) {
3591   long long i = 0, cnt = len;
3592   julong tmp;
3593   asm volatile("clc; "
3594                "0: ; "
3595                "mov (%[b], %[i], 8), %[tmp]; "
3596                "sbb %[tmp], (%[a], %[i], 8); "
3597                "inc %[i]; dec %[cnt]; "
3598                "jne 0b; "
3599                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3600                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3601                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3602                : "memory");
3603   return tmp;
3604 }
3605 
3606 // Multiply (unsigned) Long A by Long B, accumulating the double-
3607 // length result into the accumulator formed of T0, T1, and T2.
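// B is preloaded into rax, so `mul` leaves the 128-bit product in rdx:rax;
// the add/adc sequence then folds it into T0/T1/T2 with carry propagation.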
3608 #define MACC(A, B, T0, T1, T2)                                  \
3609 do {                                                            \
3610   unsigned long hi, lo;                                         \
3611   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3612            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3613            : "r"(A), "a"(B) : "cc");                            \
3614  } while(0)
3615 
3616 // As above, but add twice the double-length result into the
3617 // accumulator.
3618 #define MACC2(A, B, T0, T1, T2)                                 \
3619 do {                                                            \
3620   unsigned long hi, lo;                                         \
3621   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3622            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3623            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3624            : "r"(A), "a"(B) : "cc");                            \
3625  } while(0)
3626 
3627 #else //_WINDOWS
3628 
// Subtract 0:b from carry:a.  Return carry.
static julong
3630 sub(julong a[], julong b[], julong carry, long len) {
3631   long i;
3632   julong tmp;
3633   unsigned char c = 1;
3634   for (i = 0; i < len; i++) {
3635     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3636     a[i] = tmp;
3637   }
3638   c = _addcarry_u64(c, carry, ~0, &tmp);
3639   return tmp;
3640 }
3641 
3642 // Multiply (unsigned) Long A by Long B, accumulating the double-
3643 // length result into the accumulator formed of T0, T1, and T2.
3644 #define MACC(A, B, T0, T1, T2)                          \
3645 do {                                                    \
3646   julong hi, lo;                            \
3647   lo = _umul128(A, B, &hi);                             \
3648   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3649   c = _addcarry_u64(c, hi, T1, &T1);                    \
3650   _addcarry_u64(c, T2, 0, &T2);                         \
3651  } while(0)
3652 
3653 // As above, but add twice the double-length result into the
3654 // accumulator.
3655 #define MACC2(A, B, T0, T1, T2)                         \
3656 do {                                                    \
3657   julong hi, lo;                            \
3658   lo = _umul128(A, B, &hi);                             \
3659   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3660   c = _addcarry_u64(c, hi, T1, &T1);                    \
3661   _addcarry_u64(c, T2, 0, &T2);                         \
3662   c = _addcarry_u64(0, lo, T0, &T0);                    \
3663   c = _addcarry_u64(c, hi, T1, &T1);                    \
3664   _addcarry_u64(c, T2, 0, &T2);                         \
3665  } while(0)
3666 
3667 #endif //_WINDOWS
3668 
3669 // Fast Montgomery multiplication.  The derivation of the algorithm is
3670 // in  A Cryptographic Library for the Motorola DSP56000,
3671 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
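//
// The operands a, b and n and the result m are arrays of 64-bit words stored
// least-significant word first, and inv is -n[0]^-1 mod 2^64 (hence the
// assert below that inv * n[0] == ULLONG_MAX).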
3672 
3673 static void NOINLINE
3674 montgomery_multiply(julong a[], julong b[], julong n[],
3675                     julong m[], julong inv, int len) {
3676   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3677   int i;
3678 
3679   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3680 
3681   for (i = 0; i < len; i++) {
3682     int j;
3683     for (j = 0; j < i; j++) {
3684       MACC(a[j], b[i-j], t0, t1, t2);
3685       MACC(m[j], n[i-j], t0, t1, t2);
3686     }
3687     MACC(a[i], b[0], t0, t1, t2);
3688     m[i] = t0 * inv;
3689     MACC(m[i], n[0], t0, t1, t2);
3690 
3691     assert(t0 == 0, "broken Montgomery multiply");
3692 
3693     t0 = t1; t1 = t2; t2 = 0;
3694   }
3695 
3696   for (i = len; i < 2*len; i++) {
3697     int j;
3698     for (j = i-len+1; j < len; j++) {
3699       MACC(a[j], b[i-j], t0, t1, t2);
3700       MACC(m[j], n[i-j], t0, t1, t2);
3701     }
3702     m[i-len] = t0;
3703     t0 = t1; t1 = t2; t2 = 0;
3704   }
3705 
3706   while (t0)
3707     t0 = sub(m, n, t0, len);
3708 }
3709 
3710 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3711 // multiplies so it should be up to 25% faster than Montgomery
3712 // multiplication.  However, its loop control is more complex and it
3713 // may actually run slower on some machines.
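// For column i the symmetric cross terms a[j]*a[i-j] are accumulated twice
// via MACC2, and the square term a[i/2]^2 is added once when i is even.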
3714 
3715 static void NOINLINE
3716 montgomery_square(julong a[], julong n[],
3717                   julong m[], julong inv, int len) {
3718   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3719   int i;
3720 
3721   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3722 
3723   for (i = 0; i < len; i++) {
3724     int j;
3725     int end = (i+1)/2;
3726     for (j = 0; j < end; j++) {
3727       MACC2(a[j], a[i-j], t0, t1, t2);
3728       MACC(m[j], n[i-j], t0, t1, t2);
3729     }
3730     if ((i & 1) == 0) {
3731       MACC(a[j], a[j], t0, t1, t2);
3732     }
3733     for (; j < i; j++) {
3734       MACC(m[j], n[i-j], t0, t1, t2);
3735     }
3736     m[i] = t0 * inv;
3737     MACC(m[i], n[0], t0, t1, t2);
3738 
3739     assert(t0 == 0, "broken Montgomery square");
3740 
3741     t0 = t1; t1 = t2; t2 = 0;
3742   }
3743 
3744   for (i = len; i < 2*len; i++) {
3745     int start = i-len+1;
3746     int end = start + (len - start)/2;
3747     int j;
3748     for (j = start; j < end; j++) {
3749       MACC2(a[j], a[i-j], t0, t1, t2);
3750       MACC(m[j], n[i-j], t0, t1, t2);
3751     }
3752     if ((i & 1) == 0) {
3753       MACC(a[j], a[j], t0, t1, t2);
3754     }
3755     for (; j < len; j++) {
3756       MACC(m[j], n[i-j], t0, t1, t2);
3757     }
3758     m[i-len] = t0;
3759     t0 = t1; t1 = t2; t2 = 0;
3760   }
3761 
3762   while (t0)
3763     t0 = sub(m, n, t0, len);
3764 }
3765 
3766 // Swap words in a longword.
3767 static julong swap(julong x) {
3768   return (x << 32) | (x >> 32);
3769 }
3770 
3771 // Copy len longwords from s to d, word-swapping as we go.  The
3772 // destination array is reversed.
3773 static void reverse_words(julong *s, julong *d, int len) {
3774   d += len;
3775   while(len-- > 0) {
3776     d--;
3777     *d = swap(*s);
3778     s++;
3779   }
3780 }
3781 
3782 // The threshold at which squaring is advantageous was determined
3783 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3784 #define MONTGOMERY_SQUARING_THRESHOLD 64
3785 
3786 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3787                                         jint len, jlong inv,
3788                                         jint *m_ints) {
3789   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3790   int longwords = len/2;
3791 
3792   // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use a total of 8k bytes of stack space here.
3795   int divisor = sizeof(julong) * 4;
3796   guarantee(longwords <= 8192 / divisor, "must be");
3797   int total_allocation = longwords * sizeof (julong) * 4;
3798   julong *scratch = (julong *)alloca(total_allocation);
3799 
3800   // Local scratch arrays
3801   julong
3802     *a = scratch + 0 * longwords,
3803     *b = scratch + 1 * longwords,
3804     *n = scratch + 2 * longwords,
3805     *m = scratch + 3 * longwords;
3806 
3807   reverse_words((julong *)a_ints, a, longwords);
3808   reverse_words((julong *)b_ints, b, longwords);
3809   reverse_words((julong *)n_ints, n, longwords);
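  // reverse_words() both reverses the word order and swaps the 32-bit halves
  // of each 64-bit word, converting the caller's most-significant-word-first
  // jint layout into the least-significant-word-first julong layout used by
  // the kernels above.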
3810 
3811   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3812 
3813   reverse_words(m, (julong *)m_ints, longwords);
3814 }
3815 
3816 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3817                                       jint len, jlong inv,
3818                                       jint *m_ints) {
3819   assert(len % 2 == 0, "array length in montgomery_square must be even");
3820   int longwords = len/2;
3821 
3822   // Make very sure we don't use so much space that the stack might
  // overflow.  512 jints corresponds to a 16384-bit integer and
  // will use a total of 6k bytes of stack space here.
3825   int divisor = sizeof(julong) * 3;
3826   guarantee(longwords <= (8192 / divisor), "must be");
3827   int total_allocation = longwords * sizeof (julong) * 3;
3828   julong *scratch = (julong *)alloca(total_allocation);
3829 
3830   // Local scratch arrays
3831   julong
3832     *a = scratch + 0 * longwords,
3833     *n = scratch + 1 * longwords,
3834     *m = scratch + 2 * longwords;
3835 
3836   reverse_words((julong *)a_ints, a, longwords);
3837   reverse_words((julong *)n_ints, n, longwords);
3838 
3839   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3840     ::montgomery_square(a, n, m, (julong)inv, longwords);
3841   } else {
3842     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3843   }
3844 
3845   reverse_words(m, (julong *)m_ints, longwords);
3846 }
3847 
3848 #ifdef COMPILER2
3849 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3850 //
3851 //------------------------------generate_exception_blob---------------------------
3852 // creates exception blob at the end
3853 // Using exception blob, this code is jumped from a compiled method.
3854 // (see emit_exception_handler in x86_64.ad file)
3855 //
3856 // Given an exception pc at a call we call into the runtime for the
3857 // handler in this method. This handler might merely restore state
3858 // (i.e. callee save registers) unwind the frame and jump to the
3859 // exception handler for the nmethod if there is no Java level handler
3860 // for the nmethod.
3861 //
3862 // This code is entered with a jmp.
3863 //
3864 // Arguments:
3865 //   rax: exception oop
3866 //   rdx: exception pc
3867 //
3868 // Results:
3869 //   rax: exception oop
3870 //   rdx: exception pc in caller or ???
3871 //   destination: exception handler of caller
3872 //
3873 // Note: the exception pc MUST be at a call (precise debug information)
3874 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3875 //
3876 
3877 void OptoRuntime::generate_exception_blob() {
3878   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3879   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3880   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3881 
3882   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3883 
3884   // Allocate space for the code
3885   ResourceMark rm;
3886   // Setup code generation tools
3887   CodeBuffer buffer("exception_blob", 2048, 1024);
3888   MacroAssembler* masm = new MacroAssembler(&buffer);
3889 
3890 
3891   address start = __ pc();
3892 
3893   // Exception pc is 'return address' for stack walker
3894   __ push(rdx);
3895   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3896 
3897   // Save callee-saved registers.  See x86_64.ad.
3898 
3899   // rbp is an implicitly saved callee saved register (i.e., the calling
3900   // convention will save/restore it in the prolog/epilog). Other than that
3901   // there are no callee save registers now that adapter frames are gone.
3902 
3903   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3904 
3905   // Store exception in Thread object. We cannot pass any arguments to the
3906   // handle_exception call, since we do not want to make any assumption
  // about the size of the frame in which the exception happened.
3908   // c_rarg0 is either rdi (Linux) or rcx (Windows).
  __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3910   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3911 
3912   // This call does all the hard work.  It checks if an exception handler
3913   // exists in the method.
3914   // If so, it returns the handler address.
3915   // If not, it prepares for stack-unwinding, restoring the callee-save
3916   // registers of the frame being removed.
3917   //
3918   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3919 
3920   // At a method handle call, the stack may not be properly aligned
3921   // when returning with an exception.
3922   address the_pc = __ pc();
3923   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3924   __ mov(c_rarg0, r15_thread);
3925   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3926   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3927 
3928   // Set an oopmap for the call site.  This oopmap will only be used if we
3929   // are unwinding the stack.  Hence, all locations will be dead.
3930   // Callee-saved registers will be the same as the frame above (i.e.,
3931   // handle_exception_stub), since they were restored when we got the
3932   // exception.
3933 
3934   OopMapSet* oop_maps = new OopMapSet();
3935 
3936   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3937 
3938   __ reset_last_Java_frame(false);
3939 
3940   // Restore callee-saved registers
3941 
  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-save registers now that adapter frames are gone.
3945 
3946   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3947 
3948   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3949   __ pop(rdx);                  // No need for exception pc anymore
3950 
3951   // rax: exception handler
3952 
3953   // We have a handler in rax (could be deopt blob).
3954   __ mov(r8, rax);
3955 
3956   // Get the exception oop
3957   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3958   // Get the exception pc in case we are deoptimized
3959   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3960 #ifdef ASSERT
3961   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3962   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3963 #endif
3964   // Clear the exception oop so GC no longer processes it as a root.
3965   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3966 
3967   // rax: exception oop
3968   // r8:  exception handler
3969   // rdx: exception pc
3970   // Jump to handler
3971 
3972   __ jmp(r8);
3973 
3974   // Make sure all code is generated
3975   masm->flush();
3976 
3977   // Set exception blob
3978   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3979 }
3980 #endif // COMPILER2
3981 
BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
  BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
  CodeBuffer buffer(buf);
  short buffer_locs[20];
  buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
                                         sizeof(buffer_locs)/sizeof(relocInfo));

  MacroAssembler* masm = new MacroAssembler(&buffer);

  const Array<SigEntry>* sig_vk = vk->extended_sig();
  const Array<VMRegPair>* regs = vk->return_regs();
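  // sig_vk describes the flattened field layout of the inline type and regs
  // gives the register assigned to each field value.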

  int pack_fields_jobject_off = __ offset();
  // Resolve pre-allocated buffer from JNI handle.
  // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
  __ movptr(rax, Address(r13, 0));
  __ resolve_jobject(rax /* value */,
                     r15_thread /* thread */,
                     r12 /* tmp */);
  __ movptr(Address(r13, 0), rax);
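  // Fall through to the field packing code below with the resolved buffer oop in rax.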

  int pack_fields_off = __ offset();

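  // Pack: store each field value from its return register into the buffer
  // pointed to by rax, using the field offsets from the extended signature.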
  int j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_PRIMITIVE_OBJECT) {
      continue;
    }
    if (bt == T_VOID) {
      // A T_VOID that follows a T_LONG or T_DOUBLE is the placeholder for its
      // upper half and consumes an extra entry in the register array.
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }
    int off = sig_vk->at(i)._offset;
    assert(off > 0, "offset in object should be positive");
    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    VMReg r_2 = pair.second();
    Address to(rax, off);
    if (bt == T_FLOAT) {
      __ movflt(to, r_1->as_XMMRegister());
    } else if (bt == T_DOUBLE) {
      __ movdbl(to, r_1->as_XMMRegister());
    } else {
      Register val = r_1->as_Register();
      assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
      if (is_reference_type(bt)) {
        __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
      } else {
        __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
      }
    }
    j++;
  }
  assert(j == regs->length(), "missed a field?");

  __ ret(0);

  int unpack_fields_off = __ offset();

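  // Unpack: if rax is non-null it points to a buffered value; load each field
  // from the buffer into its return register. A null rax leaves the registers
  // untouched.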
  Label skip;
  __ testptr(rax, rax);
  __ jcc(Assembler::zero, skip);

  j = 1;
  for (int i = 0; i < sig_vk->length(); i++) {
    BasicType bt = sig_vk->at(i)._bt;
    if (bt == T_PRIMITIVE_OBJECT) {
      continue;
    }
    if (bt == T_VOID) {
      if (sig_vk->at(i-1)._bt == T_LONG ||
          sig_vk->at(i-1)._bt == T_DOUBLE) {
        j++;
      }
      continue;
    }
    int off = sig_vk->at(i)._offset;
    assert(off > 0, "offset in object should be positive");
    VMRegPair pair = regs->at(j);
    VMReg r_1 = pair.first();
    VMReg r_2 = pair.second();
    Address from(rax, off);
    if (bt == T_FLOAT) {
      __ movflt(r_1->as_XMMRegister(), from);
    } else if (bt == T_DOUBLE) {
      __ movdbl(r_1->as_XMMRegister(), from);
    } else if (bt == T_OBJECT || bt == T_ARRAY) {
      assert_different_registers(rax, r_1->as_Register());
      __ load_heap_oop(r_1->as_Register(), from);
    } else {
      assert(is_java_primitive(bt), "unexpected basic type");
      assert_different_registers(rax, r_1->as_Register());
      size_t size_in_bytes = type2aelembytes(bt);
      __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
    }
    j++;
  }
  assert(j == regs->length(), "missed a field?");

  __ bind(skip);
  __ ret(0);

  __ flush();

  return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
}