1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/compiledIC.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/icBuffer.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/compiledICHolder.hpp"
  46 #include "oops/klass.inline.hpp"
  47 #include "oops/method.inline.hpp"
  48 #include "prims/methodHandles.hpp"
  49 #include "runtime/continuation.hpp"
  50 #include "runtime/continuationEntry.inline.hpp"
  51 #include "runtime/globals.hpp"
  52 #include "runtime/jniHandles.hpp"
  53 #include "runtime/safepointMechanism.hpp"
  54 #include "runtime/sharedRuntime.hpp"
  55 #include "runtime/signature.hpp"
  56 #include "runtime/stubRoutines.hpp"
  57 #include "runtime/vframeArray.hpp"
  58 #include "runtime/vm_version.hpp"
  59 #include "utilities/align.hpp"
  60 #include "utilities/formatBuffer.hpp"
  61 #include "vmreg_x86.inline.hpp"
  62 #ifdef COMPILER1
  63 #include "c1/c1_Runtime1.hpp"
  64 #endif
  65 #ifdef COMPILER2
  66 #include "opto/runtime.hpp"
  67 #endif
  68 #if INCLUDE_JVMCI
  69 #include "jvmci/jvmciJavaClasses.hpp"
  70 #endif
  71 
  72 #define __ masm->
  73 
  74 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  75 
  76 class SimpleRuntimeFrame {
  77 
  78   public:
  79 
  80   // Most of the runtime stubs have this simple frame layout.
  81   // This class exists to make the layout shared in one place.
  82   // Offsets are for compiler stack slots, which are jints.
  83   enum layout {
  84     // The frame sender code expects that rbp will be in the "natural" place and
  85     // will override any oopMap setting for it. We must therefore force the layout
  86     // so that it agrees with the frame sender code.
  87     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  88     rbp_off2,
  89     return_off, return_off2,
  90     framesize
  91   };
  92 };
  93 
  94 class RegisterSaver {
  95   // Capture info about frame layout.  Layout offsets are in jint
  96   // units because compiler frame slots are jints.
  97 #define XSAVE_AREA_BEGIN 160
  98 #define XSAVE_AREA_YMM_BEGIN 576
  99 #define XSAVE_AREA_OPMASK_BEGIN 1088
 100 #define XSAVE_AREA_ZMM_BEGIN 1152
 101 #define XSAVE_AREA_UPPERBANK 1664
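     // Note: these byte offsets match the standard (non-compacted) x86 XSAVE layout:
     // the legacy FXSAVE image occupies bytes 0..511 (XMM0 is stored at byte 160), the
     // XSAVE header follows at 512..575, the AVX YMM-high state begins at 576, the
     // opmask (k0..k7) state at 1088, the ZMM0..15 upper-256-bit state at 1152, and the
     // full ZMM16..31 "upper bank" state at 1664.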
 102 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 103 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 104 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 105 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 106 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 107   enum layout {
 108     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 109     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 110     DEF_XMM_OFFS(0),
 111     DEF_XMM_OFFS(1),
 112     // 2..15 are implied in range usage
 113     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 114     DEF_YMM_OFFS(0),
 115     DEF_YMM_OFFS(1),
 116     // 2..15 are implied in range usage
 117     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 118     DEF_OPMASK_OFFS(0),
 119     DEF_OPMASK_OFFS(1),
 120     // 2..7 are implied in range usage
 121     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 122     DEF_ZMM_OFFS(0),
 123     DEF_ZMM_OFFS(1),
 124     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 125     DEF_ZMM_UPPER_OFFS(16),
 126     DEF_ZMM_UPPER_OFFS(17),
 127     // 18..31 are implied in range usage
 128     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 129     fpu_stateH_end,
 130     r15_off, r15H_off,
 131     r14_off, r14H_off,
 132     r13_off, r13H_off,
 133     r12_off, r12H_off,
 134     r11_off, r11H_off,
 135     r10_off, r10H_off,
 136     r9_off,  r9H_off,
 137     r8_off,  r8H_off,
 138     rdi_off, rdiH_off,
 139     rsi_off, rsiH_off,
 140     ignore_off, ignoreH_off,  // extra copy of rbp
 141     rsp_off, rspH_off,
 142     rbx_off, rbxH_off,
 143     rdx_off, rdxH_off,
 144     rcx_off, rcxH_off,
 145     rax_off, raxH_off,
 146     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 147     align_off, alignH_off,
 148     flags_off, flagsH_off,
 149     // The frame sender code expects that rbp will be in the "natural" place and
 150     // will override any oopMap setting for it. We must therefore force the layout
 151     // so that it agrees with the frame sender code.
 152     rbp_off, rbpH_off,        // copy of rbp we will restore
 153     return_off, returnH_off,  // slot for return address
 154     reg_save_size             // size in compiler stack slots
 155   };
 156 
 157  public:
 158   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 159   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 160 
 161   // Offsets into the register save area
 162   // Used by deoptimization when it is managing result register
 163   // values on its own
 164 
 165   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 166   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 167   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 168   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 169   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 170 
 171   // During deoptimization only the result registers need to be restored,
 172   // all the other values have already been extracted.
 173   static void restore_result_registers(MacroAssembler* masm);
 174 };
 175 
 176 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 177   int off = 0;
 178   int num_xmm_regs = XMMRegister::available_xmm_registers();
 179 #if COMPILER2_OR_JVMCI
 180   if (save_wide_vectors && UseAVX == 0) {
 181     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 182   }
 183   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 184 #else
 185   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 186 #endif
 187 
 188   // Always make the frame size 16-byte aligned, both vector and non vector stacks are always allocated
 189   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 190   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 191   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 192   // CodeBlob frame size is in words.
 193   int frame_size_in_words = frame_size_in_bytes / wordSize;
 194   *total_frame_words = frame_size_in_words;
 195 
 196   // Save registers, fpu state, and flags.
 197   // We assume caller has already pushed the return address onto the
 198   // stack, so rsp is 8-byte aligned here.
 199   // We push rbp twice in this sequence because we want the real rbp
 200   // to be under the return address like a normal enter.
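       // After enter() and push_CPU_state() the frame matches the RegisterSaver::layout
       // enum above: the return address and saved rbp sit at the high end, with the
       // fxsave/xsave image at the low end (just above the argument register save area
       // that is allocated further down).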
 201 
 202   __ enter();          // rsp becomes 16-byte aligned here
 203   __ push_CPU_state(); // Push a multiple of 16 bytes
 204 
 205   // push cpu state handles this on EVEX enabled targets
 206   if (save_wide_vectors) {
 207     // Save upper half of YMM registers(0..15)
 208     int base_addr = XSAVE_AREA_YMM_BEGIN;
 209     for (int n = 0; n < 16; n++) {
 210       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 211     }
 212     if (VM_Version::supports_evex()) {
 213       // Save upper half of ZMM registers(0..15)
 214       base_addr = XSAVE_AREA_ZMM_BEGIN;
 215       for (int n = 0; n < 16; n++) {
 216         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 217       }
 218       // Save full ZMM registers(16..num_xmm_regs)
 219       base_addr = XSAVE_AREA_UPPERBANK;
 220       off = 0;
 221       int vector_len = Assembler::AVX_512bit;
 222       for (int n = 16; n < num_xmm_regs; n++) {
 223         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 224       }
 225 #if COMPILER2_OR_JVMCI
 226       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 227       off = 0;
 228       for(int n = 0; n < KRegister::number_of_registers; n++) {
 229         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 230       }
 231 #endif
 232     }
 233   } else {
 234     if (VM_Version::supports_evex()) {
 235       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 236       int base_addr = XSAVE_AREA_UPPERBANK;
 237       off = 0;
 238       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 239       for (int n = 16; n < num_xmm_regs; n++) {
 240         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 241       }
 242 #if COMPILER2_OR_JVMCI
 243       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 244       off = 0;
 245       for(int n = 0; n < KRegister::number_of_registers; n++) {
 246         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 247       }
 248 #endif
 249     }
 250   }
 251   __ vzeroupper();
 252   if (frame::arg_reg_save_area_bytes != 0) {
 253     // Allocate argument register save area
 254     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 255   }
 256 
 257   // Set an oopmap for the call site.  This oopmap will map all
 258   // oop-registers and debug-info registers as callee-saved.  This
 259   // will allow deoptimization at this safepoint to find all possible
 260   // debug-info recordings, as well as let GC find all oops.
 261 
 262   OopMapSet *oop_maps = new OopMapSet();
 263   OopMap* map = new OopMap(frame_size_in_slots, 0);
 264 
 265 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 266 
 267   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 268   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 271   // rbp location is known implicitly by the frame sender code, needs no oopmap
 272   // and the location where rbp was saved is ignored
 273   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 274   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 283   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 284   // on EVEX enabled targets it is also included in the xsave area
 285   off = xmm0_off;
 286   int delta = xmm1_off - off;
 287   for (int n = 0; n < 16; n++) {
 288     XMMRegister xmm_name = as_XMMRegister(n);
 289     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 290     off += delta;
 291   }
 292   if (UseAVX > 2) {
 293     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 294     off = zmm16_off;
 295     delta = zmm17_off - off;
 296     for (int n = 16; n < num_xmm_regs; n++) {
 297       XMMRegister zmm_name = as_XMMRegister(n);
 298       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 299       off += delta;
 300     }
 301   }
 302 
 303 #if COMPILER2_OR_JVMCI
 304   if (save_wide_vectors) {
 305     // Save upper half of YMM registers(0..15)
 306     off = ymm0_off;
 307     delta = ymm1_off - ymm0_off;
 308     for (int n = 0; n < 16; n++) {
 309       XMMRegister ymm_name = as_XMMRegister(n);
 310       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 311       off += delta;
 312     }
 313     if (VM_Version::supports_evex()) {
 314       // Save upper half of ZMM registers(0..15)
 315       off = zmm0_off;
 316       delta = zmm1_off - zmm0_off;
 317       for (int n = 0; n < 16; n++) {
 318         XMMRegister zmm_name = as_XMMRegister(n);
 319         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 320         off += delta;
 321       }
 322     }
 323   }
 324 #endif // COMPILER2_OR_JVMCI
 325 
 326   // %%% These should all be a waste but we'll keep things as they were for now
 327   if (true) {
 328     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 329     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 332     // rbp location is known implicitly by the frame sender code, needs no oopmap
 333     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 334     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 343     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
 344     // on EVEX enabled targets it is also included in the xsave area
 345     off = xmm0H_off;
 346     delta = xmm1H_off - off;
 347     for (int n = 0; n < 16; n++) {
 348       XMMRegister xmm_name = as_XMMRegister(n);
 349       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 350       off += delta;
 351     }
 352     if (UseAVX > 2) {
 353       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 354       off = zmm16H_off;
 355       delta = zmm17H_off - off;
 356       for (int n = 16; n < num_xmm_regs; n++) {
 357         XMMRegister zmm_name = as_XMMRegister(n);
 358         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 359         off += delta;
 360       }
 361     }
 362   }
 363 
 364   return map;
 365 }
 366 
 367 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 368   int num_xmm_regs = XMMRegister::available_xmm_registers();
 369   if (frame::arg_reg_save_area_bytes != 0) {
 370     // Pop arg register save area
 371     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 372   }
 373 
 374 #if COMPILER2_OR_JVMCI
 375   if (restore_wide_vectors) {
 376     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 377     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 378   }
 379 #else
 380   assert(!restore_wide_vectors, "vectors are generated only by C2");
 381 #endif
 382 
 383   __ vzeroupper();
 384 
 385   // On EVEX enabled targets everything is handled in pop fpu state
 386   if (restore_wide_vectors) {
 387     // Restore upper half of YMM registers (0..15)
 388     int base_addr = XSAVE_AREA_YMM_BEGIN;
 389     for (int n = 0; n < 16; n++) {
 390       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 391     }
 392     if (VM_Version::supports_evex()) {
 393       // Restore upper half of ZMM registers (0..15)
 394       base_addr = XSAVE_AREA_ZMM_BEGIN;
 395       for (int n = 0; n < 16; n++) {
 396         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 397       }
 398       // Restore full ZMM registers(16..num_xmm_regs)
 399       base_addr = XSAVE_AREA_UPPERBANK;
 400       int vector_len = Assembler::AVX_512bit;
 401       int off = 0;
 402       for (int n = 16; n < num_xmm_regs; n++) {
 403         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 404       }
 405 #if COMPILER2_OR_JVMCI
 406       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 407       off = 0;
 408       for (int n = 0; n < KRegister::number_of_registers; n++) {
 409         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 410       }
 411 #endif
 412     }
 413   } else {
 414     if (VM_Version::supports_evex()) {
 415       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 416       int base_addr = XSAVE_AREA_UPPERBANK;
 417       int off = 0;
 418       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 419       for (int n = 16; n < num_xmm_regs; n++) {
 420         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 421       }
 422 #if COMPILER2_OR_JVMCI
 423       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 424       off = 0;
 425       for (int n = 0; n < KRegister::number_of_registers; n++) {
 426         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 427       }
 428 #endif
 429     }
 430   }
 431 
 432   // Recover CPU state
 433   __ pop_CPU_state();
 434   // Get the rbp described implicitly by the calling convention (no oopMap)
 435   __ pop(rbp);
 436 }
 437 
 438 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 439 
 440   // Just restore the result registers. Only used by deoptimization. By
 441   // now any callee save register that needs to be restored to a c2
 442   // caller of the deoptee has been extracted into the vframeArray
 443   // and will be stuffed into the c2i adapter we create for later
 444   // restoration so only result registers need to be restored here.
 445 
 446   // Restore fp result register
 447   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 448   // Restore integer result register
 449   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 450   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 451 
 452   // Pop all of the register save area off the stack except the return address
 453   __ addptr(rsp, return_offset_in_bytes());
 454 }
 455 
 456 // Is the vector's size (in bytes) bigger than the size saved by default?
 457 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 458 bool SharedRuntime::is_wide_vector(int size) {
 459   return size > 16;
 460 }
 461 
 462 // ---------------------------------------------------------------------------
 463 // Read the array of BasicTypes from a signature, and compute where the
 464 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 465 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 466 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 467 // as framesizes are fixed.
 468 // VMRegImpl::stack0 refers to the first slot 0(sp),
 469 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
 470 // Registers up to Register::number_of_registers are the 64-bit
 471 // integer registers.
 472 
 473 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 474 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 475 // units regardless of build. Of course for i486 there is no 64 bit build
 476 
 477 // The Java calling convention is a "shifted" version of the C ABI.
 478 // By skipping the first C ABI register we can call non-static jni methods
 479 // with small numbers of arguments without having to shuffle the arguments
 480 // at all. Since we control the java ABI we ought to at least get some
 481 // advantage out of it.
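     //
     // As an illustration of the mapping implemented below: a signature (int, long,
     // Object, double) arrives as sig_bt = { T_INT, T_LONG, T_VOID, T_OBJECT, T_DOUBLE,
     // T_VOID } and is assigned j_rarg0 (int), j_rarg1 (long, its T_VOID half marked
     // bad), j_rarg2 (Object) and j_farg0 (double), using no stack slots.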
 482 
 483 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 484                                            VMRegPair *regs,
 485                                            int total_args_passed) {
 486 
 487   // Create the mapping between argument positions and
 488   // registers.
 489   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 490     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 491   };
 492   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 493     j_farg0, j_farg1, j_farg2, j_farg3,
 494     j_farg4, j_farg5, j_farg6, j_farg7
 495   };
 496 
 497 
 498   uint int_args = 0;
 499   uint fp_args = 0;
 500   uint stk_args = 0; // inc by 2 each time
 501 
 502   for (int i = 0; i < total_args_passed; i++) {
 503     switch (sig_bt[i]) {
 504     case T_BOOLEAN:
 505     case T_CHAR:
 506     case T_BYTE:
 507     case T_SHORT:
 508     case T_INT:
 509       if (int_args < Argument::n_int_register_parameters_j) {
 510         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 511       } else {
 512         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 513         stk_args += 2;
 514       }
 515       break;
 516     case T_VOID:
 517       // halves of T_LONG or T_DOUBLE
 518       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 519       regs[i].set_bad();
 520       break;
 521     case T_LONG:
 522       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 523       // fall through
 524     case T_OBJECT:
 525     case T_ARRAY:
 526     case T_ADDRESS:
 527       if (int_args < Argument::n_int_register_parameters_j) {
 528         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 529       } else {
 530         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 531         stk_args += 2;
 532       }
 533       break;
 534     case T_FLOAT:
 535       if (fp_args < Argument::n_float_register_parameters_j) {
 536         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 537       } else {
 538         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 539         stk_args += 2;
 540       }
 541       break;
 542     case T_DOUBLE:
 543       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 544       if (fp_args < Argument::n_float_register_parameters_j) {
 545         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 546       } else {
 547         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 548         stk_args += 2;
 549       }
 550       break;
 551     default:
 552       ShouldNotReachHere();
 553       break;
 554     }
 555   }
 556 
 557   return align_up(stk_args, 2);
 558 }
 559 
 560 // Patch the caller's callsite with the entry to compiled code if it exists.
 561 static void patch_callers_callsite(MacroAssembler *masm) {
 562   Label L;
 563   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 564   __ jcc(Assembler::equal, L);
 565 
 566   // Save the current stack pointer
 567   __ mov(r13, rsp);
 568   // Schedule the branch target address early.
 569   // Call into the VM to patch the caller, then jump to compiled callee
 570   // rax isn't live so capture return address while we easily can
 571   __ movptr(rax, Address(rsp, 0));
 572 
 573   // align stack so push_CPU_state doesn't fault
 574   __ andptr(rsp, -(StackAlignmentInBytes));
 575   __ push_CPU_state();
 576   __ vzeroupper();
 577   // VM needs caller's callsite
 578   // VM needs target method
 579   // This needs to be a long call since we will relocate this adapter to
 580   // the codeBuffer and it may not reach
 581 
 582   // Allocate argument register save area
 583   if (frame::arg_reg_save_area_bytes != 0) {
 584     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 585   }
 586   __ mov(c_rarg0, rbx);
 587   __ mov(c_rarg1, rax);
 588   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 589 
 590   // De-allocate argument register save area
 591   if (frame::arg_reg_save_area_bytes != 0) {
 592     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 593   }
 594 
 595   __ vzeroupper();
 596   __ pop_CPU_state();
 597   // restore sp
 598   __ mov(rsp, r13);
 599   __ bind(L);
 600 }
 601 
 602 
 603 static void gen_c2i_adapter(MacroAssembler *masm,
 604                             int total_args_passed,
 605                             int comp_args_on_stack,
 606                             const BasicType *sig_bt,
 607                             const VMRegPair *regs,
 608                             Label& skip_fixup) {
 609   // Before we get into the guts of the C2I adapter, see if we should be here
 610   // at all.  We've come from compiled code and are attempting to jump to the
 611   // interpreter, which means the caller made a static call to get here
 612   // (vcalls always get a compiled target if there is one).  Check for a
 613   // compiled target.  If there is one, we need to patch the caller's call.
 614   patch_callers_callsite(masm);
 615 
 616   __ bind(skip_fixup);
 617 
 618   // Since all args are passed on the stack, total_args_passed *
 619   // Interpreter::stackElementSize is the space we need.
 620 
 621   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 622 
 623   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 624 
 625   // stack is aligned, keep it that way
 626   // This is not currently needed or enforced by the interpreter, but
 627   // we might as well conform to the ABI.
 628   extraspace = align_up(extraspace, 2*wordSize);
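       // (e.g. 5 interpreter slots: 5 * 8 = 40 bytes, rounded up to 48, assuming the
       // usual 8-byte stackElementSize of the 64-bit interpreter)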
 629 
 630   // set senderSP value
 631   __ lea(r13, Address(rsp, wordSize));
 632 
 633 #ifdef ASSERT
 634   __ check_stack_alignment(r13, "sender stack not aligned");
 635 #endif
 636   if (extraspace > 0) {
 637     // Pop the return address
 638     __ pop(rax);
 639 
 640     __ subptr(rsp, extraspace);
 641 
 642     // Push the return address
 643     __ push(rax);
 644 
 645     // Account for the return address location since we store it first rather
 646     // than hold it in a register across all the shuffling
 647     extraspace += wordSize;
 648   }
 649 
 650 #ifdef ASSERT
 651   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 652 #endif
 653 
 654   // Now write the args into the outgoing interpreter space
 655   for (int i = 0; i < total_args_passed; i++) {
 656     if (sig_bt[i] == T_VOID) {
 657       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 658       continue;
 659     }
 660 
 661     // offset to start parameters
 662     int st_off   = (total_args_passed - i) * Interpreter::stackElementSize;
 663     int next_off = st_off - Interpreter::stackElementSize;
 664 
 665     // Say 4 args:
 666     // i   st_off
 667     // 0   32 T_LONG
 668     // 1   24 T_VOID
 669     // 2   16 T_OBJECT
 670     // 3    8 T_BOOL
 671     // -    0 return address
 672     //
 673     // However, to make things extra confusing: because we can fit a long/double in
 674     // a single slot on a 64-bit VM, and it would be silly to break them up, the interpreter
 675     // leaves one slot empty and only stores to a single slot. In this case the
 676     // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 677 
 678     VMReg r_1 = regs[i].first();
 679     VMReg r_2 = regs[i].second();
 680     if (!r_1->is_valid()) {
 681       assert(!r_2->is_valid(), "");
 682       continue;
 683     }
 684     if (r_1->is_stack()) {
 685       // memory to memory use rax
 686       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 687       if (!r_2->is_valid()) {
 688         // sign extend??
 689         __ movl(rax, Address(rsp, ld_off));
 690         __ movptr(Address(rsp, st_off), rax);
 691 
 692       } else {
 693 
 694         __ movq(rax, Address(rsp, ld_off));
 695 
 696         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 697         // T_DOUBLE and T_LONG use two slots in the interpreter
 698         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 699           // ld_off == LSW, ld_off+wordSize == MSW
 700           // st_off == MSW, next_off == LSW
 701           __ movq(Address(rsp, next_off), rax);
 702 #ifdef ASSERT
 703           // Overwrite the unused slot with known junk
 704           __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 705           __ movptr(Address(rsp, st_off), rax);
 706 #endif /* ASSERT */
 707         } else {
 708           __ movq(Address(rsp, st_off), rax);
 709         }
 710       }
 711     } else if (r_1->is_Register()) {
 712       Register r = r_1->as_Register();
 713       if (!r_2->is_valid()) {
 714         // must be only an int (or less) so move only 32 bits to the slot
 715         // why not sign extend??
 716         __ movl(Address(rsp, st_off), r);
 717       } else {
 718         // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
 719         // T_DOUBLE and T_LONG use two slots in the interpreter
 720         if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
 721           // long/double in gpr
 722 #ifdef ASSERT
 723           // Overwrite the unused slot with known junk
 724           __ mov64(rax, CONST64(0xdeadffffdeadaaab));
 725           __ movptr(Address(rsp, st_off), rax);
 726 #endif /* ASSERT */
 727           __ movq(Address(rsp, next_off), r);
 728         } else {
 729           __ movptr(Address(rsp, st_off), r);
 730         }
 731       }
 732     } else {
 733       assert(r_1->is_XMMRegister(), "");
 734       if (!r_2->is_valid()) {
 735         // only a float, so use just part of the slot
 736         __ movflt(Address(rsp, st_off), r_1->as_XMMRegister());
 737       } else {
 738 #ifdef ASSERT
 739         // Overwrite the unused slot with known junk
 740         __ mov64(rax, CONST64(0xdeadffffdeadaaac));
 741         __ movptr(Address(rsp, st_off), rax);
 742 #endif /* ASSERT */
 743         __ movdbl(Address(rsp, next_off), r_1->as_XMMRegister());
 744       }
 745     }
 746   }
 747 
 748   // Schedule the branch target address early.
 749   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 750   __ jmp(rcx);
 751 }
 752 
 753 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 754                         address code_start, address code_end,
 755                         Label& L_ok) {
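       // Branches to L_ok when code_start < pc_reg < code_end; otherwise falls through
       // (via L_fail) so the caller can emit its own failure handling.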
 756   Label L_fail;
 757   __ lea(temp_reg, ExternalAddress(code_start));
 758   __ cmpptr(pc_reg, temp_reg);
 759   __ jcc(Assembler::belowEqual, L_fail);
 760   __ lea(temp_reg, ExternalAddress(code_end));
 761   __ cmpptr(pc_reg, temp_reg);
 762   __ jcc(Assembler::below, L_ok);
 763   __ bind(L_fail);
 764 }
 765 
 766 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 767                                     int total_args_passed,
 768                                     int comp_args_on_stack,
 769                                     const BasicType *sig_bt,
 770                                     const VMRegPair *regs) {
 771 
 772   // Note: r13 contains the senderSP on entry. We must preserve it since
 773   // we may do an i2c -> c2i transition if we lose a race where compiled
 774   // code goes non-entrant while we get args ready.
 775   // In addition we use r13 to locate all the interpreter args as
 776   // we must align the stack to 16 bytes on an i2c entry, else we
 777   // lose the alignment we expect in all compiled code and the register
 778   // save code can segv when fxsave instructions find an improperly
 779   // aligned stack pointer.
 780 
 781   // Adapters can be frameless because they do not require the caller
 782   // to perform additional cleanup work, such as correcting the stack pointer.
 783   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 784   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 785   // even if a callee has modified the stack pointer.
 786   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 787   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 788   // up via the senderSP register).
 789   // In other words, if *either* the caller or callee is interpreted, we can
 790   // get the stack pointer repaired after a call.
 791   // This is why c2i and i2c adapters cannot be indefinitely composed.
 792   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 793   // both caller and callee would be compiled methods, and neither would
 794   // clean up the stack pointer changes performed by the two adapters.
 795   // If this happens, control eventually transfers back to the compiled
 796   // caller, but with an uncorrected stack, causing delayed havoc.
 797 
 798   if (VerifyAdapterCalls &&
 799       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
 800     // So, let's test for cascading c2i/i2c adapters right now.
 801     //  assert(Interpreter::contains($return_addr) ||
 802     //         StubRoutines::contains($return_addr),
 803     //         "i2c adapter must return to an interpreter frame");
 804     __ block_comment("verify_i2c { ");
 805     // Pick up the return address
 806     __ movptr(rax, Address(rsp, 0));
 807     Label L_ok;
 808     if (Interpreter::code() != NULL)
 809       range_check(masm, rax, r11,
 810                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
 811                   L_ok);
 812     if (StubRoutines::code1() != NULL)
 813       range_check(masm, rax, r11,
 814                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
 815                   L_ok);
 816     if (StubRoutines::code2() != NULL)
 817       range_check(masm, rax, r11,
 818                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
 819                   L_ok);
 820     const char* msg = "i2c adapter must return to an interpreter frame";
 821     __ block_comment(msg);
 822     __ stop(msg);
 823     __ bind(L_ok);
 824     __ block_comment("} verify_i2ce ");
 825   }
 826 
 827   // Must preserve original SP for loading incoming arguments because
 828   // we need to align the outgoing SP for compiled code.
 829   __ movptr(r11, rsp);
 830 
 831   // Pick up the return address
 832   __ pop(rax);
 833 
 834   // Convert 4-byte c2 stack slots to words.
 835   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 836 
 837   if (comp_args_on_stack) {
 838     __ subptr(rsp, comp_words_on_stack * wordSize);
 839   }
 840 
 841   // Ensure compiled code always sees stack at proper alignment
 842   __ andptr(rsp, -16);
 843 
 844   // Push the return address, misaligning the stack so that the youngest frame sees
 845   // the same alignment it would see right after a call instruction
 846   __ push(rax);
 847 
 848   // Put saved SP in another register
 849   const Register saved_sp = rax;
 850   __ movptr(saved_sp, r11);
 851 
 852   // Will jump to the compiled code just as if compiled code was doing it.
 853   // Pre-load the register-jump target early, to schedule it better.
 854   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_offset())));
 855 
 856 #if INCLUDE_JVMCI
 857   if (EnableJVMCI) {
 858     // check if this call should be routed towards a specific entry point
 859     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 860     Label no_alternative_target;
 861     __ jcc(Assembler::equal, no_alternative_target);
 862     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
 863     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
 864     __ bind(no_alternative_target);
 865   }
 866 #endif // INCLUDE_JVMCI
 867 
 868   // Now generate the shuffle code.  Pick up all register args and move the
 869   // rest through the floating point stack top.
 870   for (int i = 0; i < total_args_passed; i++) {
 871     if (sig_bt[i] == T_VOID) {
 872       // Longs and doubles are passed in native word order, but misaligned
 873       // in the 32-bit build.
 874       assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
 875       continue;
 876     }
 877 
 878     // Pick up 0, 1 or 2 words from SP+offset.
 879 
 880     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
 881             "scrambled load targets?");
 882     // Load in argument order going down.
 883     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
 884     // Point to interpreter value (vs. tag)
 885     int next_off = ld_off - Interpreter::stackElementSize;
 886     //
 887     //
 888     //
 889     VMReg r_1 = regs[i].first();
 890     VMReg r_2 = regs[i].second();
 891     if (!r_1->is_valid()) {
 892       assert(!r_2->is_valid(), "");
 893       continue;
 894     }
 895     if (r_1->is_stack()) {
 896       // Convert stack slot to an SP offset (+ wordSize to account for return address )
 897       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
 898 
 899       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 900       // and if we end up going thru a c2i because of a miss, a reasonable value of r13
 901       // will be generated.
 902       if (!r_2->is_valid()) {
 903         // sign extend???
 904         __ movl(r13, Address(saved_sp, ld_off));
 905         __ movptr(Address(rsp, st_off), r13);
 906       } else {
 907         //
 908         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 909         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
 910         // So we must adjust where to pick up the data to match the interpreter.
 911         //
 912         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
 913         // are accessed at negative offsets so the LSW is at the LOW address
 914 
 915         // ld_off is MSW so get LSW
 916         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 917                            next_off : ld_off;
 918         __ movq(r13, Address(saved_sp, offset));
 919         // st_off is LSW (i.e. reg.first())
 920         __ movq(Address(rsp, st_off), r13);
 921       }
 922     } else if (r_1->is_Register()) {  // Register argument
 923       Register r = r_1->as_Register();
 924       assert(r != rax, "must be different");
 925       if (r_2->is_valid()) {
 926         //
 927         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE
 928         // the interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case
 929         // So we must adjust where to pick up the data to match the interpreter.
 930 
 931         const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
 932                            next_off : ld_off;
 933 
 934         // this can be a misaligned move
 935         __ movq(r, Address(saved_sp, offset));
 936       } else {
 937         // sign extend and use a full word?
 938         __ movl(r, Address(saved_sp, ld_off));
 939       }
 940     } else {
 941       if (!r_2->is_valid()) {
 942         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
 943       } else {
 944         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
 945       }
 946     }
 947   }
 948 
 949   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
 950 
 951   // 6243940 We might end up in handle_wrong_method if
 952   // the callee is deoptimized as we race thru here. If that
 953   // happens we don't want to take a safepoint because the
 954   // caller frame will look interpreted and arguments are now
 955   // "compiled" so it is much better to make this transition
 956   // invisible to the stack walking code. Unfortunately if
 957   // we try and find the callee by normal means a safepoint
 958   // is possible. So we stash the desired callee in the thread
 959   // and the vm will find it there should this case occur.
 960 
 961   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
 962 
 963   // put Method* where a c2i would expect it, should we end up there;
 964   // only needed because c2 resolve stubs return Method* as a result in
 965   // rax
 966   __ mov(rax, rbx);
 967   __ jmp(r11);
 968 }
 969 
 970 // ---------------------------------------------------------------
 971 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
 972                                                             int total_args_passed,
 973                                                             int comp_args_on_stack,
 974                                                             const BasicType *sig_bt,
 975                                                             const VMRegPair *regs,
 976                                                             AdapterFingerPrint* fingerprint) {
 977   address i2c_entry = __ pc();
 978 
 979   gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
 980 
 981   // -------------------------------------------------------------------------
 982   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
 983   // to the interpreter.  The args start out packed in the compiled layout.  They
 984   // need to be unpacked into the interpreter layout.  This will almost always
 985   // require some stack space.  We grow the current (compiled) stack, then repack
 986   // the args.  We  finally end in a jump to the generic interpreter entry point.
 987   // On exit from the interpreter, the interpreter will restore our SP (lest the
 988   // compiled code, which relies solely on SP and not RBP, get sick).
 989 
 990   address c2i_unverified_entry = __ pc();
 991   Label skip_fixup;
 992   Label ok;
 993 
 994   Register holder = rax;
 995   Register receiver = j_rarg0;
 996   Register temp = rbx;
 997 
 998   {
 999     __ load_klass(temp, receiver, rscratch1);
1000     __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
1001     __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
1002     __ jcc(Assembler::equal, ok);
1003     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1004 
1005     __ bind(ok);
1006     // Method might have been compiled since the call site was patched to
1007     // interpreted; if that is the case treat it as a miss so we can get
1008     // the call site corrected.
1009     __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1010     __ jcc(Assembler::equal, skip_fixup);
1011     __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1012   }
1013 
1014   address c2i_entry = __ pc();
1015 
1016   // Class initialization barrier for static methods
1017   address c2i_no_clinit_check_entry = NULL;
1018   if (VM_Version::supports_fast_class_init_checks()) {
1019     Label L_skip_barrier;
1020     Register method = rbx;
1021 
1022     { // Bypass the barrier for non-static methods
1023       Register flags = rscratch1;
1024       __ movl(flags, Address(method, Method::access_flags_offset()));
1025       __ testl(flags, JVM_ACC_STATIC);
1026       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1027     }
1028 
1029     Register klass = rscratch1;
1030     __ load_method_holder(klass, method);
1031     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1032 
1033     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1034 
1035     __ bind(L_skip_barrier);
1036     c2i_no_clinit_check_entry = __ pc();
1037   }
1038 
1039   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1040   bs->c2i_entry_barrier(masm);
1041 
1042   gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
1043 
1044   __ flush();
1045   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry, c2i_no_clinit_check_entry);
1046 }
1047 
1048 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1049                                          VMRegPair *regs,
1050                                          VMRegPair *regs2,
1051                                          int total_args_passed) {
1052   assert(regs2 == NULL, "not needed on x86");
1053 // We return the amount of VMRegImpl stack slots we need to reserve for all
1054 // the arguments NOT counting out_preserve_stack_slots.
1055 
1056 // NOTE: These arrays will have to change when c1 is ported
1057 #ifdef _WIN64
1058     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1059       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1060     };
1061     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1062       c_farg0, c_farg1, c_farg2, c_farg3
1063     };
1064 #else
1065     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1066       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1067     };
1068     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1069       c_farg0, c_farg1, c_farg2, c_farg3,
1070       c_farg4, c_farg5, c_farg6, c_farg7
1071     };
1072 #endif // _WIN64
1073 
1074 
1075     uint int_args = 0;
1076     uint fp_args = 0;
1077     uint stk_args = 0; // inc by 2 each time
1078 
1079     for (int i = 0; i < total_args_passed; i++) {
1080       switch (sig_bt[i]) {
1081       case T_BOOLEAN:
1082       case T_CHAR:
1083       case T_BYTE:
1084       case T_SHORT:
1085       case T_INT:
1086         if (int_args < Argument::n_int_register_parameters_c) {
1087           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1088 #ifdef _WIN64
1089           fp_args++;
1090           // Allocate slots for the callee to stuff register args on the stack.
1091           stk_args += 2;
1092 #endif
1093         } else {
1094           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1095           stk_args += 2;
1096         }
1097         break;
1098       case T_LONG:
1099         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1100         // fall through
1101       case T_OBJECT:
1102       case T_ARRAY:
1103       case T_ADDRESS:
1104       case T_METADATA:
1105         if (int_args < Argument::n_int_register_parameters_c) {
1106           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1107 #ifdef _WIN64
1108           fp_args++;
1109           stk_args += 2;
1110 #endif
1111         } else {
1112           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1113           stk_args += 2;
1114         }
1115         break;
1116       case T_FLOAT:
1117         if (fp_args < Argument::n_float_register_parameters_c) {
1118           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1119 #ifdef _WIN64
1120           int_args++;
1121           // Allocate slots for the callee to stuff register args on the stack.
1122           stk_args += 2;
1123 #endif
1124         } else {
1125           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1126           stk_args += 2;
1127         }
1128         break;
1129       case T_DOUBLE:
1130         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1131         if (fp_args < Argument::n_float_register_parameters_c) {
1132           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1133 #ifdef _WIN64
1134           int_args++;
1135           // Allocate slots for the callee to stuff register args on the stack.
1136           stk_args += 2;
1137 #endif
1138         } else {
1139           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1140           stk_args += 2;
1141         }
1142         break;
1143       case T_VOID: // Halves of longs and doubles
1144         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1145         regs[i].set_bad();
1146         break;
1147       default:
1148         ShouldNotReachHere();
1149         break;
1150       }
1151     }
1152 #ifdef _WIN64
1153   // The Windows ABI requires that we always allocate enough stack space
1154   // for 4 64-bit registers to be stored down.
1155   if (stk_args < 8) {
1156     stk_args = 8;
1157   }
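       // (8 VMRegImpl slots == 32 bytes, i.e. the 4-word shadow space the callee may use)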
1158 #endif // _WIN64
1159 
1160   return stk_args;
1161 }
1162 
1163 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1164                                              uint num_bits,
1165                                              uint total_args_passed) {
1166   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1167          "only certain vector sizes are supported for now");
1168 
1169   static const XMMRegister VEC_ArgReg[32] = {
1170      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1171      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1172     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1173     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1174   };
1175 
1176   uint stk_args = 0;
1177   uint fp_args = 0;
1178 
1179   for (uint i = 0; i < total_args_passed; i++) {
1180     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
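         // next_val is (num_bits / 32) - 1: the number of additional 4-byte slots the
         // vector occupies beyond its first slot.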
1181     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1182     regs[i].set_pair(vmreg->next(next_val), vmreg);
1183   }
1184 
1185   return stk_args;
1186 }
1187 
1188 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1189   // We always ignore the frame_slots arg and just use the space below the frame pointer,
1190   // which by this time is free to use
1191   switch (ret_type) {
1192   case T_FLOAT:
1193     __ movflt(Address(rbp, -wordSize), xmm0);
1194     break;
1195   case T_DOUBLE:
1196     __ movdbl(Address(rbp, -wordSize), xmm0);
1197     break;
1198   case T_VOID:  break;
1199   default: {
1200     __ movptr(Address(rbp, -wordSize), rax);
1201     }
1202   }
1203 }
1204 
1205 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1206   // We always ignore the frame_slots arg and just use the space below the frame pointer,
1207   // which by this time is free to use
1208   switch (ret_type) {
1209   case T_FLOAT:
1210     __ movflt(xmm0, Address(rbp, -wordSize));
1211     break;
1212   case T_DOUBLE:
1213     __ movdbl(xmm0, Address(rbp, -wordSize));
1214     break;
1215   case T_VOID:  break;
1216   default: {
1217     __ movptr(rax, Address(rbp, -wordSize));
1218     }
1219   }
1220 }
1221 
1222 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1223     for ( int i = first_arg ; i < arg_count ; i++ ) {
1224       if (args[i].first()->is_Register()) {
1225         __ push(args[i].first()->as_Register());
1226       } else if (args[i].first()->is_XMMRegister()) {
1227         __ subptr(rsp, 2*wordSize);
1228         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1229       }
1230     }
1231 }
1232 
1233 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1234     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1235       if (args[i].first()->is_Register()) {
1236         __ pop(args[i].first()->as_Register());
1237       } else if (args[i].first()->is_XMMRegister()) {
1238         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1239         __ addptr(rsp, 2*wordSize);
1240       }
1241     }
1242 }
1243 
1244 static void verify_oop_args(MacroAssembler* masm,
1245                             const methodHandle& method,
1246                             const BasicType* sig_bt,
1247                             const VMRegPair* regs) {
1248   Register temp_reg = rbx;  // not part of any compiled calling seq
1249   if (VerifyOops) {
1250     for (int i = 0; i < method->size_of_parameters(); i++) {
1251       if (is_reference_type(sig_bt[i])) {
1252         VMReg r = regs[i].first();
1253         assert(r->is_valid(), "bad oop arg");
1254         if (r->is_stack()) {
1255           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1256           __ verify_oop(temp_reg);
1257         } else {
1258           __ verify_oop(r->as_Register());
1259         }
1260       }
1261     }
1262   }
1263 }
1264 
1265 static void check_continuation_enter_argument(VMReg actual_vmreg,
1266                                               Register expected_reg,
1267                                               const char* name) {
1268   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1269   assert(actual_vmreg->as_Register() == expected_reg,
1270          "%s is in unexpected register: %s instead of %s",
1271          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1272 }
1273 
1274 static void gen_continuation_enter(MacroAssembler* masm,
1275                                    const VMRegPair* regs,
1276                                    int& exception_offset,
1277                                    OopMapSet* oop_maps,
1278                                    int& frame_complete,
1279                                    int& stack_slots,
1280                                    int& interpreted_entry_offset,
1281                                    int& compiled_entry_offset) {
1282 
1283   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1284   int pos_cont_obj   = 0;
1285   int pos_is_cont    = 1;
1286   int pos_is_virtual = 2;
1287 
1288   // The platform-specific calling convention may present the arguments in various registers.
1289   // To simplify the rest of the code, we expect the arguments to reside in these known
1290   // registers, and we additionally check the placement here in case the calling convention ever
1291   // changes.
1292   Register reg_cont_obj   = c_rarg1;
1293   Register reg_is_cont    = c_rarg2;
1294   Register reg_is_virtual = c_rarg3;
1295 
1296   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1297   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1298   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1299 
1300   // Utility methods kill rax, make sure there are no collisions
1301   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1302 
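       // Address of the static-call resolution stub; both entries below call through
       // it when isContinue is false and the call target still needs to be resolved.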
1303   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1304                          relocInfo::static_call_type);
1305 
1306   address start = __ pc();
1307 
1308   Label L_thaw, L_exit;
1309 
1310   // i2i entry, used only in interp_only_mode
1311   interpreted_entry_offset = __ pc() - start;
1312   {
1313 #ifdef ASSERT
1314     Label is_interp_only;
1315     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1316     __ jcc(Assembler::notEqual, is_interp_only);
1317     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1318     __ bind(is_interp_only);
1319 #endif
1320 
1321     __ pop(rax); // return address
1322     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1323     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1324     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1325     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1326     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1327     __ push(rax); // return address
1328     __ push_cont_fastpath();
1329 
1330     __ enter();
1331 
1332     stack_slots = 2; // will be adjusted in setup
1333     OopMap* map = __ continuation_enter_setup(stack_slots);
1334     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1335     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1336 
1337     __ verify_oop(reg_cont_obj);
1338 
1339     __ fill_continuation_entry(reg_cont_obj, reg_is_virtual);
1340 
1341     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1342     __ testptr(reg_is_cont, reg_is_cont);
1343     __ jcc(Assembler::notZero, L_thaw);
1344 
1345     // --- Resolve path
1346 
1347     // Make sure the call is patchable
1348     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1349     // Emit stub for static call
1350     CodeBuffer* cbuf = masm->code_section()->outer();
1351     address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1352     if (stub == nullptr) {
1353       fatal("CodeCache is full at gen_continuation_enter");
1354     }
1355     __ call(resolve);
1356     oop_maps->add_gc_map(__ pc() - start, map);
1357     __ post_call_nop();
1358 
1359     __ jmp(L_exit);
1360   }
1361 
1362   // compiled entry
1363   __ align(CodeEntryAlignment);
1364   compiled_entry_offset = __ pc() - start;
1365   __ enter();
1366 
1367   stack_slots = 2; // will be adjusted in setup
1368   OopMap* map = __ continuation_enter_setup(stack_slots);
1369 
1370   // Frame is now completed as far as size and linkage.
1371   frame_complete = __ pc() - start;
1372 
1373   __ verify_oop(reg_cont_obj);
1374 
1375   __ fill_continuation_entry(reg_cont_obj, reg_is_virtual);
1376 
1377   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1378   __ testptr(reg_is_cont, reg_is_cont);
1379   __ jccb(Assembler::notZero, L_thaw);
1380 
1381   // --- call Continuation.enter(Continuation c, boolean isContinue)
1382 
1383   // Make sure the call is patchable
1384   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1385 
1386   // Emit stub for static call
1387   CodeBuffer* cbuf = masm->code_section()->outer();
1388   address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1389   if (stub == nullptr) {
1390     fatal("CodeCache is full at gen_continuation_enter");
1391   }
1392 
1393   // The call needs to be resolved. There's a special case for this in
1394   // SharedRuntime::find_callee_info_helper() which calls
1395   // LinkResolver::resolve_continuation_enter() which resolves the call to
1396   // Continuation.enter(Continuation c, boolean isContinue).
1397   __ call(resolve);
1398 
1399   oop_maps->add_gc_map(__ pc() - start, map);
1400   __ post_call_nop();
1401 
1402   __ jmpb(L_exit);
1403 
1404   // --- Thawing path
1405 
1406   __ bind(L_thaw);
1407 
1408   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1409 
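       // Remember the offset of the pc following the thaw call; ContinuationEntry later
       // resolves it to the absolute return pc into this blob.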
1410   ContinuationEntry::_return_pc_offset = __ pc() - start;
1411   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1412   __ post_call_nop();
1413 
1414   // --- Normal exit (resolve/thawing)
1415 
1416   __ bind(L_exit);
1417 
1418   __ continuation_enter_cleanup();
1419   __ pop(rbp);
1420   __ ret(0);
1421 
1422   // --- Exception handling path
1423 
1424   exception_offset = __ pc() - start;
1425 
1426   __ continuation_enter_cleanup();
1427   __ pop(rbp);
1428 
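       // Find the exception handler for our caller: pass the current thread and the
       // return address that is now at the top of the stack.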
1429   __ movptr(c_rarg0, r15_thread);
1430   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1431 
1432   // rax still holds the original exception oop, save it before the call
1433   __ push(rax);
1434 
1435   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
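       // The handler entry point comes back in rax; stash it in rbx so rax can be
       // restored to the exception oop below.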
1436   __ movptr(rbx, rax);
1437 
1438   // Continue at exception handler:
1439   //   rax: exception oop
1440   //   rbx: exception handler
1441   //   rdx: exception pc
1442   __ pop(rax);
1443   __ verify_oop(rax);
1444   __ pop(rdx);
1445   __ jmp(rbx);
1446 }
1447 
1448 static void gen_continuation_yield(MacroAssembler* masm,
1449                                    const VMRegPair* regs,
1450                                    OopMapSet* oop_maps,
1451                                    int& frame_complete,
1452                                    int& stack_slots,
1453                                    int& compiled_entry_offset) {
1454   enum layout {
1455     rbp_off,
1456     rbpH_off,
1457     return_off,
1458     return_off2,
1459     framesize // inclusive of return address
1460   };
1461   stack_slots = framesize / VMRegImpl::slots_per_word;
1462   assert(stack_slots == 2, "recheck layout");
1463 
1464   address start = __ pc();
1465   compiled_entry_offset = __ pc() - start;
1466   __ enter();
1467   address the_pc = __ pc();
1468 
1469   frame_complete = the_pc - start;
1470 
1471   // This nop must be exactly at the PC we push into the frame info.
1472   // We use this nop for fast CodeBlob lookup, so associate the OopMap
1473   // with it right away.
1474   __ post_call_nop();
1475   OopMap* map = new OopMap(framesize, 1);
1476   oop_maps->add_gc_map(frame_complete, map);
1477 
1478   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1479   __ movptr(c_rarg0, r15_thread);
1480   __ movptr(c_rarg1, rsp);
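       // Attempt to freeze the continuation: freeze_entry()(current thread, sp).
       // A zero result means success; non-zero means the continuation is pinned.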
1481   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1482   __ reset_last_Java_frame(true);
1483 
1484   Label L_pinned;
1485 
1486   __ testptr(rax, rax);
1487   __ jcc(Assembler::notZero, L_pinned);
1488 
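       // Freeze succeeded: cut the stack back to the continuation entry frame,
       // clean it up, and return as if the enter frame itself had returned.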
1489   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1490   __ continuation_enter_cleanup();
1491   __ pop(rbp);
1492   __ ret(0);
1493 
1494   __ bind(L_pinned);
1495 
1496   // Pinned, return to caller
1497   __ leave();
1498   __ ret(0);
1499 }
1500 
1501 static void gen_special_dispatch(MacroAssembler* masm,
1502                                  const methodHandle& method,
1503                                  const BasicType* sig_bt,
1504                                  const VMRegPair* regs) {
1505   verify_oop_args(masm, method, sig_bt, regs);
1506   vmIntrinsics::ID iid = method->intrinsic_id();
1507 
1508   // Now write the args into the outgoing interpreter space
1509   bool     has_receiver   = false;
1510   Register receiver_reg   = noreg;
1511   int      member_arg_pos = -1;
1512   Register member_reg     = noreg;
1513   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1514   if (ref_kind != 0) {
1515     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1516     member_reg = rbx;  // known to be free at this point
1517     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1518   } else if (iid == vmIntrinsics::_invokeBasic) {
1519     has_receiver = true;
1520   } else if (iid == vmIntrinsics::_linkToNative) {
1521     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1522     member_reg = rbx;  // known to be free at this point
1523   } else {
1524     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1525   }
1526 
1527   if (member_reg != noreg) {
1528     // Load the member_arg into register, if necessary.
1529     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1530     VMReg r = regs[member_arg_pos].first();
1531     if (r->is_stack()) {
1532       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1533     } else {
1534       // no data motion is needed
1535       member_reg = r->as_Register();
1536     }
1537   }
1538 
1539   if (has_receiver) {
1540     // Make sure the receiver is loaded into a register.
1541     assert(method->size_of_parameters() > 0, "oob");
1542     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1543     VMReg r = regs[0].first();
1544     assert(r->is_valid(), "bad receiver arg");
1545     if (r->is_stack()) {
1546       // Porting note:  This assumes that compiled calling conventions always
1547       // pass the receiver oop in a register.  If this is not true on some
1548       // platform, pick a temp and load the receiver from stack.
1549       fatal("receiver always in a register");
1550       receiver_reg = j_rarg0;  // known to be free at this point
1551       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1552     } else {
1553       // no data motion is needed
1554       receiver_reg = r->as_Register();
1555     }
1556   }
1557 
1558   // Figure out which address we are really jumping to:
1559   MethodHandles::generate_method_handle_dispatch(masm, iid,
1560                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1561 }
1562 
1563 // ---------------------------------------------------------------------------
1564 // Generate a native wrapper for a given method.  The method takes arguments
1565 // in the Java compiled code convention, marshals them to the native
1566 // convention (handlizes oops, etc), transitions to native, makes the call,
1567 // returns to java state (possibly blocking), unhandlizes any result and
1568 // returns.
1569 //
1570 // Critical native functions are a shorthand for the use of
1571 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1572 // functions.  The wrapper is expected to unpack the arguments before
1573 // passing them to the callee. Critical native functions leave the state _in_Java,
1574 // since they cannot stop for GC.
1575 // Some other parts of JNI setup are skipped as well, like the tear down of the
1576 // JNI handle block and the check for pending exceptions, since it is impossible
1577 // for them to be thrown.
1578 //
1579 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1580                                                 const methodHandle& method,
1581                                                 int compile_id,
1582                                                 BasicType* in_sig_bt,
1583                                                 VMRegPair* in_regs,
1584                                                 BasicType ret_type) {
1585   if (method->is_continuation_native_intrinsic()) {
1586     int exception_offset = -1;
1587     OopMapSet* oop_maps = new OopMapSet();
1588     int frame_complete = -1;
1589     int stack_slots = -1;
1590     int interpreted_entry_offset = -1;
1591     int vep_offset = -1;
1592     if (method->is_continuation_enter_intrinsic()) {
1593       gen_continuation_enter(masm,
1594                              in_regs,
1595                              exception_offset,
1596                              oop_maps,
1597                              frame_complete,
1598                              stack_slots,
1599                              interpreted_entry_offset,
1600                              vep_offset);
1601     } else if (method->is_continuation_yield_intrinsic()) {
1602       gen_continuation_yield(masm,
1603                              in_regs,
1604                              oop_maps,
1605                              frame_complete,
1606                              stack_slots,
1607                              vep_offset);
1608     } else {
1609       guarantee(false, "Unknown Continuation native intrinsic");
1610     }
1611 
1612 #ifdef ASSERT
1613     if (method->is_continuation_enter_intrinsic()) {
1614       assert(interpreted_entry_offset != -1, "Must be set");
1615       assert(exception_offset != -1,         "Must be set");
1616     } else {
1617       assert(interpreted_entry_offset == -1, "Must be unset");
1618       assert(exception_offset == -1,         "Must be unset");
1619     }
1620     assert(frame_complete != -1,    "Must be set");
1621     assert(stack_slots != -1,       "Must be set");
1622     assert(vep_offset != -1,        "Must be set");
1623 #endif
1624 
1625     __ flush();
1626     nmethod* nm = nmethod::new_native_nmethod(method,
1627                                               compile_id,
1628                                               masm->code(),
1629                                               vep_offset,
1630                                               frame_complete,
1631                                               stack_slots,
1632                                               in_ByteSize(-1),
1633                                               in_ByteSize(-1),
1634                                               oop_maps,
1635                                               exception_offset);
1636     if (method->is_continuation_enter_intrinsic()) {
1637       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
1638     } else if (method->is_continuation_yield_intrinsic()) {
1639       _cont_doYield_stub = nm;
1640     }
1641     return nm;
1642   }
1643 
1644   if (method->is_method_handle_intrinsic()) {
1645     vmIntrinsics::ID iid = method->intrinsic_id();
1646     intptr_t start = (intptr_t)__ pc();
1647     int vep_offset = ((intptr_t)__ pc()) - start;
1648     gen_special_dispatch(masm,
1649                          method,
1650                          in_sig_bt,
1651                          in_regs);
1652     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1653     __ flush();
1654     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1655     return nmethod::new_native_nmethod(method,
1656                                        compile_id,
1657                                        masm->code(),
1658                                        vep_offset,
1659                                        frame_complete,
1660                                        stack_slots / VMRegImpl::slots_per_word,
1661                                        in_ByteSize(-1),
1662                                        in_ByteSize(-1),
1663                                        (OopMapSet*)NULL);
1664   }
1665   address native_func = method->native_function();
1666   assert(native_func != NULL, "must have function");
1667 
1668   // An OopMap for lock (and class if static)
1669   OopMapSet *oop_maps = new OopMapSet();
1670   intptr_t start = (intptr_t)__ pc();
1671 
1672   // We have received a description of where all the java args are located
1673   // on entry to the wrapper. We need to convert these args to where
1674   // the jni function will expect them. To figure out where they go
1675   // we convert the java signature to a C signature by inserting
1676   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1677 
1678   const int total_in_args = method->size_of_parameters();
1679   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
1680 
1681   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1682   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1683   BasicType* in_elem_bt = NULL;
1684 
1685   int argc = 0;
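       // Hidden C arguments: the JNIEnv* always comes first, and static methods also
       // receive the class mirror as the second argument.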
1686   out_sig_bt[argc++] = T_ADDRESS;
1687   if (method->is_static()) {
1688     out_sig_bt[argc++] = T_OBJECT;
1689   }
1690 
1691   for (int i = 0; i < total_in_args ; i++ ) {
1692     out_sig_bt[argc++] = in_sig_bt[i];
1693   }
1694 
1695   // Now figure out where the args must be stored and how much stack space
1696   // they require.
1697   int out_arg_slots;
1698   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1699 
1700   // Compute framesize for the wrapper.  We need to handlize all oops in
1701   // incoming registers
1702 
1703   // Calculate the total number of stack slots we will need.
1704 
1705   // First count the abi requirement plus all of the outgoing args
1706   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1707 
1708   // Now the space for the inbound oop handle area
1709   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1710 
1711   int oop_handle_offset = stack_slots;
1712   stack_slots += total_save_slots;
1713 
1714   // Now any space we need for handlizing a klass if static method
1715 
1716   int klass_slot_offset = 0;
1717   int klass_offset = -1;
1718   int lock_slot_offset = 0;
1719   bool is_static = false;
1720 
1721   if (method->is_static()) {
1722     klass_slot_offset = stack_slots;
1723     stack_slots += VMRegImpl::slots_per_word;
1724     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1725     is_static = true;
1726   }
1727 
1728   // Plus a lock if needed
1729 
1730   if (method->is_synchronized()) {
1731     lock_slot_offset = stack_slots;
1732     stack_slots += VMRegImpl::slots_per_word;
1733   }
1734 
1735   // Now a place (+2) to save return values or temp during shuffling
1736   // + 4 for return address (which we own) and saved rbp
1737   stack_slots += 6;
1738 
1739   // OK, the space we have allocated will look like:
1740   //
1741   //
1742   // FP-> |                     |
1743   //      |---------------------|
1744   //      | 2 slots for moves   |
1745   //      |---------------------|
1746   //      | lock box (if sync)  |
1747   //      |---------------------| <- lock_slot_offset
1748   //      | klass (if static)   |
1749   //      |---------------------| <- klass_slot_offset
1750   //      | oopHandle area      |
1751   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1752   //      | outbound memory     |
1753   //      | based arguments     |
1754   //      |                     |
1755   //      |---------------------|
1756   //      |                     |
1757   // SP-> | out_preserved_slots |
1758   //
1759   //
1760 
1761 
1762   // Now compute the actual number of stack words we need, rounding to keep
1763   // the stack properly aligned.
1764   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
1765 
1766   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
1767 
1768   // First thing make an ic check to see if we should even be here
1769 
1770   // We are free to use all registers as temps without saving them and
1771   // restoring them except rbp. rbp is the only callee save register
1772   // as far as the interpreter and the compiler(s) are concerned.
1773 
1774 
1775   const Register ic_reg = rax;
1776   const Register receiver = j_rarg0;
1777 
1778   Label hit;
1779   Label exception_pending;
1780 
1781   assert_different_registers(ic_reg, receiver, rscratch1, rscratch2);
1782   __ verify_oop(receiver);
1783   __ load_klass(rscratch1, receiver, rscratch2);
1784   __ cmpq(ic_reg, rscratch1);
1785   __ jcc(Assembler::equal, hit);
1786 
1787   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1788 
1789   // Verified entry point must be aligned
1790   __ align(8);
1791 
1792   __ bind(hit);
1793 
1794   int vep_offset = ((intptr_t)__ pc()) - start;
1795 
1796   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
1797     Label L_skip_barrier;
1798     Register klass = r10;
1799     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
1800     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1801 
1802     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1803 
1804     __ bind(L_skip_barrier);
1805   }
1806 
1807 #ifdef COMPILER1
1808   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
1809   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
1810     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
1811   }
1812 #endif // COMPILER1
1813 
1814   // The instruction at the verified entry point must be 5 bytes or longer
1815   // because it can be patched on the fly by make_not_entrant. The stack bang
1816   // instruction fits that requirement.
1817 
1818   // Generate stack overflow check
1819   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
1820 
1821   // Generate a new frame for the wrapper.
1822   __ enter();
1823   // -2 because return address is already present and so is saved rbp
1824   __ subptr(rsp, stack_size - 2*wordSize);
1825 
1826   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1827   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
1828   bs->nmethod_entry_barrier(masm, NULL /* slow_path */, NULL /* continuation */);
1829 
1830   // Frame is now completed as far as size and linkage.
1831   int frame_complete = ((intptr_t)__ pc()) - start;
1832 
1833   if (UseRTMLocking) {
1834     // Abort RTM transaction before calling JNI
1835     // because critical section will be large and will be
1836     // aborted anyway. Also nmethod could be deoptimized.
1837     __ xabort(0);
1838   }
1839 
1840 #ifdef ASSERT
1841   __ check_stack_alignment(rsp, "improperly aligned stack");
1842 #endif /* ASSERT */
1843 
1844 
1845   // We use r14 as the oop handle for the receiver/klass
1846   // It is callee save so it survives the call to native
1847 
1848   const Register oop_handle_reg = r14;
1849 
1850   //
1851   // We immediately shuffle the arguments so that any vm call we have to
1852   // make from here on out (sync slow path, jvmti, etc.) we will have
1853   // captured the oops from our caller and have a valid oopMap for
1854   // them.
1855 
1856   // -----------------
1857   // The Grand Shuffle
1858 
1859   // The Java calling convention is either equal (linux) or denser (win64) than the
1860   // C calling convention. However, because of the jni_env argument, the C calling
1861   // convention always has at least one more (and two for static) arguments than Java.
1862   // Therefore if we move the args from java -> c backwards then we will never have
1863   // a register->register conflict and we don't have to build a dependency graph
1864   // and figure out how to break any cycles.
1865   //
1866 
1867   // Record esp-based slot for receiver on stack for non-static methods
1868   int receiver_offset = -1;
1869 
1870   // This is a trick. We double the stack slots so we can claim
1871   // the oops in the caller's frame. Since we are sure to have
1872   // more args than the caller, doubling is enough to make
1873   // sure we can capture all the incoming oop args from the
1874   // caller.
1875   //
1876   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
1877 
1878   // Mark location of rbp (someday)
1879   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
1880 
1881   // Use eax, ebx as temporaries during any memory-memory moves we have to do
1882   // All inbound args are referenced based on rbp and all outbound args via rsp.
1883 
1884 
1885 #ifdef ASSERT
1886   bool reg_destroyed[Register::number_of_registers];
1887   bool freg_destroyed[XMMRegister::number_of_registers];
1888   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
1889     reg_destroyed[r] = false;
1890   }
1891   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
1892     freg_destroyed[f] = false;
1893   }
1894 
1895 #endif /* ASSERT */
1896 
1897   // For JNI natives the incoming and outgoing registers are offset upwards.
1898   GrowableArray<int> arg_order(2 * total_in_args);
1899 
1900   VMRegPair tmp_vmreg;
1901   tmp_vmreg.set2(rbx->as_VMReg());
1902 
1903   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1904     arg_order.push(i);
1905     arg_order.push(c_arg);
1906   }
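       // arg_order now holds (java_index, c_index) pairs, ordered from the last argument
       // to the first; see the comment above for why moving backwards avoids conflicts.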
1907 
1908   int temploc = -1;
1909   for (int ai = 0; ai < arg_order.length(); ai += 2) {
1910     int i = arg_order.at(ai);
1911     int c_arg = arg_order.at(ai + 1);
1912     __ block_comment(err_msg("move %d -> %d", i, c_arg));
1913 #ifdef ASSERT
1914     if (in_regs[i].first()->is_Register()) {
1915       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
1916     } else if (in_regs[i].first()->is_XMMRegister()) {
1917       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
1918     }
1919     if (out_regs[c_arg].first()->is_Register()) {
1920       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
1921     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
1922       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
1923     }
1924 #endif /* ASSERT */
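         // Move the Java argument into its C location, handlizing oops and arrays as we go.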
1925     switch (in_sig_bt[i]) {
1926       case T_ARRAY:
1927       case T_OBJECT:
1928         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
1929                     ((i == 0) && (!is_static)),
1930                     &receiver_offset);
1931         break;
1932       case T_VOID:
1933         break;
1934 
1935       case T_FLOAT:
1936         __ float_move(in_regs[i], out_regs[c_arg]);
1937           break;
1938 
1939       case T_DOUBLE:
1940         assert( i + 1 < total_in_args &&
1941                 in_sig_bt[i + 1] == T_VOID &&
1942                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
1943         __ double_move(in_regs[i], out_regs[c_arg]);
1944         break;
1945 
1946       case T_LONG :
1947         __ long_move(in_regs[i], out_regs[c_arg]);
1948         break;
1949 
1950       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
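             // fall through: the assert is a no-op in product builds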
1951 
1952       default:
1953         __ move32_64(in_regs[i], out_regs[c_arg]);
1954     }
1955   }
1956 
1957   int c_arg;
1958 
1959   // Pre-load a static method's oop into r14.  Used both by locking code and
1960   // the normal JNI call code.
1961   // Point c_arg at the first arg that is already loaded in case we
1962   // need to spill before we call out.
1963   c_arg = total_c_args - total_in_args;
1964 
1965   if (method->is_static()) {
1966 
1967     //  load oop into a register
1968     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
1969 
1970     // Now handlize the static class mirror; it's known to be non-null.
1971     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
1972     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
1973 
1974     // Now get the handle
1975     __ lea(oop_handle_reg, Address(rsp, klass_offset));
1976     // store the klass handle as second argument
1977     __ movptr(c_rarg1, oop_handle_reg);
1978     // and protect the arg if we must spill
1979     c_arg--;
1980   }
1981 
1982   // Change state to native (we save the return address in the thread, since it might not
1983   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
1984   // points into the right code segment. It does not have to be the correct return pc.
1985   // We use the same pc/oopMap repeatedly when we call out
1986 
1987   intptr_t the_pc = (intptr_t) __ pc();
1988   oop_maps->add_gc_map(the_pc - start, map);
1989 
1990   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
1991 
1992 
1993   // We have all of the arguments set up at this point. We must not touch any of the register
1994   // argument registers from here on (if we were to save/restore them, there would be no oop map covering them).
1995 
1996   {
1997     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
1998     // protect the args we've loaded
1999     save_args(masm, total_c_args, c_arg, out_regs);
2000     __ mov_metadata(c_rarg1, method());
2001     __ call_VM_leaf(
2002       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2003       r15_thread, c_rarg1);
2004     restore_args(masm, total_c_args, c_arg, out_regs);
2005   }
2006 
2007   // RedefineClasses() tracing support for obsolete method entry
2008   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2009     // protect the args we've loaded
2010     save_args(masm, total_c_args, c_arg, out_regs);
2011     __ mov_metadata(c_rarg1, method());
2012     __ call_VM_leaf(
2013       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2014       r15_thread, c_rarg1);
2015     restore_args(masm, total_c_args, c_arg, out_regs);
2016   }
2017 
2018   // Lock a synchronized method
2019 
2020   // Register definitions used by locking and unlocking
2021 
2022   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2023   const Register obj_reg  = rbx;  // Will contain the oop
2024   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2025   const Register old_hdr  = r13;  // value of old header at unlock time
2026 
2027   Label slow_path_lock;
2028   Label lock_done;
2029 
2030   if (method->is_synchronized()) {
2031     Label count_mon;
2032 
2033     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2034 
2035     // Get the handle (the 2nd argument)
2036     __ mov(oop_handle_reg, c_rarg1);
2037 
2038     // Get address of the box
2039 
2040     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2041 
2042     // Load the oop from the handle
2043     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2044 
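         // Fast path: stack-lock the object by writing a displaced mark word into the
         // BasicLock and CAS-ing the lock address into the object's mark word.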
2045     if (!UseHeavyMonitors) {
2046 
2047       // Load immediate 1 into swap_reg %rax
2048       __ movl(swap_reg, 1);
2049 
2050       // Load (object->mark() | 1) into swap_reg %rax
2051       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2052 
2053       // Save (object->mark() | 1) into BasicLock's displaced header
2054       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2055 
2056       // src -> dest iff dest == rax else rax <- dest
2057       __ lock();
2058       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2059       __ jcc(Assembler::equal, count_mon);
2060 
2061       // Hmm should this move to the slow path code area???
2062 
2063       // Test if the oopMark is an obvious stack pointer, i.e.,
2064       //  1) (mark & 3) == 0, and
2065       //  2) rsp <= mark < rsp + os::pagesize()
2066       // These 3 tests can be done by evaluating the following
2067       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2068       // assuming both stack pointer and pagesize have their
2069       // least significant 2 bits clear.
2070       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2071 
2072       __ subptr(swap_reg, rsp);
2073       __ andptr(swap_reg, 3 - os::vm_page_size());
2074 
2075       // Save the test result, for recursive case, the result is zero
2076       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2077       __ jcc(Assembler::notEqual, slow_path_lock);
2078     } else {
2079       __ jmp(slow_path_lock);
2080     }
2081     __ bind(count_mon);
2082     __ inc_held_monitor_count();
2083 
2084     // Slow path will re-enter here
2085     __ bind(lock_done);
2086   }
2087 
2088   // Finally just about ready to make the JNI call
2089 
2090   // get JNIEnv* which is first argument to native
2091   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2092 
2093   // Now set thread in native
2094   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2095 
2096   __ call(RuntimeAddress(native_func));
2097 
2098   // Verify or restore cpu control state after JNI call
2099   __ restore_cpu_control_state_after_jni(rscratch1);
2100 
2101   // Unpack native results.
2102   switch (ret_type) {
2103   case T_BOOLEAN: __ c2bool(rax);            break;
2104   case T_CHAR   : __ movzwl(rax, rax);      break;
2105   case T_BYTE   : __ sign_extend_byte (rax); break;
2106   case T_SHORT  : __ sign_extend_short(rax); break;
2107   case T_INT    : /* nothing to do */        break;
2108   case T_DOUBLE :
2109   case T_FLOAT  :
2110     // Result is in xmm0 we'll save as needed
2111     break;
2112   case T_ARRAY:                 // Really a handle
2113   case T_OBJECT:                // Really a handle
2114       break; // can't de-handlize until after safepoint check
2115   case T_VOID: break;
2116   case T_LONG: break;
2117   default       : ShouldNotReachHere();
2118   }
2119 
2120   Label after_transition;
2121 
2122   // Switch thread to "native transition" state before reading the synchronization state.
2123   // This additional state is necessary because reading and testing the synchronization
2124   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2125   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2126   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2127   //     Thread A is resumed to finish this native method, but doesn't block here since it
2128   //     didn't see any synchronization in progress, and escapes.
2129   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2130 
2131   // Force this write out before the read below
2132   if (!UseSystemMemoryBarrier) {
2133     __ membar(Assembler::Membar_mask_bits(
2134               Assembler::LoadLoad | Assembler::LoadStore |
2135               Assembler::StoreLoad | Assembler::StoreStore));
2136   }
2137 
2138   // check for safepoint operation in progress and/or pending suspend requests
2139   {
2140     Label Continue;
2141     Label slow_path;
2142 
2143     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2144 
2145     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2146     __ jcc(Assembler::equal, Continue);
2147     __ bind(slow_path);
2148 
2149     // Don't use call_VM as it will see a possible pending exception and forward it
2150     // and never return here, preventing us from clearing _last_native_pc down below.
2151     // We also can't use call_VM_leaf, as it will check to see if rsi & rdi are
2152     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2153     // by hand.
2154     //
2155     __ vzeroupper();
2156     save_native_result(masm, ret_type, stack_slots);
2157     __ mov(c_rarg0, r15_thread);
2158     __ mov(r12, rsp); // remember sp
2159     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2160     __ andptr(rsp, -16); // align stack as required by ABI
2161     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2162     __ mov(rsp, r12); // restore sp
2163     __ reinit_heapbase();
2164     // Restore any method result value
2165     restore_native_result(masm, ret_type, stack_slots);
2166     __ bind(Continue);
2167   }
2168 
2169   // change thread state
2170   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2171   __ bind(after_transition);
2172 
2173   Label reguard;
2174   Label reguard_done;
2175   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2176   __ jcc(Assembler::equal, reguard);
2177   __ bind(reguard_done);
2178 
2179   // The native result, if any, is still live
2180 
2181   // Unlock
2182   Label slow_path_unlock;
2183   Label unlock_done;
2184   if (method->is_synchronized()) {
2185 
2186     Label fast_done;
2187 
2188     // Get locked oop from the handle we passed to jni
2189     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2190 
2191     if (!UseHeavyMonitors) {
2192       Label not_recur;
2193       // Simple recursive lock?
2194       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2195       __ jcc(Assembler::notEqual, not_recur);
2196       __ dec_held_monitor_count();
2197       __ jmpb(fast_done);
2198       __ bind(not_recur);
2199     }
2200 
2201     // Must save rax if it is live now because cmpxchg must use it
2202     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2203       save_native_result(masm, ret_type, stack_slots);
2204     }
2205 
2206     if (!UseHeavyMonitors) {
2207       // get address of the stack lock
2208       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2209       //  get old displaced header
2210       __ movptr(old_hdr, Address(rax, 0));
2211 
2212       // Atomic swap old header if oop still contains the stack lock
2213       __ lock();
2214       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2215       __ jcc(Assembler::notEqual, slow_path_unlock);
2216       __ dec_held_monitor_count();
2217     } else {
2218       __ jmp(slow_path_unlock);
2219     }
2220 
2221     // slow path re-enters here
2222     __ bind(unlock_done);
2223     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2224       restore_native_result(masm, ret_type, stack_slots);
2225     }
2226 
2227     __ bind(fast_done);
2228   }
2229   {
2230     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2231     save_native_result(masm, ret_type, stack_slots);
2232     __ mov_metadata(c_rarg1, method());
2233     __ call_VM_leaf(
2234          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2235          r15_thread, c_rarg1);
2236     restore_native_result(masm, ret_type, stack_slots);
2237   }
2238 
2239   __ reset_last_Java_frame(false);
2240 
2241   // Unbox oop result, e.g. JNIHandles::resolve value.
2242   if (is_reference_type(ret_type)) {
2243     __ resolve_jobject(rax /* value */,
2244                        r15_thread /* thread */,
2245                        rcx /* tmp */);
2246   }
2247 
2248   if (CheckJNICalls) {
2249     // clear_pending_jni_exception_check
2250     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2251   }
2252 
2253   // reset handle block
2254   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2255   __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), NULL_WORD);
2256 
2257   // pop our frame
2258 
2259   __ leave();
2260 
2261   // Any exception pending?
2262   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2263   __ jcc(Assembler::notEqual, exception_pending);
2264 
2265   // Return
2266 
2267   __ ret(0);
2268 
2269   // Unexpected paths are out of line and go here
2270 
2271   // forward the exception
2272   __ bind(exception_pending);
2273 
2274   // and forward the exception
2275   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2276 
2277   // Slow path locking & unlocking
2278   if (method->is_synchronized()) {
2279 
2280     // BEGIN Slow path lock
2281     __ bind(slow_path_lock);
2282 
2283     // We have last_Java_frame set up. No exceptions, so do a vanilla call, not call_VM
2284     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2285 
2286     // protect the args we've loaded
2287     save_args(masm, total_c_args, c_arg, out_regs);
2288 
2289     __ mov(c_rarg0, obj_reg);
2290     __ mov(c_rarg1, lock_reg);
2291     __ mov(c_rarg2, r15_thread);
2292 
2293     // Not a leaf but we have last_Java_frame setup as we want
2294     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2295     restore_args(masm, total_c_args, c_arg, out_regs);
2296 
2297 #ifdef ASSERT
2298     { Label L;
2299       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2300       __ jcc(Assembler::equal, L);
2301       __ stop("no pending exception allowed on exit from monitorenter");
2302       __ bind(L);
2303     }
2304 #endif
2305     __ jmp(lock_done);
2306 
2307     // END Slow path lock
2308 
2309     // BEGIN Slow path unlock
2310     __ bind(slow_path_unlock);
2311 
2312     // If we haven't already saved the native result we must save it now as xmm registers
2313     // are still exposed.
2314     __ vzeroupper();
2315     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2316       save_native_result(masm, ret_type, stack_slots);
2317     }
2318 
2319     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2320 
2321     __ mov(c_rarg0, obj_reg);
2322     __ mov(c_rarg2, r15_thread);
2323     __ mov(r12, rsp); // remember sp
2324     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2325     __ andptr(rsp, -16); // align stack as required by ABI
2326 
2327     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2328     // NOTE that obj_reg == rbx currently
2329     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2330     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2331 
2332     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2333     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2334     __ mov(rsp, r12); // restore sp
2335     __ reinit_heapbase();
2336 #ifdef ASSERT
2337     {
2338       Label L;
2339       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2340       __ jcc(Assembler::equal, L);
2341       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2342       __ bind(L);
2343     }
2344 #endif /* ASSERT */
2345 
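         // Restore the pending exception we stashed in rbx across the unlock call.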
2346     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2347 
2348     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2349       restore_native_result(masm, ret_type, stack_slots);
2350     }
2351     __ jmp(unlock_done);
2352 
2353     // END Slow path unlock
2354 
2355   } // synchronized
2356 
2357   // SLOW PATH Reguard the stack if needed
2358 
2359   __ bind(reguard);
2360   __ vzeroupper();
2361   save_native_result(masm, ret_type, stack_slots);
2362   __ mov(r12, rsp); // remember sp
2363   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2364   __ andptr(rsp, -16); // align stack as required by ABI
2365   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2366   __ mov(rsp, r12); // restore sp
2367   __ reinit_heapbase();
2368   restore_native_result(masm, ret_type, stack_slots);
2369   // and continue
2370   __ jmp(reguard_done);
2371 
2372 
2373 
2374   __ flush();
2375 
2376   nmethod *nm = nmethod::new_native_nmethod(method,
2377                                             compile_id,
2378                                             masm->code(),
2379                                             vep_offset,
2380                                             frame_complete,
2381                                             stack_slots / VMRegImpl::slots_per_word,
2382                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2383                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2384                                             oop_maps);
2385 
2386   return nm;
2387 }
2388 
2389 // This function returns the adjustment size (in number of words) applied to a c2i adapter
2390 // activation for use during deoptimization.
2391 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2392   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2393 }
2394 
2395 
2396 uint SharedRuntime::out_preserve_stack_slots() {
2397   return 0;
2398 }
2399 
2400 
2401 // Number of stack slots between incoming argument block and the start of
2402 // a new frame.  The PROLOG must add this many slots to the stack.  The
2403 // EPILOG must remove this many slots.  amd64 needs two slots for the
2404 // return address and two for the saved rbp.
2405 uint SharedRuntime::in_preserve_stack_slots() {
2406   return 4 + 2 * VerifyStackAtCalls;
2407 }
2408 
2409 //------------------------------generate_deopt_blob----------------------------
2410 void SharedRuntime::generate_deopt_blob() {
2411   // Allocate space for the code
2412   ResourceMark rm;
2413   // Setup code generation tools
2414   int pad = 0;
2415   if (UseAVX > 2) {
2416     pad += 1024;
2417   }
2418 #if INCLUDE_JVMCI
2419   if (EnableJVMCI) {
2420     pad += 512; // Increase the buffer size when compiling for JVMCI
2421   }
2422 #endif
2423   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2424   MacroAssembler* masm = new MacroAssembler(&buffer);
2425   int frame_size_in_words;
2426   OopMap* map = NULL;
2427   OopMapSet *oop_maps = new OopMapSet();
2428 
2429   // -------------
2430   // This code enters when returning to a de-optimized nmethod.  A return
2431   // address has been pushed on the stack, and return values are in
2432   // registers.
2433   // If we are doing a normal deopt then we were called from the patched
2434   // nmethod from the point we returned to the nmethod. So the return
2435   // address on the stack is wrong by NativeCall::instruction_size
2436   // We will adjust the value so it looks like we have the original return
2437   // address on the stack (like when we eagerly deoptimized).
2438   // In the case of an exception pending when deoptimizing, we enter
2439   // with a return address on the stack that points after the call we patched
2440   // into the exception handler. We have the following register state from,
2441   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2442   //    rax: exception oop
2443   //    rbx: exception handler
2444   //    rdx: throwing pc
2445   // So in this case we simply jam rdx into the useless return address and
2446   // the stack looks just like we want.
2447   //
2448   // At this point we need to de-opt.  We save the argument return
2449   // registers.  We call the first C routine, fetch_unroll_info().  This
2450   // routine captures the return values and returns a structure which
2451   // describes the current frame size and the sizes of all replacement frames.
2452   // The current frame is compiled code and may contain many inlined
2453   // functions, each with their own JVM state.  We pop the current frame, then
2454   // push all the new frames.  Then we call the C routine unpack_frames() to
2455   // populate these frames.  Finally unpack_frames() returns us the new target
2456   // address.  Notice that callee-save registers are BLOWN here; they have
2457   // already been captured in the vframeArray at the time the return PC was
2458   // patched.
2459   address start = __ pc();
2460   Label cont;
2461 
2462   // Prolog for non exception case!
2463 
2464   // Save everything in sight.
2465   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2466 
2467   // Normal deoptimization.  Save exec mode for unpack_frames.
2468   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2469   __ jmp(cont);
2470 
2471   int reexecute_offset = __ pc() - start;
2472 #if INCLUDE_JVMCI && !defined(COMPILER1)
2473   if (EnableJVMCI && UseJVMCICompiler) {
2474     // JVMCI does not use this kind of deoptimization
2475     __ should_not_reach_here();
2476   }
2477 #endif
2478 
2479   // Reexecute case
2480   // The return address is the pc that describes what bci to re-execute at
2481 
2482   // No need to update map as each call to save_live_registers will produce identical oopmap
2483   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2484 
2485   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2486   __ jmp(cont);
2487 
2488 #if INCLUDE_JVMCI
2489   Label after_fetch_unroll_info_call;
2490   int implicit_exception_uncommon_trap_offset = 0;
2491   int uncommon_trap_offset = 0;
2492 
2493   if (EnableJVMCI) {
2494     implicit_exception_uncommon_trap_offset = __ pc() - start;
2495 
2496     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2497     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2498 
2499     uncommon_trap_offset = __ pc() - start;
2500 
2501     // Save everything in sight.
2502     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2503     // fetch_unroll_info needs to call last_java_frame()
2504     __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
2505 
2506     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2507     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2508 
2509     __ movl(r14, Deoptimization::Unpack_reexecute);
2510     __ mov(c_rarg0, r15_thread);
2511     __ movl(c_rarg2, r14); // exec mode
2512     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2513     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2514 
2515     __ reset_last_Java_frame(false);
2516 
2517     __ jmp(after_fetch_unroll_info_call);
2518   } // EnableJVMCI
2519 #endif // INCLUDE_JVMCI
2520 
2521   int exception_offset = __ pc() - start;
2522 
2523   // Prolog for exception case
2524 
2525   // all registers are dead at this entry point, except for rax, and
2526   // rdx which contain the exception oop and exception pc
2527   // respectively.  Set them in TLS and fall thru to the
2528   // unpack_with_exception_in_tls entry point.
2529 
2530   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2531   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2532 
2533   int exception_in_tls_offset = __ pc() - start;
2534 
2535   // new implementation because exception oop is now passed in JavaThread
2536 
2537   // Prolog for exception case
2538   // All registers must be preserved because they might be used by LinearScan
2539   // Exception oop and throwing PC are passed in JavaThread
2540   // tos: stack at point of call to method that threw the exception (i.e. only
2541   // args are on the stack, no return address)
2542 
2543   // make room on stack for the return address
2544   // It will be patched later with the throwing pc. The correct value is not
2545   // available now because loading it from memory would destroy registers.
2546   __ push(0);
2547 
2548   // Save everything in sight.
2549   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2550 
2551   // Now it is safe to overwrite any register
2552 
2553   // Deopt during an exception.  Save exec mode for unpack_frames.
2554   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2555 
2556   // load throwing pc from JavaThread and patch it as the return address
2557   // of the current frame. Then clear the field in JavaThread
2558 
2559   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2560   __ movptr(Address(rbp, wordSize), rdx);
2561   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2562 
2563 #ifdef ASSERT
2564   // verify that there is really an exception oop in JavaThread
2565   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2566   __ verify_oop(rax);
2567 
2568   // verify that there is no pending exception
2569   Label no_pending_exception;
2570   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2571   __ testptr(rax, rax);
2572   __ jcc(Assembler::zero, no_pending_exception);
2573   __ stop("must not have pending exception here");
2574   __ bind(no_pending_exception);
2575 #endif
2576 
2577   __ bind(cont);
2578 
2579   // Call C code.  Need thread and this frame, but NOT official VM entry
2580   // crud.  We cannot block on this call, no GC can happen.
2581   //
2582   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2583 
2584   // fetch_unroll_info needs to call last_java_frame().
2585 
2586   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
2587 #ifdef ASSERT
2588   { Label L;
2589     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2590     __ jcc(Assembler::equal, L);
2591     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2592     __ bind(L);
2593   }
2594 #endif // ASSERT
2595   __ mov(c_rarg0, r15_thread);
2596   __ movl(c_rarg1, r14); // exec_mode
2597   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2598 
2599   // Need to have an oopmap that tells fetch_unroll_info where to
2600   // find any register it might need.
2601   oop_maps->add_gc_map(__ pc() - start, map);
2602 
2603   __ reset_last_Java_frame(false);
2604 
2605 #if INCLUDE_JVMCI
2606   if (EnableJVMCI) {
2607     __ bind(after_fetch_unroll_info_call);
2608   }
2609 #endif
2610 
2611   // Load UnrollBlock* into rdi
2612   __ mov(rdi, rax);
2613 
2614   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2615   Label noException;
2616   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2617   __ jcc(Assembler::notEqual, noException);
2618   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2619   // QQQ this is useless it was NULL above
2620   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2621   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
2622   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2623 
2624   __ verify_oop(rax);
2625 
2626   // Overwrite the result registers with the exception results.
2627   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2628   // I think this is useless
2629   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2630 
2631   __ bind(noException);
2632 
2633   // Only register save data is on the stack.
2634   // Now restore the result registers.  Everything else is either dead
2635   // or captured in the vframeArray.
2636   RegisterSaver::restore_result_registers(masm);
2637 
2638   // All of the register save area has been popped off the stack. Only the
2639   // return address remains.
2640 
2641   // Pop all the frames we must move/replace.
2642   //
2643   // Frame picture (youngest to oldest)
2644   // 1: self-frame (no frame link)
2645   // 2: deopting frame  (no frame link)
2646   // 3: caller of deopting frame (could be compiled/interpreted).
2647   //
2648   // Note: by leaving the return address of self-frame on the stack
2649   // and using the size of frame 2 to adjust the stack
2650   // when we are done the return to frame 3 will still be on the stack.
2651 
2652   // Pop deoptimized frame
2653   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2654   __ addptr(rsp, rcx);
2655 
2656   // rsp should be pointing at the return address to the caller (3)
2657 
2658   // Pick up the initial fp we should save
2659   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2660   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2661 
2662 #ifdef ASSERT
2663   // Compilers generate code that bangs the stack by as much as the
2664   // interpreter would need. So this stack banging should never
2665   // trigger a fault. Verify that it does not on non-product builds.
2666   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2667   __ bang_stack_size(rbx, rcx);
2668 #endif
2669 
2670   // Load address of array of frame pcs into rcx
2671   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2672 
2673   // Trash the old pc
2674   __ addptr(rsp, wordSize);
2675 
2676   // Load address of array of frame sizes into rsi
2677   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2678 
2679   // Load counter into rdx
2680   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2681 
2682   // Now adjust the caller's stack to make up for the extra locals,
2683   // but record the original sp so that we can save it in the skeletal interpreter
2684   // frame; the stack walking of interpreter_sender will then get the unextended sp
2685   // value and not the "real" sp value.
2686 
2687   const Register sender_sp = r8;
2688 
2689   __ mov(sender_sp, rsp);
2690   __ movl(rbx, Address(rdi,
2691                        Deoptimization::UnrollBlock::
2692                        caller_adjustment_offset_in_bytes()));
2693   __ subptr(rsp, rbx);
2694 
2695   // Push interpreter frames in a loop
2696   Label loop;
2697   __ bind(loop);
2698   __ movptr(rbx, Address(rsi, 0));      // Load frame size
2699   __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
2700   __ pushptr(Address(rcx, 0));          // Save return address
2701   __ enter();                           // Save old & set new rbp
2702   __ subptr(rsp, rbx);                  // Prolog
2703   // This value is corrected by layout_activation_impl
2704   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2705   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
2706   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
2707   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
2708   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
2709   __ decrementl(rdx);                   // Decrement counter
2710   __ jcc(Assembler::notZero, loop);
2711   __ pushptr(Address(rcx, 0));          // Save final return address
2712 
2713   // Re-push self-frame
2714   __ enter();                           // Save old & set new ebp
2715 
2716   // Allocate a full sized register save area.
2717   // Return address and rbp are in place, so we allocate two fewer words.
2718   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
2719 
2720   // Restore frame locals after moving the frame
2721   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
2722   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2723 
2724   // Call C code.  Need thread but NOT official VM entry
2725   // crud.  We cannot block on this call, no GC can happen.  Call should
2726   // restore return values to their stack-slots with the new SP.
2727   //
2728   // BasicType Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
2729 
2730   // Use rbp because the frames look interpreted now
2731   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2732   // Don't need the precise return PC here, just precise enough to point into this code blob.
2733   address the_pc = __ pc();
2734   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2735 
2736   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
2737   __ mov(c_rarg0, r15_thread);
2738   __ movl(c_rarg1, r14); // second arg: exec_mode
2739   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2740   // Revert SP alignment after call since we're going to do some SP relative addressing below
2741   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
2742 
2743   // Set an oopmap for the call site
2744   // Use the same PC we used for the last java frame
2745   oop_maps->add_gc_map(the_pc - start,
2746                        new OopMap( frame_size_in_words, 0 ));
2747 
2748   // Clear fp AND pc
2749   __ reset_last_Java_frame(true);
2750 
2751   // Collect return values
2752   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
2753   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
2754   // I think this is useless (throwing pc?)
2755   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
2756 
2757   // Pop self-frame.
2758   __ leave();                           // Epilog
2759 
2760   // Jump to interpreter
2761   __ ret(0);
2762 
2763   // Make sure all code is generated
2764   masm->flush();
2765 
2766   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
2767   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
2768 #if INCLUDE_JVMCI
2769   if (EnableJVMCI) {
2770     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
2771     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
2772   }
2773 #endif
2774 }
2775 
2776 #ifdef COMPILER2
2777 //------------------------------generate_uncommon_trap_blob--------------------
2778 void SharedRuntime::generate_uncommon_trap_blob() {
2779   // Allocate space for the code
2780   ResourceMark rm;
2781   // Setup code generation tools
2782   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
2783   MacroAssembler* masm = new MacroAssembler(&buffer);
2784 
2785   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
2786 
2787   address start = __ pc();
2788 
2789   if (UseRTMLocking) {
2790     // Abort RTM transaction before possible nmethod deoptimization.
2791     __ xabort(0);
2792   }
2793 
2794   // Push self-frame.  We get here with a return address on the
2795   // stack, so rsp is 8-byte aligned until we allocate our frame.
2796   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog!
2797 
2798   // No callee saved registers. rbp is assumed implicitly saved
2799   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
2800 
2801   // The compiler left unloaded_class_index in j_rarg0; move it to where the
2802   // runtime expects it.
2803   __ movl(c_rarg1, j_rarg0);
2804 
2805   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
2806 
2807   // Call C code.  Need thread but NOT official VM entry
2808   // crud.  We cannot block on this call, no GC can happen.  Call should
2809   // capture callee-saved registers as well as return values.
2810   // The thread is in r15_thread; it is moved into c_rarg0 below.
2811   //
2812   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index, jint exec_mode);
2813 
2814   __ mov(c_rarg0, r15_thread);
2815   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
2816   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2817 
2818   // Set an oopmap for the call site
2819   OopMapSet* oop_maps = new OopMapSet();
2820   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
2821 
2822   // location of rbp is known implicitly by the frame sender code
2823 
2824   oop_maps->add_gc_map(__ pc() - start, map);
2825 
2826   __ reset_last_Java_frame(false);
2827 
2828   // Load UnrollBlock* into rdi
2829   __ mov(rdi, rax);
2830 
2831 #ifdef ASSERT
2832   { Label L;
2833     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
2834               Deoptimization::Unpack_uncommon_trap);
2835     __ jcc(Assembler::equal, L);
2836     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
2837     __ bind(L);
2838   }
2839 #endif
2840 
2841   // Pop all the frames we must move/replace.
2842   //
2843   // Frame picture (youngest to oldest)
2844   // 1: self-frame (no frame link)
2845   // 2: deopting frame  (no frame link)
2846   // 3: caller of deopting frame (could be compiled/interpreted).
2847 
2848   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
2849   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
2850 
2851   // Pop deoptimized frame (int)
2852   __ movl(rcx, Address(rdi,
2853                        Deoptimization::UnrollBlock::
2854                        size_of_deoptimized_frame_offset_in_bytes()));
2855   __ addptr(rsp, rcx);
2856 
2857   // rsp should be pointing at the return address to the caller (3)
2858 
2859   // Pick up the initial fp we should save
2860   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2861   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2862 
2863 #ifdef ASSERT
2864   // Compilers generate code that bangs the stack by as much as the
2865   // interpreter would need, so this stack banging should never
2866   // trigger a fault. Verify that it does not on non-product builds.
2867   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2868   __ bang_stack_size(rbx, rcx);
2869 #endif
2870 
2871   // Load address of array of frame pcs into rcx (address*)
2872   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2873 
2874   // Trash the return pc
2875   __ addptr(rsp, wordSize);
2876 
2877   // Load address of array of frame sizes into rsi (intptr_t*)
2878   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2879 
2880   // Counter
2881   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
2882 
2883   // Now adjust the caller's stack to make up for the extra locals, but
2884   // record the original sp so that we can save it in the skeletal
2885   // interpreter frame; the stack walking of interpreter_sender will then
2886   // get the unextended sp value and not the "real" sp value.
2887 
2888   const Register sender_sp = r8;
2889 
2890   __ mov(sender_sp, rsp);
2891   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
2892   __ subptr(rsp, rbx);
2893 
2894   // Push interpreter frames in a loop
2895   Label loop;
2896   __ bind(loop);
2897   __ movptr(rbx, Address(rsi, 0)); // Load frame size
2898   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
2899   __ pushptr(Address(rcx, 0));     // Save return address
2900   __ enter();                      // Save old & set new rbp
2901   __ subptr(rsp, rbx);             // Prolog
2902   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
2903             sender_sp);            // Make it walkable
2904   // This value is corrected by layout_activation_impl
2905   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
2906   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
2907   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
2908   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
2909   __ decrementl(rdx);              // Decrement counter
2910   __ jcc(Assembler::notZero, loop);
2911   __ pushptr(Address(rcx, 0));     // Save final return address
2912 
2913   // Re-push self-frame
2914   __ enter();                 // Save old & set new rbp
2915   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
2916                               // Prolog
2917 
2918   // Use rbp because the frames look interpreted now
2919   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
2920   // Don't need the precise return PC here, just precise enough to point into this code blob.
2921   address the_pc = __ pc();
2922   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
2923 
2924   // Call C code.  Need thread but NOT official VM entry
2925   // crud.  We cannot block on this call, no GC can happen.  Call should
2926   // restore return values to their stack-slots with the new SP.
2927   // The thread is in r15_thread; it is moved into c_rarg0 below.
2928   //
2929   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
2930 
2931   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
2932   __ mov(c_rarg0, r15_thread);
2933   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
2934   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
2935 
2936   // Set an oopmap for the call site
2937   // Use the same PC we used for the last java frame
2938   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
2939 
2940   // Clear fp AND pc
2941   __ reset_last_Java_frame(true);
2942 
2943   // Pop self-frame.
2944   __ leave();                 // Epilog
2945 
2946   // Jump to interpreter
2947   __ ret(0);
2948 
2949   // Make sure all code is generated
2950   masm->flush();
2951 
2952   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
2953                                                  SimpleRuntimeFrame::framesize >> 1);
2954 }
2955 #endif // COMPILER2
2956 
2957 //------------------------------generate_handler_blob------
2958 //
2959 // Generate a special Compile2Runtime blob that saves all registers
2960 // and sets up the oopmap.
2961 //
2962 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
2963   assert(StubRoutines::forward_exception_entry() != NULL,
2964          "must be generated before");
2965 
2966   ResourceMark rm;
2967   OopMapSet *oop_maps = new OopMapSet();
2968   OopMap* map;
2969 
2970   // Allocate space for the code.  Setup code generation tools.
2971   CodeBuffer buffer("handler_blob", 2048, 1024);
2972   MacroAssembler* masm = new MacroAssembler(&buffer);
2973 
2974   address start   = __ pc();
2975   address call_pc = NULL;
2976   int frame_size_in_words;
2977   bool cause_return = (poll_type == POLL_AT_RETURN);
2978   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
2979 
2980   if (UseRTMLocking) {
2981     // Abort RTM transaction before calling runtime
2982     // because the critical section will be large and will be
2983     // aborted anyway. Also nmethod could be deoptimized.
2984     __ xabort(0);
2985   }
2986 
2987   // Make room for return address (or push it again)
2988   if (!cause_return) {
2989     __ push(rbx);
2990   }
2991 
2992   // Save registers, fpu state, and flags
2993   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
2994 
2995   // The following is basically a call_VM.  However, we need the precise
2996   // address of the call in order to generate an oopmap. Hence, we do all the
2997   // work ourselves.
2998 
2999   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3000 
3001   // The return address must always be correct so that the frame constructor
3002   // never sees an invalid pc.
3003 
3004   if (!cause_return) {
3005     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3006     // Additionally, rbx is a callee-saved register and we can look at it later to determine
3007     // if someone changed the return address for us!
3008     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3009     __ movptr(Address(rbp, wordSize), rbx);
3010   }
3011 
3012   // Do the call
3013   __ mov(c_rarg0, r15_thread);
3014   __ call(RuntimeAddress(call_ptr));
3015 
3016   // Set an oopmap for the call site.  This oopmap will map all
3017   // oop-registers and debug-info registers as callee-saved.  This
3018   // will allow deoptimization at this safepoint to find all possible
3019   // debug-info recordings, as well as let GC find all oops.
3020 
3021   oop_maps->add_gc_map( __ pc() - start, map);
3022 
3023   Label noException;
3024 
3025   __ reset_last_Java_frame(false);
3026 
3027   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3028   __ jcc(Assembler::equal, noException);
3029 
3030   // Exception pending
3031 
3032   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3033 
3034   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3035 
3036   // No exception case
3037   __ bind(noException);
3038 
3039   Label no_adjust;
3040 #ifdef ASSERT
3041   Label bail;
3042 #endif
3043   if (!cause_return) {
3044     Label no_prefix, not_special;
3045 
3046     // If our stashed return pc was modified by the runtime we avoid touching it
3047     __ cmpptr(rbx, Address(rbp, wordSize));
3048     __ jccb(Assembler::notEqual, no_adjust);
3049 
3050     // Skip over the poll instruction.
3051     // See NativeInstruction::is_safepoint_poll()
3052     // Possible encodings:
3053     //      85 00       test   %eax,(%rax)
3054     //      85 01       test   %eax,(%rcx)
3055     //      85 02       test   %eax,(%rdx)
3056     //      85 03       test   %eax,(%rbx)
3057     //      85 06       test   %eax,(%rsi)
3058     //      85 07       test   %eax,(%rdi)
3059     //
3060     //   41 85 00       test   %eax,(%r8)
3061     //   41 85 01       test   %eax,(%r9)
3062     //   41 85 02       test   %eax,(%r10)
3063     //   41 85 03       test   %eax,(%r11)
3064     //   41 85 06       test   %eax,(%r14)
3065     //   41 85 07       test   %eax,(%r15)
3066     //
3067     //      85 04 24    test   %eax,(%rsp)
3068     //   41 85 04 24    test   %eax,(%r12)
3069     //      85 45 00    test   %eax,0x0(%rbp)
3070     //   41 85 45 00    test   %eax,0x0(%r13)
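    // Illustrative only (not compiled): the byte inspection emitted below amounts
    // to computing the poll instruction's length from the table above.  The helper
    // name is hypothetical.
#if 0
    static int poll_instruction_length(const unsigned char* pc) {
      int len = 2;                           // 0x85 opcode + modrm
      if (pc[0] == 0x41) {                   // REX.B prefix for r8-r15
        len++;
        pc++;
      }
      int base = pc[1] & 0x07;               // modrm base register field
      if (base == 0x04 || base == 0x05) {    // rsp/r12 take a SIB byte, rbp/r13 a disp8
        len++;
      }
      return len;                            // 2, 3 or 4 bytes, matching the table above
    }
#endif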
3071 
3072     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3073     __ jcc(Assembler::notEqual, no_prefix);
3074     __ addptr(rbx, 1);
3075     __ bind(no_prefix);
3076 #ifdef ASSERT
3077     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3078 #endif
3079     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3080     // r12/rsp 0x04
3081     // r13/rbp 0x05
3082     __ movzbq(rcx, Address(rbx, 1));
3083     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3084     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3085     __ cmpptr(rcx, 1);
3086     __ jcc(Assembler::above, not_special);
3087     __ addptr(rbx, 1);
3088     __ bind(not_special);
3089 #ifdef ASSERT
3090     // Verify the correct encoding of the poll we're about to skip.
3091     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3092     __ jcc(Assembler::notEqual, bail);
3093     // Mask out the modrm bits
3094     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3095     // rax encodes to 0, so if the bits are nonzero it's incorrect
3096     __ jcc(Assembler::notZero, bail);
3097 #endif
3098     // Adjust return pc forward to step over the safepoint poll instruction
3099     __ addptr(rbx, 2);
3100     __ movptr(Address(rbp, wordSize), rbx);
3101   }
3102 
3103   __ bind(no_adjust);
3104   // Normal exit, restore registers and exit.
3105   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3106   __ ret(0);
3107 
3108 #ifdef ASSERT
3109   __ bind(bail);
3110   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3111 #endif
3112 
3113   // Make sure all code is generated
3114   masm->flush();
3115 
3116   // Fill-out other meta info
3117   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3118 }
3119 
3120 //
3121 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3122 //
3123 // Generate a stub that calls into the VM to find out the proper destination
3124 // of a Java call. All the argument registers are live at this point,
3125 // but since this is generic code we don't know what they are, so the caller
3126 // must do any GC of the args.
3127 //
3128 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3129   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3130 
3131   // allocate space for the code
3132   ResourceMark rm;
3133 
3134   CodeBuffer buffer(name, 1200, 512);
3135   MacroAssembler* masm = new MacroAssembler(&buffer);
3136 
3137   int frame_size_in_words;
3138 
3139   OopMapSet *oop_maps = new OopMapSet();
3140   OopMap* map = NULL;
3141 
3142   int start = __ offset();
3143 
3144   // No need to save vector registers since they are caller-saved anyway.
3145   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3146 
3147   int frame_complete = __ offset();
3148 
3149   __ set_last_Java_frame(noreg, noreg, NULL, rscratch1);
3150 
3151   __ mov(c_rarg0, r15_thread);
3152 
3153   __ call(RuntimeAddress(destination));
3154 
3155 
3156   // Set an oopmap for the call site.
3157   // We need this not only for callee-saved registers, but also for volatile
3158   // registers that the compiler might be keeping live across a safepoint.
3159 
3160   oop_maps->add_gc_map( __ offset() - start, map);
3161 
3162   // rax contains the address we are going to jump to assuming no exception got installed
3163 
3164   // clear last_Java_sp
3165   __ reset_last_Java_frame(false);
3166   // check for pending exceptions
3167   Label pending;
3168   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3169   __ jcc(Assembler::notEqual, pending);
3170 
3171   // get the returned Method*
3172   __ get_vm_result_2(rbx, r15_thread);
3173   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3174 
3175   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3176 
3177   RegisterSaver::restore_live_registers(masm);
3178 
3179   // We are back to the original state on entry and ready to go.
3180 
3181   __ jmp(rax);
3182 
3183   // Pending exception after the safepoint
3184 
3185   __ bind(pending);
3186 
3187   RegisterSaver::restore_live_registers(masm);
3188 
3189   // exception pending => remove activation and forward to exception handler
3190 
3191   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3192 
3193   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3194   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3195 
3196   // -------------
3197   // make sure all code is generated
3198   masm->flush();
3199 
3200   // return the blob
3201   // frame_size_in_words is in words
3202   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3203 }
3204 
3205 //------------------------------Montgomery multiplication------------------------
3206 //
3207 
3208 #ifndef _WINDOWS
3209 
3210 // Subtract 0:b from carry:a.  Return carry.
3211 static julong
3212 sub(julong a[], julong b[], julong carry, long len) {
3213   long long i = 0, cnt = len;
3214   julong tmp;
3215   asm volatile("clc; "
3216                "0: ; "
3217                "mov (%[b], %[i], 8), %[tmp]; "
3218                "sbb %[tmp], (%[a], %[i], 8); "
3219                "inc %[i]; dec %[cnt]; "
3220                "jne 0b; "
3221                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3222                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3223                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3224                : "memory");
3225   return tmp;
3226 }
3227 
3228 // Multiply (unsigned) Long A by Long B, accumulating the double-
3229 // length result into the accumulator formed of T0, T1, and T2.
3230 #define MACC(A, B, T0, T1, T2)                                  \
3231 do {                                                            \
3232   unsigned long hi, lo;                                         \
3233   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3234            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3235            : "r"(A), "a"(B) : "cc");                            \
3236  } while(0)
3237 
3238 // As above, but add twice the double-length result into the
3239 // accumulator.
3240 #define MACC2(A, B, T0, T1, T2)                                 \
3241 do {                                                            \
3242   unsigned long hi, lo;                                         \
3243   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3244            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3245            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3246            : "r"(A), "a"(B) : "cc");                            \
3247  } while(0)
3248 
3249 #else //_WINDOWS
3250 
3251 static julong
3252 sub(julong a[], julong b[], julong carry, long len) {
3253   long i;
3254   julong tmp;
3255   unsigned char c = 1;
3256   for (i = 0; i < len; i++) {
3257     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3258     a[i] = tmp;
3259   }
3260   c = _addcarry_u64(c, carry, ~0, &tmp);
3261   return tmp;
3262 }
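
// Illustrative note: the Windows variant above subtracts by adding the one's
// complement with an initial carry of 1 (the two's complement identity).  For a
// single word the identity is simply:
#if 0
static inline julong sub_via_complement(julong a, julong b) {
  return a + ~b + 1;   // == a - b (mod 2^64)
}
#endif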
3263 
3264 // Multiply (unsigned) Long A by Long B, accumulating the double-
3265 // length result into the accumulator formed of T0, T1, and T2.
3266 #define MACC(A, B, T0, T1, T2)                          \
3267 do {                                                    \
3268   julong hi, lo;                            \
3269   lo = _umul128(A, B, &hi);                             \
3270   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3271   c = _addcarry_u64(c, hi, T1, &T1);                    \
3272   _addcarry_u64(c, T2, 0, &T2);                         \
3273  } while(0)
3274 
3275 // As above, but add twice the double-length result into the
3276 // accumulator.
3277 #define MACC2(A, B, T0, T1, T2)                         \
3278 do {                                                    \
3279   julong hi, lo;                            \
3280   lo = _umul128(A, B, &hi);                             \
3281   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3282   c = _addcarry_u64(c, hi, T1, &T1);                    \
3283   _addcarry_u64(c, T2, 0, &T2);                         \
3284   c = _addcarry_u64(0, lo, T0, &T0);                    \
3285   c = _addcarry_u64(c, hi, T1, &T1);                    \
3286   _addcarry_u64(c, T2, 0, &T2);                         \
3287  } while(0)
3288 
3289 #endif //_WINDOWS
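
// Illustrative only: on compilers that provide a 128-bit integer type, the
// accumulation performed by MACC above is equivalent to the sketch below
// ({t2:t1:t0} += a * b); MACC2 simply accumulates the product twice.
#if 0
static inline void macc_sketch(julong a, julong b, julong* t0, julong* t1, julong* t2) {
  unsigned __int128 p = (unsigned __int128)a * b + *t0;   // cannot overflow 128 bits
  *t0 = (julong)p;
  p = (p >> 64) + *t1;
  *t1 = (julong)p;
  *t2 += (julong)(p >> 64);                               // final carry, as in the asm/intrinsics
}
#endif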
3290 
3291 // Fast Montgomery multiplication.  The derivation of the algorithm is
3292 // in A Cryptographic Library for the Motorola DSP56000,
3293 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
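// Illustrative only: the key step in the loop below is m[i] = t0 * inv, where
// inv satisfies inv * n[0] == (julong)-1, i.e. inv == -n[0]^-1 mod 2^64 (n[0]
// must be odd).  Then m[i] * n[0] == -t0 mod 2^64, so adding m[i] * n[0] to the
// accumulator clears its low word and lets each iteration shift down by one word:
#if 0
static bool montgomery_step_clears_low_word(julong t0, julong n0, julong inv) {
  // precondition (checked by the asserts below): inv * n0 == ~(julong)0
  julong m = t0 * inv;
  return (julong)(t0 + m * n0) == 0;   // always true under the precondition
}
#endif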
3294 
3295 static void NOINLINE
3296 montgomery_multiply(julong a[], julong b[], julong n[],
3297                     julong m[], julong inv, int len) {
3298   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3299   int i;
3300 
3301   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3302 
3303   for (i = 0; i < len; i++) {
3304     int j;
3305     for (j = 0; j < i; j++) {
3306       MACC(a[j], b[i-j], t0, t1, t2);
3307       MACC(m[j], n[i-j], t0, t1, t2);
3308     }
3309     MACC(a[i], b[0], t0, t1, t2);
3310     m[i] = t0 * inv;
3311     MACC(m[i], n[0], t0, t1, t2);
3312 
3313     assert(t0 == 0, "broken Montgomery multiply");
3314 
3315     t0 = t1; t1 = t2; t2 = 0;
3316   }
3317 
3318   for (i = len; i < 2*len; i++) {
3319     int j;
3320     for (j = i-len+1; j < len; j++) {
3321       MACC(a[j], b[i-j], t0, t1, t2);
3322       MACC(m[j], n[i-j], t0, t1, t2);
3323     }
3324     m[i-len] = t0;
3325     t0 = t1; t1 = t2; t2 = 0;
3326   }
3327 
3328   while (t0)
3329     t0 = sub(m, n, t0, len);
3330 }
3331 
3332 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3333 // multiplies so it should be up to 25% faster than Montgomery
3334 // multiplication.  However, its loop control is more complex and it
3335 // may actually run slower on some machines.
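// (For len == n, a full multiply issues n*n a*b MACs plus n*n m*n MACs, while
//  squaring needs only the ~n*(n+1)/2 distinct a[j]*a[k] products, doubling the
//  off-diagonal ones via MACC2; the m*n work is unchanged, hence the ~25% saving.)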
3336 
3337 static void NOINLINE
3338 montgomery_square(julong a[], julong n[],
3339                   julong m[], julong inv, int len) {
3340   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3341   int i;
3342 
3343   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3344 
3345   for (i = 0; i < len; i++) {
3346     int j;
3347     int end = (i+1)/2;
3348     for (j = 0; j < end; j++) {
3349       MACC2(a[j], a[i-j], t0, t1, t2);
3350       MACC(m[j], n[i-j], t0, t1, t2);
3351     }
3352     if ((i & 1) == 0) {
3353       MACC(a[j], a[j], t0, t1, t2);
3354     }
3355     for (; j < i; j++) {
3356       MACC(m[j], n[i-j], t0, t1, t2);
3357     }
3358     m[i] = t0 * inv;
3359     MACC(m[i], n[0], t0, t1, t2);
3360 
3361     assert(t0 == 0, "broken Montgomery square");
3362 
3363     t0 = t1; t1 = t2; t2 = 0;
3364   }
3365 
3366   for (i = len; i < 2*len; i++) {
3367     int start = i-len+1;
3368     int end = start + (len - start)/2;
3369     int j;
3370     for (j = start; j < end; j++) {
3371       MACC2(a[j], a[i-j], t0, t1, t2);
3372       MACC(m[j], n[i-j], t0, t1, t2);
3373     }
3374     if ((i & 1) == 0) {
3375       MACC(a[j], a[j], t0, t1, t2);
3376     }
3377     for (; j < len; j++) {
3378       MACC(m[j], n[i-j], t0, t1, t2);
3379     }
3380     m[i-len] = t0;
3381     t0 = t1; t1 = t2; t2 = 0;
3382   }
3383 
3384   while (t0)
3385     t0 = sub(m, n, t0, len);
3386 }
3387 
3388 // Swap words in a longword.
3389 static julong swap(julong x) {
3390   return (x << 32) | (x >> 32);
3391 }
3392 
3393 // Copy len longwords from s to d, word-swapping as we go.  The
3394 // destination array is reversed.
3395 static void reverse_words(julong *s, julong *d, int len) {
3396   d += len;
3397   while(len-- > 0) {
3398     d--;
3399     *d = swap(*s);
3400     s++;
3401   }
3402 }
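
// Worked example (illustrative): for a jint sequence {w0, w1, w2, w3} with w0
// most significant, reading the array as julongs on little-endian x86 gives
// {w1:w0, w3:w2}; reverse_words with len == 2 then yields {w2:w3, w0:w1}, i.e.
// a little-endian array of julongs, each holding its two jints in value order.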
3403 
3404 // The threshold at which squaring is advantageous was determined
3405 // experimentally on an i7-3930K (Sandy Bridge-E) CPU @ 3.5GHz.
3406 #define MONTGOMERY_SQUARING_THRESHOLD 64
3407 
3408 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3409                                         jint len, jlong inv,
3410                                         jint *m_ints) {
3411   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3412   int longwords = len/2;
3413 
3414   // Make very sure we don't use so much space that the stack might
3415   // overflow.  512 jints correspond to a 16384-bit integer, i.e. 256 julongs;
3416   // the four scratch arrays below then use a total of 8k bytes of stack space.
3417   int divisor = sizeof(julong) * 4;
3418   guarantee(longwords <= 8192 / divisor, "must be");
3419   int total_allocation = longwords * sizeof (julong) * 4;
3420   julong *scratch = (julong *)alloca(total_allocation);
3421 
3422   // Local scratch arrays
3423   julong
3424     *a = scratch + 0 * longwords,
3425     *b = scratch + 1 * longwords,
3426     *n = scratch + 2 * longwords,
3427     *m = scratch + 3 * longwords;
3428 
3429   reverse_words((julong *)a_ints, a, longwords);
3430   reverse_words((julong *)b_ints, b, longwords);
3431   reverse_words((julong *)n_ints, n, longwords);
3432 
3433   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3434 
3435   reverse_words(m, (julong *)m_ints, longwords);
3436 }
3437 
3438 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3439                                       jint len, jlong inv,
3440                                       jint *m_ints) {
3441   assert(len % 2 == 0, "array length in montgomery_square must be even");
3442   int longwords = len/2;
3443 
3444   // Make very sure we don't use so much space that the stack might
3445   // overflow.  512 jints correspond to a 16384-bit integer, i.e. 256 julongs;
3446   // the three scratch arrays below then use a total of 6k bytes of stack space.
3447   int divisor = sizeof(julong) * 3;
3448   guarantee(longwords <= (8192 / divisor), "must be");
3449   int total_allocation = longwords * sizeof (julong) * 3;
3450   julong *scratch = (julong *)alloca(total_allocation);
3451 
3452   // Local scratch arrays
3453   julong
3454     *a = scratch + 0 * longwords,
3455     *n = scratch + 1 * longwords,
3456     *m = scratch + 2 * longwords;
3457 
3458   reverse_words((julong *)a_ints, a, longwords);
3459   reverse_words((julong *)n_ints, n, longwords);
3460 
3461   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3462     ::montgomery_square(a, n, m, (julong)inv, longwords);
3463   } else {
3464     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3465   }
3466 
3467   reverse_words(m, (julong *)m_ints, longwords);
3468 }
3469 
3470 #ifdef COMPILER2
3471 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3472 //
3473 //------------------------------generate_exception_blob---------------------------
3474 // Creates the exception blob at the end.
3475 // This code is jumped to from a compiled method via the exception blob
3476 // (see emit_exception_handler in the x86_64.ad file).
3477 //
3478 // Given an exception pc at a call, we call into the runtime for the
3479 // handler in this method. This handler might merely restore state
3480 // (i.e. callee-saved registers), unwind the frame, and jump to the
3481 // exception handler for the nmethod if there is no Java-level handler
3482 // for the nmethod.
3483 //
3484 // This code is entered with a jmp.
3485 //
3486 // Arguments:
3487 //   rax: exception oop
3488 //   rdx: exception pc
3489 //
3490 // Results:
3491 //   rax: exception oop
3492 //   rdx: exception pc in caller or ???
3493 //   destination: exception handler of caller
3494 //
3495 // Note: the exception pc MUST be at a call (precise debug information)
3496 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3497 //
3498 
3499 void OptoRuntime::generate_exception_blob() {
3500   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3501   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3502   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3503 
3504   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3505 
3506   // Allocate space for the code
3507   ResourceMark rm;
3508   // Setup code generation tools
3509   CodeBuffer buffer("exception_blob", 2048, 1024);
3510   MacroAssembler* masm = new MacroAssembler(&buffer);
3511 
3512 
3513   address start = __ pc();
3514 
3515   // Exception pc is 'return address' for stack walker
3516   __ push(rdx);
3517   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3518 
3519   // Save callee-saved registers.  See x86_64.ad.
3520 
3521   // rbp is an implicitly saved callee-saved register (i.e., the calling
3522   // convention will save/restore it in the prolog/epilog). Other than that
3523   // there are no callee-saved registers now that adapter frames are gone.
3524 
3525   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3526 
3527   // Store exception in Thread object. We cannot pass any arguments to the
3528   // handle_exception call, since we do not want to make any assumption
3529   // about the size of the frame where the exception happened in.
3530   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3531   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
3532   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3533 
3534   // This call does all the hard work.  It checks if an exception handler
3535   // exists in the method.
3536   // If so, it returns the handler address.
3537   // If not, it prepares for stack-unwinding, restoring the callee-save
3538   // registers of the frame being removed.
3539   //
3540   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3541 
3542   // At a method handle call, the stack may not be properly aligned
3543   // when returning with an exception.
3544   address the_pc = __ pc();
3545   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3546   __ mov(c_rarg0, r15_thread);
3547   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3548   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3549 
3550   // Set an oopmap for the call site.  This oopmap will only be used if we
3551   // are unwinding the stack.  Hence, all locations will be dead.
3552   // Callee-saved registers will be the same as the frame above (i.e.,
3553   // handle_exception_stub), since they were restored when we got the
3554   // exception.
3555 
3556   OopMapSet* oop_maps = new OopMapSet();
3557 
3558   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3559 
3560   __ reset_last_Java_frame(false);
3561 
3562   // Restore callee-saved registers
3563 
3564   // rbp is an implicitly saved callee-saved register (i.e., the calling
3565   // convention will save/restore it in the prolog/epilog). Other than that
3566   // there are no callee-saved registers now that adapter frames are gone.
3567 
3568   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3569 
3570   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3571   __ pop(rdx);                  // No need for exception pc anymore
3572 
3573   // rax: exception handler
3574 
3575   // We have a handler in rax (could be deopt blob).
3576   __ mov(r8, rax);
3577 
3578   // Get the exception oop
3579   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3580   // Get the exception pc in case we are deoptimized
3581   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3582 #ifdef ASSERT
3583   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3584   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3585 #endif
3586   // Clear the exception oop so GC no longer processes it as a root.
3587   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3588 
3589   // rax: exception oop
3590   // r8:  exception handler
3591   // rdx: exception pc
3592   // Jump to handler
3593 
3594   __ jmp(r8);
3595 
3596   // Make sure all code is generated
3597   masm->flush();
3598 
3599   // Set exception blob
3600   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
3601 }
3602 #endif // COMPILER2
3603