1 /*
   2  * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "classfile/symbolTable.hpp"
  32 #include "code/compiledIC.hpp"
  33 #include "code/debugInfoRec.hpp"
  34 #include "code/icBuffer.hpp"
  35 #include "code/nativeInst.hpp"
  36 #include "code/vtableStubs.hpp"
  37 #include "compiler/oopMap.hpp"
  38 #include "gc/shared/collectedHeap.hpp"
  39 #include "gc/shared/gcLocker.hpp"
  40 #include "gc/shared/barrierSet.hpp"
  41 #include "gc/shared/barrierSetAssembler.hpp"
  42 #include "interpreter/interpreter.hpp"
  43 #include "logging/log.hpp"
  44 #include "memory/resourceArea.hpp"
  45 #include "memory/universe.hpp"
  46 #include "oops/compiledICHolder.hpp"
  47 #include "oops/klass.inline.hpp"
  48 #include "oops/method.inline.hpp"
  49 #include "prims/methodHandles.hpp"
  50 #include "runtime/continuation.hpp"
  51 #include "runtime/continuationEntry.inline.hpp"
  52 #include "runtime/globals.hpp"
  53 #include "runtime/jniHandles.hpp"
  54 #include "runtime/safepointMechanism.hpp"
  55 #include "runtime/sharedRuntime.hpp"
  56 #include "runtime/signature.hpp"
  57 #include "runtime/stubRoutines.hpp"
  58 #include "runtime/vframeArray.hpp"
  59 #include "runtime/vm_version.hpp"
  60 #include "utilities/align.hpp"
  61 #include "utilities/formatBuffer.hpp"
  62 #include "vmreg_x86.inline.hpp"
  63 #ifdef COMPILER1
  64 #include "c1/c1_Runtime1.hpp"
  65 #endif
  66 #ifdef COMPILER2
  67 #include "opto/runtime.hpp"
  68 #endif
  69 #if INCLUDE_JVMCI
  70 #include "jvmci/jvmciJavaClasses.hpp"
  71 #endif
  72 
  73 #define __ masm->
  74 
  75 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  76 
  77 class SimpleRuntimeFrame {
  78 
  79   public:
  80 
  81   // Most of the runtime stubs have this simple frame layout.
  82   // This class exists to make the layout shared in one place.
  83   // Offsets are for compiler stack slots, which are jints.
  84   enum layout {
  85     // The frame sender code expects that rbp will be in the "natural" place and
  86     // will override any oopMap setting for it. We must therefore force the layout
  87     // so that it agrees with the frame sender code.
  88     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  89     rbp_off2,
  90     return_off, return_off2,
  91     framesize
  92   };
  93 };
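     // A hedged worked example of the layout above, assuming
     // frame::arg_reg_save_area_bytes == 0 (the non-Windows case):
     // rbp_off/rbp_off2 are slots 0..1 (the saved rbp), return_off/return_off2
     // are slots 2..3 (the return address), and framesize == 4 jint slots == 2 words.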
  94 
  95 class RegisterSaver {
  96   // Capture info about frame layout.  Layout offsets are in jint
  97   // units because compiler frame slots are jints.
  98 #define XSAVE_AREA_BEGIN 160
  99 #define XSAVE_AREA_YMM_BEGIN 576
 100 #define XSAVE_AREA_OPMASK_BEGIN 1088
 101 #define XSAVE_AREA_ZMM_BEGIN 1152
 102 #define XSAVE_AREA_UPPERBANK 1664
 103 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
 104 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 105 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 106 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 107 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 108   enum layout {
 109     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 110     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 111     DEF_XMM_OFFS(0),
 112     DEF_XMM_OFFS(1),
 113     // 2..15 are implied in range usage
 114     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 115     DEF_YMM_OFFS(0),
 116     DEF_YMM_OFFS(1),
 117     // 2..15 are implied in range usage
 118     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 119     DEF_OPMASK_OFFS(0),
 120     DEF_OPMASK_OFFS(1),
 121     // 2..7 are implied in range usage
 122     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 123     DEF_ZMM_OFFS(0),
 124     DEF_ZMM_OFFS(1),
 125     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 126     DEF_ZMM_UPPER_OFFS(16),
 127     DEF_ZMM_UPPER_OFFS(17),
 128     // 18..31 are implied in range usage
 129     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 130     fpu_stateH_end,
 131     r15_off, r15H_off,
 132     r14_off, r14H_off,
 133     r13_off, r13H_off,
 134     r12_off, r12H_off,
 135     r11_off, r11H_off,
 136     r10_off, r10H_off,
 137     r9_off,  r9H_off,
 138     r8_off,  r8H_off,
 139     rdi_off, rdiH_off,
 140     rsi_off, rsiH_off,
 141     ignore_off, ignoreH_off,  // extra copy of rbp
 142     rsp_off, rspH_off,
 143     rbx_off, rbxH_off,
 144     rdx_off, rdxH_off,
 145     rcx_off, rcxH_off,
 146     rax_off, raxH_off,
 147     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 148     align_off, alignH_off,
 149     flags_off, flagsH_off,
 150     // The frame sender code expects that rbp will be in the "natural" place and
 151     // will override any oopMap setting for it. We must therefore force the layout
 152     // so that it agrees with the frame sender code.
 153     rbp_off, rbpH_off,        // copy of rbp we will restore
 154     return_off, returnH_off,  // slot for return address
 155     reg_save_size             // size in compiler stack slots
 156   };
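       // A hedged worked example of the offsets above, again assuming
       // frame::arg_reg_save_area_bytes == 0: fpu_state_off == 0, so
       // xmm_off == XSAVE_AREA_BEGIN/BytesPerInt == 160/4 == 40, giving
       // xmm0_off == 40 and xmm1_off == 44 (16 bytes == 4 jint slots apart);
       // likewise ymm_off == 576/4 == 144 and zmm_off == 1152/4 == 288.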
 157 
 158  public:
 159   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors);
 160   static void restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors = false);
 161 
 162   // Offsets into the register save area
 163   // Used by deoptimization when it is managing result register
 164   // values on its own
 165 
 166   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 167   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 168   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 169   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 170   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 171 
 172   // During deoptimization only the result registers need to be restored,
 173   // all the other values have already been extracted.
 174   static void restore_result_registers(MacroAssembler* masm);
 175 };
 176 
 177 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_wide_vectors) {
 178   int off = 0;
 179   int num_xmm_regs = XMMRegister::available_xmm_registers();
 180 #if COMPILER2_OR_JVMCI
 181   if (save_wide_vectors && UseAVX == 0) {
 182     save_wide_vectors = false; // vectors larger than 16 byte long are supported only with AVX
 183   }
 184   assert(!save_wide_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 185 #else
 186   save_wide_vectors = false; // vectors are generated only by C2 and JVMCI
 187 #endif
 188 
 189   // Always make the frame size 16-byte aligned; the full save area is allocated whether or not wide vectors are saved
 190   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 191   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 192   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 193   // CodeBlob frame size is in words.
 194   int frame_size_in_words = frame_size_in_bytes / wordSize;
 195   *total_frame_words = frame_size_in_words;
 196 
 197   // Save registers, fpu state, and flags.
 198   // We assume caller has already pushed the return address onto the
 199   // stack, so rsp is 8-byte aligned here.
 200   // We push rbp twice in this sequence because we want the real rbp
 201   // to be under the return address, just like a normal enter().
 202 
 203   __ enter();          // rsp becomes 16-byte aligned here
 204   __ push_CPU_state(); // Push a multiple of 16 bytes
 205 
 206   // push cpu state handles this on EVEX enabled targets
 207   if (save_wide_vectors) {
 208     // Save upper half of YMM registers(0..15)
 209     int base_addr = XSAVE_AREA_YMM_BEGIN;
 210     for (int n = 0; n < 16; n++) {
 211       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 212     }
 213     if (VM_Version::supports_evex()) {
 214       // Save upper half of ZMM registers(0..15)
 215       base_addr = XSAVE_AREA_ZMM_BEGIN;
 216       for (int n = 0; n < 16; n++) {
 217         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 218       }
 219       // Save full ZMM registers(16..num_xmm_regs)
 220       base_addr = XSAVE_AREA_UPPERBANK;
 221       off = 0;
 222       int vector_len = Assembler::AVX_512bit;
 223       for (int n = 16; n < num_xmm_regs; n++) {
 224         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 225       }
 226 #if COMPILER2_OR_JVMCI
 227       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 228       off = 0;
 229       for(int n = 0; n < KRegister::number_of_registers; n++) {
 230         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 231       }
 232 #endif
 233     }
 234   } else {
 235     if (VM_Version::supports_evex()) {
 236       // Save upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 237       int base_addr = XSAVE_AREA_UPPERBANK;
 238       off = 0;
 239       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 240       for (int n = 16; n < num_xmm_regs; n++) {
 241         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 242       }
 243 #if COMPILER2_OR_JVMCI
 244       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 245       off = 0;
 246       for(int n = 0; n < KRegister::number_of_registers; n++) {
 247         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 248       }
 249 #endif
 250     }
 251   }
 252   __ vzeroupper();
 253   if (frame::arg_reg_save_area_bytes != 0) {
 254     // Allocate argument register save area
 255     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 256   }
 257 
 258   // Set an oopmap for the call site.  This oopmap will map all
 259   // oop-registers and debug-info registers as callee-saved.  This
 260   // will allow deoptimization at this safepoint to find all possible
 261   // debug-info recordings, as well as let GC find all oops.
 262 
 263   OopMapSet *oop_maps = new OopMapSet();
 264   OopMap* map = new OopMap(frame_size_in_slots, 0);
 265 
 266 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 267 
 268   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 269   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 271   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 272   // rbp location is known implicitly by the frame sender code, needs no oopmap
 273   // and the location where rbp was saved is ignored
 274   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 275   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 283   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
 284   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 285   // on EVEX enabled targets, we get it included in the xsave area
 286   off = xmm0_off;
 287   int delta = xmm1_off - off;
 288   for (int n = 0; n < 16; n++) {
 289     XMMRegister xmm_name = as_XMMRegister(n);
 290     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 291     off += delta;
 292   }
 293   if (UseAVX > 2) {
 294     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 295     off = zmm16_off;
 296     delta = zmm17_off - off;
 297     for (int n = 16; n < num_xmm_regs; n++) {
 298       XMMRegister zmm_name = as_XMMRegister(n);
 299       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 300       off += delta;
 301     }
 302   }
 303 
 304 #if COMPILER2_OR_JVMCI
 305   if (save_wide_vectors) {
 306     // Save upper half of YMM registers(0..15)
 307     off = ymm0_off;
 308     delta = ymm1_off - ymm0_off;
 309     for (int n = 0; n < 16; n++) {
 310       XMMRegister ymm_name = as_XMMRegister(n);
 311       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 312       off += delta;
 313     }
 314     if (VM_Version::supports_evex()) {
 315       // Save upper half of ZMM registers(0..15)
 316       off = zmm0_off;
 317       delta = zmm1_off - zmm0_off;
 318       for (int n = 0; n < 16; n++) {
 319         XMMRegister zmm_name = as_XMMRegister(n);
 320         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 321         off += delta;
 322       }
 323     }
 324   }
 325 #endif // COMPILER2_OR_JVMCI
 326 
 327   // %%% These should all be a waste but we'll keep things as they were for now
 328   if (true) {
 329     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 330     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 333     // rbp location is known implicitly by the frame sender code, needs no oopmap
 334     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 335     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 343     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
 344     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15,
 345     // on EVEX enabled targets, we get it included in the xsave area
 346     off = xmm0H_off;
 347     delta = xmm1H_off - off;
 348     for (int n = 0; n < 16; n++) {
 349       XMMRegister xmm_name = as_XMMRegister(n);
 350       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 351       off += delta;
 352     }
 353     if (UseAVX > 2) {
 354       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 355       off = zmm16H_off;
 356       delta = zmm17H_off - off;
 357       for (int n = 16; n < num_xmm_regs; n++) {
 358         XMMRegister zmm_name = as_XMMRegister(n);
 359         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 360         off += delta;
 361       }
 362     }
 363   }
 364 
 365   return map;
 366 }
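     // A minimal usage sketch, mirroring how gen_c2i_adapter below pairs these
     // helpers around a runtime call that may trigger a GC (all names are from
     // this file):
     //
     //   int frame_size_in_words;
     //   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
     //   // ... set_last_Java_frame(), call into the VM, oop_maps->add_gc_map(), reset_last_Java_frame() ...
     //   RegisterSaver::restore_live_registers(masm);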
 367 
 368 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wide_vectors) {
 369   int num_xmm_regs = XMMRegister::available_xmm_registers();
 370   if (frame::arg_reg_save_area_bytes != 0) {
 371     // Pop arg register save area
 372     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 373   }
 374 
 375 #if COMPILER2_OR_JVMCI
 376   if (restore_wide_vectors) {
 377     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 378     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 379   }
 380 #else
 381   assert(!restore_wide_vectors, "vectors are generated only by C2");
 382 #endif
 383 
 384   __ vzeroupper();
 385 
 386   // On EVEX enabled targets everything is handled in pop fpu state
 387   if (restore_wide_vectors) {
 388     // Restore upper half of YMM registers (0..15)
 389     int base_addr = XSAVE_AREA_YMM_BEGIN;
 390     for (int n = 0; n < 16; n++) {
 391       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 392     }
 393     if (VM_Version::supports_evex()) {
 394       // Restore upper half of ZMM registers (0..15)
 395       base_addr = XSAVE_AREA_ZMM_BEGIN;
 396       for (int n = 0; n < 16; n++) {
 397         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 398       }
 399       // Restore full ZMM registers(16..num_xmm_regs)
 400       base_addr = XSAVE_AREA_UPPERBANK;
 401       int vector_len = Assembler::AVX_512bit;
 402       int off = 0;
 403       for (int n = 16; n < num_xmm_regs; n++) {
 404         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 405       }
 406 #if COMPILER2_OR_JVMCI
 407       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 408       off = 0;
 409       for (int n = 0; n < KRegister::number_of_registers; n++) {
 410         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 411       }
 412 #endif
 413     }
 414   } else {
 415     if (VM_Version::supports_evex()) {
 416       // Restore upper bank of XMM registers(16..31) for scalar or 16-byte vector usage
 417       int base_addr = XSAVE_AREA_UPPERBANK;
 418       int off = 0;
 419       int vector_len = VM_Version::supports_avx512vl() ?  Assembler::AVX_128bit : Assembler::AVX_512bit;
 420       for (int n = 16; n < num_xmm_regs; n++) {
 421         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 422       }
 423 #if COMPILER2_OR_JVMCI
 424       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 425       off = 0;
 426       for (int n = 0; n < KRegister::number_of_registers; n++) {
 427         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 428       }
 429 #endif
 430     }
 431   }
 432 
 433   // Recover CPU state
 434   __ pop_CPU_state();
 435   // Get the rbp described implicitly by the calling convention (no oopMap)
 436   __ pop(rbp);
 437 }
 438 
 439 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 440 
 441   // Just restore the result registers. Only used by deoptimization. By
 442   // now any callee-saved register that needs to be restored to a c2
 443   // caller of the deoptee has been extracted into the vframeArray
 444   // and will be stuffed into the c2i adapter we create for later
 445   // restoration, so only the result registers need to be restored here.
 446 
 447   // Restore fp result register
 448   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 449   // Restore integer result register
 450   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 451   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 452 
 453   // Pop all of the register save area off the stack except the return address
 454   __ addptr(rsp, return_offset_in_bytes());
 455 }
 456 
 457 // Is the vector's size (in bytes) bigger than the size saved by default?
 458 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 459 bool SharedRuntime::is_wide_vector(int size) {
 460   return size > 16;
 461 }
 462 
 463 // ---------------------------------------------------------------------------
 464 // Read the array of BasicTypes from a signature, and compute where the
 465 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 466 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 467 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 468 // as framesizes are fixed.
 469 // VMRegImpl::stack0 refers to the first slot 0(sp).
 470 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.
 471 // Registers up to Register::number_of_registers are the 64-bit
 472 // integer registers.
 473 
 474 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 475 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
 476 // units regardless of build. Of course, for i486 there is no 64-bit build.
 477 
 478 // The Java calling convention is a "shifted" version of the C ABI.
 479 // By skipping the first C ABI register we can call non-static jni methods
 480 // with small numbers of arguments without having to shuffle the arguments
 481 // at all. Since we control the java ABI we ought to at least get some
 482 // advantage out of it.
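     // A hedged example of the mapping computed below, using the j_rarg*/j_farg*
     // aliases (the underlying machine registers differ by platform): for the
     // signature (int, Object, double, long) the loop assigns
     //   int -> j_rarg0 (set1), Object -> j_rarg1 (set2),
     //   double -> j_farg0 (set2), long -> j_rarg2 (set2),
     // and marks each trailing T_VOID half with set_bad(). Once the integer or
     // float registers run out, values fall back to stack slots via
     // VMRegImpl::stack2reg(stk_args), with stk_args bumped by 2 per argument.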
 483 
 484 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 485                                            VMRegPair *regs,
 486                                            int total_args_passed) {
 487 
 488   // Create the mapping between argument positions and
 489   // registers.
 490   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 491     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 492   };
 493   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 494     j_farg0, j_farg1, j_farg2, j_farg3,
 495     j_farg4, j_farg5, j_farg6, j_farg7
 496   };
 497 
 498 
 499   uint int_args = 0;
 500   uint fp_args = 0;
 501   uint stk_args = 0; // inc by 2 each time
 502 
 503   for (int i = 0; i < total_args_passed; i++) {
 504     switch (sig_bt[i]) {
 505     case T_BOOLEAN:
 506     case T_CHAR:
 507     case T_BYTE:
 508     case T_SHORT:
 509     case T_INT:
 510       if (int_args < Argument::n_int_register_parameters_j) {
 511         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 512       } else {
 513         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 514         stk_args += 2;
 515       }
 516       break;
 517     case T_VOID:
 518       // halves of T_LONG or T_DOUBLE
 519       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 520       regs[i].set_bad();
 521       break;
 522     case T_LONG:
 523       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 524       // fall through
 525     case T_OBJECT:
 526     case T_ARRAY:
 527     case T_ADDRESS:
 528     case T_PRIMITIVE_OBJECT:
 529       if (int_args < Argument::n_int_register_parameters_j) {
 530         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 531       } else {
 532         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 533         stk_args += 2;
 534       }
 535       break;
 536     case T_FLOAT:
 537       if (fp_args < Argument::n_float_register_parameters_j) {
 538         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 539       } else {
 540         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 541         stk_args += 2;
 542       }
 543       break;
 544     case T_DOUBLE:
 545       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 546       if (fp_args < Argument::n_float_register_parameters_j) {
 547         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 548       } else {
 549         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 550         stk_args += 2;
 551       }
 552       break;
 553     default:
 554       ShouldNotReachHere();
 555       break;
 556     }
 557   }
 558 
 559   return align_up(stk_args, 2);
 560 }
 561 
 562 // Same as java_calling_convention() but for multiple return
 563 // values. There's no way to store them on the stack so if we don't
 564 // have enough registers, multiple values can't be returned.
 565 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 566 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
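     // A hedged example of the return convention defined below: returning
     // (int, long) would use INT_ArgReg[0] == rax for the first value and
     // INT_ArgReg[1] == j_rarg5 for the second, and the function would return
     // 2 (int_args + fp_args); if the values cannot all be placed in registers
     // it returns -1 instead.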
 567 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 568                                           VMRegPair *regs,
 569                                           int total_args_passed) {
 570   // Create the mapping between argument positions and
 571   // registers.
 572   static const Register INT_ArgReg[java_return_convention_max_int] = {
 573     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 574   };
 575   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 576     j_farg0, j_farg1, j_farg2, j_farg3,
 577     j_farg4, j_farg5, j_farg6, j_farg7
 578   };
 579 
 580 
 581   uint int_args = 0;
 582   uint fp_args = 0;
 583 
 584   for (int i = 0; i < total_args_passed; i++) {
 585     switch (sig_bt[i]) {
 586     case T_BOOLEAN:
 587     case T_CHAR:
 588     case T_BYTE:
 589     case T_SHORT:
 590     case T_INT:
 591       if (int_args < Argument::n_int_register_parameters_j+1) {
 592         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 593         int_args++;
 594       } else {
 595         return -1;
 596       }
 597       break;
 598     case T_VOID:
 599       // halves of T_LONG or T_DOUBLE
 600       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 601       regs[i].set_bad();
 602       break;
 603     case T_LONG:
 604       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 605       // fall through
 606     case T_OBJECT:
 607     case T_PRIMITIVE_OBJECT:
 608     case T_ARRAY:
 609     case T_ADDRESS:
 610     case T_METADATA:
 611       if (int_args < Argument::n_int_register_parameters_j+1) {
 612         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 613         int_args++;
 614       } else {
 615         return -1;
 616       }
 617       break;
 618     case T_FLOAT:
 619       if (fp_args < Argument::n_float_register_parameters_j) {
 620         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 621         fp_args++;
 622       } else {
 623         return -1;
 624       }
 625       break;
 626     case T_DOUBLE:
 627       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 628       if (fp_args < Argument::n_float_register_parameters_j) {
 629         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 630         fp_args++;
 631       } else {
 632         return -1;
 633       }
 634       break;
 635     default:
 636       ShouldNotReachHere();
 637       break;
 638     }
 639   }
 640 
 641   return int_args + fp_args;
 642 }
 643 
 644 // Patch the caller's callsite with the entry to compiled code, if it exists.
 645 static void patch_callers_callsite(MacroAssembler *masm) {
 646   Label L;
 647   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
 648   __ jcc(Assembler::equal, L);
 649 
 650   // Save the current stack pointer
 651   __ mov(r13, rsp);
 652   // Schedule the branch target address early.
 653   // Call into the VM to patch the caller, then jump to compiled callee
 654   // rax isn't live so capture return address while we easily can
 655   __ movptr(rax, Address(rsp, 0));
 656 
 657   // align stack so push_CPU_state doesn't fault
 658   __ andptr(rsp, -(StackAlignmentInBytes));
 659   __ push_CPU_state();
 660   __ vzeroupper();
 661   // VM needs caller's callsite
 662   // VM needs target method
 663   // This needs to be a long call since we will relocate this adapter to
 664   // the codeBuffer and it may not reach
 665 
 666   // Allocate argument register save area
 667   if (frame::arg_reg_save_area_bytes != 0) {
 668     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 669   }
 670   __ mov(c_rarg0, rbx);
 671   __ mov(c_rarg1, rax);
 672   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 673 
 674   // De-allocate argument register save area
 675   if (frame::arg_reg_save_area_bytes != 0) {
 676     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 677   }
 678 
 679   __ vzeroupper();
 680   __ pop_CPU_state();
 681   // restore sp
 682   __ mov(rsp, r13);
 683   __ bind(L);
 684 }
 685 
 686 // For each inline type argument, sig includes the list of fields of
 687 // the inline type. This utility function computes the number of
 688 // arguments for the call if inline types are passed by reference (the
 689 // calling convention the interpreter expects).
 690 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 691   int total_args_passed = 0;
 692   if (InlineTypePassFieldsAsArgs) {
 693     for (int i = 0; i < sig_extended->length(); i++) {
 694       BasicType bt = sig_extended->at(i)._bt;
 695       if (bt == T_PRIMITIVE_OBJECT) {
 696         // In sig_extended, an inline type argument starts with:
 697         // T_PRIMITIVE_OBJECT, followed by the types of the fields of the
 698         // inline type and T_VOID to mark the end of the inline
 699         // type. Inline types are flattened so, for instance, in the
 700         // case of an inline type with an int field and an inline type
 701         // field that itself has 2 fields, an int and a long:
 702         // T_PRIMITIVE_OBJECT T_INT T_PRIMITIVE_OBJECT T_INT T_LONG T_VOID (second
 703         // slot for the T_LONG) T_VOID (inner T_PRIMITIVE_OBJECT) T_VOID
 704         // (outer T_PRIMITIVE_OBJECT)
 705         total_args_passed++;
 706         int vt = 1;
 707         do {
 708           i++;
 709           BasicType bt = sig_extended->at(i)._bt;
 710           BasicType prev_bt = sig_extended->at(i-1)._bt;
 711           if (bt == T_PRIMITIVE_OBJECT) {
 712             vt++;
 713           } else if (bt == T_VOID &&
 714                      prev_bt != T_LONG &&
 715                      prev_bt != T_DOUBLE) {
 716             vt--;
 717           }
 718         } while (vt != 0);
 719       } else {
 720         total_args_passed++;
 721       }
 722     }
 723   } else {
 724     total_args_passed = sig_extended->length();
 725   }
 726   return total_args_passed;
 727 }
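     // A hedged trace of the flattened example from the comment above
     // (T_PRIMITIVE_OBJECT T_INT T_PRIMITIVE_OBJECT T_INT T_LONG T_VOID T_VOID T_VOID):
     // the outer T_PRIMITIVE_OBJECT bumps total_args_passed by one, then the inner
     // do/while walks the remaining seven entries with vt going 1 -> 2 (inner
     // T_PRIMITIVE_OBJECT) -> 1 (its closing T_VOID) -> 0 (outer closing T_VOID).
     // The T_VOID that is the second slot of the T_LONG does not decrement vt
     // because its predecessor is T_LONG, so the whole sequence counts as a
     // single interpreter argument.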
 728 
 729 
 730 static void gen_c2i_adapter_helper(MacroAssembler* masm,
 731                                    BasicType bt,
 732                                    BasicType prev_bt,
 733                                    size_t size_in_bytes,
 734                                    const VMRegPair& reg_pair,
 735                                    const Address& to,
 736                                    int extraspace,
 737                                    bool is_oop) {
 738   assert(bt != T_PRIMITIVE_OBJECT || !InlineTypePassFieldsAsArgs, "no inline type here");
 739   if (bt == T_VOID) {
 740     assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
 741     return;
 742   }
 743 
 744   // Say 4 args:
 745   // i   st_off
 746   // 0   32 T_LONG
 747   // 1   24 T_VOID
 748   // 2   16 T_OBJECT
 749   // 3    8 T_BOOL
 750   // -    0 return address
 751   //
 752   // However, to make things extra confusing: because we can fit a long/double in
 753   // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
 754   // leaves one slot empty and only stores to a single slot. In this case the
 755   // slot that is occupied is the T_VOID slot. See, I said it was confusing.
 756 
 757   bool wide = (size_in_bytes == wordSize);
 758   VMReg r_1 = reg_pair.first();
 759   VMReg r_2 = reg_pair.second();
 760   assert(r_2->is_valid() == wide, "invalid size");
 761   if (!r_1->is_valid()) {
 762     assert(!r_2->is_valid(), "must be invalid");
 763     return;
 764   }
 765 
 766   if (!r_1->is_XMMRegister()) {
 767     Register val = rax;
 768     if (r_1->is_stack()) {
 769       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 770       __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
 771     } else {
 772       val = r_1->as_Register();
 773     }
 774     assert_different_registers(to.base(), val, rscratch1);
 775     if (is_oop) {
 776       __ push(r13);
 777       __ push(rbx);
 778       __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
 779       __ pop(rbx);
 780       __ pop(r13);
 781     } else {
 782       __ store_sized_value(to, val, size_in_bytes);
 783     }
 784   } else {
 785     if (wide) {
 786       __ movdbl(to, r_1->as_XMMRegister());
 787     } else {
 788       __ movflt(to, r_1->as_XMMRegister());
 789     }
 790   }
 791 }
 792 
 793 static void gen_c2i_adapter(MacroAssembler *masm,
 794                             const GrowableArray<SigEntry>* sig_extended,
 795                             const VMRegPair *regs,
 796                             bool requires_clinit_barrier,
 797                             address& c2i_no_clinit_check_entry,
 798                             Label& skip_fixup,
 799                             address start,
 800                             OopMapSet* oop_maps,
 801                             int& frame_complete,
 802                             int& frame_size_in_words,
 803                             bool alloc_inline_receiver) {
 804   if (requires_clinit_barrier && VM_Version::supports_fast_class_init_checks()) {
 805     Label L_skip_barrier;
 806     Register method = rbx;
 807 
 808     { // Bypass the barrier for non-static methods
 809       Register flags = rscratch1;
 810       __ movl(flags, Address(method, Method::access_flags_offset()));
 811       __ testl(flags, JVM_ACC_STATIC);
 812       __ jcc(Assembler::zero, L_skip_barrier); // non-static
 813     }
 814 
 815     Register klass = rscratch1;
 816     __ load_method_holder(klass, method);
 817     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
 818 
 819     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
 820 
 821     __ bind(L_skip_barrier);
 822     c2i_no_clinit_check_entry = __ pc();
 823   }
 824 
 825   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 826   bs->c2i_entry_barrier(masm);
 827 
 828   // Before we get into the guts of the C2I adapter, see if we should be here
 829   // at all.  We've come from compiled code and are attempting to jump to the
 830   // interpreter, which means the caller made a static call to get here
 831   // (vcalls always get a compiled target if there is one).  Check for a
 832   // compiled target.  If there is one, we need to patch the caller's call.
 833   patch_callers_callsite(masm);
 834 
 835   __ bind(skip_fixup);
 836 
 837   if (InlineTypePassFieldsAsArgs) {
 838     // Is there an inline type argument?
 839     bool has_inline_argument = false;
 840     for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
 841       has_inline_argument = (sig_extended->at(i)._bt == T_PRIMITIVE_OBJECT);
 842     }
 843     if (has_inline_argument) {
 844       // There is at least one inline type argument: we're coming from
 845       // compiled code so we have no buffers to back the inline types.
 846       // Allocate the buffers here with a runtime call.
 847       OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
 848 
 849       frame_complete = __ offset();
 850 
 851       __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
 852 
 853       __ mov(c_rarg0, r15_thread);
 854       __ mov(c_rarg1, rbx);
 855       __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
 856       __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
 857 
 858       oop_maps->add_gc_map((int)(__ pc() - start), map);
 859       __ reset_last_Java_frame(false);
 860 
 861       RegisterSaver::restore_live_registers(masm);
 862 
 863       Label no_exception;
 864       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
 865       __ jcc(Assembler::equal, no_exception);
 866 
 867       __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
 868       __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 869       __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 870 
 871       __ bind(no_exception);
 872 
 873       // We get an array of objects from the runtime call
 874       __ get_vm_result(rscratch2, r15_thread); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
 875       __ get_vm_result_2(rbx, r15_thread); // TODO: required to keep the callee Method live?
 876     }
 877   }
 878 
 879   // Since all args are passed on the stack, total_args_passed *
 880   // Interpreter::stackElementSize is the space we need.
 881   int total_args_passed = compute_total_args_passed_int(sig_extended);
 882   assert(total_args_passed >= 0, "total_args_passed is %d", total_args_passed);
 883 
 884   int extraspace = (total_args_passed * Interpreter::stackElementSize);
 885 
 886   // stack is aligned, keep it that way
 887   // This is not currently needed or enforced by the interpreter, but
 888   // we might as well conform to the ABI.
 889   extraspace = align_up(extraspace, 2*wordSize);
 890 
 891   // set senderSP value
 892   __ lea(r13, Address(rsp, wordSize));
 893 
 894 #ifdef ASSERT
 895   __ check_stack_alignment(r13, "sender stack not aligned");
 896 #endif
 897   if (extraspace > 0) {
 898     // Pop the return address
 899     __ pop(rax);
 900 
 901     __ subptr(rsp, extraspace);
 902 
 903     // Push the return address
 904     __ push(rax);
 905 
 906     // Account for the return address location since we store it first rather
 907     // than hold it in a register across all the shuffling
 908     extraspace += wordSize;
 909   }
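       // A hedged arithmetic example of the space computation above: with
       // total_args_passed == 3 and Interpreter::stackElementSize == 8 (one word
       // per slot on x86_64), extraspace starts at 24 bytes and is aligned up to
       // 32; re-pushing the return address then grows it by another wordSize, so
       // the ld_off computations in gen_c2i_adapter_helper (which add extraspace)
       // still find the incoming compiled stack args above the re-pushed return
       // address.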
 910 
 911 #ifdef ASSERT
 912   __ check_stack_alignment(rsp, "callee stack not aligned", wordSize, rax);
 913 #endif
 914 
 915   // Now write the args into the outgoing interpreter space
 916 
 917   // next_arg_comp is the next argument from the compiler point of
 918   // view (inline type fields are passed in registers/on the stack). In
 919   // sig_extended, an inline type argument starts with: T_PRIMITIVE_OBJECT,
 920   // followed by the types of the fields of the inline type and T_VOID
 921   // to mark the end of the inline type. ignored counts the number of
 922   // T_PRIMITIVE_OBJECT/T_VOID. next_vt_arg is the next inline type argument:
 923   // used to get the buffer for that argument from the pool of buffers
 924   // we allocated above and want to pass to the
 925   // interpreter. next_arg_int is the next argument from the
 926   // interpreter point of view (inline types are passed by reference).
 927   for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
 928        next_arg_comp < sig_extended->length(); next_arg_comp++) {
 929     assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
 930     assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
 931     BasicType bt = sig_extended->at(next_arg_comp)._bt;
 932     int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
 933     if (!InlineTypePassFieldsAsArgs || bt != T_PRIMITIVE_OBJECT) {
 934       int next_off = st_off - Interpreter::stackElementSize;
 935       const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
 936       const VMRegPair reg_pair = regs[next_arg_comp-ignored];
 937       size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
 938       gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 939                              size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
 940       next_arg_int++;
 941 #ifdef ASSERT
 942       if (bt == T_LONG || bt == T_DOUBLE) {
 943         // Overwrite the unused slot with known junk
 944         __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 945         __ movptr(Address(rsp, st_off), rax);
 946       }
 947 #endif /* ASSERT */
 948     } else {
 949       ignored++;
 950       // get the buffer from the just allocated pool of buffers
 951       int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_PRIMITIVE_OBJECT);
 952       __ load_heap_oop(r14, Address(rscratch2, index));
 953       next_vt_arg++; next_arg_int++;
 954       int vt = 1;
 955       // write fields we get from compiled code in registers/stack
 956       // slots to the buffer: we know we are done with that inline type
 957       // argument when we hit the T_VOID that acts as an end of inline
 958       // type delimiter for this inline type. Inline types are flattened
 959       // so we might encounter embedded inline types. Each entry in
 960       // sig_extended contains a field offset in the buffer.
 961       Label L_null;
 962       do {
 963         next_arg_comp++;
 964         BasicType bt = sig_extended->at(next_arg_comp)._bt;
 965         BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
 966         if (bt == T_PRIMITIVE_OBJECT) {
 967           vt++;
 968           ignored++;
 969         } else if (bt == T_VOID &&
 970                    prev_bt != T_LONG &&
 971                    prev_bt != T_DOUBLE) {
 972           vt--;
 973           ignored++;
 974         } else {
 975           int off = sig_extended->at(next_arg_comp)._offset;
 976           if (off == -1) {
 977             // Nullable inline type argument, emit null check
 978             VMReg reg = regs[next_arg_comp-ignored].first();
 979             Label L_notNull;
 980             if (reg->is_stack()) {
 981               int ld_off = reg->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 982               __ testb(Address(rsp, ld_off), 1);
 983             } else {
 984               __ testb(reg->as_Register(), 1);
 985             }
 986             __ jcc(Assembler::notZero, L_notNull);
 987             __ movptr(Address(rsp, st_off), 0);
 988             __ jmp(L_null);
 989             __ bind(L_notNull);
 990             continue;
 991           }
 992           assert(off > 0, "offset in object should be positive");
 993           size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
 994           bool is_oop = is_reference_type(bt);
 995           gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 996                                  size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
 997         }
 998       } while (vt != 0);
 999       // pass the buffer to the interpreter
1000       __ movptr(Address(rsp, st_off), r14);
1001       __ bind(L_null);
1002     }
1003   }
1004 
1005   // Schedule the branch target address early.
1006   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
1007   __ jmp(rcx);
1008 }
1009 
1010 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
1011                         address code_start, address code_end,
1012                         Label& L_ok) {
1013   Label L_fail;
1014   __ lea(temp_reg, ExternalAddress(code_start));
1015   __ cmpptr(pc_reg, temp_reg);
1016   __ jcc(Assembler::belowEqual, L_fail);
1017   __ lea(temp_reg, ExternalAddress(code_end));
1018   __ cmpptr(pc_reg, temp_reg);
1019   __ jcc(Assembler::below, L_ok);
1020   __ bind(L_fail);
1021 }
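     // Note on the helper above: it jumps to L_ok only when
     // code_start < pc_reg < code_end; both failure cases fall through past
     // L_fail, so the caller can emit the failure path (the __ stop() after the
     // range checks) right after the call.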
1022 
1023 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
1024                                     int comp_args_on_stack,
1025                                     const GrowableArray<SigEntry>* sig,
1026                                     const VMRegPair *regs) {
1027 
1028   // Note: r13 contains the senderSP on entry. We must preserve it since
1029   // we may do an i2c -> c2i transition if we lose a race where compiled
1030   // code goes non-entrant while we get args ready.
1031   // In addition we use r13 to locate all the interpreter args as
1032   // we must align the stack to 16 bytes on an i2c entry else we
1033   // lose alignment we expect in all compiled code and register
1034   // save code can segv when fxsave instructions find improperly
1035   // aligned stack pointer.
1036 
1037   // Adapters can be frameless because they do not require the caller
1038   // to perform additional cleanup work, such as correcting the stack pointer.
1039   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
1040   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
1041   // even if a callee has modified the stack pointer.
1042   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
1043   // routinely repairs its caller's stack pointer (from sender_sp, which is set
1044   // up via the senderSP register).
1045   // In other words, if *either* the caller or callee is interpreted, we can
1046   // get the stack pointer repaired after a call.
1047   // This is why c2i and i2c adapters cannot be indefinitely composed.
1048   // In particular, if a c2i adapter were to somehow call an i2c adapter,
1049   // both caller and callee would be compiled methods, and neither would
1050   // clean up the stack pointer changes performed by the two adapters.
1051   // If this happens, control eventually transfers back to the compiled
1052   // caller, but with an uncorrected stack, causing delayed havoc.
1053 
1054   if (VerifyAdapterCalls &&
1055       (Interpreter::code() != nullptr || StubRoutines::final_stubs_code() != nullptr)) {
1056     // So, let's test for cascading c2i/i2c adapters right now.
1057     //  assert(Interpreter::contains($return_addr) ||
1058     //         StubRoutines::contains($return_addr),
1059     //         "i2c adapter must return to an interpreter frame");
1060     __ block_comment("verify_i2c { ");
1061     // Pick up the return address
1062     __ movptr(rax, Address(rsp, 0));
1063     Label L_ok;
1064     if (Interpreter::code() != nullptr) {
1065       range_check(masm, rax, r11,
1066                   Interpreter::code()->code_start(),
1067                   Interpreter::code()->code_end(),
1068                   L_ok);
1069     }
1070     if (StubRoutines::initial_stubs_code() != nullptr) {
1071       range_check(masm, rax, r11,
1072                   StubRoutines::initial_stubs_code()->code_begin(),
1073                   StubRoutines::initial_stubs_code()->code_end(),
1074                   L_ok);
1075     }
1076     if (StubRoutines::final_stubs_code() != nullptr) {
1077       range_check(masm, rax, r11,
1078                   StubRoutines::final_stubs_code()->code_begin(),
1079                   StubRoutines::final_stubs_code()->code_end(),
1080                   L_ok);
1081     }
1082     const char* msg = "i2c adapter must return to an interpreter frame";
1083     __ block_comment(msg);
1084     __ stop(msg);
1085     __ bind(L_ok);
1086     __ block_comment("} verify_i2c ");
1087   }
1088 
1089   // Must preserve original SP for loading incoming arguments because
1090   // we need to align the outgoing SP for compiled code.
1091   __ movptr(r11, rsp);
1092 
1093   // Pick up the return address
1094   __ pop(rax);
1095 
1096   // Convert 4-byte c2 stack slots to words.
1097   int comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
1098 
1099   if (comp_args_on_stack) {
1100     __ subptr(rsp, comp_words_on_stack * wordSize);
1101   }
1102 
1103   // Ensure compiled code always sees stack at proper alignment
1104   __ andptr(rsp, -16);
1105 
1106   // Push the return address and misalign the stack so that the youngest frame sees
1107   // the same layout it would right after a call instruction
1108   __ push(rax);
1109 
1110   // Put saved SP in another register
1111   const Register saved_sp = rax;
1112   __ movptr(saved_sp, r11);
1113 
1114   // Will jump to the compiled code just as if compiled code was doing it.
1115   // Pre-load the register-jump target early, to schedule it better.
1116   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1117 
1118 #if INCLUDE_JVMCI
1119   if (EnableJVMCI) {
1120     // check if this call should be routed towards a specific entry point
1121     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1122     Label no_alternative_target;
1123     __ jcc(Assembler::equal, no_alternative_target);
1124     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1125     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1126     __ bind(no_alternative_target);
1127   }
1128 #endif // INCLUDE_JVMCI
1129 
1130   int total_args_passed = sig->length();
1131 
1132   // Now generate the shuffle code.  Pick up all register args and move the
1133   // rest through the floating point stack top.
1134   for (int i = 0; i < total_args_passed; i++) {
1135     BasicType bt = sig->at(i)._bt;
1136     assert(bt != T_PRIMITIVE_OBJECT, "i2c adapter doesn't unpack inline type args");
1137     if (bt == T_VOID) {
1138       // Longs and doubles are passed in native word order, but misaligned
1139       // in the 32-bit build.
1140       BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1141       assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1142       continue;
1143     }
1144 
1145     // Pick up 0, 1 or 2 words from SP+offset.
1146 
1147     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1148             "scrambled load targets?");
1149     // Load in argument order going down.
1150     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1151     // Point to interpreter value (vs. tag)
1152     int next_off = ld_off - Interpreter::stackElementSize;
1153     //
1154     //
1155     //
1156     VMReg r_1 = regs[i].first();
1157     VMReg r_2 = regs[i].second();
1158     if (!r_1->is_valid()) {
1159       assert(!r_2->is_valid(), "");
1160       continue;
1161     }
1162     if (r_1->is_stack()) {
1163       // Convert stack slot to an SP offset (+ wordSize to account for return address )
1164       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1165 
1166       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
1167       // and if we end up going through a c2i because of a miss, a reasonable value of r13
1168       // will be generated.
1169       if (!r_2->is_valid()) {
1170         // sign extend???
1171         __ movl(r13, Address(saved_sp, ld_off));
1172         __ movptr(Address(rsp, st_off), r13);
1173       } else {
1174         //
1175         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
1176         // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
1177         // so we must adjust where to pick up the data to match the interpreter.
1178         //
1179         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
1180         // are addressed going downward, so the LSW ends up at the LOW address
1181 
1182         // ld_off is MSW so get LSW
1183         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1184                            next_off : ld_off;
1185         __ movq(r13, Address(saved_sp, offset));
1186         // st_off is LSW (i.e. reg.first())
1187         __ movq(Address(rsp, st_off), r13);
1188       }
1189     } else if (r_1->is_Register()) {  // Register argument
1190       Register r = r_1->as_Register();
1191       assert(r != rax, "must be different");
1192       if (r_2->is_valid()) {
1193         //
1194         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
1195         // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
1196         // so we must adjust where to pick up the data to match the interpreter.
1197 
1198         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1199                            next_off : ld_off;
1200 
1201         // this can be a misaligned move
1202         __ movq(r, Address(saved_sp, offset));
1203       } else {
1204         // sign extend and use a full word?
1205         __ movl(r, Address(saved_sp, ld_off));
1206       }
1207     } else {
1208       if (!r_2->is_valid()) {
1209         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1210       } else {
1211         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1212       }
1213     }
1214   }
1215 
1216   __ push_cont_fastpath(); // Set JavaThread::_cont_fastpath to the sp of the oldest interpreted frame we know about
1217 
1218   // 6243940 We might end up in handle_wrong_method if
1219   // the callee is deoptimized as we race thru here. If that
1220   // happens we don't want to take a safepoint because the
1221   // caller frame will look interpreted and arguments are now
1222   // "compiled" so it is much better to make this transition
1223   // invisible to the stack walking code. Unfortunately if
1224   // we try and find the callee by normal means a safepoint
1225   // is possible. So we stash the desired callee in the thread
1226   // and the VM will find it there should this case occur.
1227 
1228   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1229 
1230   // Put the Method* where a c2i would expect it, should we end up there.
1231   // Only needed because c2 resolve stubs return the Method* as a result in
1232   // rax.
1233   __ mov(rax, rbx);
1234   __ jmp(r11);
1235 }
1236 
1237 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1238   Label ok;
1239 
1240   Register holder = rax;
1241   Register receiver = j_rarg0;
1242   Register temp = rbx;
1243 
1244   __ load_klass(temp, receiver, rscratch1);
1245   __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
1246   __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
1247   __ jcc(Assembler::equal, ok);
1248   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1249 
1250   __ bind(ok);
1251   // The method might have been compiled since the call site was patched to
1252   // interpreted; if that is the case, treat it as a miss so we can get
1253   // the call site corrected.
1254   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), NULL_WORD);
1255   __ jcc(Assembler::equal, skip_fixup);
1256   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1257 }
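     // Note on the helper above: on entry rax holds the CompiledICHolder
     // installed by the inline cache and j_rarg0 holds the receiver. When the
     // receiver klass matches, rbx is left holding the callee Method* (loaded
     // from holder_metadata_offset()), which is what the c2i code following
     // skip_fixup expects.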
1258 
1259 // ---------------------------------------------------------------
1260 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1261                                                             int comp_args_on_stack,
1262                                                             const GrowableArray<SigEntry>* sig,
1263                                                             const VMRegPair* regs,
1264                                                             const GrowableArray<SigEntry>* sig_cc,
1265                                                             const VMRegPair* regs_cc,
1266                                                             const GrowableArray<SigEntry>* sig_cc_ro,
1267                                                             const VMRegPair* regs_cc_ro,
1268                                                             AdapterFingerPrint* fingerprint,
1269                                                             AdapterBlob*& new_adapter,
1270                                                             bool allocate_code_blob) {
1271   address i2c_entry = __ pc();
1272   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1273 
1274   // -------------------------------------------------------------------------
1275   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1276   // to the interpreter.  The args start out packed in the compiled layout.  They
1277   // need to be unpacked into the interpreter layout.  This will almost always
1278   // require some stack space.  We grow the current (compiled) stack, then repack
1279   // the args.  We finally end in a jump to the generic interpreter entry point.
1280   // On exit from the interpreter, the interpreter will restore our SP (lest the
1281   // compiled code, which relies solely on SP and not RBP, get sick).
1282 
1283   address c2i_unverified_entry        = __ pc();
1284   address c2i_unverified_inline_entry = __ pc();
1285   Label skip_fixup;
1286 
1287   gen_inline_cache_check(masm, skip_fixup);
1288 
1289   OopMapSet* oop_maps = new OopMapSet();
1290   int frame_complete = CodeOffsets::frame_never_safe;
1291   int frame_size_in_words = 0;
1292 
1293   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1294   address c2i_no_clinit_check_entry = nullptr;
1295   address c2i_inline_ro_entry = __ pc();
1296   if (regs_cc != regs_cc_ro) {
1297     // No class init barrier needed because method is guaranteed to be non-static
1298     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, /* requires_clinit_barrier = */ false, c2i_no_clinit_check_entry,
1299                     skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1300     skip_fixup.reset();
1301   }
1302 
1303   // Scalarized c2i adapter
1304   address c2i_entry        = __ pc();
1305   address c2i_inline_entry = __ pc();
1306   gen_c2i_adapter(masm, sig_cc, regs_cc, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1307                   skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ true);
1308 
1309   // Non-scalarized c2i adapter
1310   if (regs != regs_cc) {
1311     c2i_unverified_inline_entry = __ pc();
1312     Label inline_entry_skip_fixup;
1313     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1314 
1315     c2i_inline_entry = __ pc();
1316     gen_c2i_adapter(masm, sig, regs, /* requires_clinit_barrier = */ true, c2i_no_clinit_check_entry,
1317                     inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, /* alloc_inline_receiver = */ false);
1318   }
1319 
1320 
1321   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1322   // the GC knows about the locations of oop arguments passed to the c2i adapter.
1323   if (allocate_code_blob) {
1324     bool caller_must_gc_arguments = (regs != regs_cc);
1325     new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1326   }
1327 
1328   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry, c2i_unverified_inline_entry, c2i_no_clinit_check_entry);
1329 }
1330 
1331 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1332                                          VMRegPair *regs,
1333                                          VMRegPair *regs2,
1334                                          int total_args_passed) {
1335   assert(regs2 == nullptr, "not needed on x86");
1336 // We return the number of VMRegImpl stack slots we need to reserve for all
1337 // the arguments NOT counting out_preserve_stack_slots.
1338 
1339 // NOTE: These arrays will have to change when c1 is ported
1340 #ifdef _WIN64
1341     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1342       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1343     };
1344     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1345       c_farg0, c_farg1, c_farg2, c_farg3
1346     };
1347 #else
1348     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1349       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1350     };
1351     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1352       c_farg0, c_farg1, c_farg2, c_farg3,
1353       c_farg4, c_farg5, c_farg6, c_farg7
1354     };
1355 #endif // _WIN64
1356 
1357 
1358     uint int_args = 0;
1359     uint fp_args = 0;
1360     uint stk_args = 0; // inc by 2 each time
1361 
1362     for (int i = 0; i < total_args_passed; i++) {
1363       switch (sig_bt[i]) {
1364       case T_BOOLEAN:
1365       case T_CHAR:
1366       case T_BYTE:
1367       case T_SHORT:
1368       case T_INT:
1369         if (int_args < Argument::n_int_register_parameters_c) {
1370           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1371 #ifdef _WIN64
1372           fp_args++;
1373           // Allocate slots for callee to stuff register args on the stack.
1374           stk_args += 2;
1375 #endif
1376         } else {
1377           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1378           stk_args += 2;
1379         }
1380         break;
1381       case T_LONG:
1382         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1383         // fall through
1384       case T_OBJECT:
1385       case T_ARRAY:
1386       case T_PRIMITIVE_OBJECT:
1387       case T_ADDRESS:
1388       case T_METADATA:
1389         if (int_args < Argument::n_int_register_parameters_c) {
1390           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1391 #ifdef _WIN64
1392           fp_args++;
1393           stk_args += 2;
1394 #endif
1395         } else {
1396           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1397           stk_args += 2;
1398         }
1399         break;
1400       case T_FLOAT:
1401         if (fp_args < Argument::n_float_register_parameters_c) {
1402           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1403 #ifdef _WIN64
1404           int_args++;
1405           // Allocate slots for callee to stuff register args on the stack.
1406           stk_args += 2;
1407 #endif
1408         } else {
1409           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1410           stk_args += 2;
1411         }
1412         break;
1413       case T_DOUBLE:
1414         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1415         if (fp_args < Argument::n_float_register_parameters_c) {
1416           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1417 #ifdef _WIN64
1418           int_args++;
1419           // Allocate slots for callee to stuff register args on the stack.
1420           stk_args += 2;
1421 #endif
1422         } else {
1423           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1424           stk_args += 2;
1425         }
1426         break;
1427       case T_VOID: // Halves of longs and doubles
1428         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1429         regs[i].set_bad();
1430         break;
1431       default:
1432         ShouldNotReachHere();
1433         break;
1434       }
1435     }
1436 #ifdef _WIN64
1437   // The Windows ABI requires that we always allocate enough stack space
1438   // for 4 64-bit registers to be stored down.
1439   if (stk_args < 8) {
1440     stk_args = 8;
1441   }
1442 #endif // _WIN64
1443 
1444   return stk_args;
1445 }
1446 
1447 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1448                                              uint num_bits,
1449                                              uint total_args_passed) {
1450   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1451          "only certain vector sizes are supported for now");
1452 
1453   static const XMMRegister VEC_ArgReg[32] = {
1454      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1455      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1456     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1457     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1458   };
1459 
1460   uint stk_args = 0;
1461   uint fp_args = 0;
1462 
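       // Vector arguments are always passed in XMM/YMM/ZMM registers here; num_bits decides
       // how many consecutive VMReg slots each argument spans (no stack passing for now).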
1463   for (uint i = 0; i < total_args_passed; i++) {
1464     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1465     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1466     regs[i].set_pair(vmreg->next(next_val), vmreg);
1467   }
1468 
1469   return stk_args;
1470 }
1471 
1472 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1473   // We always ignore the frame_slots arg and just use the space just below the frame
1474   // pointer, which by this time is free to use.
1475   switch (ret_type) {
1476   case T_FLOAT:
1477     __ movflt(Address(rbp, -wordSize), xmm0);
1478     break;
1479   case T_DOUBLE:
1480     __ movdbl(Address(rbp, -wordSize), xmm0);
1481     break;
1482   case T_VOID:  break;
1483   default: {
1484     __ movptr(Address(rbp, -wordSize), rax);
1485     }
1486   }
1487 }
1488 
1489 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1490   // We always ignore the frame_slots arg and just use the space just below the frame
1491   // pointer, which by this time is free to use.
1492   switch (ret_type) {
1493   case T_FLOAT:
1494     __ movflt(xmm0, Address(rbp, -wordSize));
1495     break;
1496   case T_DOUBLE:
1497     __ movdbl(xmm0, Address(rbp, -wordSize));
1498     break;
1499   case T_VOID:  break;
1500   default: {
1501     __ movptr(rax, Address(rbp, -wordSize));
1502     }
1503   }
1504 }
1505 
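     // Spill the outgoing C argument registers around a runtime call; XMM args get a full
     // 16-byte stack slot so the matching restore_args() below can unwind them in reverse order.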
1506 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1507     for ( int i = first_arg ; i < arg_count ; i++ ) {
1508       if (args[i].first()->is_Register()) {
1509         __ push(args[i].first()->as_Register());
1510       } else if (args[i].first()->is_XMMRegister()) {
1511         __ subptr(rsp, 2*wordSize);
1512         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1513       }
1514     }
1515 }
1516 
1517 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1518     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1519       if (args[i].first()->is_Register()) {
1520         __ pop(args[i].first()->as_Register());
1521       } else if (args[i].first()->is_XMMRegister()) {
1522         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1523         __ addptr(rsp, 2*wordSize);
1524       }
1525     }
1526 }
1527 
1528 static void verify_oop_args(MacroAssembler* masm,
1529                             const methodHandle& method,
1530                             const BasicType* sig_bt,
1531                             const VMRegPair* regs) {
1532   Register temp_reg = rbx;  // not part of any compiled calling seq
1533   if (VerifyOops) {
1534     for (int i = 0; i < method->size_of_parameters(); i++) {
1535       if (is_reference_type(sig_bt[i])) {
1536         VMReg r = regs[i].first();
1537         assert(r->is_valid(), "bad oop arg");
1538         if (r->is_stack()) {
1539           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1540           __ verify_oop(temp_reg);
1541         } else {
1542           __ verify_oop(r->as_Register());
1543         }
1544       }
1545     }
1546   }
1547 }
1548 
1549 static void check_continuation_enter_argument(VMReg actual_vmreg,
1550                                               Register expected_reg,
1551                                               const char* name) {
1552   assert(!actual_vmreg->is_stack(), "%s cannot be on stack", name);
1553   assert(actual_vmreg->as_Register() == expected_reg,
1554          "%s is in unexpected register: %s instead of %s",
1555          name, actual_vmreg->as_Register()->name(), expected_reg->name());
1556 }
1557 
1558 
1559 //---------------------------- continuation_enter_setup ---------------------------
1560 //
1561 // Arguments:
1562 //   None.
1563 //
1564 // Results:
1565 //   rsp: pointer to blank ContinuationEntry
1566 //
1567 // Kills:
1568 //   rax
1569 //
1570 static OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
1571   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
1572   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
1573   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
1574 
1575   stack_slots += checked_cast<int>(ContinuationEntry::size()) / wordSize;
1576   __ subptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1577 
1578   int frame_size = (checked_cast<int>(ContinuationEntry::size()) + wordSize) / VMRegImpl::stack_slot_size;
1579   OopMap* map = new OopMap(frame_size, 0);
1580 
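       // Link the new ContinuationEntry into the thread: remember the previous entry as our
       // parent and publish rsp as the thread's current cont_entry.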
1581   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
1582   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
1583   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
1584 
1585   return map;
1586 }
1587 
1588 //---------------------------- fill_continuation_entry ---------------------------
1589 //
1590 // Arguments:
1591 //   rsp: pointer to blank Continuation entry
1592 //   reg_cont_obj: pointer to the continuation
1593 //   reg_flags: flags
1594 //
1595 // Results:
1596 //   rsp: pointer to filled out ContinuationEntry
1597 //
1598 // Kills:
1599 //   rax
1600 //
1601 static void fill_continuation_entry(MacroAssembler* masm, Register reg_cont_obj, Register reg_flags) {
1602   assert_different_registers(rax, reg_cont_obj, reg_flags);
1603 #ifdef ASSERT
1604   __ movl(Address(rsp, ContinuationEntry::cookie_offset()), ContinuationEntry::cookie_value());
1605 #endif
1606   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), reg_cont_obj);
1607   __ movl  (Address(rsp, ContinuationEntry::flags_offset()), reg_flags);
1608   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), 0);
1609   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), 0);
1610   __ movl(Address(rsp, ContinuationEntry::pin_count_offset()), 0);
1611 
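       // Preserve the caller's cont_fastpath and held monitor count in the entry, then clear
       // the thread-local copies so the new continuation starts with a clean slate.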
1612   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
1613   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
1614   __ movq(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
1615   __ movq(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
1616 
1617   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
1618   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), 0);
1619 }
1620 
1621 //---------------------------- continuation_enter_cleanup ---------------------------
1622 //
1623 // Arguments:
1624 //   rsp: pointer to the ContinuationEntry
1625 //
1626 // Results:
1627 //   rsp: pointer to the spilled rbp in the entry frame
1628 //
1629 // Kills:
1630 //   rbx
1631 //
1632 static void continuation_enter_cleanup(MacroAssembler* masm) {
1633 #ifdef ASSERT
1634   Label L_good_sp;
1635   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1636   __ jcc(Assembler::equal, L_good_sp);
1637   __ stop("Incorrect rsp at continuation_enter_cleanup");
1638   __ bind(L_good_sp);
1639 #endif
1640 
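       // Restore the caller's cont_fastpath and held monitor count from the entry, then pop
       // the entry by re-publishing its parent and releasing the stack space it occupied.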
1641   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
1642   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
1643   __ movq(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
1644   __ movq(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
1645 
1646   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_offset()));
1647   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rbx);
1648   __ addptr(rsp, checked_cast<int32_t>(ContinuationEntry::size()));
1649 }
1650 
1651 static void gen_continuation_enter(MacroAssembler* masm,
1652                                    const VMRegPair* regs,
1653                                    int& exception_offset,
1654                                    OopMapSet* oop_maps,
1655                                    int& frame_complete,
1656                                    int& stack_slots,
1657                                    int& interpreted_entry_offset,
1658                                    int& compiled_entry_offset) {
1659 
1660   // enterSpecial(Continuation c, boolean isContinue, boolean isVirtualThread)
1661   int pos_cont_obj   = 0;
1662   int pos_is_cont    = 1;
1663   int pos_is_virtual = 2;
1664 
1665   // The platform-specific calling convention may present the arguments in various registers.
1666   // To simplify the rest of the code, we expect the arguments to reside at these known
1667   // registers, and we additionally check the placement here in case the calling
1668   // convention ever changes.
1669   Register reg_cont_obj   = c_rarg1;
1670   Register reg_is_cont    = c_rarg2;
1671   Register reg_is_virtual = c_rarg3;
1672 
1673   check_continuation_enter_argument(regs[pos_cont_obj].first(),   reg_cont_obj,   "Continuation object");
1674   check_continuation_enter_argument(regs[pos_is_cont].first(),    reg_is_cont,    "isContinue");
1675   check_continuation_enter_argument(regs[pos_is_virtual].first(), reg_is_virtual, "isVirtualThread");
1676 
1677   // Utility methods kill rax, make sure there are no collisions
1678   assert_different_registers(rax, reg_cont_obj, reg_is_cont, reg_is_virtual);
1679 
1680   AddressLiteral resolve(SharedRuntime::get_resolve_static_call_stub(),
1681                          relocInfo::static_call_type);
1682 
1683   address start = __ pc();
1684 
1685   Label L_thaw, L_exit;
1686 
1687   // i2i entry used at interp_only_mode only
1688   interpreted_entry_offset = __ pc() - start;
1689   {
1690 #ifdef ASSERT
1691     Label is_interp_only;
1692     __ cmpb(Address(r15_thread, JavaThread::interp_only_mode_offset()), 0);
1693     __ jcc(Assembler::notEqual, is_interp_only);
1694     __ stop("enterSpecial interpreter entry called when not in interp_only_mode");
1695     __ bind(is_interp_only);
1696 #endif
1697 
1698     __ pop(rax); // return address
1699     // Read interpreter arguments into registers (this is an ad-hoc i2c adapter)
1700     __ movptr(c_rarg1, Address(rsp, Interpreter::stackElementSize*2));
1701     __ movl(c_rarg2,   Address(rsp, Interpreter::stackElementSize*1));
1702     __ movl(c_rarg3,   Address(rsp, Interpreter::stackElementSize*0));
1703     __ andptr(rsp, -16); // Ensure compiled code always sees stack at proper alignment
1704     __ push(rax); // return address
1705     __ push_cont_fastpath();
1706 
1707     __ enter();
1708 
1709     stack_slots = 2; // will be adjusted in setup
1710     OopMap* map = continuation_enter_setup(masm, stack_slots);
1711     // The frame is complete here, but we only record it for the compiled entry, so the frame would appear unsafe.
1712     // That's okay: at the very worst we'll miss an async sample, and we're in interp_only_mode anyway.
1713 
1714     __ verify_oop(reg_cont_obj);
1715 
1716     fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1717 
1718     // If continuation, call to thaw. Otherwise, resolve the call and exit.
1719     __ testptr(reg_is_cont, reg_is_cont);
1720     __ jcc(Assembler::notZero, L_thaw);
1721 
1722     // --- Resolve path
1723 
1724     // Make sure the call is patchable
1725     __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1726     // Emit stub for static call
1727     CodeBuffer* cbuf = masm->code_section()->outer();
1728     address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1729     if (stub == nullptr) {
1730       fatal("CodeCache is full at gen_continuation_enter");
1731     }
1732     __ call(resolve);
1733     oop_maps->add_gc_map(__ pc() - start, map);
1734     __ post_call_nop();
1735 
1736     __ jmp(L_exit);
1737   }
1738 
1739   // compiled entry
1740   __ align(CodeEntryAlignment);
1741   compiled_entry_offset = __ pc() - start;
1742   __ enter();
1743 
1744   stack_slots = 2; // will be adjusted in setup
1745   OopMap* map = continuation_enter_setup(masm, stack_slots);
1746 
1747   // Frame is now completed as far as size and linkage.
1748   frame_complete = __ pc() - start;
1749 
1750   __ verify_oop(reg_cont_obj);
1751 
1752   fill_continuation_entry(masm, reg_cont_obj, reg_is_virtual);
1753 
1754   // If isContinue, call to thaw. Otherwise, call Continuation.enter(Continuation c, boolean isContinue)
1755   __ testptr(reg_is_cont, reg_is_cont);
1756   __ jccb(Assembler::notZero, L_thaw);
1757 
1758   // --- call Continuation.enter(Continuation c, boolean isContinue)
1759 
1760   // Make sure the call is patchable
1761   __ align(BytesPerWord, __ offset() + NativeCall::displacement_offset);
1762 
1763   // Emit stub for static call
1764   CodeBuffer* cbuf = masm->code_section()->outer();
1765   address stub = CompiledStaticCall::emit_to_interp_stub(*cbuf, __ pc());
1766   if (stub == nullptr) {
1767     fatal("CodeCache is full at gen_continuation_enter");
1768   }
1769 
1770   // The call needs to be resolved. There's a special case for this in
1771   // SharedRuntime::find_callee_info_helper() which calls
1772   // LinkResolver::resolve_continuation_enter() which resolves the call to
1773   // Continuation.enter(Continuation c, boolean isContinue).
1774   __ call(resolve);
1775 
1776   oop_maps->add_gc_map(__ pc() - start, map);
1777   __ post_call_nop();
1778 
1779   __ jmpb(L_exit);
1780 
1781   // --- Thawing path
1782 
1783   __ bind(L_thaw);
1784 
1785   __ call(RuntimeAddress(StubRoutines::cont_thaw()));
1786 
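       // Record the pc right after the thaw call; ContinuationEntry uses this offset as the
       // return address back into enterSpecial for frames returning from the continuation.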
1787   ContinuationEntry::_return_pc_offset = __ pc() - start;
1788   oop_maps->add_gc_map(__ pc() - start, map->deep_copy());
1789   __ post_call_nop();
1790 
1791   // --- Normal exit (resolve/thawing)
1792 
1793   __ bind(L_exit);
1794 
1795   continuation_enter_cleanup(masm);
1796   __ pop(rbp);
1797   __ ret(0);
1798 
1799   // --- Exception handling path
1800 
1801   exception_offset = __ pc() - start;
1802 
1803   continuation_enter_cleanup(masm);
1804   __ pop(rbp);
1805 
1806   __ movptr(c_rarg0, r15_thread);
1807   __ movptr(c_rarg1, Address(rsp, 0)); // return address
1808 
1809   // rax still holds the original exception oop, save it before the call
1810   __ push(rax);
1811 
1812   __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), 2);
1813   __ movptr(rbx, rax);
1814 
1815   // Continue at exception handler:
1816   //   rax: exception oop
1817   //   rbx: exception handler
1818   //   rdx: exception pc
1819   __ pop(rax);
1820   __ verify_oop(rax);
1821   __ pop(rdx);
1822   __ jmp(rbx);
1823 }
1824 
1825 static void gen_continuation_yield(MacroAssembler* masm,
1826                                    const VMRegPair* regs,
1827                                    OopMapSet* oop_maps,
1828                                    int& frame_complete,
1829                                    int& stack_slots,
1830                                    int& compiled_entry_offset) {
1831   enum layout {
1832     rbp_off,
1833     rbpH_off,
1834     return_off,
1835     return_off2,
1836     framesize // inclusive of return address
1837   };
1838   stack_slots = framesize /  VMRegImpl::slots_per_word;
1839   assert(stack_slots == 2, "recheck layout");
1840 
1841   address start = __ pc();
1842   compiled_entry_offset = __ pc() - start;
1843   __ enter();
1844   address the_pc = __ pc();
1845 
1846   frame_complete = the_pc - start;
1847 
1848   // This nop must be exactly at the PC we push into the frame info.
1849   // We use this nop for fast CodeBlob lookup, so associate the OopMap
1850   // with it right away.
1851   __ post_call_nop();
1852   OopMap* map = new OopMap(framesize, 1);
1853   oop_maps->add_gc_map(frame_complete, map);
1854 
1855   __ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
1856   __ movptr(c_rarg0, r15_thread);
1857   __ movptr(c_rarg1, rsp);
1858   __ call_VM_leaf(Continuation::freeze_entry(), 2);
1859   __ reset_last_Java_frame(true);
1860 
1861   Label L_pinned;
1862 
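       // freeze_entry() returns zero on success; a non-zero result means the yield failed
       // (e.g. the continuation is pinned) and we return to the caller instead.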
1863   __ testptr(rax, rax);
1864   __ jcc(Assembler::notZero, L_pinned);
1865 
1866   __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
1867   continuation_enter_cleanup(masm);
1868   __ pop(rbp);
1869   __ ret(0);
1870 
1871   __ bind(L_pinned);
1872 
1873   // Pinned, return to caller
1874 
1875   // handle pending exception thrown by freeze
1876   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
1877   Label ok;
1878   __ jcc(Assembler::equal, ok);
1879   __ leave();
1880   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1881   __ bind(ok);
1882 
1883   __ leave();
1884   __ ret(0);
1885 }
1886 
1887 static void gen_special_dispatch(MacroAssembler* masm,
1888                                  const methodHandle& method,
1889                                  const BasicType* sig_bt,
1890                                  const VMRegPair* regs) {
1891   verify_oop_args(masm, method, sig_bt, regs);
1892   vmIntrinsics::ID iid = method->intrinsic_id();
1893 
1894   // Now write the args into the outgoing interpreter space
1895   bool     has_receiver   = false;
1896   Register receiver_reg   = noreg;
1897   int      member_arg_pos = -1;
1898   Register member_reg     = noreg;
1899   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1900   if (ref_kind != 0) {
1901     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1902     member_reg = rbx;  // known to be free at this point
1903     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1904   } else if (iid == vmIntrinsics::_invokeBasic) {
1905     has_receiver = true;
1906   } else if (iid == vmIntrinsics::_linkToNative) {
1907     member_arg_pos = method->size_of_parameters() - 1;  // trailing NativeEntryPoint argument
1908     member_reg = rbx;  // known to be free at this point
1909   } else {
1910     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1911   }
1912 
1913   if (member_reg != noreg) {
1914     // Load the member_arg into register, if necessary.
1915     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1916     VMReg r = regs[member_arg_pos].first();
1917     if (r->is_stack()) {
1918       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1919     } else {
1920       // no data motion is needed
1921       member_reg = r->as_Register();
1922     }
1923   }
1924 
1925   if (has_receiver) {
1926     // Make sure the receiver is loaded into a register.
1927     assert(method->size_of_parameters() > 0, "oob");
1928     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1929     VMReg r = regs[0].first();
1930     assert(r->is_valid(), "bad receiver arg");
1931     if (r->is_stack()) {
1932       // Porting note:  This assumes that compiled calling conventions always
1933       // pass the receiver oop in a register.  If this is not true on some
1934       // platform, pick a temp and load the receiver from stack.
1935       fatal("receiver always in a register");
1936       receiver_reg = j_rarg0;  // known to be free at this point
1937       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1938     } else {
1939       // no data motion is needed
1940       receiver_reg = r->as_Register();
1941     }
1942   }
1943 
1944   // Figure out which address we are really jumping to:
1945   MethodHandles::generate_method_handle_dispatch(masm, iid,
1946                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1947 }
1948 
1949 // ---------------------------------------------------------------------------
1950 // Generate a native wrapper for a given method.  The method takes arguments
1951 // in the Java compiled code convention, marshals them to the native
1952 // convention (handlizes oops, etc), transitions to native, makes the call,
1953 // returns to java state (possibly blocking), unhandlizes any result and
1954 // returns.
1955 //
1956 // Critical native functions are a shorthand for the use of
1957 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1958 // functions.  The wrapper is expected to unpack the arguments before
1959 // passing them to the callee. Critical native functions leave the state _in_Java,
1960 // since they cannot stop for GC.
1961 // Some other parts of JNI setup are skipped, like the tear down of the JNI handle
1962 // block and the check for pending exceptions, since it's impossible for them
1963 // to be thrown.
1964 //
1965 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1966                                                 const methodHandle& method,
1967                                                 int compile_id,
1968                                                 BasicType* in_sig_bt,
1969                                                 VMRegPair* in_regs,
1970                                                 BasicType ret_type) {
1971   if (method->is_continuation_native_intrinsic()) {
1972     int exception_offset = -1;
1973     OopMapSet* oop_maps = new OopMapSet();
1974     int frame_complete = -1;
1975     int stack_slots = -1;
1976     int interpreted_entry_offset = -1;
1977     int vep_offset = -1;
1978     if (method->is_continuation_enter_intrinsic()) {
1979       gen_continuation_enter(masm,
1980                              in_regs,
1981                              exception_offset,
1982                              oop_maps,
1983                              frame_complete,
1984                              stack_slots,
1985                              interpreted_entry_offset,
1986                              vep_offset);
1987     } else if (method->is_continuation_yield_intrinsic()) {
1988       gen_continuation_yield(masm,
1989                              in_regs,
1990                              oop_maps,
1991                              frame_complete,
1992                              stack_slots,
1993                              vep_offset);
1994     } else {
1995       guarantee(false, "Unknown Continuation native intrinsic");
1996     }
1997 
1998 #ifdef ASSERT
1999     if (method->is_continuation_enter_intrinsic()) {
2000       assert(interpreted_entry_offset != -1, "Must be set");
2001       assert(exception_offset != -1,         "Must be set");
2002     } else {
2003       assert(interpreted_entry_offset == -1, "Must be unset");
2004       assert(exception_offset == -1,         "Must be unset");
2005     }
2006     assert(frame_complete != -1,    "Must be set");
2007     assert(stack_slots != -1,       "Must be set");
2008     assert(vep_offset != -1,        "Must be set");
2009 #endif
2010 
2011     __ flush();
2012     nmethod* nm = nmethod::new_native_nmethod(method,
2013                                               compile_id,
2014                                               masm->code(),
2015                                               vep_offset,
2016                                               frame_complete,
2017                                               stack_slots,
2018                                               in_ByteSize(-1),
2019                                               in_ByteSize(-1),
2020                                               oop_maps,
2021                                               exception_offset);
2022     if (method->is_continuation_enter_intrinsic()) {
2023       ContinuationEntry::set_enter_code(nm, interpreted_entry_offset);
2024     } else if (method->is_continuation_yield_intrinsic()) {
2025       _cont_doYield_stub = nm;
2026     }
2027     return nm;
2028   }
2029 
2030   if (method->is_method_handle_intrinsic()) {
2031     vmIntrinsics::ID iid = method->intrinsic_id();
2032     intptr_t start = (intptr_t)__ pc();
2033     int vep_offset = ((intptr_t)__ pc()) - start;
2034     gen_special_dispatch(masm,
2035                          method,
2036                          in_sig_bt,
2037                          in_regs);
2038     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
2039     __ flush();
2040     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
2041     return nmethod::new_native_nmethod(method,
2042                                        compile_id,
2043                                        masm->code(),
2044                                        vep_offset,
2045                                        frame_complete,
2046                                        stack_slots / VMRegImpl::slots_per_word,
2047                                        in_ByteSize(-1),
2048                                        in_ByteSize(-1),
2049                                        nullptr);
2050   }
2051   address native_func = method->native_function();
2052   assert(native_func != nullptr, "must have function");
2053 
2054   // An OopMap for lock (and class if static)
2055   OopMapSet *oop_maps = new OopMapSet();
2056   intptr_t start = (intptr_t)__ pc();
2057 
2058   // We have received a description of where all the java args are located
2059   // on entry to the wrapper. We need to convert these args to where
2060   // the jni function will expect them. To figure out where they go
2061   // we convert the java signature to a C signature by inserting
2062   // the hidden arguments as arg[0] and possibly arg[1] (static method).
2063 
2064   const int total_in_args = method->size_of_parameters();
2065   int total_c_args = total_in_args + (method->is_static() ? 2 : 1);
2066 
2067   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
2068   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
2069   BasicType* in_elem_bt = nullptr;
2070 
2071   int argc = 0;
2072   out_sig_bt[argc++] = T_ADDRESS;
2073   if (method->is_static()) {
2074     out_sig_bt[argc++] = T_OBJECT;
2075   }
2076 
2077   for (int i = 0; i < total_in_args ; i++ ) {
2078     out_sig_bt[argc++] = in_sig_bt[i];
2079   }
2080 
2081   // Now figure out where the args must be stored and how much stack space
2082   // they require.
2083   int out_arg_slots;
2084   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, nullptr, total_c_args);
2085 
2086   // Compute framesize for the wrapper.  We need to handlize all oops in
2087   // incoming registers
2088 
2089   // Calculate the total number of stack slots we will need.
2090 
2091   // First count the abi requirement plus all of the outgoing args
2092   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
2093 
2094   // Now the space for the inbound oop handle area
2095   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
2096 
2097   int oop_handle_offset = stack_slots;
2098   stack_slots += total_save_slots;
2099 
2100   // Now any space we need for handlizing a klass if static method
2101 
2102   int klass_slot_offset = 0;
2103   int klass_offset = -1;
2104   int lock_slot_offset = 0;
2105   bool is_static = false;
2106 
2107   if (method->is_static()) {
2108     klass_slot_offset = stack_slots;
2109     stack_slots += VMRegImpl::slots_per_word;
2110     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
2111     is_static = true;
2112   }
2113 
2114   // Plus a lock if needed
2115 
2116   if (method->is_synchronized()) {
2117     lock_slot_offset = stack_slots;
2118     stack_slots += VMRegImpl::slots_per_word;
2119   }
2120 
2121   // Now a place (+2) to save return values or temp during shuffling
2122   // + 4 for return address (which we own) and saved rbp
2123   stack_slots += 6;
2124 
2125   // Ok The space we have allocated will look like:
2126   //
2127   //
2128   // FP-> |                     |
2129   //      |---------------------|
2130   //      | 2 slots for moves   |
2131   //      |---------------------|
2132   //      | lock box (if sync)  |
2133   //      |---------------------| <- lock_slot_offset
2134   //      | klass (if static)   |
2135   //      |---------------------| <- klass_slot_offset
2136   //      | oopHandle area      |
2137   //      |---------------------| <- oop_handle_offset (6 java arg registers)
2138   //      | outbound memory     |
2139   //      | based arguments     |
2140   //      |                     |
2141   //      |---------------------|
2142   //      |                     |
2143   // SP-> | out_preserved_slots |
2144   //
2145   //
2146 
2147 
2148   // Now compute the actual number of stack words we need, rounding to make
2149   // the stack properly aligned.
2150   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2151 
2152   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2153 
2154   // First thing make an ic check to see if we should even be here
2155 
2156   // We are free to use all registers as temps without saving them and
2157   // restoring them except rbp. rbp is the only callee save register
2158   // as far as the interpreter and the compiler(s) are concerned.
2159 
2160 
2161   const Register ic_reg = rax;
2162   const Register receiver = j_rarg0;
2163 
2164   Label hit;
2165   Label exception_pending;
2166 
2167   assert_different_registers(ic_reg, receiver, rscratch1, rscratch2);
2168   __ verify_oop(receiver);
2169   __ load_klass(rscratch1, receiver, rscratch2);
2170   __ cmpq(ic_reg, rscratch1);
2171   __ jcc(Assembler::equal, hit);
2172 
2173   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
2174 
2175   // Verified entry point must be aligned
2176   __ align(8);
2177 
2178   __ bind(hit);
2179 
2180   int vep_offset = ((intptr_t)__ pc()) - start;
2181 
2182   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2183     Label L_skip_barrier;
2184     Register klass = r10;
2185     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2186     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2187 
2188     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2189 
2190     __ bind(L_skip_barrier);
2191   }
2192 
2193 #ifdef COMPILER1
2194   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2195   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2196     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2197   }
2198 #endif // COMPILER1
2199 
2200   // The instruction at the verified entry point must be 5 bytes or longer
2201   // because it can be patched on the fly by make_non_entrant. The stack bang
2202   // instruction fits that requirement.
2203 
2204   // Generate stack overflow check
2205   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2206 
2207   // Generate a new frame for the wrapper.
2208   __ enter();
2209   // -2 because return address is already present and so is saved rbp
2210   __ subptr(rsp, stack_size - 2*wordSize);
2211 
2212   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2213   // native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
2214   bs->nmethod_entry_barrier(masm, nullptr /* slow_path */, nullptr /* continuation */);
2215 
2216   // Frame is now completed as far as size and linkage.
2217   int frame_complete = ((intptr_t)__ pc()) - start;
2218 
2219     if (UseRTMLocking) {
2220       // Abort RTM transaction before calling JNI
2221       // because critical section will be large and will be
2222       // aborted anyway. Also nmethod could be deoptimized.
2223       __ xabort(0);
2224     }
2225 
2226 #ifdef ASSERT
2227   __ check_stack_alignment(rsp, "improperly aligned stack");
2228 #endif /* ASSERT */
2229 
2230 
2231   // We use r14 as the oop handle for the receiver/klass
2232   // It is callee save so it survives the call to native
2233 
2234   const Register oop_handle_reg = r14;
2235 
2236   //
2237   // We immediately shuffle the arguments so that for any vm call we have to
2238   // make from here on out (sync slow path, jvmti, etc.) we will have
2239   // captured the oops from our caller and have a valid oopMap for
2240   // them.
2241 
2242   // -----------------
2243   // The Grand Shuffle
2244 
2245   // The Java calling convention is either equal to (linux) or denser than (win64) the
2246   // c calling convention. However, because of the jni_env argument the c calling
2247   // convention always has at least one more (and two for static) arguments than Java.
2248   // Therefore if we move the args from java -> c backwards then we will never have
2249   // a register->register conflict and we don't have to build a dependency graph
2250   // and figure out how to break any cycles.
2251   //
2252 
2253   // Record esp-based slot for receiver on stack for non-static methods
2254   int receiver_offset = -1;
2255 
2256   // This is a trick. We double the stack slots so we can claim
2257   // the oops in the caller's frame. Since we are sure to have
2258   // more args than the caller, doubling is enough to make
2259   // sure we can capture all the incoming oop args from the
2260   // caller.
2261   //
2262   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2263 
2264   // Mark location of rbp (someday)
2265   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2266 
2267   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2268   // All inbound args are referenced based on rbp and all outbound args via rsp.
2269 
2270 
2271 #ifdef ASSERT
2272   bool reg_destroyed[Register::number_of_registers];
2273   bool freg_destroyed[XMMRegister::number_of_registers];
2274   for ( int r = 0 ; r < Register::number_of_registers ; r++ ) {
2275     reg_destroyed[r] = false;
2276   }
2277   for ( int f = 0 ; f < XMMRegister::number_of_registers ; f++ ) {
2278     freg_destroyed[f] = false;
2279   }
2280 
2281 #endif /* ASSERT */
2282 
2283   // For JNI natives the incoming and outgoing registers are offset upwards.
2284   GrowableArray<int> arg_order(2 * total_in_args);
2285 
2286   VMRegPair tmp_vmreg;
2287   tmp_vmreg.set2(rbx->as_VMReg());
2288 
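       // Walk the Java args from last to first; each (i, c_arg) pair maps a Java arg to its
       // C calling-convention slot, and the backward order avoids clobbering unmoved args.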
2289   for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2290     arg_order.push(i);
2291     arg_order.push(c_arg);
2292   }
2293 
2294   int temploc = -1;
2295   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2296     int i = arg_order.at(ai);
2297     int c_arg = arg_order.at(ai + 1);
2298     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2299 #ifdef ASSERT
2300     if (in_regs[i].first()->is_Register()) {
2301       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2302     } else if (in_regs[i].first()->is_XMMRegister()) {
2303       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2304     }
2305     if (out_regs[c_arg].first()->is_Register()) {
2306       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2307     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2308       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2309     }
2310 #endif /* ASSERT */
2311     switch (in_sig_bt[i]) {
2312       case T_ARRAY:
2313       case T_PRIMITIVE_OBJECT:
2314       case T_OBJECT:
2315         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2316                     ((i == 0) && (!is_static)),
2317                     &receiver_offset);
2318         break;
2319       case T_VOID:
2320         break;
2321 
2322       case T_FLOAT:
2323         __ float_move(in_regs[i], out_regs[c_arg]);
2324         break;
2325 
2326       case T_DOUBLE:
2327         assert( i + 1 < total_in_args &&
2328                 in_sig_bt[i + 1] == T_VOID &&
2329                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2330         __ double_move(in_regs[i], out_regs[c_arg]);
2331         break;
2332 
2333       case T_LONG :
2334         __ long_move(in_regs[i], out_regs[c_arg]);
2335         break;
2336 
2337       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2338 
2339       default:
2340         __ move32_64(in_regs[i], out_regs[c_arg]);
2341     }
2342   }
2343 
2344   int c_arg;
2345 
2346   // Pre-load a static method's oop into r14.  Used both by locking code and
2347   // the normal JNI call code.
2348   // point c_arg at the first arg that is already loaded in case we
2349   // need to spill before we call out
2350   c_arg = total_c_args - total_in_args;
2351 
2352   if (method->is_static()) {
2353 
2354     //  load oop into a register
2355     __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2356 
2357     // Now handlize the static class mirror; it's known to be non-null.
2358     __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2359     map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2360 
2361     // Now get the handle
2362     __ lea(oop_handle_reg, Address(rsp, klass_offset));
2363     // store the klass handle as second argument
2364     __ movptr(c_rarg1, oop_handle_reg);
2365     // and protect the arg if we must spill
2366     c_arg--;
2367   }
2368 
2369   // Change state to native (we save the return address in the thread, since it might not
2370   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2371   // points into the right code segment. It does not have to be the correct return pc.
2372   // We use the same pc/oopMap repeatedly when we call out
2373 
2374   intptr_t the_pc = (intptr_t) __ pc();
2375   oop_maps->add_gc_map(the_pc - start, map);
2376 
2377   __ set_last_Java_frame(rsp, noreg, (address)the_pc, rscratch1);
2378 
2379 
2380   // We have all of the arguments set up at this point. We must not clobber any of the argument
2381   // registers from here on; saving/restoring them around VM calls is safe since they hold no raw oops.
2382 
2383   {
2384     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2385     // protect the args we've loaded
2386     save_args(masm, total_c_args, c_arg, out_regs);
2387     __ mov_metadata(c_rarg1, method());
2388     __ call_VM_leaf(
2389       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2390       r15_thread, c_rarg1);
2391     restore_args(masm, total_c_args, c_arg, out_regs);
2392   }
2393 
2394   // RedefineClasses() tracing support for obsolete method entry
2395   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2396     // protect the args we've loaded
2397     save_args(masm, total_c_args, c_arg, out_regs);
2398     __ mov_metadata(c_rarg1, method());
2399     __ call_VM_leaf(
2400       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2401       r15_thread, c_rarg1);
2402     restore_args(masm, total_c_args, c_arg, out_regs);
2403   }
2404 
2405   // Lock a synchronized method
2406 
2407   // Register definitions used by locking and unlocking
2408 
2409   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2410   const Register obj_reg  = rbx;  // Will contain the oop
2411   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2412   const Register old_hdr  = r13;  // value of old header at unlock time
2413 
2414   Label slow_path_lock;
2415   Label lock_done;
2416 
2417   if (method->is_synchronized()) {
2418     Label count_mon;
2419 
2420     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2421 
2422     // Get the handle (the 2nd argument)
2423     __ mov(oop_handle_reg, c_rarg1);
2424 
2425     // Get address of the box
2426 
2427     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2428 
2429     // Load the oop from the handle
2430     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2431 
2432     if (LockingMode == LM_MONITOR) {
2433       __ jmp(slow_path_lock);
2434     } else if (LockingMode == LM_LEGACY) {
2435       // Load immediate 1 into swap_reg %rax
2436       __ movl(swap_reg, 1);
2437 
2438       // Load (object->mark() | 1) into swap_reg %rax
2439       __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2440       if (EnableValhalla) {
2441         // Mask inline_type bit such that we go to the slow path if object is an inline type
2442         __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place));
2443       }
2444 
2445       // Save (object->mark() | 1) into BasicLock's displaced header
2446       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2447 
2448       // src -> dest iff dest == rax else rax <- dest
2449       __ lock();
2450       __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2451       __ jcc(Assembler::equal, count_mon);
2452 
2453       // Hmm should this move to the slow path code area???
2454 
2455       // Test if the oopMark is an obvious stack pointer, i.e.,
2456       //  1) (mark & 3) == 0, and
2457       //  2) rsp <= mark < mark + os::pagesize()
2458       // These 3 tests can be done by evaluating the following
2459       // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2460       // assuming both stack pointer and pagesize have their
2461       // least significant 2 bits clear.
2462       // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
2463 
2464       __ subptr(swap_reg, rsp);
2465       __ andptr(swap_reg, 3 - (int)os::vm_page_size());
2466 
2467       // Save the test result, for recursive case, the result is zero
2468       __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2469       __ jcc(Assembler::notEqual, slow_path_lock);
2470     } else {
2471       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2472       // Load object header
2473       __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2474       __ fast_lock_impl(obj_reg, swap_reg, r15_thread, rscratch1, slow_path_lock);
2475     }
2476     __ bind(count_mon);
2477     __ inc_held_monitor_count();
2478 
2479     // Slow path will re-enter here
2480     __ bind(lock_done);
2481   }
2482 
2483   // Finally just about ready to make the JNI call
2484 
2485   // get JNIEnv* which is first argument to native
2486   __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2487 
2488   // Now set thread in native
2489   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2490 
2491   __ call(RuntimeAddress(native_func));
2492 
2493   // Verify or restore cpu control state after JNI call
2494   __ restore_cpu_control_state_after_jni(rscratch1);
2495 
2496   // Unpack native results.
2497   switch (ret_type) {
2498   case T_BOOLEAN: __ c2bool(rax);            break;
2499   case T_CHAR   : __ movzwl(rax, rax);      break;
2500   case T_BYTE   : __ sign_extend_byte (rax); break;
2501   case T_SHORT  : __ sign_extend_short(rax); break;
2502   case T_INT    : /* nothing to do */        break;
2503   case T_DOUBLE :
2504   case T_FLOAT  :
2505     // Result is in xmm0; we'll save it as needed
2506     break;
2507   case T_ARRAY:                 // Really a handle
2508   case T_PRIMITIVE_OBJECT:           // Really a handle
2509   case T_OBJECT:                // Really a handle
2510       break; // can't de-handlize until after safepoint check
2511   case T_VOID: break;
2512   case T_LONG: break;
2513   default       : ShouldNotReachHere();
2514   }
2515 
2516   Label after_transition;
2517 
2518   // Switch thread to "native transition" state before reading the synchronization state.
2519   // This additional state is necessary because reading and testing the synchronization
2520   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2521   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2522   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2523   //     Thread A is resumed to finish this native method, but doesn't block here since it
2524   //     didn't see any synchronization in progress, and escapes.
2525   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2526 
2527   // Force this write out before the read below
2528   if (!UseSystemMemoryBarrier) {
2529     __ membar(Assembler::Membar_mask_bits(
2530               Assembler::LoadLoad | Assembler::LoadStore |
2531               Assembler::StoreLoad | Assembler::StoreStore));
2532   }
2533 
2534   // check for safepoint operation in progress and/or pending suspend requests
2535   {
2536     Label Continue;
2537     Label slow_path;
2538 
2539     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2540 
2541     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2542     __ jcc(Assembler::equal, Continue);
2543     __ bind(slow_path);
2544 
2545     // Don't use call_VM as it will see a possible pending exception and forward it
2546     // and never return here, preventing us from clearing _last_native_pc down below.
2547     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2548     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2549     // by hand.
2550     //
2551     __ vzeroupper();
2552     save_native_result(masm, ret_type, stack_slots);
2553     __ mov(c_rarg0, r15_thread);
2554     __ mov(r12, rsp); // remember sp
2555     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2556     __ andptr(rsp, -16); // align stack as required by ABI
2557     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2558     __ mov(rsp, r12); // restore sp
2559     __ reinit_heapbase();
2560     // Restore any method result value
2561     restore_native_result(masm, ret_type, stack_slots);
2562     __ bind(Continue);
2563   }
2564 
2565   // change thread state
2566   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2567   __ bind(after_transition);
2568 
2569   Label reguard;
2570   Label reguard_done;
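       // If the yellow/reserved guard pages were disabled while we were in native code,
       // take the out-of-line path to reguard the stack before returning to Java.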
2571   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2572   __ jcc(Assembler::equal, reguard);
2573   __ bind(reguard_done);
2574 
2575   // native result if any is live
2576 
2577   // Unlock
2578   Label slow_path_unlock;
2579   Label unlock_done;
2580   if (method->is_synchronized()) {
2581 
2582     Label fast_done;
2583 
2584     // Get locked oop from the handle we passed to jni
2585     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2586 
2587     if (LockingMode == LM_LEGACY) {
2588       Label not_recur;
2589       // Simple recursive lock?
2590       __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), NULL_WORD);
2591       __ jcc(Assembler::notEqual, not_recur);
2592       __ dec_held_monitor_count();
2593       __ jmpb(fast_done);
2594       __ bind(not_recur);
2595     }
2596 
2597     // Must save rax if it is live now because cmpxchg must use it
2598     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2599       save_native_result(masm, ret_type, stack_slots);
2600     }
2601 
2602     if (LockingMode == LM_MONITOR) {
2603       __ jmp(slow_path_unlock);
2604     } else if (LockingMode == LM_LEGACY) {
2605       // get address of the stack lock
2606       __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2607       //  get old displaced header
2608       __ movptr(old_hdr, Address(rax, 0));
2609 
2610       // Atomic swap old header if oop still contains the stack lock
2611       __ lock();
2612       __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2613       __ jcc(Assembler::notEqual, slow_path_unlock);
2614       __ dec_held_monitor_count();
2615     } else {
2616       assert(LockingMode == LM_LIGHTWEIGHT, "must be");
2617       __ movptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2618       __ andptr(swap_reg, ~(int32_t)markWord::lock_mask_in_place);
2619       __ fast_unlock_impl(obj_reg, swap_reg, lock_reg, slow_path_unlock);
2620       __ dec_held_monitor_count();
2621     }
2622 
2623     // slow path re-enters here
2624     __ bind(unlock_done);
2625     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2626       restore_native_result(masm, ret_type, stack_slots);
2627     }
2628 
2629     __ bind(fast_done);
2630   }
2631   {
2632     SkipIfEqual skip(masm, &DTraceMethodProbes, false, rscratch1);
2633     save_native_result(masm, ret_type, stack_slots);
2634     __ mov_metadata(c_rarg1, method());
2635     __ call_VM_leaf(
2636          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2637          r15_thread, c_rarg1);
2638     restore_native_result(masm, ret_type, stack_slots);
2639   }
2640 
2641   __ reset_last_Java_frame(false);
2642 
2643   // Unbox oop result, e.g. JNIHandles::resolve value.
2644   if (is_reference_type(ret_type)) {
2645     __ resolve_jobject(rax /* value */,
2646                        r15_thread /* thread */,
2647                        rcx /* tmp */);
2648   }
2649 
2650   if (CheckJNICalls) {
2651     // clear_pending_jni_exception_check
2652     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2653   }
2654 
2655   // reset handle block
2656   __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2657   __ movl(Address(rcx, JNIHandleBlock::top_offset()), NULL_WORD);
2658 
2659   // pop our frame
2660 
2661   __ leave();
2662 
2663   // Any exception pending?
2664   __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2665   __ jcc(Assembler::notEqual, exception_pending);
2666 
2667   // Return
2668 
2669   __ ret(0);
2670 
2671   // Unexpected paths are out of line and go here
2672 
2673   // forward the exception
2674   __ bind(exception_pending);
2675 
2676   // and forward the exception
2677   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2678 
2679   // Slow path locking & unlocking
2680   if (method->is_synchronized()) {
2681 
2682     // BEGIN Slow path lock
2683     __ bind(slow_path_lock);
2684 
2685     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2686     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2687 
2688     // protect the args we've loaded
2689     save_args(masm, total_c_args, c_arg, out_regs);
2690 
2691     __ mov(c_rarg0, obj_reg);
2692     __ mov(c_rarg1, lock_reg);
2693     __ mov(c_rarg2, r15_thread);
2694 
2695     // Not a leaf but we have last_Java_frame setup as we want
2696     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2697     restore_args(masm, total_c_args, c_arg, out_regs);
2698 
2699 #ifdef ASSERT
2700     { Label L;
2701     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2702     __ jcc(Assembler::equal, L);
2703     __ stop("no pending exception allowed on exit from monitorenter");
2704     __ bind(L);
2705     }
2706 #endif
2707     __ jmp(lock_done);
2708 
2709     // END Slow path lock
2710 
2711     // BEGIN Slow path unlock
2712     __ bind(slow_path_unlock);
2713 
2714     // If we haven't already saved the native result we must save it now as xmm registers
2715     // are still exposed.
2716     __ vzeroupper();
2717     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2718       save_native_result(masm, ret_type, stack_slots);
2719     }
2720 
2721     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2722 
2723     __ mov(c_rarg0, obj_reg);
2724     __ mov(c_rarg2, r15_thread);
2725     __ mov(r12, rsp); // remember sp
2726     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2727     __ andptr(rsp, -16); // align stack as required by ABI
2728 
2729     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2730     // NOTE that obj_reg == rbx currently
2731     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2732     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2733 
2734     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2735     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2736     __ mov(rsp, r12); // restore sp
2737     __ reinit_heapbase();
2738 #ifdef ASSERT
2739     {
2740       Label L;
2741       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), NULL_WORD);
2742       __ jcc(Assembler::equal, L);
2743       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2744       __ bind(L);
2745     }
2746 #endif /* ASSERT */
2747 
2748     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2749 
2750     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2751       restore_native_result(masm, ret_type, stack_slots);
2752     }
2753     __ jmp(unlock_done);
2754 
2755     // END Slow path unlock
2756 
2757   } // synchronized
2758 
2759   // SLOW PATH Reguard the stack if needed
2760 
2761   __ bind(reguard);
2762   __ vzeroupper();
2763   save_native_result(masm, ret_type, stack_slots);
2764   __ mov(r12, rsp); // remember sp
2765   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2766   __ andptr(rsp, -16); // align stack as required by ABI
2767   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2768   __ mov(rsp, r12); // restore sp
2769   __ reinit_heapbase();
2770   restore_native_result(masm, ret_type, stack_slots);
2771   // and continue
2772   __ jmp(reguard_done);
2773 
2774 
2775 
2776   __ flush();
2777 
2778   nmethod *nm = nmethod::new_native_nmethod(method,
2779                                             compile_id,
2780                                             masm->code(),
2781                                             vep_offset,
2782                                             frame_complete,
2783                                             stack_slots / VMRegImpl::slots_per_word,
2784                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2785                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2786                                             oop_maps);
2787 
2788   return nm;
2789 }
2790 
2791 // This function returns the adjustment size (in number of words) to a c2i adapter
2792 // activation for use during deoptimization
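     // For example, a callee with 2 parameters and 5 locals needs
     // (5 - 2) * Interpreter::stackElementWords extra words in its
     // interpreter activation.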
2793 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2794   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2795 }
2796 
2797 
2798 uint SharedRuntime::out_preserve_stack_slots() {
2799   return 0;
2800 }
2801 
2802 
2803 // Number of stack slots between incoming argument block and the start of
2804 // a new frame.  The PROLOG must add this many slots to the stack.  The
2805 // EPILOG must remove this many slots.  amd64 needs two slots for
2806 // return address.
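     // (Compiler stack slots are 32-bit, so the 64-bit return address accounts for
     // two of the four slots returned below; the remaining two presumably cover the
     // saved rbp.)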
2807 uint SharedRuntime::in_preserve_stack_slots() {
2808   return 4 + 2 * VerifyStackAtCalls;
2809 }
2810 
2811 //------------------------------generate_deopt_blob----------------------------
2812 void SharedRuntime::generate_deopt_blob() {
2813   // Allocate space for the code
2814   ResourceMark rm;
2815   // Setup code generation tools
2816   int pad = 0;
2817   if (UseAVX > 2) {
2818     pad += 1024;
2819   }
2820 #if INCLUDE_JVMCI
2821   if (EnableJVMCI) {
2822     pad += 512; // Increase the buffer size when compiling for JVMCI
2823   }
2824 #endif
2825   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2826   MacroAssembler* masm = new MacroAssembler(&buffer);
2827   int frame_size_in_words;
2828   OopMap* map = nullptr;
2829   OopMapSet *oop_maps = new OopMapSet();
2830 
2831   // -------------
2832   // This code is entered when returning to a de-optimized nmethod.  A return
2833   // address has been pushed on the stack, and return values are in
2834   // registers.
2835   // If we are doing a normal deopt then we were called from the patched
2836   // nmethod from the point we returned to the nmethod. So the return
2837   // address on the stack is wrong by NativeCall::instruction_size.
2838   // We will adjust the value so it looks like we have the original return
2839   // address on the stack (like when we eagerly deoptimized).
2840   // In the case of an exception pending when deoptimizing, we enter
2841   // with a return address on the stack that points after the call we patched
2842   // into the exception handler. We have the following register state from,
2843   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2844   //    rax: exception oop
2845   //    rbx: exception handler
2846   //    rdx: throwing pc
2847   // So in this case we simply jam rdx into the useless return address and
2848   // the stack looks just like we want.
2849   //
2850   // At this point we need to de-opt.  We save the argument return
2851   // registers.  We call the first C routine, fetch_unroll_info().  This
2852   // routine captures the return values and returns a structure which
2853   // describes the current frame size and the sizes of all replacement frames.
2854   // The current frame is compiled code and may contain many inlined
2855   // functions, each with their own JVM state.  We pop the current frame, then
2856   // push all the new frames.  Then we call the C routine unpack_frames() to
2857   // populate these frames.  Finally unpack_frames() returns us the new target
2858   // address.  Notice that callee-save registers are BLOWN here; they have
2859   // already been captured in the vframeArray at the time the return PC was
2860   // patched.
2861   address start = __ pc();
2862   Label cont;
2863 
2864   // Prolog for the non-exception case!
2865 
2866   // Save everything in sight.
2867   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2868 
2869   // Normal deoptimization.  Save exec mode for unpack_frames.
2870   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2871   __ jmp(cont);
2872 
2873   int reexecute_offset = __ pc() - start;
2874 #if INCLUDE_JVMCI && !defined(COMPILER1)
2875   if (EnableJVMCI && UseJVMCICompiler) {
2876     // JVMCI does not use this kind of deoptimization
2877     __ should_not_reach_here();
2878   }
2879 #endif
2880 
2881   // Reexecute case
2882   // The return address is the pc that describes which bci to re-execute at
2883 
2884   // No need to update map as each call to save_live_registers will produce identical oopmap
2885   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2886 
2887   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2888   __ jmp(cont);
2889 
2890 #if INCLUDE_JVMCI
2891   Label after_fetch_unroll_info_call;
2892   int implicit_exception_uncommon_trap_offset = 0;
2893   int uncommon_trap_offset = 0;
2894 
2895   if (EnableJVMCI) {
2896     implicit_exception_uncommon_trap_offset = __ pc() - start;
2897 
2898     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2899     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), NULL_WORD);
2900 
2901     uncommon_trap_offset = __ pc() - start;
2902 
2903     // Save everything in sight.
2904     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2905     // fetch_unroll_info needs to call last_java_frame()
2906     __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2907 
2908     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2909     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2910 
2911     __ movl(r14, Deoptimization::Unpack_reexecute);
2912     __ mov(c_rarg0, r15_thread);
2913     __ movl(c_rarg2, r14); // exec mode
2914     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2915     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2916 
2917     __ reset_last_Java_frame(false);
2918 
2919     __ jmp(after_fetch_unroll_info_call);
2920   } // EnableJVMCI
2921 #endif // INCLUDE_JVMCI
2922 
2923   int exception_offset = __ pc() - start;
2924 
2925   // Prolog for exception case
2926 
2927   // all registers are dead at this entry point, except for rax, and
2928   // rdx which contain the exception oop and exception pc
2929   // respectively.  Set them in TLS and fall thru to the
2930   // unpack_with_exception_in_tls entry point.
2931 
2932   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2933   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2934 
2935   int exception_in_tls_offset = __ pc() - start;
2936 
2937   // new implementation because exception oop is now passed in JavaThread
2938 
2939   // Prolog for exception case
2940   // All registers must be preserved because they might be used by LinearScan
2941   // Exception oop and throwing PC are passed in JavaThread
2942   // tos: stack at point of call to method that threw the exception (i.e. only
2943   // args are on the stack, no return address)
2944 
2945   // make room on stack for the return address
2946   // It will be patched later with the throwing pc. The correct value is not
2947   // available now because loading it from memory would destroy registers.
2948   __ push(0);
2949 
2950   // Save everything in sight.
2951   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ true);
2952 
2953   // Now it is safe to overwrite any register
2954 
2955   // Deopt during an exception.  Save exec mode for unpack_frames.
2956   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2957 
2958   // load throwing pc from JavaThread and patch it as the return address
2959   // of the current frame. Then clear the field in JavaThread
2960 
2961   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2962   __ movptr(Address(rbp, wordSize), rdx);
2963   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
2964 
2965 #ifdef ASSERT
2966   // verify that there is really an exception oop in JavaThread
2967   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2968   __ verify_oop(rax);
2969 
2970   // verify that there is no pending exception
2971   Label no_pending_exception;
2972   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2973   __ testptr(rax, rax);
2974   __ jcc(Assembler::zero, no_pending_exception);
2975   __ stop("must not have pending exception here");
2976   __ bind(no_pending_exception);
2977 #endif
2978 
2979   __ bind(cont);
2980 
2981   // Call C code.  Need thread and this frame, but NOT official VM entry
2982   // crud.  We cannot block on this call, no GC can happen.
2983   //
2984   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2985 
2986   // fetch_unroll_info needs to call last_java_frame().
2987 
2988   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
2989 #ifdef ASSERT
2990   { Label L;
2991     __ cmpptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2992     __ jcc(Assembler::equal, L);
2993     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2994     __ bind(L);
2995   }
2996 #endif // ASSERT
2997   __ mov(c_rarg0, r15_thread);
2998   __ movl(c_rarg1, r14); // exec_mode
2999   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
3000 
3001   // Need to have an oopmap that tells fetch_unroll_info where to
3002   // find any register it might need.
3003   oop_maps->add_gc_map(__ pc() - start, map);
3004 
3005   __ reset_last_Java_frame(false);
3006 
3007 #if INCLUDE_JVMCI
3008   if (EnableJVMCI) {
3009     __ bind(after_fetch_unroll_info_call);
3010   }
3011 #endif
3012 
3013   // Load UnrollBlock* into rdi
3014   __ mov(rdi, rax);
3015 
3016   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()));
3017   Label noException;
3018   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
3019   __ jcc(Assembler::notEqual, noException);
3020   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3021   // QQQ this is useless; it was null above
3022   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3023   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3024   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3025 
3026   __ verify_oop(rax);
3027 
3028   // Overwrite the result registers with the exception results.
3029   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3030   // I think this is useless
3031   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
3032 
3033   __ bind(noException);
3034 
3035   // Only register save data is on the stack.
3036   // Now restore the result registers.  Everything else is either dead
3037   // or captured in the vframeArray.
3038   RegisterSaver::restore_result_registers(masm);
3039 
3040   // All of the register save area has been popped off the stack. Only the
3041   // return address remains.
3042 
3043   // Pop all the frames we must move/replace.
3044   //
3045   // Frame picture (youngest to oldest)
3046   // 1: self-frame (no frame link)
3047   // 2: deopting frame  (no frame link)
3048   // 3: caller of deopting frame (could be compiled/interpreted).
3049   //
3050   // Note: by leaving the return address of self-frame on the stack
3051   // and using the size of frame 2 to adjust the stack
3052   // when we are done the return to frame 3 will still be on the stack.
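       // The frames pushed by the loop below are only skeletal (return pc, rbp
       // link and frame size); unpack_frames() fills in their interpreter state
       // afterwards.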
3053 
3054   // Pop deoptimized frame
3055   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset()));
3056   __ addptr(rsp, rcx);
3057 
3058   // rsp should be pointing at the return address to the caller (3)
3059 
3060   // Pick up the initial fp we should save
3061   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3062   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3063 
3064 #ifdef ASSERT
3065   // Compilers generate code that bangs the stack by as much as the
3066   // interpreter would need. So this stack banging should never
3067   // trigger a fault. Verify that it does not on non-product builds.
3068   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3069   __ bang_stack_size(rbx, rcx);
3070 #endif
3071 
3072   // Load address of array of frame pcs into rcx
3073   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3074 
3075   // Trash the old pc
3076   __ addptr(rsp, wordSize);
3077 
3078   // Load address of array of frame sizes into rsi
3079   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3080 
3081   // Load counter into rdx
3082   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset()));
3083 
3084   // Now adjust the caller's stack to make up for the extra locals
3085   // but record the original sp so that we can save it in the skeletal interpreter
3086   // frame; the stack walking of interpreter_sender will then get the unextended sp
3087   // value and not the "real" sp value.
3088 
3089   const Register sender_sp = r8;
3090 
3091   __ mov(sender_sp, rsp);
3092   __ movl(rbx, Address(rdi,
3093                        Deoptimization::UnrollBlock::
3094                        caller_adjustment_offset()));
3095   __ subptr(rsp, rbx);
3096 
3097   // Push interpreter frames in a loop
3098   Label loop;
3099   __ bind(loop);
3100   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3101   __ subptr(rbx, 2*wordSize);           // We'll push pc and rbp by hand
3102   __ pushptr(Address(rcx, 0));          // Save return address
3103   __ enter();                           // Save old & set new rbp
3104   __ subptr(rsp, rbx);                  // Prolog
3105   // This value is corrected by layout_activation_impl
3106   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3107   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3108   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3109   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3110   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3111   __ decrementl(rdx);                   // Decrement counter
3112   __ jcc(Assembler::notZero, loop);
3113   __ pushptr(Address(rcx, 0));          // Save final return address
3114 
3115   // Re-push self-frame
3116   __ enter();                           // Save old & set new rbp
3117 
3118   // Allocate a full sized register save area.
3119   // Return address and rbp are in place, so we allocate two less words.
3120   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3121 
3122   // Restore frame locals after moving the frame
3123   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3124   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3125 
3126   // Call C code.  Need thread but NOT official VM entry
3127   // crud.  We cannot block on this call, no GC can happen.  Call should
3128   // restore return values to their stack-slots with the new SP.
3129   //
3130   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3131 
3132   // Use rbp because the frames look interpreted now
3133   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3134   // Don't need the precise return PC here, just precise enough to point into this code blob.
3135   address the_pc = __ pc();
3136   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3137 
3138   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3139   __ mov(c_rarg0, r15_thread);
3140   __ movl(c_rarg1, r14); // second arg: exec_mode
3141   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3142   // Revert SP alignment after call since we're going to do some SP relative addressing below
3143   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3144 
3145   // Set an oopmap for the call site
3146   // Use the same PC we used for the last java frame
3147   oop_maps->add_gc_map(the_pc - start,
3148                        new OopMap( frame_size_in_words, 0 ));
3149 
3150   // Clear fp AND pc
3151   __ reset_last_Java_frame(true);
3152 
3153   // Collect return values
3154   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3155   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3156   // I think this is useless (throwing pc?)
3157   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3158 
3159   // Pop self-frame.
3160   __ leave();                           // Epilog
3161 
3162   // Jump to interpreter
3163   __ ret(0);
3164 
3165   // Make sure all code is generated
3166   masm->flush();
3167 
3168   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3169   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3170 #if INCLUDE_JVMCI
3171   if (EnableJVMCI) {
3172     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3173     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3174   }
3175 #endif
3176 }
3177 
3178 #ifdef COMPILER2
3179 //------------------------------generate_uncommon_trap_blob--------------------
3180 void SharedRuntime::generate_uncommon_trap_blob() {
3181   // Allocate space for the code
3182   ResourceMark rm;
3183   // Setup code generation tools
3184   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
3185   MacroAssembler* masm = new MacroAssembler(&buffer);
3186 
3187   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3188 
3189   address start = __ pc();
3190 
3191   if (UseRTMLocking) {
3192     // Abort RTM transaction before possible nmethod deoptimization.
3193     __ xabort(0);
3194   }
3195 
3196   // Push self-frame.  We get here with a return address on the
3197   // stack, so rsp is 8-byte aligned until we allocate our frame.
3198   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3199 
3200   // No callee saved registers. rbp is assumed implicitly saved
3201   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3202 
3203   // The compiler left unloaded_class_index in j_rarg0; move it to where the
3204   // runtime expects it.
3205   __ movl(c_rarg1, j_rarg0);
3206 
3207   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3208 
3209   // Call C code.  Need thread but NOT official VM entry
3210   // crud.  We cannot block on this call, no GC can happen.  Call should
3211   // capture callee-saved registers as well as return values.
3212   // Thread is in rdi already.
3213   //
3214   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
3215 
3216   __ mov(c_rarg0, r15_thread);
3217   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
3218   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3219 
3220   // Set an oopmap for the call site
3221   OopMapSet* oop_maps = new OopMapSet();
3222   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
3223 
3224   // location of rbp is known implicitly by the frame sender code
3225 
3226   oop_maps->add_gc_map(__ pc() - start, map);
3227 
3228   __ reset_last_Java_frame(false);
3229 
3230   // Load UnrollBlock* into rdi
3231   __ mov(rdi, rax);
3232 
3233 #ifdef ASSERT
3234   { Label L;
3235     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset()),
3236               Deoptimization::Unpack_uncommon_trap);
3237     __ jcc(Assembler::equal, L);
3238     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
3239     __ bind(L);
3240   }
3241 #endif
3242 
3243   // Pop all the frames we must move/replace.
3244   //
3245   // Frame picture (youngest to oldest)
3246   // 1: self-frame (no frame link)
3247   // 2: deopting frame  (no frame link)
3248   // 3: caller of deopting frame (could be compiled/interpreted).
3249 
3250   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3251   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3252 
3253   // Pop deoptimized frame (int)
3254   __ movl(rcx, Address(rdi,
3255                        Deoptimization::UnrollBlock::
3256                        size_of_deoptimized_frame_offset()));
3257   __ addptr(rsp, rcx);
3258 
3259   // rsp should be pointing at the return address to the caller (3)
3260 
3261   // Pick up the initial fp we should save
3262   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3263   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset()));
3264 
3265 #ifdef ASSERT
3266   // Compilers generate code that bangs the stack by as much as the
3267   // interpreter would need. So this stack banging should never
3268   // trigger a fault. Verify that it does not on non-product builds.
3269   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset()));
3270   __ bang_stack_size(rbx, rcx);
3271 #endif
3272 
3273   // Load address of array of frame pcs into rcx (address*)
3274   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset()));
3275 
3276   // Trash the return pc
3277   __ addptr(rsp, wordSize);
3278 
3279   // Load address of array of frame sizes into rsi (intptr_t*)
3280   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset()));
3281 
3282   // Counter
3283   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset())); // (int)
3284 
3285   // Now adjust the caller's stack to make up for the extra locals, but
3286   // record the original sp so that we can save it in the skeletal
3287   // interpreter frame; the stack walking of interpreter_sender
3288   // will then get the unextended sp value and not the "real" sp value.
3289 
3290   const Register sender_sp = r8;
3291 
3292   __ mov(sender_sp, rsp);
3293   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset())); // (int)
3294   __ subptr(rsp, rbx);
3295 
3296   // Push interpreter frames in a loop
3297   Label loop;
3298   __ bind(loop);
3299   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3300   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3301   __ pushptr(Address(rcx, 0));     // Save return address
3302   __ enter();                      // Save old & set new rbp
3303   __ subptr(rsp, rbx);             // Prolog
3304   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3305             sender_sp);            // Make it walkable
3306   // This value is corrected by layout_activation_impl
3307   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), NULL_WORD);
3308   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3309   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3310   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3311   __ decrementl(rdx);              // Decrement counter
3312   __ jcc(Assembler::notZero, loop);
3313   __ pushptr(Address(rcx, 0));     // Save final return address
3314 
3315   // Re-push self-frame
3316   __ enter();                 // Save old & set new rbp
3317   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3318                               // Prolog
3319 
3320   // Use rbp because the frames look interpreted now
3321   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3322   // Don't need the precise return PC here, just precise enough to point into this code blob.
3323   address the_pc = __ pc();
3324   __ set_last_Java_frame(noreg, rbp, the_pc, rscratch1);
3325 
3326   // Call C code.  Need thread but NOT official VM entry
3327   // crud.  We cannot block on this call, no GC can happen.  Call should
3328   // restore return values to their stack-slots with the new SP.
3329   // Thread is in rdi already.
3330   //
3331   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3332 
3333   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3334   __ mov(c_rarg0, r15_thread);
3335   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3336   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3337 
3338   // Set an oopmap for the call site
3339   // Use the same PC we used for the last java frame
3340   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3341 
3342   // Clear fp AND pc
3343   __ reset_last_Java_frame(true);
3344 
3345   // Pop self-frame.
3346   __ leave();                 // Epilog
3347 
3348   // Jump to interpreter
3349   __ ret(0);
3350 
3351   // Make sure all code is generated
3352   masm->flush();
3353 
3354   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3355                                                  SimpleRuntimeFrame::framesize >> 1);
3356 }
3357 #endif // COMPILER2
3358 
3359 //------------------------------generate_handler_blob------
3360 //
3361 // Generate a special Compile2Runtime blob that saves all registers
3362 // and sets up an oopmap.
3363 //
3364 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3365   assert(StubRoutines::forward_exception_entry() != nullptr,
3366          "must be generated before");
3367 
3368   ResourceMark rm;
3369   OopMapSet *oop_maps = new OopMapSet();
3370   OopMap* map;
3371 
3372   // Allocate space for the code.  Setup code generation tools.
3373   CodeBuffer buffer("handler_blob", 2048, 1024);
3374   MacroAssembler* masm = new MacroAssembler(&buffer);
3375 
3376   address start   = __ pc();
3377   address call_pc = nullptr;
3378   int frame_size_in_words;
3379   bool cause_return = (poll_type == POLL_AT_RETURN);
3380   bool save_wide_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3381 
3382   if (UseRTMLocking) {
3383     // Abort RTM transaction before calling runtime
3384     // because critical section will be large and will be
3385     // aborted anyway. Also nmethod could be deoptimized.
3386     __ xabort(0);
3387   }
3388 
3389   // Make room for return address (or push it again)
3390   if (!cause_return) {
3391     __ push(rbx);
3392   }
3393 
3394   // Save registers, fpu state, and flags
3395   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_wide_vectors);
3396 
3397   // The following is basically a call_VM.  However, we need the precise
3398   // address of the call in order to generate an oopmap. Hence, we do all the
3399   // work ourselves.
3400 
3401   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);  // JavaFrameAnchor::capture_last_Java_pc() will get the pc from the return address, which we store next:
3402 
3403   // The return address must always be correct so that frame constructor never
3404   // sees an invalid pc.
3405 
3406   if (!cause_return) {
3407     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3408     // Additionally, rbx is a callee saved register and we can look at it later to determine
3409     // if someone changed the return address for us!
3410     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3411     __ movptr(Address(rbp, wordSize), rbx);
3412   }
3413 
3414   // Do the call
3415   __ mov(c_rarg0, r15_thread);
3416   __ call(RuntimeAddress(call_ptr));
3417 
3418   // Set an oopmap for the call site.  This oopmap will map all
3419   // oop-registers and debug-info registers as callee-saved.  This
3420   // will allow deoptimization at this safepoint to find all possible
3421   // debug-info recordings, as well as let GC find all oops.
3422 
3423   oop_maps->add_gc_map( __ pc() - start, map);
3424 
3425   Label noException;
3426 
3427   __ reset_last_Java_frame(false);
3428 
3429   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3430   __ jcc(Assembler::equal, noException);
3431 
3432   // Exception pending
3433 
3434   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3435 
3436   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3437 
3438   // No exception case
3439   __ bind(noException);
3440 
3441   Label no_adjust;
3442 #ifdef ASSERT
3443   Label bail;
3444 #endif
3445   if (!cause_return) {
3446     Label no_prefix, not_special;
3447 
3448     // If our stashed return pc was modified by the runtime we avoid touching it
3449     __ cmpptr(rbx, Address(rbp, wordSize));
3450     __ jccb(Assembler::notEqual, no_adjust);
3451 
3452     // Skip over the poll instruction.
3453     // See NativeInstruction::is_safepoint_poll()
3454     // Possible encodings:
3455     //      85 00       test   %eax,(%rax)
3456     //      85 01       test   %eax,(%rcx)
3457     //      85 02       test   %eax,(%rdx)
3458     //      85 03       test   %eax,(%rbx)
3459     //      85 06       test   %eax,(%rsi)
3460     //      85 07       test   %eax,(%rdi)
3461     //
3462     //   41 85 00       test   %eax,(%r8)
3463     //   41 85 01       test   %eax,(%r9)
3464     //   41 85 02       test   %eax,(%r10)
3465     //   41 85 03       test   %eax,(%r11)
3466     //   41 85 06       test   %eax,(%r14)
3467     //   41 85 07       test   %eax,(%r15)
3468     //
3469     //      85 04 24    test   %eax,(%rsp)
3470     //   41 85 04 24    test   %eax,(%r12)
3471     //      85 45 00    test   %eax,0x0(%rbp)
3472     //   41 85 45 00    test   %eax,0x0(%r13)
3473 
3474     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3475     __ jcc(Assembler::notEqual, no_prefix);
3476     __ addptr(rbx, 1);
3477     __ bind(no_prefix);
3478 #ifdef ASSERT
3479     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3480 #endif
3481     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3482     // r12/rsp 0x04
3483     // r13/rbp 0x05
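         // After masking the low modrm bits and subtracting 4, only the rsp/r12
         // (0x04) and rbp/r13 (0x05) encodings land in the unsigned range 0..1, so
         // the single unsigned compare below selects exactly the 3-byte forms.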
3484     __ movzbq(rcx, Address(rbx, 1));
3485     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3486     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3487     __ cmpptr(rcx, 1);
3488     __ jcc(Assembler::above, not_special);
3489     __ addptr(rbx, 1);
3490     __ bind(not_special);
3491 #ifdef ASSERT
3492     // Verify the correct encoding of the poll we're about to skip.
3493     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3494     __ jcc(Assembler::notEqual, bail);
3495     // Mask out the modrm bits
3496     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3497     // rax encodes to 0, so if the bits are nonzero it's incorrect
3498     __ jcc(Assembler::notZero, bail);
3499 #endif
3500     // Adjust return pc forward to step over the safepoint poll instruction
3501     __ addptr(rbx, 2);
3502     __ movptr(Address(rbp, wordSize), rbx);
3503   }
3504 
3505   __ bind(no_adjust);
3506   // Normal exit, restore registers and exit.
3507   RegisterSaver::restore_live_registers(masm, save_wide_vectors);
3508   __ ret(0);
3509 
3510 #ifdef ASSERT
3511   __ bind(bail);
3512   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3513 #endif
3514 
3515   // Make sure all code is generated
3516   masm->flush();
3517 
3518   // Fill-out other meta info
3519   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3520 }
3521 
3522 //
3523 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3524 //
3525 // Generate a stub that calls into the VM to find out the proper destination
3526 // of a Java call. All the argument registers are live at this point,
3527 // but since this is generic code we don't know what they are and the caller
3528 // must do any GC of the args.
3529 //
3530 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3531   assert (StubRoutines::forward_exception_entry() != nullptr, "must be generated before");
3532 
3533   // allocate space for the code
3534   ResourceMark rm;
3535 
3536   CodeBuffer buffer(name, 1200, 512);
3537   MacroAssembler* masm = new MacroAssembler(&buffer);
3538 
3539   int frame_size_in_words;
3540 
3541   OopMapSet *oop_maps = new OopMapSet();
3542   OopMap* map = nullptr;
3543 
3544   int start = __ offset();
3545 
3546   // No need to save vector registers since they are caller-saved anyway.
3547   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_wide_vectors*/ false);
3548 
3549   int frame_complete = __ offset();
3550 
3551   __ set_last_Java_frame(noreg, noreg, nullptr, rscratch1);
3552 
3553   __ mov(c_rarg0, r15_thread);
3554 
3555   __ call(RuntimeAddress(destination));
3556 
3557 
3558   // Set an oopmap for the call site.
3559   // We need this not only for callee-saved registers, but also for volatile
3560   // registers that the compiler might be keeping live across a safepoint.
3561 
3562   oop_maps->add_gc_map( __ offset() - start, map);
3563 
3564   // rax contains the address we are going to jump to assuming no exception got installed
3565 
3566   // clear last_Java_sp
3567   __ reset_last_Java_frame(false);
3568   // check for pending exceptions
3569   Label pending;
3570   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
3571   __ jcc(Assembler::notEqual, pending);
3572 
3573   // get the returned Method*
3574   __ get_vm_result_2(rbx, r15_thread);
3575   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3576 
3577   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3578 
3579   RegisterSaver::restore_live_registers(masm);
3580 
3581   // We are back to the original state on entry and ready to go.
3582 
3583   __ jmp(rax);
3584 
3585   // Pending exception after the safepoint
3586 
3587   __ bind(pending);
3588 
3589   RegisterSaver::restore_live_registers(masm);
3590 
3591   // exception pending => remove activation and forward to exception handler
3592 
3593   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD);
3594 
3595   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3596   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3597 
3598   // -------------
3599   // make sure all code is generated
3600   masm->flush();
3601 
3602   // return the blob
3603   // frame_size_words or bytes??
3604   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3605 }
3606 
3607 //------------------------------Montgomery multiplication------------------------
3608 //
3609 
3610 #ifndef _WINDOWS
3611 
3612 // Subtract 0:b from carry:a.  Return carry.
3613 static julong
3614 sub(julong a[], julong b[], julong carry, long len) {
3615   long long i = 0, cnt = len;
3616   julong tmp;
3617   asm volatile("clc; "
3618                "0: ; "
3619                "mov (%[b], %[i], 8), %[tmp]; "
3620                "sbb %[tmp], (%[a], %[i], 8); "
3621                "inc %[i]; dec %[cnt]; "
3622                "jne 0b; "
3623                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3624                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3625                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3626                : "memory");
3627   return tmp;
3628 }
3629 
3630 // Multiply (unsigned) Long A by Long B, accumulating the double-
3631 // length result into the accumulator formed of T0, T1, and T2.
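     // That is, (T2:T1:T0) += A * B; the mul instruction leaves the 128-bit
     // product in rdx:rax.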
3632 #define MACC(A, B, T0, T1, T2)                                  \
3633 do {                                                            \
3634   unsigned long hi, lo;                                         \
3635   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3636            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3637            : "r"(A), "a"(B) : "cc");                            \
3638  } while(0)
3639 
3640 // As above, but add twice the double-length result into the
3641 // accumulator.
3642 #define MACC2(A, B, T0, T1, T2)                                 \
3643 do {                                                            \
3644   unsigned long hi, lo;                                         \
3645   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3646            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3647            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3648            : "r"(A), "a"(B) : "cc");                            \
3649  } while(0)
3650 
3651 #else //_WINDOWS
3652 
3653 static julong
3654 sub(julong a[], julong b[], julong carry, long len) {
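       // A borrow-propagating subtract built from add-with-carry:
       // a - b == a + ~b + 1, with the initial carry-in of 1 supplying the "+ 1".
       // The final _addcarry_u64 yields carry - borrow, matching the asm variant above.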
3655   long i;
3656   julong tmp;
3657   unsigned char c = 1;
3658   for (i = 0; i < len; i++) {
3659     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3660     a[i] = tmp;
3661   }
3662   c = _addcarry_u64(c, carry, ~0, &tmp);
3663   return tmp;
3664 }
3665 
3666 // Multiply (unsigned) Long A by Long B, accumulating the double-
3667 // length result into the accumulator formed of T0, T1, and T2.
3668 #define MACC(A, B, T0, T1, T2)                          \
3669 do {                                                    \
3670   julong hi, lo;                            \
3671   lo = _umul128(A, B, &hi);                             \
3672   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3673   c = _addcarry_u64(c, hi, T1, &T1);                    \
3674   _addcarry_u64(c, T2, 0, &T2);                         \
3675  } while(0)
3676 
3677 // As above, but add twice the double-length result into the
3678 // accumulator.
3679 #define MACC2(A, B, T0, T1, T2)                         \
3680 do {                                                    \
3681   julong hi, lo;                            \
3682   lo = _umul128(A, B, &hi);                             \
3683   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3684   c = _addcarry_u64(c, hi, T1, &T1);                    \
3685   _addcarry_u64(c, T2, 0, &T2);                         \
3686   c = _addcarry_u64(0, lo, T0, &T0);                    \
3687   c = _addcarry_u64(c, hi, T1, &T1);                    \
3688   _addcarry_u64(c, T2, 0, &T2);                         \
3689  } while(0)
3690 
3691 #endif //_WINDOWS
3692 
3693 // Fast Montgomery multiplication.  The derivation of the algorithm is
3694 // in  A Cryptographic Library for the Motorola DSP56000,
3695 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
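     // Per word the reduction picks m[i] = t0 * inv (mod 2^64); since
     // inv == -n[0]^-1 (mod 2^64) -- see the assert below -- adding m[i] * n[0]
     // zeroes the low accumulator word t0, which can then be shifted out.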
3696 
3697 static void NOINLINE
3698 montgomery_multiply(julong a[], julong b[], julong n[],
3699                     julong m[], julong inv, int len) {
3700   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3701   int i;
3702 
3703   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3704 
3705   for (i = 0; i < len; i++) {
3706     int j;
3707     for (j = 0; j < i; j++) {
3708       MACC(a[j], b[i-j], t0, t1, t2);
3709       MACC(m[j], n[i-j], t0, t1, t2);
3710     }
3711     MACC(a[i], b[0], t0, t1, t2);
3712     m[i] = t0 * inv;
3713     MACC(m[i], n[0], t0, t1, t2);
3714 
3715     assert(t0 == 0, "broken Montgomery multiply");
3716 
3717     t0 = t1; t1 = t2; t2 = 0;
3718   }
3719 
3720   for (i = len; i < 2*len; i++) {
3721     int j;
3722     for (j = i-len+1; j < len; j++) {
3723       MACC(a[j], b[i-j], t0, t1, t2);
3724       MACC(m[j], n[i-j], t0, t1, t2);
3725     }
3726     m[i-len] = t0;
3727     t0 = t1; t1 = t2; t2 = 0;
3728   }
3729 
3730   while (t0)
3731     t0 = sub(m, n, t0, len);
3732 }
3733 
3734 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3735 // multiplies so it should be up to 25% faster than Montgomery
3736 // multiplication.  However, its loop control is more complex and it
3737 // may actually run slower on some machines.
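     // The saving comes from symmetry: a[j] * a[i-j] == a[i-j] * a[j], so each
     // cross product is computed once and accumulated twice (MACC2) instead of
     // being multiplied twice.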
3738 
3739 static void NOINLINE
3740 montgomery_square(julong a[], julong n[],
3741                   julong m[], julong inv, int len) {
3742   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3743   int i;
3744 
3745   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3746 
3747   for (i = 0; i < len; i++) {
3748     int j;
3749     int end = (i+1)/2;
3750     for (j = 0; j < end; j++) {
3751       MACC2(a[j], a[i-j], t0, t1, t2);
3752       MACC(m[j], n[i-j], t0, t1, t2);
3753     }
3754     if ((i & 1) == 0) {
3755       MACC(a[j], a[j], t0, t1, t2);
3756     }
3757     for (; j < i; j++) {
3758       MACC(m[j], n[i-j], t0, t1, t2);
3759     }
3760     m[i] = t0 * inv;
3761     MACC(m[i], n[0], t0, t1, t2);
3762 
3763     assert(t0 == 0, "broken Montgomery square");
3764 
3765     t0 = t1; t1 = t2; t2 = 0;
3766   }
3767 
3768   for (i = len; i < 2*len; i++) {
3769     int start = i-len+1;
3770     int end = start + (len - start)/2;
3771     int j;
3772     for (j = start; j < end; j++) {
3773       MACC2(a[j], a[i-j], t0, t1, t2);
3774       MACC(m[j], n[i-j], t0, t1, t2);
3775     }
3776     if ((i & 1) == 0) {
3777       MACC(a[j], a[j], t0, t1, t2);
3778     }
3779     for (; j < len; j++) {
3780       MACC(m[j], n[i-j], t0, t1, t2);
3781     }
3782     m[i-len] = t0;
3783     t0 = t1; t1 = t2; t2 = 0;
3784   }
3785 
3786   while (t0)
3787     t0 = sub(m, n, t0, len);
3788 }
3789 
3790 // Swap words in a longword.
3791 static julong swap(julong x) {
3792   return (x << 32) | (x >> 32);
3793 }
3794 
3795 // Copy len longwords from s to d, word-swapping as we go.  The
3796 // destination array is reversed.
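     // For example, the 128-bit value 0x00112233445566778899AABBCCDDEEFF arrives
     // as the jint array {0x00112233, 0x44556677, 0x8899AABB, 0xCCDDEEFF}
     // (most-significant jint first); after reverse_words, d[0] holds the least
     // significant julong 0x8899AABBCCDDEEFF and d[1] holds 0x0011223344556677,
     // i.e. the little-endian julong order the Montgomery routines expect.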
3797 static void reverse_words(julong *s, julong *d, int len) {
3798   d += len;
3799   while(len-- > 0) {
3800     d--;
3801     *d = swap(*s);
3802     s++;
3803   }
3804 }
3805 
3806 // The threshold at which squaring is advantageous was determined
3807 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3808 #define MONTGOMERY_SQUARING_THRESHOLD 64
3809 
3810 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3811                                         jint len, jlong inv,
3812                                         jint *m_ints) {
3813   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3814   int longwords = len/2;
3815 
3816   // Make very sure we don't use so much space that the stack might
3817   // overflow.  512 jints correspond to a 16384-bit integer and
3818   // will use a total of 8k bytes of stack space here.
3819   int divisor = sizeof(julong) * 4;
3820   guarantee(longwords <= 8192 / divisor, "must be");
3821   int total_allocation = longwords * sizeof (julong) * 4;
3822   julong *scratch = (julong *)alloca(total_allocation);
3823 
3824   // Local scratch arrays
3825   julong
3826     *a = scratch + 0 * longwords,
3827     *b = scratch + 1 * longwords,
3828     *n = scratch + 2 * longwords,
3829     *m = scratch + 3 * longwords;
3830 
3831   reverse_words((julong *)a_ints, a, longwords);
3832   reverse_words((julong *)b_ints, b, longwords);
3833   reverse_words((julong *)n_ints, n, longwords);
3834 
3835   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3836 
3837   reverse_words(m, (julong *)m_ints, longwords);
3838 }
3839 
3840 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3841                                       jint len, jlong inv,
3842                                       jint *m_ints) {
3843   assert(len % 2 == 0, "array length in montgomery_square must be even");
3844   int longwords = len/2;
3845 
3846   // Make very sure we don't use so much space that the stack might
3847   // overflow.  512 jints correspond to a 16384-bit integer and
3848   // will use a total of 6k bytes of stack space here.
3849   int divisor = sizeof(julong) * 3;
3850   guarantee(longwords <= (8192 / divisor), "must be");
3851   int total_allocation = longwords * sizeof (julong) * 3;
3852   julong *scratch = (julong *)alloca(total_allocation);
3853 
3854   // Local scratch arrays
3855   julong
3856     *a = scratch + 0 * longwords,
3857     *n = scratch + 1 * longwords,
3858     *m = scratch + 2 * longwords;
3859 
3860   reverse_words((julong *)a_ints, a, longwords);
3861   reverse_words((julong *)n_ints, n, longwords);
3862 
3863   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3864     ::montgomery_square(a, n, m, (julong)inv, longwords);
3865   } else {
3866     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
3867   }
3868 
3869   reverse_words(m, (julong *)m_ints, longwords);
3870 }
3871 
3872 #ifdef COMPILER2
3873 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3874 //
3875 //------------------------------generate_exception_blob---------------------------
3876 // Creates the exception blob at the end.
3877 // Compiled methods jump to this blob via their exception handler
3878 // (see emit_exception_handler in x86_64.ad file).
3879 //
3880 // Given an exception pc at a call we call into the runtime for the
3881 // Given an exception pc at a call, we call into the runtime for the
3882 // handler in this method. This handler might merely restore state
3883 // (i.e. callee-saved registers), unwind the frame, and jump to the
3884 // exception handler for the nmethod if there is no Java-level handler
3885 // for the nmethod.
3886 // This code is entered with a jmp.
3887 //
3888 // Arguments:
3889 //   rax: exception oop
3890 //   rdx: exception pc
3891 //
3892 // Results:
3893 //   rax: exception oop
3894 //   rdx: exception pc in caller or ???
3895 //   destination: exception handler of caller
3896 //
3897 // Note: the exception pc MUST be at a call (precise debug information)
3898 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
3899 //
3900 
3901 void OptoRuntime::generate_exception_blob() {
3902   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
3903   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
3904   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
3905 
3906   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3907 
3908   // Allocate space for the code
3909   ResourceMark rm;
3910   // Setup code generation tools
3911   CodeBuffer buffer("exception_blob", 2048, 1024);
3912   MacroAssembler* masm = new MacroAssembler(&buffer);
3913 
3914 
3915   address start = __ pc();
3916 
3917   // Exception pc is 'return address' for stack walker
3918   __ push(rdx);
3919   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3920 
3921   // Save callee-saved registers.  See x86_64.ad.
3922 
3923   // rbp is an implicitly saved callee-saved register (i.e., the calling
3924   // convention will save/restore it in the prolog/epilog). Other than that
3925   // there are no callee-saved registers now that adapter frames are gone.
3926 
3927   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3928 
3929   // Store exception in Thread object. We cannot pass any arguments to the
3930   // handle_exception call, since we do not want to make any assumption
3931   // about the size of the frame where the exception happened in.
3932   // c_rarg0 is either rdi (Linux) or rcx (Windows).
3933   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
3934   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
3935 
3936   // This call does all the hard work.  It checks if an exception handler
3937   // exists in the method.
3938   // If so, it returns the handler address.
3939   // If not, it prepares for stack-unwinding, restoring the callee-save
3940   // registers of the frame being removed.
3941   //
3942   // address OptoRuntime::handle_exception_C(JavaThread* thread)
3943 
3944   // At a method handle call, the stack may not be properly aligned
3945   // when returning with an exception.
3946   address the_pc = __ pc();
3947   __ set_last_Java_frame(noreg, noreg, the_pc, rscratch1);
3948   __ mov(c_rarg0, r15_thread);
3949   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
3950   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
3951 
3952   // Set an oopmap for the call site.  This oopmap will only be used if we
3953   // are unwinding the stack.  Hence, all locations will be dead.
3954   // Callee-saved registers will be the same as the frame above (i.e.,
3955   // handle_exception_stub), since they were restored when we got the
3956   // exception.
3957 
3958   OopMapSet* oop_maps = new OopMapSet();
3959 
3960   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3961 
3962   __ reset_last_Java_frame(false);
3963 
3964   // Restore callee-saved registers
3965 
3966   // rbp is an implicitly saved callee-saved register (i.e., the calling
3967   // convention will save/restore it in the prolog/epilog). Other than that
3968   // there are no callee-saved registers now that adapter frames are gone.
3969 
3970   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
3971 
3972   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
3973   __ pop(rdx);                  // No need for exception pc anymore
3974 
3975   // rax: exception handler
3976 
3977   // We have a handler in rax (could be deopt blob).
3978   __ mov(r8, rax);
3979 
3980   // Get the exception oop
3981   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
3982   // Get the exception pc in case we are deoptimized
3983   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
3984 #ifdef ASSERT
3985   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), NULL_WORD);
3986   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), NULL_WORD);
3987 #endif
3988   // Clear the exception oop so GC no longer processes it as a root.
3989   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), NULL_WORD);
3990 
3991   // rax: exception oop
3992   // r8:  exception handler
3993   // rdx: exception pc
3994   // Jump to handler
3995 
3996   __ jmp(r8);
3997 
3998   // Make sure all code is generated
3999   masm->flush();
4000 
4001   // Set exception blob
4002   _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
4003 }
4004 #endif // COMPILER2
4005 
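     // Generate the pack/unpack handlers for a buffered inline type: "pack"
     // copies the scalarized field values from their return registers into a
     // pre-allocated heap buffer, "unpack" loads them from a buffer back into
     // the return registers.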
4006 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
4007   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
4008   CodeBuffer buffer(buf);
4009   short buffer_locs[20];
4010   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
4011                                          sizeof(buffer_locs)/sizeof(relocInfo));
4012 
4013   MacroAssembler* masm = new MacroAssembler(&buffer);
4014 
4015   const Array<SigEntry>* sig_vk = vk->extended_sig();
4016   const Array<VMRegPair>* regs = vk->return_regs();
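       // extended_sig() lists the scalarized fields of the inline type;
       // return_regs() gives the register assigned to each entry in the
       // scalarized return convention.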
4017 
4018   int pack_fields_jobject_off = __ offset();
4019   // Resolve pre-allocated buffer from JNI handle.
4020   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
4021   __ movptr(rax, Address(r13, 0));
4022   __ resolve_jobject(rax /* value */,
4023                      r15_thread /* thread */,
4024                      r12 /* tmp */);
4025   __ movptr(Address(r13, 0), rax);
4026 
4027   int pack_fields_off = __ offset();
4028 
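       // Pack: walk the signature and store each field value from its return
       // register into the buffered object (oop in rax).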
4029   int j = 1;
4030   for (int i = 0; i < sig_vk->length(); i++) {
4031     BasicType bt = sig_vk->at(i)._bt;
4032     if (bt == T_PRIMITIVE_OBJECT) {
4033       continue;
4034     }
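         // T_VOID entries carry no field of their own; only advance the
         // register index when the entry is the upper half of a preceding
         // long or double.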
4035     if (bt == T_VOID) {
4036       if (sig_vk->at(i-1)._bt == T_LONG ||
4037           sig_vk->at(i-1)._bt == T_DOUBLE) {
4038         j++;
4039       }
4040       continue;
4041     }
4042     int off = sig_vk->at(i)._offset;
4043     assert(off > 0, "offset in object should be positive");
4044     VMRegPair pair = regs->at(j);
4045     VMReg r_1 = pair.first();
4046     VMReg r_2 = pair.second();
4047     Address to(rax, off);
4048     if (bt == T_FLOAT) {
4049       __ movflt(to, r_1->as_XMMRegister());
4050     } else if (bt == T_DOUBLE) {
4051       __ movdbl(to, r_1->as_XMMRegister());
4052     } else {
4053       Register val = r_1->as_Register();
4054       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
4055       if (is_reference_type(bt)) {
4056         __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
4057       } else {
4058         __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
4059       }
4060     }
4061     j++;
4062   }
4063   assert(j == regs->length(), "missed a field?");
4064 
4065   __ ret(0);
4066 
4067   int unpack_fields_off = __ offset();
4068 
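       // Unpack: if rax holds a non-null buffered oop, load each field back
       // into its return register, using the same signature-walking logic as
       // the pack loop above. A null oop leaves the return registers untouched.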
4069   Label skip;
4070   __ testptr(rax, rax);
4071   __ jcc(Assembler::zero, skip);
4072 
4073   j = 1;
4074   for (int i = 0; i < sig_vk->length(); i++) {
4075     BasicType bt = sig_vk->at(i)._bt;
4076     if (bt == T_PRIMITIVE_OBJECT) {
4077       continue;
4078     }
4079     if (bt == T_VOID) {
4080       if (sig_vk->at(i-1)._bt == T_LONG ||
4081           sig_vk->at(i-1)._bt == T_DOUBLE) {
4082         j++;
4083       }
4084       continue;
4085     }
4086     int off = sig_vk->at(i)._offset;
4087     assert(off > 0, "offset in object should be positive");
4088     VMRegPair pair = regs->at(j);
4089     VMReg r_1 = pair.first();
4090     VMReg r_2 = pair.second();
4091     Address from(rax, off);
4092     if (bt == T_FLOAT) {
4093       __ movflt(r_1->as_XMMRegister(), from);
4094     } else if (bt == T_DOUBLE) {
4095       __ movdbl(r_1->as_XMMRegister(), from);
4096     } else if (bt == T_OBJECT || bt == T_ARRAY) {
4097       assert_different_registers(rax, r_1->as_Register());
4098       __ load_heap_oop(r_1->as_Register(), from);
4099     } else {
4100       assert(is_java_primitive(bt), "unexpected basic type");
4101       assert_different_registers(rax, r_1->as_Register());
4102       size_t size_in_bytes = type2aelembytes(bt);
4103       __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
4104     }
4105     j++;
4106   }
4107   assert(j == regs->length(), "missed a field?");
4108 
4109   __ bind(skip);
4110   __ ret(0);
4111 
4112   __ flush();
4113 
4114   return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
4115 }