1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "classfile/symbolTable.hpp"
  32 #include "code/debugInfoRec.hpp"
  33 #include "code/icBuffer.hpp"
  34 #include "code/nativeInst.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "compiler/oopMap.hpp"
  37 #include "gc/shared/collectedHeap.hpp"
  38 #include "gc/shared/gcLocker.hpp"
  39 #include "gc/shared/barrierSet.hpp"
  40 #include "gc/shared/barrierSetAssembler.hpp"
  41 #include "interpreter/interpreter.hpp"
  42 #include "logging/log.hpp"
  43 #include "memory/resourceArea.hpp"
  44 #include "memory/universe.hpp"
  45 #include "oops/compiledICHolder.hpp"
  46 #include "oops/klass.inline.hpp"
  47 #include "prims/methodHandles.hpp"
  48 #include "runtime/jniHandles.hpp"
  49 #include "runtime/safepointMechanism.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/signature.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "runtime/vframeArray.hpp"
  54 #include "runtime/vm_version.hpp"
  55 #include "utilities/align.hpp"
  56 #include "utilities/formatBuffer.hpp"
  57 #include "vmreg_x86.inline.hpp"
  58 #ifdef COMPILER1
  59 #include "c1/c1_Runtime1.hpp"
  60 #endif
  61 #ifdef COMPILER2
  62 #include "opto/runtime.hpp"
  63 #endif
  64 #if INCLUDE_JVMCI
  65 #include "jvmci/jvmciJavaClasses.hpp"
  66 #endif
  67 
  68 #define __ masm->
  69 
  70 const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
  71 
  72 class SimpleRuntimeFrame {
  73 
  74   public:
  75 
  76   // Most of the runtime stubs have this simple frame layout.
  77   // This class exists to make the layout shared in one place.
  78   // Offsets are for compiler stack slots, which are jints.
  79   enum layout {
  80     // The frame sender code expects that rbp will be in the "natural" place and
  81     // will override any oopMap setting for it. We must therefore force the layout
  82     // so that it agrees with the frame sender code.
  83     rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
  84     rbp_off2,
  85     return_off, return_off2,
  86     framesize
  87   };
  88 };
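      // Illustrative sketch (not authoritative, and assuming
      // frame::arg_reg_save_area_bytes == 0 as on System V targets): the enum
      // works out to rbp_off = 0, rbp_off2 = 1, return_off = 2, return_off2 = 3
      // and framesize = 4 compiler slots, i.e. a 16-byte frame holding only the
      // saved rbp and the return address. On targets with an argument register
      // save area, every offset is shifted up by that area's size in jint slots.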
  89 
  90 class RegisterSaver {
  91   // Capture info about frame layout.  Layout offsets are in jint
  92   // units because compiler frame slots are jints.
  93 #define XSAVE_AREA_BEGIN 160
  94 #define XSAVE_AREA_YMM_BEGIN 576
  95 #define XSAVE_AREA_OPMASK_BEGIN 1088
  96 #define XSAVE_AREA_ZMM_BEGIN 1152
  97 #define XSAVE_AREA_UPPERBANK 1664
  98 #define DEF_XMM_OFFS(regnum)       xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
  99 #define DEF_YMM_OFFS(regnum)       ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 100 #define DEF_ZMM_OFFS(regnum)       zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off
 101 #define DEF_OPMASK_OFFS(regnum)    opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt,     opmask ## regnum ## H_off
 102 #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
 103   enum layout {
 104     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
 105     xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
 106     DEF_XMM_OFFS(0),
 107     DEF_XMM_OFFS(1),
 108     // 2..15 are implied in range usage
 109     ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 110     DEF_YMM_OFFS(0),
 111     DEF_YMM_OFFS(1),
 112     // 2..15 are implied in range usage
 113     opmask_off         = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 114     DEF_OPMASK_OFFS(0),
 115     DEF_OPMASK_OFFS(1),
 116     // 2..7 are implied in range usage
 117     zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
 118     DEF_ZMM_OFFS(0),
 119     DEF_ZMM_OFFS(1),
 120     zmm_upper_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
 121     DEF_ZMM_UPPER_OFFS(16),
 122     DEF_ZMM_UPPER_OFFS(17),
 123     // 18..31 are implied in range usage
 124     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
 125     fpu_stateH_end,
 126     r15_off, r15H_off,
 127     r14_off, r14H_off,
 128     r13_off, r13H_off,
 129     r12_off, r12H_off,
 130     r11_off, r11H_off,
 131     r10_off, r10H_off,
 132     r9_off,  r9H_off,
 133     r8_off,  r8H_off,
 134     rdi_off, rdiH_off,
 135     rsi_off, rsiH_off,
 136     ignore_off, ignoreH_off,  // extra copy of rbp
 137     rsp_off, rspH_off,
 138     rbx_off, rbxH_off,
 139     rdx_off, rdxH_off,
 140     rcx_off, rcxH_off,
 141     rax_off, raxH_off,
 142     // 16-byte stack alignment fill word: see MacroAssembler::push/pop_IU_state
 143     align_off, alignH_off,
 144     flags_off, flagsH_off,
 145     // The frame sender code expects that rbp will be in the "natural" place and
 146     // will override any oopMap setting for it. We must therefore force the layout
 147     // so that it agrees with the frame sender code.
 148     rbp_off, rbpH_off,        // copy of rbp we will restore
 149     return_off, returnH_off,  // slot for return address
 150     reg_save_size             // size in compiler stack slots
 151   };
 152 
 153  public:
 154   static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors);
 155   static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
 156 
 157   // Offsets into the register save area
 158   // Used by deoptimization when it is managing result register
 159   // values on its own
 160 
 161   static int rax_offset_in_bytes(void)    { return BytesPerInt * rax_off; }
 162   static int rdx_offset_in_bytes(void)    { return BytesPerInt * rdx_off; }
 163   static int rbx_offset_in_bytes(void)    { return BytesPerInt * rbx_off; }
 164   static int xmm0_offset_in_bytes(void)   { return BytesPerInt * xmm0_off; }
 165   static int return_offset_in_bytes(void) { return BytesPerInt * return_off; }
 166 
 167   // During deoptimization only the result registers need to be restored,
 168   // all the other values have already been extracted.
 169   static void restore_result_registers(MacroAssembler* masm);
 170 };
 171 
  172 // Register is a class, but it is assigned a numerical value;
  173 // "0" is assigned for rax. Thus we need to ignore -Wnonnull.
 174 PRAGMA_DIAG_PUSH
 175 PRAGMA_NONNULL_IGNORED
 176 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
 177   int off = 0;
 178   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 179   if (UseAVX < 3) {
 180     num_xmm_regs = num_xmm_regs/2;
 181   }
 182 #if COMPILER2_OR_JVMCI
 183   if (save_vectors && UseAVX == 0) {
  184     save_vectors = false; // vectors larger than 16 bytes are supported only with AVX
 185   }
 186   assert(!save_vectors || MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 187 #else
 188   save_vectors = false; // vectors are generated only by C2 and JVMCI
 189 #endif
 190 
  191   // Always make the frame size 16-byte aligned; both vector and non-vector stacks are always allocated
 192   int frame_size_in_bytes = align_up(reg_save_size*BytesPerInt, num_xmm_regs);
 193   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
 194   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 195   // CodeBlob frame size is in words.
 196   int frame_size_in_words = frame_size_in_bytes / wordSize;
 197   *total_frame_words = frame_size_in_words;
 198 
 199   // Save registers, fpu state, and flags.
 200   // We assume caller has already pushed the return address onto the
 201   // stack, so rsp is 8-byte aligned here.
  202   // We push rbp twice in this sequence because we want the real rbp
  203   // to be under the return address, like a normal enter.
 204 
 205   __ enter();          // rsp becomes 16-byte aligned here
 206   __ push_CPU_state(); // Push a multiple of 16 bytes
 207 
  208   // push_CPU_state handles this on EVEX-enabled targets
 209   if (save_vectors) {
 210     // Save upper half of YMM registers(0..15)
 211     int base_addr = XSAVE_AREA_YMM_BEGIN;
 212     for (int n = 0; n < 16; n++) {
 213       __ vextractf128_high(Address(rsp, base_addr+n*16), as_XMMRegister(n));
 214     }
 215     if (VM_Version::supports_evex()) {
 216       // Save upper half of ZMM registers(0..15)
 217       base_addr = XSAVE_AREA_ZMM_BEGIN;
 218       for (int n = 0; n < 16; n++) {
 219         __ vextractf64x4_high(Address(rsp, base_addr+n*32), as_XMMRegister(n));
 220       }
 221       // Save full ZMM registers(16..num_xmm_regs)
 222       base_addr = XSAVE_AREA_UPPERBANK;
 223       off = 0;
 224       int vector_len = Assembler::AVX_512bit;
 225       for (int n = 16; n < num_xmm_regs; n++) {
 226         __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
 227       }
 228 #if COMPILER2_OR_JVMCI
 229       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 230       off = 0;
 231       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 232         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 233       }
 234 #endif
 235     }
 236   } else {
 237     if (VM_Version::supports_evex()) {
 238       // Save upper bank of ZMM registers(16..31) for double/float usage
 239       int base_addr = XSAVE_AREA_UPPERBANK;
 240       off = 0;
 241       for (int n = 16; n < num_xmm_regs; n++) {
 242         __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
 243       }
 244 #if COMPILER2_OR_JVMCI
 245       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 246       off = 0;
 247       for(int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 248         __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n));
 249       }
 250 #endif
 251     }
 252   }
 253   __ vzeroupper();
 254   if (frame::arg_reg_save_area_bytes != 0) {
 255     // Allocate argument register save area
 256     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 257   }
 258 
 259   // Set an oopmap for the call site.  This oopmap will map all
 260   // oop-registers and debug-info registers as callee-saved.  This
 261   // will allow deoptimization at this safepoint to find all possible
 262   // debug-info recordings, as well as let GC find all oops.
 263 
 264   OopMapSet *oop_maps = new OopMapSet();
 265   OopMap* map = new OopMap(frame_size_in_slots, 0);
 266 
 267 #define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 268 
 269   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
 270   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
 271   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
 272   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());
 273   // rbp location is known implicitly by the frame sender code, needs no oopmap
  274   // and the location where rbp was saved is ignored
 275   map->set_callee_saved(STACK_OFFSET( rsi_off ), rsi->as_VMReg());
 276   map->set_callee_saved(STACK_OFFSET( rdi_off ), rdi->as_VMReg());
 277   map->set_callee_saved(STACK_OFFSET( r8_off  ), r8->as_VMReg());
 278   map->set_callee_saved(STACK_OFFSET( r9_off  ), r9->as_VMReg());
 279   map->set_callee_saved(STACK_OFFSET( r10_off ), r10->as_VMReg());
 280   map->set_callee_saved(STACK_OFFSET( r11_off ), r11->as_VMReg());
 281   map->set_callee_saved(STACK_OFFSET( r12_off ), r12->as_VMReg());
 282   map->set_callee_saved(STACK_OFFSET( r13_off ), r13->as_VMReg());
 283   map->set_callee_saved(STACK_OFFSET( r14_off ), r14->as_VMReg());
 284   map->set_callee_saved(STACK_OFFSET( r15_off ), r15->as_VMReg());
  285   // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  286   // on EVEX-enabled targets it is included in the xsave area
 287   off = xmm0_off;
 288   int delta = xmm1_off - off;
 289   for (int n = 0; n < 16; n++) {
 290     XMMRegister xmm_name = as_XMMRegister(n);
 291     map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
 292     off += delta;
 293   }
 294   if (UseAVX > 2) {
 295     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 296     off = zmm16_off;
 297     delta = zmm17_off - off;
 298     for (int n = 16; n < num_xmm_regs; n++) {
 299       XMMRegister zmm_name = as_XMMRegister(n);
 300       map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
 301       off += delta;
 302     }
 303   }
 304 
 305 #if COMPILER2_OR_JVMCI
 306   if (save_vectors) {
 307     // Save upper half of YMM registers(0..15)
 308     off = ymm0_off;
 309     delta = ymm1_off - ymm0_off;
 310     for (int n = 0; n < 16; n++) {
 311       XMMRegister ymm_name = as_XMMRegister(n);
 312       map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
 313       off += delta;
 314     }
 315     if (VM_Version::supports_evex()) {
 316       // Save upper half of ZMM registers(0..15)
 317       off = zmm0_off;
 318       delta = zmm1_off - zmm0_off;
 319       for (int n = 0; n < 16; n++) {
 320         XMMRegister zmm_name = as_XMMRegister(n);
 321         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next(8));
 322         off += delta;
 323       }
 324     }
 325   }
 326 #endif // COMPILER2_OR_JVMCI
 327 
 328   // %%% These should all be a waste but we'll keep things as they were for now
 329   if (true) {
 330     map->set_callee_saved(STACK_OFFSET( raxH_off ), rax->as_VMReg()->next());
 331     map->set_callee_saved(STACK_OFFSET( rcxH_off ), rcx->as_VMReg()->next());
 332     map->set_callee_saved(STACK_OFFSET( rdxH_off ), rdx->as_VMReg()->next());
 333     map->set_callee_saved(STACK_OFFSET( rbxH_off ), rbx->as_VMReg()->next());
 334     // rbp location is known implicitly by the frame sender code, needs no oopmap
 335     map->set_callee_saved(STACK_OFFSET( rsiH_off ), rsi->as_VMReg()->next());
 336     map->set_callee_saved(STACK_OFFSET( rdiH_off ), rdi->as_VMReg()->next());
 337     map->set_callee_saved(STACK_OFFSET( r8H_off  ), r8->as_VMReg()->next());
 338     map->set_callee_saved(STACK_OFFSET( r9H_off  ), r9->as_VMReg()->next());
 339     map->set_callee_saved(STACK_OFFSET( r10H_off ), r10->as_VMReg()->next());
 340     map->set_callee_saved(STACK_OFFSET( r11H_off ), r11->as_VMReg()->next());
 341     map->set_callee_saved(STACK_OFFSET( r12H_off ), r12->as_VMReg()->next());
 342     map->set_callee_saved(STACK_OFFSET( r13H_off ), r13->as_VMReg()->next());
 343     map->set_callee_saved(STACK_OFFSET( r14H_off ), r14->as_VMReg()->next());
 344     map->set_callee_saved(STACK_OFFSET( r15H_off ), r15->as_VMReg()->next());
  345     // For both AVX and EVEX we will use the legacy FXSAVE area for xmm0..xmm15;
  346     // on EVEX-enabled targets it is included in the xsave area
 347     off = xmm0H_off;
 348     delta = xmm1H_off - off;
 349     for (int n = 0; n < 16; n++) {
 350       XMMRegister xmm_name = as_XMMRegister(n);
 351       map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
 352       off += delta;
 353     }
 354     if (UseAVX > 2) {
 355       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
 356       off = zmm16H_off;
 357       delta = zmm17H_off - off;
 358       for (int n = 16; n < num_xmm_regs; n++) {
 359         XMMRegister zmm_name = as_XMMRegister(n);
 360         map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
 361         off += delta;
 362       }
 363     }
 364   }
 365 
 366   return map;
 367 }
 368 PRAGMA_DIAG_POP
 369 
 370 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
 371   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
 372   if (UseAVX < 3) {
 373     num_xmm_regs = num_xmm_regs/2;
 374   }
 375   if (frame::arg_reg_save_area_bytes != 0) {
 376     // Pop arg register save area
 377     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 378   }
 379 
 380 #if COMPILER2_OR_JVMCI
 381   if (restore_vectors) {
 382     assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
 383     assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
 384   }
 385 #else
 386   assert(!restore_vectors, "vectors are generated only by C2");
 387 #endif
 388 
 389   __ vzeroupper();
 390 
 391   // On EVEX enabled targets everything is handled in pop fpu state
 392   if (restore_vectors) {
 393     // Restore upper half of YMM registers (0..15)
 394     int base_addr = XSAVE_AREA_YMM_BEGIN;
 395     for (int n = 0; n < 16; n++) {
 396       __ vinsertf128_high(as_XMMRegister(n), Address(rsp, base_addr+n*16));
 397     }
 398     if (VM_Version::supports_evex()) {
 399       // Restore upper half of ZMM registers (0..15)
 400       base_addr = XSAVE_AREA_ZMM_BEGIN;
 401       for (int n = 0; n < 16; n++) {
 402         __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, base_addr+n*32));
 403       }
 404       // Restore full ZMM registers(16..num_xmm_regs)
 405       base_addr = XSAVE_AREA_UPPERBANK;
 406       int vector_len = Assembler::AVX_512bit;
 407       int off = 0;
 408       for (int n = 16; n < num_xmm_regs; n++) {
 409         __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
 410       }
 411 #if COMPILER2_OR_JVMCI
 412       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 413       off = 0;
 414       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 415         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 416       }
 417 #endif
 418     }
 419   } else {
 420     if (VM_Version::supports_evex()) {
 421       // Restore upper bank of ZMM registers(16..31) for double/float usage
 422       int base_addr = XSAVE_AREA_UPPERBANK;
 423       int off = 0;
 424       for (int n = 16; n < num_xmm_regs; n++) {
 425         __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
 426       }
 427 #if COMPILER2_OR_JVMCI
 428       base_addr = XSAVE_AREA_OPMASK_BEGIN;
 429       off = 0;
 430       for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
 431         __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8)));
 432       }
 433 #endif
 434     }
 435   }
 436 
 437   // Recover CPU state
 438   __ pop_CPU_state();
 439   // Get the rbp described implicitly by the calling convention (no oopMap)
 440   __ pop(rbp);
 441 }
 442 
 443 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
 444 
 445   // Just restore result register. Only used by deoptimization. By
 446   // now any callee save register that needs to be restored to a c2
 447   // caller of the deoptee has been extracted into the vframeArray
 448   // and will be stuffed into the c2i adapter we create for later
  449   // restoration, so only result registers need to be restored here.
 450 
 451   // Restore fp result register
 452   __ movdbl(xmm0, Address(rsp, xmm0_offset_in_bytes()));
 453   // Restore integer result register
 454   __ movptr(rax, Address(rsp, rax_offset_in_bytes()));
 455   __ movptr(rdx, Address(rsp, rdx_offset_in_bytes()));
 456 
  457   // Pop all of the register save area off the stack except the return address
 458   __ addptr(rsp, return_offset_in_bytes());
 459 }
 460 
  461 // Is the vector's size (in bytes) bigger than the size saved by default?
  462 // 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
 463 bool SharedRuntime::is_wide_vector(int size) {
 464   return size > 16;
 465 }
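      // For instance, is_wide_vector(32) (a YMM-sized vector) returns true, while
      // is_wide_vector(16) returns false because 16-byte XMM state is already
      // covered by fxsave/fxrstor.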
 466 
 467 // ---------------------------------------------------------------------------
 468 // Read the array of BasicTypes from a signature, and compute where the
 469 // arguments should go.  Values in the VMRegPair regs array refer to 4-byte
 470 // quantities.  Values less than VMRegImpl::stack0 are registers, those above
 471 // refer to 4-byte stack slots.  All stack slots are based off of the stack pointer
 472 // as framesizes are fixed.
  473 // VMRegImpl::stack0 refers to the first slot 0(sp),
  474 // and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher.  Registers
  475 // up to RegisterImpl::number_of_registers are the 64-bit
  476 // integer registers.
 477 
 478 // Note: the INPUTS in sig_bt are in units of Java argument words, which are
 479 // either 32-bit or 64-bit depending on the build.  The OUTPUTS are in 32-bit
  480 // units regardless of build. Of course for i486 there is no 64-bit build.
 481 
 482 // The Java calling convention is a "shifted" version of the C ABI.
 483 // By skipping the first C ABI register we can call non-static jni methods
 484 // with small numbers of arguments without having to shuffle the arguments
 485 // at all. Since we control the java ABI we ought to at least get some
 486 // advantage out of it.
 487 
 488 int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
 489                                            VMRegPair *regs,
 490                                            int total_args_passed) {
 491 
 492   // Create the mapping between argument positions and
 493   // registers.
 494   static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
 495     j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5
 496   };
 497   static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
 498     j_farg0, j_farg1, j_farg2, j_farg3,
 499     j_farg4, j_farg5, j_farg6, j_farg7
 500   };
 501 
 502 
 503   uint int_args = 0;
 504   uint fp_args = 0;
 505   uint stk_args = 0; // inc by 2 each time
 506 
 507   for (int i = 0; i < total_args_passed; i++) {
 508     switch (sig_bt[i]) {
 509     case T_BOOLEAN:
 510     case T_CHAR:
 511     case T_BYTE:
 512     case T_SHORT:
 513     case T_INT:
 514       if (int_args < Argument::n_int_register_parameters_j) {
 515         regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
 516       } else {
 517         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 518         stk_args += 2;
 519       }
 520       break;
 521     case T_VOID:
 522       // halves of T_LONG or T_DOUBLE
 523       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 524       regs[i].set_bad();
 525       break;
 526     case T_LONG:
 527       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 528       // fall through
 529     case T_OBJECT:
 530     case T_ARRAY:
 531     case T_ADDRESS:
 532     case T_INLINE_TYPE:
 533       if (int_args < Argument::n_int_register_parameters_j) {
 534         regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
 535       } else {
 536         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 537         stk_args += 2;
 538       }
 539       break;
 540     case T_FLOAT:
 541       if (fp_args < Argument::n_float_register_parameters_j) {
 542         regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
 543       } else {
 544         regs[i].set1(VMRegImpl::stack2reg(stk_args));
 545         stk_args += 2;
 546       }
 547       break;
 548     case T_DOUBLE:
 549       assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
 550       if (fp_args < Argument::n_float_register_parameters_j) {
 551         regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
 552       } else {
 553         regs[i].set2(VMRegImpl::stack2reg(stk_args));
 554         stk_args += 2;
 555       }
 556       break;
 557     default:
 558       ShouldNotReachHere();
 559       break;
 560     }
 561   }
 562 
 563   return align_up(stk_args, 2);
 564 }
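      // A minimal worked example of the convention above (illustrative only):
      // for sig_bt = { T_OBJECT /*receiver*/, T_INT, T_LONG, T_VOID, T_DOUBLE,
      // T_VOID, T_FLOAT } the loop assigns j_rarg0, j_rarg1, j_rarg2 (the
      // following T_VOID half is set bad), j_farg0 (again with a bad half) and
      // j_farg1; no stack slots are used, so the function returns 0.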
 565 
 566 // Same as java_calling_convention() but for multiple return
 567 // values. There's no way to store them on the stack so if we don't
 568 // have enough registers, multiple values can't be returned.
 569 const uint SharedRuntime::java_return_convention_max_int = Argument::n_int_register_parameters_j+1;
 570 const uint SharedRuntime::java_return_convention_max_float = Argument::n_float_register_parameters_j;
 571 int SharedRuntime::java_return_convention(const BasicType *sig_bt,
 572                                           VMRegPair *regs,
 573                                           int total_args_passed) {
 574   // Create the mapping between argument positions and
 575   // registers.
 576   static const Register INT_ArgReg[java_return_convention_max_int] = {
 577     rax, j_rarg5, j_rarg4, j_rarg3, j_rarg2, j_rarg1, j_rarg0
 578   };
 579   static const XMMRegister FP_ArgReg[java_return_convention_max_float] = {
 580     j_farg0, j_farg1, j_farg2, j_farg3,
 581     j_farg4, j_farg5, j_farg6, j_farg7
 582   };
 583 
 584 
 585   uint int_args = 0;
 586   uint fp_args = 0;
 587 
 588   for (int i = 0; i < total_args_passed; i++) {
 589     switch (sig_bt[i]) {
 590     case T_BOOLEAN:
 591     case T_CHAR:
 592     case T_BYTE:
 593     case T_SHORT:
 594     case T_INT:
 595       if (int_args < Argument::n_int_register_parameters_j+1) {
 596         regs[i].set1(INT_ArgReg[int_args]->as_VMReg());
 597         int_args++;
 598       } else {
 599         return -1;
 600       }
 601       break;
 602     case T_VOID:
 603       // halves of T_LONG or T_DOUBLE
 604       assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
 605       regs[i].set_bad();
 606       break;
 607     case T_LONG:
 608       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 609       // fall through
 610     case T_OBJECT:
 611     case T_INLINE_TYPE:
 612     case T_ARRAY:
 613     case T_ADDRESS:
 614     case T_METADATA:
 615       if (int_args < Argument::n_int_register_parameters_j+1) {
 616         regs[i].set2(INT_ArgReg[int_args]->as_VMReg());
 617         int_args++;
 618       } else {
 619         return -1;
 620       }
 621       break;
 622     case T_FLOAT:
 623       if (fp_args < Argument::n_float_register_parameters_j) {
 624         regs[i].set1(FP_ArgReg[fp_args]->as_VMReg());
 625         fp_args++;
 626       } else {
 627         return -1;
 628       }
 629       break;
 630     case T_DOUBLE:
 631       assert(sig_bt[i + 1] == T_VOID, "expecting half");
 632       if (fp_args < Argument::n_float_register_parameters_j) {
 633         regs[i].set2(FP_ArgReg[fp_args]->as_VMReg());
 634         fp_args++;
 635       } else {
 636         return -1;
 637       }
 638       break;
 639     default:
 640       ShouldNotReachHere();
 641       break;
 642     }
 643   }
 644 
 645   return int_args + fp_args;
 646 }
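      // Sketch of the return convention above (illustrative): returning two ints
      // and a float uses INT_ArgReg[0] = rax, INT_ArgReg[1] = j_rarg5 and
      // FP_ArgReg[0] = j_farg0, and the function returns 3. If more values are
      // requested than there are return registers, -1 is returned and multiple
      // values cannot be returned this way (see the comment above).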
 647 
  648 // Patch the caller's callsite with the entry to compiled code if it exists.
 649 static void patch_callers_callsite(MacroAssembler *masm) {
 650   Label L;
 651   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
 652   __ jcc(Assembler::equal, L);
 653 
 654   // Save the current stack pointer
 655   __ mov(r13, rsp);
 656   // Schedule the branch target address early.
 657   // Call into the VM to patch the caller, then jump to compiled callee
 658   // rax isn't live so capture return address while we easily can
 659   __ movptr(rax, Address(rsp, 0));
 660 
 661   // align stack so push_CPU_state doesn't fault
 662   __ andptr(rsp, -(StackAlignmentInBytes));
 663   __ push_CPU_state();
 664   __ vzeroupper();
 665   // VM needs caller's callsite
 666   // VM needs target method
 667   // This needs to be a long call since we will relocate this adapter to
 668   // the codeBuffer and it may not reach
 669 
 670   // Allocate argument register save area
 671   if (frame::arg_reg_save_area_bytes != 0) {
 672     __ subptr(rsp, frame::arg_reg_save_area_bytes);
 673   }
 674   __ mov(c_rarg0, rbx);
 675   __ mov(c_rarg1, rax);
 676   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
 677 
 678   // De-allocate argument register save area
 679   if (frame::arg_reg_save_area_bytes != 0) {
 680     __ addptr(rsp, frame::arg_reg_save_area_bytes);
 681   }
 682 
 683   __ vzeroupper();
 684   __ pop_CPU_state();
 685   // restore sp
 686   __ mov(rsp, r13);
 687   __ bind(L);
 688 }
 689 
 690 // For each inline type argument, sig includes the list of fields of
 691 // the inline type. This utility function computes the number of
 692 // arguments for the call if inline types are passed by reference (the
 693 // calling convention the interpreter expects).
 694 static int compute_total_args_passed_int(const GrowableArray<SigEntry>* sig_extended) {
 695   int total_args_passed = 0;
 696   if (InlineTypePassFieldsAsArgs) {
 697     for (int i = 0; i < sig_extended->length(); i++) {
 698       BasicType bt = sig_extended->at(i)._bt;
 699       if (bt == T_INLINE_TYPE) {
 700         // In sig_extended, an inline type argument starts with:
 701         // T_INLINE_TYPE, followed by the types of the fields of the
  702         // inline type and T_VOID to mark the end of the inline
  703         // type. Inline types are flattened so, for instance, in the
 704         // case of an inline type with an int field and an inline type
 705         // field that itself has 2 fields, an int and a long:
 706         // T_INLINE_TYPE T_INT T_INLINE_TYPE T_INT T_LONG T_VOID (second
 707         // slot for the T_LONG) T_VOID (inner T_INLINE_TYPE) T_VOID
 708         // (outer T_INLINE_TYPE)
 709         total_args_passed++;
 710         int vt = 1;
 711         do {
 712           i++;
 713           BasicType bt = sig_extended->at(i)._bt;
 714           BasicType prev_bt = sig_extended->at(i-1)._bt;
 715           if (bt == T_INLINE_TYPE) {
 716             vt++;
 717           } else if (bt == T_VOID &&
 718                      prev_bt != T_LONG &&
 719                      prev_bt != T_DOUBLE) {
 720             vt--;
 721           }
 722         } while (vt != 0);
 723       } else {
 724         total_args_passed++;
 725       }
 726     }
 727   } else {
 728     total_args_passed = sig_extended->length();
 729   }
 730   return total_args_passed;
 731 }
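      // Worked example (assumed sig_extended shape, matching the comment above):
      // for a method taking an inline type with an int field and a long field,
      // plus a plain int, sig_extended is
      //   T_INLINE_TYPE T_INT T_LONG T_VOID T_VOID T_INT
      // (the first T_VOID is the long's second slot, the second one closes the
      // inline type). With InlineTypePassFieldsAsArgs this returns 2 (one
      // reference for the buffered inline type plus the int); otherwise it is
      // simply sig_extended->length() == 6.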
 732 
 733 
 734 static void gen_c2i_adapter_helper(MacroAssembler* masm,
 735                                    BasicType bt,
 736                                    BasicType prev_bt,
 737                                    size_t size_in_bytes,
 738                                    const VMRegPair& reg_pair,
 739                                    const Address& to,
 740                                    int extraspace,
 741                                    bool is_oop) {
 742   assert(bt != T_INLINE_TYPE || !InlineTypePassFieldsAsArgs, "no inline type here");
 743   if (bt == T_VOID) {
 744     assert(prev_bt == T_LONG || prev_bt == T_DOUBLE, "missing half");
 745     return;
 746   }
 747 
 748   // Say 4 args:
 749   // i   st_off
 750   // 0   32 T_LONG
 751   // 1   24 T_VOID
 752   // 2   16 T_OBJECT
 753   // 3    8 T_BOOL
 754   // -    0 return address
 755   //
  756   // However, to make things extra confusing: because we can fit a long/double in
  757   // a single slot on a 64-bit VM and it would be silly to break them up, the interpreter
  758   // leaves one slot empty and only stores to a single slot. In this case the
  759   // slot that is occupied is the T_VOID slot. See, I said it was confusing.
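        //
        // Concretely, for the table above that means the T_LONG value ends up at
        // st_off 24 (its T_VOID slot) while slot 32 is only filled with known junk
        // under ASSERT (see gen_c2i_adapter below).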
 760 
 761   bool wide = (size_in_bytes == wordSize);
 762   VMReg r_1 = reg_pair.first();
 763   VMReg r_2 = reg_pair.second();
 764   assert(r_2->is_valid() == wide, "invalid size");
 765   if (!r_1->is_valid()) {
 766     assert(!r_2->is_valid(), "must be invalid");
 767     return;
 768   }
 769 
 770   if (!r_1->is_XMMRegister()) {
 771     Register val = rax;
 772     if (r_1->is_stack()) {
 773       int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;
 774       __ load_sized_value(val, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
 775     } else {
 776       val = r_1->as_Register();
 777     }
 778     assert_different_registers(to.base(), val, rscratch1);
 779     if (is_oop) {
 780       __ push(r13);
 781       __ push(rbx);
 782       __ store_heap_oop(to, val, rscratch1, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
 783       __ pop(rbx);
 784       __ pop(r13);
 785     } else {
 786       __ store_sized_value(to, val, size_in_bytes);
 787     }
 788   } else {
 789     if (wide) {
 790       __ movdbl(to, r_1->as_XMMRegister());
 791     } else {
 792       __ movflt(to, r_1->as_XMMRegister());
 793     }
 794   }
 795 }
 796 
 797 static void gen_c2i_adapter(MacroAssembler *masm,
 798                             const GrowableArray<SigEntry>* sig_extended,
 799                             const VMRegPair *regs,
 800                             Label& skip_fixup,
 801                             address start,
 802                             OopMapSet* oop_maps,
 803                             int& frame_complete,
 804                             int& frame_size_in_words,
 805                             bool alloc_inline_receiver) {
 806   // Before we get into the guts of the C2I adapter, see if we should be here
 807   // at all.  We've come from compiled code and are attempting to jump to the
 808   // interpreter, which means the caller made a static call to get here
 809   // (vcalls always get a compiled target if there is one).  Check for a
 810   // compiled target.  If there is one, we need to patch the caller's call.
 811   patch_callers_callsite(masm);
 812 
 813   __ bind(skip_fixup);
 814 
 815   if (InlineTypePassFieldsAsArgs) {
 816     // Is there an inline type argument?
 817     bool has_inline_argument = false;
 818     for (int i = 0; i < sig_extended->length() && !has_inline_argument; i++) {
 819       has_inline_argument = (sig_extended->at(i)._bt == T_INLINE_TYPE);
 820     }
 821     if (has_inline_argument) {
  822       // There is at least one inline type argument: we're coming from
 823       // compiled code so we have no buffers to back the inline types.
 824       // Allocate the buffers here with a runtime call.
 825       OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
 826 
 827       frame_complete = __ offset();
 828 
 829       __ set_last_Java_frame(noreg, noreg, NULL);
 830 
 831       __ mov(c_rarg0, r15_thread);
 832       __ mov(c_rarg1, rbx);
 833       __ mov64(c_rarg2, (int64_t)alloc_inline_receiver);
 834       __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::allocate_inline_types)));
 835 
 836       oop_maps->add_gc_map((int)(__ pc() - start), map);
 837       __ reset_last_Java_frame(false);
 838 
 839       RegisterSaver::restore_live_registers(masm);
 840 
 841       Label no_exception;
 842       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 843       __ jcc(Assembler::equal, no_exception);
 844 
 845       __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
 846       __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 847       __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 848 
 849       __ bind(no_exception);
 850 
 851       // We get an array of objects from the runtime call
 852       __ get_vm_result(rscratch2, r15_thread); // Use rscratch2 (r11) as temporary because rscratch1 (r10) is trashed by movptr()
 853       __ get_vm_result_2(rbx, r15_thread); // TODO: required to keep the callee Method live?
 854     }
 855   }
 856 
 857   // Since all args are passed on the stack, total_args_passed *
  858   // Interpreter::stackElementSize is the space we need. Plus one slot because
  859   // we also account for the return address location, since
  860   // we store it first rather than hold it in rax across all the shuffling.
 861   int total_args_passed = compute_total_args_passed_int(sig_extended);
 862   int extraspace = (total_args_passed * Interpreter::stackElementSize) + wordSize;
 863 
 864   // stack is aligned, keep it that way
 865   extraspace = align_up(extraspace, 2*wordSize);
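        // For instance (assuming Interpreter::stackElementSize == wordSize == 8
        // here), three interpreter arguments give extraspace = 3*8 + 8 = 32 bytes,
        // already a multiple of 2*wordSize, so the align_up above adds no padding.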
 866 
 867   // Get return address
 868   __ pop(rax);
 869 
 870   // set senderSP value
 871   __ mov(r13, rsp);
 872 
 873   __ subptr(rsp, extraspace);
 874 
 875   // Store the return address in the expected location
 876   __ movptr(Address(rsp, 0), rax);
 877 
 878   // Now write the args into the outgoing interpreter space
 879 
 880   // next_arg_comp is the next argument from the compiler point of
 881   // view (inline type fields are passed in registers/on the stack). In
 882   // sig_extended, an inline type argument starts with: T_INLINE_TYPE,
 883   // followed by the types of the fields of the inline type and T_VOID
 884   // to mark the end of the inline type. ignored counts the number of
 885   // T_INLINE_TYPE/T_VOID. next_vt_arg is the next inline type argument:
 886   // used to get the buffer for that argument from the pool of buffers
 887   // we allocated above and want to pass to the
 888   // interpreter. next_arg_int is the next argument from the
 889   // interpreter point of view (inline types are passed by reference).
 890   for (int next_arg_comp = 0, ignored = 0, next_vt_arg = 0, next_arg_int = 0;
 891        next_arg_comp < sig_extended->length(); next_arg_comp++) {
 892     assert(ignored <= next_arg_comp, "shouldn't skip over more slots than there are arguments");
 893     assert(next_arg_int <= total_args_passed, "more arguments for the interpreter than expected?");
 894     BasicType bt = sig_extended->at(next_arg_comp)._bt;
 895     int st_off = (total_args_passed - next_arg_int) * Interpreter::stackElementSize;
 896     if (!InlineTypePassFieldsAsArgs || bt != T_INLINE_TYPE) {
 897       int next_off = st_off - Interpreter::stackElementSize;
 898       const int offset = (bt == T_LONG || bt == T_DOUBLE) ? next_off : st_off;
 899       const VMRegPair reg_pair = regs[next_arg_comp-ignored];
 900       size_t size_in_bytes = reg_pair.second()->is_valid() ? 8 : 4;
 901       gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 902                              size_in_bytes, reg_pair, Address(rsp, offset), extraspace, false);
 903       next_arg_int++;
 904 #ifdef ASSERT
 905       if (bt == T_LONG || bt == T_DOUBLE) {
 906         // Overwrite the unused slot with known junk
 907         __ mov64(rax, CONST64(0xdeadffffdeadaaaa));
 908         __ movptr(Address(rsp, st_off), rax);
 909       }
 910 #endif /* ASSERT */
 911     } else {
 912       ignored++;
 913       // get the buffer from the just allocated pool of buffers
 914       int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + next_vt_arg * type2aelembytes(T_INLINE_TYPE);
 915       __ load_heap_oop(r14, Address(rscratch2, index));
 916       next_vt_arg++; next_arg_int++;
 917       int vt = 1;
 918       // write fields we get from compiled code in registers/stack
 919       // slots to the buffer: we know we are done with that inline type
 920       // argument when we hit the T_VOID that acts as an end of inline
 921       // type delimiter for this inline type. Inline types are flattened
 922       // so we might encounter embedded inline types. Each entry in
 923       // sig_extended contains a field offset in the buffer.
 924       do {
 925         next_arg_comp++;
 926         BasicType bt = sig_extended->at(next_arg_comp)._bt;
 927         BasicType prev_bt = sig_extended->at(next_arg_comp-1)._bt;
 928         if (bt == T_INLINE_TYPE) {
 929           vt++;
 930           ignored++;
 931         } else if (bt == T_VOID &&
 932                    prev_bt != T_LONG &&
 933                    prev_bt != T_DOUBLE) {
 934           vt--;
 935           ignored++;
 936         } else {
 937           int off = sig_extended->at(next_arg_comp)._offset;
 938           assert(off > 0, "offset in object should be positive");
 939           size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
 940           bool is_oop = is_reference_type(bt);
 941           gen_c2i_adapter_helper(masm, bt, next_arg_comp > 0 ? sig_extended->at(next_arg_comp-1)._bt : T_ILLEGAL,
 942                                  size_in_bytes, regs[next_arg_comp-ignored], Address(r14, off), extraspace, is_oop);
 943         }
 944       } while (vt != 0);
 945       // pass the buffer to the interpreter
 946       __ movptr(Address(rsp, st_off), r14);
 947     }
 948   }
 949 
 950   // Schedule the branch target address early.
 951   __ movptr(rcx, Address(rbx, in_bytes(Method::interpreter_entry_offset())));
 952   __ jmp(rcx);
 953 }
 954 
 955 static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg,
 956                         address code_start, address code_end,
 957                         Label& L_ok) {
 958   Label L_fail;
 959   __ lea(temp_reg, ExternalAddress(code_start));
 960   __ cmpptr(pc_reg, temp_reg);
 961   __ jcc(Assembler::belowEqual, L_fail);
 962   __ lea(temp_reg, ExternalAddress(code_end));
 963   __ cmpptr(pc_reg, temp_reg);
 964   __ jcc(Assembler::below, L_ok);
 965   __ bind(L_fail);
 966 }
 967 
 968 void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
 969                                     int comp_args_on_stack,
 970                                     const GrowableArray<SigEntry>* sig,
 971                                     const VMRegPair *regs) {
 972 
  973   // Note: r13 contains the senderSP on entry. We must preserve it since
  974   // we may do an i2c -> c2i transition if we lose a race where compiled
  975   // code goes non-entrant while we get args ready.
  976   // In addition we use r13 to locate all the interpreter args, as
  977   // we must align the stack to 16 bytes on an i2c entry; otherwise we
  978   // lose the alignment we expect in all compiled code, and register
  979   // save code can segv when fxsave instructions find an improperly
  980   // aligned stack pointer.
 981 
 982   // Adapters can be frameless because they do not require the caller
 983   // to perform additional cleanup work, such as correcting the stack pointer.
 984   // An i2c adapter is frameless because the *caller* frame, which is interpreted,
 985   // routinely repairs its own stack pointer (from interpreter_frame_last_sp),
 986   // even if a callee has modified the stack pointer.
 987   // A c2i adapter is frameless because the *callee* frame, which is interpreted,
 988   // routinely repairs its caller's stack pointer (from sender_sp, which is set
 989   // up via the senderSP register).
 990   // In other words, if *either* the caller or callee is interpreted, we can
 991   // get the stack pointer repaired after a call.
 992   // This is why c2i and i2c adapters cannot be indefinitely composed.
 993   // In particular, if a c2i adapter were to somehow call an i2c adapter,
 994   // both caller and callee would be compiled methods, and neither would
 995   // clean up the stack pointer changes performed by the two adapters.
 996   // If this happens, control eventually transfers back to the compiled
 997   // caller, but with an uncorrected stack, causing delayed havoc.
 998 
 999   // Pick up the return address
1000   __ movptr(rax, Address(rsp, 0));
1001 
1002   if (VerifyAdapterCalls &&
1003       (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
1004     // So, let's test for cascading c2i/i2c adapters right now.
1005     //  assert(Interpreter::contains($return_addr) ||
1006     //         StubRoutines::contains($return_addr),
1007     //         "i2c adapter must return to an interpreter frame");
1008     __ block_comment("verify_i2c { ");
1009     Label L_ok;
1010     if (Interpreter::code() != NULL)
1011       range_check(masm, rax, r11,
1012                   Interpreter::code()->code_start(), Interpreter::code()->code_end(),
1013                   L_ok);
1014     if (StubRoutines::code1() != NULL)
1015       range_check(masm, rax, r11,
1016                   StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
1017                   L_ok);
1018     if (StubRoutines::code2() != NULL)
1019       range_check(masm, rax, r11,
1020                   StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
1021                   L_ok);
1022     const char* msg = "i2c adapter must return to an interpreter frame";
1023     __ block_comment(msg);
1024     __ stop(msg);
1025     __ bind(L_ok);
 1026     __ block_comment("} verify_i2c ");
1027   }
1028 
1029   // Must preserve original SP for loading incoming arguments because
1030   // we need to align the outgoing SP for compiled code.
1031   __ movptr(r11, rsp);
1032 
 1033   // Cut-out for having no stack args.  Since up to 6 int/oop args are passed
 1034   // in registers, we will occasionally have no stack args.
1035   int comp_words_on_stack = 0;
1036   if (comp_args_on_stack) {
1037     // Sig words on the stack are greater-than VMRegImpl::stack0.  Those in
1038     // registers are below.  By subtracting stack0, we either get a negative
1039     // number (all values in registers) or the maximum stack slot accessed.
1040 
1041     // Convert 4-byte c2 stack slots to words.
1042     comp_words_on_stack = align_up(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
 1043     // Round up to minimum stack alignment, in wordSize units
1044     comp_words_on_stack = align_up(comp_words_on_stack, 2);
1045     __ subptr(rsp, comp_words_on_stack * wordSize);
1046   }
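        // Rough example (illustrative): comp_args_on_stack == 3 VMReg slots is 12
        // bytes, rounded up to 2 words and then to the 2-word stack alignment, so
        // 16 bytes are reserved below the original SP for outgoing stack args.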
1047 
1048 
1049   // Ensure compiled code always sees stack at proper alignment
1050   __ andptr(rsp, -16);
1051 
 1052   // push the return address and misalign the stack so that the youngest frame sees
 1053   // the same layout a call instruction would have produced
1054   __ push(rax);
1055 
1056   // Put saved SP in another register
1057   const Register saved_sp = rax;
1058   __ movptr(saved_sp, r11);
1059 
1060   // Will jump to the compiled code just as if compiled code was doing it.
1061   // Pre-load the register-jump target early, to schedule it better.
1062   __ movptr(r11, Address(rbx, in_bytes(Method::from_compiled_inline_offset())));
1063 
1064 #if INCLUDE_JVMCI
1065   if (EnableJVMCI) {
1066     // check if this call should be routed towards a specific entry point
1067     __ cmpptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1068     Label no_alternative_target;
1069     __ jcc(Assembler::equal, no_alternative_target);
1070     __ movptr(r11, Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())));
1071     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_alternate_call_target_offset())), 0);
1072     __ bind(no_alternative_target);
1073   }
1074 #endif // INCLUDE_JVMCI
1075 
1076   int total_args_passed = sig->length();
1077 
1078   // Now generate the shuffle code.  Pick up all register args and move the
1079   // rest through the floating point stack top.
1080   for (int i = 0; i < total_args_passed; i++) {
1081     BasicType bt = sig->at(i)._bt;
1082     assert(bt != T_INLINE_TYPE, "i2c adapter doesn't unpack inline type args");
1083     if (bt == T_VOID) {
1084       // Longs and doubles are passed in native word order, but misaligned
1085       // in the 32-bit build.
1086       BasicType prev_bt = (i > 0) ? sig->at(i-1)._bt : T_ILLEGAL;
1087       assert(i > 0 && (prev_bt == T_LONG || prev_bt == T_DOUBLE), "missing half");
1088       continue;
1089     }
1090 
1091     // Pick up 0, 1 or 2 words from SP+offset.
1092 
1093     assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
1094             "scrambled load targets?");
1095     // Load in argument order going down.
1096     int ld_off = (total_args_passed - i)*Interpreter::stackElementSize;
1097     // Point to interpreter value (vs. tag)
1098     int next_off = ld_off - Interpreter::stackElementSize;
1099     //
1100     //
1101     //
1102     VMReg r_1 = regs[i].first();
1103     VMReg r_2 = regs[i].second();
1104     if (!r_1->is_valid()) {
1105       assert(!r_2->is_valid(), "");
1106       continue;
1107     }
1108     if (r_1->is_stack()) {
1109       // Convert stack slot to an SP offset (+ wordSize to account for return address )
1110       int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size + wordSize;
1111 
1112       // We can use r13 as a temp here because compiled code doesn't need r13 as an input
 1113       // and if we end up going through a c2i because of a miss, a reasonable value of r13
 1114       // will be generated.
1115       if (!r_2->is_valid()) {
1116         // sign extend???
1117         __ movl(r13, Address(saved_sp, ld_off));
1118         __ movptr(Address(rsp, st_off), r13);
1119       } else {
1120         //
 1121         // We are using two optoregs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
 1122         // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
 1123         // so we must adjust where to pick up the data to match the interpreter.
1124         //
 1125         // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
 1126         // are accessed at negative offsets, so the LSW is at the LOW address
1127 
1128         // ld_off is MSW so get LSW
1129         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1130                            next_off : ld_off;
1131         __ movq(r13, Address(saved_sp, offset));
1132         // st_off is LSW (i.e. reg.first())
1133         __ movq(Address(rsp, st_off), r13);
1134       }
1135     } else if (r_1->is_Register()) {  // Register argument
1136       Register r = r_1->as_Register();
1137       assert(r != rax, "must be different");
1138       if (r_2->is_valid()) {
1139         //
 1140         // We are using two VMRegs. This can be either T_OBJECT, T_ADDRESS, T_LONG, or T_DOUBLE.
 1141         // The interpreter allocates two slots but only uses one for the T_LONG or T_DOUBLE case,
 1142         // so we must adjust where to pick up the data to match the interpreter.
1143 
1144         const int offset = (bt==T_LONG||bt==T_DOUBLE)?
1145                            next_off : ld_off;
1146 
1147         // this can be a misaligned move
1148         __ movq(r, Address(saved_sp, offset));
1149       } else {
1150         // sign extend and use a full word?
1151         __ movl(r, Address(saved_sp, ld_off));
1152       }
1153     } else {
1154       if (!r_2->is_valid()) {
1155         __ movflt(r_1->as_XMMRegister(), Address(saved_sp, ld_off));
1156       } else {
1157         __ movdbl(r_1->as_XMMRegister(), Address(saved_sp, next_off));
1158       }
1159     }
1160   }
1161 
 1162   // 6243940 We might end up in handle_wrong_method if
 1163   // the callee is deoptimized as we race through here. If that
 1164   // happens we don't want to take a safepoint because the
 1165   // caller frame will look interpreted and arguments are now
 1166   // "compiled", so it is much better to make this transition
 1167   // invisible to the stack walking code. Unfortunately, if
 1168   // we try to find the callee by normal means a safepoint
 1169   // is possible. So we stash the desired callee in the thread
 1170   // and the VM will find it there should this case occur.
1171 
1172   __ movptr(Address(r15_thread, JavaThread::callee_target_offset()), rbx);
1173 
 1174   // Put the Method* where a c2i would expect it should we end up there.
 1175   // This is only needed because c2 resolve stubs return the Method* as a result in
 1176   // rax.
1177   __ mov(rax, rbx);
1178   __ jmp(r11);
1179 }
1180 
1181 static void gen_inline_cache_check(MacroAssembler *masm, Label& skip_fixup) {
1182   Label ok;
1183 
1184   Register holder = rax;
1185   Register receiver = j_rarg0;
1186   Register temp = rbx;
1187 
1188   __ load_klass(temp, receiver, rscratch1);
1189   __ cmpptr(temp, Address(holder, CompiledICHolder::holder_klass_offset()));
1190   __ movptr(rbx, Address(holder, CompiledICHolder::holder_metadata_offset()));
1191   __ jcc(Assembler::equal, ok);
1192   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1193 
1194   __ bind(ok);
1195   // Method might have been compiled since the call site was patched to
 1196   // interpreted; if that is the case, treat it as a miss so we can get
1197   // the call site corrected.
1198   __ cmpptr(Address(rbx, in_bytes(Method::code_offset())), (int32_t)NULL_WORD);
1199   __ jcc(Assembler::equal, skip_fixup);
1200   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
1201 }
1202 
1203 // ---------------------------------------------------------------
1204 AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler* masm,
1205                                                             int comp_args_on_stack,
1206                                                             const GrowableArray<SigEntry>* sig,
1207                                                             const VMRegPair* regs,
1208                                                             const GrowableArray<SigEntry>* sig_cc,
1209                                                             const VMRegPair* regs_cc,
1210                                                             const GrowableArray<SigEntry>* sig_cc_ro,
1211                                                             const VMRegPair* regs_cc_ro,
1212                                                             AdapterFingerPrint* fingerprint,
1213                                                             AdapterBlob*& new_adapter,
1214                                                             bool allocate_code_blob) {
1215   address i2c_entry = __ pc();
1216   gen_i2c_adapter(masm, comp_args_on_stack, sig, regs);
1217 
1218   // -------------------------------------------------------------------------
1219   // Generate a C2I adapter.  On entry we know rbx holds the Method* during calls
1220   // to the interpreter.  The args start out packed in the compiled layout.  They
1221   // need to be unpacked into the interpreter layout.  This will almost always
1222   // require some stack space.  We grow the current (compiled) stack, then repack
 1223   // the args.  We finally end in a jump to the generic interpreter entry point.
 1224   // On exit from the interpreter, the interpreter will restore our SP (lest the
 1225   // compiled code, which relies solely on SP and not RBP, get sick).
1226 
1227   address c2i_unverified_entry = __ pc();
1228   Label skip_fixup;
1229 
1230   gen_inline_cache_check(masm, skip_fixup);
1231 
1232   OopMapSet* oop_maps = new OopMapSet();
1233   int frame_complete = CodeOffsets::frame_never_safe;
1234   int frame_size_in_words = 0;
1235 
1236   // Scalarized c2i adapter with non-scalarized receiver (i.e., don't pack receiver)
1237   address c2i_inline_ro_entry = __ pc();
1238   if (regs_cc != regs_cc_ro) {
1239     gen_c2i_adapter(masm, sig_cc_ro, regs_cc_ro, skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, false);
1240     skip_fixup.reset();
1241   }
1242 
1243   // Scalarized c2i adapter
1244   address c2i_entry = __ pc();
1245 
1246   // Class initialization barrier for static methods
1247   address c2i_no_clinit_check_entry = NULL;
1248   if (VM_Version::supports_fast_class_init_checks()) {
1249     Label L_skip_barrier;
1250     Register method = rbx;
1251 
1252     { // Bypass the barrier for non-static methods
1253       Register flags  = rscratch1;
1254       __ movl(flags, Address(method, Method::access_flags_offset()));
1255       __ testl(flags, JVM_ACC_STATIC);
1256       __ jcc(Assembler::zero, L_skip_barrier); // non-static
1257     }
1258 
1259     Register klass = rscratch1;
1260     __ load_method_holder(klass, method);
1261     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
1262 
1263     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
1264 
1265     __ bind(L_skip_barrier);
1266     c2i_no_clinit_check_entry = __ pc();
1267   }
1268 
1269   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1270   bs->c2i_entry_barrier(masm);
1271 
1272   gen_c2i_adapter(masm, sig_cc, regs_cc, skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, true);
1273 
1274   address c2i_unverified_inline_entry = c2i_unverified_entry;
1275 
1276   // Non-scalarized c2i adapter
1277   address c2i_inline_entry = c2i_entry;
1278   if (regs != regs_cc) {
1279     Label inline_entry_skip_fixup;
1280     c2i_unverified_inline_entry = __ pc();
1281     gen_inline_cache_check(masm, inline_entry_skip_fixup);
1282 
1283     c2i_inline_entry = __ pc();
1284     gen_c2i_adapter(masm, sig, regs, inline_entry_skip_fixup, i2c_entry, oop_maps, frame_complete, frame_size_in_words, false);
1285   }
1286 
1287   __ flush();
1288 
1289   // The c2i adapters might safepoint and trigger a GC. The caller must make sure that
1290   // the GC knows about the locations of the oop arguments passed to the c2i adapter.
1291   if (allocate_code_blob) {
1292     bool caller_must_gc_arguments = (regs != regs_cc);
1293     new_adapter = AdapterBlob::create(masm->code(), frame_complete, frame_size_in_words, oop_maps, caller_must_gc_arguments);
1294   }
1295 
1296   return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_inline_entry, c2i_inline_ro_entry, c2i_unverified_entry, c2i_unverified_inline_entry, c2i_no_clinit_check_entry);
1297 }
1298 
1299 int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
1300                                          VMRegPair *regs,
1301                                          VMRegPair *regs2,
1302                                          int total_args_passed) {
1303   assert(regs2 == NULL, "not needed on x86");
1304 // We return the amount of VMRegImpl stack slots we need to reserve for all
1305 // the arguments NOT counting out_preserve_stack_slots.
1306 
1307 // NOTE: These arrays will have to change when c1 is ported
1308 #ifdef _WIN64
1309     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1310       c_rarg0, c_rarg1, c_rarg2, c_rarg3
1311     };
1312     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1313       c_farg0, c_farg1, c_farg2, c_farg3
1314     };
1315 #else
1316     static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
1317       c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5
1318     };
1319     static const XMMRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
1320       c_farg0, c_farg1, c_farg2, c_farg3,
1321       c_farg4, c_farg5, c_farg6, c_farg7
1322     };
1323 #endif // _WIN64
1324 
1325 
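         // Illustrative example (comment only, SysV path, hypothetical signature):
         // for sig_bt = { T_ADDRESS, T_OBJECT, T_INT, T_DOUBLE, T_VOID, T_LONG, T_VOID }
         // the loop below hands out c_rarg0..c_rarg3 for the address/object/int/long
         // entries and c_farg0 for the double, marks the T_VOID halves bad, and
         // returns stk_args == 0 since nothing spills to the stack.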
1326     uint int_args = 0;
1327     uint fp_args = 0;
1328     uint stk_args = 0; // inc by 2 each time
1329 
1330     for (int i = 0; i < total_args_passed; i++) {
1331       switch (sig_bt[i]) {
1332       case T_BOOLEAN:
1333       case T_CHAR:
1334       case T_BYTE:
1335       case T_SHORT:
1336       case T_INT:
1337         if (int_args < Argument::n_int_register_parameters_c) {
1338           regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
1339 #ifdef _WIN64
1340           fp_args++;
1341           // Allocate slots for the callee to stuff register args on the stack.
1342           stk_args += 2;
1343 #endif
1344         } else {
1345           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1346           stk_args += 2;
1347         }
1348         break;
1349       case T_LONG:
1350         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1351         // fall through
1352       case T_OBJECT:
1353       case T_ARRAY:
1354       case T_INLINE_TYPE:
1355       case T_ADDRESS:
1356       case T_METADATA:
1357         if (int_args < Argument::n_int_register_parameters_c) {
1358           regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
1359 #ifdef _WIN64
1360           fp_args++;
1361           stk_args += 2;
1362 #endif
1363         } else {
1364           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1365           stk_args += 2;
1366         }
1367         break;
1368       case T_FLOAT:
1369         if (fp_args < Argument::n_float_register_parameters_c) {
1370           regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
1371 #ifdef _WIN64
1372           int_args++;
1373           // Allocate slots for the callee to stuff register args on the stack.
1374           stk_args += 2;
1375 #endif
1376         } else {
1377           regs[i].set1(VMRegImpl::stack2reg(stk_args));
1378           stk_args += 2;
1379         }
1380         break;
1381       case T_DOUBLE:
1382         assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half");
1383         if (fp_args < Argument::n_float_register_parameters_c) {
1384           regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
1385 #ifdef _WIN64
1386           int_args++;
1387           // Allocate slots for the callee to stuff register args on the stack.
1388           stk_args += 2;
1389 #endif
1390         } else {
1391           regs[i].set2(VMRegImpl::stack2reg(stk_args));
1392           stk_args += 2;
1393         }
1394         break;
1395       case T_VOID: // Halves of longs and doubles
1396         assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
1397         regs[i].set_bad();
1398         break;
1399       default:
1400         ShouldNotReachHere();
1401         break;
1402       }
1403     }
1404 #ifdef _WIN64
1405   // The Windows ABI requires that we always allocate enough stack space
1406   // for 4 64-bit registers to be stored down.
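       // For example (comment only): even a native taking a single int argument ends up
       // with stk_args == 8 here (8 VMRegImpl slots == 32 bytes), i.e. the Win64 "home"
       // (shadow) area in which the callee may spill the four register arguments.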
1407   if (stk_args < 8) {
1408     stk_args = 8;
1409   }
1410 #endif // _WIN64
1411 
1412   return stk_args;
1413 }
1414 
1415 int SharedRuntime::vector_calling_convention(VMRegPair *regs,
1416                                              uint num_bits,
1417                                              uint total_args_passed) {
1418   assert(num_bits == 64 || num_bits == 128 || num_bits == 256 || num_bits == 512,
1419          "only certain vector sizes are supported for now");
1420 
1421   static const XMMRegister VEC_ArgReg[32] = {
1422      xmm0,  xmm1,  xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7,
1423      xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15,
1424     xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm22, xmm23,
1425     xmm24, xmm25, xmm26, xmm27, xmm28, xmm29, xmm30, xmm31
1426   };
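       // For example (comment only): with num_bits == 256, argument i is passed in xmm(i)
       // and its VMRegPair spans that register's 8 consecutive 32-bit slots (next(7));
       // no vector arguments are passed on the stack, so 0 is returned.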
1427 
1428   uint stk_args = 0;
1429   uint fp_args = 0;
1430 
1431   for (uint i = 0; i < total_args_passed; i++) {
1432     VMReg vmreg = VEC_ArgReg[fp_args++]->as_VMReg();
1433     int next_val = num_bits == 64 ? 1 : (num_bits == 128 ? 3 : (num_bits  == 256 ? 7 : 15));
1434     regs[i].set_pair(vmreg->next(next_val), vmreg);
1435   }
1436 
1437   return stk_args;
1438 }
1439 
1440 void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1441   // We always ignore the frame_slots arg and just use the space just below the frame
1442   // pointer, which by this time is free to use.
1443   switch (ret_type) {
1444   case T_FLOAT:
1445     __ movflt(Address(rbp, -wordSize), xmm0);
1446     break;
1447   case T_DOUBLE:
1448     __ movdbl(Address(rbp, -wordSize), xmm0);
1449     break;
1450   case T_VOID:  break;
1451   default: {
1452     __ movptr(Address(rbp, -wordSize), rax);
1453     }
1454   }
1455 }
1456 
1457 void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
1458   // We always ignore the frame_slots arg and just use the space just below the frame
1459   // pointer, which by this time is free to use.
1460   switch (ret_type) {
1461   case T_FLOAT:
1462     __ movflt(xmm0, Address(rbp, -wordSize));
1463     break;
1464   case T_DOUBLE:
1465     __ movdbl(xmm0, Address(rbp, -wordSize));
1466     break;
1467   case T_VOID:  break;
1468   default: {
1469     __ movptr(rax, Address(rbp, -wordSize));
1470     }
1471   }
1472 }
1473 
1474 static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1475     for ( int i = first_arg ; i < arg_count ; i++ ) {
1476       if (args[i].first()->is_Register()) {
1477         __ push(args[i].first()->as_Register());
1478       } else if (args[i].first()->is_XMMRegister()) {
1479         __ subptr(rsp, 2*wordSize);
1480         __ movdbl(Address(rsp, 0), args[i].first()->as_XMMRegister());
1481       }
1482     }
1483 }
1484 
1485 static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
1486     for ( int i = arg_count - 1 ; i >= first_arg ; i-- ) {
1487       if (args[i].first()->is_Register()) {
1488         __ pop(args[i].first()->as_Register());
1489       } else if (args[i].first()->is_XMMRegister()) {
1490         __ movdbl(args[i].first()->as_XMMRegister(), Address(rsp, 0));
1491         __ addptr(rsp, 2*wordSize);
1492       }
1493     }
1494 }
1495 
1496 // Unpack an array argument into a pointer to the body and the length
1497 // if the array is non-null, otherwise pass 0 for both.
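     // In effect (sketch of what the code below emits): a non-null array argument is
     // rewritten into the (length, body) pair expected by the critical native, where
     // body = array + arrayOopDesc::base_offset_in_bytes(in_elem_type) and length is
     // read from the array header; a null array is passed as (0, NULL).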
1498 static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) {
1499   Register tmp_reg = rax;
1500   assert(!body_arg.first()->is_Register() || body_arg.first()->as_Register() != tmp_reg,
1501          "possible collision");
1502   assert(!length_arg.first()->is_Register() || length_arg.first()->as_Register() != tmp_reg,
1503          "possible collision");
1504 
1505   __ block_comment("unpack_array_argument {");
1506 
1507   // Pass the length, ptr pair
1508   Label is_null, done;
1509   VMRegPair tmp;
1510   tmp.set_ptr(tmp_reg->as_VMReg());
1511   if (reg.first()->is_stack()) {
1512     // Load the arg up from the stack
1513     __ move_ptr(reg, tmp);
1514     reg = tmp;
1515   }
1516   __ testptr(reg.first()->as_Register(), reg.first()->as_Register());
1517   __ jccb(Assembler::equal, is_null);
1518   __ lea(tmp_reg, Address(reg.first()->as_Register(), arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1519   __ move_ptr(tmp, body_arg);
1520   // load the length relative to the body.
1521   __ movl(tmp_reg, Address(tmp_reg, arrayOopDesc::length_offset_in_bytes() -
1522                            arrayOopDesc::base_offset_in_bytes(in_elem_type)));
1523   __ move32_64(tmp, length_arg);
1524   __ jmpb(done);
1525   __ bind(is_null);
1526   // Pass zeros
1527   __ xorptr(tmp_reg, tmp_reg);
1528   __ move_ptr(tmp, body_arg);
1529   __ move32_64(tmp, length_arg);
1530   __ bind(done);
1531 
1532   __ block_comment("} unpack_array_argument");
1533 }
1534 
1535 
1536 // Different signatures may require very different orders for the move
1537 // to avoid clobbering other arguments.  There's no simple way to
1538 // order them safely.  Compute a safe order for issuing stores and
1539 // break any cycles in those stores.  This code is fairly general but
1540 // it's not necessary on the other platforms so we keep it in the
1541 // platform dependent code instead of moving it into a shared file.
1542 // (See bugs 7013347 & 7145024.)
1543 // Note that this code is specific to LP64.
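     // Worked example (comment only): if the required moves are rdi -> rsi and
     // rsi -> rdi, the two stores form a cycle.  break_cycle() routes one of the
     // values through the caller-provided temp register, yielding an order such as
     //   rdi -> tmp, rsi -> rdi, tmp -> rsi
     // so the stores can then be emitted without clobbering a not-yet-read source.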
1544 class ComputeMoveOrder: public StackObj {
1545   class MoveOperation: public ResourceObj {
1546     friend class ComputeMoveOrder;
1547    private:
1548     VMRegPair        _src;
1549     VMRegPair        _dst;
1550     int              _src_index;
1551     int              _dst_index;
1552     bool             _processed;
1553     MoveOperation*  _next;
1554     MoveOperation*  _prev;
1555 
1556     static int get_id(VMRegPair r) {
1557       return r.first()->value();
1558     }
1559 
1560    public:
1561     MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
1562       _src(src)
1563     , _dst(dst)
1564     , _src_index(src_index)
1565     , _dst_index(dst_index)
1566     , _processed(false)
1567     , _next(NULL)
1568     , _prev(NULL) {
1569     }
1570 
1571     VMRegPair src() const              { return _src; }
1572     int src_id() const                 { return get_id(src()); }
1573     int src_index() const              { return _src_index; }
1574     VMRegPair dst() const              { return _dst; }
1575     void set_dst(int i, VMRegPair dst) { _dst_index = i, _dst = dst; }
1576     int dst_index() const              { return _dst_index; }
1577     int dst_id() const                 { return get_id(dst()); }
1578     MoveOperation* next() const       { return _next; }
1579     MoveOperation* prev() const       { return _prev; }
1580     void set_processed()               { _processed = true; }
1581     bool is_processed() const          { return _processed; }
1582 
1583     // insert
1584     void break_cycle(VMRegPair temp_register) {
1585       // create a new store following the last store
1586       // to move from the temp_register to the original
1587       MoveOperation* new_store = new MoveOperation(-1, temp_register, dst_index(), dst());
1588 
1589       // break the cycle of links and insert new_store at the end
1590       // break the reverse link.
1591       MoveOperation* p = prev();
1592       assert(p->next() == this, "must be");
1593       _prev = NULL;
1594       p->_next = new_store;
1595       new_store->_prev = p;
1596 
1597       // change the original store to save its value in the temp.
1598       set_dst(-1, temp_register);
1599     }
1600 
1601     void link(GrowableArray<MoveOperation*>& killer) {
1602       // link this store in front of the store that it depends on
1603       MoveOperation* n = killer.at_grow(src_id(), NULL);
1604       if (n != NULL) {
1605         assert(_next == NULL && n->_prev == NULL, "shouldn't have been set yet");
1606         _next = n;
1607         n->_prev = this;
1608       }
1609     }
1610   };
1611 
1612  private:
1613   GrowableArray<MoveOperation*> edges;
1614 
1615  public:
1616   ComputeMoveOrder(int total_in_args, const VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
1617                   const BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) {
1618     // Move operations where the dest is the stack can all be
1619     // scheduled first since they can't interfere with the other moves.
1620     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
1621       if (in_sig_bt[i] == T_ARRAY) {
1622         c_arg--;
1623         if (out_regs[c_arg].first()->is_stack() &&
1624             out_regs[c_arg + 1].first()->is_stack()) {
1625           arg_order.push(i);
1626           arg_order.push(c_arg);
1627         } else {
1628           if (out_regs[c_arg].first()->is_stack() ||
1629               in_regs[i].first() == out_regs[c_arg].first()) {
1630             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg + 1]);
1631           } else {
1632             add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1633           }
1634         }
1635       } else if (in_sig_bt[i] == T_VOID) {
1636         arg_order.push(i);
1637         arg_order.push(c_arg);
1638       } else {
1639         if (out_regs[c_arg].first()->is_stack() ||
1640             in_regs[i].first() == out_regs[c_arg].first()) {
1641           arg_order.push(i);
1642           arg_order.push(c_arg);
1643         } else {
1644           add_edge(i, in_regs[i].first(), c_arg, out_regs[c_arg]);
1645         }
1646       }
1647     }
1648     // Break any cycles in the register moves and emit them in the
1649     // proper order.
1650     GrowableArray<MoveOperation*>* stores = get_store_order(tmp_vmreg);
1651     for (int i = 0; i < stores->length(); i++) {
1652       arg_order.push(stores->at(i)->src_index());
1653       arg_order.push(stores->at(i)->dst_index());
1654     }
1655  }
1656 
1657   // Collect all the move operations
1658   void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) {
1659     if (src.first() == dst.first()) return;
1660     edges.append(new MoveOperation(src_index, src, dst_index, dst));
1661   }
1662 
1663   // Walk the edges breaking cycles between moves.  The result list
1664   // can be walked in order to produce the proper set of loads
1665   GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) {
1666     // Record which moves kill which values
1667     GrowableArray<MoveOperation*> killer;
1668     for (int i = 0; i < edges.length(); i++) {
1669       MoveOperation* s = edges.at(i);
1670       assert(killer.at_grow(s->dst_id(), NULL) == NULL, "only one killer");
1671       killer.at_put_grow(s->dst_id(), s, NULL);
1672     }
1673     assert(killer.at_grow(MoveOperation::get_id(temp_register), NULL) == NULL,
1674            "make sure temp isn't in the registers that are killed");
1675 
1676     // create links between loads and stores
1677     for (int i = 0; i < edges.length(); i++) {
1678       edges.at(i)->link(killer);
1679     }
1680 
1681     // at this point, all the move operations are chained together
1682     // in a doubly linked list.  Processing it backwards finds
1683     // the beginning of the chain, forwards finds the end.  If there's
1684     // a cycle it can be broken at any point,  so pick an edge and walk
1685     // backward until the list ends or we end where we started.
1686     GrowableArray<MoveOperation*>* stores = new GrowableArray<MoveOperation*>();
1687     for (int e = 0; e < edges.length(); e++) {
1688       MoveOperation* s = edges.at(e);
1689       if (!s->is_processed()) {
1690         MoveOperation* start = s;
1691         // search for the beginning of the chain or cycle
1692         while (start->prev() != NULL && start->prev() != s) {
1693           start = start->prev();
1694         }
1695         if (start->prev() == s) {
1696           start->break_cycle(temp_register);
1697         }
1698         // walk the chain forward inserting to store list
1699         while (start != NULL) {
1700           stores->append(start);
1701           start->set_processed();
1702           start = start->next();
1703         }
1704       }
1705     }
1706     return stores;
1707   }
1708 };
1709 
1710 static void verify_oop_args(MacroAssembler* masm,
1711                             const methodHandle& method,
1712                             const BasicType* sig_bt,
1713                             const VMRegPair* regs) {
1714   Register temp_reg = rbx;  // not part of any compiled calling seq
1715   if (VerifyOops) {
1716     for (int i = 0; i < method->size_of_parameters(); i++) {
1717       if (is_reference_type(sig_bt[i])) {
1718         VMReg r = regs[i].first();
1719         assert(r->is_valid(), "bad oop arg");
1720         if (r->is_stack()) {
1721           __ movptr(temp_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1722           __ verify_oop(temp_reg);
1723         } else {
1724           __ verify_oop(r->as_Register());
1725         }
1726       }
1727     }
1728   }
1729 }
1730 
1731 static void gen_special_dispatch(MacroAssembler* masm,
1732                                  const methodHandle& method,
1733                                  const BasicType* sig_bt,
1734                                  const VMRegPair* regs) {
1735   verify_oop_args(masm, method, sig_bt, regs);
1736   vmIntrinsics::ID iid = method->intrinsic_id();
1737 
1738   // Now write the args into the outgoing interpreter space
1739   bool     has_receiver   = false;
1740   Register receiver_reg   = noreg;
1741   int      member_arg_pos = -1;
1742   Register member_reg     = noreg;
1743   int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
1744   if (ref_kind != 0) {
1745     member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
1746     member_reg = rbx;  // known to be free at this point
1747     has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
1748   } else if (iid == vmIntrinsics::_invokeBasic || iid == vmIntrinsics::_linkToNative) {
1749     has_receiver = true;
1750   } else {
1751     fatal("unexpected intrinsic id %d", vmIntrinsics::as_int(iid));
1752   }
1753 
1754   if (member_reg != noreg) {
1755     // Load the member_arg into register, if necessary.
1756     SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
1757     VMReg r = regs[member_arg_pos].first();
1758     if (r->is_stack()) {
1759       __ movptr(member_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1760     } else {
1761       // no data motion is needed
1762       member_reg = r->as_Register();
1763     }
1764   }
1765 
1766   if (has_receiver) {
1767     // Make sure the receiver is loaded into a register.
1768     assert(method->size_of_parameters() > 0, "oob");
1769     assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
1770     VMReg r = regs[0].first();
1771     assert(r->is_valid(), "bad receiver arg");
1772     if (r->is_stack()) {
1773       // Porting note:  This assumes that compiled calling conventions always
1774       // pass the receiver oop in a register.  If this is not true on some
1775       // platform, pick a temp and load the receiver from stack.
1776       fatal("receiver always in a register");
1777       receiver_reg = j_rarg0;  // known to be free at this point
1778       __ movptr(receiver_reg, Address(rsp, r->reg2stack() * VMRegImpl::stack_slot_size + wordSize));
1779     } else {
1780       // no data motion is needed
1781       receiver_reg = r->as_Register();
1782     }
1783   }
1784 
1785   // Figure out which address we are really jumping to:
1786   MethodHandles::generate_method_handle_dispatch(masm, iid,
1787                                                  receiver_reg, member_reg, /*for_compiler_entry:*/ true);
1788 }
1789 
1790 // ---------------------------------------------------------------------------
1791 // Generate a native wrapper for a given method.  The method takes arguments
1792 // in the Java compiled code convention, marshals them to the native
1793 // convention (handlizes oops, etc), transitions to native, makes the call,
1794 // returns to java state (possibly blocking), unhandlizes any result and
1795 // returns.
1796 //
1797 // Critical native functions are a shorthand for the use of
1798 // GetPrimitiveArrayCritical and disallow the use of any other JNI
1799 // functions.  The wrapper is expected to unpack the arguments before
1800 // passing them to the callee. Critical native functions leave the state _in_Java,
1801 // since they cannot stop for GC.
1802 // Some other parts of JNI setup are skipped, like the tear-down of the JNI handle
1803 // block and the check for pending exceptions, since it's impossible for them
1804 // to be thrown.
1805 //
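     // Roughly, the wrapper generated below proceeds as follows (the code is the
     // authoritative reference):
     //   1. inline cache check and, if needed, a class initialization barrier
     //   2. build the frame, shuffle Java args into their C locations, handlize oops
     //   3. lock the receiver (or mirror) if the method is synchronized
     //   4. transition to _thread_in_native and call native_func
     //   5. transition back through _thread_in_native_trans, polling for safepoints
     //   6. unlock if needed, unhandlize the result, check for pending exceptions, return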
1806 nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
1807                                                 const methodHandle& method,
1808                                                 int compile_id,
1809                                                 BasicType* in_sig_bt,
1810                                                 VMRegPair* in_regs,
1811                                                 BasicType ret_type,
1812                                                 address critical_entry) {
1813   if (method->is_method_handle_intrinsic()) {
1814     vmIntrinsics::ID iid = method->intrinsic_id();
1815     intptr_t start = (intptr_t)__ pc();
1816     int vep_offset = ((intptr_t)__ pc()) - start;
1817     gen_special_dispatch(masm,
1818                          method,
1819                          in_sig_bt,
1820                          in_regs);
1821     int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
1822     __ flush();
1823     int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
1824     return nmethod::new_native_nmethod(method,
1825                                        compile_id,
1826                                        masm->code(),
1827                                        vep_offset,
1828                                        frame_complete,
1829                                        stack_slots / VMRegImpl::slots_per_word,
1830                                        in_ByteSize(-1),
1831                                        in_ByteSize(-1),
1832                                        (OopMapSet*)NULL);
1833   }
1834   bool is_critical_native = true;
1835   address native_func = critical_entry;
1836   if (native_func == NULL) {
1837     native_func = method->native_function();
1838     is_critical_native = false;
1839   }
1840   assert(native_func != NULL, "must have function");
1841 
1842   // An OopMap for lock (and class if static)
1843   OopMapSet *oop_maps = new OopMapSet();
1844   intptr_t start = (intptr_t)__ pc();
1845 
1846   // We have received a description of where all the java args are located
1847   // on entry to the wrapper. We need to convert these args to where
1848   // the jni function will expect them. To figure out where they go
1849   // we convert the java signature to a C signature by inserting
1850   // the hidden arguments as arg[0] and possibly arg[1] (static method)
1851 
1852   const int total_in_args = method->size_of_parameters();
1853   int total_c_args = total_in_args;
1854   if (!is_critical_native) {
1855     total_c_args += 1;
1856     if (method->is_static()) {
1857       total_c_args++;
1858     }
1859   } else {
1860     for (int i = 0; i < total_in_args; i++) {
1861       if (in_sig_bt[i] == T_ARRAY) {
1862         total_c_args++;
1863       }
1864     }
1865   }
1866 
1867   BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
1868   VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
1869   BasicType* in_elem_bt = NULL;
1870 
1871   int argc = 0;
1872   if (!is_critical_native) {
1873     out_sig_bt[argc++] = T_ADDRESS;
1874     if (method->is_static()) {
1875       out_sig_bt[argc++] = T_OBJECT;
1876     }
1877 
1878     for (int i = 0; i < total_in_args ; i++ ) {
1879       out_sig_bt[argc++] = in_sig_bt[i];
1880     }
1881   } else {
1882     in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
1883     SignatureStream ss(method->signature());
1884     for (int i = 0; i < total_in_args ; i++ ) {
1885       if (in_sig_bt[i] == T_ARRAY) {
1886         // Arrays are passed as int, elem* pair
1887         out_sig_bt[argc++] = T_INT;
1888         out_sig_bt[argc++] = T_ADDRESS;
1889         ss.skip_array_prefix(1);  // skip one '['
1890         assert(ss.is_primitive(), "primitive type expected");
1891         in_elem_bt[i] = ss.type();
1892       } else {
1893         out_sig_bt[argc++] = in_sig_bt[i];
1894         in_elem_bt[i] = T_VOID;
1895       }
1896       if (in_sig_bt[i] != T_VOID) {
1897         assert(in_sig_bt[i] == ss.type() ||
1898                in_sig_bt[i] == T_ARRAY, "must match");
1899         ss.next();
1900       }
1901     }
1902   }
1903 
1904   // Now figure out where the args must be stored and how much stack space
1905   // they require.
1906   int out_arg_slots;
1907   out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
1908 
1909   // Compute framesize for the wrapper.  We need to handlize all oops in
1910   // incoming registers
1911 
1912   // Calculate the total number of stack slots we will need.
1913 
1914   // First count the abi requirement plus all of the outgoing args
1915   int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
1916 
1917   // Now the space for the inbound oop handle area
1918   int total_save_slots = 6 * VMRegImpl::slots_per_word;  // 6 arguments passed in registers
1919   if (is_critical_native) {
1920     // Critical natives may have to call out so they need a save area
1921     // for register arguments.
1922     int double_slots = 0;
1923     int single_slots = 0;
1924     for ( int i = 0; i < total_in_args; i++) {
1925       if (in_regs[i].first()->is_Register()) {
1926         const Register reg = in_regs[i].first()->as_Register();
1927         switch (in_sig_bt[i]) {
1928           case T_BOOLEAN:
1929           case T_BYTE:
1930           case T_SHORT:
1931           case T_CHAR:
1932           case T_INT:  single_slots++; break;
1933           case T_ARRAY:  // specific to LP64 (7145024)
1934           case T_LONG: double_slots++; break;
1935           default:  ShouldNotReachHere();
1936         }
1937       } else if (in_regs[i].first()->is_XMMRegister()) {
1938         switch (in_sig_bt[i]) {
1939           case T_FLOAT:  single_slots++; break;
1940           case T_DOUBLE: double_slots++; break;
1941           default:  ShouldNotReachHere();
1942         }
1943       } else if (in_regs[i].first()->is_FloatRegister()) {
1944         ShouldNotReachHere();
1945       }
1946     }
1947     total_save_slots = double_slots * 2 + single_slots;
1948     // align the save area
1949     if (double_slots != 0) {
1950       stack_slots = align_up(stack_slots, 2);
1951     }
1952   }
1953 
1954   int oop_handle_offset = stack_slots;
1955   stack_slots += total_save_slots;
1956 
1957   // Now any space we need for handlizing a klass if this is a static method
1958 
1959   int klass_slot_offset = 0;
1960   int klass_offset = -1;
1961   int lock_slot_offset = 0;
1962   bool is_static = false;
1963 
1964   if (method->is_static()) {
1965     klass_slot_offset = stack_slots;
1966     stack_slots += VMRegImpl::slots_per_word;
1967     klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
1968     is_static = true;
1969   }
1970 
1971   // Plus a lock if needed
1972 
1973   if (method->is_synchronized()) {
1974     lock_slot_offset = stack_slots;
1975     stack_slots += VMRegImpl::slots_per_word;
1976   }
1977 
1978   // Now a place (+2) to save return values or temp during shuffling
1979   // + 4 for return address (which we own) and saved rbp
1980   stack_slots += 6;
1981 
1982   // Ok The space we have allocated will look like:
1983   //
1984   //
1985   // FP-> |                     |
1986   //      |---------------------|
1987   //      | 2 slots for moves   |
1988   //      |---------------------|
1989   //      | lock box (if sync)  |
1990   //      |---------------------| <- lock_slot_offset
1991   //      | klass (if static)   |
1992   //      |---------------------| <- klass_slot_offset
1993   //      | oopHandle area      |
1994   //      |---------------------| <- oop_handle_offset (6 java arg registers)
1995   //      | outbound memory     |
1996   //      | based arguments     |
1997   //      |                     |
1998   //      |---------------------|
1999   //      |                     |
2000   // SP-> | out_preserved_slots |
2001   //
2002   //
2003 
2004 
2005   // Now compute the actual number of stack words we need, rounding to keep
2006   // the stack properly aligned.
2007   stack_slots = align_up(stack_slots, StackAlignmentInSlots);
2008 
2009   int stack_size = stack_slots * VMRegImpl::stack_slot_size;
2010 
2011   // First thing make an ic check to see if we should even be here
2012 
2013   // We are free to use all registers as temps without saving them and
2014   // restoring them except rbp. rbp is the only callee save register
2015   // as far as the interpreter and the compiler(s) are concerned.
2016 
2017 
2018   const Register ic_reg = rax;
2019   const Register receiver = j_rarg0;
2020 
2021   Label hit;
2022   Label exception_pending;
2023 
2024   assert_different_registers(ic_reg, receiver, rscratch1);
2025   __ verify_oop(receiver);
2026   __ load_klass(rscratch1, receiver, rscratch2);
2027   __ cmpq(ic_reg, rscratch1);
2028   __ jcc(Assembler::equal, hit);
2029 
2030   __ jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
2031 
2032   // Verified entry point must be aligned
2033   __ align(8);
2034 
2035   __ bind(hit);
2036 
2037   int vep_offset = ((intptr_t)__ pc()) - start;
2038 
2039   if (VM_Version::supports_fast_class_init_checks() && method->needs_clinit_barrier()) {
2040     Label L_skip_barrier;
2041     Register klass = r10;
2042     __ mov_metadata(klass, method->method_holder()); // InstanceKlass*
2043     __ clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);
2044 
2045     __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
2046 
2047     __ bind(L_skip_barrier);
2048   }
2049 
2050 #ifdef COMPILER1
2051   // For Object.hashCode, System.identityHashCode try to pull hashCode from object header if available.
2052   if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
2053     inline_check_hashcode_from_object_header(masm, method, j_rarg0 /*obj_reg*/, rax /*result*/);
2054   }
2055 #endif // COMPILER1
2056 
2057   // The instruction at the verified entry point must be 5 bytes or longer
2058   // because it can be patched on the fly by make_non_entrant. The stack bang
2059   // instruction fits that requirement.
2060 
2061   // Generate stack overflow check
2062   __ bang_stack_with_offset((int)StackOverflow::stack_shadow_zone_size());
2063 
2064   // Generate a new frame for the wrapper.
2065   __ enter();
2066   // -2 because return address is already present and so is saved rbp
2067   __ subptr(rsp, stack_size - 2*wordSize);
2068 
2069   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2070   bs->nmethod_entry_barrier(masm);
2071 
2072   // Frame is now completed as far as size and linkage.
2073   int frame_complete = ((intptr_t)__ pc()) - start;
2074 
2075     if (UseRTMLocking) {
2076       // Abort RTM transaction before calling JNI
2077       // because critical section will be large and will be
2078       // aborted anyway. Also nmethod could be deoptimized.
2079       __ xabort(0);
2080     }
2081 
2082 #ifdef ASSERT
2083     {
2084       Label L;
2085       __ mov(rax, rsp);
2086       __ andptr(rax, -16); // must be 16 byte boundary (see amd64 ABI)
2087       __ cmpptr(rax, rsp);
2088       __ jcc(Assembler::equal, L);
2089       __ stop("improperly aligned stack");
2090       __ bind(L);
2091     }
2092 #endif /* ASSERT */
2093 
2094 
2095   // We use r14 as the oop handle for the receiver/klass
2096   // It is callee save so it survives the call to native
2097 
2098   const Register oop_handle_reg = r14;
2099 
2100   //
2101   // We immediately shuffle the arguments so that, for any vm call we have to
2102   // make from here on out (sync slow path, jvmti, etc.), we will have
2103   // captured the oops from our caller and have a valid oopMap for
2104   // them.
2105 
2106   // -----------------
2107   // The Grand Shuffle
2108 
2109   // The Java calling convention is either equal (linux) or denser (win64) than the
2110   // c calling convention. However, because of the jni_env argument, the c calling
2111   // convention always has at least one more argument (and two for static) than Java.
2112   // Therefore if we move the args from java -> c backwards then we will never have
2113   // a register->register conflict and we don't have to build a dependency graph
2114   // and figure out how to break any cycles.
2115   //
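       // For example (comment only), for a regular JNI native: Java arg i becomes C arg
       // i + 1 (C arg 0 is the JNIEnv*); for a static method it becomes C arg i + 2
       // (C arg 1 is the class mirror).  Every destination position is therefore strictly
       // later than its source position, so copying from the last argument down never
       // overwrites a source that has not been read yet.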
2116 
2117   // Record esp-based slot for receiver on stack for non-static methods
2118   int receiver_offset = -1;
2119 
2120   // This is a trick. We double the stack slots so we can claim
2121   // the oops in the caller's frame. Since we are sure to have
2122   // more args than the caller, doubling is enough to make
2123   // sure we can capture all the incoming oop args from the
2124   // caller.
2125   //
2126   OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
2127 
2128   // Mark location of rbp (someday)
2129   // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rbp));
2130 
2131   // Use eax, ebx as temporaries during any memory-memory moves we have to do
2132   // All inbound args are referenced based on rbp and all outbound args via rsp.
2133 
2134 
2135 #ifdef ASSERT
2136   bool reg_destroyed[RegisterImpl::number_of_registers];
2137   bool freg_destroyed[XMMRegisterImpl::number_of_registers];
2138   for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
2139     reg_destroyed[r] = false;
2140   }
2141   for ( int f = 0 ; f < XMMRegisterImpl::number_of_registers ; f++ ) {
2142     freg_destroyed[f] = false;
2143   }
2144 
2145 #endif /* ASSERT */
2146 
2147   // This may iterate in two different directions depending on the
2148   // kind of native it is.  The reason is that for regular JNI natives
2149   // the incoming and outgoing registers are offset upwards and for
2150   // critical natives they are offset down.
2151   GrowableArray<int> arg_order(2 * total_in_args);
2152 
2153   VMRegPair tmp_vmreg;
2154   tmp_vmreg.set2(rbx->as_VMReg());
2155 
2156   if (!is_critical_native) {
2157     for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
2158       arg_order.push(i);
2159       arg_order.push(c_arg);
2160     }
2161   } else {
2162     // Compute a valid move order, using tmp_vmreg to break any cycles
2163     ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
2164   }
2165 
2166   int temploc = -1;
2167   for (int ai = 0; ai < arg_order.length(); ai += 2) {
2168     int i = arg_order.at(ai);
2169     int c_arg = arg_order.at(ai + 1);
2170     __ block_comment(err_msg("move %d -> %d", i, c_arg));
2171     if (c_arg == -1) {
2172       assert(is_critical_native, "should only be required for critical natives");
2173       // This arg needs to be moved to a temporary
2174       __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
2175       in_regs[i] = tmp_vmreg;
2176       temploc = i;
2177       continue;
2178     } else if (i == -1) {
2179       assert(is_critical_native, "should only be required for critical natives");
2180       // Read from the temporary location
2181       assert(temploc != -1, "must be valid");
2182       i = temploc;
2183       temploc = -1;
2184     }
2185 #ifdef ASSERT
2186     if (in_regs[i].first()->is_Register()) {
2187       assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
2188     } else if (in_regs[i].first()->is_XMMRegister()) {
2189       assert(!freg_destroyed[in_regs[i].first()->as_XMMRegister()->encoding()], "destroyed reg!");
2190     }
2191     if (out_regs[c_arg].first()->is_Register()) {
2192       reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2193     } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2194       freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2195     }
2196 #endif /* ASSERT */
2197     switch (in_sig_bt[i]) {
2198       case T_ARRAY:
2199         if (is_critical_native) {
2200           unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
2201           c_arg++;
2202 #ifdef ASSERT
2203           if (out_regs[c_arg].first()->is_Register()) {
2204             reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
2205           } else if (out_regs[c_arg].first()->is_XMMRegister()) {
2206             freg_destroyed[out_regs[c_arg].first()->as_XMMRegister()->encoding()] = true;
2207           }
2208 #endif
2209           break;
2210         }
2211       case T_INLINE_TYPE:
2212       case T_OBJECT:
2213         assert(!is_critical_native, "no oop arguments");
2214         __ object_move(map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
2215                     ((i == 0) && (!is_static)),
2216                     &receiver_offset);
2217         break;
2218       case T_VOID:
2219         break;
2220 
2221       case T_FLOAT:
2222         __ float_move(in_regs[i], out_regs[c_arg]);
2223         break;
2224 
2225       case T_DOUBLE:
2226         assert( i + 1 < total_in_args &&
2227                 in_sig_bt[i + 1] == T_VOID &&
2228                 out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
2229         __ double_move(in_regs[i], out_regs[c_arg]);
2230         break;
2231 
2232       case T_LONG :
2233         __ long_move(in_regs[i], out_regs[c_arg]);
2234         break;
2235 
2236       case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
2237 
2238       default:
2239         __ move32_64(in_regs[i], out_regs[c_arg]);
2240     }
2241   }
2242 
2243   int c_arg;
2244 
2245   // Pre-load a static method's oop into r14.  Used both by locking code and
2246   // the normal JNI call code.
2247   if (!is_critical_native) {
2248     // point c_arg at the first arg that is already loaded in case we
2249     // need to spill before we call out
2250     c_arg = total_c_args - total_in_args;
2251 
2252     if (method->is_static()) {
2253 
2254       //  load oop into a register
2255       __ movoop(oop_handle_reg, JNIHandles::make_local(method->method_holder()->java_mirror()));
2256 
2257       // Now handlize the static class mirror; it's known to be not-null.
2258       __ movptr(Address(rsp, klass_offset), oop_handle_reg);
2259       map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
2260 
2261       // Now get the handle
2262       __ lea(oop_handle_reg, Address(rsp, klass_offset));
2263       // store the klass handle as second argument
2264       __ movptr(c_rarg1, oop_handle_reg);
2265       // and protect the arg if we must spill
2266       c_arg--;
2267     }
2268   } else {
2269     // For JNI critical methods we need to save all registers in save_args.
2270     c_arg = 0;
2271   }
2272 
2273   // Change state to native (we save the return address in the thread, since it might not
2274   // be pushed on the stack when we do a stack traversal). It is enough that the pc()
2275   // points into the right code segment. It does not have to be the correct return pc.
2276   // We use the same pc/oopMap repeatedly when we call out
2277 
2278   intptr_t the_pc = (intptr_t) __ pc();
2279   oop_maps->add_gc_map(the_pc - start, map);
2280 
2281   __ set_last_Java_frame(rsp, noreg, (address)the_pc);
2282 
2283 
2284   // We have all of the arguments set up at this point. We must not touch any of the
2285   // argument registers at this point (if we save/restore them, note that there are no oops in them).
2286 
2287   {
2288     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2289     // protect the args we've loaded
2290     save_args(masm, total_c_args, c_arg, out_regs);
2291     __ mov_metadata(c_rarg1, method());
2292     __ call_VM_leaf(
2293       CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
2294       r15_thread, c_rarg1);
2295     restore_args(masm, total_c_args, c_arg, out_regs);
2296   }
2297 
2298   // RedefineClasses() tracing support for obsolete method entry
2299   if (log_is_enabled(Trace, redefine, class, obsolete)) {
2300     // protect the args we've loaded
2301     save_args(masm, total_c_args, c_arg, out_regs);
2302     __ mov_metadata(c_rarg1, method());
2303     __ call_VM_leaf(
2304       CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
2305       r15_thread, c_rarg1);
2306     restore_args(masm, total_c_args, c_arg, out_regs);
2307   }
2308 
2309   // Lock a synchronized method
2310 
2311   // Register definitions used by locking and unlocking
2312 
2313   const Register swap_reg = rax;  // Must use rax for cmpxchg instruction
2314   const Register obj_reg  = rbx;  // Will contain the oop
2315   const Register lock_reg = r13;  // Address of compiler lock object (BasicLock)
2316   const Register old_hdr  = r13;  // value of old header at unlock time
2317 
2318   Label slow_path_lock;
2319   Label lock_done;
2320 
2321   if (method->is_synchronized()) {
2322     assert(!is_critical_native, "unhandled");
2323 
2324 
2325     const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
2326 
2327     // Get the handle (the 2nd argument)
2328     __ mov(oop_handle_reg, c_rarg1);
2329 
2330     // Get address of the box
2331 
2332     __ lea(lock_reg, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2333 
2334     // Load the oop from the handle
2335     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2336 
2337     // Load immediate 1 into swap_reg %rax
2338     __ movl(swap_reg, 1);
2339 
2340     // Load (object->mark() | 1) into swap_reg %rax
2341     __ orptr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2342     if (EnableValhalla) {
2343       // Mask inline_type bit such that we go to the slow path if object is an inline type
2344       __ andptr(swap_reg, ~((int) markWord::inline_type_bit_in_place));
2345     }
2346 
2347 
2348     // Save (object->mark() | 1) into BasicLock's displaced header
2349     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2350 
2351     // src -> dest iff dest == rax else rax <- dest
2352     __ lock();
2353     __ cmpxchgptr(lock_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2354     __ jcc(Assembler::equal, lock_done);
2355 
2356     // Hmm should this move to the slow path code area???
2357 
2358     // Test if the oopMark is an obvious stack pointer, i.e.,
2359     //  1) (mark & 3) == 0, and
2360     //  2) rsp <= mark < rsp + os::pagesize()
2361     // These 3 tests can be done by evaluating the following
2362     // expression: ((mark - rsp) & (3 - os::vm_page_size())),
2363     // assuming both stack pointer and pagesize have their
2364     // least significant 2 bits clear.
2365     // NOTE: the oopMark is in swap_reg %rax as the result of cmpxchg
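         // Worked example (assuming a 4K page): 3 - 4096 == -4093 == 0x...fffff003, so
         // ((mark - rsp) & (3 - os::vm_page_size())) == 0 exactly when the low two bits
         // of (mark - rsp) are clear and 0 <= mark - rsp < 4096, i.e. the mark points
         // into our own page-sized window above rsp (the recursive stack-lock case).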
2366 
2367     __ subptr(swap_reg, rsp);
2368     __ andptr(swap_reg, 3 - os::vm_page_size());
2369 
2370     // Save the test result, for recursive case, the result is zero
2371     __ movptr(Address(lock_reg, mark_word_offset), swap_reg);
2372     __ jcc(Assembler::notEqual, slow_path_lock);
2373 
2374     // Slow path will re-enter here
2375 
2376     __ bind(lock_done);
2377   }
2378 
2379   // Finally just about ready to make the JNI call
2380 
2381   // get JNIEnv* which is first argument to native
2382   if (!is_critical_native) {
2383     __ lea(c_rarg0, Address(r15_thread, in_bytes(JavaThread::jni_environment_offset())));
2384 
2385     // Now set thread in native
2386     __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
2387   }
2388 
2389   __ call(RuntimeAddress(native_func));
2390 
2391   // Verify or restore cpu control state after JNI call
2392   __ restore_cpu_control_state_after_jni();
2393 
2394   // Unpack native results.
2395   switch (ret_type) {
2396   case T_BOOLEAN: __ c2bool(rax);            break;
2397   case T_CHAR   : __ movzwl(rax, rax);       break;
2398   case T_BYTE   : __ sign_extend_byte (rax); break;
2399   case T_SHORT  : __ sign_extend_short(rax); break;
2400   case T_INT    : /* nothing to do */        break;
2401   case T_DOUBLE :
2402   case T_FLOAT  :
2403     // Result is in xmm0 we'll save as needed
2404     break;
2405   case T_ARRAY:                 // Really a handle
2406   case T_INLINE_TYPE:           // Really a handle
2407   case T_OBJECT:                // Really a handle
2408       break; // can't de-handlize until after safepoint check
2409   case T_VOID: break;
2410   case T_LONG: break;
2411   default       : ShouldNotReachHere();
2412   }
2413 
2414   Label after_transition;
2415 
2416   // If this is a critical native, check for a safepoint or suspend request after the call.
2417   // If a safepoint is needed, transition to native, then to native_trans to handle
2418   // safepoints like the native methods that are not critical natives.
2419   if (is_critical_native) {
2420     Label needs_safepoint;
2421     __ safepoint_poll(needs_safepoint, r15_thread, false /* at_return */, false /* in_nmethod */);
2422     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2423     __ jcc(Assembler::equal, after_transition);
2424     __ bind(needs_safepoint);
2425   }
2426 
2427   // Switch thread to "native transition" state before reading the synchronization state.
2428   // This additional state is necessary because reading and testing the synchronization
2429   // state is not atomic w.r.t. GC, as this scenario demonstrates:
2430   //     Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
2431   //     VM thread changes sync state to synchronizing and suspends threads for GC.
2432   //     Thread A is resumed to finish this native method, but doesn't block here since it
2433   //     didn't see any synchronization in progress, and escapes.
2434   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
2435 
2436   // Force this write out before the read below
2437   __ membar(Assembler::Membar_mask_bits(
2438               Assembler::LoadLoad | Assembler::LoadStore |
2439               Assembler::StoreLoad | Assembler::StoreStore));
2440 
2441   // check for safepoint operation in progress and/or pending suspend requests
2442   {
2443     Label Continue;
2444     Label slow_path;
2445 
2446     __ safepoint_poll(slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
2447 
2448     __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
2449     __ jcc(Assembler::equal, Continue);
2450     __ bind(slow_path);
2451 
2452     // Don't use call_VM as it will see a possible pending exception and forward it
2453     // and never return here preventing us from clearing _last_native_pc down below.
2454     // Also can't use call_VM_leaf either as it will check to see if rsi & rdi are
2455     // preserved and correspond to the bcp/locals pointers. So we do a runtime call
2456     // by hand.
2457     //
2458     __ vzeroupper();
2459     save_native_result(masm, ret_type, stack_slots);
2460     __ mov(c_rarg0, r15_thread);
2461     __ mov(r12, rsp); // remember sp
2462     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2463     __ andptr(rsp, -16); // align stack as required by ABI
2464     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
2465     __ mov(rsp, r12); // restore sp
2466     __ reinit_heapbase();
2467     // Restore any method result value
2468     restore_native_result(masm, ret_type, stack_slots);
2469     __ bind(Continue);
2470   }
2471 
2472   // change thread state
2473   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
2474   __ bind(after_transition);
2475 
2476   Label reguard;
2477   Label reguard_done;
2478   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
2479   __ jcc(Assembler::equal, reguard);
2480   __ bind(reguard_done);
2481 
2482   // native result if any is live
2483 
2484   // Unlock
2485   Label unlock_done;
2486   Label slow_path_unlock;
2487   if (method->is_synchronized()) {
2488 
2489     // Get locked oop from the handle we passed to jni
2490     __ movptr(obj_reg, Address(oop_handle_reg, 0));
2491 
2492     Label done;
2493     // Simple recursive lock?
2494 
2495     __ cmpptr(Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size), (int32_t)NULL_WORD);
2496     __ jcc(Assembler::equal, done);
2497 
2498     // Must save rax if it is live now, because cmpxchg must use it
2499     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2500       save_native_result(masm, ret_type, stack_slots);
2501     }
2502 
2503 
2504     // get address of the stack lock
2505     __ lea(rax, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2506     //  get old displaced header
2507     __ movptr(old_hdr, Address(rax, 0));
2508 
2509     // Atomic swap old header if oop still contains the stack lock
2510     __ lock();
2511     __ cmpxchgptr(old_hdr, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2512     __ jcc(Assembler::notEqual, slow_path_unlock);
2513 
2514     // slow path re-enters here
2515     __ bind(unlock_done);
2516     if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
2517       restore_native_result(masm, ret_type, stack_slots);
2518     }
2519 
2520     __ bind(done);
2521 
2522   }
2523   {
2524     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
2525     save_native_result(masm, ret_type, stack_slots);
2526     __ mov_metadata(c_rarg1, method());
2527     __ call_VM_leaf(
2528          CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
2529          r15_thread, c_rarg1);
2530     restore_native_result(masm, ret_type, stack_slots);
2531   }
2532 
2533   __ reset_last_Java_frame(false);
2534 
2535   // Unbox oop result, e.g. JNIHandles::resolve value.
2536   if (is_reference_type(ret_type)) {
2537     __ resolve_jobject(rax /* value */,
2538                        r15_thread /* thread */,
2539                        rcx /* tmp */);
2540   }
2541 
2542   if (CheckJNICalls) {
2543     // clear_pending_jni_exception_check
2544     __ movptr(Address(r15_thread, JavaThread::pending_jni_exception_check_fn_offset()), NULL_WORD);
2545   }
2546 
2547   if (!is_critical_native) {
2548     // reset handle block
2549     __ movptr(rcx, Address(r15_thread, JavaThread::active_handles_offset()));
2550     __ movl(Address(rcx, JNIHandleBlock::top_offset_in_bytes()), (int32_t)NULL_WORD);
2551   }
2552 
2553   // pop our frame
2554 
2555   __ leave();
2556 
2557   if (!is_critical_native) {
2558     // Any exception pending?
2559     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2560     __ jcc(Assembler::notEqual, exception_pending);
2561   }
2562 
2563   // Return
2564 
2565   __ ret(0);
2566 
2567   // Unexpected paths are out of line and go here
2568 
2569   if (!is_critical_native) {
2570     // forward the exception
2571     __ bind(exception_pending);
2572 
2573     // and forward the exception
2574     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2575   }
2576 
2577   // Slow path locking & unlocking
2578   if (method->is_synchronized()) {
2579 
2580     // BEGIN Slow path lock
2581     __ bind(slow_path_lock);
2582 
2583     // has last_Java_frame setup. No exceptions so do vanilla call not call_VM
2584     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2585 
2586     // protect the args we've loaded
2587     save_args(masm, total_c_args, c_arg, out_regs);
2588 
2589     __ mov(c_rarg0, obj_reg);
2590     __ mov(c_rarg1, lock_reg);
2591     __ mov(c_rarg2, r15_thread);
2592 
2593     // Not a leaf but we have last_Java_frame setup as we want
2594     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
2595     restore_args(masm, total_c_args, c_arg, out_regs);
2596 
2597 #ifdef ASSERT
2598     { Label L;
2599     __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2600     __ jcc(Assembler::equal, L);
2601     __ stop("no pending exception allowed on exit from monitorenter");
2602     __ bind(L);
2603     }
2604 #endif
2605     __ jmp(lock_done);
2606 
2607     // END Slow path lock
2608 
2609     // BEGIN Slow path unlock
2610     __ bind(slow_path_unlock);
2611 
2612     // If we haven't already saved the native result we must save it now as xmm registers
2613     // are still exposed.
2614     __ vzeroupper();
2615     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2616       save_native_result(masm, ret_type, stack_slots);
2617     }
2618 
2619     __ lea(c_rarg1, Address(rsp, lock_slot_offset * VMRegImpl::stack_slot_size));
2620 
2621     __ mov(c_rarg0, obj_reg);
2622     __ mov(c_rarg2, r15_thread);
2623     __ mov(r12, rsp); // remember sp
2624     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2625     __ andptr(rsp, -16); // align stack as required by ABI
2626 
2627     // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
2628     // NOTE that obj_reg == rbx currently
2629     __ movptr(rbx, Address(r15_thread, in_bytes(Thread::pending_exception_offset())));
2630     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int32_t)NULL_WORD);
2631 
2632     // args are (oop obj, BasicLock* lock, JavaThread* thread)
2633     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)));
2634     __ mov(rsp, r12); // restore sp
2635     __ reinit_heapbase();
2636 #ifdef ASSERT
2637     {
2638       Label L;
2639       __ cmpptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), (int)NULL_WORD);
2640       __ jcc(Assembler::equal, L);
2641       __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
2642       __ bind(L);
2643     }
2644 #endif /* ASSERT */
2645 
2646     __ movptr(Address(r15_thread, in_bytes(Thread::pending_exception_offset())), rbx);
2647 
2648     if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
2649       restore_native_result(masm, ret_type, stack_slots);
2650     }
2651     __ jmp(unlock_done);
2652 
2653     // END Slow path unlock
2654 
2655   } // synchronized
2656 
2657   // SLOW PATH Reguard the stack if needed
2658 
2659   __ bind(reguard);
2660   __ vzeroupper();
2661   save_native_result(masm, ret_type, stack_slots);
2662   __ mov(r12, rsp); // remember sp
2663   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
2664   __ andptr(rsp, -16); // align stack as required by ABI
2665   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
2666   __ mov(rsp, r12); // restore sp
2667   __ reinit_heapbase();
2668   restore_native_result(masm, ret_type, stack_slots);
2669   // and continue
2670   __ jmp(reguard_done);
2671 
2672 
2673 
2674   __ flush();
2675 
2676   nmethod *nm = nmethod::new_native_nmethod(method,
2677                                             compile_id,
2678                                             masm->code(),
2679                                             vep_offset,
2680                                             frame_complete,
2681                                             stack_slots / VMRegImpl::slots_per_word,
2682                                             (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
2683                                             in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
2684                                             oop_maps);
2685 
2686   return nm;
2687 }
2688 
2689 // This function returns the adjustment size (in number of words) to a c2i adapter
2690 // activation for use during deoptimization.
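     // For example (illustrative numbers only): a callee with 2 parameters and
     // 5 locals needs (5 - 2) * Interpreter::stackElementWords extra words in
     // its interpreter activation, i.e. 3 words on amd64, where a stack element
     // occupies one word.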
2691 int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals ) {
2692   return (callee_locals - callee_parameters) * Interpreter::stackElementWords;
2693 }
2694 
2695 
2696 uint SharedRuntime::out_preserve_stack_slots() {
2697   return 0;
2698 }
2699 
2700 
2701 // Number of stack slots between incoming argument block and the start of
2702 // a new frame.  The PROLOG must add this many slots to the stack.  The
2703 // EPILOG must remove this many slots.  amd64 needs two slots for
2704 // return address.
2705 uint SharedRuntime::in_preserve_stack_slots() {
2706   return 4 + 2 * VerifyStackAtCalls;
2707 }
2708 
2709 //------------------------------generate_deopt_blob----------------------------
2710 void SharedRuntime::generate_deopt_blob() {
2711   // Allocate space for the code
2712   ResourceMark rm;
2713   // Setup code generation tools
2714   int pad = 0;
2715   if (UseAVX > 2) {
2716     pad += 1024;
2717   }
2718 #if INCLUDE_JVMCI
2719   if (EnableJVMCI) {
2720     pad += 512; // Increase the buffer size when compiling for JVMCI
2721   }
2722 #endif
2723   CodeBuffer buffer("deopt_blob", 2560+pad, 1024);
2724   MacroAssembler* masm = new MacroAssembler(&buffer);
2725   int frame_size_in_words;
2726   OopMap* map = NULL;
2727   OopMapSet *oop_maps = new OopMapSet();
2728 
2729   // -------------
2730   // This code enters when returning to a de-optimized nmethod.  A return
2731   // address has been pushed on the stack, and return values are in
2732   // registers.
2733   // If we are doing a normal deopt then we were called from the patched
2734   // nmethod from the point we returned to the nmethod. So the return
2735   // address on the stack is wrong by NativeCall::instruction_size
2736   // We will adjust the value so it looks like we have the original return
2737   // address on the stack (like when we eagerly deoptimized).
2738   // In the case of an exception pending when deoptimizing, we enter
2739   // with a return address on the stack that points after the call we patched
2740   // into the exception handler. We have the following register state from,
2741   // e.g., the forward exception stub (see stubGenerator_x86_64.cpp).
2742   //    rax: exception oop
2743   //    rbx: exception handler
2744   //    rdx: throwing pc
2745   // So in this case we simply jam rdx into the useless return address and
2746   // the stack looks just like we want.
2747   //
2748   // At this point we need to de-opt.  We save the argument return
2749   // registers.  We call the first C routine, fetch_unroll_info().  This
2750   // routine captures the return values and returns a structure which
2751   // describes the current frame size and the sizes of all replacement frames.
2752   // The current frame is compiled code and may contain many inlined
2753   // functions, each with their own JVM state.  We pop the current frame, then
2754   // push all the new frames.  Then we call the C routine unpack_frames() to
2755   // populate these frames.  Finally unpack_frames() returns us the new target
2756   // address.  Notice that callee-save registers are BLOWN here; they have
2757   // already been captured in the vframeArray at the time the return PC was
2758   // patched.
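       //
       // Rough sketch of that flow (illustrative pseudocode, not generated code):
       //
       //   info = Deoptimization::fetch_unroll_info(thread, exec_mode);
       //   pop the deoptimized compiled frame;
       //   for each frame recorded in info:
       //     push a skeletal interpreter frame of the recorded size;
       //   Deoptimization::unpack_frames(thread, exec_mode);  // fill them in
       //   return into the interpreter via the return address left on the stack.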
2759   address start = __ pc();
2760   Label cont;
2761 
2762   // Prolog for the non-exception case!
2763 
2764   // Save everything in sight.
2765   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2766 
2767   // Normal deoptimization.  Save exec mode for unpack_frames.
2768   __ movl(r14, Deoptimization::Unpack_deopt); // callee-saved
2769   __ jmp(cont);
2770 
2771   int reexecute_offset = __ pc() - start;
2772 #if INCLUDE_JVMCI && !defined(COMPILER1)
2773   if (EnableJVMCI && UseJVMCICompiler) {
2774     // JVMCI does not use this kind of deoptimization
2775     __ should_not_reach_here();
2776   }
2777 #endif
2778 
2779   // Reexecute case
2780   // The return address is the pc that describes which bci to re-execute at
2781 
2782   // No need to update map as each call to save_live_registers will produce an identical oopmap
2783   (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2784 
2785   __ movl(r14, Deoptimization::Unpack_reexecute); // callee-saved
2786   __ jmp(cont);
2787 
2788 #if INCLUDE_JVMCI
2789   Label after_fetch_unroll_info_call;
2790   int implicit_exception_uncommon_trap_offset = 0;
2791   int uncommon_trap_offset = 0;
2792 
2793   if (EnableJVMCI) {
2794     implicit_exception_uncommon_trap_offset = __ pc() - start;
2795 
2796     __ pushptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())));
2797     __ movptr(Address(r15_thread, in_bytes(JavaThread::jvmci_implicit_exception_pc_offset())), (int32_t)NULL_WORD);
2798 
2799     uncommon_trap_offset = __ pc() - start;
2800 
2801     // Save everything in sight.
2802     RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2803     // fetch_unroll_info needs to call last_java_frame()
2804     __ set_last_Java_frame(noreg, noreg, NULL);
2805 
2806     __ movl(c_rarg1, Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())));
2807     __ movl(Address(r15_thread, in_bytes(JavaThread::pending_deoptimization_offset())), -1);
2808 
2809     __ movl(r14, (int32_t)Deoptimization::Unpack_reexecute);
2810     __ mov(c_rarg0, r15_thread);
2811     __ movl(c_rarg2, r14); // exec mode
2812     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
2813     oop_maps->add_gc_map( __ pc()-start, map->deep_copy());
2814 
2815     __ reset_last_Java_frame(false);
2816 
2817     __ jmp(after_fetch_unroll_info_call);
2818   } // EnableJVMCI
2819 #endif // INCLUDE_JVMCI
2820 
2821   int exception_offset = __ pc() - start;
2822 
2823   // Prolog for exception case
2824 
2825   // All registers are dead at this entry point, except for rax and
2826   // rdx, which contain the exception oop and exception pc
2827   // respectively.  Set them in TLS and fall thru to the
2828   // unpack_with_exception_in_tls entry point.
2829 
2830   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
2831   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), rax);
2832 
2833   int exception_in_tls_offset = __ pc() - start;
2834 
2835   // new implementation because exception oop is now passed in JavaThread
2836 
2837   // Prolog for exception case
2838   // All registers must be preserved because they might be used by LinearScan
2839   // Exception oop and throwing PC are passed in JavaThread
2840   // tos: stack at point of call to method that threw the exception (i.e. only
2841   // args are on the stack, no return address)
2842 
2843   // make room on stack for the return address
2844   // It will be patched later with the throwing pc. The correct value is not
2845   // available now because loading it from memory would destroy registers.
2846   __ push(0);
2847 
2848   // Save everything in sight.
2849   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ true);
2850 
2851   // Now it is safe to overwrite any register
2852 
2853   // Deopt during an exception.  Save exec mode for unpack_frames.
2854   __ movl(r14, Deoptimization::Unpack_exception); // callee-saved
2855 
2856   // load throwing pc from JavaThread and patch it as the return address
2857   // of the current frame. Then clear the field in JavaThread
2858 
2859   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2860   __ movptr(Address(rbp, wordSize), rdx);
2861   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2862 
2863 #ifdef ASSERT
2864   // verify that there is really an exception oop in JavaThread
2865   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2866   __ verify_oop(rax);
2867 
2868   // verify that there is no pending exception
2869   Label no_pending_exception;
2870   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
2871   __ testptr(rax, rax);
2872   __ jcc(Assembler::zero, no_pending_exception);
2873   __ stop("must not have pending exception here");
2874   __ bind(no_pending_exception);
2875 #endif
2876 
2877   __ bind(cont);
2878 
2879   // Call C code.  Need thread and this frame, but NOT official VM entry
2880   // crud.  We cannot block on this call, no GC can happen.
2881   //
2882   // UnrollBlock* fetch_unroll_info(JavaThread* thread)
2883 
2884   // fetch_unroll_info needs to call last_java_frame().
2885 
2886   __ set_last_Java_frame(noreg, noreg, NULL);
2887 #ifdef ASSERT
2888   { Label L;
2889     __ cmpptr(Address(r15_thread,
2890                     JavaThread::last_Java_fp_offset()),
2891             (int32_t)0);
2892     __ jcc(Assembler::equal, L);
2893     __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
2894     __ bind(L);
2895   }
2896 #endif // ASSERT
2897   __ mov(c_rarg0, r15_thread);
2898   __ movl(c_rarg1, r14); // exec_mode
2899   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
2900 
2901   // Need to have an oopmap that tells fetch_unroll_info where to
2902   // find any register it might need.
2903   oop_maps->add_gc_map(__ pc() - start, map);
2904 
2905   __ reset_last_Java_frame(false);
2906 
2907 #if INCLUDE_JVMCI
2908   if (EnableJVMCI) {
2909     __ bind(after_fetch_unroll_info_call);
2910   }
2911 #endif
2912 
2913   // Load UnrollBlock* into rdi
2914   __ mov(rdi, rax);
2915 
2916   __ movl(r14, Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
2917   Label noException;
2918   __ cmpl(r14, Deoptimization::Unpack_exception);   // Was exception pending?
2919   __ jcc(Assembler::notEqual, noException);
2920   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
2921   // QQQ this is useless; it was NULL above
2922   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
2923   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int32_t)NULL_WORD);
2924   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int32_t)NULL_WORD);
2925 
2926   __ verify_oop(rax);
2927 
2928   // Overwrite the result registers with the exception results.
2929   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
2930   // I think this is useless
2931   __ movptr(Address(rsp, RegisterSaver::rdx_offset_in_bytes()), rdx);
2932 
2933   __ bind(noException);
2934 
2935   // Only register save data is on the stack.
2936   // Now restore the result registers.  Everything else is either dead
2937   // or captured in the vframeArray.
2938   RegisterSaver::restore_result_registers(masm);
2939 
2940   // All of the register save area has been popped off the stack. Only the
2941   // return address remains.
2942 
2943   // Pop all the frames we must move/replace.
2944   //
2945   // Frame picture (youngest to oldest)
2946   // 1: self-frame (no frame link)
2947   // 2: deopting frame  (no frame link)
2948   // 3: caller of deopting frame (could be compiled/interpreted).
2949   //
2950   // Note: by leaving the return address of self-frame on the stack
2951   // and using the size of frame 2 to adjust the stack
2952   // when we are done the return to frame 3 will still be on the stack.
2953 
2954   // Pop deoptimized frame
2955   __ movl(rcx, Address(rdi, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
2956   __ addptr(rsp, rcx);
2957 
2958   // rsp should be pointing at the return address to the caller (3)
2959 
2960   // Pick up the initial fp we should save
2961   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
2962   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
2963 
2964 #ifdef ASSERT
2965   // Compilers generate code that bangs the stack by as much as the
2966   // interpreter would need. So this stack banging should never
2967   // trigger a fault. Verify that it does not on non-product builds.
2968   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
2969   __ bang_stack_size(rbx, rcx);
2970 #endif
2971 
2972   // Load address of array of frame pcs into rcx
2973   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
2974 
2975   // Trash the old pc
2976   __ addptr(rsp, wordSize);
2977 
2978   // Load address of array of frame sizes into rsi
2979   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
2980 
2981   // Load counter into rdx
2982   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
2983 
2984   // Now adjust the caller's stack to make up for the extra locals,
2985   // but record the original sp so that we can save it in the skeletal interpreter
2986   // frame; the stack walking of interpreter_sender will then get the unextended sp
2987   // value and not the "real" sp value.
2988 
2989   const Register sender_sp = r8;
2990 
2991   __ mov(sender_sp, rsp);
2992   __ movl(rbx, Address(rdi,
2993                        Deoptimization::UnrollBlock::
2994                        caller_adjustment_offset_in_bytes()));
2995   __ subptr(rsp, rbx);
2996 
2997   // Push interpreter frames in a loop
2998   Label loop;
2999   __ bind(loop);
3000   __ movptr(rbx, Address(rsi, 0));      // Load frame size
3001   __ subptr(rbx, 2*wordSize);           // We'll push pc and ebp by hand
3002   __ pushptr(Address(rcx, 0));          // Save return address
3003   __ enter();                           // Save old & set new ebp
3004   __ subptr(rsp, rbx);                  // Prolog
3005   // This value is corrected by layout_activation_impl
3006   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
3007   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize), sender_sp); // Make it walkable
3008   __ mov(sender_sp, rsp);               // Pass sender_sp to next frame
3009   __ addptr(rsi, wordSize);             // Bump array pointer (sizes)
3010   __ addptr(rcx, wordSize);             // Bump array pointer (pcs)
3011   __ decrementl(rdx);                   // Decrement counter
3012   __ jcc(Assembler::notZero, loop);
3013   __ pushptr(Address(rcx, 0));          // Save final return address
3014 
3015   // Re-push self-frame
3016   __ enter();                           // Save old & set new ebp
3017 
3018   // Allocate a full sized register save area.
3019   // Return address and rbp are in place, so we allocate two fewer words.
3020   __ subptr(rsp, (frame_size_in_words - 2) * wordSize);
3021 
3022   // Restore frame locals after moving the frame
3023   __ movdbl(Address(rsp, RegisterSaver::xmm0_offset_in_bytes()), xmm0);
3024   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3025 
3026   // Call C code.  Need thread but NOT official VM entry
3027   // crud.  We cannot block on this call, no GC can happen.  Call should
3028   // restore return values to their stack-slots with the new SP.
3029   //
3030   // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
3031 
3032   // Use rbp because the frames look interpreted now
3033   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3034   // Don't need the precise return PC here, just precise enough to point into this code blob.
3035   address the_pc = __ pc();
3036   __ set_last_Java_frame(noreg, rbp, the_pc);
3037 
3038   __ andptr(rsp, -(StackAlignmentInBytes));  // Fix stack alignment as required by ABI
3039   __ mov(c_rarg0, r15_thread);
3040   __ movl(c_rarg1, r14); // second arg: exec_mode
3041   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3042   // Revert SP alignment after call since we're going to do some SP relative addressing below
3043   __ movptr(rsp, Address(r15_thread, JavaThread::last_Java_sp_offset()));
3044 
3045   // Set an oopmap for the call site
3046   // Use the same PC we used for the last java frame
3047   oop_maps->add_gc_map(the_pc - start,
3048                        new OopMap( frame_size_in_words, 0 ));
3049 
3050   // Clear fp AND pc
3051   __ reset_last_Java_frame(true);
3052 
3053   // Collect return values
3054   __ movdbl(xmm0, Address(rsp, RegisterSaver::xmm0_offset_in_bytes()));
3055   __ movptr(rax, Address(rsp, RegisterSaver::rax_offset_in_bytes()));
3056   // I think this is useless (throwing pc?)
3057   __ movptr(rdx, Address(rsp, RegisterSaver::rdx_offset_in_bytes()));
3058 
3059   // Pop self-frame.
3060   __ leave();                           // Epilog
3061 
3062   // Jump to interpreter
3063   __ ret(0);
3064 
3065   // Make sure all code is generated
3066   masm->flush();
3067 
3068   _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
3069   _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
3070 #if INCLUDE_JVMCI
3071   if (EnableJVMCI) {
3072     _deopt_blob->set_uncommon_trap_offset(uncommon_trap_offset);
3073     _deopt_blob->set_implicit_exception_uncommon_trap_offset(implicit_exception_uncommon_trap_offset);
3074   }
3075 #endif
3076 }
3077 
3078 #ifdef COMPILER2
3079 //------------------------------generate_uncommon_trap_blob--------------------
3080 void SharedRuntime::generate_uncommon_trap_blob() {
3081   // Allocate space for the code
3082   ResourceMark rm;
3083   // Setup code generation tools
3084   CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
3085   MacroAssembler* masm = new MacroAssembler(&buffer);
3086 
3087   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
3088 
3089   address start = __ pc();
3090 
3091   if (UseRTMLocking) {
3092     // Abort RTM transaction before possible nmethod deoptimization.
3093     __ xabort(0);
3094   }
3095 
3096   // Push self-frame.  We get here with a return address on the
3097   // stack, so rsp is 8-byte aligned until we allocate our frame.
3098   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
3099 
3100   // No callee saved registers. rbp is assumed implicitly saved
3101   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
3102 
3103   // The compiler left unloaded_class_index in j_rarg0; move it to where the
3104   // runtime expects it.
3105   __ movl(c_rarg1, j_rarg0);
3106 
3107   __ set_last_Java_frame(noreg, noreg, NULL);
3108 
3109   // Call C code.  Need thread but NOT official VM entry
3110   // crud.  We cannot block on this call, no GC can happen.  Call should
3111   // capture callee-saved registers as well as return values.
3112   // Thread is in rdi already.
3113   //
3114   // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
3115 
3116   __ mov(c_rarg0, r15_thread);
3117   __ movl(c_rarg2, Deoptimization::Unpack_uncommon_trap);
3118   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap)));
3119 
3120   // Set an oopmap for the call site
3121   OopMapSet* oop_maps = new OopMapSet();
3122   OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
3123 
3124   // location of rbp is known implicitly by the frame sender code
3125 
3126   oop_maps->add_gc_map(__ pc() - start, map);
3127 
3128   __ reset_last_Java_frame(false);
3129 
3130   // Load UnrollBlock* into rdi
3131   __ mov(rdi, rax);
3132 
3133 #ifdef ASSERT
3134   { Label L;
3135     __ cmpptr(Address(rdi, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()),
3136             (int32_t)Deoptimization::Unpack_uncommon_trap);
3137     __ jcc(Assembler::equal, L);
3138     __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
3139     __ bind(L);
3140   }
3141 #endif
3142 
3143   // Pop all the frames we must move/replace.
3144   //
3145   // Frame picture (youngest to oldest)
3146   // 1: self-frame (no frame link)
3147   // 2: deopting frame  (no frame link)
3148   // 3: caller of deopting frame (could be compiled/interpreted).
3149 
3150   // Pop self-frame.  We have no frame, and must rely only on rax and rsp.
3151   __ addptr(rsp, (SimpleRuntimeFrame::framesize - 2) << LogBytesPerInt); // Epilog!
3152 
3153   // Pop deoptimized frame (int)
3154   __ movl(rcx, Address(rdi,
3155                        Deoptimization::UnrollBlock::
3156                        size_of_deoptimized_frame_offset_in_bytes()));
3157   __ addptr(rsp, rcx);
3158 
3159   // rsp should be pointing at the return address to the caller (3)
3160 
3161   // Pick up the initial fp we should save
3162   // restore rbp before stack bang because if stack overflow is thrown it needs to be pushed (and preserved)
3163   __ movptr(rbp, Address(rdi, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
3164 
3165 #ifdef ASSERT
3166   // Compilers generate code that bangs the stack by as much as the
3167   // interpreter would need. So this stack banging should never
3168   // trigger a fault. Verify that it does not on non-product builds.
3169   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
3170   __ bang_stack_size(rbx, rcx);
3171 #endif
3172 
3173   // Load address of array of frame pcs into rcx (address*)
3174   __ movptr(rcx, Address(rdi, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
3175 
3176   // Trash the return pc
3177   __ addptr(rsp, wordSize);
3178 
3179   // Load address of array of frame sizes into rsi (intptr_t*)
3180   __ movptr(rsi, Address(rdi, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
3181 
3182   // Counter
3183   __ movl(rdx, Address(rdi, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); // (int)
3184 
3185   // Now adjust the caller's stack to make up for the extra locals, but
3186   // record the original sp so that we can save it in the skeletal
3187   // interpreter frame; the stack walking of interpreter_sender
3188   // will then get the unextended sp value and not the "real" sp value.
3189 
3190   const Register sender_sp = r8;
3191 
3192   __ mov(sender_sp, rsp);
3193   __ movl(rbx, Address(rdi, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes())); // (int)
3194   __ subptr(rsp, rbx);
3195 
3196   // Push interpreter frames in a loop
3197   Label loop;
3198   __ bind(loop);
3199   __ movptr(rbx, Address(rsi, 0)); // Load frame size
3200   __ subptr(rbx, 2 * wordSize);    // We'll push pc and rbp by hand
3201   __ pushptr(Address(rcx, 0));     // Save return address
3202   __ enter();                      // Save old & set new rbp
3203   __ subptr(rsp, rbx);             // Prolog
3204   __ movptr(Address(rbp, frame::interpreter_frame_sender_sp_offset * wordSize),
3205             sender_sp);            // Make it walkable
3206   // This value is corrected by layout_activation_impl
3207   __ movptr(Address(rbp, frame::interpreter_frame_last_sp_offset * wordSize), (int32_t)NULL_WORD );
3208   __ mov(sender_sp, rsp);          // Pass sender_sp to next frame
3209   __ addptr(rsi, wordSize);        // Bump array pointer (sizes)
3210   __ addptr(rcx, wordSize);        // Bump array pointer (pcs)
3211   __ decrementl(rdx);              // Decrement counter
3212   __ jcc(Assembler::notZero, loop);
3213   __ pushptr(Address(rcx, 0));     // Save final return address
3214 
3215   // Re-push self-frame
3216   __ enter();                 // Save old & set new rbp
3217   __ subptr(rsp, (SimpleRuntimeFrame::framesize - 4) << LogBytesPerInt);
3218                               // Prolog
3219 
3220   // Use rbp because the frames look interpreted now
3221   // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
3222   // Don't need the precise return PC here, just precise enough to point into this code blob.
3223   address the_pc = __ pc();
3224   __ set_last_Java_frame(noreg, rbp, the_pc);
3225 
3226   // Call C code.  Need thread but NOT official VM entry
3227   // crud.  We cannot block on this call, no GC can happen.  Call should
3228   // restore return values to their stack-slots with the new SP.
3229   // Thread is in rdi already.
3230   //
3231   // BasicType unpack_frames(JavaThread* thread, int exec_mode);
3232 
3233   __ andptr(rsp, -(StackAlignmentInBytes)); // Align SP as required by ABI
3234   __ mov(c_rarg0, r15_thread);
3235   __ movl(c_rarg1, Deoptimization::Unpack_uncommon_trap);
3236   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
3237 
3238   // Set an oopmap for the call site
3239   // Use the same PC we used for the last java frame
3240   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
3241 
3242   // Clear fp AND pc
3243   __ reset_last_Java_frame(true);
3244 
3245   // Pop self-frame.
3246   __ leave();                 // Epilog
3247 
3248   // Jump to interpreter
3249   __ ret(0);
3250 
3251   // Make sure all code is generated
3252   masm->flush();
3253 
3254   _uncommon_trap_blob =  UncommonTrapBlob::create(&buffer, oop_maps,
3255                                                  SimpleRuntimeFrame::framesize >> 1);
3256 }
3257 #endif // COMPILER2
3258 
3259 //------------------------------generate_handler_blob------
3260 //
3261 // Generate a special Compile2Runtime blob that saves all registers,
3262 // and sets up an oopmap.
3263 //
3264 SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
3265   assert(StubRoutines::forward_exception_entry() != NULL,
3266          "must be generated before");
3267 
3268   ResourceMark rm;
3269   OopMapSet *oop_maps = new OopMapSet();
3270   OopMap* map;
3271 
3272   // Allocate space for the code.  Setup code generation tools.
3273   CodeBuffer buffer("handler_blob", 2048, 1024);
3274   MacroAssembler* masm = new MacroAssembler(&buffer);
3275 
3276   address start   = __ pc();
3277   address call_pc = NULL;
3278   int frame_size_in_words;
3279   bool cause_return = (poll_type == POLL_AT_RETURN);
3280   bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
3281 
3282   if (UseRTMLocking) {
3283     // Abort RTM transaction before calling runtime
3284     // because critical section will be large and will be
3285     // aborted anyway. Also nmethod could be deoptimized.
3286     __ xabort(0);
3287   }
3288 
3289   // Make room for return address (or push it again)
3290   if (!cause_return) {
3291     __ push(rbx);
3292   }
3293 
3294   // Save registers, fpu state, and flags
3295   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
3296 
3297   // The following is basically a call_VM.  However, we need the precise
3298   // address of the call in order to generate an oopmap. Hence, we do all the
3299   // work ourselves.
3300 
3301   __ set_last_Java_frame(noreg, noreg, NULL);
3302 
3303   // The return address must always be correct so that the frame constructor never
3304   // sees an invalid pc.
3305 
3306   if (!cause_return) {
3307     // Get the return pc saved by the signal handler and stash it in its appropriate place on the stack.
3308     // Additionally, rbx is a callee saved register and we can look at it later to determine
3309     // if someone changed the return address for us!
3310     __ movptr(rbx, Address(r15_thread, JavaThread::saved_exception_pc_offset()));
3311     __ movptr(Address(rbp, wordSize), rbx);
3312   }
3313 
3314   // Do the call
3315   __ mov(c_rarg0, r15_thread);
3316   __ call(RuntimeAddress(call_ptr));
3317 
3318   // Set an oopmap for the call site.  This oopmap will map all
3319   // oop-registers and debug-info registers as callee-saved.  This
3320   // will allow deoptimization at this safepoint to find all possible
3321   // debug-info recordings, as well as let GC find all oops.
3322 
3323   oop_maps->add_gc_map( __ pc() - start, map);
3324 
3325   Label noException;
3326 
3327   __ reset_last_Java_frame(false);
3328 
3329   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3330   __ jcc(Assembler::equal, noException);
3331 
3332   // Exception pending
3333 
3334   RegisterSaver::restore_live_registers(masm, save_vectors);
3335 
3336   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3337 
3338   // No exception case
3339   __ bind(noException);
3340 
3341   Label no_adjust;
3342 #ifdef ASSERT
3343   Label bail;
3344 #endif
3345   if (!cause_return) {
3346     Label no_prefix, not_special;
3347 
3348     // If our stashed return pc was modified by the runtime we avoid touching it
3349     __ cmpptr(rbx, Address(rbp, wordSize));
3350     __ jccb(Assembler::notEqual, no_adjust);
3351 
3352     // Skip over the poll instruction.
3353     // See NativeInstruction::is_safepoint_poll()
3354     // Possible encodings:
3355     //      85 00       test   %eax,(%rax)
3356     //      85 01       test   %eax,(%rcx)
3357     //      85 02       test   %eax,(%rdx)
3358     //      85 03       test   %eax,(%rbx)
3359     //      85 06       test   %eax,(%rsi)
3360     //      85 07       test   %eax,(%rdi)
3361     //
3362     //   41 85 00       test   %eax,(%r8)
3363     //   41 85 01       test   %eax,(%r9)
3364     //   41 85 02       test   %eax,(%r10)
3365     //   41 85 03       test   %eax,(%r11)
3366     //   41 85 06       test   %eax,(%r14)
3367     //   41 85 07       test   %eax,(%r15)
3368     //
3369     //      85 04 24    test   %eax,(%rsp)
3370     //   41 85 04 24    test   %eax,(%r12)
3371     //      85 45 00    test   %eax,0x0(%rbp)
3372     //   41 85 45 00    test   %eax,0x0(%r13)
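         //
         // Worked example (illustrative): for "41 85 45 00" above
         // (test %eax,0x0(%r13)), rbx initially points at the 0x41 REX prefix.
         // The code below advances rbx by 1 past the prefix, by 1 more because
         // the modrm base (r13, encoding 0x05) carries an extra byte, and
         // finally by 2 for the opcode and modrm bytes, so the patched return
         // pc skips all 4 bytes of the poll instruction.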
3373 
3374     __ cmpb(Address(rbx, 0), NativeTstRegMem::instruction_rex_b_prefix);
3375     __ jcc(Assembler::notEqual, no_prefix);
3376     __ addptr(rbx, 1);
3377     __ bind(no_prefix);
3378 #ifdef ASSERT
3379     __ movptr(rax, rbx); // remember where 0x85 should be, for verification below
3380 #endif
3381     // r12/r13/rsp/rbp base encoding takes 3 bytes with the following register values:
3382     // r12/rsp 0x04
3383     // r13/rbp 0x05
3384     __ movzbq(rcx, Address(rbx, 1));
3385     __ andptr(rcx, 0x07); // looking for 0x04 .. 0x05
3386     __ subptr(rcx, 4);    // looking for 0x00 .. 0x01
3387     __ cmpptr(rcx, 1);
3388     __ jcc(Assembler::above, not_special);
3389     __ addptr(rbx, 1);
3390     __ bind(not_special);
3391 #ifdef ASSERT
3392     // Verify the correct encoding of the poll we're about to skip.
3393     __ cmpb(Address(rax, 0), NativeTstRegMem::instruction_code_memXregl);
3394     __ jcc(Assembler::notEqual, bail);
3395     // Mask out the modrm bits
3396     __ testb(Address(rax, 1), NativeTstRegMem::modrm_mask);
3397     // rax encodes to 0, so if the bits are nonzero it's incorrect
3398     __ jcc(Assembler::notZero, bail);
3399 #endif
3400     // Adjust return pc forward to step over the safepoint poll instruction
3401     __ addptr(rbx, 2);
3402     __ movptr(Address(rbp, wordSize), rbx);
3403   }
3404 
3405   __ bind(no_adjust);
3406   // Normal exit, restore registers and exit.
3407   RegisterSaver::restore_live_registers(masm, save_vectors);
3408   __ ret(0);
3409 
3410 #ifdef ASSERT
3411   __ bind(bail);
3412   __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected");
3413 #endif
3414 
3415   // Make sure all code is generated
3416   masm->flush();
3417 
3418   // Fill-out other meta info
3419   return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
3420 }
3421 
3422 //
3423 // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
3424 //
3425 // Generate a stub that calls into vm to find out the proper destination
3426 // of a java call. All the argument registers are live at this point,
3427 // but since this is generic code we don't know what they are, and the caller
3428 // must do any gc of the args.
3429 //
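     // In outline (illustrative summary, no additional behavior): the stub saves
     // the argument registers, calls `destination` in the VM, stores the resolved
     // Method* into the rbx save slot and the target entry point into the rax
     // save slot, restores the registers and jumps to that entry. If the VM call
     // left a pending exception, it forwards it via
     // StubRoutines::forward_exception_entry() instead.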
3430 RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
3431   assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
3432 
3433   // allocate space for the code
3434   ResourceMark rm;
3435 
3436   CodeBuffer buffer(name, 1000, 512);
3437   MacroAssembler* masm                = new MacroAssembler(&buffer);
3438 
3439   int frame_size_in_words;
3440 
3441   OopMapSet *oop_maps = new OopMapSet();
3442   OopMap* map = NULL;
3443 
3444   int start = __ offset();
3445 
3446   // No need to save vector registers since they are caller-saved anyway.
3447   map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, /*save_vectors*/ false);
3448 
3449   int frame_complete = __ offset();
3450 
3451   __ set_last_Java_frame(noreg, noreg, NULL);
3452 
3453   __ mov(c_rarg0, r15_thread);
3454 
3455   __ call(RuntimeAddress(destination));
3456 
3457 
3458   // Set an oopmap for the call site.
3459   // We need this not only for callee-saved registers, but also for volatile
3460   // registers that the compiler might be keeping live across a safepoint.
3461 
3462   oop_maps->add_gc_map( __ offset() - start, map);
3463 
3464   // rax contains the address we are going to jump to assuming no exception got installed
3465 
3466   // clear last_Java_sp
3467   __ reset_last_Java_frame(false);
3468   // check for pending exceptions
3469   Label pending;
3470   __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
3471   __ jcc(Assembler::notEqual, pending);
3472 
3473   // get the returned Method*
3474   __ get_vm_result_2(rbx, r15_thread);
3475   __ movptr(Address(rsp, RegisterSaver::rbx_offset_in_bytes()), rbx);
3476 
3477   __ movptr(Address(rsp, RegisterSaver::rax_offset_in_bytes()), rax);
3478 
3479   RegisterSaver::restore_live_registers(masm);
3480 
3481   // We are back to the original state on entry and ready to go.
3482 
3483   __ jmp(rax);
3484 
3485   // Pending exception after the safepoint
3486 
3487   __ bind(pending);
3488 
3489   RegisterSaver::restore_live_registers(masm);
3490 
3491   // exception pending => remove activation and forward to exception handler
3492 
3493   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3494 
3495   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3496   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3497 
3498   // -------------
3499   // make sure all code is generated
3500   masm->flush();
3501 
3502   // return the blob
3503   // frame_size_words or bytes??
3504   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3505 }
3506 
3507 #ifdef COMPILER2
3508 static const int native_invoker_code_size = MethodHandles::adapter_code_size;
3509 
3510 class NativeInvokerGenerator : public StubCodeGenerator {
3511   address _call_target;
3512   int _shadow_space_bytes;
3513 
3514   const GrowableArray<VMReg>& _input_registers;
3515   const GrowableArray<VMReg>& _output_registers;
3516 
3517   int _frame_complete;
3518   int _framesize;
3519   OopMapSet* _oop_maps;
3520 public:
3521   NativeInvokerGenerator(CodeBuffer* buffer,
3522                          address call_target,
3523                          int shadow_space_bytes,
3524                          const GrowableArray<VMReg>& input_registers,
3525                          const GrowableArray<VMReg>& output_registers)
3526    : StubCodeGenerator(buffer, PrintMethodHandleStubs),
3527      _call_target(call_target),
3528      _shadow_space_bytes(shadow_space_bytes),
3529      _input_registers(input_registers),
3530      _output_registers(output_registers),
3531      _frame_complete(0),
3532      _framesize(0),
3533      _oop_maps(NULL) {
3534     assert(_output_registers.length() <= 1
3535            || (_output_registers.length() == 2 && !_output_registers.at(1)->is_valid()), "no multi-reg returns");
3536 
3537   }
3538 
3539   void generate();
3540 
3541   int spill_size_in_bytes() const {
3542     if (_output_registers.length() == 0) {
3543       return 0;
3544     }
3545     VMReg reg = _output_registers.at(0);
3546     assert(reg->is_reg(), "must be a register");
3547     if (reg->is_Register()) {
3548       return 8;
3549     } else if (reg->is_XMMRegister()) {
3550       if (UseAVX >= 3) {
3551         return 64;
3552       } else if (UseAVX >= 1) {
3553         return 32;
3554       } else {
3555         return 16;
3556       }
3557     } else {
3558       ShouldNotReachHere();
3559     }
3560     return 0;
3561   }
3562 
3563   void spill_out_registers() {
3564     if (_output_registers.length() == 0) {
3565       return;
3566     }
3567     VMReg reg = _output_registers.at(0);
3568     assert(reg->is_reg(), "must be a register");
3569     MacroAssembler* masm = _masm;
3570     if (reg->is_Register()) {
3571       __ movptr(Address(rsp, 0), reg->as_Register());
3572     } else if (reg->is_XMMRegister()) {
3573       if (UseAVX >= 3) {
3574         __ evmovdqul(Address(rsp, 0), reg->as_XMMRegister(), Assembler::AVX_512bit);
3575       } else if (UseAVX >= 1) {
3576         __ vmovdqu(Address(rsp, 0), reg->as_XMMRegister());
3577       } else {
3578         __ movdqu(Address(rsp, 0), reg->as_XMMRegister());
3579       }
3580     } else {
3581       ShouldNotReachHere();
3582     }
3583   }
3584 
3585   void fill_out_registers() {
3586     if (_output_registers.length() == 0) {
3587       return;
3588     }
3589     VMReg reg = _output_registers.at(0);
3590     assert(reg->is_reg(), "must be a register");
3591     MacroAssembler* masm = _masm;
3592     if (reg->is_Register()) {
3593       __ movptr(reg->as_Register(), Address(rsp, 0));
3594     } else if (reg->is_XMMRegister()) {
3595       if (UseAVX >= 3) {
3596         __ evmovdqul(reg->as_XMMRegister(), Address(rsp, 0), Assembler::AVX_512bit);
3597       } else if (UseAVX >= 1) {
3598         __ vmovdqu(reg->as_XMMRegister(), Address(rsp, 0));
3599       } else {
3600         __ movdqu(reg->as_XMMRegister(), Address(rsp, 0));
3601       }
3602     } else {
3603       ShouldNotReachHere();
3604     }
3605   }
3606 
3607   int frame_complete() const {
3608     return _frame_complete;
3609   }
3610 
3611   int framesize() const {
3612     return (_framesize >> (LogBytesPerWord - LogBytesPerInt));
3613   }
3614 
3615   OopMapSet* oop_maps() const {
3616     return _oop_maps;
3617   }
3618 
3619 private:
3620 #ifdef ASSERT
3621 bool target_uses_register(VMReg reg) {
3622   return _input_registers.contains(reg) || _output_registers.contains(reg);
3623 }
3624 #endif
3625 };
3626 
3627 RuntimeStub* SharedRuntime::make_native_invoker(address call_target,
3628                                                 int shadow_space_bytes,
3629                                                 const GrowableArray<VMReg>& input_registers,
3630                                                 const GrowableArray<VMReg>& output_registers) {
3631   int locs_size  = 64;
3632   CodeBuffer code("nep_invoker_blob", native_invoker_code_size, locs_size);
3633   NativeInvokerGenerator g(&code, call_target, shadow_space_bytes, input_registers, output_registers);
3634   g.generate();
3635   code.log_section_sizes("nep_invoker_blob");
3636 
3637   RuntimeStub* stub =
3638     RuntimeStub::new_runtime_stub("nep_invoker_blob",
3639                                   &code,
3640                                   g.frame_complete(),
3641                                   g.framesize(),
3642                                   g.oop_maps(), false);
3643   return stub;
3644 }
3645 
3646 void NativeInvokerGenerator::generate() {
3647   assert(!(target_uses_register(r15_thread->as_VMReg()) || target_uses_register(rscratch1->as_VMReg())), "Register conflict");
3648 
3649   enum layout {
3650     rbp_off,
3651     rbp_off2,
3652     return_off,
3653     return_off2,
3654     framesize // inclusive of return address
3655   };
3656 
3657   _framesize = align_up(framesize + ((_shadow_space_bytes + spill_size_in_bytes()) >> LogBytesPerInt), 4);
3658   assert(is_even(_framesize/2), "sp not 16-byte aligned");
3659 
3660   _oop_maps  = new OopMapSet();
3661   MacroAssembler* masm = _masm;
3662 
3663   address start = __ pc();
3664 
3665   __ enter();
3666 
3667   // return address and rbp are already in place
3668   __ subptr(rsp, (_framesize-4) << LogBytesPerInt); // prolog
3669 
3670   _frame_complete = __ pc() - start;
3671 
3672   address the_pc = __ pc();
3673 
3674   __ set_last_Java_frame(rsp, rbp, (address)the_pc);
3675   OopMap* map = new OopMap(_framesize, 0);
3676   _oop_maps->add_gc_map(the_pc - start, map);
3677 
3678   // State transition
3679   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native);
3680 
3681   __ call(RuntimeAddress(_call_target));
3682 
3683   __ restore_cpu_control_state_after_jni();
3684 
3685   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_native_trans);
3686 
3687   // Force this write out before the read below
3688   __ membar(Assembler::Membar_mask_bits(
3689           Assembler::LoadLoad | Assembler::LoadStore |
3690           Assembler::StoreLoad | Assembler::StoreStore));
3691 
3692   Label L_after_safepoint_poll;
3693   Label L_safepoint_poll_slow_path;
3694 
3695   __ safepoint_poll(L_safepoint_poll_slow_path, r15_thread, true /* at_return */, false /* in_nmethod */);
3696   __ cmpl(Address(r15_thread, JavaThread::suspend_flags_offset()), 0);
3697   __ jcc(Assembler::notEqual, L_safepoint_poll_slow_path);
3698 
3699   __ bind(L_after_safepoint_poll);
3700 
3701   // change thread state
3702   __ movl(Address(r15_thread, JavaThread::thread_state_offset()), _thread_in_Java);
3703 
3704   __ block_comment("reguard stack check");
3705   Label L_reguard;
3706   Label L_after_reguard;
3707   __ cmpl(Address(r15_thread, JavaThread::stack_guard_state_offset()), StackOverflow::stack_guard_yellow_reserved_disabled);
3708   __ jcc(Assembler::equal, L_reguard);
3709   __ bind(L_after_reguard);
3710 
3711   __ reset_last_Java_frame(r15_thread, true);
3712 
3713   __ leave(); // required for proper stackwalking of RuntimeStub frame
3714   __ ret(0);
3715 
3716   //////////////////////////////////////////////////////////////////////////////
3717 
3718   __ block_comment("{ L_safepoint_poll_slow_path");
3719   __ bind(L_safepoint_poll_slow_path);
3720   __ vzeroupper();
3721 
3722   spill_out_registers();
3723 
3724   __ mov(c_rarg0, r15_thread);
3725   __ mov(r12, rsp); // remember sp
3726   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3727   __ andptr(rsp, -16); // align stack as required by ABI
3728   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
3729   __ mov(rsp, r12); // restore sp
3730   __ reinit_heapbase();
3731 
3732   fill_out_registers();
3733 
3734   __ jmp(L_after_safepoint_poll);
3735   __ block_comment("} L_safepoint_poll_slow_path");
3736 
3737   //////////////////////////////////////////////////////////////////////////////
3738 
3739   __ block_comment("{ L_reguard");
3740   __ bind(L_reguard);
3741   __ vzeroupper();
3742 
3743   spill_out_registers();
3744 
3745   __ mov(r12, rsp); // remember sp
3746   __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
3747   __ andptr(rsp, -16); // align stack as required by ABI
3748   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)));
3749   __ mov(rsp, r12); // restore sp
3750   __ reinit_heapbase();
3751 
3752   fill_out_registers();
3753 
3754   __ jmp(L_after_reguard);
3755 
3756   __ block_comment("} L_reguard");
3757 
3758   //////////////////////////////////////////////////////////////////////////////
3759 
3760   __ flush();
3761 }
3762 #endif // COMPILER2
3763 
3764 //------------------------------Montgomery multiplication------------------------
3765 //
3766 
3767 #ifndef _WINDOWS
3768 
3769 // Subtract 0:b from carry:a.  Return carry.
3770 static julong
3771 sub(julong a[], julong b[], julong carry, long len) {
3772   long long i = 0, cnt = len;
3773   julong tmp;
3774   asm volatile("clc; "
3775                "0: ; "
3776                "mov (%[b], %[i], 8), %[tmp]; "
3777                "sbb %[tmp], (%[a], %[i], 8); "
3778                "inc %[i]; dec %[cnt]; "
3779                "jne 0b; "
3780                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3781                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3782                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3783                : "memory");
3784   return tmp;
3785 }
3786 
3787 // Multiply (unsigned) Long A by Long B, accumulating the double-
3788 // length result into the accumulator formed of T0, T1, and T2.
3789 #define MACC(A, B, T0, T1, T2)                                  \
3790 do {                                                            \
3791   unsigned long hi, lo;                                         \
3792   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4"   \
3793            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3794            : "r"(A), "a"(B) : "cc");                            \
3795  } while(0)
3796 
3797 // As above, but add twice the double-length result into the
3798 // accumulator.
3799 #define MACC2(A, B, T0, T1, T2)                                 \
3800 do {                                                            \
3801   unsigned long hi, lo;                                         \
3802   __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4; " \
3803            "add %%rax, %2; adc %%rdx, %3; adc $0, %4"           \
3804            : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)  \
3805            : "r"(A), "a"(B) : "cc");                            \
3806  } while(0)
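     // In effect (illustrative notation only; the _WINDOWS variants below are
     // equivalent):
     //   MACC(A, B, T0, T1, T2)  performs (T2:T1:T0) +=     (julong)A * (julong)B
     //   MACC2(A, B, T0, T1, T2) performs (T2:T1:T0) += 2 * (julong)A * (julong)B
     // where T2:T1:T0 is a 192-bit accumulator held in three 64-bit variables.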
3807 
3808 #else //_WINDOWS
3809 
3810 static julong
3811 sub(julong a[], julong b[], julong carry, long len) {
3812   long i;
3813   julong tmp;
3814   unsigned char c = 1;
3815   for (i = 0; i < len; i++) {
3816     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3817     a[i] = tmp;
3818   }
3819   c = _addcarry_u64(c, carry, ~0, &tmp);
3820   return tmp;
3821 }
3822 
3823 // Multiply (unsigned) Long A by Long B, accumulating the double-
3824 // length result into the accumulator formed of T0, T1, and T2.
3825 #define MACC(A, B, T0, T1, T2)                          \
3826 do {                                                    \
3827   julong hi, lo;                            \
3828   lo = _umul128(A, B, &hi);                             \
3829   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3830   c = _addcarry_u64(c, hi, T1, &T1);                    \
3831   _addcarry_u64(c, T2, 0, &T2);                         \
3832  } while(0)
3833 
3834 // As above, but add twice the double-length result into the
3835 // accumulator.
3836 #define MACC2(A, B, T0, T1, T2)                         \
3837 do {                                                    \
3838   julong hi, lo;                            \
3839   lo = _umul128(A, B, &hi);                             \
3840   unsigned char c = _addcarry_u64(0, lo, T0, &T0);      \
3841   c = _addcarry_u64(c, hi, T1, &T1);                    \
3842   _addcarry_u64(c, T2, 0, &T2);                         \
3843   c = _addcarry_u64(0, lo, T0, &T0);                    \
3844   c = _addcarry_u64(c, hi, T1, &T1);                    \
3845   _addcarry_u64(c, T2, 0, &T2);                         \
3846  } while(0)
3847 
3848 #endif //_WINDOWS
3849 
3850 // Fast Montgomery multiplication.  The derivation of the algorithm is
3851 // in A Cryptographic Library for the Motorola DSP56000,
3852 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
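     //
     // As a sketch of the contract (standard Montgomery arithmetic, stated here
     // for orientation only): with r = 2^(64*len), n odd, and inv chosen so that
     // inv * n[0] == -1 (mod 2^64), the routine below computes m congruent to
     // a * b * r^-1 (mod n).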
3853 
3854 static void NOINLINE
3855 montgomery_multiply(julong a[], julong b[], julong n[],
3856                     julong m[], julong inv, int len) {
3857   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3858   int i;
3859 
3860   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
3861 
3862   for (i = 0; i < len; i++) {
3863     int j;
3864     for (j = 0; j < i; j++) {
3865       MACC(a[j], b[i-j], t0, t1, t2);
3866       MACC(m[j], n[i-j], t0, t1, t2);
3867     }
3868     MACC(a[i], b[0], t0, t1, t2);
3869     m[i] = t0 * inv;
3870     MACC(m[i], n[0], t0, t1, t2);
3871 
3872     assert(t0 == 0, "broken Montgomery multiply");
3873 
3874     t0 = t1; t1 = t2; t2 = 0;
3875   }
3876 
3877   for (i = len; i < 2*len; i++) {
3878     int j;
3879     for (j = i-len+1; j < len; j++) {
3880       MACC(a[j], b[i-j], t0, t1, t2);
3881       MACC(m[j], n[i-j], t0, t1, t2);
3882     }
3883     m[i-len] = t0;
3884     t0 = t1; t1 = t2; t2 = 0;
3885   }
3886 
3887   while (t0)
3888     t0 = sub(m, n, t0, len);
3889 }
3890 
3891 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
3892 // multiplies so it should be up to 25% faster than Montgomery
3893 // multiplication.  However, its loop control is more complex and it
3894 // may actually run slower on some machines.
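     // (The saving comes from symmetry: in a square the cross products
     // a[j]*a[i-j] and a[i-j]*a[j] are equal, so the loops below compute each
     // such product once and add it twice via MACC2 instead of computing it
     // twice.)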
3895 
3896 static void NOINLINE
3897 montgomery_square(julong a[], julong n[],
3898                   julong m[], julong inv, int len) {
3899   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3900   int i;
3901 
3902   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
3903 
3904   for (i = 0; i < len; i++) {
3905     int j;
3906     int end = (i+1)/2;
3907     for (j = 0; j < end; j++) {
3908       MACC2(a[j], a[i-j], t0, t1, t2);
3909       MACC(m[j], n[i-j], t0, t1, t2);
3910     }
3911     if ((i & 1) == 0) {
3912       MACC(a[j], a[j], t0, t1, t2);
3913     }
3914     for (; j < i; j++) {
3915       MACC(m[j], n[i-j], t0, t1, t2);
3916     }
3917     m[i] = t0 * inv;
3918     MACC(m[i], n[0], t0, t1, t2);
3919 
3920     assert(t0 == 0, "broken Montgomery square");
3921 
3922     t0 = t1; t1 = t2; t2 = 0;
3923   }
3924 
3925   for (i = len; i < 2*len; i++) {
3926     int start = i-len+1;
3927     int end = start + (len - start)/2;
3928     int j;
3929     for (j = start; j < end; j++) {
3930       MACC2(a[j], a[i-j], t0, t1, t2);
3931       MACC(m[j], n[i-j], t0, t1, t2);
3932     }
3933     if ((i & 1) == 0) {
3934       MACC(a[j], a[j], t0, t1, t2);
3935     }
3936     for (; j < len; j++) {
3937       MACC(m[j], n[i-j], t0, t1, t2);
3938     }
3939     m[i-len] = t0;
3940     t0 = t1; t1 = t2; t2 = 0;
3941   }
3942 
3943   while (t0)
3944     t0 = sub(m, n, t0, len);
3945 }
3946 
3947 // Swap words in a longword.
3948 static julong swap(julong x) {
3949   return (x << 32) | (x >> 32);
3950 }
3951 
3952 // Copy len longwords from s to d, word-swapping as we go.  The
3953 // destination array is reversed.
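     // For example (illustrative values): with len == 2 and
     // s = { 0x2222222211111111, 0x4444444433333333 }, the destination becomes
     // d = { 0x3333333344444444, 0x1111111122222222 }: the array order is
     // reversed and the two 32-bit halves of every word are swapped, converting
     // between the jint-ordered layout used by the caller and the julong layout
     // used by the Montgomery code here.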
3954 static void reverse_words(julong *s, julong *d, int len) {
3955   d += len;
3956   while(len-- > 0) {
3957     d--;
3958     *d = swap(*s);
3959     s++;
3960   }
3961 }
3962 
3963 // The threshold at which squaring is advantageous was determined
3964 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
3965 #define MONTGOMERY_SQUARING_THRESHOLD 64
3966 
3967 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
3968                                         jint len, jlong inv,
3969                                         jint *m_ints) {
3970   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
3971   int longwords = len/2;
3972 
3973   // Make very sure we don't use so much space that the stack might
3974   // overflow.  512 jints correspond to a 16384-bit integer and
3975   // will use here a total of 8k bytes of stack space.
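       // (512 jints == 256 julongs per array; 4 arrays * 256 * 8 bytes == 8192.)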
3976   int total_allocation = longwords * sizeof (julong) * 4;
3977   guarantee(total_allocation <= 8192, "must be");
3978   julong *scratch = (julong *)alloca(total_allocation);
3979 
3980   // Local scratch arrays
3981   julong
3982     *a = scratch + 0 * longwords,
3983     *b = scratch + 1 * longwords,
3984     *n = scratch + 2 * longwords,
3985     *m = scratch + 3 * longwords;
3986 
3987   reverse_words((julong *)a_ints, a, longwords);
3988   reverse_words((julong *)b_ints, b, longwords);
3989   reverse_words((julong *)n_ints, n, longwords);
3990 
3991   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
3992 
3993   reverse_words(m, (julong *)m_ints, longwords);
3994 }
3995 
3996 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3997                                       jint len, jlong inv,
3998                                       jint *m_ints) {
3999   assert(len % 2 == 0, "array length in montgomery_square must be even");
4000   int longwords = len/2;
4001 
4002   // Make very sure we don't use so much space that the stack might
4003   // overflow.  512 jints correspond to a 16384-bit integer and
4004   // will use here a total of 6k bytes of stack space.
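       // (512 jints == 256 julongs per array; 3 arrays * 256 * 8 bytes == 6144,
       // comfortably under the 8192-byte guarantee below.)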
4005   int total_allocation = longwords * sizeof (julong) * 3;
4006   guarantee(total_allocation <= 8192, "must be");
4007   julong *scratch = (julong *)alloca(total_allocation);
4008 
4009   // Local scratch arrays
4010   julong
4011     *a = scratch + 0 * longwords,
4012     *n = scratch + 1 * longwords,
4013     *m = scratch + 2 * longwords;
4014 
4015   reverse_words((julong *)a_ints, a, longwords);
4016   reverse_words((julong *)n_ints, n, longwords);
4017 
4018   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
4019     ::montgomery_square(a, n, m, (julong)inv, longwords);
4020   } else {
4021     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
4022   }
4023 
4024   reverse_words(m, (julong *)m_ints, longwords);
4025 }
4026 
4027 #ifdef COMPILER2
4028 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
4029 //
4030 //------------------------------generate_exception_blob---------------------------
4031 // Creates the exception blob at the end.
4032 // Using the exception blob, this code is jumped to from a compiled method.
4033 // (see emit_exception_handler in the x86_64.ad file)
4034 //
4035 // Given an exception pc at a call we call into the runtime for the
4036 // handler in this method. This handler might merely restore state
4037 // (i.e. callee save registers), unwind the frame, and jump to the
4038 // exception handler for the nmethod if there is no Java level handler
4039 // for the nmethod.
4040 //
4041 // This code is entered with a jmp.
4042 //
4043 // Arguments:
4044 //   rax: exception oop
4045 //   rdx: exception pc
4046 //
4047 // Results:
4048 //   rax: exception oop
4049 //   rdx: exception pc in caller or ???
4050 //   destination: exception handler of caller
4051 //
4052 // Note: the exception pc MUST be at a call (precise debug information)
4053 //       Registers rax, rdx, rcx, rsi, rdi, r8-r11 are not callee saved.
4054 //
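// Outline of the generated blob (an informal sketch; the MacroAssembler
// code below is authoritative):
//
//   push rdx                          // exception pc becomes the return address
//   sub  rsp, #frame                  // build a SimpleRuntimeFrame
//   mov  [rsp + rbp_off], rbp         // spill rbp
//   set_last_Java_frame(the_pc)
//   call OptoRuntime::handle_exception_C(r15_thread)  // handler -> rax
//   reset_last_Java_frame()
//   mov  rbp, [rsp + rbp_off]         // restore rbp
//   add  rsp, #frame
//   pop  rdx                          // old exception pc no longer needed
//   mov  r8, rax                      // save handler address
//   mov  rax, [r15 + exception_oop]   // reload exception oop ...
//   mov  rdx, [r15 + exception_pc]    // ... and exception pc from the thread
//   jmp  r8                           // jump to the handler
//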
4055 
4056 void OptoRuntime::generate_exception_blob() {
4057   assert(!OptoRuntime::is_callee_saved_register(RDX_num), "");
4058   assert(!OptoRuntime::is_callee_saved_register(RAX_num), "");
4059   assert(!OptoRuntime::is_callee_saved_register(RCX_num), "");
4060 
4061   assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
4062 
4063   // Allocate space for the code
4064   ResourceMark rm;
4065   // Setup code generation tools
4066   CodeBuffer buffer("exception_blob", 2048, 1024);
4067   MacroAssembler* masm = new MacroAssembler(&buffer);
4068 
4069 
4070   address start = __ pc();
4071 
4072   // Exception pc is 'return address' for stack walker
4073   __ push(rdx);
4074   __ subptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Prolog
4075 
4076   // Save callee-saved registers.  See x86_64.ad.
4077 
4078   // rbp is an implicitly saved callee saved register (i.e., the calling
4079   // convention will save/restore it in the prolog/epilog). Other than that
4080   // there are no callee save registers now that adapter frames are gone.
4081 
4082   __ movptr(Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt), rbp);
4083 
  // Store exception in Thread object. We cannot pass any arguments to the
  // handle_exception call, since we do not want to make any assumptions
  // about the size of the frame in which the exception happened.
4087   // c_rarg0 is either rdi (Linux) or rcx (Windows).
4088   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()),rax);
4089   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), rdx);
4090 
4091   // This call does all the hard work.  It checks if an exception handler
4092   // exists in the method.
4093   // If so, it returns the handler address.
4094   // If not, it prepares for stack-unwinding, restoring the callee-save
4095   // registers of the frame being removed.
4096   //
4097   // address OptoRuntime::handle_exception_C(JavaThread* thread)
4098 
4099   // At a method handle call, the stack may not be properly aligned
4100   // when returning with an exception.
4101   address the_pc = __ pc();
4102   __ set_last_Java_frame(noreg, noreg, the_pc);
4103   __ mov(c_rarg0, r15_thread);
4104   __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
4105   __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
4106 
4107   // Set an oopmap for the call site.  This oopmap will only be used if we
4108   // are unwinding the stack.  Hence, all locations will be dead.
4109   // Callee-saved registers will be the same as the frame above (i.e.,
4110   // handle_exception_stub), since they were restored when we got the
4111   // exception.
4112 
4113   OopMapSet* oop_maps = new OopMapSet();
4114 
4115   oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
4116 
4117   __ reset_last_Java_frame(false);
4118 
4119   // Restore callee-saved registers
4120 
  // rbp is an implicitly saved callee-saved register (i.e., the calling
  // convention will save/restore it in the prolog/epilog). Other than that
  // there are no callee-saved registers now that adapter frames are gone.
4124 
4125   __ movptr(rbp, Address(rsp, SimpleRuntimeFrame::rbp_off << LogBytesPerInt));
4126 
4127   __ addptr(rsp, SimpleRuntimeFrame::return_off << LogBytesPerInt); // Epilog
4128   __ pop(rdx);                  // No need for exception pc anymore
4129 
4130   // rax: exception handler
4131 
4132   // We have a handler in rax (could be deopt blob).
4133   __ mov(r8, rax);
4134 
4135   // Get the exception oop
4136   __ movptr(rax, Address(r15_thread, JavaThread::exception_oop_offset()));
4137   // Get the exception pc in case we are deoptimized
4138   __ movptr(rdx, Address(r15_thread, JavaThread::exception_pc_offset()));
4139 #ifdef ASSERT
4140   __ movptr(Address(r15_thread, JavaThread::exception_handler_pc_offset()), (int)NULL_WORD);
4141   __ movptr(Address(r15_thread, JavaThread::exception_pc_offset()), (int)NULL_WORD);
4142 #endif
4143   // Clear the exception oop so GC no longer processes it as a root.
4144   __ movptr(Address(r15_thread, JavaThread::exception_oop_offset()), (int)NULL_WORD);
4145 
4146   // rax: exception oop
4147   // r8:  exception handler
4148   // rdx: exception pc
4149   // Jump to handler
4150 
4151   __ jmp(r8);
4152 
4153   // Make sure all code is generated
4154   masm->flush();
4155 
4156   // Set exception blob
4157   _exception_blob =  ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
4158 }
4159 #endif // COMPILER2
4160 
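// Generates the buffered inline type pack/unpack blob.  A summary of the
// three entry points created below (offsets recorded in the returned blob):
//  - pack_fields_jobject: resolves the pre-allocated buffer from the JNI
//    handle found at Address(r13, 0), then falls through to pack_fields.
//  - pack_fields: stores the field values currently held in the return
//    registers into the buffered instance whose oop is in rax.
//  - unpack_fields: loads the fields of the buffered instance in rax back
//    into the return registers.
// The register conventions (r13 for the handle, rax for the buffer oop) are
// read off the code below; the callers are outside this file.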
4161 BufferedInlineTypeBlob* SharedRuntime::generate_buffered_inline_type_adapter(const InlineKlass* vk) {
4162   BufferBlob* buf = BufferBlob::create("inline types pack/unpack", 16 * K);
4163   CodeBuffer buffer(buf);
4164   short buffer_locs[20];
4165   buffer.insts()->initialize_shared_locs((relocInfo*)buffer_locs,
4166                                          sizeof(buffer_locs)/sizeof(relocInfo));
4167 
4168   MacroAssembler* masm = new MacroAssembler(&buffer);
4169 
4170   const Array<SigEntry>* sig_vk = vk->extended_sig();
4171   const Array<VMRegPair>* regs = vk->return_regs();
4172 
4173   int pack_fields_jobject_off = __ offset();
4174   // Resolve pre-allocated buffer from JNI handle.
4175   // We cannot do this in generate_call_stub() because it requires GC code to be initialized.
4176   __ movptr(rax, Address(r13, 0));
4177   __ resolve_jobject(rax /* value */,
4178                      r15_thread /* thread */,
4179                      r12 /* tmp */);
4180   __ movptr(Address(r13, 0), rax);
4181 
4182   int pack_fields_off = __ offset();
4183 
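  // Walk the extended (flattened) signature and the return-register pairs in
  // lockstep: 'i' indexes sig_vk while 'j' indexes regs.  'j' starts at 1
  // because regs->at(0) is not used for a field.  T_INLINE_TYPE entries are
  // markers without a register, and a T_VOID entry is the dummy upper half
  // following a T_LONG or T_DOUBLE, which consumes an extra register slot
  // (hence the extra j++).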
4184   int j = 1;
4185   for (int i = 0; i < sig_vk->length(); i++) {
4186     BasicType bt = sig_vk->at(i)._bt;
4187     if (bt == T_INLINE_TYPE) {
4188       continue;
4189     }
4190     if (bt == T_VOID) {
4191       if (sig_vk->at(i-1)._bt == T_LONG ||
4192           sig_vk->at(i-1)._bt == T_DOUBLE) {
4193         j++;
4194       }
4195       continue;
4196     }
4197     int off = sig_vk->at(i)._offset;
4198     assert(off > 0, "offset in object should be positive");
4199     VMRegPair pair = regs->at(j);
4200     VMReg r_1 = pair.first();
4201     VMReg r_2 = pair.second();
4202     Address to(rax, off);
4203     if (bt == T_FLOAT) {
4204       __ movflt(to, r_1->as_XMMRegister());
4205     } else if (bt == T_DOUBLE) {
4206       __ movdbl(to, r_1->as_XMMRegister());
4207     } else {
4208       Register val = r_1->as_Register();
4209       assert_different_registers(to.base(), val, r14, r13, rbx, rscratch1);
4210       if (is_reference_type(bt)) {
4211         __ store_heap_oop(to, val, r14, r13, rbx, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED);
4212       } else {
4213         __ store_sized_value(to, r_1->as_Register(), type2aelembytes(bt));
4214       }
4215     }
4216     j++;
4217   }
4218   assert(j == regs->length(), "missed a field?");
4219 
4220   __ ret(0);
4221 
4222   int unpack_fields_off = __ offset();
4223 
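  // Mirror image of the pack loop above: the same i/j walk over the extended
  // signature, but loading each field of the buffered instance in rax back
  // into its return register.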
4224   j = 1;
4225   for (int i = 0; i < sig_vk->length(); i++) {
4226     BasicType bt = sig_vk->at(i)._bt;
4227     if (bt == T_INLINE_TYPE) {
4228       continue;
4229     }
4230     if (bt == T_VOID) {
4231       if (sig_vk->at(i-1)._bt == T_LONG ||
4232           sig_vk->at(i-1)._bt == T_DOUBLE) {
4233         j++;
4234       }
4235       continue;
4236     }
4237     int off = sig_vk->at(i)._offset;
4238     assert(off > 0, "offset in object should be positive");
4239     VMRegPair pair = regs->at(j);
4240     VMReg r_1 = pair.first();
4241     VMReg r_2 = pair.second();
4242     Address from(rax, off);
4243     if (bt == T_FLOAT) {
4244       __ movflt(r_1->as_XMMRegister(), from);
4245     } else if (bt == T_DOUBLE) {
4246       __ movdbl(r_1->as_XMMRegister(), from);
4247     } else if (bt == T_OBJECT || bt == T_ARRAY) {
4248       assert_different_registers(rax, r_1->as_Register());
4249       __ load_heap_oop(r_1->as_Register(), from);
4250     } else {
4251       assert(is_java_primitive(bt), "unexpected basic type");
4252       assert_different_registers(rax, r_1->as_Register());
4253       size_t size_in_bytes = type2aelembytes(bt);
4254       __ load_sized_value(r_1->as_Register(), from, size_in_bytes, bt != T_CHAR && bt != T_BOOLEAN);
4255     }
4256     j++;
4257   }
4258   assert(j == regs->length(), "missed a field?");
4259 
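  // Under StressInlineTypeReturnedAsFields, replace the buffer oop in rax
  // with its klass pointer with the low bit set, i.e. the tag that signals
  // "returned as fields" to the caller (a note inferred from the tagging
  // code below).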
4260   if (StressInlineTypeReturnedAsFields) {
4261     __ load_klass(rax, rax, rscratch1);
4262     __ orptr(rax, 1);
4263   }
4264 
4265   __ ret(0);
4266 
4267   __ flush();
4268 
4269   return BufferedInlineTypeBlob::create(&buffer, pack_fields_off, pack_fields_jobject_off, unpack_fields_off);
4270 }
4271 
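// Computes an order in which the incoming arguments can be moved to their
// outgoing locations without clobbering a source before it has been read;
// tmp_vmreg is available to ComputeMoveOrder for breaking move cycles.  The
// resulting schedule is appended to arg_order for the caller to replay.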
4272 void SharedRuntime::compute_move_order(const BasicType* in_sig_bt,
4273                                        int total_in_args, const VMRegPair* in_regs,
4274                                        int total_out_args, VMRegPair* out_regs,
4275                                        GrowableArray<int>& arg_order,
4276                                        VMRegPair tmp_vmreg) {
4277   ComputeMoveOrder order(total_in_args, in_regs,
4278                          total_out_args, out_regs,
4279                          in_sig_bt, arg_order, tmp_vmreg);
4280 }