1 /*
   2  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/collectedHeap.inline.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "memory/universe.hpp"
  36 #include "oops/accessDecorators.hpp"
  37 #include "oops/compressedOops.inline.hpp"
  38 #include "oops/klass.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "runtime/biasedLocking.hpp"
  41 #include "runtime/flags/flagSetting.hpp"
  42 #include "runtime/interfaceSupport.inline.hpp"
  43 #include "runtime/objectMonitor.hpp"
  44 #include "runtime/os.hpp"
  45 #include "runtime/safepoint.hpp"
  46 #include "runtime/safepointMechanism.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "runtime/thread.hpp"
  50 #include "utilities/macros.hpp"
  51 #include "crc32c.h"
  52 #ifdef COMPILER2
  53 #include "opto/intrinsicnode.hpp"
  54 #endif
  55 
  56 #include "runtime/continuation.hpp" // TODO LOOM remove after testing CONT_DOUBLE_NOP
  57 
  58 #ifdef PRODUCT
  59 #define BLOCK_COMMENT(str) /* nothing */
  60 #define STOP(error) stop(error)
  61 #else
  62 #define BLOCK_COMMENT(str) block_comment(str)
  63 #define STOP(error) block_comment(error); stop(error)
  64 #endif
  65 
  66 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  67 
  68 #ifdef ASSERT
  69 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  70 #endif
  71 
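// Table mapping each x86 condition code to its logical negation, indexed by the
// condition's 4-bit encoding (e.g. reverse[Assembler::zero] == Assembler::notZero).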
  72 static Assembler::Condition reverse[] = {
  73     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  74     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  75     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  76     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  77     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  78     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  79     Assembler::above          /* belowEqual    = 0x6 */ ,
  80     Assembler::belowEqual     /* above         = 0x7 */ ,
  81     Assembler::positive       /* negative      = 0x8 */ ,
  82     Assembler::negative       /* positive      = 0x9 */ ,
  83     Assembler::noParity       /* parity        = 0xa */ ,
  84     Assembler::parity         /* noParity      = 0xb */ ,
  85     Assembler::greaterEqual   /* less          = 0xc */ ,
  86     Assembler::less           /* greaterEqual  = 0xd */ ,
  87     Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf */
};
  91 
  92 
  93 // Implementation of MacroAssembler
  94 
// First, all the versions that have distinct implementations depending on 32/64 bit,
// unless the difference is trivial (one line or so).
  97 
  98 #ifndef _LP64
  99 
 100 // 32bit versions
 101 
 102 Address MacroAssembler::as_Address(AddressLiteral adr) {
 103   return Address(adr.target(), adr.rspec());
 104 }
 105 
 106 Address MacroAssembler::as_Address(ArrayAddress adr) {
 107   return Address::make_array(adr);
 108 }
 109 
 110 void MacroAssembler::call_VM_leaf_base(address entry_point,
 111                                        int number_of_arguments) {
 112   call(RuntimeAddress(entry_point));
 113   increment(rsp, number_of_arguments * wordSize);
 114 }
 115 
 116 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 117   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 118 }
 119 
 120 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 121   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 122 }
 123 
 124 void MacroAssembler::cmpoop_raw(Address src1, jobject obj) {
 125   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 126 }
 127 
 128 void MacroAssembler::cmpoop_raw(Register src1, jobject obj) {
 129   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 130 }
 131 
 132 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 133   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 134   bs->obj_equals(this, src1, obj);
 135 }
 136 
 137 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 138   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 139   bs->obj_equals(this, src1, obj);
 140 }
 141 
 142 void MacroAssembler::extend_sign(Register hi, Register lo) {
 143   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 144   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 145     cdql();
 146   } else {
 147     movl(hi, lo);
 148     sarl(hi, 31);
 149   }
 150 }
 151 
 152 void MacroAssembler::jC2(Register tmp, Label& L) {
 153   // set parity bit if FPU flag C2 is set (via rax)
 154   save_rax(tmp);
 155   fwait(); fnstsw_ax();
 156   sahf();
 157   restore_rax(tmp);
 158   // branch
 159   jcc(Assembler::parity, L);
 160 }
 161 
 162 void MacroAssembler::jnC2(Register tmp, Label& L) {
 163   // set parity bit if FPU flag C2 is set (via rax)
 164   save_rax(tmp);
 165   fwait(); fnstsw_ax();
 166   sahf();
 167   restore_rax(tmp);
 168   // branch
 169   jcc(Assembler::noParity, L);
 170 }
 171 
 172 // 32bit can do a case table jump in one instruction but we no longer allow the base
 173 // to be installed in the Address class
 174 void MacroAssembler::jump(ArrayAddress entry) {
 175   jmp(as_Address(entry));
 176 }
 177 
 178 // Note: y_lo will be destroyed
 179 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 180   // Long compare for Java (semantics as described in JVM spec.)
 181   Label high, low, done;
 182 
 183   cmpl(x_hi, y_hi);
 184   jcc(Assembler::less, low);
 185   jcc(Assembler::greater, high);
 186   // x_hi is the return register
 187   xorl(x_hi, x_hi);
 188   cmpl(x_lo, y_lo);
 189   jcc(Assembler::below, low);
 190   jcc(Assembler::equal, done);
 191 
 192   bind(high);
 193   xorl(x_hi, x_hi);
 194   increment(x_hi);
 195   jmp(done);
 196 
 197   bind(low);
 198   xorl(x_hi, x_hi);
 199   decrementl(x_hi);
 200 
 201   bind(done);
 202 }
 203 
 204 void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal32(dst, (int32_t)src.target(), src.rspec());
 206 }
 207 
 208 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 209   // leal(dst, as_Address(adr));
 210   // see note in movl as to why we must use a move
 211   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 212 }
 213 
 214 void MacroAssembler::leave() {
 215   mov(rsp, rbp);
 216   pop(rbp);
 217 }
 218 
 219 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 220   // Multiplication of two Java long values stored on the stack
 221   // as illustrated below. Result is in rdx:rax.
 222   //
 223   // rsp ---> [  ??  ] \               \
 224   //            ....    | y_rsp_offset  |
 225   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 226   //          [ y_hi ]                  | (in bytes)
 227   //            ....                    |
 228   //          [ x_lo ]                 /
 229   //          [ x_hi ]
 230   //            ....
 231   //
 232   // Basic idea: lo(result) = lo(x_lo * y_lo)
 233   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
 234   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 235   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 236   Label quick;
 237   // load x_hi, y_hi and check if quick
 238   // multiplication is possible
 239   movl(rbx, x_hi);
 240   movl(rcx, y_hi);
 241   movl(rax, rbx);
 242   orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
 243   jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
 244   // do full multiplication
 245   // 1st step
 246   mull(y_lo);                                    // x_hi * y_lo
 247   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
 248   // 2nd step
 249   movl(rax, x_lo);
 250   mull(rcx);                                     // x_lo * y_hi
 251   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
 252   // 3rd step
 253   bind(quick);                                   // note: rbx, = 0 if quick multiply!
 254   movl(rax, x_lo);
 255   mull(y_lo);                                    // x_lo * y_lo
 256   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 257 }
 258 
 259 void MacroAssembler::lneg(Register hi, Register lo) {
 260   negl(lo);
 261   adcl(hi, 0);
 262   negl(hi);
 263 }
 264 
 265 void MacroAssembler::lshl(Register hi, Register lo) {
 266   // Java shift left long support (semantics as described in JVM spec., p.305)
 267   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 268   // shift value is in rcx !
 269   assert(hi != rcx, "must not use rcx");
 270   assert(lo != rcx, "must not use rcx");
 271   const Register s = rcx;                        // shift count
 272   const int      n = BitsPerWord;
 273   Label L;
 274   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 275   cmpl(s, n);                                    // if (s < n)
 276   jcc(Assembler::less, L);                       // else (s >= n)
 277   movl(hi, lo);                                  // x := x << n
 278   xorl(lo, lo);
 279   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 280   bind(L);                                       // s (mod n) < n
 281   shldl(hi, lo);                                 // x := x << s
 282   shll(lo);
 283 }
 284 
 285 
 286 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 287   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 288   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
 289   assert(hi != rcx, "must not use rcx");
 290   assert(lo != rcx, "must not use rcx");
 291   const Register s = rcx;                        // shift count
 292   const int      n = BitsPerWord;
 293   Label L;
 294   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 295   cmpl(s, n);                                    // if (s < n)
 296   jcc(Assembler::less, L);                       // else (s >= n)
 297   movl(lo, hi);                                  // x := x >> n
 298   if (sign_extension) sarl(hi, 31);
 299   else                xorl(hi, hi);
 300   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 301   bind(L);                                       // s (mod n) < n
 302   shrdl(lo, hi);                                 // x := x >> s
 303   if (sign_extension) sarl(hi);
 304   else                shrl(hi);
 305 }
 306 
 307 void MacroAssembler::movoop(Register dst, jobject obj) {
 308   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 309 }
 310 
 311 void MacroAssembler::movoop(Address dst, jobject obj) {
 312   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 313 }
 314 
 315 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 316   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 317 }
 318 
 319 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 320   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 321 }
 322 
 323 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 324   // scratch register is not used,
 325   // it is defined to match parameters of 64-bit version of this method.
 326   if (src.is_lval()) {
 327     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 328   } else {
 329     movl(dst, as_Address(src));
 330   }
 331 }
 332 
 333 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 334   movl(as_Address(dst), src);
 335 }
 336 
 337 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 338   movl(dst, as_Address(src));
 339 }
 340 
 341 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 342 void MacroAssembler::movptr(Address dst, intptr_t src) {
 343   movl(dst, src);
 344 }
 345 
 346 
 347 void MacroAssembler::pop_callee_saved_registers() {
 348   pop(rcx);
 349   pop(rdx);
 350   pop(rdi);
 351   pop(rsi);
 352 }
 353 
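// Spill (push_fTOS) and reload (pop_fTOS) the x87 top-of-stack double using two
// stack words.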
 354 void MacroAssembler::pop_fTOS() {
 355   fld_d(Address(rsp, 0));
 356   addl(rsp, 2 * wordSize);
 357 }
 358 
 359 void MacroAssembler::push_callee_saved_registers() {
 360   push(rsi);
 361   push(rdi);
 362   push(rdx);
 363   push(rcx);
 364 }
 365 
 366 void MacroAssembler::push_fTOS() {
 367   subl(rsp, 2 * wordSize);
 368   fstp_d(Address(rsp, 0));
 369 }
 370 
 371 
 372 void MacroAssembler::pushoop(jobject obj) {
 373   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 374 }
 375 
 376 void MacroAssembler::pushklass(Metadata* obj) {
 377   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 378 }
 379 
 380 void MacroAssembler::pushptr(AddressLiteral src) {
 381   if (src.is_lval()) {
 382     push_literal32((int32_t)src.target(), src.rspec());
 383   } else {
 384     pushl(as_Address(src));
 385   }
 386 }
 387 
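// Materialize the current "not zero" condition as a full word: clear dst, then
// set its low byte if the zero flag is clear.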
 388 void MacroAssembler::set_word_if_not_zero(Register dst) {
 389   xorl(dst, dst);
 390   set_byte_if_not_zero(dst);
 391 }
 392 
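// On 32-bit the pass_argN helpers push their register onto the stack (C calling
// convention); the 64-bit variants further below move into the matching c_rarg
// register instead.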
 393 static void pass_arg0(MacroAssembler* masm, Register arg) {
 394   masm->push(arg);
 395 }
 396 
 397 static void pass_arg1(MacroAssembler* masm, Register arg) {
 398   masm->push(arg);
 399 }
 400 
 401 static void pass_arg2(MacroAssembler* masm, Register arg) {
 402   masm->push(arg);
 403 }
 404 
 405 static void pass_arg3(MacroAssembler* masm, Register arg) {
 406   masm->push(arg);
 407 }
 408 
 409 #ifndef PRODUCT
 410 extern "C" void findpc(intptr_t x);
 411 #endif
 412 
 413 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  // In order to get locks to work, we need to fake an in_VM state
 415   JavaThread* thread = JavaThread::current();
 416   JavaThreadState saved_state = thread->thread_state();
 417   thread->set_thread_state(_thread_in_vm);
 418   if (ShowMessageBoxOnError) {
 419     JavaThread* thread = JavaThread::current();
 420     JavaThreadState saved_state = thread->thread_state();
 421     thread->set_thread_state(_thread_in_vm);
 422     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 423       ttyLocker ttyl;
 424       BytecodeCounter::print();
 425     }
 426     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 427     // This is the value of eip which points to where verify_oop will return.
 428     if (os::message_box(msg, "Execution stopped, print registers?")) {
 429       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 430       BREAKPOINT;
 431     }
 432   } else {
 433     ttyLocker ttyl;
 434     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
 435   }
 436   // Don't assert holding the ttyLock
  assert(false, "DEBUG MESSAGE: %s", msg);
 438   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
 439 }
 440 
 441 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 442   ttyLocker ttyl;
 443   FlagSetting fs(Debugging, true);
 444   tty->print_cr("eip = 0x%08x", eip);
 445 #ifndef PRODUCT
 446   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 447     tty->cr();
 448     findpc(eip);
 449     tty->cr();
 450   }
 451 #endif
 452 #define PRINT_REG(rax) \
 453   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 454   PRINT_REG(rax);
 455   PRINT_REG(rbx);
 456   PRINT_REG(rcx);
 457   PRINT_REG(rdx);
 458   PRINT_REG(rdi);
 459   PRINT_REG(rsi);
 460   PRINT_REG(rbp);
 461   PRINT_REG(rsp);
 462 #undef PRINT_REG
  // Print some words near top of stack.
 464   int* dump_sp = (int*) rsp;
 465   for (int col1 = 0; col1 < 8; col1++) {
 466     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 467     os::print_location(tty, *dump_sp++);
 468   }
 469   for (int row = 0; row < 16; row++) {
 470     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 471     for (int col = 0; col < 8; col++) {
 472       tty->print(" 0x%08x", *dump_sp++);
 473     }
 474     tty->cr();
 475   }
 476   // Print some instructions around pc:
 477   Disassembler::decode((address)eip-64, (address)eip);
 478   tty->print_cr("--------");
 479   Disassembler::decode((address)eip, (address)eip+32);
 480 }
 481 
 482 void MacroAssembler::stop(const char* msg) {
 483   ExternalAddress message((address)msg);
 484   // push address of message
 485   pushptr(message.addr());
 486   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 487   pusha();                                            // push registers
 488   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 489   hlt();
 490 }
 491 
 492 void MacroAssembler::warn(const char* msg) {
 493   push_CPU_state();
 494 
 495   ExternalAddress message((address) msg);
 496   // push address of message
 497   pushptr(message.addr());
 498 
 499   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 500   addl(rsp, wordSize);       // discard argument
 501   pop_CPU_state();
 502 }
 503 
 504 void MacroAssembler::print_state() {
 505   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 506   pusha();                                            // push registers
 507 
 508   push_CPU_state();
 509   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 510   pop_CPU_state();
 511 
 512   popa();
 513   addl(rsp, wordSize);
 514 }
 515 
 516 #else // _LP64
 517 
 518 // 64 bit versions
 519 
 520 Address MacroAssembler::as_Address(AddressLiteral adr) {
 521   // amd64 always does this as a pc-rel
 522   // we can be absolute or disp based on the instruction type
 523   // jmp/call are displacements others are absolute
 524   assert(!adr.is_lval(), "must be rval");
 525   assert(reachable(adr), "must be");
 526   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 527 
 528 }
 529 
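// Materialize the array base into rscratch1 and return an Address that indexes
// off it.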
 530 Address MacroAssembler::as_Address(ArrayAddress adr) {
 531   AddressLiteral base = adr.base();
 532   lea(rscratch1, base);
 533   Address index = adr.index();
 534   assert(index._disp == 0, "must not have disp"); // maybe it can?
 535   Address array(rscratch1, index._index, index._scale, index._disp);
 536   return array;
 537 }
 538 
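// Call a leaf runtime routine, keeping rsp 16-byte aligned across the call as
// required by the ABI; arguments are assumed to already be in registers.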
 539 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 540   Label L, E;
 541 
 542 #ifdef _WIN64
  // Windows always allocates space for its register args
 544   assert(num_args <= 4, "only register arguments supported");
 545   subq(rsp,  frame::arg_reg_save_area_bytes);
 546 #endif
 547 
 548   // Align stack if necessary
 549   testl(rsp, 15);
 550   jcc(Assembler::zero, L);
 551 
 552   subq(rsp, 8);
 553   {
 554     call(RuntimeAddress(entry_point));
 555     oopmap_metadata(-1);
 556   }
 557   addq(rsp, 8);
 558   jmp(E);
 559 
 560   bind(L);
 561   {
 562     call(RuntimeAddress(entry_point));
 563     oopmap_metadata(-1);
 564   }
 565 
 566   bind(E);
 567 
 568 #ifdef _WIN64
 569   // restore stack pointer
 570   addq(rsp, frame::arg_reg_save_area_bytes);
 571 #endif
 572 
 573 }
 574 
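// 64-bit compare of a register against a value in memory at a possibly far
// address; falls back to rscratch1 when the literal is not RIP-reachable.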
 575 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 576   assert(!src2.is_lval(), "should use cmpptr");
 577 
 578   if (reachable(src2)) {
 579     cmpq(src1, as_Address(src2));
 580   } else {
 581     lea(rscratch1, src2);
 582     Assembler::cmpq(src1, Address(rscratch1, 0));
 583   }
 584 }
 585 
 586 int MacroAssembler::corrected_idivq(Register reg) {
 587   // Full implementation of Java ldiv and lrem; checks for special
 588   // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivq instruction - may be needed
 590   // for implicit exceptions.
 591   //
 592   //         normal case                           special case
 593   //
 594   // input : rax: dividend                         min_long
 595   //         reg: divisor   (may not be eax/edx)   -1
 596   //
 597   // output: rax: quotient  (= rax idiv reg)       min_long
 598   //         rdx: remainder (= rax irem reg)       0
 599   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 600   static const int64_t min_long = 0x8000000000000000;
 601   Label normal_case, special_case;
 602 
 603   // check for special case
 604   cmp64(rax, ExternalAddress((address) &min_long));
 605   jcc(Assembler::notEqual, normal_case);
 606   xorl(rdx, rdx); // prepare rdx for possible special case (where
 607                   // remainder = 0)
 608   cmpq(reg, -1);
 609   jcc(Assembler::equal, special_case);
 610 
 611   // handle normal case
 612   bind(normal_case);
 613   cdqq();
 614   int idivq_offset = offset();
 615   idivq(reg);
 616 
 617   // normal and special case exit
 618   bind(special_case);
 619 
 620   return idivq_offset;
 621 }
 622 
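// The incrementq/decrementq(..., int value) helpers add or subtract a small
// constant, preferring inc/dec when UseIncDec; min_jint is special-cased
// because negating it would overflow.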
 623 void MacroAssembler::decrementq(Register reg, int value) {
 624   if (value == min_jint) { subq(reg, value); return; }
 625   if (value <  0) { incrementq(reg, -value); return; }
 626   if (value == 0) {                        ; return; }
 627   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 628   /* else */      { subq(reg, value)       ; return; }
 629 }
 630 
 631 void MacroAssembler::decrementq(Address dst, int value) {
 632   if (value == min_jint) { subq(dst, value); return; }
 633   if (value <  0) { incrementq(dst, -value); return; }
 634   if (value == 0) {                        ; return; }
 635   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 636   /* else */      { subq(dst, value)       ; return; }
 637 }
 638 
 639 void MacroAssembler::incrementq(AddressLiteral dst) {
 640   if (reachable(dst)) {
 641     incrementq(as_Address(dst));
 642   } else {
 643     lea(rscratch1, dst);
 644     incrementq(Address(rscratch1, 0));
 645   }
 646 }
 647 
 648 void MacroAssembler::incrementq(Register reg, int value) {
 649   if (value == min_jint) { addq(reg, value); return; }
 650   if (value <  0) { decrementq(reg, -value); return; }
 651   if (value == 0) {                        ; return; }
 652   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 653   /* else */      { addq(reg, value)       ; return; }
 654 }
 655 
 656 void MacroAssembler::incrementq(Address dst, int value) {
 657   if (value == min_jint) { addq(dst, value); return; }
 658   if (value <  0) { decrementq(dst, -value); return; }
 659   if (value == 0) {                        ; return; }
 660   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 661   /* else */      { addq(dst, value)       ; return; }
 662 }
 663 
 664 // 32bit can do a case table jump in one instruction but we no longer allow the base
 665 // to be installed in the Address class
 666 void MacroAssembler::jump(ArrayAddress entry) {
 667   lea(rscratch1, entry.base());
 668   Address dispatch = entry.index();
 669   assert(dispatch._base == noreg, "must be");
 670   dispatch._base = rscratch1;
 671   jmp(dispatch);
 672 }
 673 
 674 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 675   ShouldNotReachHere(); // 64bit doesn't use two regs
 676   cmpq(x_lo, y_lo);
 677 }
 678 
 679 void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 681 }
 682 
 683 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 684   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 685   movptr(dst, rscratch1);
 686 }
 687 
 688 void MacroAssembler::leave() {
 689   // %%% is this really better? Why not on 32bit too?
 690   emit_int8((unsigned char)0xC9); // LEAVE
 691 }
 692 
 693 void MacroAssembler::lneg(Register hi, Register lo) {
 694   ShouldNotReachHere(); // 64bit doesn't use two regs
 695   negq(lo);
 696 }
 697 
 698 void MacroAssembler::movoop(Register dst, jobject obj) {
 699   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 700 }
 701 
 702 void MacroAssembler::movoop(Address dst, jobject obj) {
 703   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 704   movq(dst, rscratch1);
 705 }
 706 
 707 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 708   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 709 }
 710 
 711 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 712   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 713   movq(dst, rscratch1);
 714 }
 715 
 716 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 717   if (src.is_lval()) {
 718     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 719   } else {
 720     if (reachable(src)) {
 721       movq(dst, as_Address(src));
 722     } else {
 723       lea(scratch, src);
 724       movq(dst, Address(scratch, 0));
 725     }
 726   }
 727 }
 728 
 729 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 730   movq(as_Address(dst), src);
 731 }
 732 
 733 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 734   movq(dst, as_Address(src));
 735 }
 736 
 737 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 738 void MacroAssembler::movptr(Address dst, intptr_t src) {
 739   mov64(rscratch1, src);
 740   movq(dst, rscratch1);
 741 }
 742 
 743 // These are mostly for initializing NULL
 744 void MacroAssembler::movptr(Address dst, int32_t src) {
 745   movslq(dst, src);
 746 }
 747 
 748 void MacroAssembler::movptr(Register dst, int32_t src) {
 749   mov64(dst, (intptr_t)src);
 750 }
 751 
 752 void MacroAssembler::pushoop(jobject obj) {
 753   movoop(rscratch1, obj);
 754   push(rscratch1);
 755 }
 756 
 757 void MacroAssembler::pushklass(Metadata* obj) {
 758   mov_metadata(rscratch1, obj);
 759   push(rscratch1);
 760 }
 761 
 762 void MacroAssembler::pushptr(AddressLiteral src) {
 763   lea(rscratch1, src);
 764   if (src.is_lval()) {
 765     push(rscratch1);
 766   } else {
 767     pushq(Address(rscratch1, 0));
 768   }
 769 }
 770 
 771 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 772   mov64(rscratch1, NULL_WORD);
 773   // we must set sp to zero to clear frame
 774   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), rscratch1);
 775   // must clear fp, so that compiled frames are not confused; it is
 776   // possible that we need it only for debugging
 777   if (clear_fp) {
 778     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), rscratch1);
 779   }
 780 
 781   // Always clear the pc because it could have been set by make_walkable()
 782   movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), rscratch1);
 783   vzeroupper();
 784 }
 785 
 786 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 787                                          Register last_java_fp,
 788                                          address  last_java_pc) {
 789   vzeroupper();
 790   // determine last_java_sp register
 791   if (!last_java_sp->is_valid()) {
 792     last_java_sp = rsp;
 793   }
 794 
 795   // last_java_fp is optional
 796   if (last_java_fp->is_valid()) {
 797     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 798            last_java_fp);
 799   }
 800 
 801   // last_java_pc is optional
 802   if (last_java_pc != NULL) {
 803     Address java_pc(r15_thread,
 804                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 805     lea(rscratch1, InternalAddress(last_java_pc));
 806     movptr(java_pc, rscratch1);
 807   }
 808 
 809   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 810 }
 811 
 812 static void pass_arg0(MacroAssembler* masm, Register arg) {
 813   if (c_rarg0 != arg ) {
 814     masm->mov(c_rarg0, arg);
 815   }
 816 }
 817 
 818 static void pass_arg1(MacroAssembler* masm, Register arg) {
 819   if (c_rarg1 != arg ) {
 820     masm->mov(c_rarg1, arg);
 821   }
 822 }
 823 
 824 static void pass_arg2(MacroAssembler* masm, Register arg) {
 825   if (c_rarg2 != arg ) {
 826     masm->mov(c_rarg2, arg);
 827   }
 828 }
 829 
 830 static void pass_arg3(MacroAssembler* masm, Register arg) {
 831   if (c_rarg3 != arg ) {
 832     masm->mov(c_rarg3, arg);
 833   }
 834 }
 835 
 836 void MacroAssembler::stop(const char* msg) {
 837   address rip = pc();
 838   pusha(); // get regs on stack
 839   lea(c_rarg0, ExternalAddress((address) msg));
 840   lea(c_rarg1, InternalAddress(rip));
 841   movq(c_rarg2, rsp); // pass pointer to regs array
 842   andq(rsp, -16); // align stack as required by ABI
 843   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 844   hlt();
 845 }
 846 
 847 void MacroAssembler::warn(const char* msg) {
 848   push(rbp);
 849   movq(rbp, rsp);
 850   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 851   push_CPU_state();   // keeps alignment at 16 bytes
 852   lea(c_rarg0, ExternalAddress((address) msg));
 853   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 854   call(rax);
 855   pop_CPU_state();
 856   mov(rsp, rbp);
 857   pop(rbp);
 858 }
 859 
 860 void MacroAssembler::print_state() {
 861   address rip = pc();
 862   pusha();            // get regs on stack
 863   push(rbp);
 864   movq(rbp, rsp);
 865   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 866   push_CPU_state();   // keeps alignment at 16 bytes
 867 
 868   lea(c_rarg0, InternalAddress(rip));
 869   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 870   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 871 
 872   pop_CPU_state();
 873   mov(rsp, rbp);
 874   pop(rbp);
 875   popa();
 876 }
 877 
 878 #ifndef PRODUCT
 879 extern "C" void findpc(intptr_t x);
 880 #endif
 881 
 882 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
 884   if (ShowMessageBoxOnError) {
 885     JavaThread* thread = JavaThread::current();
 886     JavaThreadState saved_state = thread->thread_state();
 887     thread->set_thread_state(_thread_in_vm);
 888 #ifndef PRODUCT
 889     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 890       ttyLocker ttyl;
 891       BytecodeCounter::print();
 892     }
 893 #endif
 894     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 895     // XXX correct this offset for amd64
 896     // This is the value of eip which points to where verify_oop will return.
 897     if (os::message_box(msg, "Execution stopped, print registers?")) {
 898       print_state64(pc, regs);
 899       BREAKPOINT;
 900       assert(false, "start up GDB");
 901     }
 902     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
 903   } else {
 904     ttyLocker ttyl;
 905     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
 906                     msg);
 907     assert(false, "DEBUG MESSAGE: %s", msg);
 908   }
 909 }
 910 
 911 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 912   ttyLocker ttyl;
 913   FlagSetting fs(Debugging, true);
 914   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 915 #ifndef PRODUCT
 916   tty->cr();
 917   findpc(pc);
 918   tty->cr();
 919 #endif
 920 #define PRINT_REG(rax, value) \
 921   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 922   PRINT_REG(rax, regs[15]);
 923   PRINT_REG(rbx, regs[12]);
 924   PRINT_REG(rcx, regs[14]);
 925   PRINT_REG(rdx, regs[13]);
 926   PRINT_REG(rdi, regs[8]);
 927   PRINT_REG(rsi, regs[9]);
 928   PRINT_REG(rbp, regs[10]);
 929   PRINT_REG(rsp, regs[11]);
 930   PRINT_REG(r8 , regs[7]);
 931   PRINT_REG(r9 , regs[6]);
 932   PRINT_REG(r10, regs[5]);
 933   PRINT_REG(r11, regs[4]);
 934   PRINT_REG(r12, regs[3]);
 935   PRINT_REG(r13, regs[2]);
 936   PRINT_REG(r14, regs[1]);
 937   PRINT_REG(r15, regs[0]);
 938 #undef PRINT_REG
  // Print some words near top of stack.
 940   int64_t* rsp = (int64_t*) regs[11];
 941   int64_t* dump_sp = rsp;
 942   for (int col1 = 0; col1 < 8; col1++) {
 943     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 944     os::print_location(tty, *dump_sp++);
 945   }
 946   for (int row = 0; row < 25; row++) {
 947     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 948     for (int col = 0; col < 4; col++) {
 949       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 950     }
 951     tty->cr();
 952   }
 953   // Print some instructions around pc:
 954   Disassembler::decode((address)pc-64, (address)pc);
 955   tty->print_cr("--------");
 956   Disassembler::decode((address)pc, (address)pc+32);
 957 }
 958 
 959 #endif // _LP64
 960 
 961 // Now versions that are common to 32/64 bit
 962 
 963 void MacroAssembler::oopmap_metadata(int index) {
 964   // if (index != -1) tty->print_cr("oopmap_metadata %d", index);
 965   // mov64(r10, 1234); // TODO: Add a new relocInfo with external semantics. see relocInfo::metadata_type
 966 }
 967 
 968 void MacroAssembler::addptr(Register dst, int32_t imm32) {
 969   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
 970 }
 971 
 972 void MacroAssembler::addptr(Register dst, Register src) {
 973   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 974 }
 975 
 976 void MacroAssembler::addptr(Address dst, Register src) {
 977   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 978 }
 979 
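// The AddressLiteral arithmetic helpers below use the operand directly when it
// is RIP-reachable and otherwise materialize the address in a scratch register
// first.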
 980 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
 981   if (reachable(src)) {
 982     Assembler::addsd(dst, as_Address(src));
 983   } else {
 984     lea(rscratch1, src);
 985     Assembler::addsd(dst, Address(rscratch1, 0));
 986   }
 987 }
 988 
 989 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
 990   if (reachable(src)) {
 991     addss(dst, as_Address(src));
 992   } else {
 993     lea(rscratch1, src);
 994     addss(dst, Address(rscratch1, 0));
 995   }
 996 }
 997 
 998 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
 999   if (reachable(src)) {
1000     Assembler::addpd(dst, as_Address(src));
1001   } else {
1002     lea(rscratch1, src);
1003     Assembler::addpd(dst, Address(rscratch1, 0));
1004   }
1005 }
1006 
1007 void MacroAssembler::align(int modulus) {
1008   align(modulus, offset());
1009 }
1010 
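// Pad with nops until the given code offset is a multiple of modulus.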
1011 void MacroAssembler::align(int modulus, int target) {
1012   if (target % modulus != 0) {
1013     nop(modulus - (target % modulus));
1014   }
1015 }
1016 
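// Spill and reload a float/double XMM register via the stack.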
1017 void MacroAssembler::push_f(XMMRegister r) {
1018   subptr(rsp, wordSize);
1019   movflt(Address(rsp, 0), r);
1020 }
1021 
1022 void MacroAssembler::pop_f(XMMRegister r) {
1023   movflt(r, Address(rsp, 0));
1024   addptr(rsp, wordSize);
1025 }
1026 
1027 void MacroAssembler::push_d(XMMRegister r) {
1028   subptr(rsp, 2 * wordSize);
1029   movdbl(Address(rsp, 0), r);
1030 }
1031 
1032 void MacroAssembler::pop_d(XMMRegister r) {
1033   movdbl(r, Address(rsp, 0));
1034   addptr(rsp, 2 * Interpreter::stackElementSize);
1035 }
1036 
1037 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1038   // Used in sign-masking with aligned address.
1039   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1040   if (reachable(src)) {
1041     Assembler::andpd(dst, as_Address(src));
1042   } else {
1043     lea(scratch_reg, src);
1044     Assembler::andpd(dst, Address(scratch_reg, 0));
1045   }
1046 }
1047 
1048 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1049   // Used in sign-masking with aligned address.
1050   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1051   if (reachable(src)) {
1052     Assembler::andps(dst, as_Address(src));
1053   } else {
1054     lea(scratch_reg, src);
1055     Assembler::andps(dst, Address(scratch_reg, 0));
1056   }
1057 }
1058 
1059 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1060   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1061 }
1062 
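// Atomically increment a counter in memory using a locked increment.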
1063 void MacroAssembler::atomic_incl(Address counter_addr) {
1064   lock();
1065   incrementl(counter_addr);
1066 }
1067 
1068 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1069   if (reachable(counter_addr)) {
1070     atomic_incl(as_Address(counter_addr));
1071   } else {
1072     lea(scr, counter_addr);
1073     atomic_incl(Address(scr, 0));
1074   }
1075 }
1076 
1077 #ifdef _LP64
1078 void MacroAssembler::atomic_incq(Address counter_addr) {
1079   lock();
1080   incrementq(counter_addr);
1081 }
1082 
1083 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1084   if (reachable(counter_addr)) {
1085     atomic_incq(as_Address(counter_addr));
1086   } else {
1087     lea(scr, counter_addr);
1088     atomic_incq(Address(scr, 0));
1089   }
1090 }
1091 #endif
1092 
// Writes to successive stack pages until the given offset is reached, in order
// to check for stack overflow + shadow pages.  This clobbers tmp.
1095 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1096   movptr(tmp, rsp);
1097   // Bang stack for total size given plus shadow page size.
1098   // Bang one page at a time because large size can bang beyond yellow and
1099   // red zones.
1100   Label loop;
1101   bind(loop);
1102   movl(Address(tmp, (-os::vm_page_size())), size );
1103   subptr(tmp, os::vm_page_size());
1104   subl(size, os::vm_page_size());
1105   jcc(Assembler::greater, loop);
1106 
1107   // Bang down shadow pages too.
1108   // At this point, (tmp-0) is the last address touched, so don't
1109   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1110   // was post-decremented.)  Skip this address by starting at i=1, and
1111   // touch a few more pages below.  N.B.  It is important to touch all
1112   // the way down including all pages in the shadow zone.
1113   for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
    // This could be any sized move, but since it can serve as a debugging crumb
    // the bigger the better.
1116     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1117   }
1118 }
1119 
1120 void MacroAssembler::reserved_stack_check() {
1121     // testing if reserved zone needs to be enabled
1122     Label no_reserved_zone_enabling;
1123     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1124     NOT_LP64(get_thread(rsi);)
1125 
1126     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1127     jcc(Assembler::below, no_reserved_zone_enabling);
1128 
1129     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1130     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1131     should_not_reach_here();
1132 
1133     bind(no_reserved_zone_enabling);
1134 }
1135 
1136 int MacroAssembler::biased_locking_enter(Register lock_reg,
1137                                          Register obj_reg,
1138                                          Register swap_reg,
1139                                          Register tmp_reg,
1140                                          bool swap_reg_contains_mark,
1141                                          Label& done,
1142                                          Label* slow_case,
1143                                          BiasedLockingCounters* counters) {
1144   assert(UseBiasedLocking, "why call this otherwise?");
1145   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1146   assert(tmp_reg != noreg, "tmp_reg must be supplied");
1147   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1148   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
1149   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
1150   NOT_LP64( Address saved_mark_addr(lock_reg, 0); )
1151 
1152   if (PrintBiasedLockingStatistics && counters == NULL) {
1153     counters = BiasedLocking::counters();
1154   }
1155   // Biased locking
1156   // See whether the lock is currently biased toward our thread and
1157   // whether the epoch is still valid
1158   // Note that the runtime guarantees sufficient alignment of JavaThread
1159   // pointers to allow age to be placed into low bits
1160   // First check to see whether biasing is even enabled for this object
1161   Label cas_label;
1162   int null_check_offset = -1;
1163   if (!swap_reg_contains_mark) {
1164     null_check_offset = offset();
1165     movptr(swap_reg, mark_addr);
1166   }
1167   movptr(tmp_reg, swap_reg);
1168   andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
1169   cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
1170   jcc(Assembler::notEqual, cas_label);
1171   // The bias pattern is present in the object's header. Need to check
1172   // whether the bias owner and the epoch are both still current.
1173 #ifndef _LP64
1174   // Note that because there is no current thread register on x86_32 we
1175   // need to store off the mark word we read out of the object to
1176   // avoid reloading it and needing to recheck invariants below. This
1177   // store is unfortunate but it makes the overall code shorter and
1178   // simpler.
1179   movptr(saved_mark_addr, swap_reg);
1180 #endif
1181   if (swap_reg_contains_mark) {
1182     null_check_offset = offset();
1183   }
1184   load_prototype_header(tmp_reg, obj_reg);
1185 #ifdef _LP64
1186   orptr(tmp_reg, r15_thread);
1187   xorptr(tmp_reg, swap_reg);
1188   Register header_reg = tmp_reg;
1189 #else
1190   xorptr(tmp_reg, swap_reg);
1191   get_thread(swap_reg);
1192   xorptr(swap_reg, tmp_reg);
1193   Register header_reg = swap_reg;
1194 #endif
1195   andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
1196   if (counters != NULL) {
1197     cond_inc32(Assembler::zero,
1198                ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1199   }
1200   jcc(Assembler::equal, done);
1201 
1202   Label try_revoke_bias;
1203   Label try_rebias;
1204 
1205   // At this point we know that the header has the bias pattern and
1206   // that we are not the bias owner in the current epoch. We need to
1207   // figure out more details about the state of the header in order to
1208   // know what operations can be legally performed on the object's
1209   // header.
1210 
1211   // If the low three bits in the xor result aren't clear, that means
1212   // the prototype header is no longer biased and we have to revoke
1213   // the bias on this object.
1214   testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
1215   jccb(Assembler::notZero, try_revoke_bias);
1216 
1217   // Biasing is still enabled for this data type. See whether the
1218   // epoch of the current bias is still valid, meaning that the epoch
1219   // bits of the mark word are equal to the epoch bits of the
1220   // prototype header. (Note that the prototype header's epoch bits
1221   // only change at a safepoint.) If not, attempt to rebias the object
1222   // toward the current thread. Note that we must be absolutely sure
1223   // that the current epoch is invalid in order to do this because
1224   // otherwise the manipulations it performs on the mark word are
1225   // illegal.
1226   testptr(header_reg, markOopDesc::epoch_mask_in_place);
1227   jccb(Assembler::notZero, try_rebias);
1228 
1229   // The epoch of the current bias is still valid but we know nothing
1230   // about the owner; it might be set or it might be clear. Try to
1231   // acquire the bias of the object using an atomic operation. If this
1232   // fails we will go in to the runtime to revoke the object's bias.
1233   // Note that we first construct the presumed unbiased header so we
1234   // don't accidentally blow away another thread's valid bias.
1235   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1236   andptr(swap_reg,
1237          markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
1238 #ifdef _LP64
1239   movptr(tmp_reg, swap_reg);
1240   orptr(tmp_reg, r15_thread);
1241 #else
1242   get_thread(tmp_reg);
1243   orptr(tmp_reg, swap_reg);
1244 #endif
1245   lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare rax (swap_reg) with the mark word; install tmp_reg if equal
1247   // If the biasing toward our thread failed, this means that
1248   // another thread succeeded in biasing it toward itself and we
1249   // need to revoke that bias. The revocation will occur in the
1250   // interpreter runtime in the slow case.
1251   if (counters != NULL) {
1252     cond_inc32(Assembler::zero,
1253                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1254   }
1255   if (slow_case != NULL) {
1256     jcc(Assembler::notZero, *slow_case);
1257   }
1258   jmp(done);
1259 
1260   bind(try_rebias);
1261   // At this point we know the epoch has expired, meaning that the
1262   // current "bias owner", if any, is actually invalid. Under these
1263   // circumstances _only_, we are allowed to use the current header's
1264   // value as the comparison value when doing the cas to acquire the
1265   // bias in the current epoch. In other words, we allow transfer of
1266   // the bias from one thread to another directly in this situation.
1267   //
1268   // FIXME: due to a lack of registers we currently blow away the age
1269   // bits in this situation. Should attempt to preserve them.
1270   load_prototype_header(tmp_reg, obj_reg);
1271 #ifdef _LP64
1272   orptr(tmp_reg, r15_thread);
1273 #else
1274   get_thread(swap_reg);
1275   orptr(tmp_reg, swap_reg);
1276   movptr(swap_reg, saved_mark_addr);
1277 #endif
1278   lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare rax (swap_reg) with the mark word; install tmp_reg if equal
1280   // If the biasing toward our thread failed, then another thread
1281   // succeeded in biasing it toward itself and we need to revoke that
1282   // bias. The revocation will occur in the runtime in the slow case.
1283   if (counters != NULL) {
1284     cond_inc32(Assembler::zero,
1285                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1286   }
1287   if (slow_case != NULL) {
1288     jcc(Assembler::notZero, *slow_case);
1289   }
1290   jmp(done);
1291 
1292   bind(try_revoke_bias);
1293   // The prototype mark in the klass doesn't have the bias bit set any
1294   // more, indicating that objects of this data type are not supposed
1295   // to be biased any more. We are going to try to reset the mark of
1296   // this object to the prototype value and fall through to the
1297   // CAS-based locking scheme. Note that if our CAS fails, it means
1298   // that another thread raced us for the privilege of revoking the
1299   // bias of this particular object, so it's okay to continue in the
1300   // normal locking code.
1301   //
1302   // FIXME: due to a lack of registers we currently blow away the age
1303   // bits in this situation. Should attempt to preserve them.
1304   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1305   load_prototype_header(tmp_reg, obj_reg);
1306   lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare rax (swap_reg) with the mark word; install tmp_reg if equal
1308   // Fall through to the normal CAS-based lock, because no matter what
1309   // the result of the above CAS, some thread must have succeeded in
1310   // removing the bias bit from the object's header.
1311   if (counters != NULL) {
1312     cond_inc32(Assembler::zero,
1313                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1314   }
1315 
1316   bind(cas_label);
1317 
1318   return null_check_offset;
1319 }
1320 
1321 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1322   assert(UseBiasedLocking, "why call this otherwise?");
1323 
1324   // Check for biased locking unlock case, which is a no-op
1325   // Note: we do not have to check the thread ID for two reasons.
1326   // First, the interpreter checks for IllegalMonitorStateException at
1327   // a higher level. Second, if the bias was revoked while we held the
1328   // lock, the object could not be rebiased toward another thread, so
1329   // the bias bit would be clear.
1330   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1331   andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
1332   cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
1333   jcc(Assembler::equal, done);
1334 }
1335 
1336 #ifdef COMPILER2
1337 
1338 #if INCLUDE_RTM_OPT
1339 
1340 // Update rtm_counters based on abort status
1341 // input: abort_status
1342 //        rtm_counters (RTMLockingCounters*)
1343 // flags are killed
1344 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1345 
1346   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1347   if (PrintPreciseRTMLockingStatistics) {
1348     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1349       Label check_abort;
1350       testl(abort_status, (1<<i));
1351       jccb(Assembler::equal, check_abort);
1352       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1353       bind(check_abort);
1354     }
1355   }
1356 }
1357 
1358 // Branch if (random & (count-1) != 0), count is 2^n
1359 // tmp, scr and flags are killed
1360 void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
1361   assert(tmp == rax, "");
1362   assert(scr == rdx, "");
1363   rdtsc(); // modifies EDX:EAX
1364   andptr(tmp, count-1);
1365   jccb(Assembler::notZero, brLabel);
1366 }
1367 
1368 // Perform abort ratio calculation, set no_rtm bit if high ratio
1369 // input:  rtm_counters_Reg (RTMLockingCounters* address)
1370 // tmpReg, rtm_counters_Reg and flags are killed
1371 void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
1372                                                  Register rtm_counters_Reg,
1373                                                  RTMLockingCounters* rtm_counters,
1374                                                  Metadata* method_data) {
1375   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
1376 
1377   if (RTMLockingCalculationDelay > 0) {
1378     // Delay calculation
1379     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
1380     testptr(tmpReg, tmpReg);
1381     jccb(Assembler::equal, L_done);
1382   }
1383   // Abort ratio calculation only if abort_count > RTMAbortThreshold
1384   //   Aborted transactions = abort_count * 100
1385   //   All transactions = total_count *  RTMTotalCountIncrRate
1386   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
1387 
1388   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
1389   cmpptr(tmpReg, RTMAbortThreshold);
1390   jccb(Assembler::below, L_check_always_rtm2);
1391   imulptr(tmpReg, tmpReg, 100);
1392 
1393   Register scrReg = rtm_counters_Reg;
1394   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1395   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
1396   imulptr(scrReg, scrReg, RTMAbortRatio);
1397   cmpptr(tmpReg, scrReg);
1398   jccb(Assembler::below, L_check_always_rtm1);
1399   if (method_data != NULL) {
1400     // set rtm_state to "no rtm" in MDO
1401     mov_metadata(tmpReg, method_data);
1402     lock();
1403     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
1404   }
1405   jmpb(L_done);
1406   bind(L_check_always_rtm1);
1407   // Reload RTMLockingCounters* address
1408   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1409   bind(L_check_always_rtm2);
1410   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1411   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
1412   jccb(Assembler::below, L_done);
1413   if (method_data != NULL) {
1414     // set rtm_state to "always rtm" in MDO
1415     mov_metadata(tmpReg, method_data);
1416     lock();
1417     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
1418   }
1419   bind(L_done);
1420 }
1421 
1422 // Update counters and perform abort ratio calculation
1423 // input:  abort_status_Reg
1424 // rtm_counters_Reg, flags are killed
1425 void MacroAssembler::rtm_profiling(Register abort_status_Reg,
1426                                    Register rtm_counters_Reg,
1427                                    RTMLockingCounters* rtm_counters,
1428                                    Metadata* method_data,
1429                                    bool profile_rtm) {
1430 
1431   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1432   // update rtm counters based on rax value at abort
1433   // reads abort_status_Reg, updates flags
1434   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1435   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
1436   if (profile_rtm) {
1437     // Save abort status because abort_status_Reg is used by following code.
1438     if (RTMRetryCount > 0) {
1439       push(abort_status_Reg);
1440     }
1441     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1442     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
1443     // restore abort status
1444     if (RTMRetryCount > 0) {
1445       pop(abort_status_Reg);
1446     }
1447   }
1448 }
1449 
1450 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
1451 // inputs: retry_count_Reg
1452 //       : abort_status_Reg
1453 // output: retry_count_Reg decremented by 1
1454 // flags are killed
1455 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
1456   Label doneRetry;
1457   assert(abort_status_Reg == rax, "");
1458   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
1459   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
1460   // if reason is in 0x6 and retry count != 0 then retry
1461   andptr(abort_status_Reg, 0x6);
1462   jccb(Assembler::zero, doneRetry);
1463   testl(retry_count_Reg, retry_count_Reg);
1464   jccb(Assembler::zero, doneRetry);
1465   pause();
1466   decrementl(retry_count_Reg);
1467   jmp(retryLabel);
1468   bind(doneRetry);
1469 }
1470 
1471 // Spin and retry if lock is busy,
1472 // inputs: box_Reg (monitor address)
1473 //       : retry_count_Reg
1474 // output: retry_count_Reg decremented by 1
1475 //       : clear z flag if retry count exceeded
1476 // tmp_Reg, scr_Reg, flags are killed
1477 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
1478                                             Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
1479   Label SpinLoop, SpinExit, doneRetry;
1480   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1481 
1482   testl(retry_count_Reg, retry_count_Reg);
1483   jccb(Assembler::zero, doneRetry);
1484   decrementl(retry_count_Reg);
1485   movptr(scr_Reg, RTMSpinLoopCount);
1486 
1487   bind(SpinLoop);
1488   pause();
1489   decrementl(scr_Reg);
1490   jccb(Assembler::lessEqual, SpinExit);
1491   movptr(tmp_Reg, Address(box_Reg, owner_offset));
1492   testptr(tmp_Reg, tmp_Reg);
1493   jccb(Assembler::notZero, SpinLoop);
1494 
1495   bind(SpinExit);
1496   jmp(retryLabel);
1497   bind(doneRetry);
1498   incrementl(retry_count_Reg); // clear z flag
1499 }
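
// In pseudocode, the spin-then-retry above is roughly (a sketch):
//
//   if (retry_count != 0) {
//     retry_count--;
//     for (int i = RTMSpinLoopCount; i > 0; i--) {
//       pause();
//       if (monitor->_owner == NULL) break;   // lock looks free
//     }
//     goto retryLabel;                        // retry the transaction
//   } else {
//     retry_count++;   // 0 -> 1 clears ZF, telling the caller the retry budget is spent
//   }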
1500 
1501 // Use RTM for normal stack locks
1502 // Input: objReg (object to lock)
1503 void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
1504                                        Register retry_on_abort_count_Reg,
1505                                        RTMLockingCounters* stack_rtm_counters,
1506                                        Metadata* method_data, bool profile_rtm,
1507                                        Label& DONE_LABEL, Label& IsInflated) {
1508   assert(UseRTMForStackLocks, "why call this otherwise?");
1509   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1510   assert(tmpReg == rax, "");
1511   assert(scrReg == rdx, "");
1512   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1513 
1514   if (RTMRetryCount > 0) {
1515     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1516     bind(L_rtm_retry);
1517   }
1518   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1519   testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
1520   jcc(Assembler::notZero, IsInflated);
1521 
1522   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1523     Label L_noincrement;
1524     if (RTMTotalCountIncrRate > 1) {
1525       // tmpReg, scrReg and flags are killed
1526       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1527     }
1528     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
1529     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
1530     bind(L_noincrement);
1531   }
1532   xbegin(L_on_abort);
1533   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
1534   andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1535   cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1536   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
1537 
1538   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1539   if (UseRTMXendForLockBusy) {
1540     xend();
1541     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
1542     jmp(L_decrement_retry);
1543   }
1544   else {
1545     xabort(0);
1546   }
1547   bind(L_on_abort);
1548   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1549     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
1550   }
1551   bind(L_decrement_retry);
1552   if (RTMRetryCount > 0) {
1553     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1554     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1555   }
1556 }
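
// The stack-lock fast path above is, in outline (a sketch; the inflation
// check, profiling and retry plumbing are omitted):
//
//   xbegin();                                          // start transaction
//   if ((obj->mark() & lock_bits) == unlocked_value) {
//     goto DONE_LABEL;                                 // lock elided, stay transactional
//   }
//   xabort(0);        // or xend() + a "retryable" status if UseRTMXendForLockBusy
//   // on abort: profile if requested, then retry up to RTMRetryCount times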
1557 
1558 // Use RTM for inflated locks
1559 // inputs: objReg (object to lock)
1560 //         boxReg (on-stack box address (displaced header location) - KILLED)
1561 //         tmpReg (ObjectMonitor address + markOopDesc::monitor_value)
1562 void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1563                                           Register scrReg, Register retry_on_busy_count_Reg,
1564                                           Register retry_on_abort_count_Reg,
1565                                           RTMLockingCounters* rtm_counters,
1566                                           Metadata* method_data, bool profile_rtm,
1567                                           Label& DONE_LABEL) {
1568   assert(UseRTMLocking, "why call this otherwise?");
1569   assert(tmpReg == rax, "");
1570   assert(scrReg == rdx, "");
1571   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1572   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1573 
1574   // Without cast to int32_t a movptr will destroy r10 which is typically obj
1575   movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1576   movptr(boxReg, tmpReg); // Save ObjectMonitor address
1577 
1578   if (RTMRetryCount > 0) {
1579     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1580     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1581     bind(L_rtm_retry);
1582   }
1583   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1584     Label L_noincrement;
1585     if (RTMTotalCountIncrRate > 1) {
1586       // tmpReg, scrReg and flags are killed
1587       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1588     }
1589     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1590     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1591     bind(L_noincrement);
1592   }
1593   xbegin(L_on_abort);
1594   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1595   movptr(tmpReg, Address(tmpReg, owner_offset));
1596   testptr(tmpReg, tmpReg);
1597   jcc(Assembler::zero, DONE_LABEL);
1598   if (UseRTMXendForLockBusy) {
1599     xend();
1600     jmp(L_decrement_retry);
1601   }
1602   else {
1603     xabort(0);
1604   }
1605   bind(L_on_abort);
1606   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1607   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1608     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1609   }
1610   if (RTMRetryCount > 0) {
1611     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1612     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1613   }
1614 
1615   movptr(tmpReg, Address(boxReg, owner_offset)) ;
1616   testptr(tmpReg, tmpReg) ;
1617   jccb(Assembler::notZero, L_decrement_retry) ;
1618 
1619   // Appears unlocked - try to swing _owner from null to non-null.
1620   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1621 #ifdef _LP64
1622   Register threadReg = r15_thread;
1623 #else
1624   get_thread(scrReg);
1625   Register threadReg = scrReg;
1626 #endif
1627   lock();
1628   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1629 
1630   if (RTMRetryCount > 0) {
1631     // if the CAS succeeded we are done, otherwise retry
1632     jccb(Assembler::equal, DONE_LABEL) ;
1633     bind(L_decrement_retry);
1634     // Spin and retry if lock is busy.
1635     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1636   }
1637   else {
1638     bind(L_decrement_retry);
1639   }
1640 }
1641 
1642 #endif //  INCLUDE_RTM_OPT
1643 
1644 // Fast_Lock and Fast_Unlock used by C2
1645 
1646 // Because the transitions from emitted code to the runtime
1647 // monitorenter/exit helper stubs are so slow it's critical that
1648 // we inline both the stack-locking fast-path and the inflated fast path.
1649 //
1650 // See also: cmpFastLock and cmpFastUnlock.
1651 //
1652 // What follows is a specialized inline transliteration of the code
1653 // in slow_enter() and slow_exit().  If we're concerned about I$ bloat,
1654 // another option would be to emit TrySlowEnter and TrySlowExit methods
1655 // at startup-time.  These methods would accept arguments as
1656 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1657 // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
1658 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1659 // In practice, however, the # of lock sites is bounded and is usually small.
1660 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1661 // if the processor uses simple bimodal branch predictors keyed by EIP,
1662 // since the helper routines would be called from multiple synchronization
1663 // sites.
1664 //
1665 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
1666 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1667 // to those specialized methods.  That'd give us a mostly platform-independent
1668 // implementation that the JITs could optimize and inline at their pleasure.
1669 // Done correctly, the only time we'd need to cross to native code would be
1670 // to park() or unpark() threads.  We'd also need a few more unsafe operators
1671 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1672 // (b) explicit barriers or fence operations.
1673 //
1674 // TODO:
1675 //
1676 // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1677 //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1678 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1679 //    the lock operators would typically be faster than reifying Self.
1680 //
1681 // *  Ideally I'd define the primitives as:
1682 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1683 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1684 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1685 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
1686 //    Furthermore the register assignments are overconstrained, possibly resulting in
1687 //    sub-optimal code near the synchronization site.
1688 //
1689 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1690 //    Alternately, use a better sp-proximity test.
1691 //
1692 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1693 //    Either one is sufficient to uniquely identify a thread.
1694 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1695 //
1696 // *  Intrinsify notify() and notifyAll() for the common cases where the
1697 //    object is locked by the calling thread but the waitlist is empty.
1698 //    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
1699 //
1700 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
1701 //    But beware of excessive branch density on AMD Opterons.
1702 //
1703 // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1704 //    or failure of the fast-path.  If the fast-path fails then we pass
1705 //    control to the slow-path, typically in C.  In Fast_Lock and
1706 //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1707 //    will emit a conditional branch immediately after the node.
1708 //    So we have branches to branches and lots of ICC.ZF games.
1709 //    Instead, it might be better to have C2 pass a "FailureLabel"
1710 //    into Fast_Lock and Fast_Unlock.  In the case of success, control
1711 //    will drop through the node.  ICC.ZF is undefined at exit.
1712 //    In the case of failure, the node will branch directly to the
1713 //    FailureLabel.
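
// As a sketch, the contract between these routines and the C2 node is the
// following (call_monitorenter_slow_path() is only a placeholder name for
// whatever slow-path stub C2 actually wires up):
//
//   fast_lock(obj, box, ...);          // emitted inline by the cmpFastLock node
//   if (ZF) {                          // ZF == 1 -> fast path acquired the lock
//     ...                              // fall through
//   } else {                           // ZF == 0 -> fast path failed
//     call_monitorenter_slow_path();   // C2 emits this branch right after the node
//   }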
1714 
1715 
1716 // obj: object to lock
1717 // box: on-stack box address (displaced header location) - KILLED
1718 // rax: tmp -- KILLED
1719 // scr: tmp -- KILLED
1720 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1721                                Register scrReg, Register cx1Reg, Register cx2Reg,
1722                                BiasedLockingCounters* counters,
1723                                RTMLockingCounters* rtm_counters,
1724                                RTMLockingCounters* stack_rtm_counters,
1725                                Metadata* method_data,
1726                                bool use_rtm, bool profile_rtm) {
1727   // Ensure the register assignments are disjoint
1728   assert(tmpReg == rax, "");
1729 
1730   if (use_rtm) {
1731     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1732   } else {
1733     assert(cx1Reg == noreg, "");
1734     assert(cx2Reg == noreg, "");
1735     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1736   }
1737 
1738   if (counters != NULL) {
1739     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1740   }
1741 
1742   // Possible cases that we'll encounter in fast_lock
1743   // ------------------------------------------------
1744   // * Inflated
1745   //    -- unlocked
1746   //    -- Locked
1747   //       = by self
1748   //       = by other
1749   // * biased
1750   //    -- by Self
1751   //    -- by other
1752   // * neutral
1753   // * stack-locked
1754   //    -- by self
1755   //       = sp-proximity test hits
1756   //       = sp-proximity test generates false-negative
1757   //    -- by other
1758   //
1759 
1760   Label IsInflated, DONE_LABEL;
1761 
1762   // it's stack-locked, biased or neutral
1763   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1764   // order to reduce the number of conditional branches in the most common cases.
1765   // Beware -- there's a subtle invariant that fetch of the markword
1766   // at [FETCH], below, will never observe a biased encoding (*101b).
1767   // If this invariant is not held we risk exclusion (safety) failure.
1768   if (UseBiasedLocking && !UseOptoBiasInlining) {
1769     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1770   }
1771 
1772 #if INCLUDE_RTM_OPT
1773   if (UseRTMForStackLocks && use_rtm) {
1774     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1775                       stack_rtm_counters, method_data, profile_rtm,
1776                       DONE_LABEL, IsInflated);
1777   }
1778 #endif // INCLUDE_RTM_OPT
1779 
1780   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
1781   testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1782   jccb(Assembler::notZero, IsInflated);
1783 
1784   // Attempt stack-locking ...
1785   orptr (tmpReg, markOopDesc::unlocked_value);
1786   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
1787   lock();
1788   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
1789   if (counters != NULL) {
1790     cond_inc32(Assembler::equal,
1791                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1792   }
1793   jcc(Assembler::equal, DONE_LABEL);           // Success
1794 
1795   // Recursive locking.
1796   // The object is stack-locked: markword contains stack pointer to BasicLock.
1797   // Locked by current thread if difference with current SP is less than one page.
1798   subptr(tmpReg, rsp);
1799   // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
1800   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1801   movptr(Address(boxReg, 0), tmpReg);
1802   if (counters != NULL) {
1803     cond_inc32(Assembler::equal,
1804                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1805   }
1806   jmp(DONE_LABEL);
1807 
1808   bind(IsInflated);
1809   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markOopDesc::monitor_value
1810 
1811 #if INCLUDE_RTM_OPT
1812   // Use the same RTM locking code in 32- and 64-bit VM.
1813   if (use_rtm) {
1814     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1815                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
1816   } else {
1817 #endif // INCLUDE_RTM_OPT
1818 
1819 #ifndef _LP64
1820   // The object is inflated.
1821 
1822   // boxReg refers to the on-stack BasicLock in the current frame.
1823   // We'd like to write:
1824   //   set box->_displaced_header = markOopDesc::unused_mark().  Any non-0 value suffices.
1825   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
1826   // additional latency as we have another ST in the store buffer that must drain.
1827 
1828   // avoid ST-before-CAS
1829   // register juggle because we need tmpReg for cmpxchgptr below
1830   movptr(scrReg, boxReg);
1831   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1832 
1833   // Optimistic form: consider XORL tmpReg,tmpReg
1834   movptr(tmpReg, NULL_WORD);
1835 
1836   // Appears unlocked - try to swing _owner from null to non-null.
1837   // Ideally, I'd manifest "Self" with get_thread and then attempt
1838   // to CAS the register containing Self into m->Owner.
1839   // But we don't have enough registers, so instead we can either try to CAS
1840   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1841   // we later store "Self" into m->Owner.  Transiently storing a stack address
1842   // (rsp or the address of the box) into  m->owner is harmless.
1843   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1844   lock();
1845   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1846   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1847   // If we weren't able to swing _owner from NULL to the BasicLock
1848   // then take the slow path.
1849   jccb  (Assembler::notZero, DONE_LABEL);
1850   // update _owner from BasicLock to thread
1851   get_thread (scrReg);                    // beware: clobbers ICCs
1852   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1853   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1854 
1855   // If the CAS fails we can either retry or pass control to the slow-path.
1856   // We use the latter tactic.
1857   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1858   // If the CAS was successful ...
1859   //   Self has acquired the lock
1860   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1861   // Intentional fall-through into DONE_LABEL ...
1862 #else // _LP64
1863   // It's inflated
1864   movq(scrReg, tmpReg);
1865   xorq(tmpReg, tmpReg);
1866 
1867   lock();
1868   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1869   // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
1870   // Without cast to int32_t movptr will destroy r10 which is typically obj.
1871   movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1872   // Intentional fall-through into DONE_LABEL ...
1873   // Propagate ICC.ZF from CAS above into DONE_LABEL.
1874 #endif // _LP64
1875 #if INCLUDE_RTM_OPT
1876   } // use_rtm()
1877 #endif
1878   // DONE_LABEL is a hot target - we'd really like to place it at the
1879   // start of cache line by padding with NOPs.
1880   // See the AMD and Intel software optimization manuals for the
1881   // most efficient "long" NOP encodings.
1882   // Unfortunately none of our alignment mechanisms suffice.
1883   bind(DONE_LABEL);
1884 
1885   // At DONE_LABEL the icc ZFlag is set as follows ...
1886   // Fast_Unlock uses the same protocol.
1887   // ZFlag == 1 -> Success
1888   // ZFlag == 0 -> Failure - force control through the slow-path
1889 }
1890 
1891 // obj: object to unlock
1892 // box: box address (displaced header location), killed.  Must be EAX.
1893 // tmp: killed, cannot be obj nor box.
1894 //
1895 // Some commentary on balanced locking:
1896 //
1897 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1898 // Methods that don't have provably balanced locking are forced to run in the
1899 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1900 // The interpreter provides two properties:
1901 // I1:  At return-time the interpreter automatically and quietly unlocks any
1902 //      objects acquired by the current activation (frame).  Recall that the
1903 //      interpreter maintains an on-stack list of locks currently held by
1904 //      a frame.
1905 // I2:  If a method attempts to unlock an object that is not held by
1906 //      the frame, the interpreter throws IMSX.
1907 //
1908 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1909 // B() doesn't have provably balanced locking so it runs in the interpreter.
1910 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1911 // is still locked by A().
1912 //
1913 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1914 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1915 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1916 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1917 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
1918 // could reasonably *avoid* checking owner in Fast_Unlock().
1919 // In the interest of performance we elide m->Owner==Self check in unlock.
1920 // A perfectly viable alternative is to elide the owner check except when
1921 // Xcheck:jni is enabled.
1922 
1923 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1924   assert(boxReg == rax, "");
1925   assert_different_registers(objReg, boxReg, tmpReg);
1926 
1927   Label DONE_LABEL, Stacked, CheckSucc;
1928 
1929   // Critically, the biased locking test must have precedence over
1930   // and appear before the (box->dhw == 0) recursive stack-lock test.
1931   if (UseBiasedLocking && !UseOptoBiasInlining) {
1932     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1933   }
1934 
1935 #if INCLUDE_RTM_OPT
1936   if (UseRTMForStackLocks && use_rtm) {
1937     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1938     Label L_regular_unlock;
1939     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));           // fetch markword
1940     andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1941     cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1942     jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
1943     xend();                                       // otherwise end...
1944     jmp(DONE_LABEL);                              // ... and we're done
1945     bind(L_regular_unlock);
1946   }
1947 #endif
1948 
1949   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
1950   jcc   (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
1951   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));             // Examine the object's markword
1952   testptr(tmpReg, markOopDesc::monitor_value);    // Inflated?
1953   jccb  (Assembler::zero, Stacked);
1954 
1955   // It's inflated.
1956 #if INCLUDE_RTM_OPT
1957   if (use_rtm) {
1958     Label L_regular_inflated_unlock;
1959     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1960     movptr(boxReg, Address(tmpReg, owner_offset));
1961     testptr(boxReg, boxReg);
1962     jccb(Assembler::notZero, L_regular_inflated_unlock);
1963     xend();
1964     jmpb(DONE_LABEL);
1965     bind(L_regular_inflated_unlock);
1966   }
1967 #endif
1968 
1969   // Despite our balanced locking property we still check that m->_owner == Self
1970   // as java routines or native JNI code called by this thread might
1971   // have released the lock.
1972   // Refer to the comments in synchronizer.cpp for how we might encode extra
1973   // state in _succ so we can avoid fetching EntryList|cxq.
1974   //
1975   // I'd like to add more cases in fast_lock() and fast_unlock() --
1976   // such as recursive enter and exit -- but we have to be wary of
1977   // I$ bloat, T$ effects and BP$ effects.
1978   //
1979   // If there's no contention try a 1-0 exit.  That is, exit without
1980   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
1981   // we detect and recover from the race that the 1-0 exit admits.
1982   //
1983   // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
1984   // before it STs null into _owner, releasing the lock.  Updates
1985   // to data protected by the critical section must be visible before
1986   // we drop the lock (and thus before any other thread could acquire
1987   // the lock and observe the fields protected by the lock).
1988   // IA32's memory model is TSO, so STs are ordered with respect to
1989   // each other and there's no need for an explicit barrier (fence).
1990   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
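  //
  // A sketch of the 1-0 exit and its recovery path as implemented for 64-bit
  // below (recursion and contention checks omitted; see synchronizer.cpp):
  //
  //   m->_owner = NULL;                    // release the lock (plain store)
  //   fence();                             // ST _owner vs LD _succ -- the Dekker pivot
  //   if (m->_succ != NULL) return;        // a successor exists and will take over
  //   if (CAS(&m->_owner, NULL, Self) != NULL) return;  // another thread got the lock
  //   goto slow_path;                      // we re-acquired; must handle succession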
1991 #ifndef _LP64
1992   get_thread (boxReg);
1993 
1994   // Note that we could employ various encoding schemes to reduce
1995   // the number of loads below (currently 4) to just 2 or 3.
1996   // Refer to the comments in synchronizer.cpp.
1997   // In practice the chain of fetches doesn't seem to impact performance, however.
1998   xorptr(boxReg, boxReg);
1999   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2000   jccb  (Assembler::notZero, DONE_LABEL);
2001   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2002   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2003   jccb  (Assembler::notZero, CheckSucc);
2004   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2005   jmpb  (DONE_LABEL);
2006 
2007   bind (Stacked);
2008   // It's not inflated and it's not recursively stack-locked and it's not biased.
2009   // It must be stack-locked.
2010   // Try to reset the header to displaced header.
2011   // The "box" value on the stack is stable, so we can reload
2012   // and be assured we observe the same value as above.
2013   movptr(tmpReg, Address(boxReg, 0));
2014   lock();
2015   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2016   // Intentional fall-through into DONE_LABEL
2017 
2018   // DONE_LABEL is a hot target - we'd really like to place it at the
2019   // start of cache line by padding with NOPs.
2020   // See the AMD and Intel software optimization manuals for the
2021   // most efficient "long" NOP encodings.
2022   // Unfortunately none of our alignment mechanisms suffice.
2023   bind (CheckSucc);
2024 #else // _LP64
2025   // It's inflated
2026   xorptr(boxReg, boxReg);
2027   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2028   jccb  (Assembler::notZero, DONE_LABEL);
2029   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2030   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2031   jccb  (Assembler::notZero, CheckSucc);
2032   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2033   jmpb  (DONE_LABEL);
2034 
2035   // Try to avoid passing control into the slow_path ...
2036   Label LSuccess, LGoSlowPath ;
2037   bind  (CheckSucc);
2038 
2039   // The following optional optimization can be elided if necessary
2040   // Effectively: if (succ == null) goto SlowPath
2041   // The code reduces the window for a race, however,
2042   // and thus benefits performance.
2043   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2044   jccb  (Assembler::zero, LGoSlowPath);
2045 
2046   xorptr(boxReg, boxReg);
2047   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2048 
2049   // Memory barrier/fence
2050   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2051   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2052   // This is faster on Nehalem and AMD Shanghai/Barcelona.
2053   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2054   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2055   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2056   lock(); addl(Address(rsp, 0), 0);
2057 
2058   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2059   jccb  (Assembler::notZero, LSuccess);
2060 
2061   // Rare inopportune interleaving - race.
2062   // The successor vanished in the small window above.
2063   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2064   // We need to ensure progress and succession.
2065   // Try to reacquire the lock.
2066   // If that fails then the new owner is responsible for succession and this
2067   // thread needs to take no further action and can exit via the fast path (success).
2068   // If the re-acquire succeeds then pass control into the slow path.
2069   // As implemented, this latter mode is horrible because we generate more
2070   // coherence traffic on the lock *and* artificially extend the critical section
2071   // length by virtue of passing control into the slow path.
2072 
2073   // box is really RAX -- the following CMPXCHG depends on that binding
2074   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2075   lock();
2076   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2077   // There's no successor so we tried to regrab the lock.
2078   // If that didn't work, then another thread grabbed the
2079   // lock so we're done (and exit was a success).
2080   jccb  (Assembler::notEqual, LSuccess);
2081   // Intentional fall-through into slow-path
2082 
2083   bind  (LGoSlowPath);
2084   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2085   jmpb  (DONE_LABEL);
2086 
2087   bind  (LSuccess);
2088   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2089   jmpb  (DONE_LABEL);
2090 
2091   bind  (Stacked);
2092   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2093   lock();
2094   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2095 
2096 #endif
2097   bind(DONE_LABEL);
2098 }
2099 #endif // COMPILER2
2100 
2101 void MacroAssembler::c2bool(Register x) {
2102   // implements x == 0 ? 0 : 1
2103   // note: must only look at the least-significant byte of x
2104   //       since C-style booleans are stored in one byte
2105   //       only! (was a bug)
2106   andl(x, 0xFF);
2107   setb(Assembler::notZero, x);
2108 }
2109 
2110 // This wouldn't be needed if the AddressLiteral version had a different name.
2111 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2112   Assembler::call(L, rtype);
2113 }
2114 
2115 void MacroAssembler::call(Register entry) {
2116   Assembler::call(entry);
2117 }
2118 
2119 void MacroAssembler::call(AddressLiteral entry) {
2120   if (reachable(entry)) {
2121     Assembler::call_literal(entry.target(), entry.rspec());
2122   } else {
2123     lea(rscratch1, entry);
2124     Assembler::call(rscratch1);
2125   }
2126 }
2127 
2128 void MacroAssembler::ic_call(address entry, jint method_index) {
2129   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
2130   movptr(rax, (intptr_t)Universe::non_oop_word());
2131   call(AddressLiteral(entry, rh));
2132 }
2133 
2134 // Implementation of call_VM versions
2135 
2136 void MacroAssembler::call_VM(Register oop_result,
2137                              address entry_point,
2138                              bool check_exceptions) {
2139   Label C, E;
2140   call(C, relocInfo::none);
2141   jmp(E);
2142 
2143   bind(C);
2144   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2145   ret(0);
2146 
2147   bind(E);
2148 }
2149 
2150 void MacroAssembler::call_VM(Register oop_result,
2151                              address entry_point,
2152                              Register arg_1,
2153                              bool check_exceptions) {
2154   Label C, E;
2155   call(C, relocInfo::none);
2156   jmp(E);
2157 
2158   bind(C);
2159   pass_arg1(this, arg_1);
2160   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2161   ret(0);
2162 
2163   bind(E);
2164 }
2165 
2166 void MacroAssembler::call_VM(Register oop_result,
2167                              address entry_point,
2168                              Register arg_1,
2169                              Register arg_2,
2170                              bool check_exceptions) {
2171   Label C, E;
2172   call(C, relocInfo::none);
2173   jmp(E);
2174 
2175   bind(C);
2176 
2177   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2178 
2179   pass_arg2(this, arg_2);
2180   pass_arg1(this, arg_1);
2181   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2182   ret(0);
2183 
2184   bind(E);
2185 }
2186 
2187 void MacroAssembler::call_VM(Register oop_result,
2188                              address entry_point,
2189                              Register arg_1,
2190                              Register arg_2,
2191                              Register arg_3,
2192                              bool check_exceptions) {
2193   Label C, E;
2194   call(C, relocInfo::none);
2195   jmp(E);
2196 
2197   bind(C);
2198 
2199   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2200   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2201   pass_arg3(this, arg_3);
2202 
2203   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2204   pass_arg2(this, arg_2);
2205 
2206   pass_arg1(this, arg_1);
2207   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2208   ret(0);
2209 
2210   bind(E);
2211 }
2212 
2213 void MacroAssembler::call_VM(Register oop_result,
2214                              Register last_java_sp,
2215                              address entry_point,
2216                              int number_of_arguments,
2217                              bool check_exceptions) {
2218   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2219   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2220 }
2221 
2222 void MacroAssembler::call_VM(Register oop_result,
2223                              Register last_java_sp,
2224                              address entry_point,
2225                              Register arg_1,
2226                              bool check_exceptions) {
2227   pass_arg1(this, arg_1);
2228   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2229 }
2230 
2231 void MacroAssembler::call_VM(Register oop_result,
2232                              Register last_java_sp,
2233                              address entry_point,
2234                              Register arg_1,
2235                              Register arg_2,
2236                              bool check_exceptions) {
2237 
2238   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2239   pass_arg2(this, arg_2);
2240   pass_arg1(this, arg_1);
2241   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2242 }
2243 
2244 void MacroAssembler::call_VM(Register oop_result,
2245                              Register last_java_sp,
2246                              address entry_point,
2247                              Register arg_1,
2248                              Register arg_2,
2249                              Register arg_3,
2250                              bool check_exceptions) {
2251   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2252   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2253   pass_arg3(this, arg_3);
2254   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2255   pass_arg2(this, arg_2);
2256   pass_arg1(this, arg_1);
2257   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2258 }
2259 
2260 void MacroAssembler::super_call_VM(Register oop_result,
2261                                    Register last_java_sp,
2262                                    address entry_point,
2263                                    int number_of_arguments,
2264                                    bool check_exceptions) {
2265   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2266   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2267 }
2268 
2269 void MacroAssembler::super_call_VM(Register oop_result,
2270                                    Register last_java_sp,
2271                                    address entry_point,
2272                                    Register arg_1,
2273                                    bool check_exceptions) {
2274   pass_arg1(this, arg_1);
2275   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2276 }
2277 
2278 void MacroAssembler::super_call_VM(Register oop_result,
2279                                    Register last_java_sp,
2280                                    address entry_point,
2281                                    Register arg_1,
2282                                    Register arg_2,
2283                                    bool check_exceptions) {
2284 
2285   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2286   pass_arg2(this, arg_2);
2287   pass_arg1(this, arg_1);
2288   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2289 }
2290 
2291 void MacroAssembler::super_call_VM(Register oop_result,
2292                                    Register last_java_sp,
2293                                    address entry_point,
2294                                    Register arg_1,
2295                                    Register arg_2,
2296                                    Register arg_3,
2297                                    bool check_exceptions) {
2298   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2299   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2300   pass_arg3(this, arg_3);
2301   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2302   pass_arg2(this, arg_2);
2303   pass_arg1(this, arg_1);
2304   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2305 }
2306 
2307 void MacroAssembler::call_VM_base(Register oop_result,
2308                                   Register java_thread,
2309                                   Register last_java_sp,
2310                                   address  entry_point,
2311                                   int      number_of_arguments,
2312                                   bool     check_exceptions) {
2313   // determine java_thread register
2314   if (!java_thread->is_valid()) {
2315 #ifdef _LP64
2316     java_thread = r15_thread;
2317 #else
2318     java_thread = rdi;
2319     get_thread(java_thread);
2320 #endif // LP64
2321   }
2322   // determine last_java_sp register
2323   if (!last_java_sp->is_valid()) {
2324     last_java_sp = rsp;
2325   }
2326   // debugging support
2327   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
2328   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2329 #ifdef ASSERT
2330   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2331   // r12 is the heapbase.
2332   LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2333 #endif // ASSERT
2334 
2335   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
2336   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2337 
2338   // push java thread (becomes first argument of C function)
2339 
2340   NOT_LP64(push(java_thread); number_of_arguments++);
2341   LP64_ONLY(mov(c_rarg0, r15_thread));
2342 
2343   // set last Java frame before call
2344   assert(last_java_sp != rbp, "can't use ebp/rbp");
2345 
2346   // Only interpreter should have to set fp
2347   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2348 
2349   // do the call, remove parameters
2350   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2351 
2352   // restore the thread (cannot use the pushed argument since arguments
2353   // may be overwritten by C code generated by an optimizing compiler);
2354   // however we can use the register value directly if it is callee saved.
2355   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2356     // rdi & rsi (also r15) are callee saved -> nothing to do
2357 #ifdef ASSERT
2358     guarantee(java_thread != rax, "change this code");
2359     push(rax);
2360     { Label L;
2361       get_thread(rax);
2362       cmpptr(java_thread, rax);
2363       jcc(Assembler::equal, L);
2364       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2365       bind(L);
2366     }
2367     pop(rax);
2368 #endif
2369   } else {
2370     get_thread(java_thread);
2371   }
2372   // reset last Java frame
2373   // Only interpreter should have to clear fp
2374   reset_last_Java_frame(java_thread, true);
2375 
2376    // C++ interp handles this in the interpreter
2377   check_and_handle_popframe(java_thread);
2378   check_and_handle_earlyret(java_thread);
2379 
2380   if (check_exceptions) {
2381     // check for pending exceptions (java_thread is set upon return)
2382     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2383 #ifndef _LP64
2384     jump_cc(Assembler::notEqual,
2385             RuntimeAddress(StubRoutines::forward_exception_entry()));
2386 #else
2387     // This used to conditionally jump to forward_exception; however, if the
2388     // code is relocated the branch might not reach. So we jump around an
2389     // unconditional jump that can always reach.
2390 
2391     Label ok;
2392     jcc(Assembler::equal, ok);
2393     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2394     bind(ok);
2395 #endif // LP64
2396   }
2397 
2398   // get oop result if there is one and reset the value in the thread
2399   if (oop_result->is_valid()) {
2400     get_vm_result(oop_result, java_thread);
2401   }
2402 }
2403 
2404 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2405 
2406   // Calculate the value for last_Java_sp.
2407   // This is somewhat subtle: call_VM does an intermediate call
2408   // which places a return address on the stack just under the
2409   // stack pointer as the user finished with it. This allows
2410   // us to retrieve last_Java_pc from last_Java_sp[-1].
2411   // On 32-bit we then have to push additional args on the stack to accomplish
2412   // the actual requested call. On 64-bit call_VM can only use register args,
2413   // so the only extra space is the return address that call_VM created.
2414   // This hopefully explains the calculations here.
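  //
  // Stack layout at this point in the 64-bit case (a sketch):
  //
  //   rsp -> [ return address pushed by the call in call_VM ]
  //
  // so last_Java_sp = rsp + wordSize and last_Java_pc = last_Java_sp[-1].
  // On 32-bit, number_of_arguments argument words have also been pushed,
  // hence the (1 + number_of_arguments) * wordSize adjustment below.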
2415 
2416 #ifdef _LP64
2417   // We've pushed one address, correct last_Java_sp
2418   lea(rax, Address(rsp, wordSize));
2419 #else
2420   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2421 #endif // LP64
2422 
2423   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2424 
2425 }
2426 
2427 // Use this method when the MacroAssembler version of call_VM_leaf_base() should be called from the interpreter.
2428 void MacroAssembler::call_VM_leaf0(address entry_point) {
2429   MacroAssembler::call_VM_leaf_base(entry_point, 0);
2430 }
2431 
2432 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2433   call_VM_leaf_base(entry_point, number_of_arguments);
2434 }
2435 
2436 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2437   pass_arg0(this, arg_0);
2438   call_VM_leaf(entry_point, 1);
2439 }
2440 
2441 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2442 
2443   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2444   pass_arg1(this, arg_1);
2445   pass_arg0(this, arg_0);
2446   call_VM_leaf(entry_point, 2);
2447 }
2448 
2449 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2450   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2451   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2452   pass_arg2(this, arg_2);
2453   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2454   pass_arg1(this, arg_1);
2455   pass_arg0(this, arg_0);
2456   call_VM_leaf(entry_point, 3);
2457 }
2458 
2459 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2460   pass_arg0(this, arg_0);
2461   MacroAssembler::call_VM_leaf_base(entry_point, 1);
2462 }
2463 
2464 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2465 
2466   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2467   pass_arg1(this, arg_1);
2468   pass_arg0(this, arg_0);
2469   MacroAssembler::call_VM_leaf_base(entry_point, 2);
2470 }
2471 
2472 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2473   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2474   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2475   pass_arg2(this, arg_2);
2476   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2477   pass_arg1(this, arg_1);
2478   pass_arg0(this, arg_0);
2479   MacroAssembler::call_VM_leaf_base(entry_point, 3);
2480 }
2481 
2482 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2483   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2484   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2485   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2486   pass_arg3(this, arg_3);
2487   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2488   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2489   pass_arg2(this, arg_2);
2490   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2491   pass_arg1(this, arg_1);
2492   pass_arg0(this, arg_0);
2493   MacroAssembler::call_VM_leaf_base(entry_point, 4);
2494 }
2495 
2496 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2497   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2498   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2499   verify_oop(oop_result, "broken oop in call_VM_base");
2500 }
2501 
2502 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2503   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2504   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2505 }
2506 
2507 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2508 }
2509 
2510 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2511 }
2512 
2513 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2514   if (reachable(src1)) {
2515     cmpl(as_Address(src1), imm);
2516   } else {
2517     lea(rscratch1, src1);
2518     cmpl(Address(rscratch1, 0), imm);
2519   }
2520 }
2521 
2522 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2523   assert(!src2.is_lval(), "use cmpptr");
2524   if (reachable(src2)) {
2525     cmpl(src1, as_Address(src2));
2526   } else {
2527     lea(rscratch1, src2);
2528     cmpl(src1, Address(rscratch1, 0));
2529   }
2530 }
2531 
2532 void MacroAssembler::cmp32(Register src1, int32_t imm) {
2533   Assembler::cmpl(src1, imm);
2534 }
2535 
2536 void MacroAssembler::cmp32(Register src1, Address src2) {
2537   Assembler::cmpl(src1, src2);
2538 }
2539 
2540 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2541   ucomisd(opr1, opr2);
2542 
2543   Label L;
2544   if (unordered_is_less) {
2545     movl(dst, -1);
2546     jcc(Assembler::parity, L);
2547     jcc(Assembler::below , L);
2548     movl(dst, 0);
2549     jcc(Assembler::equal , L);
2550     increment(dst);
2551   } else { // unordered is greater
2552     movl(dst, 1);
2553     jcc(Assembler::parity, L);
2554     jcc(Assembler::above , L);
2555     movl(dst, 0);
2556     jcc(Assembler::equal , L);
2557     decrementl(dst);
2558   }
2559   bind(L);
2560 }
2561 
2562 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2563   ucomiss(opr1, opr2);
2564 
2565   Label L;
2566   if (unordered_is_less) {
2567     movl(dst, -1);
2568     jcc(Assembler::parity, L);
2569     jcc(Assembler::below , L);
2570     movl(dst, 0);
2571     jcc(Assembler::equal , L);
2572     increment(dst);
2573   } else { // unordered is greater
2574     movl(dst, 1);
2575     jcc(Assembler::parity, L);
2576     jcc(Assembler::above , L);
2577     movl(dst, 0);
2578     jcc(Assembler::equal , L);
2579     decrementl(dst);
2580   }
2581   bind(L);
2582 }
2583 
2584 
2585 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2586   if (reachable(src1)) {
2587     cmpb(as_Address(src1), imm);
2588   } else {
2589     lea(rscratch1, src1);
2590     cmpb(Address(rscratch1, 0), imm);
2591   }
2592 }
2593 
2594 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
2595 #ifdef _LP64
2596   if (src2.is_lval()) {
2597     movptr(rscratch1, src2);
2598     Assembler::cmpq(src1, rscratch1);
2599   } else if (reachable(src2)) {
2600     cmpq(src1, as_Address(src2));
2601   } else {
2602     lea(rscratch1, src2);
2603     Assembler::cmpq(src1, Address(rscratch1, 0));
2604   }
2605 #else
2606   if (src2.is_lval()) {
2607     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2608   } else {
2609     cmpl(src1, as_Address(src2));
2610   }
2611 #endif // _LP64
2612 }
2613 
2614 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2615   assert(src2.is_lval(), "not a mem-mem compare");
2616 #ifdef _LP64
2617   // moves src2's literal address
2618   movptr(rscratch1, src2);
2619   Assembler::cmpq(src1, rscratch1);
2620 #else
2621   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2622 #endif // _LP64
2623 }
2624 
2625 void MacroAssembler::cmpoop(Register src1, Register src2) {
2626   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2627   bs->obj_equals(this, src1, src2);
2628 }
2629 
2630 void MacroAssembler::cmpoop(Register src1, Address src2) {
2631   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2632   bs->obj_equals(this, src1, src2);
2633 }
2634 
2635 #ifdef _LP64
2636 void MacroAssembler::cmpoop(Register src1, jobject src2) {
2637   movoop(rscratch1, src2);
2638   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2639   bs->obj_equals(this, src1, rscratch1);
2640 }
2641 #endif
2642 
2643 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2644   if (reachable(adr)) {
2645     lock();
2646     cmpxchgptr(reg, as_Address(adr));
2647   } else {
2648     lea(rscratch1, adr);
2649     lock();
2650     cmpxchgptr(reg, Address(rscratch1, 0));
2651   }
2652 }
2653 
2654 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2655   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2656 }
2657 
2658 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2659   if (reachable(src)) {
2660     Assembler::comisd(dst, as_Address(src));
2661   } else {
2662     lea(rscratch1, src);
2663     Assembler::comisd(dst, Address(rscratch1, 0));
2664   }
2665 }
2666 
2667 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2668   if (reachable(src)) {
2669     Assembler::comiss(dst, as_Address(src));
2670   } else {
2671     lea(rscratch1, src);
2672     Assembler::comiss(dst, Address(rscratch1, 0));
2673   }
2674 }
2675 
2676 
2677 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2678   Condition negated_cond = negate_condition(cond);
2679   Label L;
2680   jcc(negated_cond, L);
2681   pushf(); // Preserve flags
2682   atomic_incl(counter_addr);
2683   popf();
2684   bind(L);
2685 }
2686 
2687 int MacroAssembler::corrected_idivl(Register reg) {
2688   // Full implementation of Java idiv and irem; checks for
2689   // special case as described in JVM spec., p.243 & p.271.
2690   // The function returns the (pc) offset of the idivl
2691   // instruction - may be needed for implicit exceptions.
2692   //
2693   //         normal case                           special case
2694   //
2695   // input : rax: dividend                          min_int
2696   //         reg: divisor   (may not be rax/rdx)    -1
2697   //
2698   // output: rax: quotient  (= rax idiv reg)        min_int
2699   //         rdx: remainder (= rax irem reg)        0
2700   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
2701   const int min_int = 0x80000000;
2702   Label normal_case, special_case;
2703 
2704   // check for special case
2705   cmpl(rax, min_int);
2706   jcc(Assembler::notEqual, normal_case);
2707   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2708   cmpl(reg, -1);
2709   jcc(Assembler::equal, special_case);
2710 
2711   // handle normal case
2712   bind(normal_case);
2713   cdql();
2714   int idivl_offset = offset();
2715   idivl(reg);
2716 
2717   // normal and special case exit
2718   bind(special_case);
2719 
2720   return idivl_offset;
2721 }
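
// For reference, the Java semantics implemented above (a sketch):
//
//   normal case : rax = dividend / divisor, rdx = dividend % divisor
//   special case: min_int / -1 == min_int and min_int % -1 == 0
//                 (a raw idivl would raise #DE for this operand pair)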
2722 
2723 
2724 
2725 void MacroAssembler::decrementl(Register reg, int value) {
2726   if (value == min_jint) {subl(reg, value) ; return; }
2727   if (value <  0) { incrementl(reg, -value); return; }
2728   if (value == 0) {                        ; return; }
2729   if (value == 1 && UseIncDec) { decl(reg) ; return; }
2730   /* else */      { subl(reg, value)       ; return; }
2731 }
2732 
2733 void MacroAssembler::decrementl(Address dst, int value) {
2734   if (value == min_jint) {subl(dst, value) ; return; }
2735   if (value <  0) { incrementl(dst, -value); return; }
2736   if (value == 0) {                        ; return; }
2737   if (value == 1 && UseIncDec) { decl(dst) ; return; }
2738   /* else */      { subl(dst, value)       ; return; }
2739 }
2740 
2741 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2742   assert (shift_value > 0, "illegal shift value");
2743   Label _is_positive;
2744   testl (reg, reg);
2745   jcc (Assembler::positive, _is_positive);
2746   int offset = (1 << shift_value) - 1 ;
2747 
2748   if (offset == 1) {
2749     incrementl(reg);
2750   } else {
2751     addl(reg, offset);
2752   }
2753 
2754   bind (_is_positive);
2755   sarl(reg, shift_value);
2756 }
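
// What division_with_shift computes, in C terms (a sketch): round-toward-zero
// signed division by a power of two, i.e.
//
//   reg = (reg >= 0) ? (reg >> shift_value)
//                    : ((reg + (1 << shift_value) - 1) >> shift_value);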
2757 
2758 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2759   if (reachable(src)) {
2760     Assembler::divsd(dst, as_Address(src));
2761   } else {
2762     lea(rscratch1, src);
2763     Assembler::divsd(dst, Address(rscratch1, 0));
2764   }
2765 }
2766 
2767 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2768   if (reachable(src)) {
2769     Assembler::divss(dst, as_Address(src));
2770   } else {
2771     lea(rscratch1, src);
2772     Assembler::divss(dst, Address(rscratch1, 0));
2773   }
2774 }
2775 
2776 // !defined(COMPILER2) is because of stupid core builds
2777 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2) || INCLUDE_JVMCI
2778 void MacroAssembler::empty_FPU_stack() {
2779   if (VM_Version::supports_mmx()) {
2780     emms();
2781   } else {
2782     for (int i = 8; i-- > 0; ) ffree(i);
2783   }
2784 }
2785 #endif // !LP64 || C1 || !C2 || INCLUDE_JVMCI
2786 
2787 
2788 void MacroAssembler::enter() {
2789   push(rbp);
2790   mov(rbp, rsp);
2791 }
2792 
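// Emits an 8-byte multi-byte NOP (0F 1F 84 00 <imm32>, with a zero payload)
// immediately after a call site, presumably so post-call sites have a fixed,
// recognizable shape; with CONT_DOUBLE_NOP a second identical NOP follows.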
2793 void MacroAssembler::post_call_nop() {
2794   emit_int8((int8_t)0x0f);
2795   emit_int8((int8_t)0x1f);
2796   emit_int8((int8_t)0x84);
2797   emit_int8((int8_t)0x00);
2798   emit_int32(0x00);
2799 #ifdef CONT_DOUBLE_NOP
2800   emit_int8((int8_t)0x0f);
2801   emit_int8((int8_t)0x1f);
2802   emit_int8((int8_t)0x84);
2803   emit_int8((int8_t)0x00);
2804   emit_int32(0x00);
2805 #endif
2806 }
2807 
2808 // A 5 byte nop that is safe for patching (see patch_verified_entry)
2809 void MacroAssembler::fat_nop() {
2810   if (UseAddressNop) {
2811     addr_nop_5();
2812   } else {
2813     emit_int8((int8_t)0x26); // es:
2814     emit_int8((int8_t)0x2e); // cs:
2815     emit_int8((int8_t)0x64); // fs:
2816     emit_int8((int8_t)0x65); // gs:
2817     emit_int8((int8_t)0x90);
2818   }
2819 }
2820 
2821 void MacroAssembler::fcmp(Register tmp) {
2822   fcmp(tmp, 1, true, true);
2823 }
2824 
2825 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2826   assert(!pop_right || pop_left, "usage error");
2827   if (VM_Version::supports_cmov()) {
2828     assert(tmp == noreg, "unneeded temp");
2829     if (pop_left) {
2830       fucomip(index);
2831     } else {
2832       fucomi(index);
2833     }
2834     if (pop_right) {
2835       fpop();
2836     }
2837   } else {
2838     assert(tmp != noreg, "need temp");
2839     if (pop_left) {
2840       if (pop_right) {
2841         fcompp();
2842       } else {
2843         fcomp(index);
2844       }
2845     } else {
2846       fcom(index);
2847     }
2848     // convert FPU condition into eflags condition via rax
2849     save_rax(tmp);
2850     fwait(); fnstsw_ax();
2851     sahf();
2852     restore_rax(tmp);
2853   }
2854   // condition codes set as follows:
2855   //
2856   // CF (corresponds to C0) if x < y
2857   // PF (corresponds to C2) if unordered
2858   // ZF (corresponds to C3) if x = y
2859 }
2860 
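// Materialize the x87 comparison result as an integer in dst: -1 (less), 0 (equal),
// +1 (greater); an unordered result yields -1 or +1 depending on unordered_is_less.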
2861 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2862   fcmp2int(dst, unordered_is_less, 1, true, true);
2863 }
2864 
2865 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2866   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2867   Label L;
2868   if (unordered_is_less) {
2869     movl(dst, -1);
2870     jcc(Assembler::parity, L);
2871     jcc(Assembler::below , L);
2872     movl(dst, 0);
2873     jcc(Assembler::equal , L);
2874     increment(dst);
2875   } else { // unordered is greater
2876     movl(dst, 1);
2877     jcc(Assembler::parity, L);
2878     jcc(Assembler::above , L);
2879     movl(dst, 0);
2880     jcc(Assembler::equal , L);
2881     decrementl(dst);
2882   }
2883   bind(L);
2884 }
2885 
2886 void MacroAssembler::fld_d(AddressLiteral src) {
2887   fld_d(as_Address(src));
2888 }
2889 
2890 void MacroAssembler::fld_s(AddressLiteral src) {
2891   fld_s(as_Address(src));
2892 }
2893 
2894 void MacroAssembler::fld_x(AddressLiteral src) {
2895   Assembler::fld_x(as_Address(src));
2896 }
2897 
2898 void MacroAssembler::fldcw(AddressLiteral src) {
2899   Assembler::fldcw(as_Address(src));
2900 }
2901 
2902 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2903   if (reachable(src)) {
2904     Assembler::mulpd(dst, as_Address(src));
2905   } else {
2906     lea(rscratch1, src);
2907     Assembler::mulpd(dst, Address(rscratch1, 0));
2908   }
2909 }
2910 
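// Save the current x87 control word on the stack and set its precision-control field
// (bits 8-9) to 11b, i.e. 64-bit extended precision; restore_precision() reloads the
// saved word and releases the stack slot.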
2911 void MacroAssembler::increase_precision() {
2912   subptr(rsp, BytesPerWord);
2913   fnstcw(Address(rsp, 0));
2914   movl(rax, Address(rsp, 0));
2915   orl(rax, 0x300);
2916   push(rax);
2917   fldcw(Address(rsp, 0));
2918   pop(rax);
2919 }
2920 
2921 void MacroAssembler::restore_precision() {
2922   fldcw(Address(rsp, 0));
2923   addptr(rsp, BytesPerWord);
2924 }
2925 
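// Pop the x87 register stack: free ST(0), then advance the stack top.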
2926 void MacroAssembler::fpop() {
2927   ffree();
2928   fincstp();
2929 }
2930 
2931 void MacroAssembler::load_float(Address src) {
2932   if (UseSSE >= 1) {
2933     movflt(xmm0, src);
2934   } else {
2935     LP64_ONLY(ShouldNotReachHere());
2936     NOT_LP64(fld_s(src));
2937   }
2938 }
2939 
2940 void MacroAssembler::store_float(Address dst) {
2941   if (UseSSE >= 1) {
2942     movflt(dst, xmm0);
2943   } else {
2944     LP64_ONLY(ShouldNotReachHere());
2945     NOT_LP64(fstp_s(dst));
2946   }
2947 }
2948 
2949 void MacroAssembler::load_double(Address src) {
2950   if (UseSSE >= 2) {
2951     movdbl(xmm0, src);
2952   } else {
2953     LP64_ONLY(ShouldNotReachHere());
2954     NOT_LP64(fld_d(src));
2955   }
2956 }
2957 
2958 void MacroAssembler::store_double(Address dst) {
2959   if (UseSSE >= 2) {
2960     movdbl(dst, xmm0);
2961   } else {
2962     LP64_ONLY(ShouldNotReachHere());
2963     NOT_LP64(fstp_d(dst));
2964   }
2965 }
2966 
2967 void MacroAssembler::fremr(Register tmp) {
2968   save_rax(tmp);
2969   { Label L;
2970     bind(L);
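    // fprem leaves C2 (bit 10 of the FPU status word, mask 0x400) set while the
    // reduction is still incomplete; keep iterating until it clears.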
2971     fprem();
2972     fwait(); fnstsw_ax();
2973 #ifdef _LP64
2974     testl(rax, 0x400);
2975     jcc(Assembler::notEqual, L);
2976 #else
2977     sahf();
2978     jcc(Assembler::parity, L);
2979 #endif // _LP64
2980   }
2981   restore_rax(tmp);
2982   // Result is in ST0.
2983   // Note: fxch & fpop to get rid of ST1
2984   // (otherwise FPU stack could overflow eventually)
2985   fxch(1);
2986   fpop();
2987 }
2988 
2989 // dst = c = a * b + c
2990 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2991   Assembler::vfmadd231sd(c, a, b);
2992   if (dst != c) {
2993     movdbl(dst, c);
2994   }
2995 }
2996 
2997 // dst = c = a * b + c
2998 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2999   Assembler::vfmadd231ss(c, a, b);
3000   if (dst != c) {
3001     movflt(dst, c);
3002   }
3003 }
3004 
3005 // dst = c = a * b + c
3006 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
3007   Assembler::vfmadd231pd(c, a, b, vector_len);
3008   if (dst != c) {
3009     vmovdqu(dst, c);
3010   }
3011 }
3012 
3013 // dst = c = a * b + c
3014 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
3015   Assembler::vfmadd231ps(c, a, b, vector_len);
3016   if (dst != c) {
3017     vmovdqu(dst, c);
3018   }
3019 }
3020 
3021 // dst = c = a * b + c
3022 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
3023   Assembler::vfmadd231pd(c, a, b, vector_len);
3024   if (dst != c) {
3025     vmovdqu(dst, c);
3026   }
3027 }
3028 
3029 // dst = c = a * b + c
3030 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
3031   Assembler::vfmadd231ps(c, a, b, vector_len);
3032   if (dst != c) {
3033     vmovdqu(dst, c);
3034   }
3035 }
3036 
3037 void MacroAssembler::incrementl(AddressLiteral dst) {
3038   if (reachable(dst)) {
3039     incrementl(as_Address(dst));
3040   } else {
3041     lea(rscratch1, dst);
3042     incrementl(Address(rscratch1, 0));
3043   }
3044 }
3045 
3046 void MacroAssembler::incrementl(ArrayAddress dst) {
3047   incrementl(as_Address(dst));
3048 }
3049 
3050 void MacroAssembler::incrementl(Register reg, int value) {
3051   if (value == min_jint) {addl(reg, value) ; return; }
3052   if (value <  0) { decrementl(reg, -value); return; }
3053   if (value == 0) {                        ; return; }
3054   if (value == 1 && UseIncDec) { incl(reg) ; return; }
3055   /* else */      { addl(reg, value)       ; return; }
3056 }
3057 
3058 void MacroAssembler::incrementl(Address dst, int value) {
3059   if (value == min_jint) {addl(dst, value) ; return; }
3060   if (value <  0) { decrementl(dst, -value); return; }
3061   if (value == 0) {                        ; return; }
3062   if (value == 1 && UseIncDec) { incl(dst) ; return; }
3063   /* else */      { addl(dst, value)       ; return; }
3064 }
3065 
3066 void MacroAssembler::jump(AddressLiteral dst) {
3067   if (reachable(dst)) {
3068     jmp_literal(dst.target(), dst.rspec());
3069   } else {
3070     lea(rscratch1, dst);
3071     jmp(rscratch1);
3072   }
3073 }
3074 
3075 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3076   if (reachable(dst)) {
3077     InstructionMark im(this);
3078     relocate(dst.reloc());
3079     const int short_size = 2;
3080     const int long_size = 6;
3081     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3082     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3083       // 0111 tttn #8-bit disp
3084       emit_int8(0x70 | cc);
3085       emit_int8((offs - short_size) & 0xFF);
3086     } else {
3087       // 0000 1111 1000 tttn #32-bit disp
3088       emit_int8(0x0F);
3089       emit_int8((unsigned char)(0x80 | cc));
3090       emit_int32(offs - long_size);
3091     }
3092   } else {
3093 #ifdef ASSERT
3094     warning("reversing conditional branch");
3095 #endif /* ASSERT */
3096     Label skip;
3097     jccb(reverse[cc], skip);
3098     lea(rscratch1, dst);
3099     Assembler::jmp(rscratch1);
3100     bind(skip);
3101   }
3102 }
3103 
3104 void MacroAssembler::ldmxcsr(AddressLiteral src) {
3105   if (reachable(src)) {
3106     Assembler::ldmxcsr(as_Address(src));
3107   } else {
3108     lea(rscratch1, src);
3109     Assembler::ldmxcsr(Address(rscratch1, 0));
3110   }
3111 }
3112 
3113 int MacroAssembler::load_signed_byte(Register dst, Address src) {
3114   int off;
3115   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3116     off = offset();
3117     movsbl(dst, src); // movsxb
3118   } else {
3119     off = load_unsigned_byte(dst, src);
3120     shll(dst, 24);
3121     sarl(dst, 24);
3122   }
3123   return off;
3124 }
3125 
3126 // Note: load_signed_short used to be called load_signed_word.
3127 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3128 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3129 // The term "word" in HotSpot means a 32- or 64-bit machine word.
3130 int MacroAssembler::load_signed_short(Register dst, Address src) {
3131   int off;
3132   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3133     // It would seem safe to sign-extend 16 => 64 bits here, but the 64-bit
3134     // build has always produced a 32-bit result, which implies callers only
3135     // rely on the low 32 bits.
3136     off = offset();
3137     movswl(dst, src); // movsxw
3138   } else {
3139     off = load_unsigned_short(dst, src);
3140     shll(dst, 16);
3141     sarl(dst, 16);
3142   }
3143   return off;
3144 }
3145 
3146 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3147   // According to Intel Doc. AP-526 ("Zero-Extension of Short", p. 16,
3148   // and "3.9 Partial Register Penalties", p. 22).
3149   int off;
3150   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
3151     off = offset();
3152     movzbl(dst, src); // movzxb
3153   } else {
3154     xorl(dst, dst);
3155     off = offset();
3156     movb(dst, src);
3157   }
3158   return off;
3159 }
3160 
3161 // Note: load_unsigned_short used to be called load_unsigned_word.
3162 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3163   // According to Intel Doc. AP-526 ("Zero-Extension of Short", p. 16,
3164   // and "3.9 Partial Register Penalties", p. 22).
3165   int off;
3166   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3167     off = offset();
3168     movzwl(dst, src); // movzxw
3169   } else {
3170     xorl(dst, dst);
3171     off = offset();
3172     movw(dst, src);
3173   }
3174   return off;
3175 }
3176 
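// Load a size_in_bytes-sized value from src into dst, sign- or zero-extending sub-word
// sizes as requested; on 32-bit VMs an 8-byte load also needs dst2 for the high half.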
3177 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3178   switch (size_in_bytes) {
3179 #ifndef _LP64
3180   case  8:
3181     assert(dst2 != noreg, "second dest register required");
3182     movl(dst,  src);
3183     movl(dst2, src.plus_disp(BytesPerInt));
3184     break;
3185 #else
3186   case  8:  movq(dst, src); break;
3187 #endif
3188   case  4:  movl(dst, src); break;
3189   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3190   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3191   default:  ShouldNotReachHere();
3192   }
3193 }
3194 
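// Store the low size_in_bytes bytes of src to dst; on 32-bit VMs an 8-byte store takes
// the high half from src2.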
3195 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3196   switch (size_in_bytes) {
3197 #ifndef _LP64
3198   case  8:
3199     assert(src2 != noreg, "second source register required");
3200     movl(dst,                        src);
3201     movl(dst.plus_disp(BytesPerInt), src2);
3202     break;
3203 #else
3204   case  8:  movq(dst, src); break;
3205 #endif
3206   case  4:  movl(dst, src); break;
3207   case  2:  movw(dst, src); break;
3208   case  1:  movb(dst, src); break;
3209   default:  ShouldNotReachHere();
3210   }
3211 }
3212 
3213 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3214   if (reachable(dst)) {
3215     movl(as_Address(dst), src);
3216   } else {
3217     lea(rscratch1, dst);
3218     movl(Address(rscratch1, 0), src);
3219   }
3220 }
3221 
3222 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3223   if (reachable(src)) {
3224     movl(dst, as_Address(src));
3225   } else {
3226     lea(rscratch1, src);
3227     movl(dst, Address(rscratch1, 0));
3228   }
3229 }
3230 
3231 // C++ bool manipulation
3232 
3233 void MacroAssembler::movbool(Register dst, Address src) {
3234   if(sizeof(bool) == 1)
3235     movb(dst, src);
3236   else if(sizeof(bool) == 2)
3237     movw(dst, src);
3238   else if(sizeof(bool) == 4)
3239     movl(dst, src);
3240   else
3241     // unsupported
3242     ShouldNotReachHere();
3243 }
3244 
3245 void MacroAssembler::movbool(Address dst, bool boolconst) {
3246   if(sizeof(bool) == 1)
3247     movb(dst, (int) boolconst);
3248   else if(sizeof(bool) == 2)
3249     movw(dst, (int) boolconst);
3250   else if(sizeof(bool) == 4)
3251     movl(dst, (int) boolconst);
3252   else
3253     // unsupported
3254     ShouldNotReachHere();
3255 }
3256 
3257 void MacroAssembler::movbool(Address dst, Register src) {
3258   if(sizeof(bool) == 1)
3259     movb(dst, src);
3260   else if(sizeof(bool) == 2)
3261     movw(dst, src);
3262   else if(sizeof(bool) == 4)
3263     movl(dst, src);
3264   else
3265     // unsupported
3266     ShouldNotReachHere();
3267 }
3268 
3269 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3270   movb(as_Address(dst), src);
3271 }
3272 
3273 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3274   if (reachable(src)) {
3275     movdl(dst, as_Address(src));
3276   } else {
3277     lea(rscratch1, src);
3278     movdl(dst, Address(rscratch1, 0));
3279   }
3280 }
3281 
3282 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3283   if (reachable(src)) {
3284     movq(dst, as_Address(src));
3285   } else {
3286     lea(rscratch1, src);
3287     movq(dst, Address(rscratch1, 0));
3288   }
3289 }
3290 
3291 #ifdef COMPILER2
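// Load k1 with a mask of the low src bits, i.e. (1 << src) - 1, for C2 post-loop
// multiversioning; dst is left holding src.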
3292 void MacroAssembler::setvectmask(Register dst, Register src) {
3293   guarantee(PostLoopMultiversioning, "must be");
3294   Assembler::movl(dst, 1);
3295   Assembler::shlxl(dst, dst, src);
3296   Assembler::decl(dst);
3297   Assembler::kmovdl(k1, dst);
3298   Assembler::movl(dst, src);
3299 }
3300 
3301 void MacroAssembler::restorevectmask() {
3302   guarantee(PostLoopMultiversioning, "must be");
3303   Assembler::knotwl(k1, k0);
3304 }
3305 #endif // COMPILER2
3306 
3307 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3308   if (reachable(src)) {
3309     if (UseXmmLoadAndClearUpper) {
3310       movsd (dst, as_Address(src));
3311     } else {
3312       movlpd(dst, as_Address(src));
3313     }
3314   } else {
3315     lea(rscratch1, src);
3316     if (UseXmmLoadAndClearUpper) {
3317       movsd (dst, Address(rscratch1, 0));
3318     } else {
3319       movlpd(dst, Address(rscratch1, 0));
3320     }
3321   }
3322 }
3323 
3324 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3325   if (reachable(src)) {
3326     movss(dst, as_Address(src));
3327   } else {
3328     lea(rscratch1, src);
3329     movss(dst, Address(rscratch1, 0));
3330   }
3331 }
3332 
3333 void MacroAssembler::movptr(Register dst, Register src) {
3334   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3335 }
3336 
3337 void MacroAssembler::movptr(Register dst, Address src) {
3338   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3339 }
3340 
3341 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3342 void MacroAssembler::movptr(Register dst, intptr_t src) {
3343   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3344 }
3345 
3346 void MacroAssembler::movptr(Address dst, Register src) {
3347   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3348 }
3349 
3350 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3351     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3352     Assembler::movdqu(dst, src);
3353 }
3354 
3355 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3356     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3357     Assembler::movdqu(dst, src);
3358 }
3359 
3360 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3361     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3362     Assembler::movdqu(dst, src);
3363 }
3364 
3365 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
3366   if (reachable(src)) {
3367     movdqu(dst, as_Address(src));
3368   } else {
3369     lea(scratchReg, src);
3370     movdqu(dst, Address(scratchReg, 0));
3371   }
3372 }
3373 
3374 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3375     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3376     Assembler::vmovdqu(dst, src);
3377 }
3378 
3379 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3380     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3381     Assembler::vmovdqu(dst, src);
3382 }
3383 
3384 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3385     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3386     Assembler::vmovdqu(dst, src);
3387 }
3388 
3389 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3390   if (reachable(src)) {
3391     vmovdqu(dst, as_Address(src));
3392   }
3393   else {
3394     lea(scratch_reg, src);
3395     vmovdqu(dst, Address(scratch_reg, 0));
3396   }
3397 }
3398 
3399 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3400   if (reachable(src)) {
3401     Assembler::evmovdquq(dst, as_Address(src), vector_len);
3402   } else {
3403     lea(rscratch, src);
3404     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
3405   }
3406 }
3407 
3408 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3409   if (reachable(src)) {
3410     Assembler::movdqa(dst, as_Address(src));
3411   } else {
3412     lea(rscratch1, src);
3413     Assembler::movdqa(dst, Address(rscratch1, 0));
3414   }
3415 }
3416 
3417 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3418   if (reachable(src)) {
3419     Assembler::movsd(dst, as_Address(src));
3420   } else {
3421     lea(rscratch1, src);
3422     Assembler::movsd(dst, Address(rscratch1, 0));
3423   }
3424 }
3425 
3426 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3427   if (reachable(src)) {
3428     Assembler::movss(dst, as_Address(src));
3429   } else {
3430     lea(rscratch1, src);
3431     Assembler::movss(dst, Address(rscratch1, 0));
3432   }
3433 }
3434 
3435 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3436   if (reachable(src)) {
3437     Assembler::mulsd(dst, as_Address(src));
3438   } else {
3439     lea(rscratch1, src);
3440     Assembler::mulsd(dst, Address(rscratch1, 0));
3441   }
3442 }
3443 
3444 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3445   if (reachable(src)) {
3446     Assembler::mulss(dst, as_Address(src));
3447   } else {
3448     lea(rscratch1, src);
3449     Assembler::mulss(dst, Address(rscratch1, 0));
3450   }
3451 }
3452 
3453 void MacroAssembler::null_check(Register reg, int offset) {
3454   if (needs_explicit_null_check(offset)) {
3455     // provoke OS NULL exception if reg = NULL by
3456     // accessing M[reg] w/o changing any (non-CC) registers
3457     // NOTE: cmpl is plenty here to provoke a segv
3458     cmpptr(rax, Address(reg, 0));
3459     // Note: should probably use testl(rax, Address(reg, 0));
3460     //       may be shorter code (however, this version of
3461     //       testl needs to be implemented first)
3462   } else {
3463     // nothing to do, (later) access of M[reg + offset]
3464     // will provoke OS NULL exception if reg = NULL
3465   }
3466 }
3467 
3468 void MacroAssembler::os_breakpoint() {
3469   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3470   // (e.g., MSVC can't call ps() otherwise)
3471   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3472 }
3473 
3474 void MacroAssembler::unimplemented(const char* what) {
3475   const char* buf = NULL;
3476   {
3477     ResourceMark rm;
3478     stringStream ss;
3479     ss.print("unimplemented: %s", what);
3480     buf = code_string(ss.as_string());
3481   }
3482   stop(buf);
3483 }
3484 
3485 #ifdef _LP64
3486 #define XSTATE_BV 0x200
3487 #endif
3488 
3489 void MacroAssembler::pop_CPU_state() {
3490   pop_FPU_state();
3491   pop_IU_state();
3492 }
3493 
3494 void MacroAssembler::pop_FPU_state() {
3495 #ifndef _LP64
3496   frstor(Address(rsp, 0));
3497 #else
3498   fxrstor(Address(rsp, 0));
3499 #endif
3500   addptr(rsp, FPUStateSizeInWords * wordSize);
3501 }
3502 
3503 void MacroAssembler::get_cont_fastpath(Register java_thread, Register dst) {
3504   movl(dst, Address(java_thread, JavaThread::cont_fastpath_offset()));
3505 }
3506 
3507 void MacroAssembler::set_cont_fastpath(Register java_thread, int32_t imm) {
3508   movl(Address(java_thread, JavaThread::cont_fastpath_offset()), imm);
3509 }
3510 
3511 #ifdef ASSERT
3512 void MacroAssembler::stop_if_in_cont(Register cont, const char* name) {
3513 #ifdef _LP64
3514   Label no_cont;
3515   movptr(cont, Address(r15_thread, in_bytes(JavaThread::continuation_offset())));
3516   testl(cont, cont);
3517   jcc(Assembler::zero, no_cont);
3518   stop(name);
3519   bind(no_cont);
3520 #else
3521   Unimplemented();
3522 #endif
3523 }
3524 #endif
3525 
3526 void MacroAssembler::pop_IU_state() {
3527   popa();
3528   LP64_ONLY(addq(rsp, 8));
3529   popf();
3530 }
3531 
3532 // Save Integer and Float state
3533 // Warning: Stack must be 16 byte aligned (64bit)
3534 void MacroAssembler::push_CPU_state() {
3535   push_IU_state();
3536   push_FPU_state();
3537 }
3538 
3539 void MacroAssembler::push_FPU_state() {
3540   subptr(rsp, FPUStateSizeInWords * wordSize);
3541 #ifndef _LP64
3542   fnsave(Address(rsp, 0));
3543   fwait();
3544 #else
3545   fxsave(Address(rsp, 0));
3546 #endif // LP64
3547 }
3548 
3549 void MacroAssembler::push_IU_state() {
3550   // Push flags first because pusha kills them
3551   pushf();
3552   // Make sure rsp stays 16-byte aligned
3553   LP64_ONLY(subq(rsp, 8));
3554   pusha();
3555 }
3556 
3557 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
3558   if (!java_thread->is_valid()) {
3559     java_thread = rdi;
3560     get_thread(java_thread);
3561   }
3562   // we must set sp to zero to clear frame
3563   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3564   if (clear_fp) {
3565     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3566   }
3567 
3568   // Always clear the pc because it could have been set by make_walkable()
3569   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3570 
3571   vzeroupper();
3572 }
3573 
3574 void MacroAssembler::restore_rax(Register tmp) {
3575   if (tmp == noreg) pop(rax);
3576   else if (tmp != rax) mov(rax, tmp);
3577 }
3578 
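// Round reg up to the next multiple of modulus (modulus must be a power of two).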
3579 void MacroAssembler::round_to(Register reg, int modulus) {
3580   addptr(reg, modulus - 1);
3581   andptr(reg, -modulus);
3582 }
3583 
3584 void MacroAssembler::save_rax(Register tmp) {
3585   if (tmp == noreg) push(rax);
3586   else if (tmp != rax) mov(tmp, rax);
3587 }
3588 
3589 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg) {
3590   if (SafepointMechanism::uses_thread_local_poll()) {
3591 #ifdef _LP64
3592     assert(thread_reg == r15_thread, "should be");
3593 #else
3594     if (thread_reg == noreg) {
3595       thread_reg = temp_reg;
3596       get_thread(thread_reg);
3597     }
3598 #endif
3599     testb(Address(thread_reg, Thread::polling_page_offset()), SafepointMechanism::poll_bit());
3600     jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3601   } else {
3602     cmp32(ExternalAddress(SafepointSynchronize::address_of_state()),
3603         SafepointSynchronize::_not_synchronized);
3604     jcc(Assembler::notEqual, slow_path);
3605   }
3606 }
3607 
3608 // Calls to C land
3609 //
3610 // When entering C land, the rbp and rsp of the last Java frame have to be recorded
3611 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3612 // has to be reset to 0. This is required to allow proper stack traversal.
3613 void MacroAssembler::set_last_Java_frame(Register java_thread,
3614                                          Register last_java_sp,
3615                                          Register last_java_fp,
3616                                          address  last_java_pc) {
3617   vzeroupper();
3618   // determine java_thread register
3619   if (!java_thread->is_valid()) {
3620     java_thread = rdi;
3621     get_thread(java_thread);
3622   }
3623   // determine last_java_sp register
3624   if (!last_java_sp->is_valid()) {
3625     last_java_sp = rsp;
3626   }
3627 
3628   // last_java_fp is optional
3629 
3630   if (last_java_fp->is_valid()) {
3631     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3632   }
3633 
3634   // last_java_pc is optional
3635 
3636   if (last_java_pc != NULL) {
3637     lea(Address(java_thread,
3638                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3639         InternalAddress(last_java_pc));
3640 
3641   }
3642   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3643 }
3644 
3645 void MacroAssembler::shlptr(Register dst, int imm8) {
3646   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3647 }
3648 
3649 void MacroAssembler::shrptr(Register dst, int imm8) {
3650   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3651 }
3652 
3653 void MacroAssembler::sign_extend_byte(Register reg) {
3654   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3655     movsbl(reg, reg); // movsxb
3656   } else {
3657     shll(reg, 24);
3658     sarl(reg, 24);
3659   }
3660 }
3661 
3662 void MacroAssembler::sign_extend_short(Register reg) {
3663   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3664     movswl(reg, reg); // movsxw
3665   } else {
3666     shll(reg, 16);
3667     sarl(reg, 16);
3668   }
3669 }
3670 
3671 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3672   assert(reachable(src), "Address should be reachable");
3673   testl(dst, as_Address(src));
3674 }
3675 
3676 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3677   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3678   Assembler::pcmpeqb(dst, src);
3679 }
3680 
3681 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3682   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3683   Assembler::pcmpeqw(dst, src);
3684 }
3685 
3686 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3687   assert((dst->encoding() < 16),"XMM register should be 0-15");
3688   Assembler::pcmpestri(dst, src, imm8);
3689 }
3690 
3691 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3692   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3693   Assembler::pcmpestri(dst, src, imm8);
3694 }
3695 
3696 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3697   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3698   Assembler::pmovzxbw(dst, src);
3699 }
3700 
3701 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3702   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3703   Assembler::pmovzxbw(dst, src);
3704 }
3705 
3706 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3707   assert((src->encoding() < 16),"XMM register should be 0-15");
3708   Assembler::pmovmskb(dst, src);
3709 }
3710 
3711 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3712   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3713   Assembler::ptest(dst, src);
3714 }
3715 
3716 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3717   if (reachable(src)) {
3718     Assembler::sqrtsd(dst, as_Address(src));
3719   } else {
3720     lea(rscratch1, src);
3721     Assembler::sqrtsd(dst, Address(rscratch1, 0));
3722   }
3723 }
3724 
3725 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3726   if (reachable(src)) {
3727     Assembler::sqrtss(dst, as_Address(src));
3728   } else {
3729     lea(rscratch1, src);
3730     Assembler::sqrtss(dst, Address(rscratch1, 0));
3731   }
3732 }
3733 
3734 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3735   if (reachable(src)) {
3736     Assembler::subsd(dst, as_Address(src));
3737   } else {
3738     lea(rscratch1, src);
3739     Assembler::subsd(dst, Address(rscratch1, 0));
3740   }
3741 }
3742 
3743 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3744   if (reachable(src)) {
3745     Assembler::subss(dst, as_Address(src));
3746   } else {
3747     lea(rscratch1, src);
3748     Assembler::subss(dst, Address(rscratch1, 0));
3749   }
3750 }
3751 
3752 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3753   if (reachable(src)) {
3754     Assembler::ucomisd(dst, as_Address(src));
3755   } else {
3756     lea(rscratch1, src);
3757     Assembler::ucomisd(dst, Address(rscratch1, 0));
3758   }
3759 }
3760 
3761 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3762   if (reachable(src)) {
3763     Assembler::ucomiss(dst, as_Address(src));
3764   } else {
3765     lea(rscratch1, src);
3766     Assembler::ucomiss(dst, Address(rscratch1, 0));
3767   }
3768 }
3769 
3770 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3771   // Used in sign-bit flipping with aligned address.
3772   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3773   if (reachable(src)) {
3774     Assembler::xorpd(dst, as_Address(src));
3775   } else {
3776     lea(scratch_reg, src);
3777     Assembler::xorpd(dst, Address(scratch_reg, 0));
3778   }
3779 }
3780 
3781 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3782   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3783     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3784   }
3785   else {
3786     Assembler::xorpd(dst, src);
3787   }
3788 }
3789 
3790 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3791   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3792     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3793   } else {
3794     Assembler::xorps(dst, src);
3795   }
3796 }
3797 
3798 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3799   // Used in sign-bit flipping with aligned address.
3800   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3801   if (reachable(src)) {
3802     Assembler::xorps(dst, as_Address(src));
3803   } else {
3804     lea(scratch_reg, src);
3805     Assembler::xorps(dst, Address(scratch_reg, 0));
3806   }
3807 }
3808 
3809 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3810   // Used in sign-bit flipping with aligned address.
3811   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3812   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3813   if (reachable(src)) {
3814     Assembler::pshufb(dst, as_Address(src));
3815   } else {
3816     lea(rscratch1, src);
3817     Assembler::pshufb(dst, Address(rscratch1, 0));
3818   }
3819 }
3820 
3821 // AVX 3-operands instructions
3822 
3823 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3824   if (reachable(src)) {
3825     vaddsd(dst, nds, as_Address(src));
3826   } else {
3827     lea(rscratch1, src);
3828     vaddsd(dst, nds, Address(rscratch1, 0));
3829   }
3830 }
3831 
3832 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3833   if (reachable(src)) {
3834     vaddss(dst, nds, as_Address(src));
3835   } else {
3836     lea(rscratch1, src);
3837     vaddss(dst, nds, Address(rscratch1, 0));
3838   }
3839 }
3840 
3841 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3842   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3843   vandps(dst, nds, negate_field, vector_len);
3844 }
3845 
3846 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3847   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3848   vandpd(dst, nds, negate_field, vector_len);
3849 }
3850 
3851 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3852   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3853   Assembler::vpaddb(dst, nds, src, vector_len);
3854 }
3855 
3856 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3857   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3858   Assembler::vpaddb(dst, nds, src, vector_len);
3859 }
3860 
3861 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3862   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3863   Assembler::vpaddw(dst, nds, src, vector_len);
3864 }
3865 
3866 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3867   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3868   Assembler::vpaddw(dst, nds, src, vector_len);
3869 }
3870 
3871 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3872   if (reachable(src)) {
3873     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3874   } else {
3875     lea(scratch_reg, src);
3876     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3877   }
3878 }
3879 
3880 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3881   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3882   Assembler::vpbroadcastw(dst, src, vector_len);
3883 }
3884 
3885 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3886   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3887   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3888 }
3889 
3890 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3891   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3892   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3893 }
3894 
3895 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3896   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3897   Assembler::vpmovzxbw(dst, src, vector_len);
3898 }
3899 
3900 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
3901   assert((src->encoding() < 16),"XMM register should be 0-15");
3902   Assembler::vpmovmskb(dst, src);
3903 }
3904 
3905 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3906   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3907   Assembler::vpmullw(dst, nds, src, vector_len);
3908 }
3909 
3910 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3911   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3912   Assembler::vpmullw(dst, nds, src, vector_len);
3913 }
3914 
3915 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3916   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3917   Assembler::vpsubb(dst, nds, src, vector_len);
3918 }
3919 
3920 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3921   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3922   Assembler::vpsubb(dst, nds, src, vector_len);
3923 }
3924 
3925 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3926   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3927   Assembler::vpsubw(dst, nds, src, vector_len);
3928 }
3929 
3930 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3931   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3932   Assembler::vpsubw(dst, nds, src, vector_len);
3933 }
3934 
3935 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3936   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3937   Assembler::vpsraw(dst, nds, shift, vector_len);
3938 }
3939 
3940 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3941   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3942   Assembler::vpsraw(dst, nds, shift, vector_len);
3943 }
3944 
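// vpsraq on 128- or 256-bit vectors requires AVX512VL; without it, fall back to the
// 512-bit form.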
3945 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3946   assert(UseAVX > 2,"");
3947   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3948      vector_len = 2;
3949   }
3950   Assembler::evpsraq(dst, nds, shift, vector_len);
3951 }
3952 
3953 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3954   assert(UseAVX > 2,"");
3955   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3956      vector_len = 2;
3957   }
3958   Assembler::evpsraq(dst, nds, shift, vector_len);
3959 }
3960 
3961 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3962   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3963   Assembler::vpsrlw(dst, nds, shift, vector_len);
3964 }
3965 
3966 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3967   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3968   Assembler::vpsrlw(dst, nds, shift, vector_len);
3969 }
3970 
3971 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3972   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3973   Assembler::vpsllw(dst, nds, shift, vector_len);
3974 }
3975 
3976 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3977   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3978   Assembler::vpsllw(dst, nds, shift, vector_len);
3979 }
3980 
3981 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3982   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3983   Assembler::vptest(dst, src);
3984 }
3985 
3986 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3987   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3988   Assembler::punpcklbw(dst, src);
3989 }
3990 
3991 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3992   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3993   Assembler::pshufd(dst, src, mode);
3994 }
3995 
3996 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3997   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3998   Assembler::pshuflw(dst, src, mode);
3999 }
4000 
4001 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4002   if (reachable(src)) {
4003     vandpd(dst, nds, as_Address(src), vector_len);
4004   } else {
4005     lea(scratch_reg, src);
4006     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
4007   }
4008 }
4009 
4010 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4011   if (reachable(src)) {
4012     vandps(dst, nds, as_Address(src), vector_len);
4013   } else {
4014     lea(scratch_reg, src);
4015     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
4016   }
4017 }
4018 
4019 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4020   if (reachable(src)) {
4021     vdivsd(dst, nds, as_Address(src));
4022   } else {
4023     lea(rscratch1, src);
4024     vdivsd(dst, nds, Address(rscratch1, 0));
4025   }
4026 }
4027 
4028 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4029   if (reachable(src)) {
4030     vdivss(dst, nds, as_Address(src));
4031   } else {
4032     lea(rscratch1, src);
4033     vdivss(dst, nds, Address(rscratch1, 0));
4034   }
4035 }
4036 
4037 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4038   if (reachable(src)) {
4039     vmulsd(dst, nds, as_Address(src));
4040   } else {
4041     lea(rscratch1, src);
4042     vmulsd(dst, nds, Address(rscratch1, 0));
4043   }
4044 }
4045 
4046 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4047   if (reachable(src)) {
4048     vmulss(dst, nds, as_Address(src));
4049   } else {
4050     lea(rscratch1, src);
4051     vmulss(dst, nds, Address(rscratch1, 0));
4052   }
4053 }
4054 
4055 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4056   if (reachable(src)) {
4057     vsubsd(dst, nds, as_Address(src));
4058   } else {
4059     lea(rscratch1, src);
4060     vsubsd(dst, nds, Address(rscratch1, 0));
4061   }
4062 }
4063 
4064 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4065   if (reachable(src)) {
4066     vsubss(dst, nds, as_Address(src));
4067   } else {
4068     lea(rscratch1, src);
4069     vsubss(dst, nds, Address(rscratch1, 0));
4070   }
4071 }
4072 
4073 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4074   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4075   vxorps(dst, nds, src, Assembler::AVX_128bit);
4076 }
4077 
4078 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4079   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4080   vxorpd(dst, nds, src, Assembler::AVX_128bit);
4081 }
4082 
4083 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4084   if (reachable(src)) {
4085     vxorpd(dst, nds, as_Address(src), vector_len);
4086   } else {
4087     lea(scratch_reg, src);
4088     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
4089   }
4090 }
4091 
4092 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4093   if (reachable(src)) {
4094     vxorps(dst, nds, as_Address(src), vector_len);
4095   } else {
4096     lea(scratch_reg, src);
4097     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
4098   }
4099 }
4100 
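// A 256-bit integer vpxor needs AVX2; with only AVX1 available, fall back to vxorpd,
// which AVX1 does provide at 256 bits.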
4101 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4102   if (UseAVX > 1 || (vector_len < 1)) {
4103     if (reachable(src)) {
4104       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
4105     } else {
4106       lea(scratch_reg, src);
4107       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
4108     }
4109   }
4110   else {
4111     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
4112   }
4113 }
4114 
4115 //-------------------------------------------------------------------------------------------
4116 #ifdef COMPILER2
4117 // Generic instructions support for use in .ad files C2 code generation
4118 
4119 void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, Register scr) {
4120   if (opcode == Op_AbsVD) {
4121     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
4122   } else {
4123     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
4124     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
4125   }
4126 }
4127 
4128 void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
4129   if (opcode == Op_AbsVD) {
4130     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
4131   } else {
4132     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
4133     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
4134   }
4135 }
4136 
4137 void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, Register scr) {
4138   if (opcode == Op_AbsVF) {
4139     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
4140   } else {
4141     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
4142     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
4143   }
4144 }
4145 
4146 void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
4147   if (opcode == Op_AbsVF) {
4148     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
4149   } else {
4150     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
4151     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
4152   }
4153 }
4154 
4155 void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
4156   if (sign) {
4157     pmovsxbw(dst, src);
4158   } else {
4159     pmovzxbw(dst, src);
4160   }
4161 }
4162 
4163 void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
4164   if (sign) {
4165     vpmovsxbw(dst, src, vector_len);
4166   } else {
4167     vpmovzxbw(dst, src, vector_len);
4168   }
4169 }
4170 
4171 void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
4172   if (opcode == Op_RShiftVI) {
4173     psrad(dst, src);
4174   } else if (opcode == Op_LShiftVI) {
4175     pslld(dst, src);
4176   } else {
4177     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
4178     psrld(dst, src);
4179   }
4180 }
4181 
4182 void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4183   if (opcode == Op_RShiftVI) {
4184     vpsrad(dst, nds, src, vector_len);
4185   } else if (opcode == Op_LShiftVI) {
4186     vpslld(dst, nds, src, vector_len);
4187   } else {
4188     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
4189     vpsrld(dst, nds, src, vector_len);
4190   }
4191 }
4192 
4193 void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
4194   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
4195     psraw(dst, src);
4196   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
4197     psllw(dst, src);
4198   } else {
4199     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
4200     psrlw(dst, src);
4201   }
4202 }
4203 
4204 void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4205   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
4206     vpsraw(dst, nds, src, vector_len);
4207   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
4208     vpsllw(dst, nds, src, vector_len);
4209   } else {
4210     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
4211     vpsrlw(dst, nds, src, vector_len);
4212   }
4213 }
4214 
4215 void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
4216   if (opcode == Op_RShiftVL) {
4217     psrlq(dst, src);  // using srl to implement sra on pre-AVX512 systems (no packed psraq before AVX-512)
4218   } else if (opcode == Op_LShiftVL) {
4219     psllq(dst, src);
4220   } else {
4221     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
4222     psrlq(dst, src);
4223   }
4224 }
4225 
4226 void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4227   if (opcode == Op_RShiftVL) {
4228     evpsraq(dst, nds, src, vector_len);
4229   } else if (opcode == Op_LShiftVL) {
4230     vpsllq(dst, nds, src, vector_len);
4231   } else {
4232     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
4233     vpsrlq(dst, nds, src, vector_len);
4234   }
4235 }
4236 #endif
4237 //-------------------------------------------------------------------------------------------
4238 
4239 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
4240   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
4241   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
4242   // The inverted mask is sign-extended
4243   andptr(possibly_jweak, inverted_jweak_mask);
4244 }
4245 
4246 void MacroAssembler::resolve_jobject(Register value,
4247                                      Register thread,
4248                                      Register tmp) {
4249   assert_different_registers(value, thread, tmp);
4250   Label done, not_weak;
4251   testptr(value, value);
4252   jcc(Assembler::zero, done);                // Use NULL as-is.
4253   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
4254   jcc(Assembler::zero, not_weak);
4255   // Resolve jweak.
4256   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4257                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
4258   verify_oop(value);
4259   jmp(done);
4260   bind(not_weak);
4261   // Resolve (untagged) jobject.
4262   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
4263   verify_oop(value);
4264   bind(done);
4265 }
4266 
4267 void MacroAssembler::subptr(Register dst, int32_t imm32) {
4268   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
4269 }
4270 
4271 // Force generation of a 4 byte immediate value even if it fits into 8bit
4272 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
4273   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
4274 }
4275 
4276 void MacroAssembler::subptr(Register dst, Register src) {
4277   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
4278 }
4279 
4280 // C++ bool manipulation
4281 void MacroAssembler::testbool(Register dst) {
4282   if(sizeof(bool) == 1)
4283     testb(dst, 0xff);
4284   else if(sizeof(bool) == 2) {
4285     // testw implementation needed for two byte bools
4286     ShouldNotReachHere();
4287   } else if(sizeof(bool) == 4)
4288     testl(dst, dst);
4289   else
4290     // unsupported
4291     ShouldNotReachHere();
4292 }
4293 
4294 void MacroAssembler::testptr(Register dst, Register src) {
4295   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
4296 }
4297 
4298 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4299 void MacroAssembler::tlab_allocate(Register thread, Register obj,
4300                                    Register var_size_in_bytes,
4301                                    int con_size_in_bytes,
4302                                    Register t1,
4303                                    Register t2,
4304                                    Label& slow_case) {
4305   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4306   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4307 }
4308 
4309 // Defines obj, preserves var_size_in_bytes
4310 void MacroAssembler::eden_allocate(Register thread, Register obj,
4311                                    Register var_size_in_bytes,
4312                                    int con_size_in_bytes,
4313                                    Register t1,
4314                                    Label& slow_case) {
4315   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4316   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4317 }
4318 
4319 // Preserves the contents of address, destroys the contents length_in_bytes and temp.
4320 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
4321   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
4322   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
4323   Label done;
4324 
4325   testptr(length_in_bytes, length_in_bytes);
4326   jcc(Assembler::zero, done);
4327 
4328   // initialize topmost word, divide index by 2, check if odd and test if zero
4329   // note: for the remaining code to work, index must be a multiple of BytesPerWord
4330 #ifdef ASSERT
4331   {
4332     Label L;
4333     testptr(length_in_bytes, BytesPerWord - 1);
4334     jcc(Assembler::zero, L);
4335     stop("length must be a multiple of BytesPerWord");
4336     bind(L);
4337   }
4338 #endif
4339   Register index = length_in_bytes;
4340   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
4341   if (UseIncDec) {
4342     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
4343   } else {
4344     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
4345     shrptr(index, 1);
4346   }
4347 #ifndef _LP64
4348   // index might not have been a multiple of 8 (i.e., bit 2 was set)
4349   {
4350     Label even;
4351     // note: if index was a multiple of 8, then it cannot
4352     //       be 0 now otherwise it must have been 0 before
4353     //       => if it is even, we don't need to check for 0 again
4354     jcc(Assembler::carryClear, even);
4355     // clear topmost word (no jump would be needed if conditional assignment worked here)
4356     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
4357     // index could be 0 now, must check again
4358     jcc(Assembler::zero, done);
4359     bind(even);
4360   }
4361 #endif // !_LP64
4362   // initialize remaining object fields: index is a multiple of 2 now
4363   {
4364     Label loop;
4365     bind(loop);
4366     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
4367     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
4368     decrement(index);
4369     jcc(Assembler::notZero, loop);
4370   }
4371 
4372   bind(done);
4373 }
4374 
4375 // Look up the method for a megamorphic invokeinterface call.
4376 // The target method is determined by <intf_klass, itable_index>.
4377 // The receiver klass is in recv_klass.
4378 // On success, the result will be in method_result, and execution falls through.
4379 // On failure, execution transfers to the given label.
4380 void MacroAssembler::lookup_interface_method(Register recv_klass,
4381                                              Register intf_klass,
4382                                              RegisterOrConstant itable_index,
4383                                              Register method_result,
4384                                              Register scan_temp,
4385                                              Label& L_no_such_interface,
4386                                              bool return_method) {
4387   assert_different_registers(recv_klass, intf_klass, scan_temp);
4388   assert_different_registers(method_result, intf_klass, scan_temp);
4389   assert(recv_klass != method_result || !return_method,
4390          "recv_klass can be destroyed when method isn't needed");
4391 
4392   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4393          "caller must use same register for non-constant itable index as for method");
4394 
4395   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4396   int vtable_base = in_bytes(Klass::vtable_start_offset());
4397   int itentry_off = itableMethodEntry::method_offset_in_bytes();
4398   int scan_step   = itableOffsetEntry::size() * wordSize;
4399   int vte_size    = vtableEntry::size_in_bytes();
4400   Address::ScaleFactor times_vte_scale = Address::times_ptr;
4401   assert(vte_size == wordSize, "else adjust times_vte_scale");
4402 
4403   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4404 
4405   // %%% Could store the aligned, prescaled offset in the klassoop.
4406   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4407 
4408   if (return_method) {
4409     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4410     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4411     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4412   }
4413 
4414   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
4415   //   if (scan->interface() == intf) {
4416   //     result = (klass + scan->offset() + itable_index);
4417   //   }
4418   // }
4419   Label search, found_method;
4420 
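       // The scan loop below is peeled once: the first probe can branch straight to
       // found_method on a hit (the common case), while misses fall into the shared
       // loop at 'search', which advances by scan_step until a match or a NULL
       // interface entry (=> L_no_such_interface) is found.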
4421   for (int peel = 1; peel >= 0; peel--) {
4422     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
4423     cmpptr(intf_klass, method_result);
4424 
4425     if (peel) {
4426       jccb(Assembler::equal, found_method);
4427     } else {
4428       jccb(Assembler::notEqual, search);
4429       // (invert the test to fall through to found_method...)
4430     }
4431 
4432     if (!peel)  break;
4433 
4434     bind(search);
4435 
4436     // Check that the previous entry is non-null.  A null entry means that
4437     // the receiver class doesn't implement the interface, and wasn't the
4438     // same as when the caller was compiled.
4439     testptr(method_result, method_result);
4440     jcc(Assembler::zero, L_no_such_interface);
4441     addptr(scan_temp, scan_step);
4442   }
4443 
4444   bind(found_method);
4445 
4446   if (return_method) {
4447     // Got a hit.
4448     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
4449     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4450   }
4451 }
4452 
4453 
4454 // virtual method calling
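     // Roughly: method_result = *(Method**)((address)recv_klass +
     //              in_bytes(Klass::vtable_start_offset()) +
     //              vtable_index * wordSize + vtableEntry::method_offset_in_bytes())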
4455 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4456                                            RegisterOrConstant vtable_index,
4457                                            Register method_result) {
4458   const int base = in_bytes(Klass::vtable_start_offset());
4459   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4460   Address vtable_entry_addr(recv_klass,
4461                             vtable_index, Address::times_ptr,
4462                             base + vtableEntry::method_offset_in_bytes());
4463   movptr(method_result, vtable_entry_addr);
4464 }
4465 
4466 
4467 void MacroAssembler::check_klass_subtype(Register sub_klass,
4468                            Register super_klass,
4469                            Register temp_reg,
4470                            Label& L_success) {
4471   Label L_failure;
4472   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
4473   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
4474   bind(L_failure);
4475 }
4476 
4477 
4478 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4479                                                    Register super_klass,
4480                                                    Register temp_reg,
4481                                                    Label* L_success,
4482                                                    Label* L_failure,
4483                                                    Label* L_slow_path,
4484                                         RegisterOrConstant super_check_offset) {
4485   assert_different_registers(sub_klass, super_klass, temp_reg);
4486   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4487   if (super_check_offset.is_register()) {
4488     assert_different_registers(sub_klass, super_klass,
4489                                super_check_offset.as_register());
4490   } else if (must_load_sco) {
4491     assert(temp_reg != noreg, "supply either a temp or a register offset");
4492   }
4493 
4494   Label L_fallthrough;
4495   int label_nulls = 0;
4496   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4497   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4498   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
4499   assert(label_nulls <= 1, "at most one NULL in the batch");
4500 
4501   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4502   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4503   Address super_check_offset_addr(super_klass, sco_offset);
4504 
4505   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4506   // range of a jccb.  If this routine grows larger, reconsider at
4507   // least some of these.
4508 #define local_jcc(assembler_cond, label)                                \
4509   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
4510   else                             jcc( assembler_cond, label) /*omit semi*/
4511 
4512   // Hacked jmp, which may only be used just before L_fallthrough.
4513 #define final_jmp(label)                                                \
4514   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4515   else                            jmp(label)                /*omit semi*/
4516 
4517   // If the pointers are equal, we are done (e.g., String[] elements).
4518   // This self-check enables sharing of secondary supertype arrays among
4519   // non-primary types such as array-of-interface.  Otherwise, each such
4520   // type would need its own customized SSA.
4521   // We move this check to the front of the fast path because many
4522   // type checks are in fact trivially successful in this manner,
4523   // so we get a nicely predicted branch right at the start of the check.
4524   cmpptr(sub_klass, super_klass);
4525   local_jcc(Assembler::equal, *L_success);
4526 
4527   // Check the supertype display:
4528   if (must_load_sco) {
4529     // Positive movl does right thing on LP64.
4530     movl(temp_reg, super_check_offset_addr);
4531     super_check_offset = RegisterOrConstant(temp_reg);
4532   }
4533   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4534   cmpptr(super_klass, super_check_addr); // load displayed supertype
4535 
4536   // This check has worked decisively for primary supers.
4537   // Secondary supers are sought in the super_cache ('super_cache_addr').
4538   // (Secondary supers are interfaces and very deeply nested subtypes.)
4539   // The secondary-super-cache lookup is folded into the same check above because of a tricky aliasing
4540   // between the super_cache and the primary super display elements.
4541   // (The 'super_check_addr' can address either, as the case requires.)
4542   // Note that the cache is updated below if it does not help us find
4543   // what we need immediately.
4544   // So if it was a primary super, we can just fail immediately.
4545   // Otherwise, it's the slow path for us (no success at this point).
4546 
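       // Roughly, combined with the compare above (illustrative only):
       //   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass)
       //     goto L_success;      // primary-display hit or secondary_super_cache hit
       //   else if (super_check_offset == sc_offset)
       //     goto L_slow_path;    // must scan the secondary supers
       //   else
       //     goto L_failure;      // a miss in the primary display is decisive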
4547   if (super_check_offset.is_register()) {
4548     local_jcc(Assembler::equal, *L_success);
4549     cmpl(super_check_offset.as_register(), sc_offset);
4550     if (L_failure == &L_fallthrough) {
4551       local_jcc(Assembler::equal, *L_slow_path);
4552     } else {
4553       local_jcc(Assembler::notEqual, *L_failure);
4554       final_jmp(*L_slow_path);
4555     }
4556   } else if (super_check_offset.as_constant() == sc_offset) {
4557     // Need a slow path; fast failure is impossible.
4558     if (L_slow_path == &L_fallthrough) {
4559       local_jcc(Assembler::equal, *L_success);
4560     } else {
4561       local_jcc(Assembler::notEqual, *L_slow_path);
4562       final_jmp(*L_success);
4563     }
4564   } else {
4565     // No slow path; it's a fast decision.
4566     if (L_failure == &L_fallthrough) {
4567       local_jcc(Assembler::equal, *L_success);
4568     } else {
4569       local_jcc(Assembler::notEqual, *L_failure);
4570       final_jmp(*L_success);
4571     }
4572   }
4573 
4574   bind(L_fallthrough);
4575 
4576 #undef local_jcc
4577 #undef final_jmp
4578 }
4579 
4580 
4581 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4582                                                    Register super_klass,
4583                                                    Register temp_reg,
4584                                                    Register temp2_reg,
4585                                                    Label* L_success,
4586                                                    Label* L_failure,
4587                                                    bool set_cond_codes) {
4588   assert_different_registers(sub_klass, super_klass, temp_reg);
4589   if (temp2_reg != noreg)
4590     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4591 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4592 
4593   Label L_fallthrough;
4594   int label_nulls = 0;
4595   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4596   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4597   assert(label_nulls <= 1, "at most one NULL in the batch");
4598 
4599   // a couple of useful fields in sub_klass:
4600   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4601   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4602   Address secondary_supers_addr(sub_klass, ss_offset);
4603   Address super_cache_addr(     sub_klass, sc_offset);
4604 
4605   // Do a linear scan of the secondary super-klass chain.
4606   // This code is rarely used, so simplicity is a virtue here.
4607   // The repne_scan instruction uses fixed registers, which we must spill.
4608   // Don't worry too much about pre-existing connections with the input regs.
4609 
4610   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4611   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4612 
4613   // Get super_klass value into rax (even if it was in rdi or rcx).
4614   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4615   if (super_klass != rax || UseCompressedOops) {
4616     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4617     mov(rax, super_klass);
4618   }
4619   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4620   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4621 
4622 #ifndef PRODUCT
4623   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4624   ExternalAddress pst_counter_addr((address) pst_counter);
4625   NOT_LP64(  incrementl(pst_counter_addr) );
4626   LP64_ONLY( lea(rcx, pst_counter_addr) );
4627   LP64_ONLY( incrementl(Address(rcx, 0)) );
4628 #endif //PRODUCT
4629 
4630   // We will consult the secondary-super array.
4631   movptr(rdi, secondary_supers_addr);
4632   // Load the array length.  (Positive movl does right thing on LP64.)
4633   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4634   // Skip to start of data.
4635   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4636 
4637   // Scan RCX words at [RDI] for an occurrence of RAX.
4638   // Set NZ/Z based on last compare.
4639   // The Z flag will not be set by 'repne' itself if RCX == 0, since 'repne' does
4640   // not change flags; only the repeated scas instruction sets them.
4641   // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
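       // In effect (illustrative only):
       //   while (rcx != 0) {
       //     ZF = (*(Klass**)rdi == (Klass*)rax); rdi += wordSize; rcx--;
       //     if (ZF) break;
       //   }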
4642 
4643   testptr(rax, rax); // Set Z = 0
4644   repne_scan();
4645 
4646   // Unspill the temp. registers:
4647   if (pushed_rdi)  pop(rdi);
4648   if (pushed_rcx)  pop(rcx);
4649   if (pushed_rax)  pop(rax);
4650 
4651   if (set_cond_codes) {
4652     // Special hack for the AD files:  rdi is guaranteed non-zero.
4653     assert(!pushed_rdi, "rdi must be left non-NULL");
4654     // Also, the condition codes are properly set Z/NZ on success/failure.
4655   }
4656 
4657   if (L_failure == &L_fallthrough)
4658         jccb(Assembler::notEqual, *L_failure);
4659   else  jcc(Assembler::notEqual, *L_failure);
4660 
4661   // Success.  Cache the super we found and proceed in triumph.
4662   movptr(super_cache_addr, super_klass);
4663 
4664   if (L_success != &L_fallthrough) {
4665     jmp(*L_success);
4666   }
4667 
4668 #undef IS_A_TEMP
4669 
4670   bind(L_fallthrough);
4671 }
4672 
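     // Class initialization barrier: branches to (or falls through at) L_fast_path if
     // 'klass' is fully initialized or is being initialized by the current 'thread',
     // otherwise to L_slow_path.  One of the labels may be NULL, in which case the
     // corresponding path simply falls through.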
4673 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4674   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
4675 
4676   Label L_fallthrough;
4677   if (L_fast_path == NULL) {
4678     L_fast_path = &L_fallthrough;
4679   } else if (L_slow_path == NULL) {
4680     L_slow_path = &L_fallthrough;
4681   }
4682 
4683   // Fast path check: class is fully initialized
4684   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4685   jcc(Assembler::equal, *L_fast_path);
4686 
4687   // Fast path check: current thread is initializer thread
4688   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4689   if (L_slow_path == &L_fallthrough) {
4690     jcc(Assembler::equal, *L_fast_path);
4691     bind(*L_slow_path);
4692   } else if (L_fast_path == &L_fallthrough) {
4693     jcc(Assembler::notEqual, *L_slow_path);
4694     bind(*L_fast_path);
4695   } else {
4696     Unimplemented();
4697   }
4698 }
4699 
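     // cmov32: use cmovl when the CPU supports CMOV, otherwise emulate it with a
     // short conditional branch around a plain movl.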
4700 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4701   if (VM_Version::supports_cmov()) {
4702     cmovl(cc, dst, src);
4703   } else {
4704     Label L;
4705     jccb(negate_condition(cc), L);
4706     movl(dst, src);
4707     bind(L);
4708   }
4709 }
4710 
4711 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4712   if (VM_Version::supports_cmov()) {
4713     cmovl(cc, dst, src);
4714   } else {
4715     Label L;
4716     jccb(negate_condition(cc), L);
4717     movl(dst, src);
4718     bind(L);
4719   }
4720 }
4721 
4722 void MacroAssembler::verify_oop(Register reg, const char* s) {
4723   if (!VerifyOops) return;
4724 
4725   // Pass register number to verify_oop_subroutine
4726   const char* b = NULL;
4727   {
4728     ResourceMark rm;
4729     stringStream ss;
4730     ss.print("verify_oop: %s: %s", reg->name(), s);
4731     b = code_string(ss.as_string());
4732   }
4733   BLOCK_COMMENT("verify_oop {");
4734 #ifdef _LP64
4735   push(rscratch1);                    // save r10, trashed by movptr()
4736 #endif
4737   push(rax);                          // save rax,
4738   push(reg);                          // pass register argument
4739   ExternalAddress buffer((address) b);
4740   // avoid using pushptr, as it modifies scratch registers
4741   // and our contract is not to modify anything
4742   movptr(rax, buffer.addr());
4743   push(rax);
4744   // call indirectly to solve generation ordering problem
4745   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4746   call(rax);
4747   // Caller pops the arguments (oop, message) and restores rax, r10
4748   BLOCK_COMMENT("} verify_oop");
4749 }
4750 
4751 
4752 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
4753                                                       Register tmp,
4754                                                       int offset) {
4755   intptr_t value = *delayed_value_addr;
4756   if (value != 0)
4757     return RegisterOrConstant(value + offset);
4758 
4759   // load indirectly to solve generation ordering problem
4760   movptr(tmp, ExternalAddress((address) delayed_value_addr));
4761 
4762 #ifdef ASSERT
4763   { Label L;
4764     testptr(tmp, tmp);
4765     if (WizardMode) {
4766       const char* buf = NULL;
4767       {
4768         ResourceMark rm;
4769         stringStream ss;
4770         ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
4771         buf = code_string(ss.as_string());
4772       }
4773       jcc(Assembler::notZero, L);
4774       STOP(buf);
4775     } else {
4776       jccb(Assembler::notZero, L);
4777       hlt();
4778     }
4779     bind(L);
4780   }
4781 #endif
4782 
4783   if (offset != 0)
4784     addptr(tmp, offset);
4785 
4786   return RegisterOrConstant(tmp);
4787 }
4788 
4789 
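     // Computes the Address of interpreter expression-stack slot 'arg_slot' (plus
     // 'extra_slot_offset' slots) relative to rsp; the extra wordSize below accounts
     // for the return PC already on the stack.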
4790 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4791                                          int extra_slot_offset) {
4792   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4793   int stackElementSize = Interpreter::stackElementSize;
4794   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4795 #ifdef ASSERT
4796   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4797   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4798 #endif
4799   Register             scale_reg    = noreg;
4800   Address::ScaleFactor scale_factor = Address::no_scale;
4801   if (arg_slot.is_constant()) {
4802     offset += arg_slot.as_constant() * stackElementSize;
4803   } else {
4804     scale_reg    = arg_slot.as_register();
4805     scale_factor = Address::times(stackElementSize);
4806   }
4807   offset += wordSize;           // return PC is on stack
4808   return Address(rsp, scale_reg, scale_factor, offset);
4809 }
4810 
4811 
4812 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
4813   if (!VerifyOops) return;
4814 
4815   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4816   // Pass register number to verify_oop_subroutine
4817   const char* b = NULL;
4818   {
4819     ResourceMark rm;
4820     stringStream ss;
4821     ss.print("verify_oop_addr: %s", s);
4822     b = code_string(ss.as_string());
4823   }
4824 #ifdef _LP64
4825   push(rscratch1);                    // save r10, trashed by movptr()
4826 #endif
4827   push(rax);                          // save rax,
4828   // addr may contain rsp so we will have to adjust it based on the push
4829   // we just did (and on 64 bit we do two pushes)
4830   // NOTE: the 64-bit code seemed to have had a bug in that it did movq(addr, rax),
4831   // which stores rax into addr, i.e. the reverse of what was intended.
4832   if (addr.uses(rsp)) {
4833     lea(rax, addr);
4834     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4835   } else {
4836     pushptr(addr);
4837   }
4838 
4839   ExternalAddress buffer((address) b);
4840   // pass msg argument
4841   // avoid using pushptr, as it modifies scratch registers
4842   // and our contract is not to modify anything
4843   movptr(rax, buffer.addr());
4844   push(rax);
4845 
4846   // call indirectly to solve generation ordering problem
4847   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4848   call(rax);
4849   // Caller pops the arguments (addr, message) and restores rax, r10.
4850 }
4851 
4852 void MacroAssembler::verify_tlab() {
4853 #ifdef ASSERT
4854   if (UseTLAB && VerifyOops) {
4855     Label next, ok;
4856     Register t1 = rsi;
4857     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4858 
4859     push(t1);
4860     NOT_LP64(push(thread_reg));
4861     NOT_LP64(get_thread(thread_reg));
4862 
4863     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4864     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4865     jcc(Assembler::aboveEqual, next);
4866     STOP("assert(top >= start)");
4867     should_not_reach_here();
4868 
4869     bind(next);
4870     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4871     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4872     jcc(Assembler::aboveEqual, ok);
4873     STOP("assert(top <= end)");
4874     should_not_reach_here();
4875 
4876     bind(ok);
4877     NOT_LP64(pop(thread_reg));
4878     pop(t1);
4879   }
4880 #endif
4881 }
4882 
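     // The helper classes below model the CPU state saved by push_CPU_state() so
     // that print_CPU_state() and _verify_FPU() can decode it from C++.  ControlWord,
     // StatusWord and TagWord mirror the corresponding 16-bit x87 registers; only
     // the low 16 bits of each 32-bit slot are significant.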
4883 class ControlWord {
4884  public:
4885   int32_t _value;
4886 
4887   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4888   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4889   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4890   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4891   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4892   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4893   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4894   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4895 
4896   void print() const {
4897     // rounding control
4898     const char* rc;
4899     switch (rounding_control()) {
4900       case 0: rc = "round near"; break;
4901       case 1: rc = "round down"; break;
4902       case 2: rc = "round up  "; break;
4903       case 3: rc = "chop      "; break;
4904     };
4905     // precision control
4906     const char* pc;
4907     switch (precision_control()) {
4908       case 0: pc = "24 bits "; break;
4909       case 1: pc = "reserved"; break;
4910       case 2: pc = "53 bits "; break;
4911       case 3: pc = "64 bits "; break;
4912     };
4913     // flags
4914     char f[9];
4915     f[0] = ' ';
4916     f[1] = ' ';
4917     f[2] = (precision   ()) ? 'P' : 'p';
4918     f[3] = (underflow   ()) ? 'U' : 'u';
4919     f[4] = (overflow    ()) ? 'O' : 'o';
4920     f[5] = (zero_divide ()) ? 'Z' : 'z';
4921     f[6] = (denormalized()) ? 'D' : 'd';
4922     f[7] = (invalid     ()) ? 'I' : 'i';
4923     f[8] = '\x0';
4924     // output
4925     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4926   }
4927 
4928 };
4929 
4930 class StatusWord {
4931  public:
4932   int32_t _value;
4933 
4934   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4935   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4936   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4937   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4938   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4939   int  top() const                     { return  (_value >> 11) & 7      ; }
4940   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4941   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4942   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4943   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4944   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4945   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4946   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4947   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4948 
4949   void print() const {
4950     // condition codes
4951     char c[5];
4952     c[0] = (C3()) ? '3' : '-';
4953     c[1] = (C2()) ? '2' : '-';
4954     c[2] = (C1()) ? '1' : '-';
4955     c[3] = (C0()) ? '0' : '-';
4956     c[4] = '\x0';
4957     // flags
4958     char f[9];
4959     f[0] = (error_status()) ? 'E' : '-';
4960     f[1] = (stack_fault ()) ? 'S' : '-';
4961     f[2] = (precision   ()) ? 'P' : '-';
4962     f[3] = (underflow   ()) ? 'U' : '-';
4963     f[4] = (overflow    ()) ? 'O' : '-';
4964     f[5] = (zero_divide ()) ? 'Z' : '-';
4965     f[6] = (denormalized()) ? 'D' : '-';
4966     f[7] = (invalid     ()) ? 'I' : '-';
4967     f[8] = '\x0';
4968     // output
4969     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4970   }
4971 
4972 };
4973 
4974 class TagWord {
4975  public:
4976   int32_t _value;
4977 
4978   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4979 
4980   void print() const {
4981     printf("%04x", _value & 0xFFFF);
4982   }
4983 
4984 };
4985 
4986 class FPU_Register {
4987  public:
4988   int32_t _m0;
4989   int32_t _m1;
4990   int16_t _ex;
4991 
4992   bool is_indefinite() const           {
4993     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4994   }
4995 
4996   void print() const {
4997     char  sign = (_ex < 0) ? '-' : '+';
4998     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4999     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
5000   };
5001 
5002 };
5003 
5004 class FPU_State {
5005  public:
5006   enum {
5007     register_size       = 10,
5008     number_of_registers =  8,
5009     register_mask       =  7
5010   };
5011 
5012   ControlWord  _control_word;
5013   StatusWord   _status_word;
5014   TagWord      _tag_word;
5015   int32_t      _error_offset;
5016   int32_t      _error_selector;
5017   int32_t      _data_offset;
5018   int32_t      _data_selector;
5019   int8_t       _register[register_size * number_of_registers];
5020 
5021   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
5022   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
5023 
5024   const char* tag_as_string(int tag) const {
5025     switch (tag) {
5026       case 0: return "valid";
5027       case 1: return "zero";
5028       case 2: return "special";
5029       case 3: return "empty";
5030     }
5031     ShouldNotReachHere();
5032     return NULL;
5033   }
5034 
5035   void print() const {
5036     // print computation registers
5037     { int t = _status_word.top();
5038       for (int i = 0; i < number_of_registers; i++) {
5039         int j = (i - t) & register_mask;
5040         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
5041         st(j)->print();
5042         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
5043       }
5044     }
5045     printf("\n");
5046     // print control registers
5047     printf("ctrl = "); _control_word.print(); printf("\n");
5048     printf("stat = "); _status_word .print(); printf("\n");
5049     printf("tags = "); _tag_word    .print(); printf("\n");
5050   }
5051 
5052 };
5053 
5054 class Flag_Register {
5055  public:
5056   int32_t _value;
5057 
5058   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
5059   bool direction() const               { return ((_value >> 10) & 1) != 0; }
5060   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
5061   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
5062   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
5063   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
5064   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
5065 
5066   void print() const {
5067     // flags
5068     char f[8];
5069     f[0] = (overflow       ()) ? 'O' : '-';
5070     f[1] = (direction      ()) ? 'D' : '-';
5071     f[2] = (sign           ()) ? 'S' : '-';
5072     f[3] = (zero           ()) ? 'Z' : '-';
5073     f[4] = (auxiliary_carry()) ? 'A' : '-';
5074     f[5] = (parity         ()) ? 'P' : '-';
5075     f[6] = (carry          ()) ? 'C' : '-';
5076     f[7] = '\x0';
5077     // output
5078     printf("%08x  flags = %s", _value, f);
5079   }
5080 
5081 };
5082 
5083 class IU_Register {
5084  public:
5085   int32_t _value;
5086 
5087   void print() const {
5088     printf("%08x  %11d", _value, _value);
5089   }
5090 
5091 };
5092 
5093 class IU_State {
5094  public:
5095   Flag_Register _eflags;
5096   IU_Register   _rdi;
5097   IU_Register   _rsi;
5098   IU_Register   _rbp;
5099   IU_Register   _rsp;
5100   IU_Register   _rbx;
5101   IU_Register   _rdx;
5102   IU_Register   _rcx;
5103   IU_Register   _rax;
5104 
5105   void print() const {
5106     // computation registers
5107     printf("rax,  = "); _rax.print(); printf("\n");
5108     printf("rbx,  = "); _rbx.print(); printf("\n");
5109     printf("rcx  = "); _rcx.print(); printf("\n");
5110     printf("rdx  = "); _rdx.print(); printf("\n");
5111     printf("rdi  = "); _rdi.print(); printf("\n");
5112     printf("rsi  = "); _rsi.print(); printf("\n");
5113     printf("rbp,  = "); _rbp.print(); printf("\n");
5114     printf("rsp  = "); _rsp.print(); printf("\n");
5115     printf("\n");
5116     // control registers
5117     printf("flgs = "); _eflags.print(); printf("\n");
5118   }
5119 };
5120 
5121 
5122 class CPU_State {
5123  public:
5124   FPU_State _fpu_state;
5125   IU_State  _iu_state;
5126 
5127   void print() const {
5128     printf("--------------------------------------------------\n");
5129     _iu_state .print();
5130     printf("\n");
5131     _fpu_state.print();
5132     printf("--------------------------------------------------\n");
5133   }
5134 
5135 };
5136 
5137 
5138 static void _print_CPU_state(CPU_State* state) {
5139   state->print();
5140 };
5141 
5142 
5143 void MacroAssembler::print_CPU_state() {
5144   push_CPU_state();
5145   push(rsp);                // pass CPU state
5146   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5147   addptr(rsp, wordSize);       // discard argument
5148   pop_CPU_state();
5149 }
5150 
5151 
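     // Runtime helper called from code generated by verify_FPU() below: checks that
     // the saved x87 tag word describes a plausible register stack for the expected
     // 'stack_depth' (for a negative depth only ST(7) must be empty), printing the
     // state and returning false otherwise.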
5152 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
5153   static int counter = 0;
5154   FPU_State* fs = &state->_fpu_state;
5155   counter++;
5156   // For leaf calls, only verify that the top few elements remain empty.
5157   // We only need 1 empty at the top for C2 code.
5158   if( stack_depth < 0 ) {
5159     if( fs->tag_for_st(7) != 3 ) {
5160       printf("FPR7 not empty\n");
5161       state->print();
5162       assert(false, "error");
5163       return false;
5164     }
5165     return true;                // All other stack states do not matter
5166   }
5167 
5168   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
5169          "bad FPU control word");
5170 
5171   // compute stack depth
5172   int i = 0;
5173   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
5174   int d = i;
5175   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
5176   // verify findings
5177   if (i != FPU_State::number_of_registers) {
5178     // stack not contiguous
5179     printf("%s: stack not contiguous at ST%d\n", s, i);
5180     state->print();
5181     assert(false, "error");
5182     return false;
5183   }
5184   // check if computed stack depth corresponds to expected stack depth
5185   if (stack_depth < 0) {
5186     // expected stack depth is -stack_depth or less
5187     if (d > -stack_depth) {
5188       // too many elements on the stack
5189       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
5190       state->print();
5191       assert(false, "error");
5192       return false;
5193     }
5194   } else {
5195     // expected stack depth is stack_depth
5196     if (d != stack_depth) {
5197       // wrong stack depth
5198       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
5199       state->print();
5200       assert(false, "error");
5201       return false;
5202     }
5203   }
5204   // everything is cool
5205   return true;
5206 }
5207 
5208 
5209 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
5210   if (!VerifyFPU) return;
5211   push_CPU_state();
5212   push(rsp);                // pass CPU state
5213   ExternalAddress msg((address) s);
5214   // pass message string s
5215   pushptr(msg.addr());
5216   push(stack_depth);        // pass stack depth
5217   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
5218   addptr(rsp, 3 * wordSize);   // discard arguments
5219   // check for error
5220   { Label L;
5221     testl(rax, rax);
5222     jcc(Assembler::notZero, L);
5223     int3();                  // break if error condition
5224     bind(L);
5225   }
5226   pop_CPU_state();
5227 }
5228 
5229 void MacroAssembler::restore_cpu_control_state_after_jni() {
5230   // Either restore the MXCSR register after returning from the JNI Call
5231   // or verify that it wasn't changed (with -Xcheck:jni flag).
5232   if (VM_Version::supports_sse()) {
5233     if (RestoreMXCSROnJNICalls) {
5234       ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
5235     } else if (CheckJNICalls) {
5236       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5237     }
5238   }
5239   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5240   vzeroupper();
5241   // Reset k1 to 0xffff.
5242 
5243 #ifdef COMPILER2
5244   if (PostLoopMultiversioning && VM_Version::supports_evex()) {
5245     push(rcx);
5246     movl(rcx, 0xffff);
5247     kmovwl(k1, rcx);
5248     pop(rcx);
5249   }
5250 #endif // COMPILER2
5251 
5252 #ifndef _LP64
5253   // Either restore the x87 floating-point control word after returning
5254   // from the JNI call or verify that it wasn't changed.
5255   if (CheckJNICalls) {
5256     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
5257   }
5258 #endif // _LP64
5259 }
5260 
5261 // ((OopHandle)result).resolve();
5262 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5263   assert_different_registers(result, tmp);
5264 
5265   // Only 64 bit platforms support GCs that require a tmp register
5266   // Only IN_HEAP loads require a thread_tmp register
5267   // OopHandle::resolve is an indirection like jobject.
5268   access_load_at(T_OBJECT, IN_NATIVE,
5269                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
5270 }
5271 
5272 // ((WeakHandle)result).resolve();
5273 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
5274   assert_different_registers(rresult, rtmp);
5275   Label resolved;
5276 
5277   // A null weak handle resolves to null.
5278   cmpptr(rresult, 0);
5279   jcc(Assembler::equal, resolved);
5280 
5281   // Only 64 bit platforms support GCs that require a tmp register
5282   // Only IN_HEAP loads require a thread_tmp register
5283   // WeakHandle::resolve is an indirection like jweak.
5284   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
5285                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
5286   bind(resolved);
5287 }
5288 
5289 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5290   // get mirror
5291   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5292   load_method_holder(mirror, method);
5293   movptr(mirror, Address(mirror, mirror_offset));
5294   resolve_oop_handle(mirror, tmp);
5295 }
5296 
5297 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
5298   load_method_holder(rresult, rmethod);
5299   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
5300 }
5301 
5302 void MacroAssembler::load_method_holder(Register holder, Register method) {
5303   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
5304   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
5305   movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
5306 }
5307 
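     // Loads the Klass* of 'src' into 'dst', decoding a narrow klass pointer when
     // compressed class pointers are in use.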
5308 void MacroAssembler::load_klass(Register dst, Register src) {
5309 #ifdef _LP64
5310   if (UseCompressedClassPointers) {
5311     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5312     decode_klass_not_null(dst);
5313   } else
5314 #endif
5315     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5316 }
5317 
5318 void MacroAssembler::load_prototype_header(Register dst, Register src) {
5319   load_klass(dst, src);
5320   movptr(dst, Address(dst, Klass::prototype_header_offset()));
5321 }
5322 
5323 void MacroAssembler::store_klass(Register dst, Register src) {
5324 #ifdef _LP64
5325   if (UseCompressedClassPointers) {
5326     encode_klass_not_null(src);
5327     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5328   } else
5329 #endif
5330     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5331 }
5332 
5333 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5334                                     Register tmp1, Register thread_tmp) {
5335   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5336   decorators = AccessInternal::decorator_fixup(decorators);
5337   bool as_raw = (decorators & AS_RAW) != 0;
5338   if (as_raw) {
5339     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5340   } else {
5341     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5342   }
5343 }
5344 
5345 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
5346                                      Register tmp1, Register tmp2) {
5347   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5348   decorators = AccessInternal::decorator_fixup(decorators);
5349   bool as_raw = (decorators & AS_RAW) != 0;
5350   if (as_raw) {
5351     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
5352   } else {
5353     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
5354   }
5355 }
5356 
5357 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
5358   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
5359   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
5360     decorators |= ACCESS_READ | ACCESS_WRITE;
5361   }
5362   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5363   return bs->resolve(this, decorators, obj);
5364 }
5365 
5366 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
5367                                    Register thread_tmp, DecoratorSet decorators) {
5368   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
5369 }
5370 
5371 // Doesn't do verification, generates fixed size code
5372 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
5373                                             Register thread_tmp, DecoratorSet decorators) {
5374   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
5375 }
5376 
5377 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
5378                                     Register tmp2, DecoratorSet decorators) {
5379   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
5380 }
5381 
5382 // Used for storing NULLs.
5383 void MacroAssembler::store_heap_oop_null(Address dst) {
5384   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
5385 }
5386 
5387 #ifdef _LP64
5388 void MacroAssembler::store_klass_gap(Register dst, Register src) {
5389   if (UseCompressedClassPointers) {
5390     // Store to klass gap in destination
5391     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5392   }
5393 }
5394 
5395 #ifdef ASSERT
5396 void MacroAssembler::verify_heapbase(const char* msg) {
5397   assert (UseCompressedOops, "should be compressed");
5398   assert (Universe::heap() != NULL, "java heap should be initialized");
5399   if (CheckCompressedOops) {
5400     Label ok;
5401     push(rscratch1); // cmpptr trashes rscratch1
5402     cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5403     jcc(Assembler::equal, ok);
5404     STOP(msg);
5405     bind(ok);
5406     pop(rscratch1);
5407   }
5408 }
5409 #endif
5410 
5411 // Algorithm must match oop.inline.hpp encode_heap_oop.
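     // In essence (see CompressedOops::encode):
     //   narrow = (oop == NULL) ? 0 : (oop - CompressedOops::base()) >> CompressedOops::shift()
     // The cmov below keeps NULL mapping to 0 without a branch.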
5412 void MacroAssembler::encode_heap_oop(Register r) {
5413 #ifdef ASSERT
5414   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5415 #endif
5416   verify_oop(r, "broken oop in encode_heap_oop");
5417   if (CompressedOops::base() == NULL) {
5418     if (CompressedOops::shift() != 0) {
5419       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5420       shrq(r, LogMinObjAlignmentInBytes);
5421     }
5422     return;
5423   }
5424   testq(r, r);
5425   cmovq(Assembler::equal, r, r12_heapbase);
5426   subq(r, r12_heapbase);
5427   shrq(r, LogMinObjAlignmentInBytes);
5428 }
5429 
5430 void MacroAssembler::encode_heap_oop_not_null(Register r) {
5431 #ifdef ASSERT
5432   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5433   if (CheckCompressedOops) {
5434     Label ok;
5435     testq(r, r);
5436     jcc(Assembler::notEqual, ok);
5437     STOP("null oop passed to encode_heap_oop_not_null");
5438     bind(ok);
5439   }
5440 #endif
5441   verify_oop(r, "broken oop in encode_heap_oop_not_null");
5442   if (CompressedOops::base() != NULL) {
5443     subq(r, r12_heapbase);
5444   }
5445   if (CompressedOops::shift() != 0) {
5446     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5447     shrq(r, LogMinObjAlignmentInBytes);
5448   }
5449 }
5450 
5451 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5452 #ifdef ASSERT
5453   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5454   if (CheckCompressedOops) {
5455     Label ok;
5456     testq(src, src);
5457     jcc(Assembler::notEqual, ok);
5458     STOP("null oop passed to encode_heap_oop_not_null2");
5459     bind(ok);
5460   }
5461 #endif
5462   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
5463   if (dst != src) {
5464     movq(dst, src);
5465   }
5466   if (CompressedOops::base() != NULL) {
5467     subq(dst, r12_heapbase);
5468   }
5469   if (CompressedOops::shift() != 0) {
5470     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5471     shrq(dst, LogMinObjAlignmentInBytes);
5472   }
5473 }
5474 
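     // Inverse of encode_heap_oop (see CompressedOops::decode):
     //   oop = (narrow == 0) ? NULL : ((uintptr_t)narrow << shift) + CompressedOops::base()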
5475 void  MacroAssembler::decode_heap_oop(Register r) {
5476 #ifdef ASSERT
5477   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5478 #endif
5479   if (CompressedOops::base() == NULL) {
5480     if (CompressedOops::shift() != 0) {
5481       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5482       shlq(r, LogMinObjAlignmentInBytes);
5483     }
5484   } else {
5485     Label done;
5486     shlq(r, LogMinObjAlignmentInBytes);
5487     jccb(Assembler::equal, done);
5488     addq(r, r12_heapbase);
5489     bind(done);
5490   }
5491   verify_oop(r, "broken oop in decode_heap_oop");
5492 }
5493 
5494 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
5495   // Note: it will change flags
5496   assert (UseCompressedOops, "should only be used for compressed headers");
5497   assert (Universe::heap() != NULL, "java heap should be initialized");
5498   // Cannot assert, unverified entry point counts instructions (see .ad file)
5499   // vtableStubs also counts instructions in pd_code_size_limit.
5500   // Also do not verify_oop as this is called by verify_oop.
5501   if (CompressedOops::shift() != 0) {
5502     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5503     shlq(r, LogMinObjAlignmentInBytes);
5504     if (CompressedOops::base() != NULL) {
5505       addq(r, r12_heapbase);
5506     }
5507   } else {
5508     assert (CompressedOops::base() == NULL, "sanity");
5509   }
5510 }
5511 
5512 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5513   // Note: it will change flags
5514   assert (UseCompressedOops, "should only be used for compressed headers");
5515   assert (Universe::heap() != NULL, "java heap should be initialized");
5516   // Cannot assert, unverified entry point counts instructions (see .ad file)
5517   // vtableStubs also counts instructions in pd_code_size_limit.
5518   // Also do not verify_oop as this is called by verify_oop.
5519   if (CompressedOops::shift() != 0) {
5520     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5521     if (LogMinObjAlignmentInBytes == Address::times_8) {
5522       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5523     } else {
5524       if (dst != src) {
5525         movq(dst, src);
5526       }
5527       shlq(dst, LogMinObjAlignmentInBytes);
5528       if (CompressedOops::base() != NULL) {
5529         addq(dst, r12_heapbase);
5530       }
5531     }
5532   } else {
5533     assert (CompressedOops::base() == NULL, "sanity");
5534     if (dst != src) {
5535       movq(dst, src);
5536     }
5537   }
5538 }
5539 
5540 void MacroAssembler::encode_klass_not_null(Register r) {
5541   if (CompressedKlassPointers::base() != NULL) {
5542     // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5543     assert(r != r12_heapbase, "Encoding a klass in r12");
5544     mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base());
5545     subq(r, r12_heapbase);
5546   }
5547   if (CompressedKlassPointers::shift() != 0) {
5548     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5549     shrq(r, LogKlassAlignmentInBytes);
5550   }
5551   if (CompressedKlassPointers::base() != NULL) {
5552     reinit_heapbase();
5553   }
5554 }
5555 
5556 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
5557   if (dst == src) {
5558     encode_klass_not_null(src);
5559   } else {
5560     if (CompressedKlassPointers::base() != NULL) {
5561       mov64(dst, (int64_t)CompressedKlassPointers::base());
5562       negq(dst);
5563       addq(dst, src);
5564     } else {
5565       movptr(dst, src);
5566     }
5567     if (CompressedKlassPointers::shift() != 0) {
5568       assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5569       shrq(dst, LogKlassAlignmentInBytes);
5570     }
5571   }
5572 }
5573 
5574 // Function instr_size_for_decode_klass_not_null() counts the instructions
5575 // generated by decode_klass_not_null(Register r) and reinit_heapbase(),
5576 // when (Universe::heap() != NULL).  Hence, if the instructions they
5577 // generate change, then this method needs to be updated.
5578 int MacroAssembler::instr_size_for_decode_klass_not_null() {
5579   assert (UseCompressedClassPointers, "only for compressed klass ptrs");
5580   if (CompressedKlassPointers::base() != NULL) {
5581     // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
5582     return (CompressedKlassPointers::shift() == 0 ? 20 : 24);
5583   } else {
5584     // longest decode_klass_not_null variant: mov64 + leaq
5585     return 16;
5586   }
5587 }
5588 
5589 // !!! If the instructions that get generated here change then function
5590 // instr_size_for_decode_klass_not_null() needs to get updated.
5591 void  MacroAssembler::decode_klass_not_null(Register r) {
5592   // Note: it will change flags
5593   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5594   assert(r != r12_heapbase, "Decoding a klass in r12");
5595   // Cannot assert, unverified entry point counts instructions (see .ad file)
5596   // vtableStubs also counts instructions in pd_code_size_limit.
5597   // Also do not verify_oop as this is called by verify_oop.
5598   if (CompressedKlassPointers::shift() != 0) {
5599     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5600     shlq(r, LogKlassAlignmentInBytes);
5601   }
5602   // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5603   if (CompressedKlassPointers::base() != NULL) {
5604     mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base());
5605     addq(r, r12_heapbase);
5606     reinit_heapbase();
5607   }
5608 }
5609 
5610 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
5611   // Note: it will change flags
5612   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5613   if (dst == src) {
5614     decode_klass_not_null(dst);
5615   } else {
5616     // Cannot assert, unverified entry point counts instructions (see .ad file)
5617     // vtableStubs also counts instructions in pd_code_size_limit.
5618     // Also do not verify_oop as this is called by verify_oop.
5619     mov64(dst, (int64_t)CompressedKlassPointers::base());
5620     if (CompressedKlassPointers::shift() != 0) {
5621       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5622       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5623       leaq(dst, Address(dst, src, Address::times_8, 0));
5624     } else {
5625       addq(dst, src);
5626     }
5627   }
5628 }
5629 
5630 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5631   assert (UseCompressedOops, "should only be used for compressed headers");
5632   assert (Universe::heap() != NULL, "java heap should be initialized");
5633   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5634   int oop_index = oop_recorder()->find_index(obj);
5635   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5636   mov_narrow_oop(dst, oop_index, rspec);
5637 }
5638 
5639 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5640   assert (UseCompressedOops, "should only be used for compressed headers");
5641   assert (Universe::heap() != NULL, "java heap should be initialized");
5642   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5643   int oop_index = oop_recorder()->find_index(obj);
5644   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5645   mov_narrow_oop(dst, oop_index, rspec);
5646 }
5647 
5648 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5649   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5650   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5651   int klass_index = oop_recorder()->find_index(k);
5652   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5653   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5654 }
5655 
5656 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5657   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5658   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5659   int klass_index = oop_recorder()->find_index(k);
5660   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5661   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5662 }
5663 
5664 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5665   assert (UseCompressedOops, "should only be used for compressed headers");
5666   assert (Universe::heap() != NULL, "java heap should be initialized");
5667   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5668   int oop_index = oop_recorder()->find_index(obj);
5669   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5670   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5671 }
5672 
5673 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5674   assert (UseCompressedOops, "should only be used for compressed headers");
5675   assert (Universe::heap() != NULL, "java heap should be initialized");
5676   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5677   int oop_index = oop_recorder()->find_index(obj);
5678   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5679   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5680 }
5681 
5682 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5683   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5684   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5685   int klass_index = oop_recorder()->find_index(k);
5686   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5687   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5688 }
5689 
5690 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5691   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5692   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5693   int klass_index = oop_recorder()->find_index(k);
5694   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5695   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5696 }
5697 
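     // Reloads r12_heapbase with the current narrow oop/klass base: zero for
     // zero-based compressed oops, otherwise the encoding base; loaded indirectly
     // while the heap is not yet initialized.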
5698 void MacroAssembler::reinit_heapbase() {
5699   if (UseCompressedOops || UseCompressedClassPointers) {
5700     if (Universe::heap() != NULL) {
5701       if (CompressedOops::base() == NULL) {
5702         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5703       } else {
5704         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5705       }
5706     } else {
5707       movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5708     }
5709   }
5710 }
5711 
5712 #endif // _LP64
5713 
5714 // C2 compiled method's prolog code.
5715 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
5716 
5717   // WARNING: Initial instruction MUST be 5 bytes or longer so that
5718   // NativeJump::patch_verified_entry will be able to patch out the entry
5719   // code safely. The push to verify stack depth is ok at 5 bytes,
5720   // the frame allocation can be either 3 or 6 bytes. So if we don't do
5721   // stack bang then we must use the 6 byte frame allocation even if
5722   // we have no frame. :-(
5723   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
5724 
5725   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
5726   // Remove word for return addr
5727   framesize -= wordSize;
5728   stack_bang_size -= wordSize;
5729 
5730   // Calls to C2R adapters often do not accept exceptional returns.
5731   // We require their callers to bang for them.  But be careful, because
5732   // some VM calls (such as call site linkage) can use several kilobytes of
5733   // stack.  But the stack safety zone should account for that.
5734   // See bugs 4446381, 4468289, 4497237.
5735   if (stack_bang_size > 0) {
5736     generate_stack_overflow_check(stack_bang_size);
5737 
5738     // We always push rbp, so that on return to interpreter rbp, will be
5739     // restored correctly and we can correct the stack.
5740     push(rbp);
5741     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5742     if (PreserveFramePointer) {
5743       mov(rbp, rsp);
5744     }
5745     // Remove word for ebp
5746     framesize -= wordSize;
5747 
5748     // Create frame
5749     if (framesize) {
5750       subptr(rsp, framesize);
5751     }
5752   } else {
5753     // Create frame (force generation of a 4 byte immediate value)
5754     subptr_imm32(rsp, framesize);
5755 
5756     // Save RBP register now.
5757     framesize -= wordSize;
5758     movptr(Address(rsp, framesize), rbp);
5759     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5760     if (PreserveFramePointer) {
5761       movptr(rbp, rsp);
5762       if (framesize > 0) {
5763         addptr(rbp, framesize);
5764       }
5765     }
5766   }
5767 
5768   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
5769     framesize -= wordSize;
5770     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5771   }
5772 
5773 #ifndef _LP64
5774   // If method sets FPU control word do it now
5775   if (fp_mode_24b) {
5776     fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
5777   }
5778   if (UseSSE >= 2 && VerifyFPU) {
5779     verify_FPU(0, "FPU stack must be clean on entry");
5780   }
5781 #endif
5782 
5783 #ifdef ASSERT
5784   if (VerifyStackAtCalls) {
5785     Label L;
5786     push(rax);
5787     mov(rax, rsp);
5788     andptr(rax, StackAlignmentInBytes-1);
5789     cmpptr(rax, StackAlignmentInBytes-wordSize);
5790     pop(rax);
5791     jcc(Assembler::equal, L);
5792     STOP("Stack is not properly aligned!");
5793     bind(L);
5794   }
5795 #endif
5796 
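       // Emit the nmethod entry barrier for the current barrier set (this may be a
       // no-op when the GC does not require one); stubs skip it.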
5797   if (!is_stub) {
5798     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5799     bs->nmethod_entry_barrier(this);
5800   }
5801 }
5802 
5803 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
5804 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) {
5805   // cnt - number of qwords (8-byte words).
5806   // base - start address, qword aligned.
5807   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5808   if (UseAVX >= 2) {
5809     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5810   } else {
5811     pxor(xtmp, xtmp);
5812   }
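       // Enter the loop at its size check: the main loop below clears 64 bytes
       // (8 qwords) per iteration, and the 32-byte and single-qword tails handle
       // the remainder.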
5813   jmp(L_zero_64_bytes);
5814 
5815   BIND(L_loop);
5816   if (UseAVX >= 2) {
5817     vmovdqu(Address(base,  0), xtmp);
5818     vmovdqu(Address(base, 32), xtmp);
5819   } else {
5820     movdqu(Address(base,  0), xtmp);
5821     movdqu(Address(base, 16), xtmp);
5822     movdqu(Address(base, 32), xtmp);
5823     movdqu(Address(base, 48), xtmp);
5824   }
5825   addptr(base, 64);
5826 
5827   BIND(L_zero_64_bytes);
5828   subptr(cnt, 8);
5829   jccb(Assembler::greaterEqual, L_loop);
5830   addptr(cnt, 4);
5831   jccb(Assembler::less, L_tail);
5832   // Copy trailing 32 bytes
5833   if (UseAVX >= 2) {
5834     vmovdqu(Address(base, 0), xtmp);
5835   } else {
5836     movdqu(Address(base,  0), xtmp);
5837     movdqu(Address(base, 16), xtmp);
5838   }
5839   addptr(base, 32);
5840   subptr(cnt, 4);
5841 
5842   BIND(L_tail);
5843   addptr(cnt, 4);
5844   jccb(Assembler::lessEqual, L_end);
5845   decrement(cnt);
5846 
5847   BIND(L_sloop);
5848   movq(Address(base, 0), xtmp);
5849   addptr(base, 8);
5850   decrement(cnt);
5851   jccb(Assembler::greaterEqual, L_sloop);
5852   BIND(L_end);
5853 }
5854 
5855 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) {
5856   // cnt - number of qwords (8-byte words).
5857   // base - start address, qword aligned.
5858   // is_large - if optimizers know cnt is larger than InitArrayShortSize
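       // Unless is_large, counts up to InitArrayShortSize are handled with a simple
       // store loop; everything else uses rep stosb, the XMM clearing helper, or
       // rep stos, depending on UseFastStosb and UseXMMForObjInit.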
5859   assert(base==rdi, "base register must be edi for rep stos");
5860   assert(tmp==rax,   "tmp register must be eax for rep stos");
5861   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
5862   assert(InitArrayShortSize % BytesPerLong == 0,
5863     "InitArrayShortSize should be the multiple of BytesPerLong");
5864 
5865   Label DONE;
5866 
5867   if (!is_large || !UseXMMForObjInit) {
5868     xorptr(tmp, tmp);
5869   }
5870 
5871   if (!is_large) {
5872     Label LOOP, LONG;
5873     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5874     jccb(Assembler::greater, LONG);
5875 
5876     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5877 
5878     decrement(cnt);
5879     jccb(Assembler::negative, DONE); // Zero length
5880 
5881     // Use individual pointer-sized stores for small counts:
5882     BIND(LOOP);
5883     movptr(Address(base, cnt, Address::times_ptr), tmp);
5884     decrement(cnt);
5885     jccb(Assembler::greaterEqual, LOOP);
5886     jmpb(DONE);
5887 
5888     BIND(LONG);
5889   }
5890 
5891   // Use longer rep-prefixed ops for non-small counts:
5892   if (UseFastStosb) {
5893     shlptr(cnt, 3); // convert to number of bytes
5894     rep_stosb();
5895   } else if (UseXMMForObjInit) {
5896     movptr(tmp, base);
5897     xmm_clear_mem(tmp, cnt, xtmp);
5898   } else {
5899     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5900     rep_stos();
5901   }
5902 
5903   BIND(DONE);
5904 }
5905 
5906 #ifdef COMPILER2
5907 
5908 // IndexOf for constant substrings with size >= 8 chars
5909 // which don't need to be loaded through stack.
5910 void MacroAssembler::string_indexofC8(Register str1, Register str2,
5911                                       Register cnt1, Register cnt2,
5912                                       int int_cnt2,  Register result,
5913                                       XMMRegister vec, Register tmp,
5914                                       int ae) {
5915   ShortBranchVerifier sbv(this);
5916   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
5917   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
5918 
5919   // This method uses the pcmpestri instruction with bound registers
5920   //   inputs:
5921   //     xmm - substring
5922   //     rax - substring length (elements count)
5923   //     mem - scanned string
5924   //     rdx - string length (elements count)
5925   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
5926   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
5927   //   outputs:
5928   //     rcx - matched index in string
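       //  In this equal-ordered mode pcmpestri sets CF=1 when a candidate match
       //  position exists (its index is returned in rcx) and OF=1 when a match
       //  starts at element 0; the branches below test exactly these flags.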
5929   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
5930   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
5931   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
5932   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
5933   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
5934 
5935   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
5936         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
5937         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
5938 
5939   // Note, inline_string_indexOf() generates checks:
5940   // if (substr.count > string.count) return -1;
5941   // if (substr.count == 0) return 0;
5942   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
5943 
5944   // Load substring.
5945   if (ae == StrIntrinsicNode::UL) {
5946     pmovzxbw(vec, Address(str2, 0));
5947   } else {
5948     movdqu(vec, Address(str2, 0));
5949   }
5950   movl(cnt2, int_cnt2);
5951   movptr(result, str1); // string addr
5952 
5953   if (int_cnt2 > stride) {
5954     jmpb(SCAN_TO_SUBSTR);
5955 
5956     // Reload substr for rescan, this code
5957     // is executed only for large substrings (> 8 chars)
5958     bind(RELOAD_SUBSTR);
5959     if (ae == StrIntrinsicNode::UL) {
5960       pmovzxbw(vec, Address(str2, 0));
5961     } else {
5962       movdqu(vec, Address(str2, 0));
5963     }
5964     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
5965 
5966     bind(RELOAD_STR);
5967     // We came here after the beginning of the substring was
5968     // matched but the rest of it was not so we need to search
5969     // again. Start from the next element after the previous match.
5970 
5971     // cnt2 is the number of remaining substring elements and
5972     // cnt1 is the number of remaining string elements when the cmp failed.
5973     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
5974     subl(cnt1, cnt2);
5975     addl(cnt1, int_cnt2);
5976     movl(cnt2, int_cnt2); // Now restore cnt2
5977 
5978     decrementl(cnt1);     // Shift to next element
5979     cmpl(cnt1, cnt2);
5980     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
5981 
5982     addptr(result, (1<<scale1));
5983 
5984   } // (int_cnt2 > 8)
5985 
5986   // Scan string for start of substr in 16-byte vectors
5987   bind(SCAN_TO_SUBSTR);
5988   pcmpestri(vec, Address(result, 0), mode);
5989   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
5990   subl(cnt1, stride);
5991   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
5992   cmpl(cnt1, cnt2);
5993   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
5994   addptr(result, 16);
5995   jmpb(SCAN_TO_SUBSTR);
5996 
5997   // Found a potential substr
5998   bind(FOUND_CANDIDATE);
5999   // Matched whole vector if first element matched (tmp(rcx) == 0).
6000   if (int_cnt2 == stride) {
6001     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
6002   } else { // int_cnt2 > 8
6003     jccb(Assembler::overflow, FOUND_SUBSTR);
6004   }
6005   // After pcmpestri tmp(rcx) contains matched element index
6006   // Compute start addr of substr
6007   lea(result, Address(result, tmp, scale1));
6008 
6009   // Make sure string is still long enough
6010   subl(cnt1, tmp);
6011   cmpl(cnt1, cnt2);
6012   if (int_cnt2 == stride) {
6013     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6014   } else { // int_cnt2 > 8
6015     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
6016   }
6017   // Left less than substring.
6018 
6019   bind(RET_NOT_FOUND);
6020   movl(result, -1);
6021   jmp(EXIT);
6022 
6023   if (int_cnt2 > stride) {
6024     // This code is optimized for the case when whole substring
6025     // is matched if its head is matched.
6026     bind(MATCH_SUBSTR_HEAD);
6027     pcmpestri(vec, Address(result, 0), mode);
6028     // Reload only the string if it does not match
6029     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
6030 
6031     Label CONT_SCAN_SUBSTR;
6032     // Compare the rest of substring (> 8 chars).
6033     bind(FOUND_SUBSTR);
6034     // First 8 chars are already matched.
6035     negptr(cnt2);
6036     addptr(cnt2, stride);
6037 
6038     bind(SCAN_SUBSTR);
6039     subl(cnt1, stride);
6040     cmpl(cnt2, -stride); // Do not read beyond substring
6041     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
6042     // Back-up strings to avoid reading beyond substring:
6043     // cnt1 = cnt1 - cnt2 + 8
6044     addl(cnt1, cnt2); // cnt2 is negative
6045     addl(cnt1, stride);
6046     movl(cnt2, stride); negptr(cnt2);
6047     bind(CONT_SCAN_SUBSTR);
6048     if (int_cnt2 < (int)G) {
6049       int tail_off1 = int_cnt2<<scale1;
6050       int tail_off2 = int_cnt2<<scale2;
6051       if (ae == StrIntrinsicNode::UL) {
6052         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
6053       } else {
6054         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
6055       }
6056       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
6057     } else {
6058       // calculate index in register to avoid integer overflow (int_cnt2*2)
6059       movl(tmp, int_cnt2);
6060       addptr(tmp, cnt2);
6061       if (ae == StrIntrinsicNode::UL) {
6062         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
6063       } else {
6064         movdqu(vec, Address(str2, tmp, scale2, 0));
6065       }
6066       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
6067     }
6068     // Need to reload strings pointers if not matched whole vector
6069     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6070     addptr(cnt2, stride);
6071     jcc(Assembler::negative, SCAN_SUBSTR);
6072     // Fall through if found full substring
6073 
6074   } // (int_cnt2 > 8)
6075 
6076   bind(RET_FOUND);
6077   // Found result if we matched full small substring.
6078   // Compute substr offset
6079   subptr(result, str1);
6080   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6081     shrl(result, 1); // index
6082   }
6083   bind(EXIT);
6084 
6085 } // string_indexofC8
6086 
6087 // Small strings are loaded through the stack if they cross a page boundary.
6088 void MacroAssembler::string_indexof(Register str1, Register str2,
6089                                     Register cnt1, Register cnt2,
6090                                     int int_cnt2,  Register result,
6091                                     XMMRegister vec, Register tmp,
6092                                     int ae) {
6093   ShortBranchVerifier sbv(this);
6094   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6095   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
6096 
6097   //
6098   // int_cnt2 is length of small (< 8 chars) constant substring
6099   // or (-1) for non constant substring in which case its length
6100   // is in cnt2 register.
6101   //
6102   // Note, inline_string_indexOf() generates checks:
6103   // if (substr.count > string.count) return -1;
6104   // if (substr.count == 0) return 0;
6105   //
6106   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
6107   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
6108   // This method uses the pcmpestri instruction with bound registers
6109   //   inputs:
6110   //     xmm - substring
6111   //     rax - substring length (elements count)
6112   //     mem - scanned string
6113   //     rdx - string length (elements count)
6114   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6115   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
6116   //   outputs:
6117   //     rcx - matched index in string
6118   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6119   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
6120   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
6121   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
6122 
6123   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
6124         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
6125         FOUND_CANDIDATE;
6126 
6127   { //========================================================
6128     // We don't know where these strings are located
6129     // and we can't read beyond them. Load them through the stack.
6130     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
6131 
6132     movptr(tmp, rsp); // save old SP
6133 
6134     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
6135       if (int_cnt2 == (1>>scale2)) { // One byte
6136         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
6137         load_unsigned_byte(result, Address(str2, 0));
6138         movdl(vec, result); // move 32 bits
6139       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
6140         // Not enough header space in 32-bit VM: 12+3 = 15.
6141         movl(result, Address(str2, -1));
6142         shrl(result, 8);
6143         movdl(vec, result); // move 32 bits
6144       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
6145         load_unsigned_short(result, Address(str2, 0));
6146         movdl(vec, result); // move 32 bits
6147       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
6148         movdl(vec, Address(str2, 0)); // move 32 bits
6149       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
6150         movq(vec, Address(str2, 0));  // move 64 bits
6151       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
6152         // Array header size is 12 bytes in 32-bit VM
6153         // + 6 bytes for 3 chars == 18 bytes,
6154         // enough space to load vec and shift.
6155         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
6156         if (ae == StrIntrinsicNode::UL) {
6157           int tail_off = int_cnt2-8;
6158           pmovzxbw(vec, Address(str2, tail_off));
6159           psrldq(vec, -2*tail_off);
6160         }
6161         else {
6162           int tail_off = int_cnt2*(1<<scale2);
6163           movdqu(vec, Address(str2, tail_off-16));
6164           psrldq(vec, 16-tail_off);
6165         }
6166       }
6167     } else { // not constant substring
6168       cmpl(cnt2, stride);
6169       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
6170 
6171       // We can read beyond the string if str+16 does not cross a page boundary
6172       // since heaps are aligned and mapped by pages.
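           // For example, with a 4096-byte page a 16-byte read is safe whenever
           // (address & (page_size-1)) <= page_size-16, which is what is checked below.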
6173       assert(os::vm_page_size() < (int)G, "default page should be small");
6174       movl(result, str2); // We need only low 32 bits
6175       andl(result, (os::vm_page_size()-1));
6176       cmpl(result, (os::vm_page_size()-16));
6177       jccb(Assembler::belowEqual, CHECK_STR);
6178 
6179       // Move small strings to the stack to allow loading 16 bytes into vec.
6180       subptr(rsp, 16);
6181       int stk_offset = wordSize-(1<<scale2);
6182       push(cnt2);
6183 
6184       bind(COPY_SUBSTR);
6185       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
6186         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
6187         movb(Address(rsp, cnt2, scale2, stk_offset), result);
6188       } else if (ae == StrIntrinsicNode::UU) {
6189         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
6190         movw(Address(rsp, cnt2, scale2, stk_offset), result);
6191       }
6192       decrement(cnt2);
6193       jccb(Assembler::notZero, COPY_SUBSTR);
6194 
6195       pop(cnt2);
6196       movptr(str2, rsp);  // New substring address
6197     } // non constant
6198 
6199     bind(CHECK_STR);
6200     cmpl(cnt1, stride);
6201     jccb(Assembler::aboveEqual, BIG_STRINGS);
6202 
6203     // Check cross page boundary.
6204     movl(result, str1); // We need only low 32 bits
6205     andl(result, (os::vm_page_size()-1));
6206     cmpl(result, (os::vm_page_size()-16));
6207     jccb(Assembler::belowEqual, BIG_STRINGS);
6208 
6209     subptr(rsp, 16);
6210     int stk_offset = -(1<<scale1);
6211     if (int_cnt2 < 0) { // not constant
6212       push(cnt2);
6213       stk_offset += wordSize;
6214     }
6215     movl(cnt2, cnt1);
6216 
6217     bind(COPY_STR);
6218     if (ae == StrIntrinsicNode::LL) {
6219       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
6220       movb(Address(rsp, cnt2, scale1, stk_offset), result);
6221     } else {
6222       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
6223       movw(Address(rsp, cnt2, scale1, stk_offset), result);
6224     }
6225     decrement(cnt2);
6226     jccb(Assembler::notZero, COPY_STR);
6227 
6228     if (int_cnt2 < 0) { // not constant
6229       pop(cnt2);
6230     }
6231     movptr(str1, rsp);  // New string address
6232 
6233     bind(BIG_STRINGS);
6234     // Load substring.
6235     if (int_cnt2 < 0) { // -1
6236       if (ae == StrIntrinsicNode::UL) {
6237         pmovzxbw(vec, Address(str2, 0));
6238       } else {
6239         movdqu(vec, Address(str2, 0));
6240       }
6241       push(cnt2);       // substr count
6242       push(str2);       // substr addr
6243       push(str1);       // string addr
6244     } else {
6245       // Small (< 8 chars) constant substrings are loaded already.
6246       movl(cnt2, int_cnt2);
6247     }
6248     push(tmp);  // original SP
6249 
6250   } // Finished loading
6251 
6252   //========================================================
6253   // Start search
6254   //
6255 
6256   movptr(result, str1); // string addr
6257 
6258   if (int_cnt2  < 0) {  // Only for non constant substring
6259     jmpb(SCAN_TO_SUBSTR);
6260 
6261     // SP saved at sp+0
6262     // String saved at sp+1*wordSize
6263     // Substr saved at sp+2*wordSize
6264     // Substr count saved at sp+3*wordSize
6265 
6266     // Reload substr for rescan, this code
6267     // is executed only for large substrings (> 8 chars)
6268     bind(RELOAD_SUBSTR);
6269     movptr(str2, Address(rsp, 2*wordSize));
6270     movl(cnt2, Address(rsp, 3*wordSize));
6271     if (ae == StrIntrinsicNode::UL) {
6272       pmovzxbw(vec, Address(str2, 0));
6273     } else {
6274       movdqu(vec, Address(str2, 0));
6275     }
6276     // We came here after the beginning of the substring was
6277     // matched but the rest of it was not so we need to search
6278     // again. Start from the next element after the previous match.
6279     subptr(str1, result); // Restore counter
6280     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6281       shrl(str1, 1);
6282     }
6283     addl(cnt1, str1);
6284     decrementl(cnt1);   // Shift to next element
6285     cmpl(cnt1, cnt2);
6286     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6287 
6288     addptr(result, (1<<scale1));
6289   } // non constant
6290 
6291   // Scan string for start of substr in 16-byte vectors
6292   bind(SCAN_TO_SUBSTR);
6293   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6294   pcmpestri(vec, Address(result, 0), mode);
6295   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6296   subl(cnt1, stride);
6297   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6298   cmpl(cnt1, cnt2);
6299   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6300   addptr(result, 16);
6301 
6302   bind(ADJUST_STR);
6303   cmpl(cnt1, stride); // Do not read beyond string
6304   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6305   // Back-up string to avoid reading beyond string.
6306   lea(result, Address(result, cnt1, scale1, -16));
6307   movl(cnt1, stride);
6308   jmpb(SCAN_TO_SUBSTR);
6309 
6310   // Found a potential substr
6311   bind(FOUND_CANDIDATE);
6312   // After pcmpestri tmp(rcx) contains matched element index
6313 
6314   // Make sure string is still long enough
6315   subl(cnt1, tmp);
6316   cmpl(cnt1, cnt2);
6317   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
6318   // Left less than substring.
6319 
6320   bind(RET_NOT_FOUND);
6321   movl(result, -1);
6322   jmp(CLEANUP);
6323 
6324   bind(FOUND_SUBSTR);
6325   // Compute start addr of substr
6326   lea(result, Address(result, tmp, scale1));
6327   if (int_cnt2 > 0) { // Constant substring
6328     // Repeat search for small substring (< 8 chars)
6329     // from new point without reloading substring.
6330     // Have to check that we don't read beyond string.
6331     cmpl(tmp, stride-int_cnt2);
6332     jccb(Assembler::greater, ADJUST_STR);
6333     // Fall through if matched whole substring.
6334   } else { // non constant
6335     assert(int_cnt2 == -1, "should be != 0");
6336 
6337     addl(tmp, cnt2);
6338     // Found result if we matched whole substring.
6339     cmpl(tmp, stride);
6340     jcc(Assembler::lessEqual, RET_FOUND);
6341 
6342     // Repeat search for small substring (<= 8 chars)
6343     // from new point 'str1' without reloading substring.
6344     cmpl(cnt2, stride);
6345     // Have to check that we don't read beyond string.
6346     jccb(Assembler::lessEqual, ADJUST_STR);
6347 
6348     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
6349     // Compare the rest of substring (> 8 chars).
6350     movptr(str1, result);
6351 
6352     cmpl(tmp, cnt2);
6353     // First 8 chars are already matched.
6354     jccb(Assembler::equal, CHECK_NEXT);
6355 
6356     bind(SCAN_SUBSTR);
6357     pcmpestri(vec, Address(str1, 0), mode);
6358     // Need to reload strings pointers if not matched whole vector
6359     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6360 
6361     bind(CHECK_NEXT);
6362     subl(cnt2, stride);
6363     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
6364     addptr(str1, 16);
6365     if (ae == StrIntrinsicNode::UL) {
6366       addptr(str2, 8);
6367     } else {
6368       addptr(str2, 16);
6369     }
6370     subl(cnt1, stride);
6371     cmpl(cnt2, stride); // Do not read beyond substring
6372     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
6373     // Back-up strings to avoid reading beyond substring.
6374 
6375     if (ae == StrIntrinsicNode::UL) {
6376       lea(str2, Address(str2, cnt2, scale2, -8));
6377       lea(str1, Address(str1, cnt2, scale1, -16));
6378     } else {
6379       lea(str2, Address(str2, cnt2, scale2, -16));
6380       lea(str1, Address(str1, cnt2, scale1, -16));
6381     }
6382     subl(cnt1, cnt2);
6383     movl(cnt2, stride);
6384     addl(cnt1, stride);
6385     bind(CONT_SCAN_SUBSTR);
6386     if (ae == StrIntrinsicNode::UL) {
6387       pmovzxbw(vec, Address(str2, 0));
6388     } else {
6389       movdqu(vec, Address(str2, 0));
6390     }
6391     jmp(SCAN_SUBSTR);
6392 
6393     bind(RET_FOUND_LONG);
6394     movptr(str1, Address(rsp, wordSize));
6395   } // non constant
6396 
6397   bind(RET_FOUND);
6398   // Compute substr offset
6399   subptr(result, str1);
6400   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6401     shrl(result, 1); // index
6402   }
6403   bind(CLEANUP);
6404   pop(rsp); // restore SP
6405 
6406 } // string_indexof
6407 
6408 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
6409                                          XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
6410   ShortBranchVerifier sbv(this);
6411   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6412 
6413   int stride = 8;
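       // stride is in UTF-16 chars: 8 chars per 16-byte XMM vector; the AVX2 loop
       // below consumes 2*stride chars (32 bytes) per iteration.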
6414 
6415   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
6416         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
6417         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
6418         FOUND_SEQ_CHAR, DONE_LABEL;
6419 
6420   movptr(result, str1);
6421   if (UseAVX >= 2) {
6422     cmpl(cnt1, stride);
6423     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
6424     cmpl(cnt1, 2*stride);
6425     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
6426     movdl(vec1, ch);
6427     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
6428     vpxor(vec2, vec2);
6429     movl(tmp, cnt1);
6430     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
6431     andl(cnt1,0x0000000F);  //tail count (in chars)
6432 
6433     bind(SCAN_TO_16_CHAR_LOOP);
6434     vmovdqu(vec3, Address(result, 0));
6435     vpcmpeqw(vec3, vec3, vec1, 1);
6436     vptest(vec2, vec3);
6437     jcc(Assembler::carryClear, FOUND_CHAR);
6438     addptr(result, 32);
6439     subl(tmp, 2*stride);
6440     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
6441     jmp(SCAN_TO_8_CHAR);
6442     bind(SCAN_TO_8_CHAR_INIT);
6443     movdl(vec1, ch);
6444     pshuflw(vec1, vec1, 0x00);
6445     pshufd(vec1, vec1, 0);
6446     pxor(vec2, vec2);
6447   }
6448   bind(SCAN_TO_8_CHAR);
6449   cmpl(cnt1, stride);
6450   if (UseAVX >= 2) {
6451     jcc(Assembler::less, SCAN_TO_CHAR);
6452   } else {
6453     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
6454     movdl(vec1, ch);
6455     pshuflw(vec1, vec1, 0x00);
6456     pshufd(vec1, vec1, 0);
6457     pxor(vec2, vec2);
6458   }
6459   movl(tmp, cnt1);
6460   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
6461   andl(cnt1,0x00000007);  //tail count (in chars)
6462 
6463   bind(SCAN_TO_8_CHAR_LOOP);
6464   movdqu(vec3, Address(result, 0));
6465   pcmpeqw(vec3, vec1);
6466   ptest(vec2, vec3);
6467   jcc(Assembler::carryClear, FOUND_CHAR);
6468   addptr(result, 16);
6469   subl(tmp, stride);
6470   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
6471   bind(SCAN_TO_CHAR);
6472   testl(cnt1, cnt1);
6473   jcc(Assembler::zero, RET_NOT_FOUND);
6474   bind(SCAN_TO_CHAR_LOOP);
6475   load_unsigned_short(tmp, Address(result, 0));
6476   cmpl(ch, tmp);
6477   jccb(Assembler::equal, FOUND_SEQ_CHAR);
6478   addptr(result, 2);
6479   subl(cnt1, 1);
6480   jccb(Assembler::zero, RET_NOT_FOUND);
6481   jmp(SCAN_TO_CHAR_LOOP);
6482 
6483   bind(RET_NOT_FOUND);
6484   movl(result, -1);
6485   jmpb(DONE_LABEL);
6486 
6487   bind(FOUND_CHAR);
6488   if (UseAVX >= 2) {
6489     vpmovmskb(tmp, vec3);
6490   } else {
6491     pmovmskb(tmp, vec3);
6492   }
6493   bsfl(ch, tmp);
6494   addl(result, ch);
6495 
6496   bind(FOUND_SEQ_CHAR);
6497   subptr(result, str1);
6498   shrl(result, 1);
6499 
6500   bind(DONE_LABEL);
6501 } // string_indexof_char
6502 
6503 // helper function for string_compare
6504 void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
6505                                         Address::ScaleFactor scale, Address::ScaleFactor scale1,
6506                                         Address::ScaleFactor scale2, Register index, int ae) {
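       // For the mixed LU/UL encodings this code treats str1 as the Latin-1 (byte)
       // string and str2 as the UTF-16 (short) string, hence the byte/short pair
       // in the last branch.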
6507   if (ae == StrIntrinsicNode::LL) {
6508     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
6509     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
6510   } else if (ae == StrIntrinsicNode::UU) {
6511     load_unsigned_short(elem1, Address(str1, index, scale, 0));
6512     load_unsigned_short(elem2, Address(str2, index, scale, 0));
6513   } else {
6514     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
6515     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
6516   }
6517 }
6518 
6519 // Compare strings, used for char[] and byte[].
6520 void MacroAssembler::string_compare(Register str1, Register str2,
6521                                     Register cnt1, Register cnt2, Register result,
6522                                     XMMRegister vec1, int ae) {
6523   ShortBranchVerifier sbv(this);
6524   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
6525   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
6526   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
6527   int stride2x2 = 0x40;
6528   Address::ScaleFactor scale = Address::no_scale;
6529   Address::ScaleFactor scale1 = Address::no_scale;
6530   Address::ScaleFactor scale2 = Address::no_scale;
6531 
6532   if (ae != StrIntrinsicNode::LL) {
6533     stride2x2 = 0x20;
6534   }
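       // stride2x2 is the element count consumed per 512-bit AVX-512 compare:
       // 64 byte elements for LL, 32 char elements otherwise.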
6535 
6536   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
6537     shrl(cnt2, 1);
6538   }
6539   // Compute the minimum of the string lengths and the
6540   // difference of the string lengths (stack).
6541   // Do the conditional move stuff
6542   movl(result, cnt1);
6543   subl(cnt1, cnt2);
6544   push(cnt1);
6545   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
6546 
6547   // Is the minimum length zero?
6548   testl(cnt2, cnt2);
6549   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6550   if (ae == StrIntrinsicNode::LL) {
6551     // Load first bytes
6552     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
6553     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
6554   } else if (ae == StrIntrinsicNode::UU) {
6555     // Load first characters
6556     load_unsigned_short(result, Address(str1, 0));
6557     load_unsigned_short(cnt1, Address(str2, 0));
6558   } else {
6559     load_unsigned_byte(result, Address(str1, 0));
6560     load_unsigned_short(cnt1, Address(str2, 0));
6561   }
6562   subl(result, cnt1);
6563   jcc(Assembler::notZero,  POP_LABEL);
6564 
6565   if (ae == StrIntrinsicNode::UU) {
6566     // Divide length by 2 to get number of chars
6567     shrl(cnt2, 1);
6568   }
6569   cmpl(cnt2, 1);
6570   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6571 
6572   // Check if the strings start at the same location and setup scale and stride
6573   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6574     cmpptr(str1, str2);
6575     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6576     if (ae == StrIntrinsicNode::LL) {
6577       scale = Address::times_1;
6578       stride = 16;
6579     } else {
6580       scale = Address::times_2;
6581       stride = 8;
6582     }
6583   } else {
6584     scale1 = Address::times_1;
6585     scale2 = Address::times_2;
6586     // scale not used
6587     stride = 8;
6588   }
6589 
6590   if (UseAVX >= 2 && UseSSE42Intrinsics) {
6591     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
6592     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
6593     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
6594     Label COMPARE_TAIL_LONG;
6595     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
6596 
6597     int pcmpmask = 0x19;
6598     if (ae == StrIntrinsicNode::LL) {
6599       pcmpmask &= ~0x01;
6600     }
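         // pcmpmask 0x19 = 0b11001: unsigned shorts (bits 1:0, cleared to bytes above
         // for LL), 'equal each' aggregation (bits 3:2), and a negated result (bit 4)
         // so that rcx reports the first mismatching element.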
6601 
6602     // Setup to compare 16-chars (32-bytes) vectors,
6603     // start from first character again because it has aligned address.
6604     if (ae == StrIntrinsicNode::LL) {
6605       stride2 = 32;
6606     } else {
6607       stride2 = 16;
6608     }
6609     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6610       adr_stride = stride << scale;
6611     } else {
6612       adr_stride1 = 8;  //stride << scale1;
6613       adr_stride2 = 16; //stride << scale2;
6614     }
6615 
6616     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6617     // rax and rdx are used by pcmpestri as element counters
6618     movl(result, cnt2);
6619     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
6620     jcc(Assembler::zero, COMPARE_TAIL_LONG);
6621 
6622     // fast path : compare first 2 8-char vectors.
6623     bind(COMPARE_16_CHARS);
6624     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6625       movdqu(vec1, Address(str1, 0));
6626     } else {
6627       pmovzxbw(vec1, Address(str1, 0));
6628     }
6629     pcmpestri(vec1, Address(str2, 0), pcmpmask);
6630     jccb(Assembler::below, COMPARE_INDEX_CHAR);
6631 
6632     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6633       movdqu(vec1, Address(str1, adr_stride));
6634       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
6635     } else {
6636       pmovzxbw(vec1, Address(str1, adr_stride1));
6637       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
6638     }
6639     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
6640     addl(cnt1, stride);
6641 
6642     // Compare the characters at index in cnt1
6643     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
6644     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
6645     subl(result, cnt2);
6646     jmp(POP_LABEL);
6647 
6648     // Setup the registers to start vector comparison loop
6649     bind(COMPARE_WIDE_VECTORS);
6650     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6651       lea(str1, Address(str1, result, scale));
6652       lea(str2, Address(str2, result, scale));
6653     } else {
6654       lea(str1, Address(str1, result, scale1));
6655       lea(str2, Address(str2, result, scale2));
6656     }
6657     subl(result, stride2);
6658     subl(cnt2, stride2);
6659     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
6660     negptr(result);
6661 
6662     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
6663     bind(COMPARE_WIDE_VECTORS_LOOP);
6664 
6665 #ifdef _LP64
6666     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
6667       cmpl(cnt2, stride2x2);
6668       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
6669       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
6670       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
6671 
6672       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
6673       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6674         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
6675         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
6676       } else {
6677         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
6678         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
6679       }
6680       kortestql(k7, k7);
6681       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
6682       addptr(result, stride2x2);  // update since we already compared at this addr
6683       subl(cnt2, stride2x2);      // and sub the size too
6684       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
6685 
6686       vpxor(vec1, vec1);
6687       jmpb(COMPARE_WIDE_TAIL);
6688     }//if (VM_Version::supports_avx512vlbw())
6689 #endif // _LP64
6690 
6691 
6692     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
6693     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6694       vmovdqu(vec1, Address(str1, result, scale));
6695       vpxor(vec1, Address(str2, result, scale));
6696     } else {
6697       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
6698       vpxor(vec1, Address(str2, result, scale2));
6699     }
6700     vptest(vec1, vec1);
6701     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
6702     addptr(result, stride2);
6703     subl(cnt2, stride2);
6704     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
6705     // clean upper bits of YMM registers
6706     vpxor(vec1, vec1);
6707 
6708     // compare wide vectors tail
6709     bind(COMPARE_WIDE_TAIL);
6710     testptr(result, result);
6711     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6712 
6713     movl(result, stride2);
6714     movl(cnt2, result);
6715     negptr(result);
6716     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
6717 
6718     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
6719     bind(VECTOR_NOT_EQUAL);
6720     // clean upper bits of YMM registers
6721     vpxor(vec1, vec1);
6722     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6723       lea(str1, Address(str1, result, scale));
6724       lea(str2, Address(str2, result, scale));
6725     } else {
6726       lea(str1, Address(str1, result, scale1));
6727       lea(str2, Address(str2, result, scale2));
6728     }
6729     jmp(COMPARE_16_CHARS);
6730 
6731     // Compare tail chars, length between 1 to 15 chars
6732     bind(COMPARE_TAIL_LONG);
6733     movl(cnt2, result);
6734     cmpl(cnt2, stride);
6735     jcc(Assembler::less, COMPARE_SMALL_STR);
6736 
6737     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6738       movdqu(vec1, Address(str1, 0));
6739     } else {
6740       pmovzxbw(vec1, Address(str1, 0));
6741     }
6742     pcmpestri(vec1, Address(str2, 0), pcmpmask);
6743     jcc(Assembler::below, COMPARE_INDEX_CHAR);
6744     subptr(cnt2, stride);
6745     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6746     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6747       lea(str1, Address(str1, result, scale));
6748       lea(str2, Address(str2, result, scale));
6749     } else {
6750       lea(str1, Address(str1, result, scale1));
6751       lea(str2, Address(str2, result, scale2));
6752     }
6753     negptr(cnt2);
6754     jmpb(WHILE_HEAD_LABEL);
6755 
6756     bind(COMPARE_SMALL_STR);
6757   } else if (UseSSE42Intrinsics) {
6758     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
6759     int pcmpmask = 0x19;
6760     // Setup to compare 8-char (16-byte) vectors,
6761     // start from first character again because it has aligned address.
6762     movl(result, cnt2);
6763     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
6764     if (ae == StrIntrinsicNode::LL) {
6765       pcmpmask &= ~0x01;
6766     }
6767     jcc(Assembler::zero, COMPARE_TAIL);
6768     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6769       lea(str1, Address(str1, result, scale));
6770       lea(str2, Address(str2, result, scale));
6771     } else {
6772       lea(str1, Address(str1, result, scale1));
6773       lea(str2, Address(str2, result, scale2));
6774     }
6775     negptr(result);
6776 
6777     // pcmpestri
6778     //   inputs:
6779     //     vec1- substring
6780     //     rax - negative string length (elements count)
6781     //     mem - scanned string
6782     //     rdx - string length (elements count)
6783     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
6784     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
6785     //   outputs:
6786     //     rcx - first mismatched element index
6787     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6788 
6789     bind(COMPARE_WIDE_VECTORS);
6790     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6791       movdqu(vec1, Address(str1, result, scale));
6792       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6793     } else {
6794       pmovzxbw(vec1, Address(str1, result, scale1));
6795       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
6796     }
6797     // After pcmpestri cnt1(rcx) contains mismatched element index
6798 
6799     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
6800     addptr(result, stride);
6801     subptr(cnt2, stride);
6802     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
6803 
6804     // compare wide vectors tail
6805     testptr(result, result);
6806     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6807 
6808     movl(cnt2, stride);
6809     movl(result, stride);
6810     negptr(result);
6811     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6812       movdqu(vec1, Address(str1, result, scale));
6813       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6814     } else {
6815       pmovzxbw(vec1, Address(str1, result, scale1));
6816       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
6817     }
6818     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
6819 
6820     // Mismatched characters in the vectors
6821     bind(VECTOR_NOT_EQUAL);
6822     addptr(cnt1, result);
6823     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
6824     subl(result, cnt2);
6825     jmpb(POP_LABEL);
6826 
6827     bind(COMPARE_TAIL); // limit is zero
6828     movl(cnt2, result);
6829     // Fallthru to tail compare
6830   }
6831   // Shift str2 and str1 to the end of the arrays, negate min
6832   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6833     lea(str1, Address(str1, cnt2, scale));
6834     lea(str2, Address(str2, cnt2, scale));
6835   } else {
6836     lea(str1, Address(str1, cnt2, scale1));
6837     lea(str2, Address(str2, cnt2, scale2));
6838   }
6839   decrementl(cnt2);  // first character was compared already
6840   negptr(cnt2);
6841 
6842   // Compare the rest of the elements
6843   bind(WHILE_HEAD_LABEL);
6844   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
6845   subl(result, cnt1);
6846   jccb(Assembler::notZero, POP_LABEL);
6847   increment(cnt2);
6848   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
6849 
6850   // Strings are equal up to min length.  Return the length difference.
6851   bind(LENGTH_DIFF_LABEL);
6852   pop(result);
6853   if (ae == StrIntrinsicNode::UU) {
6854     // Divide diff by 2 to get number of chars
6855     sarl(result, 1);
6856   }
6857   jmpb(DONE_LABEL);
6858 
6859 #ifdef _LP64
6860   if (VM_Version::supports_avx512vlbw()) {
6861 
6862     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
6863 
6864     kmovql(cnt1, k7);
6865     notq(cnt1);
6866     bsfq(cnt2, cnt1);
6867     if (ae != StrIntrinsicNode::LL) {
6868       // Divide diff by 2 to get number of chars
6869       sarl(cnt2, 1);
6870     }
6871     addq(result, cnt2);
6872     if (ae == StrIntrinsicNode::LL) {
6873       load_unsigned_byte(cnt1, Address(str2, result));
6874       load_unsigned_byte(result, Address(str1, result));
6875     } else if (ae == StrIntrinsicNode::UU) {
6876       load_unsigned_short(cnt1, Address(str2, result, scale));
6877       load_unsigned_short(result, Address(str1, result, scale));
6878     } else {
6879       load_unsigned_short(cnt1, Address(str2, result, scale2));
6880       load_unsigned_byte(result, Address(str1, result, scale1));
6881     }
6882     subl(result, cnt1);
6883     jmpb(POP_LABEL);
6884   }//if (VM_Version::supports_avx512vlbw())
6885 #endif // _LP64
6886 
6887   // Discard the stored length difference
6888   bind(POP_LABEL);
6889   pop(cnt1);
6890 
6891   // That's it
6892   bind(DONE_LABEL);
6893   if(ae == StrIntrinsicNode::UL) {
6894     negl(result);
6895   }
6896 
6897 }
6898 
6899 // Search for Non-ASCII character (Negative byte value) in a byte array,
6900 // return true if it has any and false otherwise.
6901 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
6902 //   @HotSpotIntrinsicCandidate
6903 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
6904 //     for (int i = off; i < off + len; i++) {
6905 //       if (ba[i] < 0) {
6906 //         return true;
6907 //       }
6908 //     }
6909 //     return false;
6910 //   }
6911 void MacroAssembler::has_negatives(Register ary1, Register len,
6912   Register result, Register tmp1,
6913   XMMRegister vec1, XMMRegister vec2) {
6914   // rsi: byte array
6915   // rcx: len
6916   // rax: result
6917   ShortBranchVerifier sbv(this);
6918   assert_different_registers(ary1, len, result, tmp1);
6919   assert_different_registers(vec1, vec2);
6920   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
6921 
6922   // len == 0
6923   testl(len, len);
6924   jcc(Assembler::zero, FALSE_LABEL);
6925 
6926   if ((UseAVX > 2) && // AVX512
6927     VM_Version::supports_avx512vlbw() &&
6928     VM_Version::supports_bmi2()) {
6929 
6930     Label test_64_loop, test_tail;
6931     Register tmp3_aliased = len;
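         // len can be reused as scratch for the tail mask: by the time it is
         // overwritten the vector loop has already consumed its value.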
6932 
6933     movl(tmp1, len);
6934     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
6935 
6936     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
6937     andl(len, ~(64 - 1));    // vector count (in chars)
6938     jccb(Assembler::zero, test_tail);
6939 
6940     lea(ary1, Address(ary1, len, Address::times_1));
6941     negptr(len);
6942 
6943     bind(test_64_loop);
6944     // Check whether our 64 elements of size byte contain negatives
6945     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
6946     kortestql(k2, k2);
6947     jcc(Assembler::notZero, TRUE_LABEL);
6948 
6949     addptr(len, 64);
6950     jccb(Assembler::notZero, test_64_loop);
6951 
6952 
6953     bind(test_tail);
6954     // bail out when there is nothing to be done
6955     testl(tmp1, -1);
6956     jcc(Assembler::zero, FALSE_LABEL);
6957 
6958     // ~(~0 << len) applied up to two times (for 32-bit scenario)
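         // e.g. tmp1 == 3 gives ~(~0 << 3) == 0b111, a mask covering the 3 tail bytes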
6959 #ifdef _LP64
6960     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
6961     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
6962     notq(tmp3_aliased);
6963     kmovql(k3, tmp3_aliased);
6964 #else
6965     Label k_init;
6966     jmp(k_init);
6967 
6968     // We cannot read 64 bits from a general purpose register (32-bit VM), thus we
6969     // move the data required to compose 64 1's to the instruction stream.
6970     // We emit a 64 byte wide series of elements from 0..63 which later on is
6971     // used as a compare target with the tail count contained in the tmp1 register.
6972     // The result is a k register holding tmp1 consecutive 1's, counting from the
6973     // least significant bit.
6974     address tmp = pc();
6975     emit_int64(0x0706050403020100);
6976     emit_int64(0x0F0E0D0C0B0A0908);
6977     emit_int64(0x1716151413121110);
6978     emit_int64(0x1F1E1D1C1B1A1918);
6979     emit_int64(0x2726252423222120);
6980     emit_int64(0x2F2E2D2C2B2A2928);
6981     emit_int64(0x3736353433323130);
6982     emit_int64(0x3F3E3D3C3B3A3938);
6983 
6984     bind(k_init);
6985     lea(len, InternalAddress(tmp));
6986     // create mask to test for negative byte inside a vector
6987     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
6988     evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
6989 
6990 #endif
6991     evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
6992     ktestq(k2, k3);
6993     jcc(Assembler::notZero, TRUE_LABEL);
6994 
6995     jmp(FALSE_LABEL);
6996   } else {
6997     movl(result, len); // copy
6998 
6999     if (UseAVX == 2 && UseSSE >= 2) {
7000       // With AVX2, use 32-byte vector compare
7001       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7002 
7003       // Compare 32-byte vectors
7004       andl(result, 0x0000001f);  //   tail count (in bytes)
7005       andl(len, 0xffffffe0);   // vector count (in bytes)
7006       jccb(Assembler::zero, COMPARE_TAIL);
7007 
7008       lea(ary1, Address(ary1, len, Address::times_1));
7009       negptr(len);
7010 
7011       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
7012       movdl(vec2, tmp1);
7013       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
7014 
7015       bind(COMPARE_WIDE_VECTORS);
7016       vmovdqu(vec1, Address(ary1, len, Address::times_1));
7017       vptest(vec1, vec2);
7018       jccb(Assembler::notZero, TRUE_LABEL);
7019       addptr(len, 32);
7020       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7021 
7022       testl(result, result);
7023       jccb(Assembler::zero, FALSE_LABEL);
7024 
7025       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7026       vptest(vec1, vec2);
7027       jccb(Assembler::notZero, TRUE_LABEL);
7028       jmpb(FALSE_LABEL);
7029 
7030       bind(COMPARE_TAIL); // len is zero
7031       movl(len, result);
7032       // Fallthru to tail compare
7033     } else if (UseSSE42Intrinsics) {
7034       // With SSE4.2, use double quad vector compare
7035       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7036 
7037       // Compare 16-byte vectors
7038       andl(result, 0x0000000f);  //   tail count (in bytes)
7039       andl(len, 0xfffffff0);   // vector count (in bytes)
7040       jcc(Assembler::zero, COMPARE_TAIL);
7041 
7042       lea(ary1, Address(ary1, len, Address::times_1));
7043       negptr(len);
7044 
7045       movl(tmp1, 0x80808080);
7046       movdl(vec2, tmp1);
7047       pshufd(vec2, vec2, 0);
7048 
7049       bind(COMPARE_WIDE_VECTORS);
7050       movdqu(vec1, Address(ary1, len, Address::times_1));
7051       ptest(vec1, vec2);
7052       jcc(Assembler::notZero, TRUE_LABEL);
7053       addptr(len, 16);
7054       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7055 
7056       testl(result, result);
7057       jcc(Assembler::zero, FALSE_LABEL);
7058 
7059       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
7060       ptest(vec1, vec2);
7061       jccb(Assembler::notZero, TRUE_LABEL);
7062       jmpb(FALSE_LABEL);
7063 
7064       bind(COMPARE_TAIL); // len is zero
7065       movl(len, result);
7066       // Fallthru to tail compare
7067     }
7068   }
7069   // Compare 4-byte vectors
7070   andl(len, 0xfffffffc); // vector count (in bytes)
7071   jccb(Assembler::zero, COMPARE_CHAR);
7072 
7073   lea(ary1, Address(ary1, len, Address::times_1));
7074   negptr(len);
7075 
7076   bind(COMPARE_VECTORS);
7077   movl(tmp1, Address(ary1, len, Address::times_1));
7078   andl(tmp1, 0x80808080);
7079   jccb(Assembler::notZero, TRUE_LABEL);
7080   addptr(len, 4);
7081   jcc(Assembler::notZero, COMPARE_VECTORS);
7082 
7083   // Compare trailing char (final 2 bytes), if any
7084   bind(COMPARE_CHAR);
7085   testl(result, 0x2);   // tail  char
7086   jccb(Assembler::zero, COMPARE_BYTE);
7087   load_unsigned_short(tmp1, Address(ary1, 0));
7088   andl(tmp1, 0x00008080);
7089   jccb(Assembler::notZero, TRUE_LABEL);
7090   subptr(result, 2);
7091   lea(ary1, Address(ary1, 2));
7092 
7093   bind(COMPARE_BYTE);
7094   testl(result, 0x1);   // tail  byte
7095   jccb(Assembler::zero, FALSE_LABEL);
7096   load_unsigned_byte(tmp1, Address(ary1, 0));
7097   andl(tmp1, 0x00000080);
7098   jccb(Assembler::notEqual, TRUE_LABEL);
7099   jmpb(FALSE_LABEL);
7100 
7101   bind(TRUE_LABEL);
7102   movl(result, 1);   // return true
7103   jmpb(DONE);
7104 
7105   bind(FALSE_LABEL);
7106   xorl(result, result); // return false
7107 
7108   // That's it
7109   bind(DONE);
7110   if (UseAVX >= 2 && UseSSE >= 2) {
7111     // clean upper bits of YMM registers
7112     vpxor(vec1, vec1);
7113     vpxor(vec2, vec2);
7114   }
7115 }
7116 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
7117 void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
7118                                    Register limit, Register result, Register chr,
7119                                    XMMRegister vec1, XMMRegister vec2, bool is_char) {
7120   ShortBranchVerifier sbv(this);
7121   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
7122 
7123   int length_offset  = arrayOopDesc::length_offset_in_bytes();
7124   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
7125 
7126   if (is_array_equ) {
7127     // Check the input args
7128     cmpoop(ary1, ary2);
7129     jcc(Assembler::equal, TRUE_LABEL);
7130 
7131     // Need additional checks for arrays_equals.
7132     testptr(ary1, ary1);
7133     jcc(Assembler::zero, FALSE_LABEL);
7134     testptr(ary2, ary2);
7135     jcc(Assembler::zero, FALSE_LABEL);
7136 
7137     // Check the lengths
7138     movl(limit, Address(ary1, length_offset));
7139     cmpl(limit, Address(ary2, length_offset));
7140     jcc(Assembler::notEqual, FALSE_LABEL);
7141   }
7142 
7143   // count == 0
7144   testl(limit, limit);
7145   jcc(Assembler::zero, TRUE_LABEL);
7146 
7147   if (is_array_equ) {
7148     // Load array address
7149     lea(ary1, Address(ary1, base_offset));
7150     lea(ary2, Address(ary2, base_offset));
7151   }
7152 
7153   if (is_array_equ && is_char) {
7154     // arrays_equals when used for char[].
7155     shll(limit, 1);      // convert char count to byte count (still != 0)
7156   }
7157   movl(result, limit); // copy
7158 
7159   if (UseAVX >= 2) {
7160     // With AVX2, use 32-byte vector compare
7161     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7162 
7163     // Compare 32-byte vectors
7164     andl(result, 0x0000001f);  //   tail count (in bytes)
7165     andl(limit, 0xffffffe0);   // vector count (in bytes)
7166     jcc(Assembler::zero, COMPARE_TAIL);
7167 
7168     lea(ary1, Address(ary1, limit, Address::times_1));
7169     lea(ary2, Address(ary2, limit, Address::times_1));
7170     negptr(limit);
7171 
7172     bind(COMPARE_WIDE_VECTORS);
7173 
7174 #ifdef _LP64
7175     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
7176       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
7177 
7178       cmpl(limit, -64);
7179       jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
7180 
7181       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
7182 
7183       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
7184       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
7185       kortestql(k7, k7);
7186       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
7187       addptr(limit, 64);  // update since we already compared at this addr
7188       cmpl(limit, -64);
7189       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
7190 
7191       // At this point we may still need to compare -limit+result bytes.
7192       // We could execute the next two instructions and just continue via the non-wide path:
7193       //  cmpl(limit, 0);
7194       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
7195       // But since we stopped at the points ary{1,2}+limit which are
7196       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
7197       // (|limit| <= 32 and result < 32),
7198       // we may just compare the last 64 bytes.
7199       //
7200       addptr(result, -64);   // it is safe, bc we just came from this area
7201       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
7202       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
7203       kortestql(k7, k7);
7204       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
7205 
7206       jmp(TRUE_LABEL);
7207 
7208       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7209 
7210     }//if (VM_Version::supports_avx512vlbw())
7211 #endif //_LP64
7212 
7213     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
7214     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
7215     vpxor(vec1, vec2);
7216 
7217     vptest(vec1, vec1);
7218     jcc(Assembler::notZero, FALSE_LABEL);
7219     addptr(limit, 32);
7220     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7221 
7222     testl(result, result);
7223     jcc(Assembler::zero, TRUE_LABEL);
7224 
7225     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7226     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
7227     vpxor(vec1, vec2);
7228 
7229     vptest(vec1, vec1);
7230     jccb(Assembler::notZero, FALSE_LABEL);
7231     jmpb(TRUE_LABEL);
7232 
7233     bind(COMPARE_TAIL); // limit is zero
7234     movl(limit, result);
7235     // Fallthru to tail compare
7236   } else if (UseSSE42Intrinsics) {
7237     // With SSE4.2, use double quad vector compare
7238     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7239 
7240     // Compare 16-byte vectors
7241     andl(result, 0x0000000f);  //   tail count (in bytes)
7242     andl(limit, 0xfffffff0);   // vector count (in bytes)
7243     jcc(Assembler::zero, COMPARE_TAIL);
7244 
7245     lea(ary1, Address(ary1, limit, Address::times_1));
7246     lea(ary2, Address(ary2, limit, Address::times_1));
7247     negptr(limit);
7248 
7249     bind(COMPARE_WIDE_VECTORS);
7250     movdqu(vec1, Address(ary1, limit, Address::times_1));
7251     movdqu(vec2, Address(ary2, limit, Address::times_1));
7252     pxor(vec1, vec2);
7253 
7254     ptest(vec1, vec1);
7255     jcc(Assembler::notZero, FALSE_LABEL);
7256     addptr(limit, 16);
7257     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7258 
7259     testl(result, result);
7260     jcc(Assembler::zero, TRUE_LABEL);
7261 
7262     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
7263     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
7264     pxor(vec1, vec2);
7265 
7266     ptest(vec1, vec1);
7267     jccb(Assembler::notZero, FALSE_LABEL);
7268     jmpb(TRUE_LABEL);
7269 
7270     bind(COMPARE_TAIL); // limit is zero
7271     movl(limit, result);
7272     // Fallthru to tail compare
7273   }
7274 
7275   // Compare 4-byte vectors
7276   andl(limit, 0xfffffffc); // vector count (in bytes)
7277   jccb(Assembler::zero, COMPARE_CHAR);
7278 
7279   lea(ary1, Address(ary1, limit, Address::times_1));
7280   lea(ary2, Address(ary2, limit, Address::times_1));
7281   negptr(limit);
7282 
7283   bind(COMPARE_VECTORS);
7284   movl(chr, Address(ary1, limit, Address::times_1));
7285   cmpl(chr, Address(ary2, limit, Address::times_1));
7286   jccb(Assembler::notEqual, FALSE_LABEL);
7287   addptr(limit, 4);
7288   jcc(Assembler::notZero, COMPARE_VECTORS);
7289 
7290   // Compare trailing char (final 2 bytes), if any
7291   bind(COMPARE_CHAR);
7292   testl(result, 0x2);   // tail  char
7293   jccb(Assembler::zero, COMPARE_BYTE);
7294   load_unsigned_short(chr, Address(ary1, 0));
7295   load_unsigned_short(limit, Address(ary2, 0));
7296   cmpl(chr, limit);
7297   jccb(Assembler::notEqual, FALSE_LABEL);
7298 
7299   if (is_array_equ && is_char) {
7300     bind(COMPARE_BYTE);
7301   } else {
7302     lea(ary1, Address(ary1, 2));
7303     lea(ary2, Address(ary2, 2));
7304 
7305     bind(COMPARE_BYTE);
7306     testl(result, 0x1);   // tail  byte
7307     jccb(Assembler::zero, TRUE_LABEL);
7308     load_unsigned_byte(chr, Address(ary1, 0));
7309     load_unsigned_byte(limit, Address(ary2, 0));
7310     cmpl(chr, limit);
7311     jccb(Assembler::notEqual, FALSE_LABEL);
7312   }
7313   bind(TRUE_LABEL);
7314   movl(result, 1);   // return true
7315   jmpb(DONE);
7316 
7317   bind(FALSE_LABEL);
7318   xorl(result, result); // return false
7319 
7320   // That's it
7321   bind(DONE);
7322   if (UseAVX >= 2) {
7323     // clean upper bits of YMM registers
7324     vpxor(vec1, vec1);
7325     vpxor(vec2, vec2);
7326   }
7327 }
7328 
7329 #endif
7330 
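// Emits a fill of 'count' elements of type 't' (T_BYTE, T_SHORT or T_INT) starting
// at 'to' with the low bits of 'value'. A sketch of the intended effect, using a
// hypothetical element type T corresponding to 't' (the emitted code replicates the
// value across a 32-bit word and, where available, uses SSE2/AVX vector stores
// rather than this scalar loop):
//
//   for (int i = 0; i < count; i++) {
//     ((T*)to)[i] = (T)value;
//   }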
7331 void MacroAssembler::generate_fill(BasicType t, bool aligned,
7332                                    Register to, Register value, Register count,
7333                                    Register rtmp, XMMRegister xtmp) {
7334   ShortBranchVerifier sbv(this);
7335   assert_different_registers(to, value, count, rtmp);
7336   Label L_exit;
7337   Label L_fill_2_bytes, L_fill_4_bytes;
7338 
7339   int shift = -1;
7340   switch (t) {
7341     case T_BYTE:
7342       shift = 2;
7343       break;
7344     case T_SHORT:
7345       shift = 1;
7346       break;
7347     case T_INT:
7348       shift = 0;
7349       break;
7350     default: ShouldNotReachHere();
7351   }
7352 
7353   if (t == T_BYTE) {
7354     andl(value, 0xff);
7355     movl(rtmp, value);
7356     shll(rtmp, 8);
7357     orl(value, rtmp);
7358   }
7359   if (t == T_SHORT) {
7360     andl(value, 0xffff);
7361   }
7362   if (t == T_BYTE || t == T_SHORT) {
7363     movl(rtmp, value);
7364     shll(rtmp, 16);
7365     orl(value, rtmp);
7366   }
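  // At this point 'value' holds the fill pattern replicated across a 32-bit word,
  // e.g. (worked example) a T_BYTE value of 0x41 becomes 0x41414141 and a T_SHORT
  // value of 0x1234 becomes 0x12341234; T_INT values are used as-is.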
7367 
7368   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
7369   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
7370   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
7371     Label L_skip_align2;
7372     // align source address at 4 bytes address boundary
7373     if (t == T_BYTE) {
7374       Label L_skip_align1;
7375       // One byte misalignment happens only for byte arrays
7376       testptr(to, 1);
7377       jccb(Assembler::zero, L_skip_align1);
7378       movb(Address(to, 0), value);
7379       increment(to);
7380       decrement(count);
7381       BIND(L_skip_align1);
7382     }
7383     // Two bytes misalignment happens only for byte and short (char) arrays
7384     testptr(to, 2);
7385     jccb(Assembler::zero, L_skip_align2);
7386     movw(Address(to, 0), value);
7387     addptr(to, 2);
7388     subl(count, 1<<(shift-1));
7389     BIND(L_skip_align2);
7390   }
7391   if (UseSSE < 2) {
7392     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7393     // Fill 32-byte chunks
7394     subl(count, 8 << shift);
7395     jcc(Assembler::less, L_check_fill_8_bytes);
7396     align(16);
7397 
7398     BIND(L_fill_32_bytes_loop);
7399 
7400     for (int i = 0; i < 32; i += 4) {
7401       movl(Address(to, i), value);
7402     }
7403 
7404     addptr(to, 32);
7405     subl(count, 8 << shift);
7406     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7407     BIND(L_check_fill_8_bytes);
7408     addl(count, 8 << shift);
7409     jccb(Assembler::zero, L_exit);
7410     jmpb(L_fill_8_bytes);
7411 
7412     //
7413     // length is too short, just fill qwords
7414     //
7415     BIND(L_fill_8_bytes_loop);
7416     movl(Address(to, 0), value);
7417     movl(Address(to, 4), value);
7418     addptr(to, 8);
7419     BIND(L_fill_8_bytes);
7420     subl(count, 1 << (shift + 1));
7421     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7422     // fall through to fill 4 bytes
7423   } else {
7424     Label L_fill_32_bytes;
7425     if (!UseUnalignedLoadStores) {
7426       // align to 8 bytes, we know we are 4 byte aligned to start
7427       testptr(to, 4);
7428       jccb(Assembler::zero, L_fill_32_bytes);
7429       movl(Address(to, 0), value);
7430       addptr(to, 4);
7431       subl(count, 1<<shift);
7432     }
7433     BIND(L_fill_32_bytes);
7434     {
7435       assert( UseSSE >= 2, "supported cpu only" );
7436       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7437       movdl(xtmp, value);
7438       if (UseAVX > 2 && UseUnalignedLoadStores) {
7439         // Fill 64-byte chunks
7440         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7441         vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
7442 
7443         subl(count, 16 << shift);
7444         jcc(Assembler::less, L_check_fill_32_bytes);
7445         align(16);
7446 
7447         BIND(L_fill_64_bytes_loop);
7448         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
7449         addptr(to, 64);
7450         subl(count, 16 << shift);
7451         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7452 
7453         BIND(L_check_fill_32_bytes);
7454         addl(count, 8 << shift);
7455         jccb(Assembler::less, L_check_fill_8_bytes);
7456         vmovdqu(Address(to, 0), xtmp);
7457         addptr(to, 32);
7458         subl(count, 8 << shift);
7459 
7460         BIND(L_check_fill_8_bytes);
7461       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
7462         // Fill 64-byte chunks
7463         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7464         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
7465 
7466         subl(count, 16 << shift);
7467         jcc(Assembler::less, L_check_fill_32_bytes);
7468         align(16);
7469 
7470         BIND(L_fill_64_bytes_loop);
7471         vmovdqu(Address(to, 0), xtmp);
7472         vmovdqu(Address(to, 32), xtmp);
7473         addptr(to, 64);
7474         subl(count, 16 << shift);
7475         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7476 
7477         BIND(L_check_fill_32_bytes);
7478         addl(count, 8 << shift);
7479         jccb(Assembler::less, L_check_fill_8_bytes);
7480         vmovdqu(Address(to, 0), xtmp);
7481         addptr(to, 32);
7482         subl(count, 8 << shift);
7483 
7484         BIND(L_check_fill_8_bytes);
7485         // clean upper bits of YMM registers
7486         movdl(xtmp, value);
7487         pshufd(xtmp, xtmp, 0);
7488       } else {
7489         // Fill 32-byte chunks
7490         pshufd(xtmp, xtmp, 0);
7491 
7492         subl(count, 8 << shift);
7493         jcc(Assembler::less, L_check_fill_8_bytes);
7494         align(16);
7495 
7496         BIND(L_fill_32_bytes_loop);
7497 
7498         if (UseUnalignedLoadStores) {
7499           movdqu(Address(to, 0), xtmp);
7500           movdqu(Address(to, 16), xtmp);
7501         } else {
7502           movq(Address(to, 0), xtmp);
7503           movq(Address(to, 8), xtmp);
7504           movq(Address(to, 16), xtmp);
7505           movq(Address(to, 24), xtmp);
7506         }
7507 
7508         addptr(to, 32);
7509         subl(count, 8 << shift);
7510         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7511 
7512         BIND(L_check_fill_8_bytes);
7513       }
7514       addl(count, 8 << shift);
7515       jccb(Assembler::zero, L_exit);
7516       jmpb(L_fill_8_bytes);
7517 
7518       //
7519       // length is too short, just fill qwords
7520       //
7521       BIND(L_fill_8_bytes_loop);
7522       movq(Address(to, 0), xtmp);
7523       addptr(to, 8);
7524       BIND(L_fill_8_bytes);
7525       subl(count, 1 << (shift + 1));
7526       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7527     }
7528   }
7529   // fill trailing 4 bytes
7530   BIND(L_fill_4_bytes);
7531   testl(count, 1<<shift);
7532   jccb(Assembler::zero, L_fill_2_bytes);
7533   movl(Address(to, 0), value);
7534   if (t == T_BYTE || t == T_SHORT) {
7535     Label L_fill_byte;
7536     addptr(to, 4);
7537     BIND(L_fill_2_bytes);
7538     // fill trailing 2 bytes
7539     testl(count, 1<<(shift-1));
7540     jccb(Assembler::zero, L_fill_byte);
7541     movw(Address(to, 0), value);
7542     if (t == T_BYTE) {
7543       addptr(to, 2);
7544       BIND(L_fill_byte);
7545       // fill trailing byte
7546       testl(count, 1);
7547       jccb(Assembler::zero, L_exit);
7548       movb(Address(to, 0), value);
7549     } else {
7550       BIND(L_fill_byte);
7551     }
7552   } else {
7553     BIND(L_fill_2_bytes);
7554   }
7555   BIND(L_exit);
7556 }
7557 
7558 // encode char[] to byte[] in ISO_8859_1
7559 // @HotSpotIntrinsicCandidate
7560 // private static int implEncodeISOArray(byte[] sa, int sp,
7561 //                                       byte[] da, int dp, int len) {
7562 //   int i = 0;
7563 //   for (; i < len; i++) {
7564 //     char c = StringUTF16.getChar(sa, sp++);
7565 //     if (c > '\u00FF')
7566 //       break;
7567 //     da[dp++] = (byte)c;
7568 //   }
7569 //   return i;
7570 // }
7571 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
7572   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
7573   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
7574   Register tmp5, Register result) {
7575 
7576   // rsi: src
7577   // rdi: dst
7578   // rdx: len
7579   // rcx: tmp5
7580   // rax: result
7581   ShortBranchVerifier sbv(this);
7582   assert_different_registers(src, dst, len, tmp5, result);
7583   Label L_done, L_copy_1_char, L_copy_1_char_exit;
7584 
7585   // set result
7586   xorl(result, result);
7587   // check for zero length
7588   testl(len, len);
7589   jcc(Assembler::zero, L_done);
7590 
7591   movl(result, len);
7592 
7593   // Setup pointers
7594   lea(src, Address(src, len, Address::times_2)); // char[]
7595   lea(dst, Address(dst, len, Address::times_1)); // byte[]
7596   negptr(len);
7597 
7598   if (UseSSE42Intrinsics || UseAVX >= 2) {
7599     Label L_copy_8_chars, L_copy_8_chars_exit;
7600     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7601 
7602     if (UseAVX >= 2) {
7603       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7604       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7605       movdl(tmp1Reg, tmp5);
7606       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
7607       jmp(L_chars_32_check);
7608 
7609       bind(L_copy_32_chars);
7610       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7611       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7612       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7613       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7614       jccb(Assembler::notZero, L_copy_32_chars_exit);
7615       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7616       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
7617       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7618 
7619       bind(L_chars_32_check);
7620       addptr(len, 32);
7621       jcc(Assembler::lessEqual, L_copy_32_chars);
7622 
7623       bind(L_copy_32_chars_exit);
7624       subptr(len, 16);
7625       jccb(Assembler::greater, L_copy_16_chars_exit);
7626 
7627     } else if (UseSSE42Intrinsics) {
7628       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7629       movdl(tmp1Reg, tmp5);
7630       pshufd(tmp1Reg, tmp1Reg, 0);
7631       jmpb(L_chars_16_check);
7632     }
7633 
7634     bind(L_copy_16_chars);
7635     if (UseAVX >= 2) {
7636       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
7637       vptest(tmp2Reg, tmp1Reg);
7638       jcc(Assembler::notZero, L_copy_16_chars_exit);
7639       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
7640       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
7641     } else {
7642       if (UseAVX > 0) {
7643         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7644         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7645         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
7646       } else {
7647         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7648         por(tmp2Reg, tmp3Reg);
7649         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7650         por(tmp2Reg, tmp4Reg);
7651       }
7652       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7653       jccb(Assembler::notZero, L_copy_16_chars_exit);
7654       packuswb(tmp3Reg, tmp4Reg);
7655     }
7656     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7657 
7658     bind(L_chars_16_check);
7659     addptr(len, 16);
7660     jcc(Assembler::lessEqual, L_copy_16_chars);
7661 
7662     bind(L_copy_16_chars_exit);
7663     if (UseAVX >= 2) {
7664       // clean upper bits of YMM registers
7665       vpxor(tmp2Reg, tmp2Reg);
7666       vpxor(tmp3Reg, tmp3Reg);
7667       vpxor(tmp4Reg, tmp4Reg);
7668       movdl(tmp1Reg, tmp5);
7669       pshufd(tmp1Reg, tmp1Reg, 0);
7670     }
7671     subptr(len, 8);
7672     jccb(Assembler::greater, L_copy_8_chars_exit);
7673 
7674     bind(L_copy_8_chars);
7675     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
7676     ptest(tmp3Reg, tmp1Reg);
7677     jccb(Assembler::notZero, L_copy_8_chars_exit);
7678     packuswb(tmp3Reg, tmp1Reg);
7679     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
7680     addptr(len, 8);
7681     jccb(Assembler::lessEqual, L_copy_8_chars);
7682 
7683     bind(L_copy_8_chars_exit);
7684     subptr(len, 8);
7685     jccb(Assembler::zero, L_done);
7686   }
7687 
7688   bind(L_copy_1_char);
7689   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
7690   testl(tmp5, 0xff00);      // check if Unicode char
7691   jccb(Assembler::notZero, L_copy_1_char_exit);
7692   movb(Address(dst, len, Address::times_1, 0), tmp5);
7693   addptr(len, 1);
7694   jccb(Assembler::less, L_copy_1_char);
7695 
7696   bind(L_copy_1_char_exit);
7697   addptr(result, len); // len is negative count of not processed elements
7698 
7699   bind(L_done);
7700 }
7701 
7702 #ifdef _LP64
7703 /**
7704  * Helper for multiply_to_len().
7705  */
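// Computes dest_hi:dest_lo += src1 + src2 as a 128-bit accumulation; each 64-bit
// addition below propagates its carry into dest_hi via adcq. Roughly (a sketch):
//   unsigned __int128 acc = ((unsigned __int128)dest_hi << 64) | dest_lo;
//   acc += src1;  acc += src2;   // result written back to dest_hi:dest_lo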
7706 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
7707   addq(dest_lo, src1);
7708   adcq(dest_hi, 0);
7709   addq(dest_lo, src2);
7710   adcq(dest_hi, 0);
7711 }
7712 
7713 /**
7714  * Multiply 64 bit by 64 bit first loop.
7715  */
7716 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
7717                                            Register y, Register y_idx, Register z,
7718                                            Register carry, Register product,
7719                                            Register idx, Register kdx) {
7720   //
7721   //  jlong carry, x[], y[], z[];
7722   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7723   //    huge_128 product = y[idx] * x[xstart] + carry;
7724   //    z[kdx] = (jlong)product;
7725   //    carry  = (jlong)(product >>> 64);
7726   //  }
7727   //  z[xstart] = carry;
7728   //
7729 
7730   Label L_first_loop, L_first_loop_exit;
7731   Label L_one_x, L_one_y, L_multiply;
7732 
7733   decrementl(xstart);
7734   jcc(Assembler::negative, L_one_x);
7735 
7736   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
7737   rorq(x_xstart, 32); // convert big-endian to little-endian
7738 
7739   bind(L_first_loop);
7740   decrementl(idx);
7741   jcc(Assembler::negative, L_first_loop_exit);
7742   decrementl(idx);
7743   jcc(Assembler::negative, L_one_y);
7744   movq(y_idx, Address(y, idx, Address::times_4,  0));
7745   rorq(y_idx, 32); // convert big-endian to little-endian
7746   bind(L_multiply);
7747   movq(product, x_xstart);
7748   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
7749   addq(product, carry);
7750   adcq(rdx, 0);
7751   subl(kdx, 2);
7752   movl(Address(z, kdx, Address::times_4,  4), product);
7753   shrq(product, 32);
7754   movl(Address(z, kdx, Address::times_4,  0), product);
7755   movq(carry, rdx);
7756   jmp(L_first_loop);
7757 
7758   bind(L_one_y);
7759   movl(y_idx, Address(y,  0));
7760   jmp(L_multiply);
7761 
7762   bind(L_one_x);
7763   movl(x_xstart, Address(x,  0));
7764   jmp(L_first_loop);
7765 
7766   bind(L_first_loop_exit);
7767 }
7768 
7769 /**
7770  * Multiply 64 bit by 64 bit and add 128 bit.
7771  */
7772 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
7773                                             Register yz_idx, Register idx,
7774                                             Register carry, Register product, int offset) {
7775   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
7776   //     z[kdx] = (jlong)product;
7777 
7778   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
7779   rorq(yz_idx, 32); // convert big-endian to little-endian
7780   movq(product, x_xstart);
7781   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
7782   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
7783   rorq(yz_idx, 32); // convert big-endian to little-endian
7784 
7785   add2_with_carry(rdx, product, carry, yz_idx);
7786 
7787   movl(Address(z, idx, Address::times_4,  offset+4), product);
7788   shrq(product, 32);
7789   movl(Address(z, idx, Address::times_4,  offset), product);
7790 
7791 }
7792 
7793 /**
7794  * Multiply 128 bit by 128 bit. Unrolled inner loop.
7795  */
7796 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
7797                                              Register yz_idx, Register idx, Register jdx,
7798                                              Register carry, Register product,
7799                                              Register carry2) {
7800   //   jlong carry, x[], y[], z[];
7801   //   int kdx = ystart+1;
7802   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7803   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
7804   //     z[kdx+idx+1] = (jlong)product;
7805   //     jlong carry2  = (jlong)(product >>> 64);
7806   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
7807   //     z[kdx+idx] = (jlong)product;
7808   //     carry  = (jlong)(product >>> 64);
7809   //   }
7810   //   idx += 2;
7811   //   if (idx > 0) {
7812   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
7813   //     z[kdx+idx] = (jlong)product;
7814   //     carry  = (jlong)(product >>> 64);
7815   //   }
7816   //
7817 
7818   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7819 
7820   movl(jdx, idx);
7821   andl(jdx, 0xFFFFFFFC);
7822   shrl(jdx, 2);
7823 
7824   bind(L_third_loop);
7825   subl(jdx, 1);
7826   jcc(Assembler::negative, L_third_loop_exit);
7827   subl(idx, 4);
7828 
7829   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
7830   movq(carry2, rdx);
7831 
7832   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
7833   movq(carry, rdx);
7834   jmp(L_third_loop);
7835 
7836   bind (L_third_loop_exit);
7837 
7838   andl (idx, 0x3);
7839   jcc(Assembler::zero, L_post_third_loop_done);
7840 
7841   Label L_check_1;
7842   subl(idx, 2);
7843   jcc(Assembler::negative, L_check_1);
7844 
7845   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
7846   movq(carry, rdx);
7847 
7848   bind (L_check_1);
7849   addl (idx, 0x2);
7850   andl (idx, 0x1);
7851   subl(idx, 1);
7852   jcc(Assembler::negative, L_post_third_loop_done);
7853 
7854   movl(yz_idx, Address(y, idx, Address::times_4,  0));
7855   movq(product, x_xstart);
7856   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7857   movl(yz_idx, Address(z, idx, Address::times_4,  0));
7858 
7859   add2_with_carry(rdx, product, yz_idx, carry);
7860 
7861   movl(Address(z, idx, Address::times_4,  0), product);
7862   shrq(product, 32);
7863 
7864   shlq(rdx, 32);
7865   orq(product, rdx);
7866   movq(carry, product);
7867 
7868   bind(L_post_third_loop_done);
7869 }
7870 
7871 /**
7872  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
7873  *
7874  */
7875 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
7876                                                   Register carry, Register carry2,
7877                                                   Register idx, Register jdx,
7878                                                   Register yz_idx1, Register yz_idx2,
7879                                                   Register tmp, Register tmp3, Register tmp4) {
7880   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
7881 
7882   //   jlong carry, x[], y[], z[];
7883   //   int kdx = ystart+1;
7884   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7885   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
7886   //     jlong carry2  = (jlong)(tmp3 >>> 64);
7887   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
7888   //     carry  = (jlong)(tmp4 >>> 64);
7889   //     z[kdx+idx+1] = (jlong)tmp3;
7890   //     z[kdx+idx] = (jlong)tmp4;
7891   //   }
7892   //   idx += 2;
7893   //   if (idx > 0) {
7894   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
7895   //     z[kdx+idx] = (jlong)yz_idx1;
7896   //     carry  = (jlong)(yz_idx1 >>> 64);
7897   //   }
7898   //
7899 
7900   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7901 
7902   movl(jdx, idx);
7903   andl(jdx, 0xFFFFFFFC);
7904   shrl(jdx, 2);
7905 
7906   bind(L_third_loop);
7907   subl(jdx, 1);
7908   jcc(Assembler::negative, L_third_loop_exit);
7909   subl(idx, 4);
7910 
7911   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
7912   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
7913   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
7914   rorxq(yz_idx2, yz_idx2, 32);
7915 
7916   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
7917   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
7918 
7919   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
7920   rorxq(yz_idx1, yz_idx1, 32);
7921   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7922   rorxq(yz_idx2, yz_idx2, 32);
7923 
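  // With ADX, two independent carry chains are kept in flight: adcxq reads and
  // writes only CF while adoxq uses only OF, so the two 64-bit accumulations
  // below do not serialize on a single carry flag.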
7924   if (VM_Version::supports_adx()) {
7925     adcxq(tmp3, carry);
7926     adoxq(tmp3, yz_idx1);
7927 
7928     adcxq(tmp4, tmp);
7929     adoxq(tmp4, yz_idx2);
7930 
7931     movl(carry, 0); // does not affect flags
7932     adcxq(carry2, carry);
7933     adoxq(carry2, carry);
7934   } else {
7935     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
7936     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
7937   }
7938   movq(carry, carry2);
7939 
7940   movl(Address(z, idx, Address::times_4, 12), tmp3);
7941   shrq(tmp3, 32);
7942   movl(Address(z, idx, Address::times_4,  8), tmp3);
7943 
7944   movl(Address(z, idx, Address::times_4,  4), tmp4);
7945   shrq(tmp4, 32);
7946   movl(Address(z, idx, Address::times_4,  0), tmp4);
7947 
7948   jmp(L_third_loop);
7949 
7950   bind (L_third_loop_exit);
7951 
7952   andl (idx, 0x3);
7953   jcc(Assembler::zero, L_post_third_loop_done);
7954 
7955   Label L_check_1;
7956   subl(idx, 2);
7957   jcc(Assembler::negative, L_check_1);
7958 
7959   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
7960   rorxq(yz_idx1, yz_idx1, 32);
7961   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
7962   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7963   rorxq(yz_idx2, yz_idx2, 32);
7964 
7965   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
7966 
7967   movl(Address(z, idx, Address::times_4,  4), tmp3);
7968   shrq(tmp3, 32);
7969   movl(Address(z, idx, Address::times_4,  0), tmp3);
7970   movq(carry, tmp4);
7971 
7972   bind (L_check_1);
7973   addl (idx, 0x2);
7974   andl (idx, 0x1);
7975   subl(idx, 1);
7976   jcc(Assembler::negative, L_post_third_loop_done);
7977   movl(tmp4, Address(y, idx, Address::times_4,  0));
7978   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
7979   movl(tmp4, Address(z, idx, Address::times_4,  0));
7980 
7981   add2_with_carry(carry2, tmp3, tmp4, carry);
7982 
7983   movl(Address(z, idx, Address::times_4,  0), tmp3);
7984   shrq(tmp3, 32);
7985 
7986   shlq(carry2, 32);
7987   orq(tmp3, carry2);
7988   movq(carry, tmp3);
7989 
7990   bind(L_post_third_loop_done);
7991 }
7992 
7993 /**
7994  * Code for BigInteger::multiplyToLen() intrinsic.
7995  *
7996  * rdi: x
7997  * rax: xlen
7998  * rsi: y
7999  * rcx: ylen
8000  * r8:  z
8001  * r11: zlen
8002  * r12: tmp1
8003  * r13: tmp2
8004  * r14: tmp3
8005  * r15: tmp4
8006  * rbx: tmp5
8007  *
8008  */
8009 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
8010                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
8011   ShortBranchVerifier sbv(this);
8012   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
8013 
8014   push(tmp1);
8015   push(tmp2);
8016   push(tmp3);
8017   push(tmp4);
8018   push(tmp5);
8019 
8020   push(xlen);
8021   push(zlen);
8022 
8023   const Register idx = tmp1;
8024   const Register kdx = tmp2;
8025   const Register xstart = tmp3;
8026 
8027   const Register y_idx = tmp4;
8028   const Register carry = tmp5;
8029   const Register product  = xlen;
8030   const Register x_xstart = zlen;  // reuse register
8031 
8032   // First Loop.
8033   //
8034   //  final static long LONG_MASK = 0xffffffffL;
8035   //  int xstart = xlen - 1;
8036   //  int ystart = ylen - 1;
8037   //  long carry = 0;
8038   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
8039   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
8040   //    z[kdx] = (int)product;
8041   //    carry = product >>> 32;
8042   //  }
8043   //  z[xstart] = (int)carry;
8044   //
8045 
8046   movl(idx, ylen);      // idx = ylen;
8047   movl(kdx, zlen);      // kdx = xlen+ylen;
8048   xorq(carry, carry);   // carry = 0;
8049 
8050   Label L_done;
8051 
8052   movl(xstart, xlen);
8053   decrementl(xstart);
8054   jcc(Assembler::negative, L_done);
8055 
8056   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
8057 
8058   Label L_second_loop;
8059   testl(kdx, kdx);
8060   jcc(Assembler::zero, L_second_loop);
8061 
8062   Label L_carry;
8063   subl(kdx, 1);
8064   jcc(Assembler::zero, L_carry);
8065 
8066   movl(Address(z, kdx, Address::times_4,  0), carry);
8067   shrq(carry, 32);
8068   subl(kdx, 1);
8069 
8070   bind(L_carry);
8071   movl(Address(z, kdx, Address::times_4,  0), carry);
8072 
8073   // Second and third (nested) loops.
8074   //
8075   // for (int i = xstart-1; i >= 0; i--) { // Second loop
8076   //   carry = 0;
8077   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
8078   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
8079   //                    (z[k] & LONG_MASK) + carry;
8080   //     z[k] = (int)product;
8081   //     carry = product >>> 32;
8082   //   }
8083   //   z[i] = (int)carry;
8084   // }
8085   //
8086   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
8087 
8088   const Register jdx = tmp1;
8089 
8090   bind(L_second_loop);
8091   xorl(carry, carry);    // carry = 0;
8092   movl(jdx, ylen);       // j = ystart+1
8093 
8094   subl(xstart, 1);       // i = xstart-1;
8095   jcc(Assembler::negative, L_done);
8096 
8097   push (z);
8098 
8099   Label L_last_x;
8100   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
8101   subl(xstart, 1);       // i = xstart-1;
8102   jcc(Assembler::negative, L_last_x);
8103 
8104   if (UseBMI2Instructions) {
8105     movq(rdx,  Address(x, xstart, Address::times_4,  0));
8106     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
8107   } else {
8108     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
8109     rorq(x_xstart, 32);  // convert big-endian to little-endian
8110   }
8111 
8112   Label L_third_loop_prologue;
8113   bind(L_third_loop_prologue);
8114 
8115   push (x);
8116   push (xstart);
8117   push (ylen);
8118 
8119 
8120   if (UseBMI2Instructions) {
8121     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
8122   } else { // !UseBMI2Instructions
8123     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
8124   }
8125 
8126   pop(ylen);
8127   pop(xlen);
8128   pop(x);
8129   pop(z);
8130 
8131   movl(tmp3, xlen);
8132   addl(tmp3, 1);
8133   movl(Address(z, tmp3, Address::times_4,  0), carry);
8134   subl(tmp3, 1);
8135   jccb(Assembler::negative, L_done);
8136 
8137   shrq(carry, 32);
8138   movl(Address(z, tmp3, Address::times_4,  0), carry);
8139   jmp(L_second_loop);
8140 
8141   // Next infrequent code is moved outside loops.
8142   bind(L_last_x);
8143   if (UseBMI2Instructions) {
8144     movl(rdx, Address(x,  0));
8145   } else {
8146     movl(x_xstart, Address(x,  0));
8147   }
8148   jmp(L_third_loop_prologue);
8149 
8150   bind(L_done);
8151 
8152   pop(zlen);
8153   pop(xlen);
8154 
8155   pop(tmp5);
8156   pop(tmp4);
8157   pop(tmp3);
8158   pop(tmp2);
8159   pop(tmp1);
8160 }
8161 
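/**
 * Code for the vectorizedMismatch intrinsic (jdk.internal.util.ArraysSupport).
 *
 * Compares 'length << log2_array_indxscale' bytes starting at obja and objb and
 * sets 'result' to the element index of the first mismatch, or to -1 when no
 * mismatch is found in the compared range (a summary of the emitted code below,
 * not a restatement of the Java-level contract). Byte indices of mismatches are
 * converted back to element indices by shifting right by log2_array_indxscale.
 */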
8162 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
8163   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
8164   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
8165   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
8166   Label VECTOR8_TAIL, VECTOR4_TAIL;
8167   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
8168   Label SAME_TILL_END, DONE;
8169   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
8170 
8171   //scale is in rcx in both Win64 and Unix
8172   ShortBranchVerifier sbv(this);
8173 
8174   shlq(length);
8175   xorq(result, result);
8176 
8177   if ((UseAVX > 2) &&
8178       VM_Version::supports_avx512vlbw()) {
8179     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
8180 
8181     cmpq(length, 64);
8182     jcc(Assembler::less, VECTOR32_TAIL);
8183     movq(tmp1, length);
8184     andq(tmp1, 0x3F);      // tail count
8185     andq(length, ~(0x3F)); //vector count
8186 
8187     bind(VECTOR64_LOOP);
8188     // AVX512 code to compare 64 byte vectors.
8189     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
8190     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
8191     kortestql(k7, k7);
8192     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
8193     addq(result, 64);
8194     subq(length, 64);
8195     jccb(Assembler::notZero, VECTOR64_LOOP);
8196 
8197     //bind(VECTOR64_TAIL);
8198     testq(tmp1, tmp1);
8199     jcc(Assembler::zero, SAME_TILL_END);
8200 
8201     //bind(VECTOR64_TAIL);
8202     // AVX512 code to compare up to 63 remaining tail bytes.
8203     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
8204     shlxq(tmp2, tmp2, tmp1);
8205     notq(tmp2);
8206     kmovql(k3, tmp2);
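    // tmp2 now has its low 'tmp1' bits set; k3 masks the compare below so that
    // only the remaining tail bytes participate.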
8207 
8208     evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit);
8209     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
8210 
8211     ktestql(k7, k3);
8212     jcc(Assembler::below, SAME_TILL_END);     // no mismatch
8213 
8214     bind(VECTOR64_NOT_EQUAL);
8215     kmovql(tmp1, k7);
8216     notq(tmp1);
8217     tzcntq(tmp1, tmp1);
8218     addq(result, tmp1);
8219     shrq(result);
8220     jmp(DONE);
8221     bind(VECTOR32_TAIL);
8222   }
8223 
8224   cmpq(length, 8);
8225   jcc(Assembler::equal, VECTOR8_LOOP);
8226   jcc(Assembler::less, VECTOR4_TAIL);
8227 
8228   if (UseAVX >= 2) {
8229     Label VECTOR16_TAIL, VECTOR32_LOOP;
8230 
8231     cmpq(length, 16);
8232     jcc(Assembler::equal, VECTOR16_LOOP);
8233     jcc(Assembler::less, VECTOR8_LOOP);
8234 
8235     cmpq(length, 32);
8236     jccb(Assembler::less, VECTOR16_TAIL);
8237 
8238     subq(length, 32);
8239     bind(VECTOR32_LOOP);
8240     vmovdqu(rymm0, Address(obja, result));
8241     vmovdqu(rymm1, Address(objb, result));
8242     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
8243     vptest(rymm2, rymm2);
8244     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
8245     addq(result, 32);
8246     subq(length, 32);
8247     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
8248     addq(length, 32);
8249     jcc(Assembler::equal, SAME_TILL_END);
8250     //falling through if less than 32 bytes left //close the branch here.
8251 
8252     bind(VECTOR16_TAIL);
8253     cmpq(length, 16);
8254     jccb(Assembler::less, VECTOR8_TAIL);
8255     bind(VECTOR16_LOOP);
8256     movdqu(rymm0, Address(obja, result));
8257     movdqu(rymm1, Address(objb, result));
8258     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
8259     ptest(rymm2, rymm2);
8260     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8261     addq(result, 16);
8262     subq(length, 16);
8263     jcc(Assembler::equal, SAME_TILL_END);
8264     //falling through if less than 16 bytes left
8265   } else {//regular intrinsics
8266 
8267     cmpq(length, 16);
8268     jccb(Assembler::less, VECTOR8_TAIL);
8269 
8270     subq(length, 16);
8271     bind(VECTOR16_LOOP);
8272     movdqu(rymm0, Address(obja, result));
8273     movdqu(rymm1, Address(objb, result));
8274     pxor(rymm0, rymm1);
8275     ptest(rymm0, rymm0);
8276     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8277     addq(result, 16);
8278     subq(length, 16);
8279     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
8280     addq(length, 16);
8281     jcc(Assembler::equal, SAME_TILL_END);
8282     //falling through if less than 16 bytes left
8283   }
8284 
8285   bind(VECTOR8_TAIL);
8286   cmpq(length, 8);
8287   jccb(Assembler::less, VECTOR4_TAIL);
8288   bind(VECTOR8_LOOP);
8289   movq(tmp1, Address(obja, result));
8290   movq(tmp2, Address(objb, result));
8291   xorq(tmp1, tmp2);
8292   testq(tmp1, tmp1);
8293   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
8294   addq(result, 8);
8295   subq(length, 8);
8296   jcc(Assembler::equal, SAME_TILL_END);
8297   //falling through if less than 8 bytes left
8298 
8299   bind(VECTOR4_TAIL);
8300   cmpq(length, 4);
8301   jccb(Assembler::less, BYTES_TAIL);
8302   bind(VECTOR4_LOOP);
8303   movl(tmp1, Address(obja, result));
8304   xorl(tmp1, Address(objb, result));
8305   testl(tmp1, tmp1);
8306   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
8307   addq(result, 4);
8308   subq(length, 4);
8309   jcc(Assembler::equal, SAME_TILL_END);
8310   //falling through if less than 4 bytes left
8311 
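  // Fewer than 4 bytes remain here; the byte compare below is fully unrolled
  // (nothing branches back to BYTES_LOOP).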
8312   bind(BYTES_TAIL);
8313   bind(BYTES_LOOP);
8314   load_unsigned_byte(tmp1, Address(obja, result));
8315   load_unsigned_byte(tmp2, Address(objb, result));
8316   xorl(tmp1, tmp2);
8317   testl(tmp1, tmp1);
8318   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8319   decq(length);
8320   jcc(Assembler::zero, SAME_TILL_END);
8321   incq(result);
8322   load_unsigned_byte(tmp1, Address(obja, result));
8323   load_unsigned_byte(tmp2, Address(objb, result));
8324   xorl(tmp1, tmp2);
8325   testl(tmp1, tmp1);
8326   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8327   decq(length);
8328   jcc(Assembler::zero, SAME_TILL_END);
8329   incq(result);
8330   load_unsigned_byte(tmp1, Address(obja, result));
8331   load_unsigned_byte(tmp2, Address(objb, result));
8332   xorl(tmp1, tmp2);
8333   testl(tmp1, tmp1);
8334   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8335   jmp(SAME_TILL_END);
8336 
8337   if (UseAVX >= 2) {
8338     bind(VECTOR32_NOT_EQUAL);
8339     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
8340     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
8341     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
8342     vpmovmskb(tmp1, rymm0);
8343     bsfq(tmp1, tmp1);
8344     addq(result, tmp1);
8345     shrq(result);
8346     jmp(DONE);
8347   }
8348 
8349   bind(VECTOR16_NOT_EQUAL);
8350   if (UseAVX >= 2) {
8351     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
8352     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
8353     pxor(rymm0, rymm2);
8354   } else {
8355     pcmpeqb(rymm2, rymm2);
8356     pxor(rymm0, rymm1);
8357     pcmpeqb(rymm0, rymm1);
8358     pxor(rymm0, rymm2);
8359   }
8360   pmovmskb(tmp1, rymm0);
8361   bsfq(tmp1, tmp1);
8362   addq(result, tmp1);
8363   shrq(result);
8364   jmpb(DONE);
8365 
8366   bind(VECTOR8_NOT_EQUAL);
8367   bind(VECTOR4_NOT_EQUAL);
8368   bsfq(tmp1, tmp1);
8369   shrq(tmp1, 3);
8370   addq(result, tmp1);
8371   bind(BYTES_NOT_EQUAL);
8372   shrq(result);
8373   jmpb(DONE);
8374 
8375   bind(SAME_TILL_END);
8376   mov64(result, -1);
8377 
8378   bind(DONE);
8379 }
8380 
8381 //Helper functions for square_to_len()
8382 
8383 /**
8384  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
8385  * Preserves x and z and modifies rest of the registers.
8386  */
8387 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8388   // Perform square and right shift by 1
8389   // Handle odd xlen case first, then for even xlen do the following
8390   // jlong carry = 0;
8391   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
8392   //     huge_128 product = x[j:j+1] * x[j:j+1];
8393   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
8394   //     z[i+2:i+3] = (jlong)(product >>> 1);
8395   //     carry = (jlong)product;
8396   // }
8397 
8398   xorq(tmp5, tmp5);     // carry
8399   xorq(rdxReg, rdxReg);
8400   xorl(tmp1, tmp1);     // index for x
8401   xorl(tmp4, tmp4);     // index for z
8402 
8403   Label L_first_loop, L_first_loop_exit;
8404 
8405   testl(xlen, 1);
8406   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
8407 
8408   // Square and right shift by 1 the odd element using 32 bit multiply
8409   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
8410   imulq(raxReg, raxReg);
8411   shrq(raxReg, 1);
8412   adcq(tmp5, 0);
8413   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
8414   incrementl(tmp1);
8415   addl(tmp4, 2);
8416 
8417   // Square and  right shift by 1 the rest using 64 bit multiply
8418   bind(L_first_loop);
8419   cmpptr(tmp1, xlen);
8420   jccb(Assembler::equal, L_first_loop_exit);
8421 
8422   // Square
8423   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
8424   rorq(raxReg, 32);    // convert big-endian to little-endian
8425   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
8426 
8427   // Right shift by 1 and save carry
8428   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
8429   rcrq(rdxReg, 1);
8430   rcrq(raxReg, 1);
8431   adcq(tmp5, 0);
8432 
8433   // Store result in z
8434   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
8435   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
8436 
8437   // Update indices for x and z
8438   addl(tmp1, 2);
8439   addl(tmp4, 4);
8440   jmp(L_first_loop);
8441 
8442   bind(L_first_loop_exit);
8443 }
8444 
8445 
8446 /**
8447  * Perform the following multiply add operation using BMI2 instructions
8448  * carry:sum = sum + op1*op2 + carry
8449  * op2 should be in rdx
8450  * op2 is preserved, all other registers are modified
8451  */
8452 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
8453   // assert op2 is rdx
8454   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
8455   addq(sum, carry);
8456   adcq(tmp2, 0);
8457   addq(sum, op1);
8458   adcq(tmp2, 0);
8459   movq(carry, tmp2);
8460 }
8461 
8462 /**
8463  * Perform the following multiply add operation:
8464  * carry:sum = sum + op1*op2 + carry
8465  * Preserves op1, op2 and modifies rest of registers
8466  */
8467 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
8468   // rdx:rax = op1 * op2
8469   movq(raxReg, op2);
8470   mulq(op1);
8471 
8472   //  rdx:rax = sum + carry + rdx:rax
8473   addq(sum, carry);
8474   adcq(rdxReg, 0);
8475   addq(sum, raxReg);
8476   adcq(rdxReg, 0);
8477 
8478   // carry:sum = rdx:sum
8479   movq(carry, rdxReg);
8480 }
8481 
8482 /**
8483  * Add 64 bit long carry into z[] with carry propagation.
8484  * Preserves z and carry register values and modifies rest of registers.
8485  *
8486  */
8487 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
8488   Label L_fourth_loop, L_fourth_loop_exit;
8489 
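  // A sketch of the operation (zlen indexes the 32-bit words of z[]):
  //   zlen -= 2;  z[zlen .. zlen+1] += carry;            // 64-bit add
  //   while (that add carried out && (zlen -= 2) >= 0) {
  //     z[zlen .. zlen+1] += 1;                           // propagate carry upward
  //   }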
8490   movl(tmp1, 1);
8491   subl(zlen, 2);
8492   addq(Address(z, zlen, Address::times_4, 0), carry);
8493 
8494   bind(L_fourth_loop);
8495   jccb(Assembler::carryClear, L_fourth_loop_exit);
8496   subl(zlen, 2);
8497   jccb(Assembler::negative, L_fourth_loop_exit);
8498   addq(Address(z, zlen, Address::times_4, 0), tmp1);
8499   jmp(L_fourth_loop);
8500   bind(L_fourth_loop_exit);
8501 }
8502 
8503 /**
8504  * Shift z[] left by 1 bit.
8505  * Preserves x, len, z and zlen registers and modifies rest of the registers.
8506  *
8507  */
8508 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
8509 
8510   Label L_fifth_loop, L_fifth_loop_exit;
8511 
8512   // Fifth loop
8513   // Perform primitiveLeftShift(z, zlen, 1)
8514 
8515   const Register prev_carry = tmp1;
8516   const Register new_carry = tmp4;
8517   const Register value = tmp2;
8518   const Register zidx = tmp3;
8519 
8520   // int zidx, carry;
8521   // long value;
8522   // carry = 0;
8523   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
8524   //    (carry:value)  = (z[i] << 1) | carry ;
8525   //    z[i] = value;
8526   // }
8527 
8528   movl(zidx, zlen);
8529   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
8530 
8531   bind(L_fifth_loop);
8532   decl(zidx);  // Use decl to preserve carry flag
8533   decl(zidx);
8534   jccb(Assembler::negative, L_fifth_loop_exit);
8535 
8536   if (UseBMI2Instructions) {
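     // rclq rotates left through CF: the previous iteration's carry enters the
     // low bit and this word's shifted-out top bit is left in CF for the next
     // iteration; rorxq(32) (which does not touch flags) restores the
     // big-endian int ordering used by z[].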
8537      movq(value, Address(z, zidx, Address::times_4, 0));
8538      rclq(value, 1);
8539      rorxq(value, value, 32);
8540      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
8541   }
8542   else {
8543     // clear new_carry
8544     xorl(new_carry, new_carry);
8545 
8546     // Shift z[i] by 1, or in previous carry and save new carry
8547     movq(value, Address(z, zidx, Address::times_4, 0));
8548     shlq(value, 1);
8549     adcl(new_carry, 0);
8550 
8551     orq(value, prev_carry);
8552     rorq(value, 0x20);
8553     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
8554 
8555     // Set previous carry = new carry
8556     movl(prev_carry, new_carry);
8557   }
8558   jmp(L_fifth_loop);
8559 
8560   bind(L_fifth_loop_exit);
8561 }
8562 
8563 
8564 /**
8565  * Code for BigInteger::squareToLen() intrinsic
8566  *
8567  * rdi: x
8568  * rsi: len
8569  * r8:  z
8570  * rcx: zlen
8571  * r12: tmp1
8572  * r13: tmp2
8573  * r14: tmp3
8574  * r15: tmp4
8575  * rbx: tmp5
8576  *
8577  */
8578 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8579 
8580   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
8581   push(tmp1);
8582   push(tmp2);
8583   push(tmp3);
8584   push(tmp4);
8585   push(tmp5);
8586 
8587   // First loop
8588   // Store the squares, right shifted one bit (i.e., divided by 2).
8589   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
8590 
8591   // Add in off-diagonal sums.
8592   //
8593   // Second, third (nested) and fourth loops.
8594   // zlen +=2;
8595   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
8596   //    carry = 0;
8597   //    long op2 = x[xidx:xidx+1];
8598   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
8599   //       k -= 2;
8600   //       long op1 = x[j:j+1];
8601   //       long sum = z[k:k+1];
8602   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
8603   //       z[k:k+1] = sum;
8604   //    }
8605   //    add_one_64(z, k, carry, tmp_regs);
8606   // }
8607 
8608   const Register carry = tmp5;
8609   const Register sum = tmp3;
8610   const Register op1 = tmp4;
8611   Register op2 = tmp2;
8612 
8613   push(zlen);
8614   push(len);
8615   addl(zlen,2);
8616   bind(L_second_loop);
8617   xorq(carry, carry);
8618   subl(zlen, 4);
8619   subl(len, 2);
8620   push(zlen);
8621   push(len);
8622   cmpl(len, 0);
8623   jccb(Assembler::lessEqual, L_second_loop_exit);
8624 
8625   // Multiply an array by one 64 bit long.
8626   if (UseBMI2Instructions) {
8627     op2 = rdxReg;
8628     movq(op2, Address(x, len, Address::times_4,  0));
8629     rorxq(op2, op2, 32);
8630   }
8631   else {
8632     movq(op2, Address(x, len, Address::times_4,  0));
8633     rorq(op2, 32);
8634   }
8635 
8636   bind(L_third_loop);
8637   decrementl(len);
8638   jccb(Assembler::negative, L_third_loop_exit);
8639   decrementl(len);
8640   jccb(Assembler::negative, L_last_x);
8641 
8642   movq(op1, Address(x, len, Address::times_4,  0));
8643   rorq(op1, 32);
8644 
8645   bind(L_multiply);
8646   subl(zlen, 2);
8647   movq(sum, Address(z, zlen, Address::times_4,  0));
8648 
8649   // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
8650   if (UseBMI2Instructions) {
8651     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
8652   }
8653   else {
8654     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8655   }
8656 
8657   movq(Address(z, zlen, Address::times_4, 0), sum);
8658 
8659   jmp(L_third_loop);
8660   bind(L_third_loop_exit);
8661 
8662   // Fourth loop
8663   // Add 64 bit long carry into z with carry propagation.
8664   // Uses the already-adjusted (offset) zlen.
8665   add_one_64(z, zlen, carry, tmp1);
8666 
8667   pop(len);
8668   pop(zlen);
8669   jmp(L_second_loop);
8670 
8671   // Next infrequent code is moved outside loops.
8672   bind(L_last_x);
8673   movl(op1, Address(x, 0));
8674   jmp(L_multiply);
8675 
8676   bind(L_second_loop_exit);
8677   pop(len);
8678   pop(zlen);
8679   pop(len);
8680   pop(zlen);
8681 
8682   // Fifth loop
8683   // Shift z left 1 bit.
8684   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
8685 
8686   // z[zlen-1] |= x[len-1] & 1;
8687   movl(tmp3, Address(x, len, Address::times_4, -4));
8688   andl(tmp3, 1);
8689   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
8690 
8691   pop(tmp5);
8692   pop(tmp4);
8693   pop(tmp3);
8694   pop(tmp2);
8695   pop(tmp1);
8696 }
8697 
8698 /**
8699  * Helper function for mul_add()
8700  * Multiply the in[] by int k and add to out[] starting at offset offs using
8701  * 128 bit by 32 bit multiply and return the carry in tmp5.
8702  * Only the quad-int-aligned portion of in[] (i.e. len rounded down to a multiple of 4 ints) is processed here.
8703  * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
8704  * This function preserves out, in and k registers.
8705  * len and offset point to the appropriate indices into "in" and "out" respectively
8706  * tmp5 has the carry.
8707  * other registers are temporary and are modified.
8708  *
8709  */
8710 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
8711   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
8712   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8713 
8714   Label L_first_loop, L_first_loop_exit;
8715 
8716   movl(tmp1, len);
8717   shrl(tmp1, 2);
8718 
8719   bind(L_first_loop);
8720   subl(tmp1, 1);
8721   jccb(Assembler::negative, L_first_loop_exit);
8722 
8723   subl(len, 4);
8724   subl(offset, 4);
8725 
8726   Register op2 = tmp2;
8727   const Register sum = tmp3;
8728   const Register op1 = tmp4;
8729   const Register carry = tmp5;
8730 
8731   if (UseBMI2Instructions) {
8732     op2 = rdxReg;
8733   }
8734 
8735   movq(op1, Address(in, len, Address::times_4,  8));
8736   rorq(op1, 32);
8737   movq(sum, Address(out, offset, Address::times_4,  8));
8738   rorq(sum, 32);
8739   if (UseBMI2Instructions) {
8740     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8741   }
8742   else {
8743     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8744   }
8745   // Store back in big endian from little endian
8746   rorq(sum, 0x20);
8747   movq(Address(out, offset, Address::times_4,  8), sum);
8748 
8749   movq(op1, Address(in, len, Address::times_4,  0));
8750   rorq(op1, 32);
8751   movq(sum, Address(out, offset, Address::times_4,  0));
8752   rorq(sum, 32);
8753   if (UseBMI2Instructions) {
8754     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8755   }
8756   else {
8757     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8758   }
8759   // Store back in big endian from little endian
8760   rorq(sum, 0x20);
8761   movq(Address(out, offset, Address::times_4,  0), sum);
8762 
8763   jmp(L_first_loop);
8764   bind(L_first_loop_exit);
8765 }
8766 
8767 /**
8768  * Code for BigInteger::mulAdd() intrinsic
8769  *
8770  * rdi: out
8771  * rsi: in
8772  * r11: offs (out.length - offset)
8773  * rcx: len
8774  * r8:  k
8775  * r12: tmp1
8776  * r13: tmp2
8777  * r14: tmp3
8778  * r15: tmp4
8779  * rbx: tmp5
8780  * Multiply the in[] by word k and add to out[], return the carry in rax
8781  */
8782 void MacroAssembler::mul_add(Register out, Register in, Register offs,
8783    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
8784    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8785 
8786   Label L_carry, L_last_in, L_done;
8787 
8788 // carry = 0;
8789 // for (int j=len-1; j >= 0; j--) {
8790 //    long product = (in[j] & LONG_MASK) * kLong +
8791 //                   (out[offs] & LONG_MASK) + carry;
8792 //    out[offs--] = (int)product;
8793 //    carry = product >>> 32;
8794 // }
8795 //
8796   push(tmp1);
8797   push(tmp2);
8798   push(tmp3);
8799   push(tmp4);
8800   push(tmp5);
8801 
8802   Register op2 = tmp2;
8803   const Register sum = tmp3;
8804   const Register op1 = tmp4;
8805   const Register carry =  tmp5;
8806 
8807   if (UseBMI2Instructions) {
8808     op2 = rdxReg;
8809   }
8810   movl(op2, k);
8814 
8815   xorq(carry, carry);
8816 
8817   //First loop
8818 
8819   //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
8820   //The carry is in tmp5
8821   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
8822 
8823   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
8824   decrementl(len);
8825   jccb(Assembler::negative, L_carry);
8826   decrementl(len);
8827   jccb(Assembler::negative, L_last_in);
8828 
8829   movq(op1, Address(in, len, Address::times_4,  0));
8830   rorq(op1, 32);
8831 
8832   subl(offs, 2);
8833   movq(sum, Address(out, offs, Address::times_4,  0));
8834   rorq(sum, 32);
8835 
8836   if (UseBMI2Instructions) {
8837     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8838   }
8839   else {
8840     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8841   }
8842 
8843   // Store back in big endian from little endian
8844   rorq(sum, 0x20);
8845   movq(Address(out, offs, Address::times_4,  0), sum);
8846 
8847   testl(len, len);
8848   jccb(Assembler::zero, L_carry);
8849 
8850   //Multiply the last in[] entry, if any
8851   bind(L_last_in);
8852   movl(op1, Address(in, 0));
8853   movl(sum, Address(out, offs, Address::times_4,  -4));
8854 
8855   movl(raxReg, k);
8856   mull(op1); //tmp4 * eax -> edx:eax
8857   addl(sum, carry);
8858   adcl(rdxReg, 0);
8859   addl(sum, raxReg);
8860   adcl(rdxReg, 0);
8861   movl(carry, rdxReg);
8862 
8863   movl(Address(out, offs, Address::times_4,  -4), sum);
8864 
8865   bind(L_carry);
8866   //return tmp5/carry as carry in rax
8867   movl(rax, carry);
8868 
8869   bind(L_done);
8870   pop(tmp5);
8871   pop(tmp4);
8872   pop(tmp3);
8873   pop(tmp2);
8874   pop(tmp1);
8875 }
8876 #endif
8877 
8878 /**
8879  * Emits code to update CRC-32 with a byte value according to constants in table
8880  *
8881  * @param [in,out]crc   Register containing the crc.
8882  * @param [in]val       Register containing the byte to fold into the CRC.
8883  * @param [in]table     Register containing the table of crc constants.
8884  *
8885  * uint32_t crc;
8886  * val = crc_table[(val ^ crc) & 0xFF];
8887  * crc = val ^ (crc >> 8);
8888  *
8889  */
8890 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
8891   xorl(val, crc);
8892   andl(val, 0xFF);
8893   shrl(crc, 8); // unsigned shift
8894   xorl(crc, Address(table, val, Address::times_4, 0));
8895 }
8896 
8897 /**
8898 * Fold four 128-bit data chunks
8899 */
8900 void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
8901   evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // [123:64]
8902   evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // [63:0]
8903   evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */);
8904   evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */);
8905 }
8906 
8907 /**
8908  * Fold 128-bit data chunk
8909  */
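// One folding step (a sketch): the high and low 64-bit halves of the running
// 128-bit value xcrc are each carry-less multiplied by the matching folding
// constant in xK, and the two products are xored together with the next 16
// bytes of input:
//   xcrc = clmul_hi(xK, xcrc) ^ clmul_lo(xK, xcrc) ^ load128(buf + offset)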
8910 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
8911   if (UseAVX > 0) {
8912     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
8913     vpclmulldq(xcrc, xK, xcrc); // [63:0]
8914     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
8915     pxor(xcrc, xtmp);
8916   } else {
8917     movdqa(xtmp, xcrc);
8918     pclmulhdq(xtmp, xK);   // [127:64]
8919     pclmulldq(xcrc, xK);   // [63:0]
8920     pxor(xcrc, xtmp);
8921     movdqu(xtmp, Address(buf, offset));
8922     pxor(xcrc, xtmp);
8923   }
8924 }
8925 
8926 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
8927   if (UseAVX > 0) {
8928     vpclmulhdq(xtmp, xK, xcrc);
8929     vpclmulldq(xcrc, xK, xcrc);
8930     pxor(xcrc, xbuf);
8931     pxor(xcrc, xtmp);
8932   } else {
8933     movdqa(xtmp, xcrc);
8934     pclmulhdq(xtmp, xK);
8935     pclmulldq(xcrc, xK);
8936     pxor(xcrc, xbuf);
8937     pxor(xcrc, xtmp);
8938   }
8939 }
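
// All three fold helpers above perform the same algebraic step: the running
// 128-bit remainder is carry-less multiplied by a precomputed constant pair K
// and xor-ed with the next 16 data bytes. A minimal scalar sketch, assuming a
// clmul64() primitive (e.g. wrapping PCLMULQDQ); names are illustrative:
//
//   struct u128 { uint64_t lo, hi; };
//   u128 clmul64(uint64_t a, uint64_t b);   // 64x64 -> 128-bit carry-less multiply
//
//   u128 fold_128(u128 crc, u128 K, u128 data) {
//     u128 l = clmul64(crc.lo, K.lo);       // pclmulldq: low halves
//     u128 h = clmul64(crc.hi, K.hi);       // pclmulhdq: high halves
//     u128 r;
//     r.lo = l.lo ^ h.lo ^ data.lo;
//     r.hi = l.hi ^ h.hi ^ data.hi;
//     return r;
//   }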
8940 
8941 /**
8942  * 8-bit folds to compute 32-bit CRC
8943  *
8944  * uint64_t xcrc;
8945  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
8946  */
8947 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
8948   movdl(tmp, xcrc);
8949   andl(tmp, 0xFF);
8950   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
8951   psrldq(xcrc, 1); // unsigned shift one byte
8952   pxor(xcrc, xtmp);
8953 }
8954 
8955 /**
8956  * uint32_t crc;
8957  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
8958  */
8959 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
8960   movl(tmp, crc);
8961   andl(tmp, 0xFF);
8962   shrl(crc, 8);
8963   xorl(crc, Address(table, tmp, Address::times_4, 0));
8964 }
8965 
8966 /**
8967  * @param crc   register containing existing CRC (32-bit)
8968  * @param buf   register pointing to input byte buffer (byte*)
8969  * @param len   register containing number of bytes
8970  * @param table register that will contain address of CRC table
8971  * @param tmp   scratch register
8972  */
8973 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
8974   assert_different_registers(crc, buf, len, table, tmp, rax);
8975 
8976   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8977   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8978 
8979   // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
8980   // context for the registers used, since all instructions below operate in 128-bit mode.
8981   // On EVEX without VL and BW, these instructions will all be AVX.
8982   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
8983   notl(crc); // ~crc
8984   cmpl(len, 16);
8985   jcc(Assembler::less, L_tail);
8986 
8987   // Align buffer to 16 bytes
8988   movl(tmp, buf);
8989   andl(tmp, 0xF);
8990   jccb(Assembler::zero, L_aligned);
8991   subl(tmp,  16);
8992   addl(len, tmp);
8993 
8994   align(4);
8995   BIND(L_align_loop);
8996   movsbl(rax, Address(buf, 0)); // load byte with sign extension
8997   update_byte_crc32(crc, rax, table);
8998   increment(buf);
8999   incrementl(tmp);
9000   jccb(Assembler::less, L_align_loop);
9001 
9002   BIND(L_aligned);
9003   movl(tmp, len); // save
9004   shrl(len, 4);
9005   jcc(Assembler::zero, L_tail_restore);
9006 
9007   // Fold total 512 bits of polynomial on each iteration
9008   if (VM_Version::supports_vpclmulqdq()) {
9009     Label Parallel_loop, L_No_Parallel;
9010 
9011     cmpl(len, 8);
9012     jccb(Assembler::less, L_No_Parallel);
9013 
9014     movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
9015     evmovdquq(xmm1, Address(buf, 0), Assembler::AVX_512bit);
9016     movdl(xmm5, crc);
9017     evpxorq(xmm1, xmm1, xmm5, Assembler::AVX_512bit);
9018     addptr(buf, 64);
9019     subl(len, 7);
9020     evshufi64x2(xmm0, xmm0, xmm0, 0x00, Assembler::AVX_512bit); //propagate the mask from 128 bits to 512 bits
9021 
9022     BIND(Parallel_loop);
9023     fold_128bit_crc32_avx512(xmm1, xmm0, xmm5, buf, 0);
9024     addptr(buf, 64);
9025     subl(len, 4);
9026     jcc(Assembler::greater, Parallel_loop);
9027 
9028     vextracti64x2(xmm2, xmm1, 0x01);
9029     vextracti64x2(xmm3, xmm1, 0x02);
9030     vextracti64x2(xmm4, xmm1, 0x03);
9031     jmp(L_fold_512b);
9032 
9033     BIND(L_No_Parallel);
9034   }
9035   // Fold crc into first bytes of vector
9036   movdqa(xmm1, Address(buf, 0));
9037   movdl(rax, xmm1);
9038   xorl(crc, rax);
9039   if (VM_Version::supports_sse4_1()) {
9040     pinsrd(xmm1, crc, 0);
9041   } else {
9042     pinsrw(xmm1, crc, 0);
9043     shrl(crc, 16);
9044     pinsrw(xmm1, crc, 1);
9045   }
9046   addptr(buf, 16);
9047   subl(len, 4); // len > 0
9048   jcc(Assembler::less, L_fold_tail);
9049 
9050   movdqa(xmm2, Address(buf,  0));
9051   movdqa(xmm3, Address(buf, 16));
9052   movdqa(xmm4, Address(buf, 32));
9053   addptr(buf, 48);
9054   subl(len, 3);
9055   jcc(Assembler::lessEqual, L_fold_512b);
9056 
9057   // Fold total 512 bits of polynomial on each iteration,
9058   // 128 bits per each of 4 parallel streams.
9059   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
9060 
9061   align(32);
9062   BIND(L_fold_512b_loop);
9063   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
9064   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
9065   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
9066   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
9067   addptr(buf, 64);
9068   subl(len, 4);
9069   jcc(Assembler::greater, L_fold_512b_loop);
9070 
9071   // Fold 512 bits to 128 bits.
9072   BIND(L_fold_512b);
9073   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
9074   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
9075   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
9076   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
9077 
9078   // Fold the rest of 128 bits data chunks
9079   BIND(L_fold_tail);
9080   addl(len, 3);
9081   jccb(Assembler::lessEqual, L_fold_128b);
9082   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
9083 
9084   BIND(L_fold_tail_loop);
9085   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
9086   addptr(buf, 16);
9087   decrementl(len);
9088   jccb(Assembler::greater, L_fold_tail_loop);
9089 
9090   // Fold 128 bits in xmm1 down into 32 bits in crc register.
9091   BIND(L_fold_128b);
9092   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
9093   if (UseAVX > 0) {
9094     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
9095     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
9096     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
9097   } else {
9098     movdqa(xmm2, xmm0);
9099     pclmulqdq(xmm2, xmm1, 0x1);
9100     movdqa(xmm3, xmm0);
9101     pand(xmm3, xmm2);
9102     pclmulqdq(xmm0, xmm3, 0x1);
9103   }
9104   psrldq(xmm1, 8);
9105   psrldq(xmm2, 4);
9106   pxor(xmm0, xmm1);
9107   pxor(xmm0, xmm2);
9108 
9109   // 8 8-bit folds to compute 32-bit CRC.
9110   for (int j = 0; j < 4; j++) {
9111     fold_8bit_crc32(xmm0, table, xmm1, rax);
9112   }
9113   movdl(crc, xmm0); // mov 32 bits to general register
9114   for (int j = 0; j < 4; j++) {
9115     fold_8bit_crc32(crc, table, rax);
9116   }
9117 
9118   BIND(L_tail_restore);
9119   movl(len, tmp); // restore
9120   BIND(L_tail);
9121   andl(len, 0xf);
9122   jccb(Assembler::zero, L_exit);
9123 
9124   // Fold the rest of bytes
9125   align(4);
9126   BIND(L_tail_loop);
9127   movsbl(rax, Address(buf, 0)); // load byte with sign extension
9128   update_byte_crc32(crc, rax, table);
9129   increment(buf);
9130   decrementl(len);
9131   jccb(Assembler::greater, L_tail_loop);
9132 
9133   BIND(L_exit);
9134   notl(crc); // ~crc
9135 }
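
// The scalar prologue and tail paths above compute the classic table-driven
// CRC-32. A hedged C sketch (using a single 256-entry table slice; names are
// illustrative, the real table also holds the folding constants):
//
//   uint32_t crc32_bytes(uint32_t crc, const uint8_t* buf, size_t len,
//                        const uint32_t* table) {
//     crc = ~crc;
//     for (size_t i = 0; i < len; i++) {
//       crc = table[(crc ^ buf[i]) & 0xFF] ^ (crc >> 8);   // update_byte_crc32
//     }
//     return ~crc;
//   }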
9136 
9137 #ifdef _LP64
9138 // S. Gueron / Information Processing Letters 112 (2012) 184
9139 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
9140 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
9141 // Output: the 64-bit carry-less product of B * CONST
9142 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
9143                                      Register tmp1, Register tmp2, Register tmp3) {
9144   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9145   if (n > 0) {
9146     addq(tmp3, n * 256 * 8);
9147   }
9148   //    Q1 = TABLEExt[n][B & 0xFF];
9149   movl(tmp1, in);
9150   andl(tmp1, 0x000000FF);
9151   shll(tmp1, 3);
9152   addq(tmp1, tmp3);
9153   movq(tmp1, Address(tmp1, 0));
9154 
9155   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
9156   movl(tmp2, in);
9157   shrl(tmp2, 8);
9158   andl(tmp2, 0x000000FF);
9159   shll(tmp2, 3);
9160   addq(tmp2, tmp3);
9161   movq(tmp2, Address(tmp2, 0));
9162 
9163   shlq(tmp2, 8);
9164   xorq(tmp1, tmp2);
9165 
9166   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
9167   movl(tmp2, in);
9168   shrl(tmp2, 16);
9169   andl(tmp2, 0x000000FF);
9170   shll(tmp2, 3);
9171   addq(tmp2, tmp3);
9172   movq(tmp2, Address(tmp2, 0));
9173 
9174   shlq(tmp2, 16);
9175   xorq(tmp1, tmp2);
9176 
9177   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
9178   shrl(in, 24);
9179   andl(in, 0x000000FF);
9180   shll(in, 3);
9181   addq(in, tmp3);
9182   movq(in, Address(in, 0));
9183 
9184   shlq(in, 24);
9185   xorq(in, tmp1);
9186   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9187 }
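
// A hedged C sketch of the table-lookup carry-less multiply emitted above
// (TABLE stands for the crc32c table viewed as 256 64-bit entries per
// precomputed constant; names are illustrative):
//
//   uint64_t clmul_by_const(uint32_t B, const uint64_t (*TABLE)[256], uint32_t n) {
//     uint64_t Q1 = TABLE[n][ B        & 0xFF];
//     uint64_t Q2 = TABLE[n][(B >>  8) & 0xFF];
//     uint64_t Q3 = TABLE[n][(B >> 16) & 0xFF];
//     uint64_t Q4 = TABLE[n][(B >> 24) & 0xFF];
//     return Q1 ^ (Q2 << 8) ^ (Q3 << 16) ^ (Q4 << 24);
//   }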
9188 
9189 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9190                                       Register in_out,
9191                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9192                                       XMMRegister w_xtmp2,
9193                                       Register tmp1,
9194                                       Register n_tmp2, Register n_tmp3) {
9195   if (is_pclmulqdq_supported) {
9196     movdl(w_xtmp1, in_out); // modified blindly
9197 
9198     movl(tmp1, const_or_pre_comp_const_index);
9199     movdl(w_xtmp2, tmp1);
9200     pclmulqdq(w_xtmp1, w_xtmp2, 0);
9201 
9202     movdq(in_out, w_xtmp1);
9203   } else {
9204     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
9205   }
9206 }
9207 
9208 // Recombination Alternative 2: No bit-reflections
9209 // T1 = (CRC_A * U1) << 1
9210 // T2 = (CRC_B * U2) << 1
9211 // C1 = T1 >> 32
9212 // C2 = T2 >> 32
9213 // T1 = T1 & 0xFFFFFFFF
9214 // T2 = T2 & 0xFFFFFFFF
9215 // T1 = CRC32(0, T1)
9216 // T2 = CRC32(0, T2)
9217 // C1 = C1 ^ T1
9218 // C2 = C2 ^ T2
9219 // CRC = C1 ^ C2 ^ CRC_C
9220 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
9221                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9222                                      Register tmp1, Register tmp2,
9223                                      Register n_tmp3) {
9224   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9225   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9226   shlq(in_out, 1);
9227   movl(tmp1, in_out);
9228   shrq(in_out, 32);
9229   xorl(tmp2, tmp2);
9230   crc32(tmp2, tmp1, 4);
9231   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
9232   shlq(in1, 1);
9233   movl(tmp1, in1);
9234   shrq(in1, 32);
9235   xorl(tmp2, tmp2);
9236   crc32(tmp2, tmp1, 4);
9237   xorl(in1, tmp2);
9238   xorl(in_out, in1);
9239   xorl(in_out, in2);
9240 }
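
// A hedged C sketch of the recombination above, where t1 and t2 are the already
// shifted carry-less products T1 and T2, and crc32_u32() stands for the 32-bit
// CRC32 instruction (e.g. _mm_crc32_u32); names are illustrative:
//
//   uint32_t recombine(uint64_t t1, uint64_t t2, uint32_t crc_c) {
//     uint32_t c1 = (uint32_t)(t1 >> 32) ^ crc32_u32(0, (uint32_t)t1);
//     uint32_t c2 = (uint32_t)(t2 >> 32) ^ crc32_u32(0, (uint32_t)t2);
//     return c1 ^ c2 ^ crc_c;
//   }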
9241 
9242 // Set N to predefined value
9243 // Subtract from the length of the buffer
9244 // execute in a loop:
9245 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
9246 // for i = 1 to N do
9247 //  CRC_A = CRC32(CRC_A, A[i])
9248 //  CRC_B = CRC32(CRC_B, B[i])
9249 //  CRC_C = CRC32(CRC_C, C[i])
9250 // end for
9251 // Recombine
9252 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9253                                        Register in_out1, Register in_out2, Register in_out3,
9254                                        Register tmp1, Register tmp2, Register tmp3,
9255                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9256                                        Register tmp4, Register tmp5,
9257                                        Register n_tmp6) {
9258   Label L_processPartitions;
9259   Label L_processPartition;
9260   Label L_exit;
9261 
9262   bind(L_processPartitions);
9263   cmpl(in_out1, 3 * size);
9264   jcc(Assembler::less, L_exit);
9265     xorl(tmp1, tmp1);
9266     xorl(tmp2, tmp2);
9267     movq(tmp3, in_out2);
9268     addq(tmp3, size);
9269 
9270     bind(L_processPartition);
9271       crc32(in_out3, Address(in_out2, 0), 8);
9272       crc32(tmp1, Address(in_out2, size), 8);
9273       crc32(tmp2, Address(in_out2, size * 2), 8);
9274       addq(in_out2, 8);
9275       cmpq(in_out2, tmp3);
9276       jcc(Assembler::less, L_processPartition);
9277     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9278             w_xtmp1, w_xtmp2, w_xtmp3,
9279             tmp4, tmp5,
9280             n_tmp6);
9281     addq(in_out2, 2 * size);
9282     subl(in_out1, 3 * size);
9283     jmp(L_processPartitions);
9284 
9285   bind(L_exit);
9286 }
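
// A hedged C outline of the chunked loop above (pointer/length bookkeeping
// simplified; crc32_u64() stands for the 64-bit CRC32 instruction, clmul() and
// recombine() for the helpers sketched earlier; names are illustrative):
//
//   while (len >= 3 * size) {
//     uint32_t crc_a = crc, crc_b = 0, crc_c = 0;
//     for (uint32_t i = 0; i < size; i += 8) {
//       crc_a = crc32_u64(crc_a, load_u64(p + i));
//       crc_b = crc32_u64(crc_b, load_u64(p + size + i));
//       crc_c = crc32_u64(crc_c, load_u64(p + 2 * size + i));
//     }
//     crc  = recombine(clmul(crc_a, U1) << 1, clmul(crc_b, U2) << 1, crc_c);
//     p   += 3 * size;
//     len -= 3 * size;
//   }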
9287 #else
9288 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
9289                                      Register tmp1, Register tmp2, Register tmp3,
9290                                      XMMRegister xtmp1, XMMRegister xtmp2) {
9291   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9292   if (n > 0) {
9293     addl(tmp3, n * 256 * 8);
9294   }
9295   //    Q1 = TABLEExt[n][B & 0xFF];
9296   movl(tmp1, in_out);
9297   andl(tmp1, 0x000000FF);
9298   shll(tmp1, 3);
9299   addl(tmp1, tmp3);
9300   movq(xtmp1, Address(tmp1, 0));
9301 
9302   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
9303   movl(tmp2, in_out);
9304   shrl(tmp2, 8);
9305   andl(tmp2, 0x000000FF);
9306   shll(tmp2, 3);
9307   addl(tmp2, tmp3);
9308   movq(xtmp2, Address(tmp2, 0));
9309 
9310   psllq(xtmp2, 8);
9311   pxor(xtmp1, xtmp2);
9312 
9313   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
9314   movl(tmp2, in_out);
9315   shrl(tmp2, 16);
9316   andl(tmp2, 0x000000FF);
9317   shll(tmp2, 3);
9318   addl(tmp2, tmp3);
9319   movq(xtmp2, Address(tmp2, 0));
9320 
9321   psllq(xtmp2, 16);
9322   pxor(xtmp1, xtmp2);
9323 
9324   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
9325   shrl(in_out, 24);
9326   andl(in_out, 0x000000FF);
9327   shll(in_out, 3);
9328   addl(in_out, tmp3);
9329   movq(xtmp2, Address(in_out, 0));
9330 
9331   psllq(xtmp2, 24);
9332   pxor(xtmp1, xtmp2); // Result in CXMM
9333   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9334 }
9335 
9336 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9337                                       Register in_out,
9338                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9339                                       XMMRegister w_xtmp2,
9340                                       Register tmp1,
9341                                       Register n_tmp2, Register n_tmp3) {
9342   if (is_pclmulqdq_supported) {
9343     movdl(w_xtmp1, in_out);
9344 
9345     movl(tmp1, const_or_pre_comp_const_index);
9346     movdl(w_xtmp2, tmp1);
9347     pclmulqdq(w_xtmp1, w_xtmp2, 0);
9348     // Keep result in XMM since GPR is 32 bit in length
9349   } else {
9350     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
9351   }
9352 }
9353 
9354 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
9355                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9356                                      Register tmp1, Register tmp2,
9357                                      Register n_tmp3) {
9358   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9359   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9360 
9361   psllq(w_xtmp1, 1);
9362   movdl(tmp1, w_xtmp1);
9363   psrlq(w_xtmp1, 32);
9364   movdl(in_out, w_xtmp1);
9365 
9366   xorl(tmp2, tmp2);
9367   crc32(tmp2, tmp1, 4);
9368   xorl(in_out, tmp2);
9369 
9370   psllq(w_xtmp2, 1);
9371   movdl(tmp1, w_xtmp2);
9372   psrlq(w_xtmp2, 32);
9373   movdl(in1, w_xtmp2);
9374 
9375   xorl(tmp2, tmp2);
9376   crc32(tmp2, tmp1, 4);
9377   xorl(in1, tmp2);
9378   xorl(in_out, in1);
9379   xorl(in_out, in2);
9380 }
9381 
9382 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9383                                        Register in_out1, Register in_out2, Register in_out3,
9384                                        Register tmp1, Register tmp2, Register tmp3,
9385                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9386                                        Register tmp4, Register tmp5,
9387                                        Register n_tmp6) {
9388   Label L_processPartitions;
9389   Label L_processPartition;
9390   Label L_exit;
9391 
9392   bind(L_processPartitions);
9393   cmpl(in_out1, 3 * size);
9394   jcc(Assembler::less, L_exit);
9395     xorl(tmp1, tmp1);
9396     xorl(tmp2, tmp2);
9397     movl(tmp3, in_out2);
9398     addl(tmp3, size);
9399 
9400     bind(L_processPartition);
9401       crc32(in_out3, Address(in_out2, 0), 4);
9402       crc32(tmp1, Address(in_out2, size), 4);
9403       crc32(tmp2, Address(in_out2, size*2), 4);
9404       crc32(in_out3, Address(in_out2, 0+4), 4);
9405       crc32(tmp1, Address(in_out2, size+4), 4);
9406       crc32(tmp2, Address(in_out2, size*2+4), 4);
9407       addl(in_out2, 8);
9408       cmpl(in_out2, tmp3);
9409       jcc(Assembler::less, L_processPartition);
9410 
9411         push(tmp3);
9412         push(in_out1);
9413         push(in_out2);
9414         tmp4 = tmp3;      // Reuse the three saved registers as scratch for
9415         tmp5 = in_out1;   // the recombination step below; their original
9416         n_tmp6 = in_out2; // contents are restored from the stack afterwards.
9417 
9418       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9419             w_xtmp1, w_xtmp2, w_xtmp3,
9420             tmp4, tmp5,
9421             n_tmp6);
9422 
9423         pop(in_out2);
9424         pop(in_out1);
9425         pop(tmp3);
9426 
9427     addl(in_out2, 2 * size);
9428     subl(in_out1, 3 * size);
9429     jmp(L_processPartitions);
9430 
9431   bind(L_exit);
9432 }
9433 #endif //LP64
9434 
9435 #ifdef _LP64
9436 // Algorithm 2: Pipelined usage of the CRC32 instruction.
9437 // Input: A buffer I of L bytes.
9438 // Output: the CRC32C value of the buffer.
9439 // Notations:
9440 // Write L = 24N + r, with N = floor (L/24).
9441 // r = L mod 24 (0 <= r < 24).
9442 // Consider I as the concatenation A|B|C|R, where A, B and C each consist of
9443 // N quadwords, and R consists of r bytes.
9444 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
9445 // B[j] = I [8N + 8j+7:8N + 8j], j= 0, 1, ..., N-1
9446 // C[j] = I [16N + 8j+7:16N + 8j], j= 0, 1, ..., N-1
9447 // if r > 0 R[j] = I [24N + j], j= 0, 1, ..., r-1
9448 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
9449                                           Register tmp1, Register tmp2, Register tmp3,
9450                                           Register tmp4, Register tmp5, Register tmp6,
9451                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9452                                           bool is_pclmulqdq_supported) {
9453   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
9454   Label L_wordByWord;
9455   Label L_byteByByteProlog;
9456   Label L_byteByByte;
9457   Label L_exit;
9458 
9459   if (is_pclmulqdq_supported) {
9460     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
9461     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
9462 
9463     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
9464     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
9465 
9466     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
9467     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
9468     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
9469   } else {
9470     const_or_pre_comp_const_index[0] = 1;
9471     const_or_pre_comp_const_index[1] = 0;
9472 
9473     const_or_pre_comp_const_index[2] = 3;
9474     const_or_pre_comp_const_index[3] = 2;
9475 
9476     const_or_pre_comp_const_index[4] = 5;
9477     const_or_pre_comp_const_index[5] = 4;
9478    }
9479   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
9480                     in2, in1, in_out,
9481                     tmp1, tmp2, tmp3,
9482                     w_xtmp1, w_xtmp2, w_xtmp3,
9483                     tmp4, tmp5,
9484                     tmp6);
9485   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
9486                     in2, in1, in_out,
9487                     tmp1, tmp2, tmp3,
9488                     w_xtmp1, w_xtmp2, w_xtmp3,
9489                     tmp4, tmp5,
9490                     tmp6);
9491   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
9492                     in2, in1, in_out,
9493                     tmp1, tmp2, tmp3,
9494                     w_xtmp1, w_xtmp2, w_xtmp3,
9495                     tmp4, tmp5,
9496                     tmp6);
9497   movl(tmp1, in2);
9498   andl(tmp1, 0x00000007);
9499   negl(tmp1);
9500   addl(tmp1, in2);
9501   addq(tmp1, in1);
9502 
9503   BIND(L_wordByWord);
9504   cmpq(in1, tmp1);
9505   jcc(Assembler::greaterEqual, L_byteByByteProlog);
9506     crc32(in_out, Address(in1, 0), 4);
9507     addq(in1, 4);
9508     jmp(L_wordByWord);
9509 
9510   BIND(L_byteByByteProlog);
9511   andl(in2, 0x00000007);
9512   movl(tmp2, 1);
9513 
9514   BIND(L_byteByByte);
9515   cmpl(tmp2, in2);
9516   jccb(Assembler::greater, L_exit);
9517     crc32(in_out, Address(in1, 0), 1);
9518     incq(in1);
9519     incl(tmp2);
9520     jmp(L_byteByByte);
9521 
9522   BIND(L_exit);
9523 }
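
// A hedged outline of the driver above (CRC32C_HIGH/MIDDLE/LOW are the chunk
// sizes used in the calls above; the other helper names are illustrative
// shorthand for the code emitted here):
//
//   uint32_t crc32c_body(uint32_t crc, const uint8_t* p, size_t len) {
//     for (uint32_t chunk : { CRC32C_HIGH, CRC32C_MIDDLE, CRC32C_LOW })
//       crc = process_3way_chunks(crc, &p, &len, chunk);    // crc32c_proc_chunk
//     size_t words = len & ~(size_t)7;                      // word-by-word part
//     for (size_t i = 0; i < words; i += 4) crc = crc32_u32(crc, load_u32(p + i));
//     for (size_t i = words; i < len; i++)  crc = crc32_u8(crc, p[i]);
//     return crc;
//   }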
9524 #else
9525 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
9526                                           Register tmp1, Register  tmp2, Register tmp3,
9527                                           Register tmp4, Register  tmp5, Register tmp6,
9528                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9529                                           bool is_pclmulqdq_supported) {
9530   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
9531   Label L_wordByWord;
9532   Label L_byteByByteProlog;
9533   Label L_byteByByte;
9534   Label L_exit;
9535 
9536   if (is_pclmulqdq_supported) {
9537     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
9538     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
9539 
9540     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
9541     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
9542 
9543     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
9544     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
9545   } else {
9546     const_or_pre_comp_const_index[0] = 1;
9547     const_or_pre_comp_const_index[1] = 0;
9548 
9549     const_or_pre_comp_const_index[2] = 3;
9550     const_or_pre_comp_const_index[3] = 2;
9551 
9552     const_or_pre_comp_const_index[4] = 5;
9553     const_or_pre_comp_const_index[5] = 4;
9554   }
9555   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
9556                     in2, in1, in_out,
9557                     tmp1, tmp2, tmp3,
9558                     w_xtmp1, w_xtmp2, w_xtmp3,
9559                     tmp4, tmp5,
9560                     tmp6);
9561   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
9562                     in2, in1, in_out,
9563                     tmp1, tmp2, tmp3,
9564                     w_xtmp1, w_xtmp2, w_xtmp3,
9565                     tmp4, tmp5,
9566                     tmp6);
9567   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
9568                     in2, in1, in_out,
9569                     tmp1, tmp2, tmp3,
9570                     w_xtmp1, w_xtmp2, w_xtmp3,
9571                     tmp4, tmp5,
9572                     tmp6);
9573   movl(tmp1, in2);
9574   andl(tmp1, 0x00000007);
9575   negl(tmp1);
9576   addl(tmp1, in2);
9577   addl(tmp1, in1);
9578 
9579   BIND(L_wordByWord);
9580   cmpl(in1, tmp1);
9581   jcc(Assembler::greaterEqual, L_byteByByteProlog);
9582     crc32(in_out, Address(in1,0), 4);
9583     addl(in1, 4);
9584     jmp(L_wordByWord);
9585 
9586   BIND(L_byteByByteProlog);
9587   andl(in2, 0x00000007);
9588   movl(tmp2, 1);
9589 
9590   BIND(L_byteByByte);
9591   cmpl(tmp2, in2);
9592   jccb(Assembler::greater, L_exit);
9593     movb(tmp1, Address(in1, 0));
9594     crc32(in_out, tmp1, 1);
9595     incl(in1);
9596     incl(tmp2);
9597     jmp(L_byteByByte);
9598 
9599   BIND(L_exit);
9600 }
9601 #endif // LP64
9602 #undef BIND
9603 #undef BLOCK_COMMENT
9604 
9605 // Compress char[] array to byte[].
9606 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
9607 //   @HotSpotIntrinsicCandidate
9608 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
9609 //     for (int i = 0; i < len; i++) {
9610 //       int c = src[srcOff++];
9611 //       if (c >>> 8 != 0) {
9612 //         return 0;
9613 //       }
9614 //       dst[dstOff++] = (byte)c;
9615 //     }
9616 //     return len;
9617 //   }
9618 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
9619   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
9620   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
9621   Register tmp5, Register result) {
9622   Label copy_chars_loop, return_length, return_zero, done;
9623 
9624   // rsi: src
9625   // rdi: dst
9626   // rdx: len
9627   // rcx: tmp5
9628   // rax: result
9629 
9630   // rsi holds start addr of source char[] to be compressed
9631   // rdi holds start addr of destination byte[]
9632   // rdx holds length
9633 
9634   assert(len != result, "");
9635 
9636   // save length for return
9637   push(len);
9638 
9639   if ((UseAVX > 2) && // AVX512
9640     VM_Version::supports_avx512vlbw() &&
9641     VM_Version::supports_bmi2()) {
9642 
9643     Label copy_32_loop, copy_loop_tail, below_threshold;
9644 
9645     // alignment
9646     Label post_alignment;
9647 
9648     // if the length of the string is less than 32, handle it in an old-fashioned way
9649     testl(len, -32);
9650     jcc(Assembler::zero, below_threshold);
9651 
9652     // First check whether a character is compressible (<= 0xFF).
9653     // Create mask to test for Unicode chars inside zmm vector
9654     movl(result, 0x00FF);
9655     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
9656 
9657     testl(len, -64);
9658     jcc(Assembler::zero, post_alignment);
9659 
9660     movl(tmp5, dst);
9661     andl(tmp5, (32 - 1));
9662     negl(tmp5);
9663     andl(tmp5, (32 - 1));
9664 
9665     // bail out when there is nothing to be done
9666     testl(tmp5, 0xFFFFFFFF);
9667     jcc(Assembler::zero, post_alignment);
9668 
9669     // ~(~0 << len), where len is the # of remaining elements to process
9670     movl(result, 0xFFFFFFFF);
9671     shlxl(result, result, tmp5);
9672     notl(result);
9673     kmovdl(k3, result);
9674 
9675     evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
9676     evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9677     ktestd(k2, k3);
9678     jcc(Assembler::carryClear, return_zero);
9679 
9680     evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
9681 
9682     addptr(src, tmp5);
9683     addptr(src, tmp5); // advance src by 2 * tmp5 bytes in total (chars are 2 bytes wide)
9684     addptr(dst, tmp5);
9685     subl(len, tmp5);
9686 
9687     bind(post_alignment);
9688     // end of alignment
9689 
9690     movl(tmp5, len);
9691     andl(tmp5, (32 - 1));    // tail count (in chars)
9692     andl(len, ~(32 - 1));    // vector count (in chars)
9693     jcc(Assembler::zero, copy_loop_tail);
9694 
9695     lea(src, Address(src, len, Address::times_2));
9696     lea(dst, Address(dst, len, Address::times_1));
9697     negptr(len);
9698 
9699     bind(copy_32_loop);
9700     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
9701     evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9702     kortestdl(k2, k2);
9703     jcc(Assembler::carryClear, return_zero);
9704 
9705     // All elements in the current chunk are valid candidates for
9706     // compression. Write the truncated byte elements to memory.
9707     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
9708     addptr(len, 32);
9709     jcc(Assembler::notZero, copy_32_loop);
9710 
9711     bind(copy_loop_tail);
9712     // bail out when there is nothing to be done
9713     testl(tmp5, 0xFFFFFFFF);
9714     jcc(Assembler::zero, return_length);
9715 
9716     movl(len, tmp5);
9717 
9718     // ~(~0 << len), where len is the # of remaining elements to process
9719     movl(result, 0xFFFFFFFF);
9720     shlxl(result, result, len);
9721     notl(result);
9722 
9723     kmovdl(k3, result);
9724 
9725     evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
9726     evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9727     ktestd(k2, k3);
9728     jcc(Assembler::carryClear, return_zero);
9729 
9730     evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
9731     jmp(return_length);
9732 
9733     bind(below_threshold);
9734   }
9735 
9736   if (UseSSE42Intrinsics) {
9737     Label copy_32_loop, copy_16, copy_tail;
9738 
9739     movl(result, len);
9740 
9741     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
9742 
9743     // vectored compression
9744     andl(len, 0xfffffff0);    // vector count (in chars)
9745     andl(result, 0x0000000f);    // tail count (in chars)
9746     testl(len, len);
9747     jcc(Assembler::zero, copy_16);
9748 
9749     // compress 16 chars per iter
9750     movdl(tmp1Reg, tmp5);
9751     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
9752     pxor(tmp4Reg, tmp4Reg);
9753 
9754     lea(src, Address(src, len, Address::times_2));
9755     lea(dst, Address(dst, len, Address::times_1));
9756     negptr(len);
9757 
9758     bind(copy_32_loop);
9759     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
9760     por(tmp4Reg, tmp2Reg);
9761     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
9762     por(tmp4Reg, tmp3Reg);
9763     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
9764     jcc(Assembler::notZero, return_zero);
9765     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
9766     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
9767     addptr(len, 16);
9768     jcc(Assembler::notZero, copy_32_loop);
9769 
9770     // compress next vector of 8 chars (if any)
9771     bind(copy_16);
9772     movl(len, result);
9773     andl(len, 0xfffffff8);    // vector count (in chars)
9774     andl(result, 0x00000007);    // tail count (in chars)
9775     testl(len, len);
9776     jccb(Assembler::zero, copy_tail);
9777 
9778     movdl(tmp1Reg, tmp5);
9779     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
9780     pxor(tmp3Reg, tmp3Reg);
9781 
9782     movdqu(tmp2Reg, Address(src, 0));
9783     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
9784     jccb(Assembler::notZero, return_zero);
9785     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
9786     movq(Address(dst, 0), tmp2Reg);
9787     addptr(src, 16);
9788     addptr(dst, 8);
9789 
9790     bind(copy_tail);
9791     movl(len, result);
9792   }
9793   // compress 1 char per iter
9794   testl(len, len);
9795   jccb(Assembler::zero, return_length);
9796   lea(src, Address(src, len, Address::times_2));
9797   lea(dst, Address(dst, len, Address::times_1));
9798   negptr(len);
9799 
9800   bind(copy_chars_loop);
9801   load_unsigned_short(result, Address(src, len, Address::times_2));
9802   testl(result, 0xff00);      // check if Unicode char
9803   jccb(Assembler::notZero, return_zero);
9804   movb(Address(dst, len, Address::times_1), result);  // LATIN1 char; compress to 1 byte
9805   increment(len);
9806   jcc(Assembler::notZero, copy_chars_loop);
9807 
9808   // if compression succeeded, return length
9809   bind(return_length);
9810   pop(result);
9811   jmpb(done);
9812 
9813   // if compression failed, return 0
9814   bind(return_zero);
9815   xorl(result, result);
9816   addptr(rsp, wordSize);
9817 
9818   bind(done);
9819 }
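
// A hedged intrinsics sketch of one 16-char step of the SSE4.2 path above
// (requires <smmintrin.h>; the helper name is illustrative):
//
//   #include <smmintrin.h>
//   // Returns false if any of the 16 chars is above 0xFF and cannot be compressed.
//   static bool compress16(const uint16_t* src, uint8_t* dst) {
//     const __m128i mask = _mm_set1_epi16((short)0xFF00);
//     __m128i a = _mm_loadu_si128((const __m128i*)src);        // chars 0..7
//     __m128i b = _mm_loadu_si128((const __m128i*)(src + 8));  // chars 8..15
//     if (!_mm_testz_si128(_mm_or_si128(a, b), mask))          // ptest
//       return false;
//     _mm_storeu_si128((__m128i*)dst, _mm_packus_epi16(a, b)); // packuswb
//     return true;
//   }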
9820 
9821 // Inflate byte[] array to char[].
9822 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
9823 //   @HotSpotIntrinsicCandidate
9824 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
9825 //     for (int i = 0; i < len; i++) {
9826 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
9827 //     }
9828 //   }
9829 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
9830   XMMRegister tmp1, Register tmp2) {
9831   Label copy_chars_loop, done, below_threshold;
9832   // rsi: src
9833   // rdi: dst
9834   // rdx: len
9835   // rcx: tmp2
9836 
9837   // rsi holds start addr of source byte[] to be inflated
9838   // rdi holds start addr of destination char[]
9839   // rdx holds length
9840   assert_different_registers(src, dst, len, tmp2);
9841 
9842   if ((UseAVX > 2) && // AVX512
9843     VM_Version::supports_avx512vlbw() &&
9844     VM_Version::supports_bmi2()) {
9845 
9846     Label copy_32_loop, copy_tail;
9847     Register tmp3_aliased = len;
9848 
9849     // if the length of the string is less than 16, handle it in an old-fashioned way
9850     testl(len, -16);
9851     jcc(Assembler::zero, below_threshold);
9852 
9853     // In order to use only one arithmetic operation inside the main loop,
9854     // we pre-calculate the tail and vector counts here
9855     movl(tmp2, len);
9856     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
9857     andl(len, -32);     // vector count
9858     jccb(Assembler::zero, copy_tail);
9859 
9860     lea(src, Address(src, len, Address::times_1));
9861     lea(dst, Address(dst, len, Address::times_2));
9862     negptr(len);
9863 
9864 
9865     // inflate 32 chars per iter
9866     bind(copy_32_loop);
9867     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
9868     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
9869     addptr(len, 32);
9870     jcc(Assembler::notZero, copy_32_loop);
9871 
9872     bind(copy_tail);
9873     // bail out when there is nothing to be done
9874     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
9875     jcc(Assembler::zero, done);
9876 
9877     // ~(~0 << length), where length is the # of remaining elements to process
9878     movl(tmp3_aliased, -1);
9879     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
9880     notl(tmp3_aliased);
9881     kmovdl(k2, tmp3_aliased);
9882     evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
9883     evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
9884 
9885     jmp(done);
9886   }
9887   if (UseSSE42Intrinsics) {
9888     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
9889 
9890     movl(tmp2, len);
9891 
9892     if (UseAVX > 1) {
9893       andl(tmp2, (16 - 1));
9894       andl(len, -16);
9895       jccb(Assembler::zero, copy_new_tail);
9896     } else {
9897       andl(tmp2, 0x00000007);   // tail count (in chars)
9898       andl(len, 0xfffffff8);    // vector count (in chars)
9899       jccb(Assembler::zero, copy_tail);
9900     }
9901 
9902     // vectored inflation
9903     lea(src, Address(src, len, Address::times_1));
9904     lea(dst, Address(dst, len, Address::times_2));
9905     negptr(len);
9906 
9907     if (UseAVX > 1) {
9908       bind(copy_16_loop);
9909       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
9910       vmovdqu(Address(dst, len, Address::times_2), tmp1);
9911       addptr(len, 16);
9912       jcc(Assembler::notZero, copy_16_loop);
9913 
9914       bind(below_threshold);
9915       bind(copy_new_tail);
9916       if ((UseAVX > 2) &&
9917         VM_Version::supports_avx512vlbw() &&
9918         VM_Version::supports_bmi2()) {
9919         movl(tmp2, len);
9920       } else {
9921         movl(len, tmp2);
9922       }
9923       andl(tmp2, 0x00000007);
9924       andl(len, 0xFFFFFFF8);
9925       jccb(Assembler::zero, copy_tail);
9926 
9927       pmovzxbw(tmp1, Address(src, 0));
9928       movdqu(Address(dst, 0), tmp1);
9929       addptr(src, 8);
9930       addptr(dst, 2 * 8);
9931 
9932       jmp(copy_tail, true);
9933     }
9934 
9935     // inflate 8 chars per iter
9936     bind(copy_8_loop);
9937     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
9938     movdqu(Address(dst, len, Address::times_2), tmp1);
9939     addptr(len, 8);
9940     jcc(Assembler::notZero, copy_8_loop);
9941 
9942     bind(copy_tail);
9943     movl(len, tmp2);
9944 
9945     cmpl(len, 4);
9946     jccb(Assembler::less, copy_bytes);
9947 
9948     movdl(tmp1, Address(src, 0));  // load 4 byte chars
9949     pmovzxbw(tmp1, tmp1);
9950     movq(Address(dst, 0), tmp1);
9951     subptr(len, 4);
9952     addptr(src, 4);
9953     addptr(dst, 8);
9954 
9955     bind(copy_bytes);
9956   } else {
9957     bind(below_threshold);
9958   }
9959 
9960   testl(len, len);
9961   jccb(Assembler::zero, done);
9962   lea(src, Address(src, len, Address::times_1));
9963   lea(dst, Address(dst, len, Address::times_2));
9964   negptr(len);
9965 
9966   // inflate 1 char per iter
9967   bind(copy_chars_loop);
9968   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
9969   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
9970   increment(len);
9971   jcc(Assembler::notZero, copy_chars_loop);
9972 
9973   bind(done);
9974 }
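
// A hedged intrinsics sketch of one 8-char step of the SSE4.1 inflate path above
// (requires <smmintrin.h>; the helper name is illustrative):
//
//   #include <smmintrin.h>
//   static void inflate8(const uint8_t* src, uint16_t* dst) {
//     __m128i b = _mm_loadl_epi64((const __m128i*)src);        // 8 Latin-1 bytes
//     _mm_storeu_si128((__m128i*)dst, _mm_cvtepu8_epi16(b));   // pmovzxbw
//   }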
9975 
9976 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9977   switch (cond) {
9978     // Note some conditions are synonyms for others
9979     case Assembler::zero:         return Assembler::notZero;
9980     case Assembler::notZero:      return Assembler::zero;
9981     case Assembler::less:         return Assembler::greaterEqual;
9982     case Assembler::lessEqual:    return Assembler::greater;
9983     case Assembler::greater:      return Assembler::lessEqual;
9984     case Assembler::greaterEqual: return Assembler::less;
9985     case Assembler::below:        return Assembler::aboveEqual;
9986     case Assembler::belowEqual:   return Assembler::above;
9987     case Assembler::above:        return Assembler::belowEqual;
9988     case Assembler::aboveEqual:   return Assembler::below;
9989     case Assembler::overflow:     return Assembler::noOverflow;
9990     case Assembler::noOverflow:   return Assembler::overflow;
9991     case Assembler::negative:     return Assembler::positive;
9992     case Assembler::positive:     return Assembler::negative;
9993     case Assembler::parity:       return Assembler::noParity;
9994     case Assembler::noParity:     return Assembler::parity;
9995   }
9996   ShouldNotReachHere(); return Assembler::overflow;
9997 }
9998 
9999 SkipIfEqual::SkipIfEqual(
10000     MacroAssembler* masm, const bool* flag_addr, bool value) {
10001   _masm = masm;
10002   _masm->cmp8(ExternalAddress((address)flag_addr), value);
10003   _masm->jcc(Assembler::equal, _label);
10004 }
10005 
10006 SkipIfEqual::~SkipIfEqual() {
10007   _masm->bind(_label);
10008 }
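
// A hedged usage sketch: code emitted inside the scope is skipped at run time
// when the watched flag equals `value` (the flag name below is illustrative):
//
//   {
//     SkipIfEqual skip(masm, &SomeDiagnosticFlag, false);
//     // ... instructions emitted here execute only when the flag is true ...
//   } // the destructor binds the skip-target label here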
10009 
10010 // 32-bit Windows has its own fast-path implementation
10011 // of get_thread
10012 #if !defined(WIN32) || defined(_LP64)
10013 
10014 // This is simply a call to Thread::current()
10015 void MacroAssembler::get_thread(Register thread) {
10016   if (thread != rax) {
10017     push(rax);
10018   }
10019   LP64_ONLY(push(rdi);)
10020   LP64_ONLY(push(rsi);)
10021   push(rdx);
10022   push(rcx);
10023 #ifdef _LP64
10024   push(r8);
10025   push(r9);
10026   push(r10);
10027   push(r11);
10028 #endif
10029 
10030   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
10031 
10032 #ifdef _LP64
10033   pop(r11);
10034   pop(r10);
10035   pop(r9);
10036   pop(r8);
10037 #endif
10038   pop(rcx);
10039   pop(rdx);
10040   LP64_ONLY(pop(rsi);)
10041   LP64_ONLY(pop(rdi);)
10042   if (thread != rax) {
10043     mov(thread, rax);
10044     pop(rax);
10045   }
10046 }
10047 
10048 #endif