/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "ci/ciUtilities.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/barrierSetNMethod.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_x86.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
#define a__ ((Assembler*)_masm)->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    // This can destroy rscratch1 if counter is far from the code cache
    __ incrementl(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Linux Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    16(rbp): parameter size (in words)              int
  //    24(rbp): thread                                 Thread*
  //
  //     [ return_from_Java     ] <--- rsp
  //     [ argument word n      ]
  //      ...
  // -12 [ argument word 1      ]
  // -11 [ saved r15            ] <--- rsp_after_call
  // -10 [ saved r14            ]
  //  -9 [ saved r13            ]
  //  -8 [ saved r12            ]
  //  -7 [ saved rbx            ]
  //  -6 [ call wrapper         ]
  //  -5 [ result               ]
  //  -4 [ result type          ]
  //  -3 [ method               ]
  //  -2 [ entry point          ]
  //  -1 [ parameters           ]
  //   0 [ saved rbp            ] <--- rbp
  //   1 [ return address       ]
  //   2 [ parameter size       ]
  //   3 [ thread               ]
  //
  // Windows Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    48(rbp): (interpreter) entry point              address
  //    56(rbp): parameters                             intptr_t*
  //    64(rbp): parameter size (in words)              int
  //    72(rbp): thread                                 Thread*
  //
  //     [ return_from_Java     ] <--- rsp
  //     [ argument word n      ]
  //      ...
  // -60 [ argument word 1      ]
  // -59 [ saved xmm31          ] <--- rsp_after_call
  //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
  // -27 [ saved xmm15          ]
  //     [ saved xmm7-xmm14     ]
  //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
  //  -7 [ saved r15            ]
  //  -6 [ saved r14            ]
  //  -5 [ saved r13            ]
  //  -4 [ saved r12            ]
  //  -3 [ saved rdi            ]
  //  -2 [ saved rsi            ]
  //  -1 [ saved rbx            ]
  //   0 [ saved rbp            ] <--- rbp
  //   1 [ return address       ]
  //   2 [ call wrapper         ]
  //   3 [ result               ]
  //   4 [ result type          ]
  //   5 [ method               ]
  //   6 [ entry point          ]
  //   7 [ parameters           ]
  //   8 [ parameter size       ]
  //   9 [ thread               ]
  //
  //    Windows reserves the caller's stack space for arguments 1-4.
  //    We spill c_rarg0-c_rarg3 to this space.
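  //
  //    For reference, the C++ side invokes this stub through the CallStub
  //    function pointer type declared in stubRoutines.hpp, which is roughly:
  //
  //      typedef void (*CallStub)(address   link,            // call wrapper
  //                               intptr_t* result,
  //                               BasicType result_type,
  //                               Method*   method,
  //                               address   entry_point,
  //                               intptr_t* parameters,
  //                               int       size_of_parameters,
  //                               TRAPS);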

  // Call stub stack layout word offsets from rbp
  enum call_stub_layout {
#ifdef _WIN64
    xmm_save_first     = 6,  // save from xmm6
    xmm_save_last      = 31, // to xmm31
    xmm_save_base      = -9,
    rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59
    r15_off            = -7,
    r14_off            = -6,
    r13_off            = -5,
    r12_off            = -4,
    rdi_off            = -3,
    rsi_off            = -2,
    rbx_off            = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    call_wrapper_off   =  2,
    result_off         =  3,
    result_type_off    =  4,
    method_off         =  5,
    entry_point_off    =  6,
    parameters_off     =  7,
    parameter_size_off =  8,
    thread_off         =  9
#else
    rsp_after_call_off = -12,
    mxcsr_off          = rsp_after_call_off,
    r15_off            = -11,
    r14_off            = -10,
    r13_off            = -9,
    r12_off            = -8,
    rbx_off            = -7,
    call_wrapper_off   = -6,
    result_off         = -5,
    result_type_off    = -4,
    method_off         = -3,
    entry_point_off    = -2,
    parameters_off     = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    parameter_size_off =  2,
    thread_off         =  3
#endif
  };

#ifdef _WIN64
  Address xmm_save(int reg) {
    assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
    return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
  }
#endif

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);

    const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
    const Address result        (rbp, result_off         * wordSize);
    const Address result_type   (rbp, result_type_off    * wordSize);
    const Address method        (rbp, method_off         * wordSize);
    const Address entry_point   (rbp, entry_point_off    * wordSize);
    const Address parameters    (rbp, parameters_off     * wordSize);
    const Address parameter_size(rbp, parameter_size_off * wordSize);

    // same as in generate_catch_exception()!
    const Address thread        (rbp, thread_off         * wordSize);

    const Address r15_save(rbp, r15_off * wordSize);
    const Address r14_save(rbp, r14_off * wordSize);
    const Address r13_save(rbp, r13_off * wordSize);
    const Address r12_save(rbp, r12_off * wordSize);
    const Address rbx_save(rbp, rbx_off * wordSize);

    // stub code
    __ enter();
    __ subptr(rsp, -rsp_after_call_off * wordSize);

    // save register parameters
#ifndef _WIN64
    __ movptr(parameters,   c_rarg5); // parameters
    __ movptr(entry_point,  c_rarg4); // entry_point
#endif

    __ movptr(method,       c_rarg3); // method
    __ movl(result_type,    c_rarg2); // result type
    __ movptr(result,       c_rarg1); // result
    __ movptr(call_wrapper, c_rarg0); // call wrapper

    // save regs belonging to calling function
    __ movptr(rbx_save, rbx);
    __ movptr(r12_save, r12);
    __ movptr(r13_save, r13);
    __ movptr(r14_save, r14);
    __ movptr(r15_save, r15);

#ifdef _WIN64
    int last_reg = 15;
    if (UseAVX > 2) {
      last_reg = 31;
    }
    if (VM_Version::supports_evex()) {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
      }
    } else {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ movdqu(xmm_save(i), as_XMMRegister(i));
      }
    }

    const Address rdi_save(rbp, rdi_off * wordSize);
    const Address rsi_save(rbp, rsi_off * wordSize);

    __ movptr(rsi_save, rsi);
    __ movptr(rdi_save, rdi);
#else
    const Address mxcsr_save(rbp, mxcsr_off * wordSize);
    {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }
#endif

    // Load up thread register
    __ movptr(r15_thread, thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(c_rarg3, parameter_size);
    __ testl(c_rarg3, c_rarg3);
    __ jcc(Assembler::zero, parameters_done);

    Label loop;
    __ movptr(c_rarg2, parameters);       // parameter pointer
    __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
    __ BIND(loop);
    __ movptr(rax, Address(c_rarg2, 0)); // get parameter
    __ addptr(c_rarg2, wordSize);        // advance to next parameter
    __ decrementl(c_rarg1);              // decrement counter
    __ push(rax);                        // pass parameter
    __ jcc(Assembler::notZero, loop);

    // call Java function
    __ BIND(parameters_done);
    __ movptr(rbx, method);             // get Method*
    __ movptr(c_rarg1, entry_point);    // get entry_point
    __ mov(r13, rsp);                   // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(c_rarg1);

    BLOCK_COMMENT("call_stub_return_address:");
    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ movptr(c_rarg0, result);
    Label is_long, is_float, is_double, exit;
    __ movl(c_rarg1, result_type);
    __ cmpl(c_rarg1, T_OBJECT);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(c_rarg1, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);

    // handle T_INT case
    __ movl(Address(c_rarg0, 0), rax);

    __ BIND(exit);

    // pop parameters
    __ lea(rsp, rsp_after_call);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L1, L2, L3;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L1);
      __ stop("StubRoutines::call_stub: r15_thread is corrupted");
      __ bind(L1);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L2);
      __ stop("StubRoutines::call_stub: r15_thread is modified by call");
      __ bind(L2);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L3);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ bind(L3);
    }
#endif

    // restore regs belonging to calling function
#ifdef _WIN64
    // emit the restores for xmm regs
    if (VM_Version::supports_evex()) {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
      }
    } else {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ movdqu(as_XMMRegister(i), xmm_save(i));
      }
    }
#endif
    __ movptr(r15, r15_save);
    __ movptr(r14, r14_save);
    __ movptr(r13, r13_save);
    __ movptr(r12, r12_save);
    __ movptr(rbx, rbx_save);

#ifdef _WIN64
    __ movptr(rdi, rdi_save);
    __ movptr(rsi, rsi_save);
#else
    __ ldmxcsr(mxcsr_save);
#endif

    // restore rsp
    __ addptr(rsp, -rsp_after_call_off * wordSize);

    // return
    __ vzeroupper();
    __ pop(rbp);
    __ ret(0);

    // handle return types different from T_INT
    __ BIND(is_long);
    __ movq(Address(c_rarg0, 0), rax);
    __ jmp(exit);

    __ BIND(is_float);
    __ movflt(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    __ BIND(is_double);
    __ movdbl(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // rax: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
    const Address thread        (rbp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L1, L2, L3;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L1);
      __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
      __ bind(L1);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L2);
      __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
      __ bind(L2);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L3);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L3);
    }
#endif

    // set pending exception
    __ verify_oop(rax);

    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
    __ lea(rscratch1, ExternalAddress((address)__FILE__));
    __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
    __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into rbx
    __ movptr(c_rarg0, Address(rsp, 0));
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    r15_thread, c_rarg0);
    __ mov(rbx, rax);

    // setup rax & rdx, remove return address & clear pending exception
    __ pop(rdx);
    __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ testptr(rax, rax);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // rax: exception
    // rbx: exception handler
    // rdx: throwing pc
    __ verify_oop(rax);
    __ jmp(rbx);

    return start;
  }

  // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
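  //
  // In C terms the generated code behaves like this sketch, with the
  // load/store pair performed atomically by XCHG (which carries an
  // implicit LOCK):
  //
  //    jint atomic_xchg(jint exchange_value, volatile jint* dest) {
  //      jint old = *dest;
  //      *dest = exchange_value;
  //      return old;
  //    }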
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    __ movl(rax, c_rarg0); // Copy to eax; we need a return value anyhow
    __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }

  // Support for intptr_t atomic::xchg_long(jlong exchange_value, volatile jlong* dest)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
  address generate_atomic_xchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long");
    address start = __ pc();

    __ movptr(rax, c_rarg0); // Copy to rax; we need a return value anyhow
    __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }

  // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
  //                                         jint compare_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
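  //
  // This is the classic LOCK CMPXCHG idiom: CMPXCHG compares rax with
  // *dest and either stores exchange_value (on match) or loads the
  // current value into rax, so rax always ends up holding the value to
  // return.  The byte and long variants below follow the same pattern
  // at different operand widths.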
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    __ movl(rax, c_rarg2);
    __ lock();
    __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest,
  //                                           int8_t compare_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg_byte() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte");
    address start = __ pc();

    __ movsbq(rax, c_rarg2);
    __ lock();
    __ cmpxchgb(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value,
  //                                            volatile int64_t* dest,
  //                                            int64_t compare_value)
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    __ movq(rax, c_rarg2);
    __ lock();
    __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Support for jint atomic::add(jint add_value, volatile jint* dest)
  //
  // Arguments :
  //    c_rarg0: add_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest += add_value
  //    return *dest;
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();

    __ movl(rax, c_rarg0);                  // save add_value for the result
    __ lock();
    __ xaddl(Address(c_rarg1, 0), c_rarg0); // c_rarg0 receives the old *dest
    __ addl(rax, c_rarg0);                  // return old *dest + add_value
    __ ret(0);

    return start;
  }

  // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
  //
  // Arguments :
  //    c_rarg0: add_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest += add_value
  //    return *dest;
  address generate_atomic_add_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add_long");
    address start = __ pc();

    __ movptr(rax, c_rarg0);                  // save add_value for the result
    __ lock();
    __ xaddptr(Address(c_rarg1, 0), c_rarg0); // c_rarg0 receives the old *dest
    __ addptr(rax, c_rarg0);                  // return old *dest + add_value
    __ ret(0);

    return start;
  }

  // Support for void OrderAccess::fence()
  //
  // Arguments :
  //
  // Result:
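  //
  // Note: on x86 only StoreLoad reordering needs to be prevented with an
  // instruction; membar(StoreLoad) below typically emits a locked
  // read-modify-write of a stack location rather than MFENCE, which is
  // serializing enough and usually cheaper.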
  address generate_orderaccess_fence() {
    StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
    address start = __ pc();
    __ membar(Assembler::StoreLoad);
    __ ret(0);

    return start;
  }

  // Support for intptr_t get_previous_fp()
  //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess).  It is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
  address generate_get_previous_fp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
    const Address old_fp(rbp, 0);
    const Address older_fp(rax, 0);
    address start = __ pc();

    __ enter();
    __ movptr(rax, old_fp);   // caller's fp
    __ movptr(rax, older_fp); // the frame for ps()
    __ pop(rbp);
    __ ret(0);

    return start;
  }

  // Support for intptr_t get_previous_sp()
  //
  // This routine is used to find the previous stack pointer for the
  // caller.
  address generate_get_previous_sp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
    address start = __ pc();

    __ movptr(rax, rsp);
    __ addptr(rax, 8); // return address is at the top of the stack.
    __ ret(0);

    return start;
  }

  //----------------------------------------------------------------------------------------------------
  // Support for void verify_mxcsr()
  //
  // This routine is used with -Xcheck:jni to verify that native
  // JNI code does not return to Java code without restoring the
  // MXCSR register to our expected state.

  address generate_verify_mxcsr() {
    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
    address start = __ pc();

    const Address mxcsr_save(rsp, 0);

    if (CheckJNICalls) {
      Label ok_ret;
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ push(rax);
      __ subptr(rsp, wordSize);      // allocate a temp location
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, ok_ret);

      __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");

      __ ldmxcsr(mxcsr_std);

      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }

    __ ret(0);

    return start;
  }

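  // The f2i/f2l/d2i/d2l fixup stubs below are used when
  // CVTTSS2SI/CVTTSD2SI cannot convert the input: for NaN and
  // out-of-range values the hardware returns the "integer indefinite"
  // value (min_jint/min_jlong), while Java requires NaN -> 0 and
  // saturation at the type's min/max.  Each stub re-examines the
  // original operand (passed on the stack) and, in effect, computes:
  //
  //    if (isnan(x))      result = 0;
  //    else if (x < 0.0)  result = min_jint;  // min_jlong for the _l stubs
  //    else               result = max_jint;  // max_jlong for the _l stubs
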
  address generate_f2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
    __ movl(c_rarg3, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmovl(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_f2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves
    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
    __ mov64(c_rarg3, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmov(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
    __ movl(c_rarg2, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmov(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movptr(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
    __ mov64(c_rarg2, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmovq(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movq(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_fp_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64( mask, relocInfo::none );
    __ emit_data64( mask, relocInfo::none );

    return start;
  }

  address generate_vector_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);

    return start;
  }

  address generate_vector_byte_perm_mask(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64(0x0000000000000001, relocInfo::none);
    __ emit_data64(0x0000000000000003, relocInfo::none);
    __ emit_data64(0x0000000000000005, relocInfo::none);
    __ emit_data64(0x0000000000000007, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000002, relocInfo::none);
    __ emit_data64(0x0000000000000004, relocInfo::none);
    __ emit_data64(0x0000000000000006, relocInfo::none);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    all args on stack!
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved r12 (several TemplateTable methods use it)
  //    [tos + 3]: saved flags
  //    [tos + 4]: return address
  //  * [tos + 5]: error message (char*)
  //  * [tos + 6]: object to verify (oop)
  //  * [tos + 7]: saved rax - saved by caller and bashed
  //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
  //  * = popped on exit
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    __ pushf();
    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));

    __ push(r12);

    // save c_rarg2 and c_rarg3
    __ push(c_rarg2);
    __ push(c_rarg3);

    enum {
           // After previous pushes.
           oop_to_verify = 6 * wordSize,
           saved_rax     = 7 * wordSize,
           saved_r10     = 8 * wordSize,

           // Before the call to MacroAssembler::debug(), see below.
           return_addr   = 16 * wordSize,
           error_msg     = 17 * wordSize
    };

    // get object
    __ movptr(rax, Address(rsp, oop_to_verify));

    // make sure object is 'reasonable'
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if metadata bits indicate a bad oop
      __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
      __ jcc(Assembler::notZero, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ movptr(c_rarg2, rax);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andptr(c_rarg2, c_rarg3);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
    __ cmpptr(c_rarg2, c_rarg3);
    __ jcc(Assembler::notZero, error);

    // set r12 to heapbase for load_klass()
    __ reinit_heapbase();

    // make sure klass is 'reasonable', i.e., not NULL.
    __ load_klass(rax, rax);  // get klass
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, error); // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);
    __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                               // restore c_rarg3
    __ pop(c_rarg2);                               // restore c_rarg2
    __ pop(r12);                                   // restore r12
    __ popf();                                     // restore flags
    __ ret(4 * wordSize);                          // pop caller saved stuff

    // handle errors
    __ bind(error);
    __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                               // get saved c_rarg3 back
    __ pop(c_rarg2);                               // get saved c_rarg2 back
    __ pop(r12);                                   // get saved r12 back
    __ popf();                                     // get saved flags off stack --
                                                   // will be ignored

    __ pusha();                                    // push registers
                                                   // (rip is already pushed)
    // debug(char* msg, int64_t pc, int64_t regs[])
    // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
    // pushed all the registers, so now the stack looks like:
    //     [tos +  0] 16 saved registers
    //     [tos + 16] return address
    //   * [tos + 17] error message (char*)
    //   * [tos + 18] object to verify (oop)
    //   * [tos + 19] saved rax - saved by caller and bashed
    //   * [tos + 20] saved r10 (rscratch1) - saved by caller
    //   * = popped on exit

    __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
    __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
    __ movq(c_rarg2, rsp);                          // pass address of regs on stack
    __ mov(r12, rsp);                               // remember rsp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16);                            // align stack as required by ABI
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
    __ mov(rsp, r12);                               // restore rsp
    __ popa();                                      // pop registers (includes r12)
    __ ret(4 * wordSize);                           // pop caller saved stuff

    return start;
  }

  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts.
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
    Label L;
    assert_different_registers(Rtmp, Rint);
    __ movslq(Rtmp, Rint);
    __ cmpq(Rtmp, Rint);
    __ jcc(Assembler::equal, L);
    __ stop("high 32-bits of int value are not 0");
    __ bind(L);
#endif
  }

  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //     c_rarg0 - from
  //     c_rarg1 - to
  //     c_rarg2 - element count
  //
  //  Output:
  //     rax   - &from[element count - 1]
  //
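  //  The conjoint copy stubs use this to branch to the corresponding
  //  disjoint (forward) copy whenever the ranges cannot overlap
  //  destructively; in effect the test is:
  //
  //     if (to <= from || to >= from + count)   // unsigned comparisons
  //       goto no_overlap;                      // forward copy is safe
  //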
  void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, sf);
  }
  void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
    array_overlap_test(NULL, &L_no_overlap, sf);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
    const Register from     = c_rarg0;
    const Register to       = c_rarg1;
    const Register count    = c_rarg2;
    const Register end_from = rax;

    __ cmpptr(to, from);
    __ lea(end_from, Address(from, count, sf, 0));
    if (NOLp == NULL) {
      ExternalAddress no_overlap(no_overlap_target);
      __ jump_cc(Assembler::belowEqual, no_overlap);
      __ cmpptr(to, end_from);
      __ jump_cc(Assembler::aboveEqual, no_overlap);
    } else {
      __ jcc(Assembler::belowEqual, (*NOLp));
      __ cmpptr(to, end_from);
      __ jcc(Assembler::aboveEqual, (*NOLp));
    }
  }

  // Shuffle first three arg regs on Windows into Linux/Solaris locations.
  //
  // Outputs:
  //    rdi - rcx
  //    rsi - rdx
  //    rdx - r8
  //    rcx - r9
  //
  // On Windows, registers r9 and r10 are used to save rdi and rsi, which
  // are non-volatile there.  r9 and r10 should not be used by the caller.
  //
  DEBUG_ONLY(bool regs_in_thread;)

  void setup_arg_regs(int nargs = 3) {
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
    assert(nargs == 3 || nargs == 4, "else fix");
#ifdef _WIN64
    assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
           "unexpected argument registers");
    if (nargs >= 4)
      __ mov(rax, r9);  // r9 is also saved_rdi
    __ movptr(saved_rdi, rdi);
    __ movptr(saved_rsi, rsi);
    __ mov(rdi, rcx); // c_rarg0
    __ mov(rsi, rdx); // c_rarg1
    __ mov(rdx, r8);  // c_rarg2
    if (nargs >= 4)
      __ mov(rcx, rax); // c_rarg3 (via rax)
#else
    assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
           "unexpected argument registers");
#endif
    DEBUG_ONLY(regs_in_thread = false;)
  }

  void restore_arg_regs() {
    assert(!regs_in_thread, "wrong call to restore_arg_regs");
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
#ifdef _WIN64
    __ movptr(rdi, saved_rdi);
    __ movptr(rsi, saved_rsi);
#endif
  }

  // This is used in places where r10 is a scratch register, and can
  // be adapted if r9 is needed also.
  void setup_arg_regs_using_thread() {
    const Register saved_r15 = r9;
#ifdef _WIN64
    __ mov(saved_r15, r15);  // r15 is callee saved and needs to be restored
    __ get_thread(r15_thread);
    assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
           "unexpected argument registers");
    __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
    __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);

    __ mov(rdi, rcx); // c_rarg0
    __ mov(rsi, rdx); // c_rarg1
    __ mov(rdx, r8);  // c_rarg2
#else
    assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
           "unexpected argument registers");
#endif
    DEBUG_ONLY(regs_in_thread = true;)
  }

  void restore_arg_regs_using_thread() {
    assert(regs_in_thread, "wrong call to restore_arg_regs");
    const Register saved_r15 = r9;
#ifdef _WIN64
    __ get_thread(r15_thread);
    __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
    __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
    __ mov(r15, saved_r15);  // r15 is callee saved and needs to be restored
#endif
  }

  // Copy big chunks forward
  //
  // Inputs:
  //   end_from     - source array's end address
  //   end_to       - destination array's end address
  //   qword_count  - 64-bit element count, negative
  //   to           - scratch
  //   L_copy_bytes - entry label
  //   L_copy_8_bytes  - exit  label
  //
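  // The end pointers point at the last qword to copy and qword_count is
  // negative, counting up towards zero, so Address(end_from, qword_count,
  // times_8, disp) walks forward through the arrays and the loop can
  // terminate on the flags set by the addptr.
  //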
  void copy_bytes_forward(Register end_from, Register end_to,
                             Register qword_count, Register to,
                             Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64 bytes per iteration
      __ BIND(L_loop);
      if (UseAVX > 2) {
        __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
        __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
      } else if (UseAVX == 2) {
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
      } else {
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
        __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
        __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
      }
      __ BIND(L_copy_bytes);
      __ addptr(qword_count, 8);
      __ jcc(Assembler::lessEqual, L_loop);
      __ subptr(qword_count, 4);  // sub(8) and add(4)
      __ jccb(Assembler::greater, L_end);
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
      } else {
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
      }
      __ addptr(qword_count, 4);
      __ BIND(L_end);
      if (UseAVX >= 2) {
        // clean upper bits of YMM registers
        __ vpxor(xmm0, xmm0);
        __ vpxor(xmm1, xmm1);
      }
    } else {
      // Copy 32 bytes per iteration
      __ BIND(L_loop);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
      __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
      __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
      __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
      __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);

      __ BIND(L_copy_bytes);
      __ addptr(qword_count, 4);
      __ jcc(Assembler::lessEqual, L_loop);
    }
    __ subptr(qword_count, 4);
    __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
  }

  // Copy big chunks backward
  //
  // Inputs:
  //   from         - source array address
  //   dest         - destination array address
  //   qword_count  - 64-bit element count
  //   to           - scratch
  //   L_copy_bytes - entry label
  //   L_copy_8_bytes  - exit  label
  //
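  // Unlike the forward copier, qword_count is positive here and counts
  // down towards zero while the addresses are indexed from the array
  // bases, so the copy proceeds from the high end of the arrays to the
  // low end, which is the order a conjoint copy with to > from requires.
  //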
  void copy_bytes_backward(Register from, Register dest,
                              Register qword_count, Register to,
                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64 bytes per iteration
      __ BIND(L_loop);
      if (UseAVX > 2) {
        __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
        __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
      } else if (UseAVX == 2) {
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
        __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
      } else {
        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
        __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
        __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
        __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
        __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
        __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
        __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
      }
      __ BIND(L_copy_bytes);
      __ subptr(qword_count, 8);
      __ jcc(Assembler::greaterEqual, L_loop);

      __ addptr(qword_count, 4);  // add(8) and sub(4)
      __ jccb(Assembler::less, L_end);
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
      } else {
        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
        __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
        __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
      }
      __ subptr(qword_count, 4);
      __ BIND(L_end);
      if (UseAVX >= 2) {
        // clean upper bits of YMM registers
        __ vpxor(xmm0, xmm0);
        __ vpxor(xmm1, xmm1);
      }
    } else {
      // Copy 32 bytes per iteration
      __ BIND(L_loop);
      __ movq(to, Address(from, qword_count, Address::times_8, 24));
      __ movq(Address(dest, qword_count, Address::times_8, 24), to);
      __ movq(to, Address(from, qword_count, Address::times_8, 16));
      __ movq(Address(dest, qword_count, Address::times_8, 16), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  8));
      __ movq(Address(dest, qword_count, Address::times_8,  8), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  0));
      __ movq(Address(dest, qword_count, Address::times_8,  0), to);

      __ BIND(L_copy_bytes);
      __ subptr(qword_count, 4);
      __ jcc(Assembler::greaterEqual, L_loop);
    }
    __ addptr(qword_count, 4);
    __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
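  // The bulk of the copy proceeds a qword (8 bytes) at a time; the low
  // three bits of byte_count then select the remaining tails, e.g. for
  // byte_count % 8 == 7 the stub copies a trailing dword, word and byte.
  //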
1456   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1457     __ align(CodeEntryAlignment);
1458     StubCodeMark mark(this, "StubRoutines", name);
1459     address start = __ pc();
1460 
1461     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1462     Label L_copy_byte, L_exit;
1463     const Register from        = rdi;  // source array address
1464     const Register to          = rsi;  // destination array address
1465     const Register count       = rdx;  // elements count
1466     const Register byte_count  = rcx;
1467     const Register qword_count = count;
1468     const Register end_from    = from; // source array end address
1469     const Register end_to      = to;   // destination array end address
1470     // End pointers are inclusive, and if count is not zero they point
1471     // to the last unit copied:  end_to[0] := end_from[0]
1472 
1473     __ enter(); // required for proper stackwalking of RuntimeStub frame
1474     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1475 
1476     if (entry != NULL) {
1477       *entry = __ pc();
1478        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1479       BLOCK_COMMENT("Entry:");
1480     }
1481 
1482     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1483                       // r9 and r10 may be used to save non-volatile registers
1484 
1485     // 'from', 'to' and 'count' are now valid
1486     __ movptr(byte_count, count);
1487     __ shrptr(count, 3); // count => qword_count
1488 
1489     // Copy from low to high addresses.  Use 'to' as scratch.
1490     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1491     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1492     __ negptr(qword_count); // make the count negative
1493     __ jmp(L_copy_bytes);
1494 
1495     // Copy trailing qwords
1496   __ BIND(L_copy_8_bytes);
1497     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1498     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1499     __ increment(qword_count);
1500     __ jcc(Assembler::notZero, L_copy_8_bytes);
1501 
1502     // Check for and copy trailing dword
1503   __ BIND(L_copy_4_bytes);
1504     __ testl(byte_count, 4);
1505     __ jccb(Assembler::zero, L_copy_2_bytes);
1506     __ movl(rax, Address(end_from, 8));
1507     __ movl(Address(end_to, 8), rax);
1508 
1509     __ addptr(end_from, 4);
1510     __ addptr(end_to, 4);
1511 
1512     // Check for and copy trailing word
1513   __ BIND(L_copy_2_bytes);
1514     __ testl(byte_count, 2);
1515     __ jccb(Assembler::zero, L_copy_byte);
1516     __ movw(rax, Address(end_from, 8));
1517     __ movw(Address(end_to, 8), rax);
1518 
1519     __ addptr(end_from, 2);
1520     __ addptr(end_to, 2);
1521 
1522     // Check for and copy trailing byte
1523   __ BIND(L_copy_byte);
1524     __ testl(byte_count, 1);
1525     __ jccb(Assembler::zero, L_exit);
1526     __ movb(rax, Address(end_from, 8));
1527     __ movb(Address(end_to, 8), rax);
1528 
1529   __ BIND(L_exit);
1530     restore_arg_regs();
1531     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1532     __ xorptr(rax, rax); // return 0
1533     __ vzeroupper();
1534     __ leave(); // required for proper stackwalking of RuntimeStub frame
1535     __ ret(0);
1536 
1537     // Copy in multi-byte chunks
1538     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1539     __ jmp(L_copy_4_bytes);
1540 
1541     return start;
1542   }
1543 
1544   // Arguments:
1545   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1546   //             ignored
1547   //   name    - stub name string
1548   //
1549   // Inputs:
1550   //   c_rarg0   - source array address
1551   //   c_rarg1   - destination array address
1552   //   c_rarg2   - element count, treated as ssize_t, can be zero
1553   //
1554   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1555   // we let the hardware handle it.  The one to eight bytes within words,
1556   // dwords or qwords that span cache line boundaries will still be loaded
1557   // and stored atomically.
1558   //
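       // Overlap forces a high-to-low copy here; roughly (a sketch that
       // mirrors the disjoint stub in reverse):
       //
       //   if (count & 1) copy trailing byte;
       //   if (count & 2) copy trailing word;
       //   if (count & 4) copy trailing dword;
       //   copy (count >> 3) 8-byte chunks backward;
       //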
1559   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1560                                       address* entry, const char *name) {
1561     __ align(CodeEntryAlignment);
1562     StubCodeMark mark(this, "StubRoutines", name);
1563     address start = __ pc();
1564 
1565     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1566     const Register from        = rdi;  // source array address
1567     const Register to          = rsi;  // destination array address
1568     const Register count       = rdx;  // elements count
1569     const Register byte_count  = rcx;
1570     const Register qword_count = count;
1571 
1572     __ enter(); // required for proper stackwalking of RuntimeStub frame
1573     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1574 
1575     if (entry != NULL) {
1576       *entry = __ pc();
1577       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1578       BLOCK_COMMENT("Entry:");
1579     }
1580 
1581     array_overlap_test(nooverlap_target, Address::times_1);
1582     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1583                       // r9 and r10 may be used to save non-volatile registers
1584 
1585     // 'from', 'to' and 'count' are now valid
1586     __ movptr(byte_count, count);
1587     __ shrptr(count, 3);   // count => qword_count
1588 
1589     // Copy from high to low addresses.
1590 
1591     // Check for and copy trailing byte
1592     __ testl(byte_count, 1);
1593     __ jcc(Assembler::zero, L_copy_2_bytes);
1594     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1595     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1596     __ decrement(byte_count); // Adjust for possible trailing word
1597 
1598     // Check for and copy trailing word
1599   __ BIND(L_copy_2_bytes);
1600     __ testl(byte_count, 2);
1601     __ jcc(Assembler::zero, L_copy_4_bytes);
1602     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1603     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1604 
1605     // Check for and copy trailing dword
1606   __ BIND(L_copy_4_bytes);
1607     __ testl(byte_count, 4);
1608     __ jcc(Assembler::zero, L_copy_bytes);
1609     __ movl(rax, Address(from, qword_count, Address::times_8));
1610     __ movl(Address(to, qword_count, Address::times_8), rax);
1611     __ jmp(L_copy_bytes);
1612 
1613     // Copy trailing qwords
1614   __ BIND(L_copy_8_bytes);
1615     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1616     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1617     __ decrement(qword_count);
1618     __ jcc(Assembler::notZero, L_copy_8_bytes);
1619 
1620     restore_arg_regs();
1621     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1622     __ xorptr(rax, rax); // return 0
1623     __ vzeroupper();
1624     __ leave(); // required for proper stackwalking of RuntimeStub frame
1625     __ ret(0);
1626 
1627     // Copy in multi-byte chunks
1628     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1629 
1630     restore_arg_regs();
1631     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1632     __ xorptr(rax, rax); // return 0
1633     __ vzeroupper();
1634     __ leave(); // required for proper stackwalking of RuntimeStub frame
1635     __ ret(0);
1636 
1637     return start;
1638   }
1639 
1640   // Arguments:
1641   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1642   //             ignored
1643   //   name    - stub name string
1644   //
1645   // Inputs:
1646   //   c_rarg0   - source array address
1647   //   c_rarg1   - destination array address
1648   //   c_rarg2   - element count, treated as ssize_t, can be zero
1649   //
1650   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1651   // let the hardware handle it.  The two or four words within dwords
1652   // or qwords that span cache line boundaries will still be loaded
1653   // and stored atomically.
1654   //
1655   // Side Effects:
1656   //   disjoint_short_copy_entry is set to the no-overlap entry point
1657   //   used by generate_conjoint_short_copy().
1658   //
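       // Element math, as a sketch: with 16-bit elements the stub copies
       // (count >> 2) qwords (four shorts each) forward, then
       //
       //   if (word_count & 2) copy one trailing dword (two shorts);
       //   if (word_count & 1) copy one trailing word (one short);
       //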
1659   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1660     __ align(CodeEntryAlignment);
1661     StubCodeMark mark(this, "StubRoutines", name);
1662     address start = __ pc();
1663 
1664     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1665     const Register from        = rdi;  // source array address
1666     const Register to          = rsi;  // destination array address
1667     const Register count       = rdx;  // elements count
1668     const Register word_count  = rcx;
1669     const Register qword_count = count;
1670     const Register end_from    = from; // source array end address
1671     const Register end_to      = to;   // destination array end address
1672     // End pointers are inclusive, and if count is not zero they point
1673     // to the last unit copied:  end_to[0] := end_from[0]
1674 
1675     __ enter(); // required for proper stackwalking of RuntimeStub frame
1676     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1677 
1678     if (entry != NULL) {
1679       *entry = __ pc();
1680       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1681       BLOCK_COMMENT("Entry:");
1682     }
1683 
1684     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1685                       // r9 and r10 may be used to save non-volatile registers
1686 
1687     // 'from', 'to' and 'count' are now valid
1688     __ movptr(word_count, count);
1689     __ shrptr(count, 2); // count => qword_count
1690 
1691     // Copy from low to high addresses.  Use 'to' as scratch.
1692     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1693     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1694     __ negptr(qword_count);
1695     __ jmp(L_copy_bytes);
1696 
1697     // Copy trailing qwords
1698   __ BIND(L_copy_8_bytes);
1699     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1700     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1701     __ increment(qword_count);
1702     __ jcc(Assembler::notZero, L_copy_8_bytes);
1703 
1704     // Original 'dest' is trashed, so we can't use it as a
1705     // base register for a possible trailing word copy
1706 
1707     // Check for and copy trailing dword
1708   __ BIND(L_copy_4_bytes);
1709     __ testl(word_count, 2);
1710     __ jccb(Assembler::zero, L_copy_2_bytes);
1711     __ movl(rax, Address(end_from, 8));
1712     __ movl(Address(end_to, 8), rax);
1713 
1714     __ addptr(end_from, 4);
1715     __ addptr(end_to, 4);
1716 
1717     // Check for and copy trailing word
1718   __ BIND(L_copy_2_bytes);
1719     __ testl(word_count, 1);
1720     __ jccb(Assembler::zero, L_exit);
1721     __ movw(rax, Address(end_from, 8));
1722     __ movw(Address(end_to, 8), rax);
1723 
1724   __ BIND(L_exit);
1725     restore_arg_regs();
1726     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1727     __ xorptr(rax, rax); // return 0
1728     __ vzeroupper();
1729     __ leave(); // required for proper stackwalking of RuntimeStub frame
1730     __ ret(0);
1731 
1732     // Copy in multi-byte chunks
1733     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1734     __ jmp(L_copy_4_bytes);
1735 
1736     return start;
1737   }
1738 
1739   address generate_fill(BasicType t, bool aligned, const char *name) {
1740     __ align(CodeEntryAlignment);
1741     StubCodeMark mark(this, "StubRoutines", name);
1742     address start = __ pc();
1743 
1744     BLOCK_COMMENT("Entry:");
1745 
1746     const Register to       = c_rarg0;  // destination array address
1747     const Register value    = c_rarg1;  // value
1748     const Register count    = c_rarg2;  // elements count
1749 
1750     __ enter(); // required for proper stackwalking of RuntimeStub frame
1751 
1752     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1753 
1754     __ vzeroupper();
1755     __ leave(); // required for proper stackwalking of RuntimeStub frame
1756     __ ret(0);
1757     return start;
1758   }
1759 
1760   // Arguments:
1761   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1762   //             ignored
1763   //   name    - stub name string
1764   //
1765   // Inputs:
1766   //   c_rarg0   - source array address
1767   //   c_rarg1   - destination array address
1768   //   c_rarg2   - element count, treated as ssize_t, can be zero
1769   //
1770   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1771   // let the hardware handle it.  The two or four words within dwords
1772   // or qwords that span cache line boundaries will still be loaded
1773   // and stored atomically.
1774   //
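       // As with the conjoint byte stub, overlap means copying high to low:
       // a trailing word (word_count & 1) first, then a trailing dword
       // (word_count & 2), then (count >> 2) qwords backward -- a sketch,
       // not the exact register usage.
       //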
1775   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1776                                        address *entry, const char *name) {
1777     __ align(CodeEntryAlignment);
1778     StubCodeMark mark(this, "StubRoutines", name);
1779     address start = __ pc();
1780 
1781     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
1782     const Register from        = rdi;  // source array address
1783     const Register to          = rsi;  // destination array address
1784     const Register count       = rdx;  // elements count
1785     const Register word_count  = rcx;
1786     const Register qword_count = count;
1787 
1788     __ enter(); // required for proper stackwalking of RuntimeStub frame
1789     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1790 
1791     if (entry != NULL) {
1792       *entry = __ pc();
1793       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1794       BLOCK_COMMENT("Entry:");
1795     }
1796 
1797     array_overlap_test(nooverlap_target, Address::times_2);
1798     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1799                       // r9 and r10 may be used to save non-volatile registers
1800 
1801     // 'from', 'to' and 'count' are now valid
1802     __ movptr(word_count, count);
1803     __ shrptr(count, 2); // count => qword_count
1804 
1805     // Copy from high to low addresses.  Use 'to' as scratch.
1806 
1807     // Check for and copy trailing word
1808     __ testl(word_count, 1);
1809     __ jccb(Assembler::zero, L_copy_4_bytes);
1810     __ movw(rax, Address(from, word_count, Address::times_2, -2));
1811     __ movw(Address(to, word_count, Address::times_2, -2), rax);
1812 
1813     // Check for and copy trailing dword
1814   __ BIND(L_copy_4_bytes);
1815     __ testl(word_count, 2);
1816     __ jcc(Assembler::zero, L_copy_bytes);
1817     __ movl(rax, Address(from, qword_count, Address::times_8));
1818     __ movl(Address(to, qword_count, Address::times_8), rax);
1819     __ jmp(L_copy_bytes);
1820 
1821     // Copy trailing qwords
1822   __ BIND(L_copy_8_bytes);
1823     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1824     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1825     __ decrement(qword_count);
1826     __ jcc(Assembler::notZero, L_copy_8_bytes);
1827 
1828     restore_arg_regs();
1829     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1830     __ xorptr(rax, rax); // return 0
1831     __ vzeroupper();
1832     __ leave(); // required for proper stackwalking of RuntimeStub frame
1833     __ ret(0);
1834 
1835     // Copy in multi-byte chunks
1836     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1837 
1838     restore_arg_regs();
1839     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1840     __ xorptr(rax, rax); // return 0
1841     __ vzeroupper();
1842     __ leave(); // required for proper stackwalking of RuntimeStub frame
1843     __ ret(0);
1844 
1845     return start;
1846   }
1847 
1848   // Arguments:
1849   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1850   //             ignored
1851   //   is_oop  - true => oop array, so generate store check code
1852   //   name    - stub name string
1853   //
1854   // Inputs:
1855   //   c_rarg0   - source array address
1856   //   c_rarg1   - destination array address
1857   //   c_rarg2   - element count, treated as ssize_t, can be zero
1858   //
1859   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1860   // the hardware handle it.  The two dwords within qwords that span
1861   // cache line boundaries will still be loaded and stored atomically.
1862   //
1863   // Side Effects:
1864   //   disjoint_int_copy_entry is set to the no-overlap entry point
1865   //   used by generate_conjoint_int_oop_copy().
1866   //
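       // For is_oop the copy is bracketed by GC barrier hooks; as a sketch:
       //
       //   bs->arraycopy_prologue(...);            // barrier-set-specific setup
       //   copy (count >> 1) qwords forward;
       //   if (dword_count & 1) copy one trailing dword;
       //   bs->arraycopy_epilogue(...);            // e.g. card marks for oop arrays
       //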
1867   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
1868                                          const char *name, bool dest_uninitialized = false) {
1869     __ align(CodeEntryAlignment);
1870     StubCodeMark mark(this, "StubRoutines", name);
1871     address start = __ pc();
1872 
1873     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1874     const Register from        = rdi;  // source array address
1875     const Register to          = rsi;  // destination array address
1876     const Register count       = rdx;  // elements count
1877     const Register dword_count = rcx;
1878     const Register qword_count = count;
1879     const Register end_from    = from; // source array end address
1880     const Register end_to      = to;   // destination array end address
1881     // End pointers are inclusive, and if count is not zero they point
1882     // to the last unit copied:  end_to[0] := end_from[0]
1883 
1884     __ enter(); // required for proper stackwalking of RuntimeStub frame
1885     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1886 
1887     if (entry != NULL) {
1888       *entry = __ pc();
1889       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1890       BLOCK_COMMENT("Entry:");
1891     }
1892 
1893     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1894                                    // r9 is used to save r15_thread
1895 
1896     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1897     if (dest_uninitialized) {
1898       decorators |= IS_DEST_UNINITIALIZED;
1899     }
1900     if (aligned) {
1901       decorators |= ARRAYCOPY_ALIGNED;
1902     }
1903 
1904     BasicType type = is_oop ? T_OBJECT : T_INT;
1905     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1906     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1907 
1908     // 'from', 'to' and 'count' are now valid
1909     __ movptr(dword_count, count);
1910     __ shrptr(count, 1); // count => qword_count
1911 
1912     // Copy from low to high addresses.  Use 'to' as scratch.
1913     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1914     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1915     __ negptr(qword_count);
1916     __ jmp(L_copy_bytes);
1917 
1918     // Copy trailing qwords
1919   __ BIND(L_copy_8_bytes);
1920     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1921     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1922     __ increment(qword_count);
1923     __ jcc(Assembler::notZero, L_copy_8_bytes);
1924 
1925     // Check for and copy trailing dword
1926   __ BIND(L_copy_4_bytes);
1927     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1928     __ jccb(Assembler::zero, L_exit);
1929     __ movl(rax, Address(end_from, 8));
1930     __ movl(Address(end_to, 8), rax);
1931 
1932   __ BIND(L_exit);
1933     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
1934     restore_arg_regs_using_thread();
1935     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1936     __ vzeroupper();
1937     __ xorptr(rax, rax); // return 0
1938     __ leave(); // required for proper stackwalking of RuntimeStub frame
1939     __ ret(0);
1940 
1941     // Copy in multi-byte chunks
1942     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1943     __ jmp(L_copy_4_bytes);
1944 
1945     return start;
1946   }
1947 
1948   // Arguments:
1949   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1950   //             ignored
1951   //   is_oop  - true => oop array, so generate store check code
1952   //   name    - stub name string
1953   //
1954   // Inputs:
1955   //   c_rarg0   - source array address
1956   //   c_rarg1   - destination array address
1957   //   c_rarg2   - element count, treated as ssize_t, can be zero
1958   //
1959   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1960   // the hardware handle it.  The two dwords within qwords that span
1961   // cache line boundaries will still be loaded and stored atomically.
1962   //
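       // Same barrier bracketing as the disjoint stub, but high to low: a
       // trailing dword first (dword_count & 1), then (count >> 1) qwords
       // backward -- a sketch only; the oop path runs arraycopy_epilogue
       // before returning.
       //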
1963   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1964                                          address *entry, const char *name,
1965                                          bool dest_uninitialized = false) {
1966     __ align(CodeEntryAlignment);
1967     StubCodeMark mark(this, "StubRoutines", name);
1968     address start = __ pc();
1969 
1970     Label L_copy_bytes, L_copy_8_bytes, L_exit;
1971     const Register from        = rdi;  // source array address
1972     const Register to          = rsi;  // destination array address
1973     const Register count       = rdx;  // elements count
1974     const Register dword_count = rcx;
1975     const Register qword_count = count;
1976 
1977     __ enter(); // required for proper stackwalking of RuntimeStub frame
1978     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1979 
1980     if (entry != NULL) {
1981       *entry = __ pc();
1982       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1983       BLOCK_COMMENT("Entry:");
1984     }
1985 
1986     array_overlap_test(nooverlap_target, Address::times_4);
1987     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1988                                    // r9 is used to save r15_thread
1989 
1990     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1991     if (dest_uninitialized) {
1992       decorators |= IS_DEST_UNINITIALIZED;
1993     }
1994     if (aligned) {
1995       decorators |= ARRAYCOPY_ALIGNED;
1996     }
1997 
1998     BasicType type = is_oop ? T_OBJECT : T_INT;
1999     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2000     // no registers are destroyed by this call
2001     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2002 
2003     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2004     // 'from', 'to' and 'count' are now valid
2005     __ movptr(dword_count, count);
2006     __ shrptr(count, 1); // count => qword_count
2007 
2008     // Copy from high to low addresses.  Use 'to' as scratch.
2009 
2010     // Check for and copy trailing dword
2011     __ testl(dword_count, 1);
2012     __ jcc(Assembler::zero, L_copy_bytes);
2013     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2014     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2015     __ jmp(L_copy_bytes);
2016 
2017     // Copy trailing qwords
2018   __ BIND(L_copy_8_bytes);
2019     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2020     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2021     __ decrement(qword_count);
2022     __ jcc(Assembler::notZero, L_copy_8_bytes);
2023 
2024     if (is_oop) {
2025       __ jmp(L_exit);
2026     }
2027     restore_arg_regs_using_thread();
2028     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2029     __ xorptr(rax, rax); // return 0
2030     __ vzeroupper();
2031     __ leave(); // required for proper stackwalking of RuntimeStub frame
2032     __ ret(0);
2033 
2034     // Copy in multi-byte chunks
2035     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2036 
2037   __ BIND(L_exit);
2038     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2039     restore_arg_regs_using_thread();
2040     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2041     __ xorptr(rax, rax); // return 0
2042     __ vzeroupper();
2043     __ leave(); // required for proper stackwalking of RuntimeStub frame
2044     __ ret(0);
2045 
2046     return start;
2047   }
2048 
2049   // Arguments:
2050   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2051   //             ignored
2052   //   is_oop  - true => oop array, so generate store check code
2053   //   name    - stub name string
2054   //
2055   // Inputs:
2056   //   c_rarg0   - source array address
2057   //   c_rarg1   - destination array address
2058   //   c_rarg2   - element count, treated as ssize_t, can be zero
2059   //
2060   // Side Effects:
2061   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2062   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2063   //
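       // Elements are already qword-sized, so there is no sub-qword tail;
       // in rough terms the stub is just
       //
       //   bs->arraycopy_prologue(...);
       //   while (qword_count--) *to++ = *from++;  // one 8-byte element per step
       //   bs->arraycopy_epilogue(...);            // e.g. card marks when is_oop
       //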
2064   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2065                                           const char *name, bool dest_uninitialized = false) {
2066     __ align(CodeEntryAlignment);
2067     StubCodeMark mark(this, "StubRoutines", name);
2068     address start = __ pc();
2069 
2070     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2071     const Register from        = rdi;  // source array address
2072     const Register to          = rsi;  // destination array address
2073     const Register qword_count = rdx;  // elements count
2074     const Register end_from    = from; // source array end address
2075     const Register end_to      = rcx;  // destination array end address
2076     const Register saved_count = r11;
2077     // End pointers are inclusive, and if count is not zero they point
2078     // to the last unit copied:  end_to[0] := end_from[0]
2079 
2080     __ enter(); // required for proper stackwalking of RuntimeStub frame
2081     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2082     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2083 
2084     if (entry != NULL) {
2085       *entry = __ pc();
2086       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2087       BLOCK_COMMENT("Entry:");
2088     }
2089 
2090     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2091                                    // r9 is used to save r15_thread
2092     // 'from', 'to' and 'qword_count' are now valid
2093 
2094     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2095     if (dest_uninitialized) {
2096       decorators |= IS_DEST_UNINITIALIZED;
2097     }
2098     if (aligned) {
2099       decorators |= ARRAYCOPY_ALIGNED;
2100     }
2101 
2102     BasicType type = is_oop ? T_OBJECT : T_LONG;
2103     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2104     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2105 
2106     // Copy from low to high addresses.  Use 'to' as scratch.
2107     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2108     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2109     __ negptr(qword_count);
2110     __ jmp(L_copy_bytes);
2111 
2112     // Copy trailing qwords
2113   __ BIND(L_copy_8_bytes);
2114     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2115     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2116     __ increment(qword_count);
2117     __ jcc(Assembler::notZero, L_copy_8_bytes);
2118 
2119     if (is_oop) {
2120       __ jmp(L_exit);
2121     } else {
2122       restore_arg_regs_using_thread();
2123       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2124       __ xorptr(rax, rax); // return 0
2125       __ vzeroupper();
2126       __ leave(); // required for proper stackwalking of RuntimeStub frame
2127       __ ret(0);
2128     }
2129 
2130     // Copy in multi-byte chunks
2131     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2132 
2133     __ BIND(L_exit);
2134     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2135     restore_arg_regs_using_thread();
2136     if (is_oop) {
2137       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2138     } else {
2139       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2140     }
2141     __ vzeroupper();
2142     __ xorptr(rax, rax); // return 0
2143     __ leave(); // required for proper stackwalking of RuntimeStub frame
2144     __ ret(0);
2145 
2146     return start;
2147   }
2148 
2149   // Arguments:
2150   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2151   //             ignored
2152   //   is_oop  - true => oop array, so generate store check code
2153   //   name    - stub name string
2154   //
2155   // Inputs:
2156   //   c_rarg0   - source array address
2157   //   c_rarg1   - destination array address
2158   //   c_rarg2   - element count, treated as ssize_t, can be zero
2159   //
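       // Rough sketch: the same qword loop as the disjoint stub, but run
       // backward so overlapping ranges copy safely:
       //
       //   while (qword_count--) to[qword_count] = from[qword_count];
       //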
2160   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2161                                           address nooverlap_target, address *entry,
2162                                           const char *name, bool dest_uninitialized = false) {
2163     __ align(CodeEntryAlignment);
2164     StubCodeMark mark(this, "StubRoutines", name);
2165     address start = __ pc();
2166 
2167     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2168     const Register from        = rdi;  // source array address
2169     const Register to          = rsi;  // destination array address
2170     const Register qword_count = rdx;  // elements count
2171     const Register saved_count = rcx;
2172 
2173     __ enter(); // required for proper stackwalking of RuntimeStub frame
2174     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2175 
2176     if (entry != NULL) {
2177       *entry = __ pc();
2178       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2179       BLOCK_COMMENT("Entry:");
2180     }
2181 
2182     array_overlap_test(nooverlap_target, Address::times_8);
2183     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2184                                    // r9 is used to save r15_thread
2185     // 'from', 'to' and 'qword_count' are now valid
2186 
2187     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2188     if (dest_uninitialized) {
2189       decorators |= IS_DEST_UNINITIALIZED;
2190     }
2191     if (aligned) {
2192       decorators |= ARRAYCOPY_ALIGNED;
2193     }
2194 
2195     BasicType type = is_oop ? T_OBJECT : T_LONG;
2196     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2197     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2198 
2199     __ jmp(L_copy_bytes);
2200 
2201     // Copy trailing qwords
2202   __ BIND(L_copy_8_bytes);
2203     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2204     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2205     __ decrement(qword_count);
2206     __ jcc(Assembler::notZero, L_copy_8_bytes);
2207 
2208     if (is_oop) {
2209       __ jmp(L_exit);
2210     } else {
2211       restore_arg_regs_using_thread();
2212       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2213       __ xorptr(rax, rax); // return 0
2214       __ vzeroupper();
2215       __ leave(); // required for proper stackwalking of RuntimeStub frame
2216       __ ret(0);
2217     }
2218 
2219     // Copy in multi-byte chunks
2220     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2221 
2222     __ BIND(L_exit);
2223     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2224     restore_arg_regs_using_thread();
2225     if (is_oop) {
2226       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2227     } else {
2228       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2229     }
2230     __ vzeroupper();
2231     __ xorptr(rax, rax); // return 0
2232     __ leave(); // required for proper stackwalking of RuntimeStub frame
2233     __ ret(0);
2234 
2235     return start;
2236   }
2237 
2238 
2239   // Helper for generating a dynamic type check.
2240   // Smashes no registers.
2241   void generate_type_check(Register sub_klass,
2242                            Register super_check_offset,
2243                            Register super_klass,
2244                            Label& L_success) {
2245     assert_different_registers(sub_klass, super_check_offset, super_klass);
2246 
2247     BLOCK_COMMENT("type_check:");
2248 
2249     Label L_miss;
2250 
2251     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2252                                      super_check_offset);
2253     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2254 
2255     // Fall through on failure!
2256     __ BIND(L_miss);
2257   }
2258 
2259   //
2260   //  Generate checkcasting array copy stub
2261   //
2262   //  Input:
2263   //    c_rarg0   - source array address
2264   //    c_rarg1   - destination array address
2265   //    c_rarg2   - element count, treated as ssize_t, can be zero
2266   //    c_rarg3   - size_t ckoff (super_check_offset)
2267   // not Win64
2268   //    c_rarg4   - oop ckval (super_klass)
2269   // Win64
2270   //    rsp+40    - oop ckval (super_klass)
2271   //
2272   //  Output:
2273   //    rax ==  0  -  success
2274   //    rax == -1^K - failure, where K is partial transfer count
2275   //
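       //  Return-value sketch: if the stub stores K elements and then hits
       //  one that fails the type check, it returns ~K (that is, -1^K);
       //  e.g. 3 elements copied before a failure yields rax == ~3 == -4,
       //  and the caller recovers K as ~rax.
       //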
2276   address generate_checkcast_copy(const char *name, address *entry,
2277                                   bool dest_uninitialized = false) {
2278 
2279     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2280 
2281     // Input registers (after setup_arg_regs)
2282     const Register from        = rdi;   // source array address
2283     const Register to          = rsi;   // destination array address
2284     const Register length      = rdx;   // elements count
2285     const Register ckoff       = rcx;   // super_check_offset
2286     const Register ckval       = r8;    // super_klass
2287 
2288     // Registers used as temps (r13, r14 are save-on-entry)
2289     const Register end_from    = from;  // source array end address
2290     const Register end_to      = r13;   // destination array end address
2291     const Register count       = rdx;   // -(count_remaining)
2292     const Register r14_length  = r14;   // saved copy of length
2293     // End pointers are inclusive, and if length is not zero they point
2294     // to the last unit copied:  end_to[0] := end_from[0]
2295 
2296     const Register rax_oop    = rax;    // actual oop copied
2297     const Register r11_klass  = r11;    // oop._klass
2298 
2299     //---------------------------------------------------------------
2300     // Assembler stub will be used for this call to arraycopy
2301     // if the two arrays are subtypes of Object[] but the
2302     // destination array type is not equal to or a supertype
2303     // of the source type.  Each element must be separately
2304     // checked.
2305 
2306     __ align(CodeEntryAlignment);
2307     StubCodeMark mark(this, "StubRoutines", name);
2308     address start = __ pc();
2309 
2310     __ enter(); // required for proper stackwalking of RuntimeStub frame
2311 
2312 #ifdef ASSERT
2313     // caller guarantees that the arrays really are different
2314     // otherwise, we would have to make conjoint checks
2315     { Label L;
2316       array_overlap_test(L, TIMES_OOP);
2317       __ stop("checkcast_copy within a single array");
2318       __ bind(L);
2319     }
2320 #endif //ASSERT
2321 
2322     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2323                        // ckoff => rcx, ckval => r8
2324                        // r9 and r10 may be used to save non-volatile registers
2325 #ifdef _WIN64
2326     // last argument (#4) is on stack on Win64
2327     __ movptr(ckval, Address(rsp, 6 * wordSize));
2328 #endif
2329 
2330     // Caller of this entry point must set up the argument registers.
2331     if (entry != NULL) {
2332       *entry = __ pc();
2333       BLOCK_COMMENT("Entry:");
2334     }
2335 
2336     // allocate spill slots for r13, r14 and r10
2337     enum {
2338       saved_r13_offset,
2339       saved_r14_offset,
2340       saved_r10_offset,
2341       saved_rbp_offset
2342     };
2343     __ subptr(rsp, saved_rbp_offset * wordSize);
2344     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2345     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2346     __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2347 
2348 #ifdef ASSERT
2349       Label L2;
2350       __ get_thread(r14);
2351       __ cmpptr(r15_thread, r14);
2352       __ jcc(Assembler::equal, L2);
2353       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2354       __ bind(L2);
2355 #endif // ASSERT
2356 
2357     // check that int operands are properly extended to size_t
2358     assert_clean_int(length, rax);
2359     assert_clean_int(ckoff, rax);
2360 
2361 #ifdef ASSERT
2362     BLOCK_COMMENT("assert consistent ckoff/ckval");
2363     // The ckoff and ckval must be mutually consistent,
2364     // even though caller generates both.
2365     { Label L;
2366       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2367       __ cmpl(ckoff, Address(ckval, sco_offset));
2368       __ jcc(Assembler::equal, L);
2369       __ stop("super_check_offset inconsistent");
2370       __ bind(L);
2371     }
2372 #endif //ASSERT
2373 
2374     // Loop-invariant addresses.  They are exclusive end pointers.
2375     Address end_from_addr(from, length, TIMES_OOP, 0);
2376     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2377     // Loop-variant addresses.  They assume post-incremented count < 0.
2378     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2379     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2380 
2381     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2382     if (dest_uninitialized) {
2383       decorators |= IS_DEST_UNINITIALIZED;
2384     }
2385 
2386     BasicType type = T_OBJECT;
2387     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2388     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2389 
2390     // Copy from low to high addresses, indexed from the end of each array.
2391     __ lea(end_from, end_from_addr);
2392     __ lea(end_to,   end_to_addr);
2393     __ movptr(r14_length, length);        // save a copy of the length
2394     assert(length == count, "");          // else fix next line:
2395     __ negptr(count);                     // negate and test the length
2396     __ jcc(Assembler::notZero, L_load_element);
2397 
2398     // Empty array:  Nothing to do.
2399     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2400     __ jmp(L_done);
2401 
2402     // ======== begin loop ========
2403     // (Loop is rotated; its entry is L_load_element.)
2404     // Loop control:
2405     //   for (count = -count; count != 0; count++)
2406     // Base pointers src, dst are biased by 8*(count-1), to the last element.
2407     __ align(OptoLoopAlignment);
2408 
2409     __ BIND(L_store_element);
2410     __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW);  // store the oop
2411     __ increment(count);               // increment the count toward zero
2412     __ jcc(Assembler::zero, L_do_card_marks);
2413 
2414     // ======== loop entry is here ========
2415     __ BIND(L_load_element);
2416     __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2417     __ testptr(rax_oop, rax_oop);
2418     __ jcc(Assembler::zero, L_store_element);
2419 
2420     __ load_klass(r11_klass, rax_oop); // query the object klass
2421     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2422     // ======== end loop ========
2423 
2424     // It was a real error; we must depend on the caller to finish the job.
2425     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2426     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2427     // and report their number to the caller.
2428     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2429     Label L_post_barrier;
2430     __ addptr(r14_length, count);     // K = (original - remaining) oops
2431     __ movptr(rax, r14_length);       // save the value
2432     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2433     __ jccb(Assembler::notZero, L_post_barrier);
2434     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2435 
2436     // Come here on success only.
2437     __ BIND(L_do_card_marks);
2438     __ xorptr(rax, rax);              // return 0 on success
2439 
2440     __ BIND(L_post_barrier);
2441     bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2442 
2443     // Common exit point (success or failure).
2444     __ BIND(L_done);
2445     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2446     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2447     __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2448     restore_arg_regs();
2449     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2450     __ leave(); // required for proper stackwalking of RuntimeStub frame
2451     __ ret(0);
2452 
2453     return start;
2454   }
2455 
2456   //
2457   //  Generate 'unsafe' array copy stub
2458   //  Though just as safe as the other stubs, it takes an unscaled
2459   //  size_t argument instead of an element count.
2460   //
2461   //  Input:
2462   //    c_rarg0   - source array address
2463   //    c_rarg1   - destination array address
2464   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2465   //
2466   // Examines the alignment of the operands and dispatches
2467   // to a long, int, short, or byte copy loop.
2468   //
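       // Dispatch sketch: OR the two addresses and the byte count together,
       // then test the low bits of the result --
       //
       //   bits = (uintptr_t)from | (uintptr_t)to | size;
       //   if ((bits & 7) == 0) goto long_copy;   // everything 8-byte aligned
       //   if ((bits & 3) == 0) goto int_copy;
       //   if ((bits & 1) == 0) goto short_copy;
       //   goto byte_copy;
       //
       // with 'size' shifted down to an element count before each tail jump.
       //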
2469   address generate_unsafe_copy(const char *name,
2470                                address byte_copy_entry, address short_copy_entry,
2471                                address int_copy_entry, address long_copy_entry) {
2472 
2473     Label L_long_aligned, L_int_aligned, L_short_aligned;
2474 
2475     // Input registers (before setup_arg_regs)
2476     const Register from        = c_rarg0;  // source array address
2477     const Register to          = c_rarg1;  // destination array address
2478     const Register size        = c_rarg2;  // byte count (size_t)
2479 
2480     // Register used as a temp
2481     const Register bits        = rax;      // test copy of low bits
2482 
2483     __ align(CodeEntryAlignment);
2484     StubCodeMark mark(this, "StubRoutines", name);
2485     address start = __ pc();
2486 
2487     __ enter(); // required for proper stackwalking of RuntimeStub frame
2488 
2489     // bump this on entry, not on exit:
2490     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2491 
2492     __ mov(bits, from);
2493     __ orptr(bits, to);
2494     __ orptr(bits, size);
2495 
2496     __ testb(bits, BytesPerLong-1);
2497     __ jccb(Assembler::zero, L_long_aligned);
2498 
2499     __ testb(bits, BytesPerInt-1);
2500     __ jccb(Assembler::zero, L_int_aligned);
2501 
2502     __ testb(bits, BytesPerShort-1);
2503     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2504 
2505     __ BIND(L_short_aligned);
2506     __ shrptr(size, LogBytesPerShort); // size => short_count
2507     __ jump(RuntimeAddress(short_copy_entry));
2508 
2509     __ BIND(L_int_aligned);
2510     __ shrptr(size, LogBytesPerInt); // size => int_count
2511     __ jump(RuntimeAddress(int_copy_entry));
2512 
2513     __ BIND(L_long_aligned);
2514     __ shrptr(size, LogBytesPerLong); // size => qword_count
2515     __ jump(RuntimeAddress(long_copy_entry));
2516 
2517     return start;
2518   }
2519 
2520   // Perform range checks on the proposed arraycopy.
2521   // Kills temp, but nothing else.
2522   // Also, clean the sign bits of src_pos and dst_pos.
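       // The bounds tests below rely on unsigned compares (jcc above), so a
       // sum that exceeds the array length can never pass as a small signed
       // value; roughly:
       //
       //   if ((juint)(src_pos + length) > (juint)arrayOop(src)->length())  FAIL;
       //   if ((juint)(dst_pos + length) > (juint)arrayOop(dst)->length())  FAIL;
       //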
2523   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2524                               Register src_pos, // source position (c_rarg1)
2525                               Register dst,     // destination array oop (c_rarg2)
2526                               Register dst_pos, // destination position (c_rarg3)
2527                               Register length,
2528                               Register temp,
2529                               Label& L_failed) {
2530     BLOCK_COMMENT("arraycopy_range_checks:");
2531 
2532     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2533     __ movl(temp, length);
2534     __ addl(temp, src_pos);             // src_pos + length
2535     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2536     __ jcc(Assembler::above, L_failed);
2537 
2538     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2539     __ movl(temp, length);
2540     __ addl(temp, dst_pos);             // dst_pos + length
2541     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2542     __ jcc(Assembler::above, L_failed);
2543 
2544     // Clear the high 32 bits of 'src_pos' and 'dst_pos'.  A sign-extending
2545     // move is safe here because both values are known to be non-negative.
2546     __ movslq(src_pos, src_pos);
2547     __ movslq(dst_pos, dst_pos);
2548 
2549     BLOCK_COMMENT("arraycopy_range_checks done");
2550   }
2551 
2552   //
2553   //  Generate generic array copy stubs
2554   //
2555   //  Input:
2556   //    c_rarg0    -  src oop
2557   //    c_rarg1    -  src_pos (32-bits)
2558   //    c_rarg2    -  dst oop
2559   //    c_rarg3    -  dst_pos (32-bits)
2560   // not Win64
2561   //    c_rarg4    -  element count (32-bits)
2562   // Win64
2563   //    rsp+40     -  element count (32-bits)
2564   //
2565   //  Output:
2566   //    rax ==  0  -  success
2567   //    rax == -1^K - failure, where K is partial transfer count
2568   //
2569   address generate_generic_copy(const char *name,
2570                                 address byte_copy_entry, address short_copy_entry,
2571                                 address int_copy_entry, address oop_copy_entry,
2572                                 address long_copy_entry, address checkcast_copy_entry) {
2573 
2574     Label L_failed, L_failed_0, L_objArray;
2575     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2576 
2577     // Input registers
2578     const Register src        = c_rarg0;  // source array oop
2579     const Register src_pos    = c_rarg1;  // source position
2580     const Register dst        = c_rarg2;  // destination array oop
2581     const Register dst_pos    = c_rarg3;  // destination position
2582 #ifndef _WIN64
2583     const Register length     = c_rarg4;
2584 #else
2585     const Address  length(rsp, 6 * wordSize);  // elements count is on stack on Win64
2586 #endif
2587 
2588     { int modulus = CodeEntryAlignment;
2589       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2590       int advance = target - (__ offset() % modulus);
2591       if (advance < 0)  advance += modulus;
2592       if (advance > 0)  __ nop(advance);
2593     }
2594     StubCodeMark mark(this, "StubRoutines", name);
2595 
2596     // Short-hop target to L_failed.  Makes for denser prologue code.
2597     __ BIND(L_failed_0);
2598     __ jmp(L_failed);
2599     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2600 
2601     __ align(CodeEntryAlignment);
2602     address start = __ pc();
2603 
2604     __ enter(); // required for proper stackwalking of RuntimeStub frame
2605 
2606     // bump this on entry, not on exit:
2607     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2608 
2609     //-----------------------------------------------------------------------
2610     // Assembler stub will be used for this call to arraycopy
2611     // if the following conditions are met:
2612     //
2613     // (1) src and dst must not be null.
2614     // (2) src_pos must not be negative.
2615     // (3) dst_pos must not be negative.
2616     // (4) length  must not be negative.
2617     // (5) src klass and dst klass should be the same and not NULL.
2618     // (6) src and dst should be arrays.
2619     // (7) src_pos + length must not exceed length of src.
2620     // (8) dst_pos + length must not exceed length of dst.
2621     //
2622 
2623     //  if (src == NULL) return -1;
2624     __ testptr(src, src);         // src oop
2625     size_t j1off = __ offset();
2626     __ jccb(Assembler::zero, L_failed_0);
2627 
2628     //  if (src_pos < 0) return -1;
2629     __ testl(src_pos, src_pos); // src_pos (32-bits)
2630     __ jccb(Assembler::negative, L_failed_0);
2631 
2632     //  if (dst == NULL) return -1;
2633     __ testptr(dst, dst);         // dst oop
2634     __ jccb(Assembler::zero, L_failed_0);
2635 
2636     //  if (dst_pos < 0) return -1;
2637     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2638     size_t j4off = __ offset();
2639     __ jccb(Assembler::negative, L_failed_0);
2640 
2641     // The first four tests are very dense code,
2642     // but not quite dense enough to put four
2643     // jumps in a 16-byte instruction fetch buffer.
2644     // That's good, because some branch predictors
2645     // do not like jumps so close together.
2646     // Make sure of this.
2647     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2648 
2649     // registers used as temp
2650     const Register r11_length    = r11; // elements count to copy
2651     const Register r10_src_klass = r10; // array klass
2652 
2653     //  if (length < 0) return -1;
2654     __ movl(r11_length, length);        // length (elements count, 32-bits value)
2655     __ testl(r11_length, r11_length);
2656     __ jccb(Assembler::negative, L_failed_0);
2657 
2658     __ load_klass(r10_src_klass, src);
2659 #ifdef ASSERT
2660     //  assert(src->klass() != NULL);
2661     {
2662       BLOCK_COMMENT("assert klasses not null {");
2663       Label L1, L2;
2664       __ testptr(r10_src_klass, r10_src_klass);
2665       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
2666       __ bind(L1);
2667       __ stop("broken null klass");
2668       __ bind(L2);
2669       __ load_klass(rax, dst);
2670       __ cmpq(rax, 0);
2671       __ jcc(Assembler::equal, L1);     // this would be broken also
2672       BLOCK_COMMENT("} assert klasses not null done");
2673     }
2674 #endif
2675 
2676     // Load layout helper (32-bits)
2677     //
2678     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2679     // 32        30    24            16              8     2                 0
2680     //
2681     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2682     //
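         // Decoding sketch (cf. the Klass::_lh_* constants used below):
         //
         //   header_size = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
         //   log2_elsize =  lh & _lh_log2_element_size_mask;
         //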
2683 
2684     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2685 
2686     // Handle objArrays completely differently...
2687     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2688     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2689     __ jcc(Assembler::equal, L_objArray);
2690 
2691     //  if (src->klass() != dst->klass()) return -1;
2692     __ load_klass(rax, dst);
2693     __ cmpq(r10_src_klass, rax);
2694     __ jcc(Assembler::notEqual, L_failed);
2695 
2696     const Register rax_lh = rax;  // layout helper
2697     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2698 
2699     //  if (!src->is_Array()) return -1;
2700     __ cmpl(rax_lh, Klass::_lh_neutral_value);
2701     __ jcc(Assembler::greaterEqual, L_failed);
2702 
2703     // At this point, it is known to be a typeArray (array_tag 0x3).
2704 #ifdef ASSERT
2705     {
2706       BLOCK_COMMENT("assert primitive array {");
2707       Label L;
2708       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2709       __ jcc(Assembler::greaterEqual, L);
2710       __ stop("must be a primitive array");
2711       __ bind(L);
2712       BLOCK_COMMENT("} assert primitive array done");
2713     }
2714 #endif
2715 
2716     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2717                            r10, L_failed);
2718 
2719     // TypeArrayKlass
2720     //
2721     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2722     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2723     //
2724 
2725     const Register r10_offset = r10;    // array offset
2726     const Register rax_elsize = rax_lh; // element size
2727 
2728     __ movl(r10_offset, rax_lh);
2729     __ shrl(r10_offset, Klass::_lh_header_size_shift);
2730     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
2731     __ addptr(src, r10_offset);           // src array offset
2732     __ addptr(dst, r10_offset);           // dst array offset
2733     BLOCK_COMMENT("choose copy loop based on element size");
2734     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
2735 
2736     // The following registers must be set before the jump to the corresponding stub
2737     const Register from     = c_rarg0;  // source array address
2738     const Register to       = c_rarg1;  // destination array address
2739     const Register count    = c_rarg2;  // elements count
2740 
2741     // 'from', 'to' and 'count' must be assigned in this order, since they
2742     // occupy the same registers as 'src', 'src_pos' and 'dst' respectively.
2743 
2744   __ BIND(L_copy_bytes);
2745     __ cmpl(rax_elsize, 0);
2746     __ jccb(Assembler::notEqual, L_copy_shorts);
2747     __ lea(from, Address(src, src_pos, Address::times_1, 0)); // src_addr
2748     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0)); // dst_addr
2749     __ movl2ptr(count, r11_length); // length
2750     __ jump(RuntimeAddress(byte_copy_entry));
2751 
2752   __ BIND(L_copy_shorts);
2753     __ cmpl(rax_elsize, LogBytesPerShort);
2754     __ jccb(Assembler::notEqual, L_copy_ints);
2755     __ lea(from, Address(src, src_pos, Address::times_2, 0)); // src_addr
2756     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0)); // dst_addr
2757     __ movl2ptr(count, r11_length); // length
2758     __ jump(RuntimeAddress(short_copy_entry));
2759 
2760   __ BIND(L_copy_ints);
2761     __ cmpl(rax_elsize, LogBytesPerInt);
2762     __ jccb(Assembler::notEqual, L_copy_longs);
2763     __ lea(from, Address(src, src_pos, Address::times_4, 0)); // src_addr
2764     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0)); // dst_addr
2765     __ movl2ptr(count, r11_length); // length
2766     __ jump(RuntimeAddress(int_copy_entry));
2767 
2768   __ BIND(L_copy_longs);
2769 #ifdef ASSERT
2770     {
2771       BLOCK_COMMENT("assert long copy {");
2772       Label L;
2773       __ cmpl(rax_elsize, LogBytesPerLong);
2774       __ jcc(Assembler::equal, L);
2775       __ stop("must be long copy, but elsize is wrong");
2776       __ bind(L);
2777       BLOCK_COMMENT("} assert long copy done");
2778     }
2779 #endif
2780     __ lea(from, Address(src, src_pos, Address::times_8, 0)); // src_addr
2781     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0)); // dst_addr
2782     __ movl2ptr(count, r11_length); // length
2783     __ jump(RuntimeAddress(long_copy_entry));
2784 
2785     // ObjArrayKlass
2786   __ BIND(L_objArray);
2787     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
2788 
2789     Label L_plain_copy, L_checkcast_copy;
2790     //  test array classes for subtyping
2791     __ load_klass(rax, dst);
2792     __ cmpq(r10_src_klass, rax); // usual case is exact equality
2793     __ jcc(Assembler::notEqual, L_checkcast_copy);
2794 
2795     // Identically typed arrays can be copied without element-wise checks.
2796     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2797                            r10, L_failed);
2798 
2799     __ lea(from, Address(src, src_pos, TIMES_OOP,
2800                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
2801     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2802                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
2803     __ movl2ptr(count, r11_length); // length
2804   __ BIND(L_plain_copy);
2805     __ jump(RuntimeAddress(oop_copy_entry));
2806 
2807   __ BIND(L_checkcast_copy);
2808     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
2809     {
2810       // Before looking at dst.length, make sure dst is also an objArray.
2811       __ cmpl(Address(rax, lh_offset), objArray_lh);
2812       __ jcc(Assembler::notEqual, L_failed);
2813 
2814       // It is safe to examine both src.length and dst.length.
2815       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2816                              rax, L_failed);
2817 
2818       const Register r11_dst_klass = r11;
2819       __ load_klass(r11_dst_klass, dst); // reload
2820 
2821       // Marshal the base address arguments now, freeing registers.
2822       __ lea(from, Address(src, src_pos, TIMES_OOP,
2823                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2824       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2825                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2826       __ movl(count, length);           // length (reloaded)
2827       Register sco_temp = c_rarg3;      // this register is free now
2828       assert_different_registers(from, to, count, sco_temp,
2829                                  r11_dst_klass, r10_src_klass);
2830       assert_clean_int(count, sco_temp);
2831 
2832       // Generate the type check.
2833       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2834       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
2835       assert_clean_int(sco_temp, rax);
2836       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
2837 
2838       // Fetch destination element klass from the ObjArrayKlass header.
2839       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2840       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
2841       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
2842       assert_clean_int(sco_temp, rax);
2843 
2844       // the checkcast_copy loop needs two extra arguments:
2845       assert(c_rarg3 == sco_temp, "#3 already in place");
2846       // Set up arguments for checkcast_copy_entry.
2847       setup_arg_regs(4);
2848       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
2849       __ jump(RuntimeAddress(checkcast_copy_entry));
2850     }
2851 
2852   __ BIND(L_failed);
2853     __ xorptr(rax, rax);
2854     __ notptr(rax); // return -1
2855     __ leave();   // required for proper stackwalking of RuntimeStub frame
2856     __ ret(0);
2857 
2858     return start;
2859   }
2860 
2861   void generate_arraycopy_stubs() {
2862     address entry;
2863     address entry_jbyte_arraycopy;
2864     address entry_jshort_arraycopy;
2865     address entry_jint_arraycopy;
2866     address entry_oop_arraycopy;
2867     address entry_jlong_arraycopy;
2868     address entry_checkcast_arraycopy;
2869 
2870     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
2871                                                                            "jbyte_disjoint_arraycopy");
2872     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
2873                                                                            "jbyte_arraycopy");
2874 
2875     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
2876                                                                             "jshort_disjoint_arraycopy");
2877     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
2878                                                                             "jshort_arraycopy");
2879 
2880     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
2881                                                                               "jint_disjoint_arraycopy");
2882     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
2883                                                                               &entry_jint_arraycopy, "jint_arraycopy");
2884 
2885     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
2886                                                                                "jlong_disjoint_arraycopy");
2887     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
2888                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
2889 
2890 
2891     if (UseCompressedOops) {
2892       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
2893                                                                               "oop_disjoint_arraycopy");
2894       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
2895                                                                               &entry_oop_arraycopy, "oop_arraycopy");
2896       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
2897                                                                                      "oop_disjoint_arraycopy_uninit",
2898                                                                                      /*dest_uninitialized*/true);
2899       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
2900                                                                                      NULL, "oop_arraycopy_uninit",
2901                                                                                      /*dest_uninitialized*/true);
2902     } else {
2903       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
2904                                                                                "oop_disjoint_arraycopy");
2905       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
2906                                                                                &entry_oop_arraycopy, "oop_arraycopy");
2907       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
2908                                                                                       "oop_disjoint_arraycopy_uninit",
2909                                                                                       /*dest_uninitialized*/true);
2910       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
2911                                                                                       NULL, "oop_arraycopy_uninit",
2912                                                                                       /*dest_uninitialized*/true);
2913     }
2914 
2915     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2916     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2917                                                                         /*dest_uninitialized*/true);
2918 
2919     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2920                                                               entry_jbyte_arraycopy,
2921                                                               entry_jshort_arraycopy,
2922                                                               entry_jint_arraycopy,
2923                                                               entry_jlong_arraycopy);
2924     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2925                                                                entry_jbyte_arraycopy,
2926                                                                entry_jshort_arraycopy,
2927                                                                entry_jint_arraycopy,
2928                                                                entry_oop_arraycopy,
2929                                                                entry_jlong_arraycopy,
2930                                                                entry_checkcast_arraycopy);
2931 
2932     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2933     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2934     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2935     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2936     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2937     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2938 
2939     // We don't generate specialized code for HeapWord-aligned source
2940     // arrays, so just use the code we've already generated
2941     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
2942     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
2943 
2944     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
2945     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
2946 
2947     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
2948     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
2949 
2950     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
2951     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
2952 
2953     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
2954     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
2955 
2956     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
2957     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
2958   }
2959 
2960   // AES intrinsic stubs
2961   enum {AESBlockSize = 16};
2962 
2963   address generate_key_shuffle_mask() {
2964     __ align(16);
2965     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
2966     address start = __ pc();
2967     __ emit_data64( 0x0405060700010203, relocInfo::none );
2968     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
2969     return start;
2970   }
2971 
2972   address generate_counter_shuffle_mask() {
2973     __ align(16);
2974     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
2975     address start = __ pc();
2976     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
2977     __ emit_data64(0x0001020304050607, relocInfo::none);
2978     return start;
2979   }
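
       // For reference: the counter shuffle mask above reverses all 16 bytes of
       // an XMM register (dst.byte[i] = src.byte[15 - i]), turning the
       // big-endian CTR counter into a little-endian value that inc_counter()
       // below can advance with ordinary integer adds.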
2980 
2981   // Utility routine for loading a 128-bit key word in little-endian format.
2982   // The shuffle mask may optionally be supplied already loaded in an XMM register.
2983   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2984     __ movdqu(xmmdst, Address(key, offset));
2985     if (xmm_shuf_mask != NULL) {
2986       __ pshufb(xmmdst, xmm_shuf_mask);
2987     } else {
2988       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2989     }
2990   }
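
       // For reference: the key shuffle mask byte-swaps each 32-bit word of the
       // key (a per-word bswap), converting the ints produced by the Java key
       // schedule into the byte order AES-NI expects. Roughly, in C (a sketch,
       // not generated code):
       //
       //   for (int i = 0; i < 4; i++) {
       //     dst.word[i] = __builtin_bswap32(src.word[i]);
       //   }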
2991 
2992   // Utility routine for incrementing the 128-bit counter (the IV in CTR mode)
2993   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
2994     __ pextrq(reg, xmmdst, 0x0);
2995     __ addq(reg, inc_delta);
2996     __ pinsrq(xmmdst, reg, 0x0);
2997     __ jcc(Assembler::carryClear, next_block); // jump if no carry
2998     __ pextrq(reg, xmmdst, 0x01); // Carry
2999     __ addq(reg, 0x01);
3000     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3001     __ BIND(next_block);          // continue with the next block
3002   }
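
       // For reference, inc_counter() above performs a 128-bit add with carry,
       // roughly (a C sketch over a little-endian uint64_t pair, not generated
       // code):
       //
       //   void inc128(uint64_t ctr[2], uint64_t delta) {
       //     uint64_t lo = ctr[0];
       //     ctr[0] = lo + delta;
       //     if (ctr[0] < lo) {  // carry out of the low qword
       //       ctr[1] += 1;
       //     }
       //   }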
3003 
3004   // Arguments:
3005   //
3006   // Inputs:
3007   //   c_rarg0   - source byte array address
3008   //   c_rarg1   - destination byte array address
3009   //   c_rarg2   - K (key) in little endian int array
3010   //
3011   address generate_aescrypt_encryptBlock() {
3012     assert(UseAES, "need AES instructions and misaligned SSE support");
3013     __ align(CodeEntryAlignment);
3014     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3015     Label L_doLast;
3016     address start = __ pc();
3017 
3018     const Register from        = c_rarg0;  // source array address
3019     const Register to          = c_rarg1;  // destination array address
3020     const Register key         = c_rarg2;  // key array address
3021     const Register keylen      = rax;
3022 
3023     const XMMRegister xmm_result = xmm0;
3024     const XMMRegister xmm_key_shuf_mask = xmm1;
3025     // On Win64, xmm6-xmm15 must be preserved, so don't use them.
3026     const XMMRegister xmm_temp1  = xmm2;
3027     const XMMRegister xmm_temp2  = xmm3;
3028     const XMMRegister xmm_temp3  = xmm4;
3029     const XMMRegister xmm_temp4  = xmm5;
3030 
3031     __ enter(); // required for proper stackwalking of RuntimeStub frame
3032 
3033     // keylen can only be {11, 13, 15} * 4 = {44, 52, 60}
3034     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3035 
3036     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3037     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3038 
3039     // For encryption, the Java expanded key ordering is just what we need.
3040     // We don't know whether the key is aligned, hence we avoid the load-execute form.
3041 
3042     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3043     __ pxor(xmm_result, xmm_temp1);
3044 
3045     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3046     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3047     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3048     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3049 
3050     __ aesenc(xmm_result, xmm_temp1);
3051     __ aesenc(xmm_result, xmm_temp2);
3052     __ aesenc(xmm_result, xmm_temp3);
3053     __ aesenc(xmm_result, xmm_temp4);
3054 
3055     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3056     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3057     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3058     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3059 
3060     __ aesenc(xmm_result, xmm_temp1);
3061     __ aesenc(xmm_result, xmm_temp2);
3062     __ aesenc(xmm_result, xmm_temp3);
3063     __ aesenc(xmm_result, xmm_temp4);
3064 
3065     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3066     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3067 
3068     __ cmpl(keylen, 44);
3069     __ jccb(Assembler::equal, L_doLast);
3070 
3071     __ aesenc(xmm_result, xmm_temp1);
3072     __ aesenc(xmm_result, xmm_temp2);
3073 
3074     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3075     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3076 
3077     __ cmpl(keylen, 52);
3078     __ jccb(Assembler::equal, L_doLast);
3079 
3080     __ aesenc(xmm_result, xmm_temp1);
3081     __ aesenc(xmm_result, xmm_temp2);
3082 
3083     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3084     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3085 
3086     __ BIND(L_doLast);
3087     __ aesenc(xmm_result, xmm_temp1);
3088     __ aesenclast(xmm_result, xmm_temp2);
3089     __ movdqu(Address(to, 0), xmm_result);        // store the result
3090     __ xorptr(rax, rax); // return 0
3091     __ leave(); // required for proper stackwalking of RuntimeStub frame
3092     __ ret(0);
3093 
3094     return start;
3095   }
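
       // For reference: the round counts follow the AES spec -- 10 rounds for a
       // 128-bit key, 12 for 192-bit, 14 for 256-bit -- matching the expanded
       // key lengths of 44, 52 and 60 ints tested against keylen above.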
3096 
3097 
3098   // Arguments:
3099   //
3100   // Inputs:
3101   //   c_rarg0   - source byte array address
3102   //   c_rarg1   - destination byte array address
3103   //   c_rarg2   - K (key) in little endian int array
3104   //
3105   address generate_aescrypt_decryptBlock() {
3106     assert(UseAES, "need AES instructions and misaligned SSE support");
3107     __ align(CodeEntryAlignment);
3108     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3109     Label L_doLast;
3110     address start = __ pc();
3111 
3112     const Register from        = c_rarg0;  // source array address
3113     const Register to          = c_rarg1;  // destination array address
3114     const Register key         = c_rarg2;  // key array address
3115     const Register keylen      = rax;
3116 
3117     const XMMRegister xmm_result = xmm0;
3118     const XMMRegister xmm_key_shuf_mask = xmm1;
3119     // On Win64, xmm6-xmm15 must be preserved, so don't use them.
3120     const XMMRegister xmm_temp1  = xmm2;
3121     const XMMRegister xmm_temp2  = xmm3;
3122     const XMMRegister xmm_temp3  = xmm4;
3123     const XMMRegister xmm_temp4  = xmm5;
3124 
3125     __ enter(); // required for proper stackwalking of RuntimeStub frame
3126 
3127     // keylen can only be {11, 13, 15} * 4 = {44, 52, 60}
3128     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3129 
3130     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3131     __ movdqu(xmm_result, Address(from, 0));
3132 
3133     // For decryption, the Java expanded key ordering is rotated one position
3134     // from what we want, so we start at 0x10 here and hit 0x00 last.
3135     // We don't know whether the key is aligned, hence we avoid the load-execute form.
3136     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3137     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3138     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3139     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3140 
3141     __ pxor  (xmm_result, xmm_temp1);
3142     __ aesdec(xmm_result, xmm_temp2);
3143     __ aesdec(xmm_result, xmm_temp3);
3144     __ aesdec(xmm_result, xmm_temp4);
3145 
3146     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3147     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3148     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3149     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3150 
3151     __ aesdec(xmm_result, xmm_temp1);
3152     __ aesdec(xmm_result, xmm_temp2);
3153     __ aesdec(xmm_result, xmm_temp3);
3154     __ aesdec(xmm_result, xmm_temp4);
3155 
3156     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3157     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3158     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3159 
3160     __ cmpl(keylen, 44);
3161     __ jccb(Assembler::equal, L_doLast);
3162 
3163     __ aesdec(xmm_result, xmm_temp1);
3164     __ aesdec(xmm_result, xmm_temp2);
3165 
3166     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3167     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3168 
3169     __ cmpl(keylen, 52);
3170     __ jccb(Assembler::equal, L_doLast);
3171 
3172     __ aesdec(xmm_result, xmm_temp1);
3173     __ aesdec(xmm_result, xmm_temp2);
3174 
3175     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3176     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3177 
3178     __ BIND(L_doLast);
3179     __ aesdec(xmm_result, xmm_temp1);
3180     __ aesdec(xmm_result, xmm_temp2);
3181 
3182     // for decryption, the aesdeclast operation always uses key+0x00
3183     __ aesdeclast(xmm_result, xmm_temp3);
3184     __ movdqu(Address(to, 0), xmm_result);  // store the result
3185     __ xorptr(rax, rax); // return 0
3186     __ leave(); // required for proper stackwalking of RuntimeStub frame
3187     __ ret(0);
3188 
3189     return start;
3190   }
3191 
3192 
3193   // Arguments:
3194   //
3195   // Inputs:
3196   //   c_rarg0   - source byte array address
3197   //   c_rarg1   - destination byte array address
3198   //   c_rarg2   - K (key) in little endian int array
3199   //   c_rarg3   - r vector byte array address
3200   //   c_rarg4   - input length
3201   //
3202   // Output:
3203   //   rax       - input length
3204   //
3205   address generate_cipherBlockChaining_encryptAESCrypt() {
3206     assert(UseAES, "need AES instructions and misaligned SSE support");
3207     __ align(CodeEntryAlignment);
3208     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3209     address start = __ pc();
3210 
3211     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3212     const Register from        = c_rarg0;  // source array address
3213     const Register to          = c_rarg1;  // destination array address
3214     const Register key         = c_rarg2;  // key array address
3215     const Register rvec        = c_rarg3;  // r byte array, initialized from the init-vector
3216                                            // address and left holding the last ciphertext block
3217 #ifndef _WIN64
3218     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3219 #else
3220     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3221     const Register len_reg     = r11;      // pick a volatile Windows register
3222 #endif
3223     const Register pos         = rax;
3224 
3225     // xmm register assignments for the loops below
3226     const XMMRegister xmm_result = xmm0;
3227     const XMMRegister xmm_temp   = xmm1;
3228     // keys 0-10 preloaded into xmm2-xmm12
3229     const int XMM_REG_NUM_KEY_FIRST = 2;
3230     const int XMM_REG_NUM_KEY_LAST  = 15;
3231     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3232     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3233     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3234     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3235     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3236 
3237     __ enter(); // required for proper stackwalking of RuntimeStub frame
3238 
3239 #ifdef _WIN64
3240     // on win64, fill len_reg from stack position
3241     __ movl(len_reg, len_mem);
3242 #else
3243     __ push(len_reg); // Save
3244 #endif
3245 
3246     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3247     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3248     // load up xmm registers xmm2 through xmm12 with keys 0x00 - 0xa0
3249     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3250       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3251       offset += 0x10;
3252     }
3253     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3254 
3255     // now split into different paths depending on the key length (len in ints of the AESCrypt.KLE array: 44=128-bit, 52=192-bit, 60=256-bit)
3256     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3257     __ cmpl(rax, 44);
3258     __ jcc(Assembler::notEqual, L_key_192_256);
3259 
3260     // 128 bit code follows here
3261     __ movptr(pos, 0);
3262     __ align(OptoLoopAlignment);
3263 
3264     __ BIND(L_loopTop_128);
3265     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3266     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3267     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3268     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3269       __ aesenc(xmm_result, as_XMMRegister(rnum));
3270     }
3271     __ aesenclast(xmm_result, xmm_key10);
3272     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3273     // no need to store r to memory until we exit
3274     __ addptr(pos, AESBlockSize);
3275     __ subptr(len_reg, AESBlockSize);
3276     __ jcc(Assembler::notEqual, L_loopTop_128);
3277 
3278     __ BIND(L_exit);
3279     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3280 
3281 #ifdef _WIN64
3282     __ movl(rax, len_mem);
3283 #else
3284     __ pop(rax); // return length
3285 #endif
3286     __ leave(); // required for proper stackwalking of RuntimeStub frame
3287     __ ret(0);
3288 
3289     __ BIND(L_key_192_256);
3290     // here rax = key length in ints of the AESCrypt.KLE array (52=192-bit or 60=256-bit)
3291     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3292     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3293     __ cmpl(rax, 52);
3294     __ jcc(Assembler::notEqual, L_key_256);
3295 
3296     // 192-bit code follows here (could be changed to use more xmm registers)
3297     __ movptr(pos, 0);
3298     __ align(OptoLoopAlignment);
3299 
3300     __ BIND(L_loopTop_192);
3301     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3302     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3303     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3304     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3305       __ aesenc(xmm_result, as_XMMRegister(rnum));
3306     }
3307     __ aesenclast(xmm_result, xmm_key12);
3308     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3309     // no need to store r to memory until we exit
3310     __ addptr(pos, AESBlockSize);
3311     __ subptr(len_reg, AESBlockSize);
3312     __ jcc(Assembler::notEqual, L_loopTop_192);
3313     __ jmp(L_exit);
3314 
3315     __ BIND(L_key_256);
3316     // 256-bit code follows here (could be changed to use more xmm registers)
3317     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3318     __ movptr(pos, 0);
3319     __ align(OptoLoopAlignment);
3320 
3321     __ BIND(L_loopTop_256);
3322     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3323     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3324     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3325     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3326       __ aesenc(xmm_result, as_XMMRegister(rnum));
3327     }
3328     load_key(xmm_temp, key, 0xe0);
3329     __ aesenclast(xmm_result, xmm_temp);
3330     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3331     // no need to store r to memory until we exit
3332     __ addptr(pos, AESBlockSize);
3333     __ subptr(len_reg, AESBlockSize);
3334     __ jcc(Assembler::notEqual, L_loopTop_256);
3335     __ jmp(L_exit);
3336 
3337     return start;
3338   }
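
       // For reference, CBC encryption computes the serial recurrence
       //
       //   C[i] = AES_Encrypt(P[i] ^ C[i-1]),  with C[-1] = IV (rvec)
       //
       // Each block depends on the previous ciphertext, so the loops above can
       // only process one block per iteration.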
3339 
3340   // Safefetch stubs.
3341   void generate_safefetch(const char* name, int size, address* entry,
3342                           address* fault_pc, address* continuation_pc) {
3343     // safefetch signatures:
3344     //   int      SafeFetch32(int*      adr, int      errValue);
3345     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3346     //
3347     // arguments:
3348     //   c_rarg0 = adr
3349     //   c_rarg1 = errValue
3350     //
3351     // result:
3352     //   rax      = *adr or errValue
3353 
3354     StubCodeMark mark(this, "StubRoutines", name);
3355 
3356     // Entry point (pc).
3357     *entry = __ pc();
3358 
3359     // Load *adr into c_rarg1, may fault.
3360     *fault_pc = __ pc();
3361     switch (size) {
3362       case 4:
3363         // int32_t
3364         __ movl(c_rarg1, Address(c_rarg0, 0));
3365         break;
3366       case 8:
3367         // int64_t
3368         __ movq(c_rarg1, Address(c_rarg0, 0));
3369         break;
3370       default:
3371         ShouldNotReachHere();
3372     }
3373 
3374     // return errValue or *adr
3375     *continuation_pc = __ pc();
3376     __ movq(rax, c_rarg1);
3377     __ ret(0);
3378   }
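
       // For reference, a SafeFetch stub is called like an ordinary function;
       // if the load at *fault_pc faults, the signal handler resumes execution
       // at *continuation_pc with the error value still in c_rarg1.
       // Illustrative use:
       //
       //   int v = SafeFetch32(possibly_bad_ptr, -1);  // -1 if the load faults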
3379 
3380   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3381   // to hide instruction latency
3382   //
3383   // Arguments:
3384   //
3385   // Inputs:
3386   //   c_rarg0   - source byte array address
3387   //   c_rarg1   - destination byte array address
3388   //   c_rarg2   - K (key) in little endian int array
3389   //   c_rarg3   - r vector byte array address
3390   //   c_rarg4   - input length
3391   //
3392   // Output:
3393   //   rax       - input length
3394   //
3395   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3396     assert(UseAES, "need AES instructions and misaligned SSE support");
3397     __ align(CodeEntryAlignment);
3398     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3399     address start = __ pc();
3400 
3401     const Register from        = c_rarg0;  // source array address
3402     const Register to          = c_rarg1;  // destination array address
3403     const Register key         = c_rarg2;  // key array address
3404     const Register rvec        = c_rarg3;  // r byte array, initialized from the init-vector
3405                                            // address and left holding the last ciphertext block
3406 #ifndef _WIN64
3407     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3408 #else
3409     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3410     const Register len_reg     = r11;      // pick a volatile Windows register
3411 #endif
3412     const Register pos         = rax;
3413 
3414     const int PARALLEL_FACTOR = 4;
3415     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3416 
3417     Label L_exit;
3418     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3419     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3420     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3421     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3422     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3423 
3424     // keys 0-10 preloaded into xmm5-xmm15
3425     const int XMM_REG_NUM_KEY_FIRST = 5;
3426     const int XMM_REG_NUM_KEY_LAST  = 15;
3427     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3428     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3429 
3430     __ enter(); // required for proper stackwalking of RuntimeStub frame
3431 
3432 #ifdef _WIN64
3433     // on win64, fill len_reg from stack position
3434     __ movl(len_reg, len_mem);
3435 #else
3436     __ push(len_reg); // Save
3437 #endif
3438     __ push(rbx);
3439     // the Java expanded key ordering is rotated one position from what we want,
3440     // so we start at 0x10 here and hit 0x00 last
3441     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3442     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3443     // load up xmm regs 5 thru 14 with keys 0x10 - 0xa0; xmm15 gets key 0x00 below
3444     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3445       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3446       offset += 0x10;
3447     }
3448     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3449 
3450     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3451 
3452     // registers holding the four results in the parallelized loop
3453     const XMMRegister xmm_result0 = xmm0;
3454     const XMMRegister xmm_result1 = xmm2;
3455     const XMMRegister xmm_result2 = xmm3;
3456     const XMMRegister xmm_result3 = xmm4;
3457 
3458     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3459 
3460     __ xorptr(pos, pos);
3461 
3462     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3463     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3464     __ cmpl(rbx, 52);
3465     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3466     __ cmpl(rbx, 60);
3467     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3468 
3469 #define DoFour(opc, src_reg)           \
3470   __ opc(xmm_result0, src_reg);         \
3471   __ opc(xmm_result1, src_reg);         \
3472   __ opc(xmm_result2, src_reg);         \
3473   __ opc(xmm_result3, src_reg);
3474 
3475     for (int k = 0; k < 3; ++k) {
3476       __ BIND(L_multiBlock_loopTopHead[k]);
3477       if (k != 0) {
3478         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3479         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3480       }
3481       if (k == 1) {
3482         __ subptr(rsp, 6 * wordSize);
3483         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3484         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3485         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3486         load_key(xmm1, key, 0xc0);  // 0xc0;
3487         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3488       } else if (k == 2) {
3489         __ subptr(rsp, 10 * wordSize);
3490         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3491         load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3492         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3493         load_key(xmm1, key, 0xe0);  // 0xe0;
3494         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3495         load_key(xmm15, key, 0xb0); // 0xb0;
3496         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3497         load_key(xmm1, key, 0xc0);  // 0xc0;
3498         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3499       }
3500       __ align(OptoLoopAlignment);
3501       __ BIND(L_multiBlock_loopTop[k]);
3502       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3503       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3504 
3505       if  (k != 0) {
3506         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3507         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
3508       }
3509 
3510       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmm_result registers
3511       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3512       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3513       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3514 
3515       DoFour(pxor, xmm_key_first);
3516       if (k == 0) {
3517         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
3518           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3519         }
3520         DoFour(aesdeclast, xmm_key_last);
3521       } else if (k == 1) {
3522         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
3523           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3524         }
3525         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3526         DoFour(aesdec, xmm1);  // key : 0xc0
3527         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3528         DoFour(aesdeclast, xmm_key_last);
3529       } else if (k == 2) {
3530         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
3531           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3532         }
3533         DoFour(aesdec, xmm1);  // key : 0xc0
3534         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
3535         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
3536         DoFour(aesdec, xmm15);  // key : 0xd0
3537         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3538         DoFour(aesdec, xmm1);  // key : 0xe0
3539         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3540         DoFour(aesdeclast, xmm_key_last);
3541       }
3542 
3543       // for each result, xor with the r vector of previous cipher block
3544       __ pxor(xmm_result0, xmm_prev_block_cipher);
3545       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3546       __ pxor(xmm_result1, xmm_prev_block_cipher);
3547       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3548       __ pxor(xmm_result2, xmm_prev_block_cipher);
3549       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3550       __ pxor(xmm_result3, xmm_prev_block_cipher);
3551       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
3552       if (k != 0) {
3553         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
3554       }
3555 
3556       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
3557       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
3558       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
3559       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
3560 
3561       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
3562       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
3563       __ jmp(L_multiBlock_loopTop[k]);
3564 
3565       // registers used in the non-parallelized loops
3566       // xmm register assignments for the loops below
3567       const XMMRegister xmm_result = xmm0;
3568       const XMMRegister xmm_prev_block_cipher_save = xmm2;
3569       const XMMRegister xmm_key11 = xmm3;
3570       const XMMRegister xmm_key12 = xmm4;
3571       const XMMRegister key_tmp = xmm4;
3572 
3573       __ BIND(L_singleBlock_loopTopHead[k]);
3574       if (k == 1) {
3575         __ addptr(rsp, 6 * wordSize);
3576       } else if (k == 2) {
3577         __ addptr(rsp, 10 * wordSize);
3578       }
3579       __ cmpptr(len_reg, 0); // any blocks left?
3580       __ jcc(Assembler::equal, L_exit);
3581       __ BIND(L_singleBlock_loopTopHead2[k]);
3582       if (k == 1) {
3583         load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3584         load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
3585       }
3586       if (k == 2) {
3587         load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
3588       }
3589       __ align(OptoLoopAlignment);
3590       __ BIND(L_singleBlock_loopTop[k]);
3591       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
3592       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
3593       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
3594       for (int rnum = 1; rnum <= 9 ; rnum++) {
3595           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3596       }
3597       if (k == 1) {
3598         __ aesdec(xmm_result, xmm_key11);
3599         __ aesdec(xmm_result, xmm_key12);
3600       }
3601       if (k == 2) {
3602         __ aesdec(xmm_result, xmm_key11);
3603         load_key(key_tmp, key, 0xc0);
3604         __ aesdec(xmm_result, key_tmp);
3605         load_key(key_tmp, key, 0xd0);
3606         __ aesdec(xmm_result, key_tmp);
3607         load_key(key_tmp, key, 0xe0);
3608         __ aesdec(xmm_result, key_tmp);
3609       }
3610 
3611       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always holds the key+0x00 round key
3612       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
3613       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3614       // no need to store r to memory until we exit
3615       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
3616       __ addptr(pos, AESBlockSize);
3617       __ subptr(len_reg, AESBlockSize);
3618       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
3619       if (k != 2) {
3620         __ jmp(L_exit);
3621       }
3622     } //for 128/192/256
3623 
3624     __ BIND(L_exit);
3625     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
3626     __ pop(rbx);
3627 #ifdef _WIN64
3628     __ movl(rax, len_mem);
3629 #else
3630     __ pop(rax); // return length
3631 #endif
3632     __ leave(); // required for proper stackwalking of RuntimeStub frame
3633     __ ret(0);
3634     return start;
3635   }
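
       // For reference, CBC decryption computes
       //
       //   P[i] = AES_Decrypt(C[i]) ^ C[i-1],  with C[-1] = IV (rvec)
       //
       // Unlike encryption, every AES_Decrypt input is available up front, so
       // the stub keeps PARALLEL_FACTOR (4) decryptions in flight to hide the
       // aesdec latency before XOR-ing each result with the preceding
       // ciphertext block.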
3636 
3637   address generate_upper_word_mask() {
3638     __ align(64);
3639     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3640     address start = __ pc();
3641     __ emit_data64(0x0000000000000000, relocInfo::none);
3642     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
3643     return start;
3644   }
3645 
3646   address generate_shuffle_byte_flip_mask() {
3647     __ align(64);
3648     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3649     address start = __ pc();
3650     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3651     __ emit_data64(0x0001020304050607, relocInfo::none);
3652     return start;
3653   }
3654 
3655   // ofs and limit are used for the multi-block byte array.
3656   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3657   address generate_sha1_implCompress(bool multi_block, const char *name) {
3658     __ align(CodeEntryAlignment);
3659     StubCodeMark mark(this, "StubRoutines", name);
3660     address start = __ pc();
3661 
3662     Register buf = c_rarg0;
3663     Register state = c_rarg1;
3664     Register ofs = c_rarg2;
3665     Register limit = c_rarg3;
3666 
3667     const XMMRegister abcd = xmm0;
3668     const XMMRegister e0 = xmm1;
3669     const XMMRegister e1 = xmm2;
3670     const XMMRegister msg0 = xmm3;
3671 
3672     const XMMRegister msg1 = xmm4;
3673     const XMMRegister msg2 = xmm5;
3674     const XMMRegister msg3 = xmm6;
3675     const XMMRegister shuf_mask = xmm7;
3676 
3677     __ enter();
3678 
3679     __ subptr(rsp, 4 * wordSize);
3680 
3681     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3682       buf, state, ofs, limit, rsp, multi_block);
3683 
3684     __ addptr(rsp, 4 * wordSize);
3685 
3686     __ leave();
3687     __ ret(0);
3688     return start;
3689   }
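
       // For reference (a sketch of the contract, assuming the standard
       // DigestBase convention): in the multi_block variant, the generated
       // loop consumes successive 64-byte blocks, advancing ofs until it
       // passes limit, and returns the updated offset as
       // implCompressMultiBlock's result.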
3690 
3691   address generate_pshuffle_byte_flip_mask() {
3692     __ align(64);
3693     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3694     address start = __ pc();
3695     __ emit_data64(0x0405060700010203, relocInfo::none);
3696     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3697 
3698     if (VM_Version::supports_avx2()) {
3699       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
3700       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3701       // _SHUF_00BA
3702       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3703       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3704       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3705       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3706       // _SHUF_DC00
3707       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3708       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3709       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3710       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3711     }
3712 
3713     return start;
3714   }
3715 
3716   // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
3717   address generate_pshuffle_byte_flip_mask_sha512() {
3718     __ align(32);
3719     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
3720     address start = __ pc();
3721     if (VM_Version::supports_avx2()) {
3722       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
3723       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3724       __ emit_data64(0x1011121314151617, relocInfo::none);
3725       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
3726       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
3727       __ emit_data64(0x0000000000000000, relocInfo::none);
3728       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3729       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3730     }
3731 
3732     return start;
3733   }
3734 
3735   // ofs and limit are used for the multi-block byte array.
3736   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3737   address generate_sha256_implCompress(bool multi_block, const char *name) {
3738     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
3739     __ align(CodeEntryAlignment);
3740     StubCodeMark mark(this, "StubRoutines", name);
3741     address start = __ pc();
3742 
3743     Register buf = c_rarg0;
3744     Register state = c_rarg1;
3745     Register ofs = c_rarg2;
3746     Register limit = c_rarg3;
3747 
3748     const XMMRegister msg = xmm0;
3749     const XMMRegister state0 = xmm1;
3750     const XMMRegister state1 = xmm2;
3751     const XMMRegister msgtmp0 = xmm3;
3752 
3753     const XMMRegister msgtmp1 = xmm4;
3754     const XMMRegister msgtmp2 = xmm5;
3755     const XMMRegister msgtmp3 = xmm6;
3756     const XMMRegister msgtmp4 = xmm7;
3757 
3758     const XMMRegister shuf_mask = xmm8;
3759 
3760     __ enter();
3761 
3762     __ subptr(rsp, 4 * wordSize);
3763 
3764     if (VM_Version::supports_sha()) {
3765       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3766         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3767     } else if (VM_Version::supports_avx2()) {
3768       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3769         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3770     }
3771     __ addptr(rsp, 4 * wordSize);
3772     __ vzeroupper();
3773     __ leave();
3774     __ ret(0);
3775     return start;
3776   }
3777 
3778   address generate_sha512_implCompress(bool multi_block, const char *name) {
3779     assert(VM_Version::supports_avx2(), "");
3780     assert(VM_Version::supports_bmi2(), "");
3781     __ align(CodeEntryAlignment);
3782     StubCodeMark mark(this, "StubRoutines", name);
3783     address start = __ pc();
3784 
3785     Register buf = c_rarg0;
3786     Register state = c_rarg1;
3787     Register ofs = c_rarg2;
3788     Register limit = c_rarg3;
3789 
3790     const XMMRegister msg = xmm0;
3791     const XMMRegister state0 = xmm1;
3792     const XMMRegister state1 = xmm2;
3793     const XMMRegister msgtmp0 = xmm3;
3794     const XMMRegister msgtmp1 = xmm4;
3795     const XMMRegister msgtmp2 = xmm5;
3796     const XMMRegister msgtmp3 = xmm6;
3797     const XMMRegister msgtmp4 = xmm7;
3798 
3799     const XMMRegister shuf_mask = xmm8;
3800 
3801     __ enter();
3802 
3803     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3804     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3805 
3806     __ vzeroupper();
3807     __ leave();
3808     __ ret(0);
3809     return start;
3810   }
3811 
3812   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
3813   // to hide instruction latency
3814   //
3815   // Arguments:
3816   //
3817   // Inputs:
3818   //   c_rarg0   - source byte array address
3819   //   c_rarg1   - destination byte array address
3820   //   c_rarg2   - K (key) in little endian int array
3821   //   c_rarg3   - counter vector byte array address
3822   //   Linux
3823   //     c_rarg4   -          input length
3824   //     c_rarg5   -          saved encryptedCounter start
3825   //     rbp + 6 * wordSize - saved used length
3826   //   Windows
3827   //     rbp + 6 * wordSize - input length
3828   //     rbp + 7 * wordSize - saved encryptedCounter start
3829   //     rbp + 8 * wordSize - saved used length
3830   //
3831   // Output:
3832   //   rax       - input length
3833   //
3834   address generate_counterMode_AESCrypt_Parallel() {
3835     assert(UseAES, "need AES instructions and misaligned SSE support");
3836     __ align(CodeEntryAlignment);
3837     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3838     address start = __ pc();
3839     const Register from = c_rarg0; // source array address
3840     const Register to = c_rarg1; // destination array address
3841     const Register key = c_rarg2; // key array address
3842     const Register counter = c_rarg3; // counter byte array, initialized from the counter
3843                                       // address and updated with the incremented counter on exit
3844 #ifndef _WIN64
3845     const Register len_reg = c_rarg4;
3846     const Register saved_encCounter_start = c_rarg5;
3847     const Register used_addr = r10;
3848     const Address  used_mem(rbp, 2 * wordSize);
3849     const Register used = r11;
3850 #else
3851     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
3852     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter start is on stack on Win64
3853     const Address used_mem(rbp, 8 * wordSize); // saved used length is on stack on Win64
3854     const Register len_reg = r10; // pick the first volatile Windows register
3855     const Register saved_encCounter_start = r11;
3856     const Register used_addr = r13;
3857     const Register used = r14;
3858 #endif
3859     const Register pos = rax;
3860 
3861     const int PARALLEL_FACTOR = 6;
3862     const XMMRegister xmm_counter_shuf_mask = xmm0;
3863     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
3864     const XMMRegister xmm_curr_counter = xmm2;
3865 
3866     const XMMRegister xmm_key_tmp0 = xmm3;
3867     const XMMRegister xmm_key_tmp1 = xmm4;
3868 
3869     // registers holding the four results in the parallelized loop
3870     const XMMRegister xmm_result0 = xmm5;
3871     const XMMRegister xmm_result1 = xmm6;
3872     const XMMRegister xmm_result2 = xmm7;
3873     const XMMRegister xmm_result3 = xmm8;
3874     const XMMRegister xmm_result4 = xmm9;
3875     const XMMRegister xmm_result5 = xmm10;
3876 
3877     const XMMRegister xmm_from0 = xmm11;
3878     const XMMRegister xmm_from1 = xmm12;
3879     const XMMRegister xmm_from2 = xmm13;
3880     const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on Win64
3881     const XMMRegister xmm_from4 = xmm3;  // reuse xmm3-xmm4: xmm_key_tmp0/1 are not needed while loading the input text
3882     const XMMRegister xmm_from5 = xmm4;
3883 
3884     // for key_128, key_192, key_256
3885     const int rounds[3] = {10, 12, 14};
3886     Label L_exit_preLoop, L_preLoop_start;
3887     Label L_multiBlock_loopTop[3];
3888     Label L_singleBlockLoopTop[3];
3889     Label L__incCounter[3][6]; // for 6 blocks
3890     Label L__incCounter_single[3]; // for a single block; key128, key192, key256
3891     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
3892     Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
3893 
3894     Label L_exit;
3895 
3896     __ enter(); // required for proper stackwalking of RuntimeStub frame
3897 
3898 #ifdef _WIN64
3899     // allocate spill slots for r13, r14
3900     enum {
3901         saved_r13_offset,
3902         saved_r14_offset
3903     };
3904     __ subptr(rsp, 2 * wordSize);
3905     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
3906     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
3907 
3908     // on win64, fill len_reg from stack position
3909     __ movl(len_reg, len_mem);
3910     __ movptr(saved_encCounter_start, saved_encCounter_mem);
3911     __ movptr(used_addr, used_mem);
3912     __ movl(used, Address(used_addr, 0));
3913 #else
3914     __ push(len_reg); // Save
3915     __ movptr(used_addr, used_mem);
3916     __ movl(used, Address(used_addr, 0));
3917 #endif
3918 
3919     __ push(rbx); // Save RBX
3920     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
3921     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
3922     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // counter is shuffled
3923     __ movptr(pos, 0);
3924 
3925     // Use the partially consumed encrypted counter from the last invocation
3926     __ BIND(L_preLoop_start);
3927     __ cmpptr(used, 16);
3928     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
3929       __ cmpptr(len_reg, 0);
3930       __ jcc(Assembler::lessEqual, L_exit_preLoop);
3931       __ movb(rbx, Address(saved_encCounter_start, used));
3932       __ xorb(rbx, Address(from, pos));
3933       __ movb(Address(to, pos), rbx);
3934       __ addptr(pos, 1);
3935       __ addptr(used, 1);
3936       __ subptr(len_reg, 1);
3937 
3938     __ jmp(L_preLoop_start);
3939 
3940     __ BIND(L_exit_preLoop);
3941     __ movl(Address(used_addr, 0), used);
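
         // The pre-loop above consumes keystream bytes left over from the
         // previous invocation one byte at a time; roughly (a C sketch, not
         // generated code):
         //
         //   while (used < 16 && len > 0) {
         //     *to++ = *from++ ^ saved_encCounter[used++];
         //     len--;
         //   }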
3942 
3943     // key length can only be {11, 13, 15} * 4 = {44, 52, 60}
3944     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
3945     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3946     __ cmpl(rbx, 52);
3947     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
3948     __ cmpl(rbx, 60);
3949     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
3950 
3951 #define CTR_DoSix(opc, src_reg)                \
3952     __ opc(xmm_result0, src_reg);              \
3953     __ opc(xmm_result1, src_reg);              \
3954     __ opc(xmm_result2, src_reg);              \
3955     __ opc(xmm_result3, src_reg);              \
3956     __ opc(xmm_result4, src_reg);              \
3957     __ opc(xmm_result5, src_reg);
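
         // For reference, CTR mode turns AES into a stream cipher:
         //
         //   keystream[i] = AES_Encrypt(counter + i)
         //   out[i]       = in[i] ^ keystream[i]
         //
         // The counter encryptions are independent across blocks, which is why
         // the multi-block loop below keeps PARALLEL_FACTOR (6) of them in
         // flight.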
3958 
3959     // k == 0 :  generate code for key_128
3960     // k == 1 :  generate code for key_192
3961     // k == 2 :  generate code for key_256
3962     for (int k = 0; k < 3; ++k) {
3963       // multi-block loop starts here
3964       __ align(OptoLoopAlignment);
3965       __ BIND(L_multiBlock_loopTop[k]);
3966       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
3967       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
3968       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
3969 
3970       // load, then increment the counters
3971       CTR_DoSix(movdqa, xmm_curr_counter);
3972       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
3973       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
3974       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
3975       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
3976       inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
3977       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
3978       CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle the counters back for PXOR
3979       CTR_DoSix(pxor, xmm_key_tmp0);   // PXOR with the round 0 key
3980 
3981       // load two round keys at a time
3982       for (int i = 1; i < rounds[k]; ) {
3983         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
3984         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
3985         CTR_DoSix(aesenc, xmm_key_tmp1);
3986         i++;
3987         if (i != rounds[k]) {
3988           CTR_DoSix(aesenc, xmm_key_tmp0);
3989         } else {
3990           CTR_DoSix(aesenclast, xmm_key_tmp0);
3991         }
3992         i++;
3993       }
3994 
3995       // get next PARALLEL_FACTOR blocks into xmm_result registers
3996       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3997       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3998       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3999       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4000       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4001       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4002 
4003       __ pxor(xmm_result0, xmm_from0);
4004       __ pxor(xmm_result1, xmm_from1);
4005       __ pxor(xmm_result2, xmm_from2);
4006       __ pxor(xmm_result3, xmm_from3);
4007       __ pxor(xmm_result4, xmm_from4);
4008       __ pxor(xmm_result5, xmm_from5);
4009 
4010       // store the 6 results into the next 96 bytes of output
4011       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4012       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4013       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4014       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4015       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4016       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4017 
4018       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance past the 6 blocks just processed
4019       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4020       __ jmp(L_multiBlock_loopTop[k]);
4021 
4022       // single-block loop starts here
4023       __ align(OptoLoopAlignment);
4024       __ BIND(L_singleBlockLoopTop[k]);
4025       __ cmpptr(len_reg, 0);
4026       __ jcc(Assembler::lessEqual, L_exit);
4027       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4028       __ movdqa(xmm_result0, xmm_curr_counter);
4029       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4030       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4031       __ pxor(xmm_result0, xmm_key_tmp0);
4032       for (int i = 1; i < rounds[k]; i++) {
4033         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4034         __ aesenc(xmm_result0, xmm_key_tmp0);
4035       }
4036       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4037       __ aesenclast(xmm_result0, xmm_key_tmp0);
4038       __ cmpptr(len_reg, AESBlockSize);
4039       __ jcc(Assembler::less, L_processTail_insr[k]);
4040         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4041         __ pxor(xmm_result0, xmm_from0);
4042         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4043         __ addptr(pos, AESBlockSize);
4044         __ subptr(len_reg, AESBlockSize);
4045         __ jmp(L_singleBlockLoopTop[k]);
4046       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4047         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4048         __ testptr(len_reg, 8);
4049         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4050           __ subptr(pos,8);
4051           __ pinsrq(xmm_from0, Address(from, pos), 0);
4052         __ BIND(L_processTail_4_insr[k]);
4053         __ testptr(len_reg, 4);
4054         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4055           __ subptr(pos,4);
4056           __ pslldq(xmm_from0, 4);
4057           __ pinsrd(xmm_from0, Address(from, pos), 0);
4058         __ BIND(L_processTail_2_insr[k]);
4059         __ testptr(len_reg, 2);
4060         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4061           __ subptr(pos, 2);
4062           __ pslldq(xmm_from0, 2);
4063           __ pinsrw(xmm_from0, Address(from, pos), 0);
4064         __ BIND(L_processTail_1_insr[k]);
4065         __ testptr(len_reg, 1);
4066         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4067           __ subptr(pos, 1);
4068           __ pslldq(xmm_from0, 1);
4069           __ pinsrb(xmm_from0, Address(from, pos), 0);
4070         __ BIND(L_processTail_exit_insr[k]);
4071 
4072         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. XOR the encrypted counter with the plaintext bytes.
4073         __ pxor(xmm_result0, xmm_from0);                             //    The encrypted counter is also saved for the next invocation.
4074 
4075         __ testptr(len_reg, 8);
4076         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4077           __ pextrq(Address(to, pos), xmm_result0, 0);
4078           __ psrldq(xmm_result0, 8);
4079           __ addptr(pos, 8);
4080         __ BIND(L_processTail_4_extr[k]);
4081         __ testptr(len_reg, 4);
4082         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4083           __ pextrd(Address(to, pos), xmm_result0, 0);
4084           __ psrldq(xmm_result0, 4);
4085           __ addptr(pos, 4);
4086         __ BIND(L_processTail_2_extr[k]);
4087         __ testptr(len_reg, 2);
4088         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4089           __ pextrw(Address(to, pos), xmm_result0, 0);
4090           __ psrldq(xmm_result0, 2);
4091           __ addptr(pos, 2);
4092         __ BIND(L_processTail_1_extr[k]);
4093         __ testptr(len_reg, 1);
4094         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4095           __ pextrb(Address(to, pos), xmm_result0, 0);
4096 
4097         __ BIND(L_processTail_exit_extr[k]);
4098         __ movl(Address(used_addr, 0), len_reg);
4099         __ jmp(L_exit);
4100 
4101     }
4102 
4103     __ BIND(L_exit);
4104     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // shuffle the counter back into its original byte order
4105     __ movdqu(Address(counter, 0), xmm_curr_counter); // save the counter for the next invocation
4106     __ pop(rbx); // pop the saved RBX.
4107 #ifdef _WIN64
4108     __ movl(rax, len_mem);
4109     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4110     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4111     __ addptr(rsp, 2 * wordSize);
4112 #else
4113     __ pop(rax); // return 'len'
4114 #endif
4115     __ leave(); // required for proper stackwalking of RuntimeStub frame
4116     __ ret(0);
4117     return start;
4118   }
4119 
4120 void roundDec(XMMRegister xmm_reg) {
4121   __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4122   __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4123   __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4124   __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4125   __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4126   __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4127   __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4128   __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4129 }
4130 
4131 void roundDeclast(XMMRegister xmm_reg) {
4132   __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4133   __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4134   __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4135   __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4136   __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4137   __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4138   __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4139   __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4140 }
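     // At AVX_512bit each ZMM register holds four 16-byte blocks, so one call to
     // roundDec/roundDeclast advances one AES round for 8 * 4 = 32 blocks -- the
     // 512 bytes consumed per iteration of the main decrypt loop below.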
4141 
4142   void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
4143     __ movdqu(xmmdst, Address(key, offset));
4144     if (xmm_shuf_mask != NULL) {
4145       __ pshufb(xmmdst, xmm_shuf_mask);
4146     } else {
4147       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4148     }
4149     __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
4151   }
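       // ev_load_key loads one 128-bit round key, shuffles its bytes into the
       // order the AES instructions expect, and then broadcasts it: evshufi64x2
       // with imm8 0x0 replicates the low 128-bit lane across all four lanes of
       // the destination ZMM register.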
4152 
4153 address generate_cipherBlockChaining_decryptVectorAESCrypt() {
4154     assert(VM_Version::supports_vaes(), "need vector AES (VAES) instruction support");
4155     __ align(CodeEntryAlignment);
4156     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4157     address start = __ pc();
4158 
4159     const Register from = c_rarg0;  // source array address
4160     const Register to = c_rarg1;  // destination array address
4161     const Register key = c_rarg2;  // key array address
4162     const Register rvec = c_rarg3;  // r byte array initialized from initvector array address
4163     // and left with the last ciphertext block (the chaining value for the next call)
4164 #ifndef _WIN64
4165     const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
4166 #else
4167     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
4168     const Register len_reg = r11;      // pick the volatile windows register
4169 #endif
4170 
4171     Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
4172           Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
4173 
4174     __ enter();
4175 
4176 #ifdef _WIN64
4177     // on Win64, fill len_reg from the stack slot
4178     __ movl(len_reg, len_mem);
4179 #else
4180     __ push(len_reg); // Save
4181 #endif
4182     __ push(rbx);
4183     __ vzeroupper();
4184 
4185     // Temporary variable declaration for swapping key bytes
4186     const XMMRegister xmm_key_shuf_mask = xmm1;
4187     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4188 
4189     // Determine the number of rounds from the expanded key length (in ints): 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds
4190     const Register rounds = rbx;
4191     __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4192 
4193     const XMMRegister IV = xmm0;
4194     // Load IV and broadcast value to 512-bits
4195     __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
4196 
4197     // Temporary variables for storing round keys
4198     const XMMRegister RK0 = xmm30;
4199     const XMMRegister RK1 = xmm9;
4200     const XMMRegister RK2 = xmm18;
4201     const XMMRegister RK3 = xmm19;
4202     const XMMRegister RK4 = xmm20;
4203     const XMMRegister RK5 = xmm21;
4204     const XMMRegister RK6 = xmm22;
4205     const XMMRegister RK7 = xmm23;
4206     const XMMRegister RK8 = xmm24;
4207     const XMMRegister RK9 = xmm25;
4208     const XMMRegister RK10 = xmm26;
4209 
4210     // Load and shuffle the round keys.
4211     // The Java expanded key ordering is rotated one position from what we want,
4212     // so we start from 1*16 here and hit 0*16 last.
4213     ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
4214     ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
4215     ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
4216     ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
4217     ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
4218     ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
4219     ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
4220     ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
4221     ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
4222     ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
4223     ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
4224 
4225     // Variables for storing source cipher text
4226     const XMMRegister S0 = xmm10;
4227     const XMMRegister S1 = xmm11;
4228     const XMMRegister S2 = xmm12;
4229     const XMMRegister S3 = xmm13;
4230     const XMMRegister S4 = xmm14;
4231     const XMMRegister S5 = xmm15;
4232     const XMMRegister S6 = xmm16;
4233     const XMMRegister S7 = xmm17;
4234 
4235     // Variables for storing decrypted text
4236     const XMMRegister B0 = xmm1;
4237     const XMMRegister B1 = xmm2;
4238     const XMMRegister B2 = xmm3;
4239     const XMMRegister B3 = xmm4;
4240     const XMMRegister B4 = xmm5;
4241     const XMMRegister B5 = xmm6;
4242     const XMMRegister B6 = xmm7;
4243     const XMMRegister B7 = xmm8;
4244 
4245     __ cmpl(rounds, 44);
4246     __ jcc(Assembler::greater, KEY_192);
4247     __ jmp(Loop);
4248 
4249     __ BIND(KEY_192);
4250     const XMMRegister RK11 = xmm27;
4251     const XMMRegister RK12 = xmm28;
4252     ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
4253     ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
4254 
4255     __ cmpl(rounds, 52);
4256     __ jcc(Assembler::greater, KEY_256);
4257     __ jmp(Loop);
4258 
4259     __ BIND(KEY_256);
4260     const XMMRegister RK13 = xmm29;
4261     const XMMRegister RK14 = xmm31;
4262     ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
4263     ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
4264 
4265     __ BIND(Loop);
4266     __ cmpl(len_reg, 512);
4267     __ jcc(Assembler::below, Lcbc_dec_rem);
4268     __ BIND(Loop1);
4269     __ subl(len_reg, 512);
4270     __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
4271     __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
4272     __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
4273     __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
4274     __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
4275     __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
4276     __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
4277     __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
4278     __ leaq(from, Address(from, 8 * 64));
4279 
4280     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4281     __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
4282     __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
4283     __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
4284     __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
4285     __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
4286     __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
4287     __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
4288 
4289     __ evalignq(IV, S0, IV, 0x06);
4290     __ evalignq(S0, S1, S0, 0x06);
4291     __ evalignq(S1, S2, S1, 0x06);
4292     __ evalignq(S2, S3, S2, 0x06);
4293     __ evalignq(S3, S4, S3, 0x06);
4294     __ evalignq(S4, S5, S4, 0x06);
4295     __ evalignq(S5, S6, S5, 0x06);
4296     __ evalignq(S6, S7, S6, 0x06);
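         // evalignq concatenates its two sources and shifts right by six
         // quadwords, so each destination ends up holding the last block of the
         // older vector followed by the first three blocks of the newer one --
         // lane for lane, the preceding ciphertext blocks (the CBC chaining
         // values) that are XORed in at Loop2 after the last decryption round.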
4297 
4298     roundDec(RK2);
4299     roundDec(RK3);
4300     roundDec(RK4);
4301     roundDec(RK5);
4302     roundDec(RK6);
4303     roundDec(RK7);
4304     roundDec(RK8);
4305     roundDec(RK9);
4306     roundDec(RK10);
4307 
4308     __ cmpl(rounds, 44);
4309     __ jcc(Assembler::belowEqual, L_128);
4310     roundDec(RK11);
4311     roundDec(RK12);
4312 
4313     __ cmpl(rounds, 52);
4314     __ jcc(Assembler::belowEqual, L_192);
4315     roundDec(RK13);
4316     roundDec(RK14);
4317 
4318     __ BIND(L_256);
4319     roundDeclast(RK0);
4320     __ jmp(Loop2);
4321 
4322     __ BIND(L_128);
4323     roundDeclast(RK0);
4324     __ jmp(Loop2);
4325 
4326     __ BIND(L_192);
4327     roundDeclast(RK0);
4328 
4329     __ BIND(Loop2);
4330     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
4331     __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
4332     __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
4333     __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
4334     __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
4335     __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
4336     __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
4337     __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
4338     __ evmovdquq(IV, S7, Assembler::AVX_512bit);
4339 
4340     __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
4341     __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
4342     __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
4343     __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
4344     __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
4345     __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
4346     __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
4347     __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
4348     __ leaq(to, Address(to, 8 * 64));
4349     __ jmp(Loop);
4350 
4351     __ BIND(Lcbc_dec_rem);
4352     __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
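         // imm8 0x03 moves the top 128-bit lane (the last ciphertext block kept
         // as the carry-out of the vector loop) down into lane 0, where the
         // 16-bytes-at-a-time remainder loop below uses it as the chaining value.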
4353 
4354     __ BIND(Lcbc_dec_rem_loop);
4355     __ subl(len_reg, 16);
4356     __ jcc(Assembler::carrySet, Lcbc_dec_ret);
4357 
4358     __ movdqu(S0, Address(from, 0));
4359     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4360     __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
4361     __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
4362     __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
4363     __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
4364     __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
4365     __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
4366     __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
4367     __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
4368     __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
4369     __ cmpl(rounds, 44);
4370     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4371 
4372     __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
4373     __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
4374     __ cmpl(rounds, 52);
4375     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4376 
4377     __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
4378     __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
4379 
4380     __ BIND(Lcbc_dec_rem_last);
4381     __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
4382 
4383     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
4384     __ evmovdquq(IV, S0, Assembler::AVX_512bit);
4385     __ movdqu(Address(to, 0), B0);
4386     __ leaq(from, Address(from, 16));
4387     __ leaq(to, Address(to, 16));
4388     __ jmp(Lcbc_dec_rem_loop);
4389 
4390     __ BIND(Lcbc_dec_ret);
4391     __ movdqu(Address(rvec, 0), IV);
4392 
4393     // Zero out the round keys
4394     __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
4395     __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
4396     __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
4397     __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
4398     __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
4399     __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
4400     __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
4401     __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
4402     __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
4403     __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
4404     __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
4405     __ cmpl(rounds, 44);
4406     __ jcc(Assembler::belowEqual, Lcbc_exit);
4407     __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
4408     __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
4409     __ cmpl(rounds, 52);
4410     __ jcc(Assembler::belowEqual, Lcbc_exit);
4411     __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
4412     __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
4413 
4414     __ BIND(Lcbc_exit);
4415     __ pop(rbx);
4416 #ifdef _WIN64
4417     __ movl(rax, len_mem);
4418 #else
4419     __ pop(rax); // return length
4420 #endif
4421     __ leave(); // required for proper stackwalking of RuntimeStub frame
4422     __ ret(0);
4423     return start;
4424 }
4425 
4426 // Polynomial x^128+x^127+x^126+x^121+1
4427 address ghash_polynomial_addr() {
4428     __ align(CodeEntryAlignment);
4429     StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
4430     address start = __ pc();
4431     __ emit_data64(0x0000000000000001, relocInfo::none);
4432     __ emit_data64(0xc200000000000000, relocInfo::none);
4433     return start;
4434 }
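     // The two quadwords are emitted least-significant first, forming the 128-bit
     // constant 0xC2000000000000000000000000000001 -- the bit-reflected form of
     // the polynomial above -- as used by carry-less-multiplication GHASH
     // reductions (here presumably consumed by MacroAssembler::avx_ghash).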
4435 
4436 address ghash_shufflemask_addr() {
4437     __ align(CodeEntryAlignment);
4438     StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
4439     address start = __ pc();
4440     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
4441     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
4442     return start;
4443 }
4444 
4445 // GHASH single- and multi-block operations using AVX instructions
4446 address generate_avx_ghash_processBlocks() {
4447     __ align(CodeEntryAlignment);
4448 
4449     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4450     address start = __ pc();
4451 
4452     // arguments
4453     const Register state = c_rarg0;
4454     const Register htbl = c_rarg1;
4455     const Register data = c_rarg2;
4456     const Register blocks = c_rarg3;
4457     __ enter();
4458     // All of the GHASH work happens inside MacroAssembler::avx_ghash
4459     __ avx_ghash(state, htbl, data, blocks);
4460     __ leave(); // required for proper stackwalking of RuntimeStub frame
4461     __ ret(0);
4462     return start;
4463 }
4464 
4465   // byte swap x86 long
4466   address generate_ghash_long_swap_mask() {
4467     __ align(CodeEntryAlignment);
4468     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
4469     address start = __ pc();
4470     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
4471     __ emit_data64(0x0706050403020100, relocInfo::none );
4472   return start;
4473   }
4474 
4475   // byte swap x86 byte array
4476   address generate_ghash_byte_swap_mask() {
4477     __ align(CodeEntryAlignment);
4478     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
4479     address start = __ pc();
4480     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
4481     __ emit_data64(0x0001020304050607, relocInfo::none );
4482   return start;
4483   }
4484 
4485   /* Single and multi-block ghash operations */
4486   address generate_ghash_processBlocks() {
4487     __ align(CodeEntryAlignment);
4488     Label L_ghash_loop, L_exit;
4489     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4490     address start = __ pc();
4491 
4492     const Register state        = c_rarg0;
4493     const Register subkeyH      = c_rarg1;
4494     const Register data         = c_rarg2;
4495     const Register blocks       = c_rarg3;
4496 
4497     const XMMRegister xmm_temp0 = xmm0;
4498     const XMMRegister xmm_temp1 = xmm1;
4499     const XMMRegister xmm_temp2 = xmm2;
4500     const XMMRegister xmm_temp3 = xmm3;
4501     const XMMRegister xmm_temp4 = xmm4;
4502     const XMMRegister xmm_temp5 = xmm5;
4503     const XMMRegister xmm_temp6 = xmm6;
4504     const XMMRegister xmm_temp7 = xmm7;
4505     const XMMRegister xmm_temp8 = xmm8;
4506     const XMMRegister xmm_temp9 = xmm9;
4507     const XMMRegister xmm_temp10 = xmm10;
4508 
4509     __ enter();
4510 
4511     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
4512 
4513     __ movdqu(xmm_temp0, Address(state, 0));
4514     __ pshufb(xmm_temp0, xmm_temp10);
4515 
4517     __ BIND(L_ghash_loop);
4518     __ movdqu(xmm_temp2, Address(data, 0));
4519     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
4520 
4521     __ movdqu(xmm_temp1, Address(subkeyH, 0));
4522     __ pshufb(xmm_temp1, xmm_temp10);
4523 
4524     __ pxor(xmm_temp0, xmm_temp2);
4525 
4526     //
4527     // Multiply with the hash key
4528     //
4529     __ movdqu(xmm_temp3, xmm_temp0);
4530     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
4531     __ movdqu(xmm_temp4, xmm_temp0);
4532     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
4533 
4534     __ movdqu(xmm_temp5, xmm_temp0);
4535     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
4536     __ movdqu(xmm_temp6, xmm_temp0);
4537     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
4538 
4539     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
4540 
4541     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
4542     __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
4543     __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
4544     __ pxor(xmm_temp3, xmm_temp5);
4545     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
4546                                         // of the carry-less multiplication of
4547                                         // xmm0 by xmm1.
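         // Schoolbook carry-less multiply: with a = a1:a0 and b = b1:b0 as 64-bit
         // halves, a*b = (a1*b1 << 128) ^ ((a1*b0 ^ a0*b1) << 64) ^ (a0*b0). The
         // middle term was split by the psrldq/pslldq pair above and folded into
         // the low (xmm3) and high (xmm6) halves of the 256-bit product.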
4548 
4549     // We shift the result of the multiplication by one bit position
4550     // to the left to account for the fact that the bits are reversed.
4551     __ movdqu(xmm_temp7, xmm_temp3);
4552     __ movdqu(xmm_temp8, xmm_temp6);
4553     __ pslld(xmm_temp3, 1);
4554     __ pslld(xmm_temp6, 1);
4555     __ psrld(xmm_temp7, 31);
4556     __ psrld(xmm_temp8, 31);
4557     __ movdqu(xmm_temp9, xmm_temp7);
4558     __ pslldq(xmm_temp8, 4);
4559     __ pslldq(xmm_temp7, 4);
4560     __ psrldq(xmm_temp9, 12);
4561     __ por(xmm_temp3, xmm_temp7);
4562     __ por(xmm_temp6, xmm_temp8);
4563     __ por(xmm_temp6, xmm_temp9);
4564 
4565     //
4566     // First phase of the reduction
4567     //
4568     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
4569     // independently.
4570     __ movdqu(xmm_temp7, xmm_temp3);
4571     __ movdqu(xmm_temp8, xmm_temp3);
4572     __ movdqu(xmm_temp9, xmm_temp3);
4573     __ pslld(xmm_temp7, 31);    // packed left shift, << 31
4574     __ pslld(xmm_temp8, 30);    // packed left shift, << 30
4575     __ pslld(xmm_temp9, 25);    // packed left shift, << 25
4576     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
4577     __ pxor(xmm_temp7, xmm_temp9);
4578     __ movdqu(xmm_temp8, xmm_temp7);
4579     __ pslldq(xmm_temp7, 12);
4580     __ psrldq(xmm_temp8, 4);
4581     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
4582 
4583     //
4584     // Second phase of the reduction
4585     //
4586     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
4587     // shift operations.
4588     __ movdqu(xmm_temp2, xmm_temp3);
4589     __ movdqu(xmm_temp4, xmm_temp3);
4590     __ movdqu(xmm_temp5, xmm_temp3);
4591     __ psrld(xmm_temp2, 1);     // packed right shift, >> 1
4592     __ psrld(xmm_temp4, 2);     // packed right shift, >> 2
4593     __ psrld(xmm_temp5, 7);     // packed right shift, >> 7
4594     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
4595     __ pxor(xmm_temp2, xmm_temp5);
4596     __ pxor(xmm_temp2, xmm_temp8);
4597     __ pxor(xmm_temp3, xmm_temp2);
4598     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
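         // Taken together, the two phases reduce the 256-bit product modulo the
         // polynomial: the left shifts by 31, 30 and 25 and the right shifts by
         // 1, 2 and 7 correspond to its x^127, x^126 and x^121 terms, folding the
         // high 128 bits of the product back into the low 128 bits.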
4599 
4600     __ decrement(blocks);
4601     __ jcc(Assembler::zero, L_exit);
4602     __ movdqu(xmm_temp0, xmm_temp6);
4603     __ addptr(data, 16);
4604     __ jmp(L_ghash_loop);
4605 
4606     __ BIND(L_exit);
4607     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
4608     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
4609     __ leave();
4610     __ ret(0);
4611     return start;
4612   }
4613 
4614   //base64 character set
4615   address base64_charset_addr() {
4616     __ align(CodeEntryAlignment);
4617     StubCodeMark mark(this, "StubRoutines", "base64_charset");
4618     address start = __ pc();
4619     __ emit_data64(0x0000004200000041, relocInfo::none);
4620     __ emit_data64(0x0000004400000043, relocInfo::none);
4621     __ emit_data64(0x0000004600000045, relocInfo::none);
4622     __ emit_data64(0x0000004800000047, relocInfo::none);
4623     __ emit_data64(0x0000004a00000049, relocInfo::none);
4624     __ emit_data64(0x0000004c0000004b, relocInfo::none);
4625     __ emit_data64(0x0000004e0000004d, relocInfo::none);
4626     __ emit_data64(0x000000500000004f, relocInfo::none);
4627     __ emit_data64(0x0000005200000051, relocInfo::none);
4628     __ emit_data64(0x0000005400000053, relocInfo::none);
4629     __ emit_data64(0x0000005600000055, relocInfo::none);
4630     __ emit_data64(0x0000005800000057, relocInfo::none);
4631     __ emit_data64(0x0000005a00000059, relocInfo::none);
4632     __ emit_data64(0x0000006200000061, relocInfo::none);
4633     __ emit_data64(0x0000006400000063, relocInfo::none);
4634     __ emit_data64(0x0000006600000065, relocInfo::none);
4635     __ emit_data64(0x0000006800000067, relocInfo::none);
4636     __ emit_data64(0x0000006a00000069, relocInfo::none);
4637     __ emit_data64(0x0000006c0000006b, relocInfo::none);
4638     __ emit_data64(0x0000006e0000006d, relocInfo::none);
4639     __ emit_data64(0x000000700000006f, relocInfo::none);
4640     __ emit_data64(0x0000007200000071, relocInfo::none);
4641     __ emit_data64(0x0000007400000073, relocInfo::none);
4642     __ emit_data64(0x0000007600000075, relocInfo::none);
4643     __ emit_data64(0x0000007800000077, relocInfo::none);
4644     __ emit_data64(0x0000007a00000079, relocInfo::none);
4645     __ emit_data64(0x0000003100000030, relocInfo::none);
4646     __ emit_data64(0x0000003300000032, relocInfo::none);
4647     __ emit_data64(0x0000003500000034, relocInfo::none);
4648     __ emit_data64(0x0000003700000036, relocInfo::none);
4649     __ emit_data64(0x0000003900000038, relocInfo::none);
4650     __ emit_data64(0x0000002f0000002b, relocInfo::none);
4651     return start;
4652   }
4653 
4654   //base64 url character set
4655   address base64url_charset_addr() {
4656     __ align(CodeEntryAlignment);
4657     StubCodeMark mark(this, "StubRoutines", "base64url_charset");
4658     address start = __ pc();
4659     __ emit_data64(0x0000004200000041, relocInfo::none);
4660     __ emit_data64(0x0000004400000043, relocInfo::none);
4661     __ emit_data64(0x0000004600000045, relocInfo::none);
4662     __ emit_data64(0x0000004800000047, relocInfo::none);
4663     __ emit_data64(0x0000004a00000049, relocInfo::none);
4664     __ emit_data64(0x0000004c0000004b, relocInfo::none);
4665     __ emit_data64(0x0000004e0000004d, relocInfo::none);
4666     __ emit_data64(0x000000500000004f, relocInfo::none);
4667     __ emit_data64(0x0000005200000051, relocInfo::none);
4668     __ emit_data64(0x0000005400000053, relocInfo::none);
4669     __ emit_data64(0x0000005600000055, relocInfo::none);
4670     __ emit_data64(0x0000005800000057, relocInfo::none);
4671     __ emit_data64(0x0000005a00000059, relocInfo::none);
4672     __ emit_data64(0x0000006200000061, relocInfo::none);
4673     __ emit_data64(0x0000006400000063, relocInfo::none);
4674     __ emit_data64(0x0000006600000065, relocInfo::none);
4675     __ emit_data64(0x0000006800000067, relocInfo::none);
4676     __ emit_data64(0x0000006a00000069, relocInfo::none);
4677     __ emit_data64(0x0000006c0000006b, relocInfo::none);
4678     __ emit_data64(0x0000006e0000006d, relocInfo::none);
4679     __ emit_data64(0x000000700000006f, relocInfo::none);
4680     __ emit_data64(0x0000007200000071, relocInfo::none);
4681     __ emit_data64(0x0000007400000073, relocInfo::none);
4682     __ emit_data64(0x0000007600000075, relocInfo::none);
4683     __ emit_data64(0x0000007800000077, relocInfo::none);
4684     __ emit_data64(0x0000007a00000079, relocInfo::none);
4685     __ emit_data64(0x0000003100000030, relocInfo::none);
4686     __ emit_data64(0x0000003300000032, relocInfo::none);
4687     __ emit_data64(0x0000003500000034, relocInfo::none);
4688     __ emit_data64(0x0000003700000036, relocInfo::none);
4689     __ emit_data64(0x0000003900000038, relocInfo::none);
4690     __ emit_data64(0x0000005f0000002d, relocInfo::none);
4691 
4692     return start;
4693   }
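       // Both charset tables store each of the 64 ASCII code points zero-extended
       // to a 32-bit slot (two code points per emit_data64), so the gathers in
       // generate_base64_encodeBlock() can index them with dword indices and
       // Address::times_4 scaling.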
4694 
4695   address base64_bswap_mask_addr() {
4696     __ align(CodeEntryAlignment);
4697     StubCodeMark mark(this, "StubRoutines", "bswap_mask_base64");
4698     address start = __ pc();
4699     __ emit_data64(0x0504038002010080, relocInfo::none);
4700     __ emit_data64(0x0b0a098008070680, relocInfo::none);
4701     __ emit_data64(0x0908078006050480, relocInfo::none);
4702     __ emit_data64(0x0f0e0d800c0b0a80, relocInfo::none);
4703     __ emit_data64(0x0605048003020180, relocInfo::none);
4704     __ emit_data64(0x0c0b0a8009080780, relocInfo::none);
4705     __ emit_data64(0x0504038002010080, relocInfo::none);
4706     __ emit_data64(0x0b0a098008070680, relocInfo::none);
4707 
4708     return start;
4709   }
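       // A vpshufb mask byte with its high bit set (0x80) writes a zero byte, so
       // the mask above both regroups the source into 3-byte units and inserts
       // the zero pad byte that the word-granular shifts in the encoder rely on.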
4710 
4711   address base64_right_shift_mask_addr() {
4712     __ align(CodeEntryAlignment);
4713     StubCodeMark mark(this, "StubRoutines", "right_shift_mask");
4714     address start = __ pc();
4715     __ emit_data64(0x0006000400020000, relocInfo::none);
4716     __ emit_data64(0x0006000400020000, relocInfo::none);
4717     __ emit_data64(0x0006000400020000, relocInfo::none);
4718     __ emit_data64(0x0006000400020000, relocInfo::none);
4719     __ emit_data64(0x0006000400020000, relocInfo::none);
4720     __ emit_data64(0x0006000400020000, relocInfo::none);
4721     __ emit_data64(0x0006000400020000, relocInfo::none);
4722     __ emit_data64(0x0006000400020000, relocInfo::none);
4723 
4724     return start;
4725   }
4726 
4727   address base64_left_shift_mask_addr() {
4728     __ align(CodeEntryAlignment);
4729     StubCodeMark mark(this, "StubRoutines", "left_shift_mask");
4730     address start = __ pc();
4731     __ emit_data64(0x0000000200040000, relocInfo::none);
4732     __ emit_data64(0x0000000200040000, relocInfo::none);
4733     __ emit_data64(0x0000000200040000, relocInfo::none);
4734     __ emit_data64(0x0000000200040000, relocInfo::none);
4735     __ emit_data64(0x0000000200040000, relocInfo::none);
4736     __ emit_data64(0x0000000200040000, relocInfo::none);
4737     __ emit_data64(0x0000000200040000, relocInfo::none);
4738     __ emit_data64(0x0000000200040000, relocInfo::none);
4739 
4740     return start;
4741   }
4742 
4743   address base64_and_mask_addr() {
4744     __ align(CodeEntryAlignment);
4745     StubCodeMark mark(this, "StubRoutines", "and_mask");
4746     address start = __ pc();
4747     __ emit_data64(0x3f003f003f000000, relocInfo::none);
4748     __ emit_data64(0x3f003f003f000000, relocInfo::none);
4749     __ emit_data64(0x3f003f003f000000, relocInfo::none);
4750     __ emit_data64(0x3f003f003f000000, relocInfo::none);
4751     __ emit_data64(0x3f003f003f000000, relocInfo::none);
4752     __ emit_data64(0x3f003f003f000000, relocInfo::none);
4753     __ emit_data64(0x3f003f003f000000, relocInfo::none);
4754     __ emit_data64(0x3f003f003f000000, relocInfo::none);
4755     return start;
4756   }
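       // The shift and AND masks above drive the 6-bit extraction in the encoder:
       // evpsrlvw/evpsllvw shift each 16-bit lane by the per-lane count taken
       // from the right/left shift masks, and the AND mask keeps 6 bits per lane,
       // slicing each 24-bit group into four 6-bit lookup indices.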
4757 
4758   address base64_gather_mask_addr() {
4759     __ align(CodeEntryAlignment);
4760     StubCodeMark mark(this, "StubRoutines", "gather_mask");
4761     address start = __ pc();
4762     __ emit_data64(0xffffffffffffffff, relocInfo::none);
4763     return start;
4764   }
4765 
4766 // Code for generating Base64 encoding.
4767 // Intrinsic function prototype in Base64.java:
4768 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
4769   address generate_base64_encodeBlock() {
4770     __ align(CodeEntryAlignment);
4771     StubCodeMark mark(this, "StubRoutines", "implEncode");
4772     address start = __ pc();
4773     __ enter();
4774 
4775     // Save callee-saved registers before using them
4776     __ push(r12);
4777     __ push(r13);
4778     __ push(r14);
4779     __ push(r15);
4780 
4781     // arguments
4782     const Register source = c_rarg0; // Source Array
4783     const Register start_offset = c_rarg1; // start offset
4784     const Register end_offset = c_rarg2; // end offset
4785     const Register dest = c_rarg3; // destination array
4786 
4787 #ifndef _WIN64
4788     const Register dp = c_rarg4;  // Position for writing to dest array
4789     const Register isURL = c_rarg5;// Base64 or URL character set
4790 #else
4791     const Address  dp_mem(rbp, 6 * wordSize);  // dp is on the stack on Win64
4792     const Address isURL_mem(rbp, 7 * wordSize);
4793     const Register isURL = r10;      // pick the volatile windows register
4794     const Register dp = r12;
4795     __ movl(dp, dp_mem);
4796     __ movl(isURL, isURL_mem);
4797 #endif
4798 
4799     const Register length = r14;
4800     Label L_process80, L_process32, L_process3, L_exit, L_processdata;
4801 
4802     // calculate length from offsets
4803     __ movl(length, end_offset);
4804     __ subl(length, start_offset);
4805     __ cmpl(length, 0);
4806     __ jcc(Assembler::lessEqual, L_exit);
4807 
4808     __ lea(r11, ExternalAddress(StubRoutines::x86::base64_charset_addr()));
4809     // select the base64 charset (isURL == 0) or the base64 URL charset (isURL == 1)
4810     __ cmpl(isURL, 0);
4811     __ jcc(Assembler::equal, L_processdata);
4812     __ lea(r11, ExternalAddress(StubRoutines::x86::base64url_charset_addr()));
4813 
4814     // load masks required for encoding data
4815     __ BIND(L_processdata);
4816     __ movdqu(xmm16, ExternalAddress(StubRoutines::x86::base64_gather_mask_addr()));
4817     // Set all 64 bits of k3, the all-ones mask used to re-arm the gathers.
4818     __ evpcmpeqb(k3, xmm16, xmm16, Assembler::AVX_512bit);
4819     __ evmovdquq(xmm12, ExternalAddress(StubRoutines::x86::base64_bswap_mask_addr()), Assembler::AVX_256bit, r13);
4820     __ evmovdquq(xmm13, ExternalAddress(StubRoutines::x86::base64_right_shift_mask_addr()), Assembler::AVX_512bit, r13);
4821     __ evmovdquq(xmm14, ExternalAddress(StubRoutines::x86::base64_left_shift_mask_addr()), Assembler::AVX_512bit, r13);
4822     __ evmovdquq(xmm15, ExternalAddress(StubRoutines::x86::base64_and_mask_addr()), Assembler::AVX_512bit, r13);
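         // evpgatherdd uses its opmask as a completion mask and clears it as
         // elements are gathered, so k2 is refreshed from the all-ones mask in k3
         // (via kmovql) immediately before every gather below.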
4823 
4824     // Vector Base64 implementation, producing 96 bytes of encoded data
4825     __ BIND(L_process80);
4826     __ cmpl(length, 80);
4827     __ jcc(Assembler::below, L_process32);
4828     __ evmovdquq(xmm0, Address(source, start_offset, Address::times_1, 0), Assembler::AVX_256bit);
4829     __ evmovdquq(xmm1, Address(source, start_offset, Address::times_1, 24), Assembler::AVX_256bit);
4830     __ evmovdquq(xmm2, Address(source, start_offset, Address::times_1, 48), Assembler::AVX_256bit);
4831 
4832     // permute the input so that each 128-bit lane holds a contiguous 12-byte group of source bytes
4833     __ vpermq(xmm3, xmm0, 148, Assembler::AVX_256bit);
4834     __ vpermq(xmm4, xmm1, 148, Assembler::AVX_256bit);
4835     __ vpermq(xmm5, xmm2, 148, Assembler::AVX_256bit);
4836 
4837     // shuffle the input to group 3 source bytes per dword, inserting 0 as the 4th byte;
4838     // this lets us handle 12 source bytes per 128-bit lane
4839     __ vpshufb(xmm3, xmm3, xmm12, Assembler::AVX_256bit);
4840     __ vpshufb(xmm4, xmm4, xmm12, Assembler::AVX_256bit);
4841     __ vpshufb(xmm5, xmm5, xmm12, Assembler::AVX_256bit);
4842 
4843     // zero-extend bytes to words; each 128-bit lane of the result carries 6 source bytes
4844     __ vpmovzxbw(xmm3, xmm3, Assembler::AVX_512bit);
4845     __ vpmovzxbw(xmm4, xmm4, Assembler::AVX_512bit);
4846     __ vpmovzxbw(xmm5, xmm5, Assembler::AVX_512bit);
4847 
4848     // Extract bits in the pattern 6, 4+2, 2+4, 6 to convert three 8-bit bytes into four 6-bit values
4849     __ evpsrlvw(xmm0, xmm3, xmm13,  Assembler::AVX_512bit);
4850     __ evpsrlvw(xmm1, xmm4, xmm13, Assembler::AVX_512bit);
4851     __ evpsrlvw(xmm2, xmm5, xmm13, Assembler::AVX_512bit);
4852 
4853     __ evpsllvw(xmm3, xmm3, xmm14, Assembler::AVX_512bit);
4854     __ evpsllvw(xmm4, xmm4, xmm14, Assembler::AVX_512bit);
4855     __ evpsllvw(xmm5, xmm5, xmm14, Assembler::AVX_512bit);
4856 
4857     __ vpsrlq(xmm0, xmm0, 8, Assembler::AVX_512bit);
4858     __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
4859     __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);
4860 
4861     __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
4862     __ vpsllq(xmm4, xmm4, 8, Assembler::AVX_512bit);
4863     __ vpsllq(xmm5, xmm5, 8, Assembler::AVX_512bit);
4864 
4865     __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
4866     __ vpandq(xmm4, xmm4, xmm15, Assembler::AVX_512bit);
4867     __ vpandq(xmm5, xmm5, xmm15, Assembler::AVX_512bit);
4868 
4869     // Get the final 4*6 bits base64 encoding
4870     __ vporq(xmm3, xmm3, xmm0, Assembler::AVX_512bit);
4871     __ vporq(xmm4, xmm4, xmm1, Assembler::AVX_512bit);
4872     __ vporq(xmm5, xmm5, xmm2, Assembler::AVX_512bit);
4873 
4874     // Shift
4875     __ vpsrlq(xmm3, xmm3, 8, Assembler::AVX_512bit);
4876     __ vpsrlq(xmm4, xmm4, 8, Assembler::AVX_512bit);
4877     __ vpsrlq(xmm5, xmm5, 8, Assembler::AVX_512bit);
4878 
4879     // look up each 6-bit value in the base64 character set to fetch its encoding;
4880     // words are widened to dwords because the gather instruction needs dword indices
4881     __ vextracti64x4(xmm6, xmm3, 0);
4882     __ vpmovzxwd(xmm0, xmm6, Assembler::AVX_512bit);
4883     __ vextracti64x4(xmm6, xmm3, 1);
4884     __ vpmovzxwd(xmm1, xmm6, Assembler::AVX_512bit);
4885 
4886     __ vextracti64x4(xmm6, xmm4, 0);
4887     __ vpmovzxwd(xmm2, xmm6, Assembler::AVX_512bit);
4888     __ vextracti64x4(xmm6, xmm4, 1);
4889     __ vpmovzxwd(xmm3, xmm6, Assembler::AVX_512bit);
4890 
4891     __ vextracti64x4(xmm4, xmm5, 0);
4892     __ vpmovzxwd(xmm6, xmm4, Assembler::AVX_512bit);
4893 
4894     __ vextracti64x4(xmm4, xmm5, 1);
4895     __ vpmovzxwd(xmm7, xmm4, Assembler::AVX_512bit);
4896 
4897     __ kmovql(k2, k3);
4898     __ evpgatherdd(xmm4, k2, Address(r11, xmm0, Address::times_4, 0), Assembler::AVX_512bit);
4899     __ kmovql(k2, k3);
4900     __ evpgatherdd(xmm5, k2, Address(r11, xmm1, Address::times_4, 0), Assembler::AVX_512bit);
4901     __ kmovql(k2, k3);
4902     __ evpgatherdd(xmm8, k2, Address(r11, xmm2, Address::times_4, 0), Assembler::AVX_512bit);
4903     __ kmovql(k2, k3);
4904     __ evpgatherdd(xmm9, k2, Address(r11, xmm3, Address::times_4, 0), Assembler::AVX_512bit);
4905     __ kmovql(k2, k3);
4906     __ evpgatherdd(xmm10, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
4907     __ kmovql(k2, k3);
4908     __ evpgatherdd(xmm11, k2, Address(r11, xmm7, Address::times_4, 0), Assembler::AVX_512bit);
4909 
4910     // Narrow dwords down to bytes; the final output is 16 * 6 = 96 bytes long
4911     __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm4, Assembler::AVX_512bit);
4912     __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm5, Assembler::AVX_512bit);
4913     __ evpmovdb(Address(dest, dp, Address::times_1, 32), xmm8, Assembler::AVX_512bit);
4914     __ evpmovdb(Address(dest, dp, Address::times_1, 48), xmm9, Assembler::AVX_512bit);
4915     __ evpmovdb(Address(dest, dp, Address::times_1, 64), xmm10, Assembler::AVX_512bit);
4916     __ evpmovdb(Address(dest, dp, Address::times_1, 80), xmm11, Assembler::AVX_512bit);
4917 
4918     __ addq(dest, 96);
4919     __ addq(source, 72);
4920     __ subq(length, 72);
4921     __ jmp(L_process80);
4922 
4923     // Vector Base64 implementation generating 32 bytes of encoded data
4924     __ BIND(L_process32);
4925     __ cmpl(length, 32);
4926     __ jcc(Assembler::below, L_process3);
4927     __ evmovdquq(xmm0, Address(source, start_offset), Assembler::AVX_256bit);
4928     __ vpermq(xmm0, xmm0, 148, Assembler::AVX_256bit);
4929     __ vpshufb(xmm6, xmm0, xmm12, Assembler::AVX_256bit);
4930     __ vpmovzxbw(xmm6, xmm6, Assembler::AVX_512bit);
4931     __ evpsrlvw(xmm2, xmm6, xmm13, Assembler::AVX_512bit);
4932     __ evpsllvw(xmm3, xmm6, xmm14, Assembler::AVX_512bit);
4933 
4934     __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);
4935     __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
4936     __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
4937     __ vporq(xmm1, xmm2, xmm3, Assembler::AVX_512bit);
4938     __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
4939     __ vextracti64x4(xmm9, xmm1, 0);
4940     __ vpmovzxwd(xmm6, xmm9, Assembler::AVX_512bit);
4941     __ vextracti64x4(xmm9, xmm1, 1);
4942     __ vpmovzxwd(xmm5, xmm9,  Assembler::AVX_512bit);
4943     __ kmovql(k2, k3);
4944     __ evpgatherdd(xmm8, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
4945     __ kmovql(k2, k3);
4946     __ evpgatherdd(xmm10, k2, Address(r11, xmm5, Address::times_4, 0), Assembler::AVX_512bit);
4947     __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm8, Assembler::AVX_512bit);
4948     __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm10, Assembler::AVX_512bit);
4949     __ subq(length, 24);
4950     __ addq(dest, 32);
4951     __ addq(source, 24);
4952     __ jmp(L_process32);
4953 
4954     // Scalar data processing takes 3 bytes at a time and produces 4 bytes of encoded data
4955     /* This code corresponds to the scalar version of the following snippet in Base64.java
4956     ** int bits = (src[sp0++] & 0xff) << 16 | (src[sp0++] & 0xff) << 8 | (src[sp0++] & 0xff);
4957     ** dst[dp0++] = (byte)base64[(bits >>> 18) & 0x3f];
4958     ** dst[dp0++] = (byte)base64[(bits >>> 12) & 0x3f];
4959     ** dst[dp0++] = (byte)base64[(bits >>> 6) & 0x3f];
4960     ** dst[dp0++] = (byte)base64[bits & 0x3f]; */
4961     __ BIND(L_process3);
4962     __ cmpl(length, 3);
4963     __ jcc(Assembler::below, L_exit);
4964     // Read 1 byte at a time
4965     __ movzbl(rax, Address(source, start_offset));
4966     __ shll(rax, 0x10);
4967     __ movl(r15, rax);
4968     __ movzbl(rax, Address(source, start_offset, Address::times_1, 1));
4969     __ shll(rax, 0x8);
4970     __ movzwl(rax, rax);
4971     __ orl(r15, rax);
4972     __ movzbl(rax, Address(source, start_offset, Address::times_1, 2));
4973     __ orl(rax, r15);
4974     // Save 3 bytes read in r15
4975     __ movl(r15, rax);
4976     __ shrl(rax, 0x12);
4977     __ andl(rax, 0x3f);
4978     // rax contains the index, r11 contains base64 lookup table
4979     __ movb(rax, Address(r11, rax, Address::times_4));
4980     // Write the encoded byte to destination
4981     __ movb(Address(dest, dp, Address::times_1, 0), rax);
4982     __ movl(rax, r15);
4983     __ shrl(rax, 0xc);
4984     __ andl(rax, 0x3f);
4985     __ movb(rax, Address(r11, rax, Address::times_4));
4986     __ movb(Address(dest, dp, Address::times_1, 1), rax);
4987     __ movl(rax, r15);
4988     __ shrl(rax, 0x6);
4989     __ andl(rax, 0x3f);
4990     __ movb(rax, Address(r11, rax, Address::times_4));
4991     __ movb(Address(dest, dp, Address::times_1, 2), rax);
4992     __ movl(rax, r15);
4993     __ andl(rax, 0x3f);
4994     __ movb(rax, Address(r11, rax, Address::times_4));
4995     __ movb(Address(dest, dp, Address::times_1, 3), rax);
4996     __ subl(length, 3);
4997     __ addq(dest, 4);
4998     __ addq(source, 3);
4999     __ jmp(L_process3);
5000     __ BIND(L_exit);
5001     __ pop(r15);
5002     __ pop(r14);
5003     __ pop(r13);
5004     __ pop(r12);
5005     __ leave();
5006     __ ret(0);
5007     return start;
5008   }
5009 
5010   /**
5011    *  Arguments:
5012    *
5013    * Inputs:
5014    *   c_rarg0   - int crc
5015    *   c_rarg1   - byte* buf
5016    *   c_rarg2   - int length
5017    *
5018    * Output:
5019    *       rax   - int crc result
5020    */
5021   address generate_updateBytesCRC32() {
5022     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
5023 
5024     __ align(CodeEntryAlignment);
5025     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
5026 
5027     address start = __ pc();
5028     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5029     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5030     // rscratch1: r10
5031     const Register crc   = c_rarg0;  // crc
5032     const Register buf   = c_rarg1;  // source java byte array address
5033     const Register len   = c_rarg2;  // length
5034     const Register table = c_rarg3;  // crc_table address (reuse register)
5035     const Register tmp   = r11;
5036     assert_different_registers(crc, buf, len, table, tmp, rax);
5037 
5038     BLOCK_COMMENT("Entry:");
5039     __ enter(); // required for proper stackwalking of RuntimeStub frame
5040 
5041     __ kernel_crc32(crc, buf, len, table, tmp);
5042 
5043     __ movl(rax, crc);
5044     __ vzeroupper();
5045     __ leave(); // required for proper stackwalking of RuntimeStub frame
5046     __ ret(0);
5047 
5048     return start;
5049   }
5050 
5051   /**
5052   *  Arguments:
5053   *
5054   * Inputs:
5055   *   c_rarg0   - int crc
5056   *   c_rarg1   - byte* buf
5057   *   c_rarg2   - long length
5058   *   c_rarg3   - table_start - optional (present only when doing a library_call,
5059   *              not used by x86 algorithm)
5060   *
5061   * Output:
5062   *       rax   - int crc result
5063   */
5064   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
5065       assert(UseCRC32CIntrinsics, "need SSE4_2");
5066       __ align(CodeEntryAlignment);
5067       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
5068       address start = __ pc();
5069       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
5070       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
5071       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
5072       const Register crc = c_rarg0;  // crc
5073       const Register buf = c_rarg1;  // source java byte array address
5074       const Register len = c_rarg2;  // length
5075       const Register a = rax;
5076       const Register j = r9;
5077       const Register k = r10;
5078       const Register l = r11;
5079 #ifdef _WIN64
5080       const Register y = rdi;
5081       const Register z = rsi;
5082 #else
5083       const Register y = rcx;
5084       const Register z = r8;
5085 #endif
5086       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
5087 
5088       BLOCK_COMMENT("Entry:");
5089       __ enter(); // required for proper stackwalking of RuntimeStub frame
5090 #ifdef _WIN64
5091       __ push(y);
5092       __ push(z);
5093 #endif
5094       __ crc32c_ipl_alg2_alt2(crc, buf, len,
5095                               a, j, k,
5096                               l, y, z,
5097                               c_farg0, c_farg1, c_farg2,
5098                               is_pclmulqdq_supported);
5099       __ movl(rax, crc);
5100 #ifdef _WIN64
5101       __ pop(z);
5102       __ pop(y);
5103 #endif
5104       __ vzeroupper();
5105       __ leave(); // required for proper stackwalking of RuntimeStub frame
5106       __ ret(0);
5107 
5108       return start;
5109   }
5110 
5111   /**
5112    *  Arguments:
5113    *
5114    *  Input:
5115    *    c_rarg0   - x address
5116    *    c_rarg1   - x length
5117    *    c_rarg2   - y address
5118    *    c_rarg3   - y length
5119    * not Win64
5120    *    c_rarg4   - z address
5121    *    c_rarg5   - z length
5122    * Win64
5123    *    rsp+40    - z address
5124    *    rsp+48    - z length
5125    */
5126   address generate_multiplyToLen() {
5127     __ align(CodeEntryAlignment);
5128     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
5129 
5130     address start = __ pc();
5131     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5132     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5133     const Register x     = rdi;
5134     const Register xlen  = rax;
5135     const Register y     = rsi;
5136     const Register ylen  = rcx;
5137     const Register z     = r8;
5138     const Register zlen  = r11;
5139 
5140     // Next registers will be saved on stack in multiply_to_len().
5141     const Register tmp1  = r12;
5142     const Register tmp2  = r13;
5143     const Register tmp3  = r14;
5144     const Register tmp4  = r15;
5145     const Register tmp5  = rbx;
5146 
5147     BLOCK_COMMENT("Entry:");
5148     __ enter(); // required for proper stackwalking of RuntimeStub frame
5149 
5150 #ifndef _WIN64
5151     __ movptr(zlen, r9); // Save r9 in r11 - zlen
5152 #endif
5153     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
5154                        // ylen => rcx, z => r8, zlen => r11
5155                        // r9 and r10 may be used to save non-volatile registers
5156 #ifdef _WIN64
5157     // last 2 arguments (#4, #5) are on stack on Win64
5158     __ movptr(z, Address(rsp, 6 * wordSize));
5159     __ movptr(zlen, Address(rsp, 7 * wordSize));
5160 #endif
5161 
5162     __ movptr(xlen, rsi);
5163     __ movptr(y,    rdx);
5164     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
5165 
5166     restore_arg_regs();
5167 
5168     __ leave(); // required for proper stackwalking of RuntimeStub frame
5169     __ ret(0);
5170 
5171     return start;
5172   }
5173 
5174   /**
5175   *  Arguments:
5176   *
5177   *  Input:
5178   *    c_rarg0   - obja     address
5179   *    c_rarg1   - objb     address
5180   *    c_rarg3   - length   length
5181   *    c_rarg4   - scale    log2_array_indxscale
5182   *
5183   *  Output:
5184   *        rax   - int: >= 0 index of the first mismatch, < 0 bitwise complement of the tail
5185   */
5186   address generate_vectorizedMismatch() {
5187     __ align(CodeEntryAlignment);
5188     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
5189     address start = __ pc();
5190 
5191     BLOCK_COMMENT("Entry:");
5192     __ enter();
5193 
5194 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5195     const Register scale = c_rarg0;  //rcx, will exchange with r9
5196     const Register objb = c_rarg1;   //rdx
5197     const Register length = c_rarg2; //r8
5198     const Register obja = c_rarg3;   //r9
5199     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
5200 
5201     const Register tmp1 = r10;
5202     const Register tmp2 = r11;
5203 #endif
5204 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5205     const Register obja = c_rarg0;   //U:rdi
5206     const Register objb = c_rarg1;   //U:rsi
5207     const Register length = c_rarg2; //U:rdx
5208     const Register scale = c_rarg3;  //U:rcx
5209     const Register tmp1 = r8;
5210     const Register tmp2 = r9;
5211 #endif
5212     const Register result = rax; //return value
5213     const XMMRegister vec0 = xmm0;
5214     const XMMRegister vec1 = xmm1;
5215     const XMMRegister vec2 = xmm2;
5216 
5217     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
5218 
5219     __ vzeroupper();
5220     __ leave();
5221     __ ret(0);
5222 
5223     return start;
5224   }
5225 
5226 /**
5227    *  Arguments:
5228    *
5229    *  Input:
5230    *    c_rarg0   - x address
5231    *    c_rarg1   - x length
5232    *    c_rarg2   - z address
5233    *    c_rarg3   - z length
5234    *
5235    */
5236   address generate_squareToLen() {
5237 
5238     __ align(CodeEntryAlignment);
5239     StubCodeMark mark(this, "StubRoutines", "squareToLen");
5240 
5241     address start = __ pc();
5242     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5243     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
5244     const Register x      = rdi;
5245     const Register len    = rsi;
5246     const Register z      = r8;
5247     const Register zlen   = rcx;
5248 
5249     const Register tmp1      = r12;
5250     const Register tmp2      = r13;
5251     const Register tmp3      = r14;
5252     const Register tmp4      = r15;
5253     const Register tmp5      = rbx;
5254 
5255     BLOCK_COMMENT("Entry:");
5256     __ enter(); // required for proper stackwalking of RuntimeStub frame
5257 
5258     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
5259                        // zlen => rcx
5260                        // r9 and r10 may be used to save non-volatile registers
5261     __ movptr(r8, rdx);
5262     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
5263 
5264     restore_arg_regs();
5265 
5266     __ leave(); // required for proper stackwalking of RuntimeStub frame
5267     __ ret(0);
5268 
5269     return start;
5270   }
5271 
5272   address generate_method_entry_barrier() {
5273     __ align(CodeEntryAlignment);
5274     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5275 
5276     Label deoptimize_label;
5277 
5278     address start = __ pc();
5279 
5280     __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
5281 
5282     BLOCK_COMMENT("Entry:");
5283     __ enter(); // save rbp
5284 
5285     // save c_rarg0, because we want to use that value.
5286     // We could do without it, but then we would depend on the number of slots used by pusha
5287     __ push(c_rarg0);
5288 
5289     __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
5290 
5291     __ pusha();
5292 
5293     // The method may have floats as arguments, and we must spill them before calling
5294     // the VM runtime.
5295     assert(Argument::n_float_register_parameters_j == 8, "Assumption");
5296     const int xmm_size = wordSize * 2;
5297     const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
5298     __ subptr(rsp, xmm_spill_size);
5299     __ movdqu(Address(rsp, xmm_size * 7), xmm7);
5300     __ movdqu(Address(rsp, xmm_size * 6), xmm6);
5301     __ movdqu(Address(rsp, xmm_size * 5), xmm5);
5302     __ movdqu(Address(rsp, xmm_size * 4), xmm4);
5303     __ movdqu(Address(rsp, xmm_size * 3), xmm3);
5304     __ movdqu(Address(rsp, xmm_size * 2), xmm2);
5305     __ movdqu(Address(rsp, xmm_size * 1), xmm1);
5306     __ movdqu(Address(rsp, xmm_size * 0), xmm0);
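    // Sketch of the stack at this point (highest address first):
    //   [return address]          <- c_rarg0 points here
    //   [cookie (-1)]
    //   [saved rbp]               (from enter())
    //   [saved c_rarg0]
    //   [GP registers]            (from pusha)
    //   [xmm0..xmm7 spill area]   <- rsp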
5307 
5308     __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);
5309 
5310     __ movdqu(xmm0, Address(rsp, xmm_size * 0));
5311     __ movdqu(xmm1, Address(rsp, xmm_size * 1));
5312     __ movdqu(xmm2, Address(rsp, xmm_size * 2));
5313     __ movdqu(xmm3, Address(rsp, xmm_size * 3));
5314     __ movdqu(xmm4, Address(rsp, xmm_size * 4));
5315     __ movdqu(xmm5, Address(rsp, xmm_size * 5));
5316     __ movdqu(xmm6, Address(rsp, xmm_size * 6));
5317     __ movdqu(xmm7, Address(rsp, xmm_size * 7));
5318     __ addptr(rsp, xmm_spill_size);
5319 
5320     __ cmpl(rax, 1); // 1 means deoptimize
5321     __ jcc(Assembler::equal, deoptimize_label);
5322 
5323     __ popa();
5324     __ pop(c_rarg0);
5325 
5326     __ leave();
5327 
5328     __ addptr(rsp, 1 * wordSize); // cookie
5329     __ ret(0);
5330 
5331 
5332     __ BIND(deoptimize_label);
5333 
5334     __ popa();
5335     __ pop(c_rarg0);
5336 
5337     __ leave();
5338 
    // This can be taken out, but it is good for verification purposes: getting a SIGSEGV
    // here while the stack is still correct is valuable.
5341     __ testptr(rsp, Address(rsp, 0));
5342 
5343     __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
    __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be the caller's verified_entry_point
5345 
5346     return start;
5347   }
5348 
5349    /**
5350    *  Arguments:
5351    *
5352    *  Input:
5353    *    c_rarg0   - out address
5354    *    c_rarg1   - in address
5355    *    c_rarg2   - offset
5356    *    c_rarg3   - len
5357    * not Win64
5358    *    c_rarg4   - k
5359    * Win64
5360    *    rsp+40    - k
5361    */
5362   address generate_mulAdd() {
5363     __ align(CodeEntryAlignment);
5364     StubCodeMark mark(this, "StubRoutines", "mulAdd");
5365 
5366     address start = __ pc();
5367     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5368     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5369     const Register out     = rdi;
5370     const Register in      = rsi;
5371     const Register offset  = r11;
5372     const Register len     = rcx;
5373     const Register k       = r8;
5374 
    // The following registers will be saved on the stack in mul_add().
5376     const Register tmp1  = r12;
5377     const Register tmp2  = r13;
5378     const Register tmp3  = r14;
5379     const Register tmp4  = r15;
5380     const Register tmp5  = rbx;
5381 
5382     BLOCK_COMMENT("Entry:");
5383     __ enter(); // required for proper stackwalking of RuntimeStub frame
5384 
5385     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
5386                        // len => rcx, k => r8
5387                        // r9 and r10 may be used to save non-volatile registers
5388 #ifdef _WIN64
5389     // last argument is on stack on Win64
5390     __ movl(k, Address(rsp, 6 * wordSize));
5391 #endif
5392     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
5393     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
5394 
5395     restore_arg_regs();
5396 
5397     __ leave(); // required for proper stackwalking of RuntimeStub frame
5398     __ ret(0);
5399 
5400     return start;
5401   }
5402 
5403   address generate_libmExp() {
5404     StubCodeMark mark(this, "StubRoutines", "libmExp");
5405 
5406     address start = __ pc();
5407 
5408     const XMMRegister x0  = xmm0;
5409     const XMMRegister x1  = xmm1;
5410     const XMMRegister x2  = xmm2;
5411     const XMMRegister x3  = xmm3;
5412 
5413     const XMMRegister x4  = xmm4;
5414     const XMMRegister x5  = xmm5;
5415     const XMMRegister x6  = xmm6;
5416     const XMMRegister x7  = xmm7;
5417 
5418     const Register tmp   = r11;
5419 
5420     BLOCK_COMMENT("Entry:");
5421     __ enter(); // required for proper stackwalking of RuntimeStub frame
5422 
5423     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
5424 
5425     __ leave(); // required for proper stackwalking of RuntimeStub frame
5426     __ ret(0);
5427 
5428     return start;
5429 
5430   }
5431 
5432   address generate_libmLog() {
5433     StubCodeMark mark(this, "StubRoutines", "libmLog");
5434 
5435     address start = __ pc();
5436 
5437     const XMMRegister x0 = xmm0;
5438     const XMMRegister x1 = xmm1;
5439     const XMMRegister x2 = xmm2;
5440     const XMMRegister x3 = xmm3;
5441 
5442     const XMMRegister x4 = xmm4;
5443     const XMMRegister x5 = xmm5;
5444     const XMMRegister x6 = xmm6;
5445     const XMMRegister x7 = xmm7;
5446 
5447     const Register tmp1 = r11;
5448     const Register tmp2 = r8;
5449 
5450     BLOCK_COMMENT("Entry:");
5451     __ enter(); // required for proper stackwalking of RuntimeStub frame
5452 
5453     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
5454 
5455     __ leave(); // required for proper stackwalking of RuntimeStub frame
5456     __ ret(0);
5457 
5458     return start;
5459 
5460   }
5461 
5462   address generate_libmLog10() {
5463     StubCodeMark mark(this, "StubRoutines", "libmLog10");
5464 
5465     address start = __ pc();
5466 
5467     const XMMRegister x0 = xmm0;
5468     const XMMRegister x1 = xmm1;
5469     const XMMRegister x2 = xmm2;
5470     const XMMRegister x3 = xmm3;
5471 
5472     const XMMRegister x4 = xmm4;
5473     const XMMRegister x5 = xmm5;
5474     const XMMRegister x6 = xmm6;
5475     const XMMRegister x7 = xmm7;
5476 
5477     const Register tmp = r11;
5478 
5479     BLOCK_COMMENT("Entry:");
5480     __ enter(); // required for proper stackwalking of RuntimeStub frame
5481 
5482     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
5483 
5484     __ leave(); // required for proper stackwalking of RuntimeStub frame
5485     __ ret(0);
5486 
5487     return start;
5488 
5489   }
5490 
5491   address generate_libmPow() {
5492     StubCodeMark mark(this, "StubRoutines", "libmPow");
5493 
5494     address start = __ pc();
5495 
5496     const XMMRegister x0 = xmm0;
5497     const XMMRegister x1 = xmm1;
5498     const XMMRegister x2 = xmm2;
5499     const XMMRegister x3 = xmm3;
5500 
5501     const XMMRegister x4 = xmm4;
5502     const XMMRegister x5 = xmm5;
5503     const XMMRegister x6 = xmm6;
5504     const XMMRegister x7 = xmm7;
5505 
5506     const Register tmp1 = r8;
5507     const Register tmp2 = r9;
5508     const Register tmp3 = r10;
5509     const Register tmp4 = r11;
5510 
5511     BLOCK_COMMENT("Entry:");
5512     __ enter(); // required for proper stackwalking of RuntimeStub frame
5513 
5514     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5515 
5516     __ leave(); // required for proper stackwalking of RuntimeStub frame
5517     __ ret(0);
5518 
5519     return start;
5520 
5521   }
5522 
5523   address generate_libmSin() {
5524     StubCodeMark mark(this, "StubRoutines", "libmSin");
5525 
5526     address start = __ pc();
5527 
5528     const XMMRegister x0 = xmm0;
5529     const XMMRegister x1 = xmm1;
5530     const XMMRegister x2 = xmm2;
5531     const XMMRegister x3 = xmm3;
5532 
5533     const XMMRegister x4 = xmm4;
5534     const XMMRegister x5 = xmm5;
5535     const XMMRegister x6 = xmm6;
5536     const XMMRegister x7 = xmm7;
5537 
5538     const Register tmp1 = r8;
5539     const Register tmp2 = r9;
5540     const Register tmp3 = r10;
5541     const Register tmp4 = r11;
5542 
5543     BLOCK_COMMENT("Entry:");
5544     __ enter(); // required for proper stackwalking of RuntimeStub frame
5545 
5546 #ifdef _WIN64
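    // rsi and rdi are callee-saved in the Win64 ABI but are clobbered by the
    // generated fast_sin code, so save and restore them around the call.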
5547     __ push(rsi);
5548     __ push(rdi);
5549 #endif
5550     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5551 
5552 #ifdef _WIN64
5553     __ pop(rdi);
5554     __ pop(rsi);
5555 #endif
5556 
5557     __ leave(); // required for proper stackwalking of RuntimeStub frame
5558     __ ret(0);
5559 
5560     return start;
5561 
5562   }
5563 
5564   address generate_libmCos() {
5565     StubCodeMark mark(this, "StubRoutines", "libmCos");
5566 
5567     address start = __ pc();
5568 
5569     const XMMRegister x0 = xmm0;
5570     const XMMRegister x1 = xmm1;
5571     const XMMRegister x2 = xmm2;
5572     const XMMRegister x3 = xmm3;
5573 
5574     const XMMRegister x4 = xmm4;
5575     const XMMRegister x5 = xmm5;
5576     const XMMRegister x6 = xmm6;
5577     const XMMRegister x7 = xmm7;
5578 
5579     const Register tmp1 = r8;
5580     const Register tmp2 = r9;
5581     const Register tmp3 = r10;
5582     const Register tmp4 = r11;
5583 
5584     BLOCK_COMMENT("Entry:");
5585     __ enter(); // required for proper stackwalking of RuntimeStub frame
5586 
5587 #ifdef _WIN64
5588     __ push(rsi);
5589     __ push(rdi);
5590 #endif
5591     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5592 
5593 #ifdef _WIN64
5594     __ pop(rdi);
5595     __ pop(rsi);
5596 #endif
5597 
5598     __ leave(); // required for proper stackwalking of RuntimeStub frame
5599     __ ret(0);
5600 
5601     return start;
5602 
5603   }
5604 
5605   address generate_libmTan() {
5606     StubCodeMark mark(this, "StubRoutines", "libmTan");
5607 
5608     address start = __ pc();
5609 
5610     const XMMRegister x0 = xmm0;
5611     const XMMRegister x1 = xmm1;
5612     const XMMRegister x2 = xmm2;
5613     const XMMRegister x3 = xmm3;
5614 
5615     const XMMRegister x4 = xmm4;
5616     const XMMRegister x5 = xmm5;
5617     const XMMRegister x6 = xmm6;
5618     const XMMRegister x7 = xmm7;
5619 
5620     const Register tmp1 = r8;
5621     const Register tmp2 = r9;
5622     const Register tmp3 = r10;
5623     const Register tmp4 = r11;
5624 
5625     BLOCK_COMMENT("Entry:");
5626     __ enter(); // required for proper stackwalking of RuntimeStub frame
5627 
5628 #ifdef _WIN64
5629     __ push(rsi);
5630     __ push(rdi);
5631 #endif
5632     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5633 
5634 #ifdef _WIN64
5635     __ pop(rdi);
5636     __ pop(rsi);
5637 #endif
5638 
5639     __ leave(); // required for proper stackwalking of RuntimeStub frame
5640     __ ret(0);
5641 
5642     return start;
5643 
5644   }
5645 
5646 #undef __
5647 #define __ masm->
5648 
5649   // Continuation point for throwing of implicit exceptions that are
5650   // not handled in the current activation. Fabricates an exception
5651   // oop and initiates normal exception dispatching in this
5652   // frame. Since we need to preserve callee-saved values (currently
5653   // only for C2, but done for C1 as well) we need a callee-saved oop
5654   // map and therefore have to make these stubs into RuntimeStubs
5655   // rather than BufferBlobs.  If the compiler needs all registers to
5656   // be preserved between the fault point and the exception handler
5657   // then it must assume responsibility for that in
5658   // AbstractCompiler::continuation_for_implicit_null_exception or
5659   // continuation_for_implicit_division_by_zero_exception. All other
5660   // implicit exceptions (e.g., NullPointerException or
5661   // AbstractMethodError on entry) are either at call sites or
5662   // otherwise assume that stack unwinding will be initiated, so
5663   // caller saved registers were assumed volatile in the compiler.
5664   address generate_throw_exception(const char* name,
5665                                    address runtime_entry,
5666                                    Register arg1 = noreg,
5667                                    Register arg2 = noreg) {
5668     // Information about frame layout at time of blocking runtime call.
5669     // Note that we only have to preserve callee-saved registers since
5670     // the compilers are responsible for supplying a continuation point
5671     // if they expect all registers to be preserved.
5672     enum layout {
5673       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
5674       rbp_off2,
5675       return_off,
5676       return_off2,
5677       framesize // inclusive of return address
5678     };
5679 
5680     int insts_size = 512;
5681     int locs_size  = 64;
5682 
5683     CodeBuffer code(name, insts_size, locs_size);
5684     OopMapSet* oop_maps  = new OopMapSet();
5685     MacroAssembler* masm = new MacroAssembler(&code);
5686 
5687     address start = __ pc();
5688 
5689     // This is an inlined and slightly modified version of call_VM
5690     // which has the ability to fetch the return PC out of
5691     // thread-local storage and also sets up last_Java_sp slightly
5692     // differently than the real call_VM
5693 
5694     __ enter(); // required for proper stackwalking of RuntimeStub frame
5695 
5696     assert(is_even(framesize/2), "sp not 16-byte aligned");
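    // framesize counts 32-bit slots, so framesize/2 is the frame size in
    // words; an even word count makes the frame a multiple of 16 bytes and
    // keeps rsp 16-byte aligned.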
5697 
5698     // return address and rbp are already in place
5699     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
5700 
5701     int frame_complete = __ pc() - start;
5702 
5703     // Set up last_Java_sp and last_Java_fp
5704     address the_pc = __ pc();
5705     __ set_last_Java_frame(rsp, rbp, the_pc);
5706     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
5707 
5708     // Call runtime
5709     if (arg1 != noreg) {
5710       assert(arg2 != c_rarg1, "clobbered");
5711       __ movptr(c_rarg1, arg1);
5712     }
5713     if (arg2 != noreg) {
5714       __ movptr(c_rarg2, arg2);
5715     }
5716     __ movptr(c_rarg0, r15_thread);
5717     BLOCK_COMMENT("call runtime_entry");
5718     __ call(RuntimeAddress(runtime_entry));
5719 
5720     // Generate oop map
5721     OopMap* map = new OopMap(framesize, 0);
5722 
5723     oop_maps->add_gc_map(the_pc - start, map);
5724 
5725     __ reset_last_Java_frame(true);
5726 
5727     __ leave(); // required for proper stackwalking of RuntimeStub frame
5728 
5729     // check for pending exceptions
5730 #ifdef ASSERT
5731     Label L;
5732     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
5733             (int32_t) NULL_WORD);
5734     __ jcc(Assembler::notEqual, L);
5735     __ should_not_reach_here();
5736     __ bind(L);
5737 #endif // ASSERT
5738     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5739 
5740 
5741     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5742     RuntimeStub* stub =
5743       RuntimeStub::new_runtime_stub(name,
5744                                     &code,
5745                                     frame_complete,
5746                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5747                                     oop_maps, false);
5748     return stub->entry_point();
5749   }
5750 
5751   void create_control_words() {
5752     // Round to nearest, 53-bit mode, exceptions masked
5753     StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
    // Round to zero, 53-bit mode, exceptions masked
5755     StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
5756     // Round to nearest, 24-bit mode, exceptions masked
5757     StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
    // Round to nearest, all exceptions masked (MXCSR has no precision control)
5759     StubRoutines::_mxcsr_std           = 0x1F80;
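    // 0x1F80 decodes as: bits 7-12 set = all six SSE exception masks (IM, DM,
    // ZM, OM, UM, PM); RC (bits 13-14) = 00 = round to nearest; FZ and DAZ clear.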
    // Note: the following two constants are 80-bit values;
    //       their layout is critical for correct loading by the FPU.
5762     // Bias for strict fp multiply/divide
5763     StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
5764     StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
5765     StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
5766     // Un-Bias for strict fp multiply/divide
5767     StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
5768     StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
5769     StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
5770   }
5771 
5772   // Initialization
5773   void generate_initial() {
5774     // Generates all stubs and initializes the entry points
5775 
    // These platform-specific settings are needed by generate_call_stub()
5777     create_control_words();
5778 
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
5784 
5785     StubRoutines::_forward_exception_entry = generate_forward_exception();
5786 
5787     StubRoutines::_call_stub_entry =
5788       generate_call_stub(StubRoutines::_call_stub_return_address);
5789 
5790     // is referenced by megamorphic call
5791     StubRoutines::_catch_exception_entry = generate_catch_exception();
5792 
5793     // atomic calls
5794     StubRoutines::_atomic_xchg_entry          = generate_atomic_xchg();
5795     StubRoutines::_atomic_xchg_long_entry     = generate_atomic_xchg_long();
5796     StubRoutines::_atomic_cmpxchg_entry       = generate_atomic_cmpxchg();
5797     StubRoutines::_atomic_cmpxchg_byte_entry  = generate_atomic_cmpxchg_byte();
5798     StubRoutines::_atomic_cmpxchg_long_entry  = generate_atomic_cmpxchg_long();
5799     StubRoutines::_atomic_add_entry           = generate_atomic_add();
5800     StubRoutines::_atomic_add_long_entry      = generate_atomic_add_long();
5801     StubRoutines::_fence_entry                = generate_orderaccess_fence();
5802 
5803     // platform dependent
5804     StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
5805     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
5806 
5807     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
5808 
5809     // Build this early so it's available for the interpreter.
5810     StubRoutines::_throw_StackOverflowError_entry =
5811       generate_throw_exception("StackOverflowError throw_exception",
5812                                CAST_FROM_FN_PTR(address,
5813                                                 SharedRuntime::
5814                                                 throw_StackOverflowError));
5815     StubRoutines::_throw_delayed_StackOverflowError_entry =
5816       generate_throw_exception("delayed StackOverflowError throw_exception",
5817                                CAST_FROM_FN_PTR(address,
5818                                                 SharedRuntime::
5819                                                 throw_delayed_StackOverflowError));
5820     if (UseCRC32Intrinsics) {
      // Set the table address before generating stubs that use it
5822       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
5823       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5824     }
5825 
5826     if (UseCRC32CIntrinsics) {
5827       bool supports_clmul = VM_Version::supports_clmul();
5828       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
5829       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
5830       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
5831     }
5832     if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
5833       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
5834           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
5835           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5836         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
5837         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
5838         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
5839         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
5840         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
5841         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
5842         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
5843         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
5844         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
5845         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
5846         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
5847         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
5848         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
5849         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
5850       }
5851       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
5852         StubRoutines::_dexp = generate_libmExp();
5853       }
5854       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5855         StubRoutines::_dlog = generate_libmLog();
5856       }
5857       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
5858         StubRoutines::_dlog10 = generate_libmLog10();
5859       }
5860       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
5861         StubRoutines::_dpow = generate_libmPow();
5862       }
5863       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5864         StubRoutines::_dsin = generate_libmSin();
5865       }
5866       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5867         StubRoutines::_dcos = generate_libmCos();
5868       }
5869       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5870         StubRoutines::_dtan = generate_libmTan();
5871       }
5872     }
5873   }
5874 
5875   void generate_all() {
5876     // Generates all stubs and initializes the entry points
5877 
5878     // These entry points require SharedInfo::stack0 to be set up in
5879     // non-core builds and need to be relocatable, so they each
5880     // fabricate a RuntimeStub internally.
5881     StubRoutines::_throw_AbstractMethodError_entry =
5882       generate_throw_exception("AbstractMethodError throw_exception",
5883                                CAST_FROM_FN_PTR(address,
5884                                                 SharedRuntime::
5885                                                 throw_AbstractMethodError));
5886 
5887     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5888       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5889                                CAST_FROM_FN_PTR(address,
5890                                                 SharedRuntime::
5891                                                 throw_IncompatibleClassChangeError));
5892 
5893     StubRoutines::_throw_NullPointerException_at_call_entry =
5894       generate_throw_exception("NullPointerException at call throw_exception",
5895                                CAST_FROM_FN_PTR(address,
5896                                                 SharedRuntime::
5897                                                 throw_NullPointerException_at_call));
5898 
5899     // entry points that are platform specific
5900     StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
5901     StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
5902     StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
5903     StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
5904 
5905     StubRoutines::x86::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
5906     StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
5907     StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
5908     StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
5909     StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
5910     StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000);
5911     StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
5912     StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000);
5913     StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
5914     StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
5915     StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
5916 
5917     // support for verify_oop (must happen after universe_init)
5918     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5919 
5920     // arraycopy stubs used by compilers
5921     generate_arraycopy_stubs();
5922 
5923     // don't bother generating these AES intrinsic stubs unless global flag is set
5924     if (UseAESIntrinsics) {
5925       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
5926       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5927       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5928       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      if (VM_Version::supports_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
5930         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
5931       } else {
5932         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5933       }
5934     }
    if (UseAESCTRIntrinsics) {
5936       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
5937       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
5938     }
5939 
5940     if (UseSHA1Intrinsics) {
5941       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
5942       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
5943       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
5944       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
5945     }
5946     if (UseSHA256Intrinsics) {
5947       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
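      // The AVX2 SHA-256 implementation expects each 16-byte group of four
      // round constants duplicated into both 128-bit lanes of a 256-bit
      // vector; build _k256_W from _k256 accordingly.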
5948       char* dst = (char*)StubRoutines::x86::_k256_W;
5949       char* src = (char*)StubRoutines::x86::_k256;
5950       for (int ii = 0; ii < 16; ++ii) {
5951         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
5952         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
5953       }
5954       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
5955       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
5956       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
5957       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
5958     }
5959     if (UseSHA512Intrinsics) {
5960       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
5961       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
5962       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
5963       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
5964     }
5965 
5966     // Generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
5970       if (VM_Version::supports_avx()) {
5971         StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
5972         StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
5973         StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
5974       } else {
5975         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5976       }
5977     }
5978 
5979     if (UseBASE64Intrinsics) {
5980       StubRoutines::x86::_and_mask = base64_and_mask_addr();
5981       StubRoutines::x86::_bswap_mask = base64_bswap_mask_addr();
5982       StubRoutines::x86::_base64_charset = base64_charset_addr();
5983       StubRoutines::x86::_url_charset = base64url_charset_addr();
5984       StubRoutines::x86::_gather_mask = base64_gather_mask_addr();
5985       StubRoutines::x86::_left_shift_mask = base64_left_shift_mask_addr();
5986       StubRoutines::x86::_right_shift_mask = base64_right_shift_mask_addr();
5987       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
5988     }
5989 
5990     // Safefetch stubs.
5991     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5992                                                        &StubRoutines::_safefetch32_fault_pc,
5993                                                        &StubRoutines::_safefetch32_continuation_pc);
5994     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5995                                                        &StubRoutines::_safefetchN_fault_pc,
5996                                                        &StubRoutines::_safefetchN_continuation_pc);
5997 
5998     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5999     if (bs_nm != NULL) {
6000       StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
6001     }
6002 #ifdef COMPILER2
6003     if (UseMultiplyToLenIntrinsic) {
6004       StubRoutines::_multiplyToLen = generate_multiplyToLen();
6005     }
6006     if (UseSquareToLenIntrinsic) {
6007       StubRoutines::_squareToLen = generate_squareToLen();
6008     }
6009     if (UseMulAddIntrinsic) {
6010       StubRoutines::_mulAdd = generate_mulAdd();
6011     }
6012 #ifndef _WINDOWS
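    // Montgomery multiply/square are implemented in C++ (SharedRuntime)
    // rather than as generated assembly, so only the entry addresses are
    // published here.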
6013     if (UseMontgomeryMultiplyIntrinsic) {
6014       StubRoutines::_montgomeryMultiply
6015         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
6016     }
6017     if (UseMontgomerySquareIntrinsic) {
6018       StubRoutines::_montgomerySquare
6019         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
6020     }
#endif // _WINDOWS
6022 #endif // COMPILER2
6023 
6024     if (UseVectorizedMismatchIntrinsic) {
6025       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
6026     }
6027   }
6028 
6029  public:
6030   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
6031     if (all) {
6032       generate_all();
6033     } else {
6034       generate_initial();
6035     }
6036   }
6037 }; // end class declaration
6038 
6039 void StubGenerator_generate(CodeBuffer* code, bool all) {
6040   StubGenerator g(code, all);
6041 }