1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "ci/ciUtilities.hpp"
  29 #include "compiler/oopMap.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/barrierSetNMethod.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "interpreter/interpreter.hpp"
  35 #include "memory/universe.hpp"
  36 #include "nativeInst_x86.hpp"
  37 #include "oops/instanceOop.hpp"
  38 #include "oops/method.hpp"
  39 #include "oops/objArrayKlass.hpp"
  40 #include "oops/oop.inline.hpp"
  41 #include "prims/methodHandles.hpp"
  42 #include "runtime/arguments.hpp"
  43 #include "runtime/frame.inline.hpp"
  44 #include "runtime/handles.inline.hpp"
  45 #include "runtime/sharedRuntime.hpp"
  46 #include "runtime/stubCodeGenerator.hpp"
  47 #include "runtime/stubRoutines.hpp"
  48 #include "runtime/thread.inline.hpp"
  49 #ifdef COMPILER2
  50 #include "opto/runtime.hpp"
  51 #endif
  52 #if INCLUDE_JVMCI
  53 #include "jvmci/jvmci_globals.hpp"
  54 #endif
  55 #if INCLUDE_ZGC
  56 #include "gc/z/zThreadLocalData.hpp"
  57 #endif
  58 
  59 // Declaration and definition of StubGenerator (no .hpp file).
  60 // For a more detailed description of the stub routine structure
  61 // see the comment in stubRoutines.hpp
  62 
  63 #define __ _masm->
  64 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  65 #define a__ ((Assembler*)_masm)->
  66 
  67 #ifdef PRODUCT
  68 #define BLOCK_COMMENT(str) /* nothing */
  69 #else
  70 #define BLOCK_COMMENT(str) __ block_comment(str)
  71 #endif
  72 
  73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  74 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
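     // (0xFFC0 keeps the MXCSR control bits -- exception masks, rounding
     // control, FZ/DAZ -- and clears the six exception status flags in
     // bits 0..5.)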
  75 
  76 // Stub Code definitions
  77 
  78 class StubGenerator: public StubCodeGenerator {
  79  private:
  80 
  81 #ifdef PRODUCT
  82 #define inc_counter_np(counter) ((void)0)
  83 #else
  84   void inc_counter_np_(int& counter) {
  85     // This can destroy rscratch1 if counter is far from the code cache
  86     __ incrementl(ExternalAddress((address)&counter));
  87   }
  88 #define inc_counter_np(counter) \
  89   BLOCK_COMMENT("inc_counter " #counter); \
  90   inc_counter_np_(counter);
  91 #endif
  92 
  93   // Call stubs are used to call Java from C
  94   //
  95   // Linux Arguments:
  96   //    c_rarg0:   call wrapper address                   address
  97   //    c_rarg1:   result                                 address
  98   //    c_rarg2:   result type                            BasicType
  99   //    c_rarg3:   method                                 Method*
 100   //    c_rarg4:   (interpreter) entry point              address
 101   //    c_rarg5:   parameters                             intptr_t*
 102   //    16(rbp): parameter size (in words)              int
 103   //    24(rbp): thread                                 Thread*
 104   //
 105   //     [ return_from_Java     ] <--- rsp
 106   //     [ argument word n      ]
 107   //      ...
 108   // -12 [ argument word 1      ]
 109   // -11 [ saved r15            ] <--- rsp_after_call
 110   // -10 [ saved r14            ]
 111   //  -9 [ saved r13            ]
 112   //  -8 [ saved r12            ]
 113   //  -7 [ saved rbx            ]
 114   //  -6 [ call wrapper         ]
 115   //  -5 [ result               ]
 116   //  -4 [ result type          ]
 117   //  -3 [ method               ]
 118   //  -2 [ entry point          ]
 119   //  -1 [ parameters           ]
 120   //   0 [ saved rbp            ] <--- rbp
 121   //   1 [ return address       ]
 122   //   2 [ parameter size       ]
 123   //   3 [ thread               ]
 124   //
 125   // Windows Arguments:
 126   //    c_rarg0:   call wrapper address                   address
 127   //    c_rarg1:   result                                 address
 128   //    c_rarg2:   result type                            BasicType
 129   //    c_rarg3:   method                                 Method*
 130   //    48(rbp): (interpreter) entry point              address
 131   //    56(rbp): parameters                             intptr_t*
 132   //    64(rbp): parameter size (in words)              int
 133   //    72(rbp): thread                                 Thread*
 134   //
 135   //     [ return_from_Java     ] <--- rsp
 136   //     [ argument word n      ]
 137   //      ...
 138   // -60 [ argument word 1      ]
 139   // -59 [ saved xmm31          ] <--- rsp_after_call
 140   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 141   // -27 [ saved xmm15          ]
 142   //     [ saved xmm7-xmm14     ]
 143   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 144   //  -7 [ saved r15            ]
 145   //  -6 [ saved r14            ]
 146   //  -5 [ saved r13            ]
 147   //  -4 [ saved r12            ]
 148   //  -3 [ saved rdi            ]
 149   //  -2 [ saved rsi            ]
 150   //  -1 [ saved rbx            ]
 151   //   0 [ saved rbp            ] <--- rbp
 152   //   1 [ return address       ]
 153   //   2 [ call wrapper         ]
 154   //   3 [ result               ]
 155   //   4 [ result type          ]
 156   //   5 [ method               ]
 157   //   6 [ entry point          ]
 158   //   7 [ parameters           ]
 159   //   8 [ parameter size       ]
 160   //   9 [ thread               ]
 161   //
 162   //    Windows reserves the caller's stack space for arguments 1-4.
 163   //    We spill c_rarg0-c_rarg3 to this space.
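       //
       //    For reference, the C++ caller enters this stub through the CallStub
       //    function pointer type declared in stubRoutines.hpp (sketch only;
       //    see that file for the authoritative declaration):
       //
       //      typedef void (*CallStub)(address   link,          // call wrapper
       //                               intptr_t* result,
       //                               BasicType result_type,
       //                               Method*   method,
       //                               address   entry_point,
       //                               intptr_t* parameters,
       //                               int       size_of_parameters,
       //                               TRAPS);
       //
       //    which matches the argument layout documented above.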
 164 
 165   // Call stub stack layout word offsets from rbp
 166   enum call_stub_layout {
 167 #ifdef _WIN64
 168     xmm_save_first     = 6,  // save from xmm6
 169     xmm_save_last      = 31, // to xmm31
 170     xmm_save_base      = -9,
 171     rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
 172     r15_off            = -7,
 173     r14_off            = -6,
 174     r13_off            = -5,
 175     r12_off            = -4,
 176     rdi_off            = -3,
 177     rsi_off            = -2,
 178     rbx_off            = -1,
 179     rbp_off            =  0,
 180     retaddr_off        =  1,
 181     call_wrapper_off   =  2,
 182     result_off         =  3,
 183     result_type_off    =  4,
 184     method_off         =  5,
 185     entry_point_off    =  6,
 186     parameters_off     =  7,
 187     parameter_size_off =  8,
 188     thread_off         =  9
 189 #else
 190     rsp_after_call_off = -12,
 191     mxcsr_off          = rsp_after_call_off,
 192     r15_off            = -11,
 193     r14_off            = -10,
 194     r13_off            = -9,
 195     r12_off            = -8,
 196     rbx_off            = -7,
 197     call_wrapper_off   = -6,
 198     result_off         = -5,
 199     result_type_off    = -4,
 200     method_off         = -3,
 201     entry_point_off    = -2,
 202     parameters_off     = -1,
 203     rbp_off            =  0,
 204     retaddr_off        =  1,
 205     parameter_size_off =  2,
 206     thread_off         =  3
 207 #endif
 208   };
 209 
 210 #ifdef _WIN64
 211   Address xmm_save(int reg) {
 212     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 213     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 214   }
 215 #endif
 216 
 217   address generate_call_stub(address& return_address) {
 218     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 219            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 220            "adjust this code");
 221     StubCodeMark mark(this, "StubRoutines", "call_stub");
 222     address start = __ pc();
 223 
 224     // same as in generate_catch_exception()!
 225     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 226 
 227     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 228     const Address result        (rbp, result_off         * wordSize);
 229     const Address result_type   (rbp, result_type_off    * wordSize);
 230     const Address method        (rbp, method_off         * wordSize);
 231     const Address entry_point   (rbp, entry_point_off    * wordSize);
 232     const Address parameters    (rbp, parameters_off     * wordSize);
 233     const Address parameter_size(rbp, parameter_size_off * wordSize);
 234 
 235     // same as in generate_catch_exception()!
 236     const Address thread        (rbp, thread_off         * wordSize);
 237 
 238     const Address r15_save(rbp, r15_off * wordSize);
 239     const Address r14_save(rbp, r14_off * wordSize);
 240     const Address r13_save(rbp, r13_off * wordSize);
 241     const Address r12_save(rbp, r12_off * wordSize);
 242     const Address rbx_save(rbp, rbx_off * wordSize);
 243 
 244     // stub code
 245     __ enter();
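         // rsp_after_call_off is negative, so the subtraction below reserves the
         // register save area laid out above; afterwards rsp points at the
         // rsp_after_call slot.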
 246     __ subptr(rsp, -rsp_after_call_off * wordSize);
 247 
 248     // save register parameters
 249 #ifndef _WIN64
 250     __ movptr(parameters,   c_rarg5); // parameters
 251     __ movptr(entry_point,  c_rarg4); // entry_point
 252 #endif
 253 
 254     __ movptr(method,       c_rarg3); // method
 255     __ movl(result_type,  c_rarg2);   // result type
 256     __ movptr(result,       c_rarg1); // result
 257     __ movptr(call_wrapper, c_rarg0); // call wrapper
 258 
 259     // save regs belonging to calling function
 260     __ movptr(rbx_save, rbx);
 261     __ movptr(r12_save, r12);
 262     __ movptr(r13_save, r13);
 263     __ movptr(r14_save, r14);
 264     __ movptr(r15_save, r15);
 265 
 266 #ifdef _WIN64
 267     int last_reg = 15;
 268     if (UseAVX > 2) {
 269       last_reg = 31;
 270     }
 271     if (VM_Version::supports_evex()) {
 272       for (int i = xmm_save_first; i <= last_reg; i++) {
 273         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 274       }
 275     } else {
 276       for (int i = xmm_save_first; i <= last_reg; i++) {
 277         __ movdqu(xmm_save(i), as_XMMRegister(i));
 278       }
 279     }
 280 
 281     const Address rdi_save(rbp, rdi_off * wordSize);
 282     const Address rsi_save(rbp, rsi_off * wordSize);
 283 
 284     __ movptr(rsi_save, rsi);
 285     __ movptr(rdi_save, rdi);
 286 #else
 287     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 288     {
 289       Label skip_ldmx;
 290       __ stmxcsr(mxcsr_save);
 291       __ movl(rax, mxcsr_save);
 292       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 293       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 294       __ cmp32(rax, mxcsr_std);
 295       __ jcc(Assembler::equal, skip_ldmx);
 296       __ ldmxcsr(mxcsr_std);
 297       __ bind(skip_ldmx);
 298     }
 299 #endif
 300 
 301     // Load up thread register
 302     __ movptr(r15_thread, thread);
 303     __ reinit_heapbase();
 304 
 305 #ifdef ASSERT
 306     // make sure we have no pending exceptions
 307     {
 308       Label L;
 309       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 310       __ jcc(Assembler::equal, L);
 311       __ stop("StubRoutines::call_stub: entered with pending exception");
 312       __ bind(L);
 313     }
 314 #endif
 315 
 316     // pass parameters if any
 317     BLOCK_COMMENT("pass parameters if any");
 318     Label parameters_done;
 319     __ movl(c_rarg3, parameter_size);
 320     __ testl(c_rarg3, c_rarg3);
 321     __ jcc(Assembler::zero, parameters_done);
 322 
 323     Label loop;
 324     __ movptr(c_rarg2, parameters);       // parameter pointer
 325     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 326     __ BIND(loop);
 327     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 328     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 329     __ decrementl(c_rarg1);             // decrement counter
 330     __ push(rax);                       // pass parameter
 331     __ jcc(Assembler::notZero, loop);
 332 
 333     // call Java function
 334     __ BIND(parameters_done);
 335     __ movptr(rbx, method);             // get Method*
 336     __ movptr(c_rarg1, entry_point);    // get entry_point
 337     __ mov(r13, rsp);                   // set sender sp
 338     BLOCK_COMMENT("call Java function");
 339     __ call(c_rarg1);
 340 
 341     BLOCK_COMMENT("call_stub_return_address:");
 342     return_address = __ pc();
 343 
 344     // store result depending on type (everything that is not
 345     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 346     __ movptr(c_rarg0, result);
 347     Label is_long, is_float, is_double, exit;
 348     __ movl(c_rarg1, result_type);
 349     __ cmpl(c_rarg1, T_OBJECT);
 350     __ jcc(Assembler::equal, is_long);
 351     __ cmpl(c_rarg1, T_LONG);
 352     __ jcc(Assembler::equal, is_long);
 353     __ cmpl(c_rarg1, T_FLOAT);
 354     __ jcc(Assembler::equal, is_float);
 355     __ cmpl(c_rarg1, T_DOUBLE);
 356     __ jcc(Assembler::equal, is_double);
 357 
 358     // handle T_INT case
 359     __ movl(Address(c_rarg0, 0), rax);
 360 
 361     __ BIND(exit);
 362 
 363     // pop parameters
 364     __ lea(rsp, rsp_after_call);
 365 
 366 #ifdef ASSERT
 367     // verify that threads correspond
 368     {
 369       Label L1, L2, L3;
 370       __ cmpptr(r15_thread, thread);
 371       __ jcc(Assembler::equal, L1);
 372       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 373       __ bind(L1);
 374       __ get_thread(rbx);
 375       __ cmpptr(r15_thread, thread);
 376       __ jcc(Assembler::equal, L2);
 377       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 378       __ bind(L2);
 379       __ cmpptr(r15_thread, rbx);
 380       __ jcc(Assembler::equal, L3);
 381       __ stop("StubRoutines::call_stub: threads must correspond");
 382       __ bind(L3);
 383     }
 384 #endif
 385 
 386     // restore regs belonging to calling function
 387 #ifdef _WIN64
 388     // emit the restores for xmm regs
 389     if (VM_Version::supports_evex()) {
 390       for (int i = xmm_save_first; i <= last_reg; i++) {
 391         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 392       }
 393     } else {
 394       for (int i = xmm_save_first; i <= last_reg; i++) {
 395         __ movdqu(as_XMMRegister(i), xmm_save(i));
 396       }
 397     }
 398 #endif
 399     __ movptr(r15, r15_save);
 400     __ movptr(r14, r14_save);
 401     __ movptr(r13, r13_save);
 402     __ movptr(r12, r12_save);
 403     __ movptr(rbx, rbx_save);
 404 
 405 #ifdef _WIN64
 406     __ movptr(rdi, rdi_save);
 407     __ movptr(rsi, rsi_save);
 408 #else
 409     __ ldmxcsr(mxcsr_save);
 410 #endif
 411 
 412     // restore rsp
 413     __ addptr(rsp, -rsp_after_call_off * wordSize);
 414 
 415     // return
 416     __ vzeroupper();
 417     __ pop(rbp);
 418     __ ret(0);
 419 
 420     // handle return types different from T_INT
 421     __ BIND(is_long);
 422     __ movq(Address(c_rarg0, 0), rax);
 423     __ jmp(exit);
 424 
 425     __ BIND(is_float);
 426     __ movflt(Address(c_rarg0, 0), xmm0);
 427     __ jmp(exit);
 428 
 429     __ BIND(is_double);
 430     __ movdbl(Address(c_rarg0, 0), xmm0);
 431     __ jmp(exit);
 432 
 433     return start;
 434   }
 435 
 436   // Return point for a Java call if there's an exception thrown in
 437   // Java code.  The exception is caught and transformed into a
 438   // pending exception stored in JavaThread that can be tested from
 439   // within the VM.
 440   //
 441   // Note: Usually the parameters are removed by the callee. In case
 442   // of an exception crossing an activation frame boundary, that is
 443   // not the case if the callee is compiled code => need to set up the
 444   // rsp.
 445   //
 446   // rax: exception oop
 447 
 448   address generate_catch_exception() {
 449     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 450     address start = __ pc();
 451 
 452     // same as in generate_call_stub():
 453     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 454     const Address thread        (rbp, thread_off         * wordSize);
 455 
 456 #ifdef ASSERT
 457     // verify that threads correspond
 458     {
 459       Label L1, L2, L3;
 460       __ cmpptr(r15_thread, thread);
 461       __ jcc(Assembler::equal, L1);
 462       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 463       __ bind(L1);
 464       __ get_thread(rbx);
 465       __ cmpptr(r15_thread, thread);
 466       __ jcc(Assembler::equal, L2);
 467       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 468       __ bind(L2);
 469       __ cmpptr(r15_thread, rbx);
 470       __ jcc(Assembler::equal, L3);
 471       __ stop("StubRoutines::catch_exception: threads must correspond");
 472       __ bind(L3);
 473     }
 474 #endif
 475 
 476     // set pending exception
 477     __ verify_oop(rax);
 478 
 479     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 480     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 481     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 482     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 483 
 484     // complete return to VM
 485     assert(StubRoutines::_call_stub_return_address != NULL,
 486            "_call_stub_return_address must have been generated before");
 487     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 488 
 489     return start;
 490   }
 491 
 492   // Continuation point for runtime calls returning with a pending
 493   // exception.  The pending exception check happened in the runtime
 494   // or native call stub.  The pending exception in Thread is
 495   // converted into a Java-level exception.
 496   //
 497   // Contract with Java-level exception handlers:
 498   // rax: exception
 499   // rdx: throwing pc
 500   //
 501   // NOTE: At entry of this stub, exception-pc must be on stack !!
 502 
 503   address generate_forward_exception() {
 504     StubCodeMark mark(this, "StubRoutines", "forward exception");
 505     address start = __ pc();
 506 
 507     // Upon entry, the sp points to the return address returning into
 508     // Java (interpreted or compiled) code; i.e., the return address
 509     // becomes the throwing pc.
 510     //
 511     // Arguments pushed before the runtime call are still on the stack
 512     // but the exception handler will reset the stack pointer ->
 513     // ignore them.  A potential result in registers can be ignored as
 514     // well.
 515 
 516 #ifdef ASSERT
 517     // make sure this code is only executed if there is a pending exception
 518     {
 519       Label L;
 520       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 521       __ jcc(Assembler::notEqual, L);
 522       __ stop("StubRoutines::forward exception: no pending exception (1)");
 523       __ bind(L);
 524     }
 525 #endif
 526 
 527     // compute exception handler into rbx
 528     __ movptr(c_rarg0, Address(rsp, 0));
 529     BLOCK_COMMENT("call exception_handler_for_return_address");
 530     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 531                          SharedRuntime::exception_handler_for_return_address),
 532                     r15_thread, c_rarg0);
 533     __ mov(rbx, rax);
 534 
 535     // setup rax & rdx, remove return address & clear pending exception
 536     __ pop(rdx);
 537     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 538     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 539 
 540 #ifdef ASSERT
 541     // make sure exception is set
 542     {
 543       Label L;
 544       __ testptr(rax, rax);
 545       __ jcc(Assembler::notEqual, L);
 546       __ stop("StubRoutines::forward exception: no pending exception (2)");
 547       __ bind(L);
 548     }
 549 #endif
 550 
 551     // continue at exception handler (return address removed)
 552     // rax: exception
 553     // rbx: exception handler
 554     // rdx: throwing pc
 555     __ verify_oop(rax);
 556     __ jmp(rbx);
 557 
 558     return start;
 559   }
 560 
 561   // Support for intptr_t OrderAccess::fence()
 562   //
 563   // Arguments: none
 564   //
 565   // Result: none
 566   address generate_orderaccess_fence() {
 567     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 568     address start = __ pc();
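         // Only a StoreLoad barrier needs an explicit fence on x86; membar() is
         // expected to emit a serializing instruction (a locked read-modify-write
         // on the stack, or mfence) -- see MacroAssembler::membar.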
 569     __ membar(Assembler::StoreLoad);
 570     __ ret(0);
 571 
 572     return start;
 573   }
 574 
 575 
 576   // Support for intptr_t get_previous_sp()
 577   //
 578   // This routine is used to find the previous stack pointer for the
 579   // caller.
 580   address generate_get_previous_sp() {
 581     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
 582     address start = __ pc();
 583 
 584     __ movptr(rax, rsp);
 585     __ addptr(rax, 8); // return address is at the top of the stack.
 586     __ ret(0);
 587 
 588     return start;
 589   }
 590 
 591   //----------------------------------------------------------------------------------------------------
 592   // Support for void verify_mxcsr()
 593   //
 594   // This routine is used with -Xcheck:jni to verify that native
 595   // JNI code does not return to Java code without restoring the
 596   // MXCSR register to our expected state.
 597 
 598   address generate_verify_mxcsr() {
 599     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 600     address start = __ pc();
 601 
 602     const Address mxcsr_save(rsp, 0);
 603 
 604     if (CheckJNICalls) {
 605       Label ok_ret;
 606       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 607       __ push(rax);
 608       __ subptr(rsp, wordSize);      // allocate a temp location
 609       __ stmxcsr(mxcsr_save);
 610       __ movl(rax, mxcsr_save);
 611       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 612       __ cmp32(rax, mxcsr_std);
 613       __ jcc(Assembler::equal, ok_ret);
 614 
 615       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 616 
 617       __ ldmxcsr(mxcsr_std);
 618 
 619       __ bind(ok_ret);
 620       __ addptr(rsp, wordSize);
 621       __ pop(rax);
 622     }
 623 
 624     __ ret(0);
 625 
 626     return start;
 627   }
 628 
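       // The *_fixup stubs below are the slow paths for the f2i/f2l/d2i/d2l
       // conversion sequences: cvttss2si/cvttsd2si return the "integer
       // indefinite" value (min_jint/min_jlong) for NaN and out-of-range
       // inputs, in which case the compiled code calls one of these stubs to
       // overwrite the result with the Java-mandated value -- 0 for NaN, and
       // the min/max of the target type for inputs that are out of range.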
 629   address generate_f2i_fixup() {
 630     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
 631     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 632 
 633     address start = __ pc();
 634 
 635     Label L;
 636 
 637     __ push(rax);
 638     __ push(c_rarg3);
 639     __ push(c_rarg2);
 640     __ push(c_rarg1);
 641 
 642     __ movl(rax, 0x7f800000);
 643     __ xorl(c_rarg3, c_rarg3);
 644     __ movl(c_rarg2, inout);
 645     __ movl(c_rarg1, c_rarg2);
 646     __ andl(c_rarg1, 0x7fffffff);
 647     __ cmpl(rax, c_rarg1); // NaN? -> 0
 648     __ jcc(Assembler::negative, L);
 649     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
 650     __ movl(c_rarg3, 0x80000000);
 651     __ movl(rax, 0x7fffffff);
 652     __ cmovl(Assembler::positive, c_rarg3, rax);
 653 
 654     __ bind(L);
 655     __ movptr(inout, c_rarg3);
 656 
 657     __ pop(c_rarg1);
 658     __ pop(c_rarg2);
 659     __ pop(c_rarg3);
 660     __ pop(rax);
 661 
 662     __ ret(0);
 663 
 664     return start;
 665   }
 666 
 667   address generate_f2l_fixup() {
 668     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
 669     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 670     address start = __ pc();
 671 
 672     Label L;
 673 
 674     __ push(rax);
 675     __ push(c_rarg3);
 676     __ push(c_rarg2);
 677     __ push(c_rarg1);
 678 
 679     __ movl(rax, 0x7f800000);
 680     __ xorl(c_rarg3, c_rarg3);
 681     __ movl(c_rarg2, inout);
 682     __ movl(c_rarg1, c_rarg2);
 683     __ andl(c_rarg1, 0x7fffffff);
 684     __ cmpl(rax, c_rarg1); // NaN? -> 0
 685     __ jcc(Assembler::negative, L);
 686     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
 687     __ mov64(c_rarg3, 0x8000000000000000);
 688     __ mov64(rax, 0x7fffffffffffffff);
 689     __ cmov(Assembler::positive, c_rarg3, rax);
 690 
 691     __ bind(L);
 692     __ movptr(inout, c_rarg3);
 693 
 694     __ pop(c_rarg1);
 695     __ pop(c_rarg2);
 696     __ pop(c_rarg3);
 697     __ pop(rax);
 698 
 699     __ ret(0);
 700 
 701     return start;
 702   }
 703 
 704   address generate_d2i_fixup() {
 705     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
 706     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 707 
 708     address start = __ pc();
 709 
 710     Label L;
 711 
 712     __ push(rax);
 713     __ push(c_rarg3);
 714     __ push(c_rarg2);
 715     __ push(c_rarg1);
 716     __ push(c_rarg0);
 717 
 718     __ movl(rax, 0x7ff00000);
 719     __ movq(c_rarg2, inout);
 720     __ movl(c_rarg3, c_rarg2);
 721     __ mov(c_rarg1, c_rarg2);
 722     __ mov(c_rarg0, c_rarg2);
 723     __ negl(c_rarg3);
 724     __ shrptr(c_rarg1, 0x20);
 725     __ orl(c_rarg3, c_rarg2);
 726     __ andl(c_rarg1, 0x7fffffff);
 727     __ xorl(c_rarg2, c_rarg2);
 728     __ shrl(c_rarg3, 0x1f);
 729     __ orl(c_rarg1, c_rarg3);
 730     __ cmpl(rax, c_rarg1);
 731     __ jcc(Assembler::negative, L); // NaN -> 0
 732     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
 733     __ movl(c_rarg2, 0x80000000);
 734     __ movl(rax, 0x7fffffff);
 735     __ cmov(Assembler::positive, c_rarg2, rax);
 736 
 737     __ bind(L);
 738     __ movptr(inout, c_rarg2);
 739 
 740     __ pop(c_rarg0);
 741     __ pop(c_rarg1);
 742     __ pop(c_rarg2);
 743     __ pop(c_rarg3);
 744     __ pop(rax);
 745 
 746     __ ret(0);
 747 
 748     return start;
 749   }
 750 
 751   address generate_d2l_fixup() {
 752     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
 753     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 754 
 755     address start = __ pc();
 756 
 757     Label L;
 758 
 759     __ push(rax);
 760     __ push(c_rarg3);
 761     __ push(c_rarg2);
 762     __ push(c_rarg1);
 763     __ push(c_rarg0);
 764 
 765     __ movl(rax, 0x7ff00000);
 766     __ movq(c_rarg2, inout);
 767     __ movl(c_rarg3, c_rarg2);
 768     __ mov(c_rarg1, c_rarg2);
 769     __ mov(c_rarg0, c_rarg2);
 770     __ negl(c_rarg3);
 771     __ shrptr(c_rarg1, 0x20);
 772     __ orl(c_rarg3, c_rarg2);
 773     __ andl(c_rarg1, 0x7fffffff);
 774     __ xorl(c_rarg2, c_rarg2);
 775     __ shrl(c_rarg3, 0x1f);
 776     __ orl(c_rarg1, c_rarg3);
 777     __ cmpl(rax, c_rarg1);
 778     __ jcc(Assembler::negative, L); // NaN -> 0
 779     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
 780     __ mov64(c_rarg2, 0x8000000000000000);
 781     __ mov64(rax, 0x7fffffffffffffff);
 782     __ cmovq(Assembler::positive, c_rarg2, rax);
 783 
 784     __ bind(L);
 785     __ movq(inout, c_rarg2);
 786 
 787     __ pop(c_rarg0);
 788     __ pop(c_rarg1);
 789     __ pop(c_rarg2);
 790     __ pop(c_rarg3);
 791     __ pop(rax);
 792 
 793     __ ret(0);
 794 
 795     return start;
 796   }
 797 
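       // 16-byte lookup table, replicated to fill 64 bytes: entry i holds the
       // number of leading zero bits in the 4-bit value i (4 for 0, 3 for 1,
       // 2 for 2-3, 1 for 4-7, 0 for 8-15).  Intended for the vectorized
       // count-leading-zeros code, which looks up one nibble at a time.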
 798   address generate_count_leading_zeros_lut(const char *stub_name) {
 799     __ align64();
 800     StubCodeMark mark(this, "StubRoutines", stub_name);
 801     address start = __ pc();
 802     __ emit_data64(0x0101010102020304, relocInfo::none);
 803     __ emit_data64(0x0000000000000000, relocInfo::none);
 804     __ emit_data64(0x0101010102020304, relocInfo::none);
 805     __ emit_data64(0x0000000000000000, relocInfo::none);
 806     __ emit_data64(0x0101010102020304, relocInfo::none);
 807     __ emit_data64(0x0000000000000000, relocInfo::none);
 808     __ emit_data64(0x0101010102020304, relocInfo::none);
 809     __ emit_data64(0x0000000000000000, relocInfo::none);
 810     return start;
 811   }
 812 
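       // 16-byte lookup table, replicated to fill 64 bytes: entry i holds
       // popcount(i) for the 4-bit value i.  Intended for the AVX vector
       // popcount, which looks up each byte's two nibbles and adds the results.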
 813   address generate_popcount_avx_lut(const char *stub_name) {
 814     __ align64();
 815     StubCodeMark mark(this, "StubRoutines", stub_name);
 816     address start = __ pc();
 817     __ emit_data64(0x0302020102010100, relocInfo::none);
 818     __ emit_data64(0x0403030203020201, relocInfo::none);
 819     __ emit_data64(0x0302020102010100, relocInfo::none);
 820     __ emit_data64(0x0403030203020201, relocInfo::none);
 821     __ emit_data64(0x0302020102010100, relocInfo::none);
 822     __ emit_data64(0x0403030203020201, relocInfo::none);
 823     __ emit_data64(0x0302020102010100, relocInfo::none);
 824     __ emit_data64(0x0403030203020201, relocInfo::none);
 825     return start;
 826   }
 827 
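       // Ascending byte indices 0x00..0x3F, i.e. an identity index vector,
       // used as a starting point for building permutations and shuffles.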
 828   address generate_iota_indices(const char *stub_name) {
 829     __ align(CodeEntryAlignment);
 830     StubCodeMark mark(this, "StubRoutines", stub_name);
 831     address start = __ pc();
 832     __ emit_data64(0x0706050403020100, relocInfo::none);
 833     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 834     __ emit_data64(0x1716151413121110, relocInfo::none);
 835     __ emit_data64(0x1F1E1D1C1B1A1918, relocInfo::none);
 836     __ emit_data64(0x2726252423222120, relocInfo::none);
 837     __ emit_data64(0x2F2E2D2C2B2A2928, relocInfo::none);
 838     __ emit_data64(0x3736353433323130, relocInfo::none);
 839     __ emit_data64(0x3F3E3D3C3B3A3938, relocInfo::none);
 840     return start;
 841   }
 842 
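       // Bit-reversal lookup table: entry i holds the 4-bit value i with its
       // bits reversed.  The vector reverse-bits code applies it to the high
       // and low nibble of each byte separately and recombines the halves.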
 843   address generate_vector_reverse_bit_lut(const char *stub_name) {
 844     __ align(CodeEntryAlignment);
 845     StubCodeMark mark(this, "StubRoutines", stub_name);
 846     address start = __ pc();
 847     __ emit_data64(0x0E060A020C040800, relocInfo::none);
 848     __ emit_data64(0x0F070B030D050901, relocInfo::none);
 849     __ emit_data64(0x0E060A020C040800, relocInfo::none);
 850     __ emit_data64(0x0F070B030D050901, relocInfo::none);
 851     __ emit_data64(0x0E060A020C040800, relocInfo::none);
 852     __ emit_data64(0x0F070B030D050901, relocInfo::none);
 853     __ emit_data64(0x0E060A020C040800, relocInfo::none);
 854     __ emit_data64(0x0F070B030D050901, relocInfo::none);
 855     return start;
 856   }
 857 
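       // The three vector_reverse_byte_perm_mask_{long,int,short} tables below
       // are byte-shuffle masks that reverse the byte order within 8-, 4- and
       // 2-byte elements respectively.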
 858   address generate_vector_reverse_byte_perm_mask_long(const char *stub_name) {
 859     __ align(CodeEntryAlignment);
 860     StubCodeMark mark(this, "StubRoutines", stub_name);
 861     address start = __ pc();
 862     __ emit_data64(0x0001020304050607, relocInfo::none);
 863     __ emit_data64(0x08090A0B0C0D0E0F, relocInfo::none);
 864     __ emit_data64(0x0001020304050607, relocInfo::none);
 865     __ emit_data64(0x08090A0B0C0D0E0F, relocInfo::none);
 866     __ emit_data64(0x0001020304050607, relocInfo::none);
 867     __ emit_data64(0x08090A0B0C0D0E0F, relocInfo::none);
 868     __ emit_data64(0x0001020304050607, relocInfo::none);
 869     __ emit_data64(0x08090A0B0C0D0E0F, relocInfo::none);
 870     return start;
 871   }
 872 
 873   address generate_vector_reverse_byte_perm_mask_int(const char *stub_name) {
 874     __ align(CodeEntryAlignment);
 875     StubCodeMark mark(this, "StubRoutines", stub_name);
 876     address start = __ pc();
 877     __ emit_data64(0x0405060700010203, relocInfo::none);
 878     __ emit_data64(0x0C0D0E0F08090A0B, relocInfo::none);
 879     __ emit_data64(0x0405060700010203, relocInfo::none);
 880     __ emit_data64(0x0C0D0E0F08090A0B, relocInfo::none);
 881     __ emit_data64(0x0405060700010203, relocInfo::none);
 882     __ emit_data64(0x0C0D0E0F08090A0B, relocInfo::none);
 883     __ emit_data64(0x0405060700010203, relocInfo::none);
 884     __ emit_data64(0x0C0D0E0F08090A0B, relocInfo::none);
 885     return start;
 886   }
 887 
 888   address generate_vector_reverse_byte_perm_mask_short(const char *stub_name) {
 889     __ align(CodeEntryAlignment);
 890     StubCodeMark mark(this, "StubRoutines", stub_name);
 891     address start = __ pc();
 892     __ emit_data64(0x0607040502030001, relocInfo::none);
 893     __ emit_data64(0x0E0F0C0D0A0B0809, relocInfo::none);
 894     __ emit_data64(0x0607040502030001, relocInfo::none);
 895     __ emit_data64(0x0E0F0C0D0A0B0809, relocInfo::none);
 896     __ emit_data64(0x0607040502030001, relocInfo::none);
 897     __ emit_data64(0x0E0F0C0D0A0B0809, relocInfo::none);
 898     __ emit_data64(0x0607040502030001, relocInfo::none);
 899     __ emit_data64(0x0E0F0C0D0A0B0809, relocInfo::none);
 900     return start;
 901   }
 902 
 903   address generate_vector_byte_shuffle_mask(const char *stub_name) {
 904     __ align(CodeEntryAlignment);
 905     StubCodeMark mark(this, "StubRoutines", stub_name);
 906     address start = __ pc();
 907     __ emit_data64(0x7070707070707070, relocInfo::none);
 908     __ emit_data64(0x7070707070707070, relocInfo::none);
 909     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 910     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 911     return start;
 912   }
 913 
 914   address generate_fp_mask(const char *stub_name, int64_t mask) {
 915     __ align(CodeEntryAlignment);
 916     StubCodeMark mark(this, "StubRoutines", stub_name);
 917     address start = __ pc();
 918 
 919     __ emit_data64( mask, relocInfo::none );
 920     __ emit_data64( mask, relocInfo::none );
 921 
 922     return start;
 923   }
 924 
 925   address generate_vector_mask(const char *stub_name, int64_t mask) {
 926     __ align(CodeEntryAlignment);
 927     StubCodeMark mark(this, "StubRoutines", stub_name);
 928     address start = __ pc();
 929 
 930     __ emit_data64(mask, relocInfo::none);
 931     __ emit_data64(mask, relocInfo::none);
 932     __ emit_data64(mask, relocInfo::none);
 933     __ emit_data64(mask, relocInfo::none);
 934     __ emit_data64(mask, relocInfo::none);
 935     __ emit_data64(mask, relocInfo::none);
 936     __ emit_data64(mask, relocInfo::none);
 937     __ emit_data64(mask, relocInfo::none);
 938 
 939     return start;
 940   }
 941 
 942   address generate_vector_byte_perm_mask(const char *stub_name) {
 943     __ align(CodeEntryAlignment);
 944     StubCodeMark mark(this, "StubRoutines", stub_name);
 945     address start = __ pc();
 946 
 947     __ emit_data64(0x0000000000000001, relocInfo::none);
 948     __ emit_data64(0x0000000000000003, relocInfo::none);
 949     __ emit_data64(0x0000000000000005, relocInfo::none);
 950     __ emit_data64(0x0000000000000007, relocInfo::none);
 951     __ emit_data64(0x0000000000000000, relocInfo::none);
 952     __ emit_data64(0x0000000000000002, relocInfo::none);
 953     __ emit_data64(0x0000000000000004, relocInfo::none);
 954     __ emit_data64(0x0000000000000006, relocInfo::none);
 955 
 956     return start;
 957   }
 958 
 959   address generate_vector_fp_mask(const char *stub_name, int64_t mask) {
 960     __ align(CodeEntryAlignment);
 961     StubCodeMark mark(this, "StubRoutines", stub_name);
 962     address start = __ pc();
 963 
 964     __ emit_data64(mask, relocInfo::none);
 965     __ emit_data64(mask, relocInfo::none);
 966     __ emit_data64(mask, relocInfo::none);
 967     __ emit_data64(mask, relocInfo::none);
 968     __ emit_data64(mask, relocInfo::none);
 969     __ emit_data64(mask, relocInfo::none);
 970     __ emit_data64(mask, relocInfo::none);
 971     __ emit_data64(mask, relocInfo::none);
 972 
 973     return start;
 974   }
 975 
 976   address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
 977                                      int32_t val0, int32_t val1, int32_t val2, int32_t val3,
 978                                      int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
 979                                      int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
 980                                      int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
 981     __ align(CodeEntryAlignment);
 982     StubCodeMark mark(this, "StubRoutines", stub_name);
 983     address start = __ pc();
 984 
 985     assert(len != Assembler::AVX_NoVec, "vector len must be specified");
 986     __ emit_data(val0, relocInfo::none, 0);
 987     __ emit_data(val1, relocInfo::none, 0);
 988     __ emit_data(val2, relocInfo::none, 0);
 989     __ emit_data(val3, relocInfo::none, 0);
 990     if (len >= Assembler::AVX_256bit) {
 991       __ emit_data(val4, relocInfo::none, 0);
 992       __ emit_data(val5, relocInfo::none, 0);
 993       __ emit_data(val6, relocInfo::none, 0);
 994       __ emit_data(val7, relocInfo::none, 0);
 995       if (len >= Assembler::AVX_512bit) {
 996         __ emit_data(val8, relocInfo::none, 0);
 997         __ emit_data(val9, relocInfo::none, 0);
 998         __ emit_data(val10, relocInfo::none, 0);
 999         __ emit_data(val11, relocInfo::none, 0);
1000         __ emit_data(val12, relocInfo::none, 0);
1001         __ emit_data(val13, relocInfo::none, 0);
1002         __ emit_data(val14, relocInfo::none, 0);
1003         __ emit_data(val15, relocInfo::none, 0);
1004       }
1005     }
1006 
1007     return start;
1008   }
1009 
1010   // Non-destructive plausibility checks for oops
1011   //
1012   // Arguments:
1013   //    all args on stack!
1014   //
1015   // Stack after saving c_rarg3:
1016   //    [tos + 0]: saved c_rarg3
1017   //    [tos + 1]: saved c_rarg2
1018   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
1019   //    [tos + 3]: saved flags
1020   //    [tos + 4]: return address
1021   //  * [tos + 5]: error message (char*)
1022   //  * [tos + 6]: object to verify (oop)
1023   //  * [tos + 7]: saved rax - saved by caller and bashed
1024   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
1025   //  * = popped on exit
1026   address generate_verify_oop() {
1027     StubCodeMark mark(this, "StubRoutines", "verify_oop");
1028     address start = __ pc();
1029 
1030     Label exit, error;
1031 
1032     __ pushf();
1033     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
1034 
1035     __ push(r12);
1036 
1037     // save c_rarg2 and c_rarg3
1038     __ push(c_rarg2);
1039     __ push(c_rarg3);
1040 
1041     enum {
1042            // After previous pushes.
1043            oop_to_verify = 6 * wordSize,
1044            saved_rax     = 7 * wordSize,
1045            saved_r10     = 8 * wordSize,
1046 
1047            // Before the call to MacroAssembler::debug(), see below.
1048            return_addr   = 16 * wordSize,
1049            error_msg     = 17 * wordSize
1050     };
1051 
1052     // get object
1053     __ movptr(rax, Address(rsp, oop_to_verify));
1054 
1055     // make sure object is 'reasonable'
1056     __ testptr(rax, rax);
1057     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
1058 
1059 #if INCLUDE_ZGC
1060     if (UseZGC) {
1061       // Check if metadata bits indicate a bad oop
1062       __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
1063       __ jcc(Assembler::notZero, error);
1064     }
1065 #endif
1066 
1067     // Check if the oop is in the right area of memory
1068     __ movptr(c_rarg2, rax);
1069     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
1070     __ andptr(c_rarg2, c_rarg3);
1071     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
1072     __ cmpptr(c_rarg2, c_rarg3);
1073     __ jcc(Assembler::notZero, error);
1074 
1075     // make sure klass is 'reasonable', i.e. not null
1076     __ load_klass(rax, rax, rscratch1);  // get klass
1077     __ testptr(rax, rax);
1078     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1079 
1080     // return if everything seems ok
1081     __ bind(exit);
1082     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1083     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1084     __ pop(c_rarg3);                             // restore c_rarg3
1085     __ pop(c_rarg2);                             // restore c_rarg2
1086     __ pop(r12);                                 // restore r12
1087     __ popf();                                   // restore flags
1088     __ ret(4 * wordSize);                        // pop caller saved stuff
1089 
1090     // handle errors
1091     __ bind(error);
1092     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1093     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1094     __ pop(c_rarg3);                             // get saved c_rarg3 back
1095     __ pop(c_rarg2);                             // get saved c_rarg2 back
1096     __ pop(r12);                                 // get saved r12 back
1097     __ popf();                                   // get saved flags off stack --
1098                                                  // will be ignored
1099 
1100     __ pusha();                                  // push registers
1101                                                  // (rip is
1102                                                  // already pushed)
1103     // debug(char* msg, int64_t pc, int64_t regs[])
1104     // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
1105     // pushed all the registers, so now the stack looks like:
1106     //     [tos +  0] 16 saved registers
1107     //     [tos + 16] return address
1108     //   * [tos + 17] error message (char*)
1109     //   * [tos + 18] object to verify (oop)
1110     //   * [tos + 19] saved rax - saved by caller and bashed
1111     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1112     //   * = popped on exit
1113 
1114     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1115     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1116     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1117     __ mov(r12, rsp);                               // remember rsp
1118     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1119     __ andptr(rsp, -16);                            // align stack as required by ABI
1120     BLOCK_COMMENT("call MacroAssembler::debug");
1121     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1122     __ hlt();
1123     return start;
1124   }
1125 
1126   //
1127   // Verify that a register contains a clean 32-bit positive value
1128   // (high 32 bits are 0) so it can be used in 64-bit shifts.
1129   //
1130   //  Input:
1131   //    Rint  -  32-bit value
1132   //    Rtmp  -  scratch
1133   //
1134   void assert_clean_int(Register Rint, Register Rtmp) {
1135 #ifdef ASSERT
1136     Label L;
1137     assert_different_registers(Rtmp, Rint);
1138     __ movslq(Rtmp, Rint);
1139     __ cmpq(Rtmp, Rint);
1140     __ jcc(Assembler::equal, L);
1141     __ stop("high 32-bits of int value are not 0");
1142     __ bind(L);
1143 #endif
1144   }
1145 
1146   //  Generate overlap test for array copy stubs
1147   //
1148   //  Input:
1149   //     c_rarg0 - from
1150   //     c_rarg1 - to
1151   //     c_rarg2 - element count
1152   //
1153   //  Output:
1154   //     rax   - &from[element count], i.e. one past the last source element
1155   //
1156   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1157     assert(no_overlap_target != NULL, "must be generated");
1158     array_overlap_test(no_overlap_target, NULL, sf);
1159   }
1160   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1161     array_overlap_test(NULL, &L_no_overlap, sf);
1162   }
1163   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1164     const Register from     = c_rarg0;
1165     const Register to       = c_rarg1;
1166     const Register count    = c_rarg2;
1167     const Register end_from = rax;
1168 
1169     __ cmpptr(to, from);
1170     __ lea(end_from, Address(from, count, sf, 0));
1171     if (NOLp == NULL) {
1172       ExternalAddress no_overlap(no_overlap_target);
1173       __ jump_cc(Assembler::belowEqual, no_overlap);
1174       __ cmpptr(to, end_from);
1175       __ jump_cc(Assembler::aboveEqual, no_overlap);
1176     } else {
1177       __ jcc(Assembler::belowEqual, (*NOLp));
1178       __ cmpptr(to, end_from);
1179       __ jcc(Assembler::aboveEqual, (*NOLp));
1180     }
1181   }
1182 
1183   // Shuffle the first three (optionally four) arg regs on Windows into Linux/Solaris locations.
1184   //
1185   // Outputs:
1186   //    rdi - rcx
1187   //    rsi - rdx
1188   //    rdx - r8
1189   //    rcx - r9
1190   //
1191   // On Windows, registers r9 and r10 are used to save rdi and rsi, which
1192   // are non-volatile there.  r9 and r10 should not be used by the caller.
1193   //
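       // Debug-only flag recording which save scheme was used, so that the
       // matching restore routine can assert it is paired with the right setup.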
1194   DEBUG_ONLY(bool regs_in_thread;)
1195 
1196   void setup_arg_regs(int nargs = 3) {
1197     const Register saved_rdi = r9;
1198     const Register saved_rsi = r10;
1199     assert(nargs == 3 || nargs == 4, "else fix");
1200 #ifdef _WIN64
1201     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1202            "unexpected argument registers");
1203     if (nargs >= 4)
1204       __ mov(rax, r9);  // r9 is also saved_rdi
1205     __ movptr(saved_rdi, rdi);
1206     __ movptr(saved_rsi, rsi);
1207     __ mov(rdi, rcx); // c_rarg0
1208     __ mov(rsi, rdx); // c_rarg1
1209     __ mov(rdx, r8);  // c_rarg2
1210     if (nargs >= 4)
1211       __ mov(rcx, rax); // c_rarg3 (via rax)
1212 #else
1213     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1214            "unexpected argument registers");
1215 #endif
1216     DEBUG_ONLY(regs_in_thread = false;)
1217   }
1218 
1219   void restore_arg_regs() {
1220     assert(!regs_in_thread, "wrong call to restore_arg_regs");
1221     const Register saved_rdi = r9;
1222     const Register saved_rsi = r10;
1223 #ifdef _WIN64
1224     __ movptr(rdi, saved_rdi);
1225     __ movptr(rsi, saved_rsi);
1226 #endif
1227   }
1228 
1229   // This is used in places where r10 is a scratch register, and can
1230   // be adapted if r9 is needed also.
1231   void setup_arg_regs_using_thread() {
1232     const Register saved_r15 = r9;
1233 #ifdef _WIN64
1234     __ mov(saved_r15, r15);  // r15 is callee saved and needs to be restored
1235     __ get_thread(r15_thread);
1236     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1237            "unexpected argument registers");
1238     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
1239     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);
1240 
1241     __ mov(rdi, rcx); // c_rarg0
1242     __ mov(rsi, rdx); // c_rarg1
1243     __ mov(rdx, r8);  // c_rarg2
1244 #else
1245     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1246            "unexpected argument registers");
1247 #endif
1248     DEBUG_ONLY(regs_in_thread = true;)
1249   }
1250 
1251   void restore_arg_regs_using_thread() {
1252     assert(regs_in_thread, "wrong call to restore_arg_regs");
1253     const Register saved_r15 = r9;
1254 #ifdef _WIN64
1255     __ get_thread(r15_thread);
1256     __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
1257     __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
1258     __ mov(r15, saved_r15);  // r15 is callee saved and needs to be restored
1259 #endif
1260   }
1261 
1262   // Copy big chunks forward
1263   //
1264   // Inputs:
1265   //   end_from     - source array end address
1266   //   end_to       - destination array end address
1267   //   qword_count  - 64-bit element count, negative
1268   //   to           - scratch
1269   //   L_copy_bytes - entry label
1270   //   L_copy_8_bytes  - exit  label
1271   //
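       //   qword_count is negative on entry and is incremented towards zero, so
       //   Address(end_*, qword_count, times_8, disp) starts near the beginning
       //   of the data and walks forward as the loop advances.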
1272   void copy_bytes_forward(Register end_from, Register end_to,
1273                              Register qword_count, Register to,
1274                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1275     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1276     Label L_loop;
1277     __ align(OptoLoopAlignment);
1278     if (UseUnalignedLoadStores) {
1279       Label L_end;
1280       __ BIND(L_loop);
1281       if (UseAVX >= 2) {
1282         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1283         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1284         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1285         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1286       } else {
1287         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1288         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1289         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1290         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1291         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1292         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1293         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1294         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1295       }
1296 
1297       __ BIND(L_copy_bytes);
1298       __ addptr(qword_count, 8);
1299       __ jcc(Assembler::lessEqual, L_loop);
1300       __ subptr(qword_count, 4);  // sub(8) and add(4)
1301       __ jccb(Assembler::greater, L_end);
1302       // Copy trailing 32 bytes
1303       if (UseAVX >= 2) {
1304         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1305         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1306       } else {
1307         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1308         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1309         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1310         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1311       }
1312       __ addptr(qword_count, 4);
1313       __ BIND(L_end);
1314     } else {
1315       // Copy 32 bytes per iteration
1316       __ BIND(L_loop);
1317       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1318       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1319       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1320       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1321       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1322       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1323       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1324       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1325 
1326       __ BIND(L_copy_bytes);
1327       __ addptr(qword_count, 4);
1328       __ jcc(Assembler::lessEqual, L_loop);
1329     }
1330     __ subptr(qword_count, 4);
1331     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1332   }
1333 
1334   // Copy big chunks backward
1335   //
1336   // Inputs:
1337   //   from         - source array address
1338   //   dest         - destination array address
1339   //   qword_count  - 64-bit element count
1340   //   to           - scratch
1341   //   L_copy_bytes - entry label
1342   //   L_copy_8_bytes  - exit  label
1343   //
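       //   qword_count is positive on entry and is decremented towards zero, so
       //   Address(from/dest, qword_count, times_8, disp) starts near the end of
       //   the data and walks backward as the loop advances.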
1344   void copy_bytes_backward(Register from, Register dest,
1345                               Register qword_count, Register to,
1346                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1347     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1348     Label L_loop;
1349     __ align(OptoLoopAlignment);
1350     if (UseUnalignedLoadStores) {
1351       Label L_end;
1352       __ BIND(L_loop);
1353       if (UseAVX >= 2) {
1354         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1355         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1356         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1357         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1358       } else {
1359         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1360         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1361         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1362         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1363         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1364         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1365         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1366         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1367       }
1368 
1369       __ BIND(L_copy_bytes);
1370       __ subptr(qword_count, 8);
1371       __ jcc(Assembler::greaterEqual, L_loop);
1372 
1373       __ addptr(qword_count, 4);  // add(8) and sub(4)
1374       __ jccb(Assembler::less, L_end);
1375       // Copy trailing 32 bytes
1376       if (UseAVX >= 2) {
1377         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1378         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1379       } else {
1380         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1381         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1382         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1383         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1384       }
1385       __ subptr(qword_count, 4);
1386       __ BIND(L_end);
1387     } else {
1388       // Copy 32 bytes per iteration
1389       __ BIND(L_loop);
1390       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1391       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1392       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1393       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1394       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1395       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1396       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1397       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1398 
1399       __ BIND(L_copy_bytes);
1400       __ subptr(qword_count, 4);
1401       __ jcc(Assembler::greaterEqual, L_loop);
1402     }
1403     __ addptr(qword_count, 4);
1404     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1405   }
1406 
1407 #ifndef PRODUCT
1408     int& get_profile_ctr(int shift) {
1409       if (shift == 0)
1410         return SharedRuntime::_jbyte_array_copy_ctr;
1411       else if (shift == 1)
1412         return SharedRuntime::_jshort_array_copy_ctr;
1413       else if (shift == 2)
1414         return SharedRuntime::_jint_array_copy_ctr;
1415       else
1416         return SharedRuntime::_jlong_array_copy_ctr;
1417     }
1418 #endif
1419 
1420   void setup_argument_regs(BasicType type) {
1421     if (type == T_BYTE || type == T_SHORT) {
1422       setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1423                         // r9 and r10 may be used to save non-volatile registers
1424     } else {
1425       setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1426                                      // r9 is used to save r15_thread
1427     }
1428   }
1429 
1430   void restore_argument_regs(BasicType type) {
1431     if (type == T_BYTE || type == T_SHORT) {
1432       restore_arg_regs();
1433     } else {
1434       restore_arg_regs_using_thread();
1435     }
1436   }
1437 
1438 #if COMPILER2_OR_JVMCI
1439   // Note: The following rules apply to the AVX3-optimized arraycopy stubs:
1440   // - If the target supports AVX3 features (BW+VL+F), then the implementation uses 32 byte vectors (YMMs)
1441   //   for both the special cases (various small block sizes) and the aligned copy loop. This is the
1442   //   default configuration.
1443   // - If the copy length is above AVX3Threshold, then the implementation uses 64 byte vectors (ZMMs)
1444   //   for the main copy loop (and its tail), since the bulk of the cycles will be consumed there.
1445   // - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS has been seen to give
1446   //   better performance for disjoint copies. For conjoint/backward copies the vector-based
1447   //   copy performs better.
1448   // - If the user sets AVX3Threshold=0, then the special cases for small block sizes also operate over
1449   //   64 byte vector registers (ZMMs).
1450 
1451   // Inputs:
1452   //   c_rarg0   - source array address
1453   //   c_rarg1   - destination array address
1454   //   c_rarg2   - element count, treated as ssize_t, can be zero
1455   //
1456   //
1457   // Side Effects:
1458   //   disjoint_copy_avx3_masked is set to the no-overlap entry point
1459   //   used by generate_conjoint_[byte/int/short/long]_copy().
1460   //
1461 
1462   address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1463                                              bool aligned, bool is_oop, bool dest_uninitialized) {
1464     __ align(CodeEntryAlignment);
1465     StubCodeMark mark(this, "StubRoutines", name);
1466     address start = __ pc();
1467     int avx3threshold = VM_Version::avx3_threshold();
1468     bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
1469     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1470     Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1471     const Register from        = rdi;  // source array address
1472     const Register to          = rsi;  // destination array address
1473     const Register count       = rdx;  // elements count
1474     const Register temp1       = r8;
1475     const Register temp2       = r11;
1476     const Register temp3       = rax;
1477     const Register temp4       = rcx;
1478     // End pointers are inclusive, and if count is not zero they point
1479     // to the last unit copied:  end_to[0] := end_from[0]
1480 
1481     __ enter(); // required for proper stackwalking of RuntimeStub frame
1482     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1483 
1484     if (entry != NULL) {
1485       *entry = __ pc();
1486        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1487       BLOCK_COMMENT("Entry:");
1488     }
1489 
1490     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1491     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1492 
1493     setup_argument_regs(type);
1494 
1495     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1496     if (dest_uninitialized) {
1497       decorators |= IS_DEST_UNINITIALIZED;
1498     }
1499     if (aligned) {
1500       decorators |= ARRAYCOPY_ALIGNED;
1501     }
1502     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1503     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1504 
1505     {
1506       // Type(shift)           byte(0), short(1), int(2),   long(3)
1507       int loop_size[]        = { 192,     96,       48,      24};
1508       int threshold[]        = { 4096,    2048,     1024,    512};
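           // Both tables express the same byte quantities in elements of the copied type:
           // loop_size is 192 bytes per main-loop iteration and threshold is 4096 bytes,
           // divided by the element size selected via 'shift'.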
1509 
1510       // UnsafeCopyMemory page error: continue after ucm
1511       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1512       // 'from', 'to' and 'count' are now valid
1513 
1514       // temp1 holds remaining count and temp4 holds running count used to compute
1515       // next address offset for start of to/from addresses (temp4 * scale).
1516       __ mov64(temp4, 0);
1517       __ movq(temp1, count);
1518 
1519       // Zero length check.
1520       __ BIND(L_tail);
1521       __ cmpq(temp1, 0);
1522       __ jcc(Assembler::lessEqual, L_exit);
1523 
1524       // Special cases using 32 byte [masked] vector copy operations.
1525       __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1526                                       temp4, temp3, use64byteVector, L_entry, L_exit);
1527 
1528       // PRE-MAIN-POST loop for aligned copy.
1529       __ BIND(L_entry);
1530 
1531       if (avx3threshold != 0) {
1532         __ cmpq(count, threshold[shift]);
1533         if (MaxVectorSize == 64) {
1534           // Copy using 64 byte vectors.
1535           __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1536         } else {
1537           assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
1538           // REP MOVS offers a faster copy path.
1539           __ jcc(Assembler::greaterEqual, L_repmovs);
1540         }
1541       }
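           // Counts below the threshold fall through to the 32 byte pre/main/post loop
           // below (when it is emitted); with AVX3Threshold == 0 and MaxVectorSize == 64
           // that loop is skipped and only the 64 byte variant further down is generated.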
1542 
1543       if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
1544         // Partial copy to make dst address 32 byte aligned.
1545         __ movq(temp2, to);
1546         __ andq(temp2, 31);
1547         __ jcc(Assembler::equal, L_main_pre_loop);
1548 
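             // temp2 = number of leading elements to copy so that 'to' becomes 32 byte
             // aligned: (32 - (to & 31)) bytes, converted to elements by shifting right by 'shift'.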
1549         __ negptr(temp2);
1550         __ addq(temp2, 32);
1551         if (shift) {
1552           __ shrq(temp2, shift);
1553         }
1554         __ movq(temp3, temp2);
1555         __ copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
1556         __ movq(temp4, temp2);
1557         __ movq(temp1, count);
1558         __ subq(temp1, temp2);
1559 
1560         __ cmpq(temp1, loop_size[shift]);
1561         __ jcc(Assembler::less, L_tail);
1562 
1563         __ BIND(L_main_pre_loop);
1564         __ subq(temp1, loop_size[shift]);
1565 
1566         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1567         __ align32();
1568         __ BIND(L_main_loop);
1569            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
1570            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
1571            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128);
1572            __ addptr(temp4, loop_size[shift]);
1573            __ subq(temp1, loop_size[shift]);
1574            __ jcc(Assembler::greater, L_main_loop);
1575 
1576         __ addq(temp1, loop_size[shift]);
1577 
1578         // Tail loop.
1579         __ jmp(L_tail);
1580 
1581         __ BIND(L_repmovs);
1582           __ movq(temp2, temp1);
1583           // Swap 'to' (RSI) and 'from' (RDI): REP MOVS expects the source in RSI and the destination in RDI.
1584           __ movq(temp3, to);
1585           __ movq(to,  from);
1586           __ movq(from, temp3);
1587           // Save to/from for restoration post rep_mov.
1588           __ movq(temp1, to);
1589           __ movq(temp3, from);
1590           if (shift < 3) {
1591             __ shrq(temp2, 3-shift);     // quad word count
1592           }
1593           __ movq(temp4, temp2);         // move quad word count into temp4 (RCX).
1594           __ rep_mov();
1595           __ shlq(temp2, 3);             // convert quad words into byte count.
1596           if (shift) {
1597             __ shrq(temp2, shift);       // type specific count.
1598           }
1599           // Restore original addresses in to/from.
1600           __ movq(to, temp3);
1601           __ movq(from, temp1);
1602           __ movq(temp4, temp2);
1603           __ movq(temp1, count);
1604           __ subq(temp1, temp2);         // trailing part (less than a quad word in size).
1605           __ jmp(L_tail);
1606       }
1607 
1608       if (MaxVectorSize > 32) {
1609         __ BIND(L_pre_main_post_64);
1610         // Partial copy to make dst address 64 byte aligned.
1611         __ movq(temp2, to);
1612         __ andq(temp2, 63);
1613         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1614 
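             // temp2 = number of leading elements to copy so that 'to' becomes 64 byte
             // aligned: (64 - (to & 63)) bytes, converted to elements as above.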
1615         __ negptr(temp2);
1616         __ addq(temp2, 64);
1617         if (shift) {
1618           __ shrq(temp2, shift);
1619         }
1620         __ movq(temp3, temp2);
1621         __ copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
1622         __ movq(temp4, temp2);
1623         __ movq(temp1, count);
1624         __ subq(temp1, temp2);
1625 
1626         __ cmpq(temp1, loop_size[shift]);
1627         __ jcc(Assembler::less, L_tail64);
1628 
1629         __ BIND(L_main_pre_loop_64bytes);
1630         __ subq(temp1, loop_size[shift]);
1631 
1632         // Main loop with aligned copy block size of 192 bytes at
1633         // 64 byte copy granularity.
1634         __ align32();
1635         __ BIND(L_main_loop_64bytes);
1636            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
1637            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
1638            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
1639            __ addptr(temp4, loop_size[shift]);
1640            __ subq(temp1, loop_size[shift]);
1641            __ jcc(Assembler::greater, L_main_loop_64bytes);
1642 
1643         __ addq(temp1, loop_size[shift]);
1644         // Zero length check.
1645         __ jcc(Assembler::lessEqual, L_exit);
1646 
1647         __ BIND(L_tail64);
1648 
1649         // Tail handling using 64 byte [masked] vector copy operations.
1650         use64byteVector = true;
1651         __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1652                                         temp4, temp3, use64byteVector, L_entry, L_exit);
1653       }
1654       __ BIND(L_exit);
1655     }
1656 
1657     address ucme_exit_pc = __ pc();
1658     // When called from generic_arraycopy, r11 is expected to hold values needed by the
1659     // arraycopy epilogue; it was used as a temp above, so re-initialize it here.
1660     if (is_oop) {
1661       __ movq(r11, shift == 3 ? count : to);
1662     }
1663     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1664     restore_argument_regs(type);
1665     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1666     __ xorptr(rax, rax); // return 0
1667     __ vzeroupper();
1668     __ leave(); // required for proper stackwalking of RuntimeStub frame
1669     __ ret(0);
1670     return start;
1671   }
1672 
1673   // Inputs:
1674   //   c_rarg0   - source array address
1675   //   c_rarg1   - destination array address
1676   //   c_rarg2   - element count, treated as ssize_t, can be zero
1677   //
1678   //
1679   address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1680                                              address nooverlap_target, bool aligned, bool is_oop,
1681                                              bool dest_uninitialized) {
1682     __ align(CodeEntryAlignment);
1683     StubCodeMark mark(this, "StubRoutines", name);
1684     address start = __ pc();
1685 
1686     int avx3threshold = VM_Version::avx3_threshold();
1687     bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
1688 
1689     Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1690     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1691     const Register from        = rdi;  // source array address
1692     const Register to          = rsi;  // destination array address
1693     const Register count       = rdx;  // elements count
1694     const Register temp1       = r8;
1695     const Register temp2       = rcx;
1696     const Register temp3       = r11;
1697     const Register temp4       = rax;
1698     // End pointers are inclusive, and if count is not zero they point
1699     // to the last unit copied:  end_to[0] := end_from[0]
1700 
1701     __ enter(); // required for proper stackwalking of RuntimeStub frame
1702     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1703 
1704     if (entry != NULL) {
1705       *entry = __ pc();
1706        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1707       BLOCK_COMMENT("Entry:");
1708     }
1709 
1710     array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
1711 
1712     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1713     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1714 
1715     setup_argument_regs(type);
1716 
1717     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1718     if (dest_uninitialized) {
1719       decorators |= IS_DEST_UNINITIALIZED;
1720     }
1721     if (aligned) {
1722       decorators |= ARRAYCOPY_ALIGNED;
1723     }
1724     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1725     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1726     {
1727       // Type(shift)       byte(0), short(1), int(2),   long(3)
1728       int loop_size[]   = { 192,     96,       48,      24};
1729       int threshold[]   = { 4096,    2048,     1024,    512};
1730 
1731       // UnsafeCopyMemory page error: continue after ucm
1732       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1733       // 'from', 'to' and 'count' are now valid
1734 
1735       // temp1 holds remaining count.
1736       __ movq(temp1, count);
1737 
1738       // Zero length check.
1739       __ BIND(L_tail);
1740       __ cmpq(temp1, 0);
1741       __ jcc(Assembler::lessEqual, L_exit);
1742 
1743       __ mov64(temp2, 0);
1744       __ movq(temp3, temp1);
1745       // Special cases using 32 byte [masked] vector copy operations.
1746       __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1747                                                temp4, use64byteVector, L_entry, L_exit);
1748 
1749       // PRE-MAIN-POST loop for aligned copy.
1750       __ BIND(L_entry);
1751 
1752       if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
1753         __ cmpq(temp1, threshold[shift]);
1754         __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1755       }
1756 
1757       if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
1758         // Partial copy to make dst address 32 byte aligned.
1759         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1760         __ andq(temp2, 31);
1761         __ jcc(Assembler::equal, L_main_pre_loop);
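             // For this backward copy we align the end of the destination: temp2 holds the
             // byte misalignment of (to + count*scale), which is converted to an element
             // count below and copied as a trailing masked block before the main loop.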
1762 
1763         if (shift) {
1764           __ shrq(temp2, shift);
1765         }
1766         __ subq(temp1, temp2);
1767         __ copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
1768 
1769         __ cmpq(temp1, loop_size[shift]);
1770         __ jcc(Assembler::less, L_tail);
1771 
1772         __ BIND(L_main_pre_loop);
1773 
1774         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
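             // The copy runs from high to low addresses: each iteration copies the three
             // 64 byte blocks that end at (to/from + temp1*scale), then temp1 is reduced
             // by the 192 byte block size.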
1775         __ align32();
1776         __ BIND(L_main_loop);
1777            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
1778            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
1779            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192);
1780            __ subptr(temp1, loop_size[shift]);
1781            __ cmpq(temp1, loop_size[shift]);
1782            __ jcc(Assembler::greater, L_main_loop);
1783 
1784         // Tail loop.
1785         __ jmp(L_tail);
1786       }
1787 
1788       if (MaxVectorSize > 32) {
1789         __ BIND(L_pre_main_post_64);
1790         // Partial copy to make dst address 64 byte aligned.
1791         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1792         __ andq(temp2, 63);
1793         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
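             // Same end-of-destination alignment as above, but to a 64 byte boundary.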
1794 
1795         if (shift) {
1796           __ shrq(temp2, shift);
1797         }
1798         __ subq(temp1, temp2);
1799         __ copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
1800 
1801         __ cmpq(temp1, loop_size[shift]);
1802         __ jcc(Assembler::less, L_tail64);
1803 
1804         __ BIND(L_main_pre_loop_64bytes);
1805 
1806         // Main loop with aligned copy block size of 192 bytes at
1807         // 64 byte copy granularity.
1808         __ align32();
1809         __ BIND(L_main_loop_64bytes);
1810            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
1811            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
1812            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
1813            __ subq(temp1, loop_size[shift]);
1814            __ cmpq(temp1, loop_size[shift]);
1815            __ jcc(Assembler::greater, L_main_loop_64bytes);
1816 
1817         // Zero length check.
1818         __ cmpq(temp1, 0);
1819         __ jcc(Assembler::lessEqual, L_exit);
1820 
1821         __ BIND(L_tail64);
1822 
1823         // Tail handling using 64 byte [masked] vector copy operations.
1824         use64byteVector = true;
1825         __ mov64(temp2, 0);
1826         __ movq(temp3, temp1);
1827         __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1828                                                  temp4, use64byteVector, L_entry, L_exit);
1829       }
1830       __ BIND(L_exit);
1831     }
1832     address ucme_exit_pc = __ pc();
1833     // When called from generic_arraycopy, r11 is expected to hold values needed by the
1834     // arraycopy epilogue; it was used as a temp above, so re-initialize it here.
1835     if(is_oop) {
1836       __ movq(r11, count);
1837     }
1838     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1839     restore_argument_regs(type);
1840     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1841     __ xorptr(rax, rax); // return 0
1842     __ vzeroupper();
1843     __ leave(); // required for proper stackwalking of RuntimeStub frame
1844     __ ret(0);
1845     return start;
1846   }
1847 #endif // COMPILER2_OR_JVMCI
1848 
1849 
1850   // Arguments:
1851   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1852   //             ignored
1853   //   name    - stub name string
1854   //
1855   // Inputs:
1856   //   c_rarg0   - source array address
1857   //   c_rarg1   - destination array address
1858   //   c_rarg2   - element count, treated as ssize_t, can be zero
1859   //
1860   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1861   // we let the hardware handle it.  The one to eight bytes within words,
1862   // dwords or qwords that span cache line boundaries will still be loaded
1863   // and stored atomically.
1864   //
1865   // Side Effects:
1866   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1867   //   used by generate_conjoint_byte_copy().
1868   //
1869   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1870 #if COMPILER2_OR_JVMCI
1871     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1872        return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
1873                                                  aligned, false, false);
1874     }
1875 #endif
1876     __ align(CodeEntryAlignment);
1877     StubCodeMark mark(this, "StubRoutines", name);
1878     address start = __ pc();
1879 
1880     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1881     Label L_copy_byte, L_exit;
1882     const Register from        = rdi;  // source array address
1883     const Register to          = rsi;  // destination array address
1884     const Register count       = rdx;  // elements count
1885     const Register byte_count  = rcx;
1886     const Register qword_count = count;
1887     const Register end_from    = from; // source array end address
1888     const Register end_to      = to;   // destination array end address
1889     // End pointers are inclusive, and if count is not zero they point
1890     // to the last unit copied:  end_to[0] := end_from[0]
1891 
1892     __ enter(); // required for proper stackwalking of RuntimeStub frame
1893     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1894 
1895     if (entry != NULL) {
1896       *entry = __ pc();
1897        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1898       BLOCK_COMMENT("Entry:");
1899     }
1900 
1901     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1902                       // r9 and r10 may be used to save non-volatile registers
1903 
1904     {
1905       // UnsafeCopyMemory page error: continue after ucm
1906       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1907       // 'from', 'to' and 'count' are now valid
1908       __ movptr(byte_count, count);
1909       __ shrptr(count, 3); // count => qword_count
1910 
1911       // Copy from low to high addresses.  Use 'to' as scratch.
1912       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1913       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1914       __ negptr(qword_count); // make the count negative
1915       __ jmp(L_copy_bytes);
1916 
1917       // Copy trailing qwords
1918     __ BIND(L_copy_8_bytes);
1919       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1920       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1921       __ increment(qword_count);
1922       __ jcc(Assembler::notZero, L_copy_8_bytes);
1923 
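           // end_from/end_to point 8 bytes below the first byte not covered by the qword
           // copy, so the trailing dword/word/byte moves below use offset 8.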
1924       // Check for and copy trailing dword
1925     __ BIND(L_copy_4_bytes);
1926       __ testl(byte_count, 4);
1927       __ jccb(Assembler::zero, L_copy_2_bytes);
1928       __ movl(rax, Address(end_from, 8));
1929       __ movl(Address(end_to, 8), rax);
1930 
1931       __ addptr(end_from, 4);
1932       __ addptr(end_to, 4);
1933 
1934       // Check for and copy trailing word
1935     __ BIND(L_copy_2_bytes);
1936       __ testl(byte_count, 2);
1937       __ jccb(Assembler::zero, L_copy_byte);
1938       __ movw(rax, Address(end_from, 8));
1939       __ movw(Address(end_to, 8), rax);
1940 
1941       __ addptr(end_from, 2);
1942       __ addptr(end_to, 2);
1943 
1944       // Check for and copy trailing byte
1945     __ BIND(L_copy_byte);
1946       __ testl(byte_count, 1);
1947       __ jccb(Assembler::zero, L_exit);
1948       __ movb(rax, Address(end_from, 8));
1949       __ movb(Address(end_to, 8), rax);
1950     }
1951   __ BIND(L_exit);
1952     address ucme_exit_pc = __ pc();
1953     restore_arg_regs();
1954     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1955     __ xorptr(rax, rax); // return 0
1956     __ vzeroupper();
1957     __ leave(); // required for proper stackwalking of RuntimeStub frame
1958     __ ret(0);
1959 
1960     {
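           // The bulk copy is emitted out of line, after the stub epilogue: the jmp(L_copy_bytes)
           // above lands here, and copy_bytes_forward branches back to the trailing-qword and
           // trailing-dword handling above once only a tail remains.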
1961       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
1962       // Copy in multi-byte chunks
1963       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1964       __ jmp(L_copy_4_bytes);
1965     }
1966     return start;
1967   }
1968 
1969   // Arguments:
1970   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1971   //             ignored
1972   //   name    - stub name string
1973   //
1974   // Inputs:
1975   //   c_rarg0   - source array address
1976   //   c_rarg1   - destination array address
1977   //   c_rarg2   - element count, treated as ssize_t, can be zero
1978   //
1979   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1980   // we let the hardware handle it.  The one to eight bytes within words,
1981   // dwords or qwords that span cache line boundaries will still be loaded
1982   // and stored atomically.
1983   //
1984   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1985                                       address* entry, const char *name) {
1986 #if COMPILER2_OR_JVMCI
1987     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1988        return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
1989                                                  nooverlap_target, aligned, false, false);
1990     }
1991 #endif
1992     __ align(CodeEntryAlignment);
1993     StubCodeMark mark(this, "StubRoutines", name);
1994     address start = __ pc();
1995 
1996     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1997     const Register from        = rdi;  // source array address
1998     const Register to          = rsi;  // destination array address
1999     const Register count       = rdx;  // elements count
2000     const Register byte_count  = rcx;
2001     const Register qword_count = count;
2002 
2003     __ enter(); // required for proper stackwalking of RuntimeStub frame
2004     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2005 
2006     if (entry != NULL) {
2007       *entry = __ pc();
2008       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2009       BLOCK_COMMENT("Entry:");
2010     }
2011 
2012     array_overlap_test(nooverlap_target, Address::times_1);
2013     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2014                       // r9 and r10 may be used to save non-volatile registers
2015 
2016     {
2017       // UnsafeCopyMemory page error: continue after ucm
2018       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2019       // 'from', 'to' and 'count' are now valid
2020       __ movptr(byte_count, count);
2021       __ shrptr(count, 3);   // count => qword_count
2022 
2023       // Copy from high to low addresses.
2024 
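           // The tail (byte, then word, then dword) is copied first from the high end;
           // the bulk of the data is then copied backward in qword chunks.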
2025       // Check for and copy trailing byte
2026       __ testl(byte_count, 1);
2027       __ jcc(Assembler::zero, L_copy_2_bytes);
2028       __ movb(rax, Address(from, byte_count, Address::times_1, -1));
2029       __ movb(Address(to, byte_count, Address::times_1, -1), rax);
2030       __ decrement(byte_count); // Adjust for possible trailing word
2031 
2032       // Check for and copy trailing word
2033     __ BIND(L_copy_2_bytes);
2034       __ testl(byte_count, 2);
2035       __ jcc(Assembler::zero, L_copy_4_bytes);
2036       __ movw(rax, Address(from, byte_count, Address::times_1, -2));
2037       __ movw(Address(to, byte_count, Address::times_1, -2), rax);
2038 
2039       // Check for and copy trailing dword
2040     __ BIND(L_copy_4_bytes);
2041       __ testl(byte_count, 4);
2042       __ jcc(Assembler::zero, L_copy_bytes);
2043       __ movl(rax, Address(from, qword_count, Address::times_8));
2044       __ movl(Address(to, qword_count, Address::times_8), rax);
2045       __ jmp(L_copy_bytes);
2046 
2047       // Copy trailing qwords
2048     __ BIND(L_copy_8_bytes);
2049       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2050       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2051       __ decrement(qword_count);
2052       __ jcc(Assembler::notZero, L_copy_8_bytes);
2053     }
2054     restore_arg_regs();
2055     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
2056     __ xorptr(rax, rax); // return 0
2057     __ vzeroupper();
2058     __ leave(); // required for proper stackwalking of RuntimeStub frame
2059     __ ret(0);
2060 
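         // Out-of-line bulk copy: copy_bytes_backward branches back to the L_copy_8_bytes
         // loop above while trailing qwords remain, and otherwise falls through to the
         // second epilogue below.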
2061     {
2062       // UnsafeCopyMemory page error: continue after ucm
2063       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2064       // Copy in multi-byte chunks
2065       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2066     }
2067     restore_arg_regs();
2068     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
2069     __ xorptr(rax, rax); // return 0
2070     __ vzeroupper();
2071     __ leave(); // required for proper stackwalking of RuntimeStub frame
2072     __ ret(0);
2073 
2074     return start;
2075   }
2076 
2077   // Arguments:
2078   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2079   //             ignored
2080   //   name    - stub name string
2081   //
2082   // Inputs:
2083   //   c_rarg0   - source array address
2084   //   c_rarg1   - destination array address
2085   //   c_rarg2   - element count, treated as ssize_t, can be zero
2086   //
2087   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2088   // let the hardware handle it.  The two or four words within dwords
2089   // or qwords that span cache line boundaries will still be loaded
2090   // and stored atomically.
2091   //
2092   // Side Effects:
2093   //   disjoint_short_copy_entry is set to the no-overlap entry point
2094   //   used by generate_conjoint_short_copy().
2095   //
2096   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
2097 #if COMPILER2_OR_JVMCI
2098     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2099        return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
2100                                                  aligned, false, false);
2101     }
2102 #endif
2103 
2104     __ align(CodeEntryAlignment);
2105     StubCodeMark mark(this, "StubRoutines", name);
2106     address start = __ pc();
2107 
2108     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
2109     const Register from        = rdi;  // source array address
2110     const Register to          = rsi;  // destination array address
2111     const Register count       = rdx;  // elements count
2112     const Register word_count  = rcx;
2113     const Register qword_count = count;
2114     const Register end_from    = from; // source array end address
2115     const Register end_to      = to;   // destination array end address
2116     // End pointers are inclusive, and if count is not zero they point
2117     // to the last unit copied:  end_to[0] := end_from[0]
2118 
2119     __ enter(); // required for proper stackwalking of RuntimeStub frame
2120     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2121 
2122     if (entry != NULL) {
2123       *entry = __ pc();
2124       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2125       BLOCK_COMMENT("Entry:");
2126     }
2127 
2128     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2129                       // r9 and r10 may be used to save non-volatile registers
2130 
2131     {
2132       // UnsafeCopyMemory page error: continue after ucm
2133       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2134       // 'from', 'to' and 'count' are now valid
2135       __ movptr(word_count, count);
2136       __ shrptr(count, 2); // count => qword_count
2137 
2138       // Copy from low to high addresses.  Use 'to' as scratch.
2139       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2140       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2141       __ negptr(qword_count);
2142       __ jmp(L_copy_bytes);
2143 
2144       // Copy trailing qwords
2145     __ BIND(L_copy_8_bytes);
2146       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2147       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2148       __ increment(qword_count);
2149       __ jcc(Assembler::notZero, L_copy_8_bytes);
2150 
2151       // Original 'dest' is trashed, so we can't use it as a
2152       // base register for a possible trailing word copy
2153 
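           // As in the byte copy above, offset 8 from end_from/end_to addresses the first
           // element not covered by the qword copy.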
2154       // Check for and copy trailing dword
2155     __ BIND(L_copy_4_bytes);
2156       __ testl(word_count, 2);
2157       __ jccb(Assembler::zero, L_copy_2_bytes);
2158       __ movl(rax, Address(end_from, 8));
2159       __ movl(Address(end_to, 8), rax);
2160 
2161       __ addptr(end_from, 4);
2162       __ addptr(end_to, 4);
2163 
2164       // Check for and copy trailing word
2165     __ BIND(L_copy_2_bytes);
2166       __ testl(word_count, 1);
2167       __ jccb(Assembler::zero, L_exit);
2168       __ movw(rax, Address(end_from, 8));
2169       __ movw(Address(end_to, 8), rax);
2170     }
2171   __ BIND(L_exit);
2172     address ucme_exit_pc = __ pc();
2173     restore_arg_regs();
2174     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2175     __ xorptr(rax, rax); // return 0
2176     __ vzeroupper();
2177     __ leave(); // required for proper stackwalking of RuntimeStub frame
2178     __ ret(0);
2179 
2180     {
2181       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
2182       // Copy in multi-byte chunks
2183       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2184       __ jmp(L_copy_4_bytes);
2185     }
2186 
2187     return start;
2188   }
2189 
2190   address generate_fill(BasicType t, bool aligned, const char *name) {
2191     __ align(CodeEntryAlignment);
2192     StubCodeMark mark(this, "StubRoutines", name);
2193     address start = __ pc();
2194 
2195     BLOCK_COMMENT("Entry:");
2196 
2197     const Register to       = c_rarg0;  // destination array address
2198     const Register value    = c_rarg1;  // value
2199     const Register count    = c_rarg2;  // elements count
2200     __ mov(r11, count);
2201 
2202     __ enter(); // required for proper stackwalking of RuntimeStub frame
2203 
2204     __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
2205 
2206     __ vzeroupper();
2207     __ leave(); // required for proper stackwalking of RuntimeStub frame
2208     __ ret(0);
2209     return start;
2210   }
2211 
2212   // Arguments:
2213   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2214   //             ignored
2215   //   name    - stub name string
2216   //
2217   // Inputs:
2218   //   c_rarg0   - source array address
2219   //   c_rarg1   - destination array address
2220   //   c_rarg2   - element count, treated as ssize_t, can be zero
2221   //
2222   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2223   // let the hardware handle it.  The two or four words within dwords
2224   // or qwords that span cache line boundaries will still be loaded
2225   // and stored atomically.
2226   //
2227   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
2228                                        address *entry, const char *name) {
2229 #if COMPILER2_OR_JVMCI
2230     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2231        return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
2232                                                  nooverlap_target, aligned, false, false);
2233     }
2234 #endif
2235     __ align(CodeEntryAlignment);
2236     StubCodeMark mark(this, "StubRoutines", name);
2237     address start = __ pc();
2238 
2239     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2240     const Register from        = rdi;  // source array address
2241     const Register to          = rsi;  // destination array address
2242     const Register count       = rdx;  // elements count
2243     const Register word_count  = rcx;
2244     const Register qword_count = count;
2245 
2246     __ enter(); // required for proper stackwalking of RuntimeStub frame
2247     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2248 
2249     if (entry != NULL) {
2250       *entry = __ pc();
2251       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2252       BLOCK_COMMENT("Entry:");
2253     }
2254 
2255     array_overlap_test(nooverlap_target, Address::times_2);
2256     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2257                       // r9 and r10 may be used to save non-volatile registers
2258 
2259     {
2260       // UnsafeCopyMemory page error: continue after ucm
2261       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2262       // 'from', 'to' and 'count' are now valid
2263       __ movptr(word_count, count);
2264       __ shrptr(count, 2); // count => qword_count
2265 
2266       // Copy from high to low addresses.  Use 'to' as scratch.
2267 
2268       // Check for and copy trailing word
2269       __ testl(word_count, 1);
2270       __ jccb(Assembler::zero, L_copy_4_bytes);
2271       __ movw(rax, Address(from, word_count, Address::times_2, -2));
2272       __ movw(Address(to, word_count, Address::times_2, -2), rax);
2273 
2274       // Check for and copy trailing dword
2275     __ BIND(L_copy_4_bytes);
2276       __ testl(word_count, 2);
2277       __ jcc(Assembler::zero, L_copy_bytes);
2278       __ movl(rax, Address(from, qword_count, Address::times_8));
2279       __ movl(Address(to, qword_count, Address::times_8), rax);
2280       __ jmp(L_copy_bytes);
2281 
2282       // Copy trailing qwords
2283     __ BIND(L_copy_8_bytes);
2284       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2285       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2286       __ decrement(qword_count);
2287       __ jcc(Assembler::notZero, L_copy_8_bytes);
2288     }
2289     restore_arg_regs();
2290     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2291     __ xorptr(rax, rax); // return 0
2292     __ vzeroupper();
2293     __ leave(); // required for proper stackwalking of RuntimeStub frame
2294     __ ret(0);
2295 
2296     {
2297       // UnsafeCopyMemory page error: continue after ucm
2298       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2299       // Copy in multi-byte chunks
2300       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2301     }
2302     restore_arg_regs();
2303     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2304     __ xorptr(rax, rax); // return 0
2305     __ vzeroupper();
2306     __ leave(); // required for proper stackwalking of RuntimeStub frame
2307     __ ret(0);
2308 
2309     return start;
2310   }
2311 
2312   // Arguments:
2313   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2314   //             ignored
2315   //   is_oop  - true => oop array, so generate store check code
2316   //   name    - stub name string
2317   //
2318   // Inputs:
2319   //   c_rarg0   - source array address
2320   //   c_rarg1   - destination array address
2321   //   c_rarg2   - element count, treated as ssize_t, can be zero
2322   //
2323   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2324   // the hardware handle it.  The two dwords within qwords that span
2325   // cache line boundaries will still be loaded and stored atomically.
2326   //
2327   // Side Effects:
2328   //   disjoint_int_copy_entry is set to the no-overlap entry point
2329   //   used by generate_conjoint_int_oop_copy().
2330   //
2331   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2332                                          const char *name, bool dest_uninitialized = false) {
2333 #if COMPILER2_OR_JVMCI
2334     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2335        return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
2336                                                  aligned, is_oop, dest_uninitialized);
2337     }
2338 #endif
2339 
2340     __ align(CodeEntryAlignment);
2341     StubCodeMark mark(this, "StubRoutines", name);
2342     address start = __ pc();
2343 
2344     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2345     const Register from        = rdi;  // source array address
2346     const Register to          = rsi;  // destination array address
2347     const Register count       = rdx;  // elements count
2348     const Register dword_count = rcx;
2349     const Register qword_count = count;
2350     const Register end_from    = from; // source array end address
2351     const Register end_to      = to;   // destination array end address
2352     // End pointers are inclusive, and if count is not zero they point
2353     // to the last unit copied:  end_to[0] := end_from[0]
2354 
2355     __ enter(); // required for proper stackwalking of RuntimeStub frame
2356     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2357 
2358     if (entry != NULL) {
2359       *entry = __ pc();
2360       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2361       BLOCK_COMMENT("Entry:");
2362     }
2363 
2364     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2365                                    // r9 is used to save r15_thread
2366 
2367     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2368     if (dest_uninitialized) {
2369       decorators |= IS_DEST_UNINITIALIZED;
2370     }
2371     if (aligned) {
2372       decorators |= ARRAYCOPY_ALIGNED;
2373     }
2374 
2375     BasicType type = is_oop ? T_OBJECT : T_INT;
2376     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2377     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2378 
2379     {
2380       // UnsafeCopyMemory page error: continue after ucm
2381       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2382       // 'from', 'to' and 'count' are now valid
2383       __ movptr(dword_count, count);
2384       __ shrptr(count, 1); // count => qword_count
2385 
2386       // Copy from low to high addresses.  Use 'to' as scratch.
2387       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2388       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2389       __ negptr(qword_count);
2390       __ jmp(L_copy_bytes);
2391 
2392       // Copy trailing qwords
2393     __ BIND(L_copy_8_bytes);
2394       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2395       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2396       __ increment(qword_count);
2397       __ jcc(Assembler::notZero, L_copy_8_bytes);
2398 
2399       // Check for and copy trailing dword
2400     __ BIND(L_copy_4_bytes);
2401       __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2402       __ jccb(Assembler::zero, L_exit);
2403       __ movl(rax, Address(end_from, 8));
2404       __ movl(Address(end_to, 8), rax);
2405     }
2406   __ BIND(L_exit);
2407     address ucme_exit_pc = __ pc();
2408     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2409     restore_arg_regs_using_thread();
2410     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2411     __ vzeroupper();
2412     __ xorptr(rax, rax); // return 0
2413     __ leave(); // required for proper stackwalking of RuntimeStub frame
2414     __ ret(0);
2415 
2416     {
2417       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
2418       // Copy in multi-byte chunks
2419       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2420       __ jmp(L_copy_4_bytes);
2421     }
2422 
2423     return start;
2424   }
2425 
2426   // Arguments:
2427   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2428   //             ignored
2429   //   is_oop  - true => oop array, so generate store check code
2430   //   name    - stub name string
2431   //
2432   // Inputs:
2433   //   c_rarg0   - source array address
2434   //   c_rarg1   - destination array address
2435   //   c_rarg2   - element count, treated as ssize_t, can be zero
2436   //
2437   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2438   // the hardware handle it.  The two dwords within qwords that span
2439   // cache line boundaries will still be loaded and stored atomically.
2440   //
2441   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2442                                          address *entry, const char *name,
2443                                          bool dest_uninitialized = false) {
2444 #if COMPILER2_OR_JVMCI
2445     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2446        return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
2447                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
2448     }
2449 #endif
2450     __ align(CodeEntryAlignment);
2451     StubCodeMark mark(this, "StubRoutines", name);
2452     address start = __ pc();
2453 
2454     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2455     const Register from        = rdi;  // source array address
2456     const Register to          = rsi;  // destination array address
2457     const Register count       = rdx;  // elements count
2458     const Register dword_count = rcx;
2459     const Register qword_count = count;
2460 
2461     __ enter(); // required for proper stackwalking of RuntimeStub frame
2462     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2463 
2464     if (entry != NULL) {
2465       *entry = __ pc();
2466        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2467       BLOCK_COMMENT("Entry:");
2468     }
2469 
2470     array_overlap_test(nooverlap_target, Address::times_4);
2471     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2472                                    // r9 is used to save r15_thread
2473 
2474     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2475     if (dest_uninitialized) {
2476       decorators |= IS_DEST_UNINITIALIZED;
2477     }
2478     if (aligned) {
2479       decorators |= ARRAYCOPY_ALIGNED;
2480     }
2481 
2482     BasicType type = is_oop ? T_OBJECT : T_INT;
2483     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2484     // no registers are destroyed by this call
2485     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2486 
2487     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2488     {
2489       // UnsafeCopyMemory page error: continue after ucm
2490       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2491       // 'from', 'to' and 'count' are now valid
2492       __ movptr(dword_count, count);
2493       __ shrptr(count, 1); // count => qword_count
2494 
2495       // Copy from high to low addresses.  Use 'to' as scratch.
2496 
2497       // Check for and copy trailing dword
2498       __ testl(dword_count, 1);
2499       __ jcc(Assembler::zero, L_copy_bytes);
2500       __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2501       __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2502       __ jmp(L_copy_bytes);
2503 
2504       // Copy trailing qwords
2505     __ BIND(L_copy_8_bytes);
2506       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2507       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2508       __ decrement(qword_count);
2509       __ jcc(Assembler::notZero, L_copy_8_bytes);
2510     }
2511     if (is_oop) {
2512       __ jmp(L_exit);
2513     }
2514     restore_arg_regs_using_thread();
2515     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2516     __ xorptr(rax, rax); // return 0
2517     __ vzeroupper();
2518     __ leave(); // required for proper stackwalking of RuntimeStub frame
2519     __ ret(0);
2520 
2521     {
2522       // UnsafeCopyMemory page error: continue after ucm
2523       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2524       // Copy in multi-byte chunks
2525       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2526     }
2527 
2528   __ BIND(L_exit);
2529     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2530     restore_arg_regs_using_thread();
2531     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2532     __ xorptr(rax, rax); // return 0
2533     __ vzeroupper();
2534     __ leave(); // required for proper stackwalking of RuntimeStub frame
2535     __ ret(0);
2536 
2537     return start;
2538   }
2539 
2540   // Arguments:
2541   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2542   //             ignored
2543   //   is_oop  - true => oop array, so generate store check code
2544   //   name    - stub name string
2545   //
2546   // Inputs:
2547   //   c_rarg0   - source array address
2548   //   c_rarg1   - destination array address
2549   //   c_rarg2   - element count, treated as ssize_t, can be zero
2550   //
2551   // Side Effects:
2552   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2553   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2554   //
2555   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2556                                           const char *name, bool dest_uninitialized = false) {
2557 #if COMPILER2_OR_JVMCI
2558     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2559        return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
2560                                                  aligned, is_oop, dest_uninitialized);
2561     }
2562 #endif
2563     __ align(CodeEntryAlignment);
2564     StubCodeMark mark(this, "StubRoutines", name);
2565     address start = __ pc();
2566 
2567     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2568     const Register from        = rdi;  // source array address
2569     const Register to          = rsi;  // destination array address
2570     const Register qword_count = rdx;  // elements count
2571     const Register end_from    = from; // source array end address
2572     const Register end_to      = rcx;  // destination array end address
2573     const Register saved_count = r11;
2574     // End pointers are inclusive, and if count is not zero they point
2575     // to the last unit copied:  end_to[0] := end_from[0]
2576 
2577     __ enter(); // required for proper stackwalking of RuntimeStub frame
2578     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2579     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2580 
2581     if (entry != NULL) {
2582       *entry = __ pc();
2583       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2584       BLOCK_COMMENT("Entry:");
2585     }
2586 
2587     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2588                                      // r9 is used to save r15_thread
2589     // 'from', 'to' and 'qword_count' are now valid
2590 
2591     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2592     if (dest_uninitialized) {
2593       decorators |= IS_DEST_UNINITIALIZED;
2594     }
2595     if (aligned) {
2596       decorators |= ARRAYCOPY_ALIGNED;
2597     }
2598 
2599     BasicType type = is_oop ? T_OBJECT : T_LONG;
2600     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2601     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2602     {
2603       // UnsafeCopyMemory page error: continue after ucm
2604       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2605 
2606       // Copy from low to high addresses.  Use 'to' as scratch.
2607       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2608       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2609       __ negptr(qword_count);
2610       __ jmp(L_copy_bytes);
2611 
2612       // Copy trailing qwords
2613     __ BIND(L_copy_8_bytes);
2614       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2615       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2616       __ increment(qword_count);
2617       __ jcc(Assembler::notZero, L_copy_8_bytes);
2618     }
2619     if (is_oop) {
2620       __ jmp(L_exit);
2621     } else {
2622       restore_arg_regs_using_thread();
2623       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2624       __ xorptr(rax, rax); // return 0
2625       __ vzeroupper();
2626       __ leave(); // required for proper stackwalking of RuntimeStub frame
2627       __ ret(0);
2628     }
2629 
2630     {
2631       // UnsafeCopyMemory page error: continue after ucm
2632       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2633       // Copy in multi-byte chunks
2634       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2635     }
2636 
2637     __ BIND(L_exit);
2638     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2639     restore_arg_regs_using_thread();
2640     if (is_oop) {
2641       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2642     } else {
2643       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2644     }
2645     __ vzeroupper();
2646     __ xorptr(rax, rax); // return 0
2647     __ leave(); // required for proper stackwalking of RuntimeStub frame
2648     __ ret(0);
2649 
2650     return start;
2651   }
2652 
2653   // Arguments:
2654   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2655   //             ignored
2656   //   is_oop  - true => oop array, so generate store check code
2657   //   name    - stub name string
2658   //
2659   // Inputs:
2660   //   c_rarg0   - source array address
2661   //   c_rarg1   - destination array address
2662   //   c_rarg2   - element count, treated as ssize_t, can be zero
2663   //
2664   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2665                                           address nooverlap_target, address *entry,
2666                                           const char *name, bool dest_uninitialized = false) {
2667 #if COMPILER2_OR_JVMCI
2668     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2669        return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
2670                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
2671     }
2672 #endif
2673     __ align(CodeEntryAlignment);
2674     StubCodeMark mark(this, "StubRoutines", name);
2675     address start = __ pc();
2676 
2677     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2678     const Register from        = rdi;  // source array address
2679     const Register to          = rsi;  // destination array address
2680     const Register qword_count = rdx;  // elements count
2681     const Register saved_count = rcx;
2682 
2683     __ enter(); // required for proper stackwalking of RuntimeStub frame
2684     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2685 
2686     if (entry != NULL) {
2687       *entry = __ pc();
2688       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2689       BLOCK_COMMENT("Entry:");
2690     }
2691 
2692     array_overlap_test(nooverlap_target, Address::times_8);
2693     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2694                                    // r9 is used to save r15_thread
2695     // 'from', 'to' and 'qword_count' are now valid
2696 
2697     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2698     if (dest_uninitialized) {
2699       decorators |= IS_DEST_UNINITIALIZED;
2700     }
2701     if (aligned) {
2702       decorators |= ARRAYCOPY_ALIGNED;
2703     }
2704 
2705     BasicType type = is_oop ? T_OBJECT : T_LONG;
2706     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2707     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2708     {
2709       // UnsafeCopyMemory page error: continue after ucm
2710       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2711 
2712       __ jmp(L_copy_bytes);
2713 
2714       // Copy trailing qwords
2715     __ BIND(L_copy_8_bytes);
2716       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2717       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2718       __ decrement(qword_count);
2719       __ jcc(Assembler::notZero, L_copy_8_bytes);
2720     }
2721     if (is_oop) {
2722       __ jmp(L_exit);
2723     } else {
2724       restore_arg_regs_using_thread();
2725       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2726       __ xorptr(rax, rax); // return 0
2727       __ vzeroupper();
2728       __ leave(); // required for proper stackwalking of RuntimeStub frame
2729       __ ret(0);
2730     }
2731     {
2732       // UnsafeCopyMemory page error: continue after ucm
2733       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2734 
2735       // Copy in multi-bytes chunks
2736       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2737     }
2738     __ BIND(L_exit);
2739     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2740     restore_arg_regs_using_thread();
2741     if (is_oop) {
2742       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2743     } else {
2744       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2745     }
2746     __ vzeroupper();
2747     __ xorptr(rax, rax); // return 0
2748     __ leave(); // required for proper stackwalking of RuntimeStub frame
2749     __ ret(0);
2750 
2751     return start;
2752   }
2753 
2754 
2755   // Helper for generating a dynamic type check.
2756   // Smashes no registers.
2757   void generate_type_check(Register sub_klass,
2758                            Register super_check_offset,
2759                            Register super_klass,
2760                            Label& L_success) {
2761     assert_different_registers(sub_klass, super_check_offset, super_klass);
2762 
2763     BLOCK_COMMENT("type_check:");
2764 
2765     Label L_miss;
2766 
2767     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2768                                      super_check_offset);
2769     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2770 
2771     // Fall through on failure!
2772     __ BIND(L_miss);
2773   }
2774 
2775   //
2776   //  Generate checkcasting array copy stub
2777   //
2778   //  Input:
2779   //    c_rarg0   - source array address
2780   //    c_rarg1   - destination array address
2781   //    c_rarg2   - element count, treated as ssize_t, can be zero
2782   //    c_rarg3   - size_t ckoff (super_check_offset)
2783   // not Win64
2784   //    c_rarg4   - oop ckval (super_klass)
2785   // Win64
2786   //    rsp+40    - oop ckval (super_klass)
2787   //
2788   //  Output:
2789   //    rax ==  0  -  success
2790   //    rax == -1^K - failure, where K is partial transfer count
2791   //
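       //  Illustrative example: if a type check fails after K == 3 elements have
       //  been stored, the stub returns rax == -1^3 == ~3 == -4, so a caller can
       //  recover the partial transfer count as ~rax.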
2792   address generate_checkcast_copy(const char *name, address *entry,
2793                                   bool dest_uninitialized = false) {
2794 
2795     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2796 
2797     // Input registers (after setup_arg_regs)
2798     const Register from        = rdi;   // source array address
2799     const Register to          = rsi;   // destination array address
2800     const Register length      = rdx;   // elements count
2801     const Register ckoff       = rcx;   // super_check_offset
2802     const Register ckval       = r8;    // super_klass
2803 
2804     // Registers used as temps (r13, r14 are save-on-entry)
2805     const Register end_from    = from;  // source array end address
2806     const Register end_to      = r13;   // destination array end address
2807     const Register count       = rdx;   // -(count_remaining)
2808     const Register r14_length  = r14;   // saved copy of length
2809     // End pointers are inclusive, and if length is not zero they point
2810     // to the last unit copied:  end_to[0] := end_from[0]
2811 
2812     const Register rax_oop    = rax;    // actual oop copied
2813     const Register r11_klass  = r11;    // oop._klass
2814 
2815     //---------------------------------------------------------------
2816     // Assembler stub will be used for this call to arraycopy
2817     // if the two arrays are subtypes of Object[] but the
2818     // destination array type is not equal to or a supertype
2819     // of the source type.  Each element must be separately
2820     // checked.
2821 
2822     __ align(CodeEntryAlignment);
2823     StubCodeMark mark(this, "StubRoutines", name);
2824     address start = __ pc();
2825 
2826     __ enter(); // required for proper stackwalking of RuntimeStub frame
2827 
2828 #ifdef ASSERT
2829     // caller guarantees that the arrays really are different
2830     // otherwise, we would have to make conjoint checks
2831     { Label L;
2832       array_overlap_test(L, TIMES_OOP);
2833       __ stop("checkcast_copy within a single array");
2834       __ bind(L);
2835     }
2836 #endif //ASSERT
2837 
2838     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2839                        // ckoff => rcx, ckval => r8
2840                        // r9 and r10 may be used to save non-volatile registers
2841 #ifdef _WIN64
2842     // last argument (#4) is on stack on Win64
2843     __ movptr(ckval, Address(rsp, 6 * wordSize));
2844 #endif
2845 
2846     // Caller of this entry point must set up the argument registers.
2847     if (entry != NULL) {
2848       *entry = __ pc();
2849       BLOCK_COMMENT("Entry:");
2850     }
2851 
2852     // allocate spill slots for r13, r14
2853     enum {
2854       saved_r13_offset,
2855       saved_r14_offset,
2856       saved_r10_offset,
2857       saved_rbp_offset
2858     };
2859     __ subptr(rsp, saved_rbp_offset * wordSize);
2860     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2861     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2862     __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2863 
2864 #ifdef ASSERT
2865       Label L2;
2866       __ get_thread(r14);
2867       __ cmpptr(r15_thread, r14);
2868       __ jcc(Assembler::equal, L2);
2869       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2870       __ bind(L2);
2871 #endif // ASSERT
2872 
2873     // check that int operands are properly extended to size_t
2874     assert_clean_int(length, rax);
2875     assert_clean_int(ckoff, rax);
2876 
2877 #ifdef ASSERT
2878     BLOCK_COMMENT("assert consistent ckoff/ckval");
2879     // The ckoff and ckval must be mutually consistent,
2880     // even though caller generates both.
2881     { Label L;
2882       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2883       __ cmpl(ckoff, Address(ckval, sco_offset));
2884       __ jcc(Assembler::equal, L);
2885       __ stop("super_check_offset inconsistent");
2886       __ bind(L);
2887     }
2888 #endif //ASSERT
2889 
2890     // Loop-invariant addresses.  They are exclusive end pointers.
2891     Address end_from_addr(from, length, TIMES_OOP, 0);
2892     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2893     // Loop-variant addresses.  They assume post-incremented count < 0.
2894     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2895     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2896 
2897     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2898     if (dest_uninitialized) {
2899       decorators |= IS_DEST_UNINITIALIZED;
2900     }
2901 
2902     BasicType type = T_OBJECT;
2903     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2904     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2905 
2906     // Copy from low to high addresses, indexed from the end of each array.
2907     __ lea(end_from, end_from_addr);
2908     __ lea(end_to,   end_to_addr);
2909     __ movptr(r14_length, length);        // save a copy of the length
2910     assert(length == count, "");          // else fix next line:
2911     __ negptr(count);                     // negate and test the length
2912     __ jcc(Assembler::notZero, L_load_element);
2913 
2914     // Empty array:  Nothing to do.
2915     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2916     __ jmp(L_done);
2917 
2918     // ======== begin loop ========
2919     // (Loop is rotated; its entry is L_load_element.)
2920     // Loop control:
2921     //   for (count = -count; count != 0; count++)
2922     // Base pointers src, dst are biased by 8*(count-1), to the last element.
2923     __ align(OptoLoopAlignment);
2924 
2925     __ BIND(L_store_element);
2926     __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
2927     __ increment(count);               // increment the count toward zero
2928     __ jcc(Assembler::zero, L_do_card_marks);
2929 
2930     // ======== loop entry is here ========
2931     __ BIND(L_load_element);
2932     __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2933     __ testptr(rax_oop, rax_oop);
2934     __ jcc(Assembler::zero, L_store_element);
2935 
2936     __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
2937     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2938     // ======== end loop ========
2939 
2940     // It was a real error; we must depend on the caller to finish the job.
2941     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2942     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2943     // and report their number to the caller.
2944     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2945     Label L_post_barrier;
2946     __ addptr(r14_length, count);     // K = (original - remaining) oops
2947     __ movptr(rax, r14_length);       // save the value
2948     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2949     __ jccb(Assembler::notZero, L_post_barrier);
2950     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2951 
2952     // Come here on success only.
2953     __ BIND(L_do_card_marks);
2954     __ xorptr(rax, rax);              // return 0 on success
2955 
2956     __ BIND(L_post_barrier);
2957     bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2958 
2959     // Common exit point (success or failure).
2960     __ BIND(L_done);
2961     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2962     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2963     __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2964     restore_arg_regs();
2965     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2966     __ leave(); // required for proper stackwalking of RuntimeStub frame
2967     __ ret(0);
2968 
2969     return start;
2970   }
2971 
2972   //
2973   //  Generate 'unsafe' array copy stub
2974   //  Though just as safe as the other stubs, it takes an unscaled
2975   //  size_t argument instead of an element count.
2976   //
2977   //  Input:
2978   //    c_rarg0   - source array address
2979   //    c_rarg1   - destination array address
2980   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2981   //
2982   // Examines the alignment of the operands and dispatches
2983   // to a long, int, short, or byte copy loop.
2984   //
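       //  Dispatch sketch (illustrative), with bits = from | to | size:
       //    (bits & 7) == 0  ->  long copy,   count = size >> 3
       //    (bits & 3) == 0  ->  int copy,    count = size >> 2
       //    (bits & 1) == 0  ->  short copy,  count = size >> 1
       //    otherwise        ->  byte copy,   count = size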
2985   address generate_unsafe_copy(const char *name,
2986                                address byte_copy_entry, address short_copy_entry,
2987                                address int_copy_entry, address long_copy_entry) {
2988 
2989     Label L_long_aligned, L_int_aligned, L_short_aligned;
2990 
2991     // Input registers (before setup_arg_regs)
2992     const Register from        = c_rarg0;  // source array address
2993     const Register to          = c_rarg1;  // destination array address
2994     const Register size        = c_rarg2;  // byte count (size_t)
2995 
2996     // Register used as a temp
2997     const Register bits        = rax;      // test copy of low bits
2998 
2999     __ align(CodeEntryAlignment);
3000     StubCodeMark mark(this, "StubRoutines", name);
3001     address start = __ pc();
3002 
3003     __ enter(); // required for proper stackwalking of RuntimeStub frame
3004 
3005     // bump this on entry, not on exit:
3006     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
3007 
3008     __ mov(bits, from);
3009     __ orptr(bits, to);
3010     __ orptr(bits, size);
3011 
3012     __ testb(bits, BytesPerLong-1);
3013     __ jccb(Assembler::zero, L_long_aligned);
3014 
3015     __ testb(bits, BytesPerInt-1);
3016     __ jccb(Assembler::zero, L_int_aligned);
3017 
3018     __ testb(bits, BytesPerShort-1);
3019     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
3020 
3021     __ BIND(L_short_aligned);
3022     __ shrptr(size, LogBytesPerShort); // size => short_count
3023     __ jump(RuntimeAddress(short_copy_entry));
3024 
3025     __ BIND(L_int_aligned);
3026     __ shrptr(size, LogBytesPerInt); // size => int_count
3027     __ jump(RuntimeAddress(int_copy_entry));
3028 
3029     __ BIND(L_long_aligned);
3030     __ shrptr(size, LogBytesPerLong); // size => qword_count
3031     __ jump(RuntimeAddress(long_copy_entry));
3032 
3033     return start;
3034   }
3035 
3036   // Perform range checks on the proposed arraycopy.
3037   // Kills temp, but nothing else.
3038   // Also, sign-extends src_pos and dst_pos so their upper 32 bits are clean.
3039   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
3040                               Register src_pos, // source position (c_rarg1)
3041                               Register dst,     // destination array oop (c_rarg2)
3042                               Register dst_pos, // destination position (c_rarg3)
3043                               Register length,
3044                               Register temp,
3045                               Label& L_failed) {
3046     BLOCK_COMMENT("arraycopy_range_checks:");
3047 
3048     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
3049     __ movl(temp, length);
3050     __ addl(temp, src_pos);             // src_pos + length
3051     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
3052     __ jcc(Assembler::above, L_failed);
3053 
3054     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
3055     __ movl(temp, length);
3056     __ addl(temp, dst_pos);             // dst_pos + length
3057     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
3058     __ jcc(Assembler::above, L_failed);
3059 
3060     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
3061     // Move with sign extension can be used since they are positive.
3062     __ movslq(src_pos, src_pos);
3063     __ movslq(dst_pos, dst_pos);
3064 
3065     BLOCK_COMMENT("arraycopy_range_checks done");
3066   }
3067 
3068   //
3069   //  Generate generic array copy stubs
3070   //
3071   //  Input:
3072   //    c_rarg0    -  src oop
3073   //    c_rarg1    -  src_pos (32-bits)
3074   //    c_rarg2    -  dst oop
3075   //    c_rarg3    -  dst_pos (32-bits)
3076   // not Win64
3077   //    c_rarg4    -  element count (32-bits)
3078   // Win64
3079   //    rsp+40     -  element count (32-bits)
3080   //
3081   //  Output:
3082   //    rax ==  0  -  success
3083   //    rax == -1^K - failure, where K is partial transfer count
3084   //
3085   address generate_generic_copy(const char *name,
3086                                 address byte_copy_entry, address short_copy_entry,
3087                                 address int_copy_entry, address oop_copy_entry,
3088                                 address long_copy_entry, address checkcast_copy_entry) {
3089 
3090     Label L_failed, L_failed_0, L_objArray;
3091     Label L_copy_shorts, L_copy_ints, L_copy_longs;
3092 
3093     // Input registers
3094     const Register src        = c_rarg0;  // source array oop
3095     const Register src_pos    = c_rarg1;  // source position
3096     const Register dst        = c_rarg2;  // destination array oop
3097     const Register dst_pos    = c_rarg3;  // destination position
3098 #ifndef _WIN64
3099     const Register length     = c_rarg4;
3100     const Register rklass_tmp = r9;  // load_klass
3101 #else
3102     const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
3103     const Register rklass_tmp = rdi;  // load_klass
3104 #endif
3105 
3106     { int modulus = CodeEntryAlignment;
3107       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
3108       int advance = target - (__ offset() % modulus);
3109       if (advance < 0)  advance += modulus;
3110       if (advance > 0)  __ nop(advance);
3111     }
3112     StubCodeMark mark(this, "StubRoutines", name);
3113 
3114     // Short-hop target to L_failed.  Makes for denser prologue code.
3115     __ BIND(L_failed_0);
3116     __ jmp(L_failed);
3117     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
3118 
3119     __ align(CodeEntryAlignment);
3120     address start = __ pc();
3121 
3122     __ enter(); // required for proper stackwalking of RuntimeStub frame
3123 
3124 #ifdef _WIN64
3125     __ push(rklass_tmp); // rdi is callee-save on Windows
3126 #endif
3127 
3128     // bump this on entry, not on exit:
3129     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
3130 
3131     //-----------------------------------------------------------------------
3132     // Assembler stub will be used for this call to arraycopy
3133     // if the following conditions are met:
3134     //
3135     // (1) src and dst must not be null.
3136     // (2) src_pos must not be negative.
3137     // (3) dst_pos must not be negative.
3138     // (4) length  must not be negative.
3139     // (5) src klass and dst klass should be the same and not NULL.
3140     // (6) src and dst should be arrays.
3141     // (7) src_pos + length must not exceed length of src.
3142     // (8) dst_pos + length must not exceed length of dst.
3143     //
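         // Roughly the Java-level equivalent of these checks (sketch, not emitted code):
         //   if (src == null || dst == null) fail;
         //   if (src_pos < 0 || dst_pos < 0 || length < 0) fail;
         //   if (src_pos + length > src.length || dst_pos + length > dst.length) fail;
         // Any failed check makes the stub return -1 and leaves the failure
         // handling to the caller.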
3144 
3145     //  if (src == NULL) return -1;
3146     __ testptr(src, src);         // src oop
3147     size_t j1off = __ offset();
3148     __ jccb(Assembler::zero, L_failed_0);
3149 
3150     //  if (src_pos < 0) return -1;
3151     __ testl(src_pos, src_pos); // src_pos (32-bits)
3152     __ jccb(Assembler::negative, L_failed_0);
3153 
3154     //  if (dst == NULL) return -1;
3155     __ testptr(dst, dst);         // dst oop
3156     __ jccb(Assembler::zero, L_failed_0);
3157 
3158     //  if (dst_pos < 0) return -1;
3159     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3160     size_t j4off = __ offset();
3161     __ jccb(Assembler::negative, L_failed_0);
3162 
3163     // The first four tests are very dense code,
3164     // but not quite dense enough to put four
3165     // jumps in a 16-byte instruction fetch buffer.
3166     // That's good, because some branch predictors
3167     // do not like jumps so close together.
3168     // Make sure of this.
3169     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
3170 
3171     // registers used as temp
3172     const Register r11_length    = r11; // elements count to copy
3173     const Register r10_src_klass = r10; // array klass
3174 
3175     //  if (length < 0) return -1;
3176     __ movl(r11_length, length);        // length (elements count, 32-bits value)
3177     __ testl(r11_length, r11_length);
3178     __ jccb(Assembler::negative, L_failed_0);
3179 
3180     __ load_klass(r10_src_klass, src, rklass_tmp);
3181 #ifdef ASSERT
3182     //  assert(src->klass() != NULL);
3183     {
3184       BLOCK_COMMENT("assert klasses not null {");
3185       Label L1, L2;
3186       __ testptr(r10_src_klass, r10_src_klass);
3187       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
3188       __ bind(L1);
3189       __ stop("broken null klass");
3190       __ bind(L2);
3191       __ load_klass(rax, dst, rklass_tmp);
3192       __ cmpq(rax, 0);
3193       __ jcc(Assembler::equal, L1);     // this would be broken also
3194       BLOCK_COMMENT("} assert klasses not null done");
3195     }
3196 #endif
3197 
3198     // Load layout helper (32-bits)
3199     //
3200     //  |array_tag|     | header_size | element_type |     |log2_element_size|
3201     // 32        30    24            16              8     2                 0
3202     //
3203     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3204     //
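         // Decoding sketch (illustrative): for a 32-bit layout helper 'lh',
         //   tag        = lh >> _lh_array_tag_shift;
         //   hsize      = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
         //   log2_esize = lh & _lh_log2_element_size_mask;
         // which is what the shift-and-mask sequences below extract.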
3205 
3206     const int lh_offset = in_bytes(Klass::layout_helper_offset());
3207 
3208     // Handle objArrays completely differently...
3209     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3210     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3211     __ jcc(Assembler::equal, L_objArray);
3212 
3213     //  if (src->klass() != dst->klass()) return -1;
3214     __ load_klass(rax, dst, rklass_tmp);
3215     __ cmpq(r10_src_klass, rax);
3216     __ jcc(Assembler::notEqual, L_failed);
3217 
3218     const Register rax_lh = rax;  // layout helper
3219     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3220 
3221     //  if (!src->is_Array()) return -1;
3222     __ cmpl(rax_lh, Klass::_lh_neutral_value);
3223     __ jcc(Assembler::greaterEqual, L_failed);
3224 
3225     // At this point, it is known to be a typeArray (array_tag 0x3).
3226 #ifdef ASSERT
3227     {
3228       BLOCK_COMMENT("assert primitive array {");
3229       Label L;
3230       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
3231       __ jcc(Assembler::greaterEqual, L);
3232       __ stop("must be a primitive array");
3233       __ bind(L);
3234       BLOCK_COMMENT("} assert primitive array done");
3235     }
3236 #endif
3237 
3238     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3239                            r10, L_failed);
3240 
3241     // TypeArrayKlass
3242     //
3243     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3244     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3245     //
3246 
3247     const Register r10_offset = r10;    // array offset
3248     const Register rax_elsize = rax_lh; // element size
3249 
3250     __ movl(r10_offset, rax_lh);
3251     __ shrl(r10_offset, Klass::_lh_header_size_shift);
3252     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
3253     __ addptr(src, r10_offset);           // src array offset
3254     __ addptr(dst, r10_offset);           // dst array offset
3255     BLOCK_COMMENT("choose copy loop based on element size");
3256     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3257 
3258 #ifdef _WIN64
3259     __ pop(rklass_tmp); // Restore callee-save rdi
3260 #endif
3261 
3262     // The next registers must be set before the jump to the corresponding stub
3263     const Register from     = c_rarg0;  // source array address
3264     const Register to       = c_rarg1;  // destination array address
3265     const Register count    = c_rarg2;  // elements count
3266 
3267     // 'from', 'to' and 'count' registers must be set in this order
3268     // since they occupy the same registers as 'src', 'src_pos' and 'dst'.
3269 
3270     __ cmpl(rax_elsize, 0);
3271     __ jccb(Assembler::notEqual, L_copy_shorts);
3272     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3273     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3274     __ movl2ptr(count, r11_length); // length
3275     __ jump(RuntimeAddress(byte_copy_entry));
3276 
3277   __ BIND(L_copy_shorts);
3278     __ cmpl(rax_elsize, LogBytesPerShort);
3279     __ jccb(Assembler::notEqual, L_copy_ints);
3280     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3281     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3282     __ movl2ptr(count, r11_length); // length
3283     __ jump(RuntimeAddress(short_copy_entry));
3284 
3285   __ BIND(L_copy_ints);
3286     __ cmpl(rax_elsize, LogBytesPerInt);
3287     __ jccb(Assembler::notEqual, L_copy_longs);
3288     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3289     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3290     __ movl2ptr(count, r11_length); // length
3291     __ jump(RuntimeAddress(int_copy_entry));
3292 
3293   __ BIND(L_copy_longs);
3294 #ifdef ASSERT
3295     {
3296       BLOCK_COMMENT("assert long copy {");
3297       Label L;
3298       __ cmpl(rax_elsize, LogBytesPerLong);
3299       __ jcc(Assembler::equal, L);
3300       __ stop("must be long copy, but elsize is wrong");
3301       __ bind(L);
3302       BLOCK_COMMENT("} assert long copy done");
3303     }
3304 #endif
3305     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3306     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3307     __ movl2ptr(count, r11_length); // length
3308     __ jump(RuntimeAddress(long_copy_entry));
3309 
3310     // ObjArrayKlass
3311   __ BIND(L_objArray);
3312     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3313 
3314     Label L_plain_copy, L_checkcast_copy;
3315     //  test array classes for subtyping
3316     __ load_klass(rax, dst, rklass_tmp);
3317     __ cmpq(r10_src_klass, rax); // usual case is exact equality
3318     __ jcc(Assembler::notEqual, L_checkcast_copy);
3319 
3320     // Identically typed arrays can be copied without element-wise checks.
3321     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3322                            r10, L_failed);
3323 
3324     __ lea(from, Address(src, src_pos, TIMES_OOP,
3325                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3326     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3327                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3328     __ movl2ptr(count, r11_length); // length
3329   __ BIND(L_plain_copy);
3330 #ifdef _WIN64
3331     __ pop(rklass_tmp); // Restore callee-save rdi
3332 #endif
3333     __ jump(RuntimeAddress(oop_copy_entry));
3334 
3335   __ BIND(L_checkcast_copy);
3336     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3337     {
3338       // Before looking at dst.length, make sure dst is also an objArray.
3339       __ cmpl(Address(rax, lh_offset), objArray_lh);
3340       __ jcc(Assembler::notEqual, L_failed);
3341 
3342       // It is safe to examine both src.length and dst.length.
3343       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3344                              rax, L_failed);
3345 
3346       const Register r11_dst_klass = r11;
3347       __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3348 
3349       // Marshal the base address arguments now, freeing registers.
3350       __ lea(from, Address(src, src_pos, TIMES_OOP,
3351                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3352       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3353                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3354       __ movl(count, length);           // length (reloaded)
3355       Register sco_temp = c_rarg3;      // this register is free now
3356       assert_different_registers(from, to, count, sco_temp,
3357                                  r11_dst_klass, r10_src_klass);
3358       assert_clean_int(count, sco_temp);
3359 
3360       // Generate the type check.
3361       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3362       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3363       assert_clean_int(sco_temp, rax);
3364       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3365 
3366       // Fetch destination element klass from the ObjArrayKlass header.
3367       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3368       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3369       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3370       assert_clean_int(sco_temp, rax);
3371 
3372 #ifdef _WIN64
3373       __ pop(rklass_tmp); // Restore callee-save rdi
3374 #endif
3375 
3376       // the checkcast_copy loop needs two extra arguments:
3377       assert(c_rarg3 == sco_temp, "#3 already in place");
3378       // Set up arguments for checkcast_copy_entry.
3379       setup_arg_regs(4);
3380       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3381       __ jump(RuntimeAddress(checkcast_copy_entry));
3382     }
3383 
3384   __ BIND(L_failed);
3385 #ifdef _WIN64
3386     __ pop(rklass_tmp); // Restore callee-save rdi
3387 #endif
3388     __ xorptr(rax, rax);
3389     __ notptr(rax); // return -1
3390     __ leave();   // required for proper stackwalking of RuntimeStub frame
3391     __ ret(0);
3392 
3393     return start;
3394   }
3395 
3396   address generate_data_cache_writeback() {
3397     const Register src        = c_rarg0;  // source address
3398 
3399     __ align(CodeEntryAlignment);
3400 
3401     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
3402 
3403     address start = __ pc();
3404     __ enter();
3405     __ cache_wb(Address(src, 0));
3406     __ leave();
3407     __ ret(0);
3408 
3409     return start;
3410   }
3411 
3412   address generate_data_cache_writeback_sync() {
3413     const Register is_pre    = c_rarg0;  // pre or post sync
3414 
3415     __ align(CodeEntryAlignment);
3416 
3417     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
3418 
3419     // pre wbsync is a no-op
3420     // post wbsync translates to an sfence
3421 
3422     Label skip;
3423     address start = __ pc();
3424     __ enter();
3425     __ cmpl(is_pre, 0);
3426     __ jcc(Assembler::notEqual, skip);
3427     __ cache_wbsync(false);
3428     __ bind(skip);
3429     __ leave();
3430     __ ret(0);
3431 
3432     return start;
3433   }
3434 
3435   void generate_arraycopy_stubs() {
3436     address entry;
3437     address entry_jbyte_arraycopy;
3438     address entry_jshort_arraycopy;
3439     address entry_jint_arraycopy;
3440     address entry_oop_arraycopy;
3441     address entry_jlong_arraycopy;
3442     address entry_checkcast_arraycopy;
3443 
3444     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
3445                                                                            "jbyte_disjoint_arraycopy");
3446     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3447                                                                            "jbyte_arraycopy");
3448 
3449     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3450                                                                             "jshort_disjoint_arraycopy");
3451     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3452                                                                             "jshort_arraycopy");
3453 
3454     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
3455                                                                               "jint_disjoint_arraycopy");
3456     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
3457                                                                               &entry_jint_arraycopy, "jint_arraycopy");
3458 
3459     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
3460                                                                                "jlong_disjoint_arraycopy");
3461     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
3462                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
3463 
3464 
3465     if (UseCompressedOops) {
3466       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
3467                                                                               "oop_disjoint_arraycopy");
3468       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
3469                                                                               &entry_oop_arraycopy, "oop_arraycopy");
3470       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
3471                                                                                      "oop_disjoint_arraycopy_uninit",
3472                                                                                      /*dest_uninitialized*/true);
3473       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
3474                                                                                      NULL, "oop_arraycopy_uninit",
3475                                                                                      /*dest_uninitialized*/true);
3476     } else {
3477       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
3478                                                                                "oop_disjoint_arraycopy");
3479       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
3480                                                                                &entry_oop_arraycopy, "oop_arraycopy");
3481       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
3482                                                                                       "oop_disjoint_arraycopy_uninit",
3483                                                                                       /*dest_uninitialized*/true);
3484       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
3485                                                                                       NULL, "oop_arraycopy_uninit",
3486                                                                                       /*dest_uninitialized*/true);
3487     }
3488 
3489     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3490     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3491                                                                         /*dest_uninitialized*/true);
3492 
3493     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3494                                                               entry_jbyte_arraycopy,
3495                                                               entry_jshort_arraycopy,
3496                                                               entry_jint_arraycopy,
3497                                                               entry_jlong_arraycopy);
3498     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3499                                                                entry_jbyte_arraycopy,
3500                                                                entry_jshort_arraycopy,
3501                                                                entry_jint_arraycopy,
3502                                                                entry_oop_arraycopy,
3503                                                                entry_jlong_arraycopy,
3504                                                                entry_checkcast_arraycopy);
3505 
3506     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3507     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3508     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3509     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3510     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3511     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3512 
3513     // We don't generate specialized code for HeapWord-aligned source
3514     // arrays, so just use the code we've already generated
3515     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3516     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3517 
3518     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3519     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3520 
3521     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3522     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3523 
3524     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3525     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3526 
3527     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3528     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3529 
3530     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3531     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3532   }
3533 
3534   // AES intrinsic stubs
3535   enum {AESBlockSize = 16};
3536 
3537   address generate_key_shuffle_mask() {
3538     __ align(16);
3539     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3540     address start = __ pc();
3541     __ emit_data64( 0x0405060700010203, relocInfo::none );
3542     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3543     return start;
3544   }
3545 
3546   address generate_counter_shuffle_mask() {
3547     __ align(16);
3548     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3549     address start = __ pc();
3550     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3551     __ emit_data64(0x0001020304050607, relocInfo::none);
3552     return start;
3553   }
3554 
3555   // Utility routine for loading a 128-bit key word in little endian format
3556   // can optionally specify that the shuffle mask is already in an xmm register
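       // (With the default mask the pshufb effectively byte-swaps each 32-bit word,
       //  turning the little-endian Java int[] key material into the byte order the
       //  AES instructions expect; see generate_key_shuffle_mask above.)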
3557   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3558     __ movdqu(xmmdst, Address(key, offset));
3559     if (xmm_shuf_mask != NULL) {
3560       __ pshufb(xmmdst, xmm_shuf_mask);
3561     } else {
3562       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3563     }
3564   }
3565 
3566   // Utility routine to increment a 128-bit counter (the iv in CTR mode)
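       // Sketch: the counter is treated as two 64-bit halves kept in xmmdst;
       //   lo += inc_delta;  if (lo wrapped around)  hi += 1;
       // pextrq/pinsrq shuttle each half through 'reg' for the 64-bit adds.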
3567   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3568     __ pextrq(reg, xmmdst, 0x0);
3569     __ addq(reg, inc_delta);
3570     __ pinsrq(xmmdst, reg, 0x0);
3571     __ jcc(Assembler::carryClear, next_block); // jump if no carry
3572     __ pextrq(reg, xmmdst, 0x01); // Carry
3573     __ addq(reg, 0x01);
3574     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3575     __ BIND(next_block);          // next instruction
3576   }
3577 
3578   // Arguments:
3579   //
3580   // Inputs:
3581   //   c_rarg0   - source byte array address
3582   //   c_rarg1   - destination byte array address
3583   //   c_rarg2   - K (key) in little endian int array
3584   //
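       //  Round-structure sketch (illustrative):
       //    state = plaintext ^ rk[0];
       //    for (r = 1; r < rounds; r++)  state = aesenc(state, rk[r]);
       //    ciphertext = aesenclast(state, rk[rounds]);
       //  where rounds is 10/12/14 for AES-128/192/256 (expanded key of 44/52/60 ints).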
3585   address generate_aescrypt_encryptBlock() {
3586     assert(UseAES, "need AES instructions and misaligned SSE support");
3587     __ align(CodeEntryAlignment);
3588     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3589     Label L_doLast;
3590     address start = __ pc();
3591 
3592     const Register from        = c_rarg0;  // source array address
3593     const Register to          = c_rarg1;  // destination array address
3594     const Register key         = c_rarg2;  // key array address
3595     const Register keylen      = rax;
3596 
3597     const XMMRegister xmm_result = xmm0;
3598     const XMMRegister xmm_key_shuf_mask = xmm1;
3599     // On win64 xmm6-xmm15 must be preserved so don't use them.
3600     const XMMRegister xmm_temp1  = xmm2;
3601     const XMMRegister xmm_temp2  = xmm3;
3602     const XMMRegister xmm_temp3  = xmm4;
3603     const XMMRegister xmm_temp4  = xmm5;
3604 
3605     __ enter(); // required for proper stackwalking of RuntimeStub frame
3606 
3607     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3608     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3609 
3610     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3611     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3612 
3613     // For encryption, the java expanded key ordering is just what we need
3614     // we don't know if the key is aligned, hence not using load-execute form
3615 
3616     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3617     __ pxor(xmm_result, xmm_temp1);
3618 
3619     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3620     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3621     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3622     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3623 
3624     __ aesenc(xmm_result, xmm_temp1);
3625     __ aesenc(xmm_result, xmm_temp2);
3626     __ aesenc(xmm_result, xmm_temp3);
3627     __ aesenc(xmm_result, xmm_temp4);
3628 
3629     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3630     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3631     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3632     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3633 
3634     __ aesenc(xmm_result, xmm_temp1);
3635     __ aesenc(xmm_result, xmm_temp2);
3636     __ aesenc(xmm_result, xmm_temp3);
3637     __ aesenc(xmm_result, xmm_temp4);
3638 
3639     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3640     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3641 
3642     __ cmpl(keylen, 44);
3643     __ jccb(Assembler::equal, L_doLast);
3644 
3645     __ aesenc(xmm_result, xmm_temp1);
3646     __ aesenc(xmm_result, xmm_temp2);
3647 
3648     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3649     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3650 
3651     __ cmpl(keylen, 52);
3652     __ jccb(Assembler::equal, L_doLast);
3653 
3654     __ aesenc(xmm_result, xmm_temp1);
3655     __ aesenc(xmm_result, xmm_temp2);
3656 
3657     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3658     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3659 
3660     __ BIND(L_doLast);
3661     __ aesenc(xmm_result, xmm_temp1);
3662     __ aesenclast(xmm_result, xmm_temp2);
3663     __ movdqu(Address(to, 0), xmm_result);        // store the result
3664     __ xorptr(rax, rax); // return 0
3665     __ leave(); // required for proper stackwalking of RuntimeStub frame
3666     __ ret(0);
3667 
3668     return start;
3669   }
3670 
3671 
3672   // Arguments:
3673   //
3674   // Inputs:
3675   //   c_rarg0   - source byte array address
3676   //   c_rarg1   - destination byte array address
3677   //   c_rarg2   - K (key) in little endian int array
3678   //
3679   address generate_aescrypt_decryptBlock() {
3680     assert(UseAES, "need AES instructions and misaligned SSE support");
3681     __ align(CodeEntryAlignment);
3682     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3683     Label L_doLast;
3684     address start = __ pc();
3685 
3686     const Register from        = c_rarg0;  // source array address
3687     const Register to          = c_rarg1;  // destination array address
3688     const Register key         = c_rarg2;  // key array address
3689     const Register keylen      = rax;
3690 
3691     const XMMRegister xmm_result = xmm0;
3692     const XMMRegister xmm_key_shuf_mask = xmm1;
3693     // On win64 xmm6-xmm15 must be preserved so don't use them.
3694     const XMMRegister xmm_temp1  = xmm2;
3695     const XMMRegister xmm_temp2  = xmm3;
3696     const XMMRegister xmm_temp3  = xmm4;
3697     const XMMRegister xmm_temp4  = xmm5;
3698 
3699     __ enter(); // required for proper stackwalking of RuntimeStub frame
3700 
3701     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3702     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3703 
3704     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3705     __ movdqu(xmm_result, Address(from, 0));
3706 
3707     // for decryption the java expanded key ordering is rotated one position from what we want
3708     // so we start from 0x10 here and hit 0x00 last
3709     // we don't know if the key is aligned, hence not using load-execute form
3710     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3711     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3712     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3713     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3714 
3715     __ pxor  (xmm_result, xmm_temp1);
3716     __ aesdec(xmm_result, xmm_temp2);
3717     __ aesdec(xmm_result, xmm_temp3);
3718     __ aesdec(xmm_result, xmm_temp4);
3719 
3720     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3721     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3722     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3723     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3724 
3725     __ aesdec(xmm_result, xmm_temp1);
3726     __ aesdec(xmm_result, xmm_temp2);
3727     __ aesdec(xmm_result, xmm_temp3);
3728     __ aesdec(xmm_result, xmm_temp4);
3729 
3730     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3731     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3732     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3733 
3734     __ cmpl(keylen, 44);
3735     __ jccb(Assembler::equal, L_doLast);
3736 
3737     __ aesdec(xmm_result, xmm_temp1);
3738     __ aesdec(xmm_result, xmm_temp2);
3739 
3740     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3741     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3742 
3743     __ cmpl(keylen, 52);
3744     __ jccb(Assembler::equal, L_doLast);
3745 
3746     __ aesdec(xmm_result, xmm_temp1);
3747     __ aesdec(xmm_result, xmm_temp2);
3748 
3749     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3750     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3751 
3752     __ BIND(L_doLast);
3753     __ aesdec(xmm_result, xmm_temp1);
3754     __ aesdec(xmm_result, xmm_temp2);
3755 
3756     // for decryption the aesdeclast operation is always on key+0x00
3757     __ aesdeclast(xmm_result, xmm_temp3);
3758     __ movdqu(Address(to, 0), xmm_result);  // store the result
3759     __ xorptr(rax, rax); // return 0
3760     __ leave(); // required for proper stackwalking of RuntimeStub frame
3761     __ ret(0);
3762 
3763     return start;
3764   }
3765 
3766 
3767   // Arguments:
3768   //
3769   // Inputs:
3770   //   c_rarg0   - source byte array address
3771   //   c_rarg1   - destination byte array address
3772   //   c_rarg2   - K (key) in little endian int array
3773   //   c_rarg3   - r vector byte array address
3774   //   c_rarg4   - input length
3775   //
3776   // Output:
3777   //   rax       - input length
3778   //
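       //  CBC sketch (illustrative): with r initialized from the r vector (IV),
       //    for each 16-byte block p:  r = AES_encrypt(p ^ r, key);  store r;
       //  the final r is written back to the r vector on exit.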
3779   address generate_cipherBlockChaining_encryptAESCrypt() {
3780     assert(UseAES, "need AES instructions and misaligned SSE support");
3781     __ align(CodeEntryAlignment);
3782     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3783     address start = __ pc();
3784 
3785     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3786     const Register from        = c_rarg0;  // source array address
3787     const Register to          = c_rarg1;  // destination array address
3788     const Register key         = c_rarg2;  // key array address
3789     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3790                                            // and left with the results of the last encryption block
3791 #ifndef _WIN64
3792     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3793 #else
3794     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3795     const Register len_reg     = r11;      // pick a volatile Windows register
3796 #endif
3797     const Register pos         = rax;
3798 
3799     // xmm register assignments for the loops below
3800     const XMMRegister xmm_result = xmm0;
3801     const XMMRegister xmm_temp   = xmm1;
3802     // keys 0-10 preloaded into xmm2-xmm12
3803     const int XMM_REG_NUM_KEY_FIRST = 2;
3804     const int XMM_REG_NUM_KEY_LAST  = 15;
3805     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3806     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3807     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3808     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3809     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3810 
3811     __ enter(); // required for proper stackwalking of RuntimeStub frame
3812 
3813 #ifdef _WIN64
3814     // on win64, fill len_reg from stack position
3815     __ movl(len_reg, len_mem);
3816 #else
3817     __ push(len_reg); // Save
3818 #endif
3819 
3820     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3821     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3822     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3823     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3824       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3825       offset += 0x10;
3826     }
3827     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3828 
3829     // now split into different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3830     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3831     __ cmpl(rax, 44);
3832     __ jcc(Assembler::notEqual, L_key_192_256);
3833 
3834     // 128 bit code follows here
3835     __ movptr(pos, 0);
3836     __ align(OptoLoopAlignment);
3837 
3838     __ BIND(L_loopTop_128);
3839     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3840     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3841     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3842     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3843       __ aesenc(xmm_result, as_XMMRegister(rnum));
3844     }
3845     __ aesenclast(xmm_result, xmm_key10);
3846     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3847     // no need to store r to memory until we exit
3848     __ addptr(pos, AESBlockSize);
3849     __ subptr(len_reg, AESBlockSize);
3850     __ jcc(Assembler::notEqual, L_loopTop_128);
3851 
3852     __ BIND(L_exit);
3853     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3854 
3855 #ifdef _WIN64
3856     __ movl(rax, len_mem);
3857 #else
3858     __ pop(rax); // return length
3859 #endif
3860     __ leave(); // required for proper stackwalking of RuntimeStub frame
3861     __ ret(0);
3862 
3863     __ BIND(L_key_192_256);
3864     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3865     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3866     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3867     __ cmpl(rax, 52);
3868     __ jcc(Assembler::notEqual, L_key_256);
3869 
3870     // 192-bit code follows here (could be changed to use more xmm registers)
3871     __ movptr(pos, 0);
3872     __ align(OptoLoopAlignment);
3873 
3874     __ BIND(L_loopTop_192);
3875     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3876     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3877     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3878     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3879       __ aesenc(xmm_result, as_XMMRegister(rnum));
3880     }
3881     __ aesenclast(xmm_result, xmm_key12);
3882     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3883     // no need to store r to memory until we exit
3884     __ addptr(pos, AESBlockSize);
3885     __ subptr(len_reg, AESBlockSize);
3886     __ jcc(Assembler::notEqual, L_loopTop_192);
3887     __ jmp(L_exit);
3888 
3889     __ BIND(L_key_256);
3890     // 256-bit code follows here (could be changed to use more xmm registers)
3891     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3892     __ movptr(pos, 0);
3893     __ align(OptoLoopAlignment);
3894 
3895     __ BIND(L_loopTop_256);
3896     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3897     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3898     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3899     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3900       __ aesenc(xmm_result, as_XMMRegister(rnum));
3901     }
3902     load_key(xmm_temp, key, 0xe0);
3903     __ aesenclast(xmm_result, xmm_temp);
3904     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3905     // no need to store r to memory until we exit
3906     __ addptr(pos, AESBlockSize);
3907     __ subptr(len_reg, AESBlockSize);
3908     __ jcc(Assembler::notEqual, L_loopTop_256);
3909     __ jmp(L_exit);
3910 
3911     return start;
3912   }
3913 
3914   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3915   // to hide instruction latency
3916   //
3917   // Arguments:
3918   //
3919   // Inputs:
3920   //   c_rarg0   - source byte array address
3921   //   c_rarg1   - destination byte array address
3922   //   c_rarg2   - K (key) in little endian int array
3923   //   c_rarg3   - r vector byte array address
3924   //   c_rarg4   - input length
3925   //
3926   // Output:
3927   //   rax       - input length
3928   //
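       //  CBC decrypt sketch (illustrative): p[i] = AES_decrypt(c[i], key) ^ c[i-1],
       //  with c[-1] taken from the r vector (IV). Each block depends only on
       //  ciphertext, so four blocks are decrypted per iteration to hide latency.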
3929   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3930     assert(UseAES, "need AES instructions and misaligned SSE support");
3931     __ align(CodeEntryAlignment);
3932     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3933     address start = __ pc();
3934 
3935     const Register from        = c_rarg0;  // source array address
3936     const Register to          = c_rarg1;  // destination array address
3937     const Register key         = c_rarg2;  // key array address
3938     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3939                                            // and left with the results of the last encryption block
3940 #ifndef _WIN64
3941     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3942 #else
3943     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3944     const Register len_reg     = r11;      // pick a volatile Windows register
3945 #endif
3946     const Register pos         = rax;
3947 
3948     const int PARALLEL_FACTOR = 4;
3949     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3950 
3951     Label L_exit;
3952     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3953     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3954     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3955     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3956     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3957 
3958     // keys 0-10 preloaded into xmm5-xmm15
3959     const int XMM_REG_NUM_KEY_FIRST = 5;
3960     const int XMM_REG_NUM_KEY_LAST  = 15;
3961     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3962     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3963 
3964     __ enter(); // required for proper stackwalking of RuntimeStub frame
3965 
3966 #ifdef _WIN64
3967     // on win64, fill len_reg from stack position
3968     __ movl(len_reg, len_mem);
3969 #else
3970     __ push(len_reg); // Save
3971 #endif
3972     __ push(rbx);
3973     // the java expanded key ordering is rotated one position from what we want
3974     // so we start from 0x10 here and hit 0x00 last
3975     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3976     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3977     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3978     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3979       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3980       offset += 0x10;
3981     }
3982     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3983 
3984     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3985 
3986     // registers holding the four results in the parallelized loop
3987     const XMMRegister xmm_result0 = xmm0;
3988     const XMMRegister xmm_result1 = xmm2;
3989     const XMMRegister xmm_result2 = xmm3;
3990     const XMMRegister xmm_result3 = xmm4;
3991 
3992     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3993 
3994     __ xorptr(pos, pos);
3995 
3996     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3997     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3998     __ cmpl(rbx, 52);
3999     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
4000     __ cmpl(rbx, 60);
4001     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
4002 
4003 #define DoFour(opc, src_reg)           \
4004   __ opc(xmm_result0, src_reg);         \
4005   __ opc(xmm_result1, src_reg);         \
4006   __ opc(xmm_result2, src_reg);         \
4007   __ opc(xmm_result3, src_reg);         \
4008 
4009     for (int k = 0; k < 3; ++k) {
4010       __ BIND(L_multiBlock_loopTopHead[k]);
4011       if (k != 0) {
4012         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
4013         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
4014       }
4015       if (k == 1) {
4016         __ subptr(rsp, 6 * wordSize);
4017         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
4018         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
4019         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
4020         load_key(xmm1, key, 0xc0);  // 0xc0;
4021         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
4022       } else if (k == 2) {
4023         __ subptr(rsp, 10 * wordSize);
4024         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
4025         load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
4026         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
4027         load_key(xmm1, key, 0xe0);  // 0xe0;
4028         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
4029         load_key(xmm15, key, 0xb0); // 0xb0;
4030         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
4031         load_key(xmm1, key, 0xc0);  // 0xc0;
4032         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
4033       }
4034       __ align(OptoLoopAlignment);
4035       __ BIND(L_multiBlock_loopTop[k]);
4036       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
4037       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
4038 
4039       if  (k != 0) {
4040         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
4041         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
4042       }
4043 
4044       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
4045       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4046       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4047       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4048 
4049       DoFour(pxor, xmm_key_first);
4050       if (k == 0) {
4051         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
4052           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4053         }
4054         DoFour(aesdeclast, xmm_key_last);
4055       } else if (k == 1) {
4056         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
4057           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4058         }
4059         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
4060         DoFour(aesdec, xmm1);  // key : 0xc0
4061         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
4062         DoFour(aesdeclast, xmm_key_last);
4063       } else if (k == 2) {
4064         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
4065           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4066         }
4067         DoFour(aesdec, xmm1);  // key : 0xc0
4068         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
4069         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
4070         DoFour(aesdec, xmm15);  // key : 0xd0
4071         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
4072         DoFour(aesdec, xmm1);  // key : 0xe0
4073         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
4074         DoFour(aesdeclast, xmm_key_last);
4075       }
4076 
4077       // for each result, xor with the r vector of previous cipher block
4078       __ pxor(xmm_result0, xmm_prev_block_cipher);
4079       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4080       __ pxor(xmm_result1, xmm_prev_block_cipher);
4081       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4082       __ pxor(xmm_result2, xmm_prev_block_cipher);
4083       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4084       __ pxor(xmm_result3, xmm_prev_block_cipher);
4085       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
4086       if (k != 0) {
4087         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
4088       }
4089 
4090       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
4091       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4092       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4093       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4094 
4095       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
4096       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
4097       __ jmp(L_multiBlock_loopTop[k]);
4098 
4099       // registers used in the non-parallelized loops
4100       // xmm register assignments for the loops below
4101       const XMMRegister xmm_result = xmm0;
4102       const XMMRegister xmm_prev_block_cipher_save = xmm2;
4103       const XMMRegister xmm_key11 = xmm3;
4104       const XMMRegister xmm_key12 = xmm4;
4105       const XMMRegister key_tmp = xmm4;
4106 
4107       __ BIND(L_singleBlock_loopTopHead[k]);
4108       if (k == 1) {
4109         __ addptr(rsp, 6 * wordSize);
4110       } else if (k == 2) {
4111         __ addptr(rsp, 10 * wordSize);
4112       }
4113       __ cmpptr(len_reg, 0); // any blocks left??
4114       __ jcc(Assembler::equal, L_exit);
4115       __ BIND(L_singleBlock_loopTopHead2[k]);
4116       if (k == 1) {
4117         load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
4118         load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
4119       }
4120       if (k == 2) {
4121         load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
4122       }
4123       __ align(OptoLoopAlignment);
4124       __ BIND(L_singleBlock_loopTop[k]);
4125       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
4126       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
4127       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
4128       for (int rnum = 1; rnum <= 9 ; rnum++) {
4129           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4130       }
4131       if (k == 1) {
4132         __ aesdec(xmm_result, xmm_key11);
4133         __ aesdec(xmm_result, xmm_key12);
4134       }
4135       if (k == 2) {
4136         __ aesdec(xmm_result, xmm_key11);
4137         load_key(key_tmp, key, 0xc0);
4138         __ aesdec(xmm_result, key_tmp);
4139         load_key(key_tmp, key, 0xd0);
4140         __ aesdec(xmm_result, key_tmp);
4141         load_key(key_tmp, key, 0xe0);
4142         __ aesdec(xmm_result, key_tmp);
4143       }
4144 
4145       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
4146       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
4147       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
4148       // no need to store r to memory until we exit
4149       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
4150       __ addptr(pos, AESBlockSize);
4151       __ subptr(len_reg, AESBlockSize);
4152       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
4153       if (k != 2) {
4154         __ jmp(L_exit);
4155       }
4156     } //for 128/192/256
4157 
4158     __ BIND(L_exit);
4159     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
4160     __ pop(rbx);
4161 #ifdef _WIN64
4162     __ movl(rax, len_mem);
4163 #else
4164     __ pop(rax); // return length
4165 #endif
4166     __ leave(); // required for proper stackwalking of RuntimeStub frame
4167     __ ret(0);
4168     return start;
4169   }
4170 
4171   address generate_electronicCodeBook_encryptAESCrypt() {
4172     __ align(CodeEntryAlignment);
4173     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_encryptAESCrypt");
4174     address start = __ pc();
4175     const Register from = c_rarg0;  // source array address
4176     const Register to = c_rarg1;  // destination array address
4177     const Register key = c_rarg2;  // key array address
4178     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4179     __ enter(); // required for proper stackwalking of RuntimeStub frame
4180     __ aesecb_encrypt(from, to, key, len);
4181     __ vzeroupper();
4182     __ leave(); // required for proper stackwalking of RuntimeStub frame
4183     __ ret(0);
4184     return start;
4185   }
4186 
4187   address generate_electronicCodeBook_decryptAESCrypt() {
4188     __ align(CodeEntryAlignment);
4189     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_decryptAESCrypt");
4190     address start = __ pc();
4191     const Register from = c_rarg0;  // source array address
4192     const Register to = c_rarg1;  // destination array address
4193     const Register key = c_rarg2;  // key array address
4194     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4195     __ enter(); // required for proper stackwalking of RuntimeStub frame
4196     __ aesecb_decrypt(from, to, key, len);
4197     __ vzeroupper();
4198     __ leave(); // required for proper stackwalking of RuntimeStub frame
4199     __ ret(0);
4200     return start;
4201   }
4202 
4203   // ofs and limit are used for multi-block byte array.
4204   // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
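  //
  // In the multi_block variant the stub keeps compressing 64-byte blocks until
  // ofs passes limit and returns the updated ofs in rax; roughly (illustrative
  // sketch only, md5_compress stands in for the single-block transform):
  //
  //   while (ofs <= limit) {
  //     md5_compress(state, b + ofs);   // one 64-byte MD5 block
  //     ofs += 64;
  //   }
  //   return ofs;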
4205   address generate_md5_implCompress(bool multi_block, const char *name) {
4206     __ align(CodeEntryAlignment);
4207     StubCodeMark mark(this, "StubRoutines", name);
4208     address start = __ pc();
4209 
4210     const Register buf_param = r15;
4211     const Address state_param(rsp, 0 * wordSize);
4212     const Address ofs_param  (rsp, 1 * wordSize    );
4213     const Address limit_param(rsp, 1 * wordSize + 4);
4214 
4215     __ enter();
4216     __ push(rbx);
4217     __ push(rdi);
4218     __ push(rsi);
4219     __ push(r15);
4220     __ subptr(rsp, 2 * wordSize);
4221 
4222     __ movptr(buf_param, c_rarg0);
4223     __ movptr(state_param, c_rarg1);
4224     if (multi_block) {
4225       __ movl(ofs_param, c_rarg2);
4226       __ movl(limit_param, c_rarg3);
4227     }
4228     __ fast_md5(buf_param, state_param, ofs_param, limit_param, multi_block);
4229 
4230     __ addptr(rsp, 2 * wordSize);
4231     __ pop(r15);
4232     __ pop(rsi);
4233     __ pop(rdi);
4234     __ pop(rbx);
4235     __ leave();
4236     __ ret(0);
4237     return start;
4238   }
4239 
4240   address generate_upper_word_mask() {
4241     __ align64();
4242     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
4243     address start = __ pc();
4244     __ emit_data64(0x0000000000000000, relocInfo::none);
4245     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
4246     return start;
4247   }
4248 
4249   address generate_shuffle_byte_flip_mask() {
4250     __ align64();
4251     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
4252     address start = __ pc();
4253     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4254     __ emit_data64(0x0001020304050607, relocInfo::none);
4255     return start;
4256   }
4257 
4258   // ofs and limit are used for multi-block byte array.
4259   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4260   address generate_sha1_implCompress(bool multi_block, const char *name) {
4261     __ align(CodeEntryAlignment);
4262     StubCodeMark mark(this, "StubRoutines", name);
4263     address start = __ pc();
4264 
4265     Register buf = c_rarg0;
4266     Register state = c_rarg1;
4267     Register ofs = c_rarg2;
4268     Register limit = c_rarg3;
4269 
4270     const XMMRegister abcd = xmm0;
4271     const XMMRegister e0 = xmm1;
4272     const XMMRegister e1 = xmm2;
4273     const XMMRegister msg0 = xmm3;
4274 
4275     const XMMRegister msg1 = xmm4;
4276     const XMMRegister msg2 = xmm5;
4277     const XMMRegister msg3 = xmm6;
4278     const XMMRegister shuf_mask = xmm7;
4279 
4280     __ enter();
4281 
4282     __ subptr(rsp, 4 * wordSize);
4283 
4284     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
4285       buf, state, ofs, limit, rsp, multi_block);
4286 
4287     __ addptr(rsp, 4 * wordSize);
4288 
4289     __ leave();
4290     __ ret(0);
4291     return start;
4292   }
4293 
4294   address generate_pshuffle_byte_flip_mask() {
4295     __ align64();
4296     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
4297     address start = __ pc();
4298     __ emit_data64(0x0405060700010203, relocInfo::none);
4299     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4300 
4301     if (VM_Version::supports_avx2()) {
4302       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
4303       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4304       // _SHUF_00BA
4305       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4306       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4307       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4308       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4309       // _SHUF_DC00
4310       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4311       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4312       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4313       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4314     }
4315 
4316     return start;
4317   }
4318 
4319   // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
4320   address generate_pshuffle_byte_flip_mask_sha512() {
4321     __ align32();
4322     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
4323     address start = __ pc();
4324     if (VM_Version::supports_avx2()) {
4325       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
4326       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4327       __ emit_data64(0x1011121314151617, relocInfo::none);
4328       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
4329       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
4330       __ emit_data64(0x0000000000000000, relocInfo::none);
4331       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4332       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4333     }
4334 
4335     return start;
4336   }
4337 
4338   // ofs and limit are used for multi-block byte array.
4339   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4340   address generate_sha256_implCompress(bool multi_block, const char *name) {
4341     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
4342     __ align(CodeEntryAlignment);
4343     StubCodeMark mark(this, "StubRoutines", name);
4344     address start = __ pc();
4345 
4346     Register buf = c_rarg0;
4347     Register state = c_rarg1;
4348     Register ofs = c_rarg2;
4349     Register limit = c_rarg3;
4350 
4351     const XMMRegister msg = xmm0;
4352     const XMMRegister state0 = xmm1;
4353     const XMMRegister state1 = xmm2;
4354     const XMMRegister msgtmp0 = xmm3;
4355 
4356     const XMMRegister msgtmp1 = xmm4;
4357     const XMMRegister msgtmp2 = xmm5;
4358     const XMMRegister msgtmp3 = xmm6;
4359     const XMMRegister msgtmp4 = xmm7;
4360 
4361     const XMMRegister shuf_mask = xmm8;
4362 
4363     __ enter();
4364 
4365     __ subptr(rsp, 4 * wordSize);
4366 
4367     if (VM_Version::supports_sha()) {
4368       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4369         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4370     } else if (VM_Version::supports_avx2()) {
4371       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4372         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4373     }
4374     __ addptr(rsp, 4 * wordSize);
4375     __ vzeroupper();
4376     __ leave();
4377     __ ret(0);
4378     return start;
4379   }
4380 
4381   address generate_sha512_implCompress(bool multi_block, const char *name) {
4382     assert(VM_Version::supports_avx2(), "");
4383     assert(VM_Version::supports_bmi2(), "");
4384     __ align(CodeEntryAlignment);
4385     StubCodeMark mark(this, "StubRoutines", name);
4386     address start = __ pc();
4387 
4388     Register buf = c_rarg0;
4389     Register state = c_rarg1;
4390     Register ofs = c_rarg2;
4391     Register limit = c_rarg3;
4392 
4393     const XMMRegister msg = xmm0;
4394     const XMMRegister state0 = xmm1;
4395     const XMMRegister state1 = xmm2;
4396     const XMMRegister msgtmp0 = xmm3;
4397     const XMMRegister msgtmp1 = xmm4;
4398     const XMMRegister msgtmp2 = xmm5;
4399     const XMMRegister msgtmp3 = xmm6;
4400     const XMMRegister msgtmp4 = xmm7;
4401 
4402     const XMMRegister shuf_mask = xmm8;
4403 
4404     __ enter();
4405 
4406     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4407     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4408 
4409     __ vzeroupper();
4410     __ leave();
4411     __ ret(0);
4412     return start;
4413   }
4414 
4415   address ghash_polynomial512_addr() {
4416     __ align(CodeEntryAlignment);
4417     StubCodeMark mark(this, "StubRoutines", "_ghash_poly512_addr");
4418     address start = __ pc();
4419     __ emit_data64(0x00000001C2000000, relocInfo::none); // POLY for reduction
4420     __ emit_data64(0xC200000000000000, relocInfo::none);
4421     __ emit_data64(0x00000001C2000000, relocInfo::none);
4422     __ emit_data64(0xC200000000000000, relocInfo::none);
4423     __ emit_data64(0x00000001C2000000, relocInfo::none);
4424     __ emit_data64(0xC200000000000000, relocInfo::none);
4425     __ emit_data64(0x00000001C2000000, relocInfo::none);
4426     __ emit_data64(0xC200000000000000, relocInfo::none);
4427     __ emit_data64(0x0000000000000001, relocInfo::none); // POLY
4428     __ emit_data64(0xC200000000000000, relocInfo::none);
4429     __ emit_data64(0x0000000000000001, relocInfo::none); // TWOONE
4430     __ emit_data64(0x0000000100000000, relocInfo::none);
4431     return start;
4432   }
4433 
4434   // Vector AES Galois Counter Mode implementation. Parameters:
4435   // Windows regs            |  Linux regs
4436   // in = c_rarg0 (rcx)      |  c_rarg0 (rdi)
4437   // len = c_rarg1 (rdx)     |  c_rarg1 (rsi)
4438   // ct = c_rarg2 (r8)       |  c_rarg2 (rdx)
4439   // out = c_rarg3 (r9)      |  c_rarg3 (rcx)
4440   // key = r10               |  c_rarg4 (r8)
4441   // state = r13             |  c_rarg5 (r9)
4442   // subkeyHtbl = r14        |  r11
4443   // counter = rsi           |  r12
4444   // return - number of processed bytes
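  //
  // The block processing is done by MacroAssembler::aesgcm_encrypt below: AES in
  // counter mode produces the output, and the ciphertext is folded into the GHASH
  // accumulator in 'state' using the power-of-H table in 'subkeyHtbl'. Per 16-byte
  // block this amounts to (illustrative sketch only):
  //
  //   out[i]  = in[i] ^ AES_encrypt(counter, key);    // CTR keystream
  //   counter = inc32(counter);                       // bump the low 32 bits
  //   state   = ghash_mul(state ^ ciphertext[i], H);  // GF(2^128) multiply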
4445   address generate_galoisCounterMode_AESCrypt() {
4446     __ align(CodeEntryAlignment);
4447     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
4448     address start = __ pc();
4449     const Register in = c_rarg0;
4450     const Register len = c_rarg1;
4451     const Register ct = c_rarg2;
4452     const Register out = c_rarg3;
4453     // (the counter byte array is updated with the incremented counter on return)
4454 #ifndef _WIN64
4455     const Register key = c_rarg4;
4456     const Register state = c_rarg5;
4457     const Address subkeyH_mem(rbp, 2 * wordSize);
4458     const Register subkeyHtbl = r11;
4459     const Register avx512_subkeyHtbl = r13;
4460     const Address counter_mem(rbp, 3 * wordSize);
4461     const Register counter = r12;
4462 #else
4463     const Address key_mem(rbp, 6 * wordSize);
4464     const Register key = r10;
4465     const Address state_mem(rbp, 7 * wordSize);
4466     const Register state = r13;
4467     const Address subkeyH_mem(rbp, 8 * wordSize);
4468     const Register subkeyHtbl = r14;
4469     const Register avx512_subkeyHtbl = r12;
4470     const Address counter_mem(rbp, 9 * wordSize);
4471     const Register counter = rsi;
4472 #endif
4473     __ enter();
4474    // Save state before entering routine
4475     __ push(r12);
4476     __ push(r13);
4477     __ push(r14);
4478     __ push(r15);
4479     __ push(rbx);
4480 #ifdef _WIN64
4481     // on win64, preserve rsi and load key and state from their stack positions
4482     __ push(rsi);
4483     __ movptr(key, key_mem);
4484     __ movptr(state, state_mem);
4485 #endif
4486     __ movptr(subkeyHtbl, subkeyH_mem);
4487     __ movptr(counter, counter_mem);
4488     // Save rbp and rsp
4489     __ push(rbp);
4490     __ movq(rbp, rsp);
4491     // Align stack
4492     __ andq(rsp, -64);
4493     __ subptr(rsp, 96 * longSize); // Create space on the stack for htbl entries
4494     __ movptr(avx512_subkeyHtbl, rsp);
4495 
4496     __ aesgcm_encrypt(in, len, ct, out, key, state, subkeyHtbl, avx512_subkeyHtbl, counter);
4497     __ vzeroupper();
4498 
4499     __ movq(rsp, rbp);
4500     __ pop(rbp);
4501 
4502     // Restore state before leaving routine
4503 #ifdef _WIN64
4504     __ pop(rsi);
4505 #endif
4506     __ pop(rbx);
4507     __ pop(r15);
4508     __ pop(r14);
4509     __ pop(r13);
4510     __ pop(r12);
4511 
4512     __ leave(); // required for proper stackwalking of RuntimeStub frame
4513     __ ret(0);
4514     return start;
4515   }
4516 
4517   // This mask is used for incrementing the counter values (linc0, linc4, etc.)
4518   address counter_mask_addr() {
4519     __ align64();
4520     StubCodeMark mark(this, "StubRoutines", "counter_mask_addr");
4521     address start = __ pc();
4522     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);//lbswapmask
4523     __ emit_data64(0x0001020304050607, relocInfo::none);
4524     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4525     __ emit_data64(0x0001020304050607, relocInfo::none);
4526     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4527     __ emit_data64(0x0001020304050607, relocInfo::none);
4528     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4529     __ emit_data64(0x0001020304050607, relocInfo::none);
4530     __ emit_data64(0x0000000000000000, relocInfo::none);//linc0 = counter_mask_addr+64
4531     __ emit_data64(0x0000000000000000, relocInfo::none);
4532     __ emit_data64(0x0000000000000001, relocInfo::none);//counter_mask_addr() + 80
4533     __ emit_data64(0x0000000000000000, relocInfo::none);
4534     __ emit_data64(0x0000000000000002, relocInfo::none);
4535     __ emit_data64(0x0000000000000000, relocInfo::none);
4536     __ emit_data64(0x0000000000000003, relocInfo::none);
4537     __ emit_data64(0x0000000000000000, relocInfo::none);
4538     __ emit_data64(0x0000000000000004, relocInfo::none);//linc4 = counter_mask_addr() + 128
4539     __ emit_data64(0x0000000000000000, relocInfo::none);
4540     __ emit_data64(0x0000000000000004, relocInfo::none);
4541     __ emit_data64(0x0000000000000000, relocInfo::none);
4542     __ emit_data64(0x0000000000000004, relocInfo::none);
4543     __ emit_data64(0x0000000000000000, relocInfo::none);
4544     __ emit_data64(0x0000000000000004, relocInfo::none);
4545     __ emit_data64(0x0000000000000000, relocInfo::none);
4546     __ emit_data64(0x0000000000000008, relocInfo::none);//linc8 = counter_mask_addr() + 192
4547     __ emit_data64(0x0000000000000000, relocInfo::none);
4548     __ emit_data64(0x0000000000000008, relocInfo::none);
4549     __ emit_data64(0x0000000000000000, relocInfo::none);
4550     __ emit_data64(0x0000000000000008, relocInfo::none);
4551     __ emit_data64(0x0000000000000000, relocInfo::none);
4552     __ emit_data64(0x0000000000000008, relocInfo::none);
4553     __ emit_data64(0x0000000000000000, relocInfo::none);
4554     __ emit_data64(0x0000000000000020, relocInfo::none);//linc32 = counter_mask_addr() + 256
4555     __ emit_data64(0x0000000000000000, relocInfo::none);
4556     __ emit_data64(0x0000000000000020, relocInfo::none);
4557     __ emit_data64(0x0000000000000000, relocInfo::none);
4558     __ emit_data64(0x0000000000000020, relocInfo::none);
4559     __ emit_data64(0x0000000000000000, relocInfo::none);
4560     __ emit_data64(0x0000000000000020, relocInfo::none);
4561     __ emit_data64(0x0000000000000000, relocInfo::none);
4562     __ emit_data64(0x0000000000000010, relocInfo::none);//linc16 = counter_mask_addr() + 320
4563     __ emit_data64(0x0000000000000000, relocInfo::none);
4564     __ emit_data64(0x0000000000000010, relocInfo::none);
4565     __ emit_data64(0x0000000000000000, relocInfo::none);
4566     __ emit_data64(0x0000000000000010, relocInfo::none);
4567     __ emit_data64(0x0000000000000000, relocInfo::none);
4568     __ emit_data64(0x0000000000000010, relocInfo::none);
4569     __ emit_data64(0x0000000000000000, relocInfo::none);
4570     return start;
4571   }
4572 
4573   // Vector AES Counter (CTR) mode implementation
4574   address generate_counterMode_VectorAESCrypt()  {
4575     __ align(CodeEntryAlignment);
4576     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4577     address start = __ pc();
4578     const Register from = c_rarg0; // source array address
4579     const Register to = c_rarg1; // destination array address
4580     const Register key = c_rarg2; // key array address r8
4581     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4582     // and updated with the incremented counter in the end
4583 #ifndef _WIN64
4584     const Register len_reg = c_rarg4;
4585     const Register saved_encCounter_start = c_rarg5;
4586     const Register used_addr = r10;
4587     const Address  used_mem(rbp, 2 * wordSize);
4588     const Register used = r11;
4589 #else
4590     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4591     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
4592     const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4593     const Register len_reg = r10; // pick the first volatile windows register
4594     const Register saved_encCounter_start = r11;
4595     const Register used_addr = r13;
4596     const Register used = r14;
4597 #endif
4598     __ enter();
4599    // Save state before entering routine
4600     __ push(r12);
4601     __ push(r13);
4602     __ push(r14);
4603     __ push(r15);
4604 #ifdef _WIN64
4605     // on win64, fill len_reg from stack position
4606     __ movl(len_reg, len_mem);
4607     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4608     __ movptr(used_addr, used_mem);
4609     __ movl(used, Address(used_addr, 0));
4610 #else
4611     __ push(len_reg); // Save
4612     __ movptr(used_addr, used_mem);
4613     __ movl(used, Address(used_addr, 0));
4614 #endif
4615     __ push(rbx);
4616     __ aesctr_encrypt(from, to, key, counter, len_reg, used, used_addr, saved_encCounter_start);
4617     __ vzeroupper();
4618     // Restore state before leaving routine
4619     __ pop(rbx);
4620 #ifdef _WIN64
4621     __ movl(rax, len_mem); // return length
4622 #else
4623     __ pop(rax); // return length
4624 #endif
4625     __ pop(r15);
4626     __ pop(r14);
4627     __ pop(r13);
4628     __ pop(r12);
4629 
4630     __ leave(); // required for proper stackwalking of RuntimeStub frame
4631     __ ret(0);
4632     return start;
4633   }
4634 
4635   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
4636   // to hide instruction latency
4637   //
4638   // Arguments:
4639   //
4640   // Inputs:
4641   //   c_rarg0   - source byte array address
4642   //   c_rarg1   - destination byte array address
4643   //   c_rarg2   - K (key) in little endian int array
4644   //   c_rarg3   - counter vector byte array address
4645   //   Linux
4646   //     c_rarg4   -          input length
4647   //     c_rarg5   -          saved encryptedCounter start
4648   //     rbp + 2 * wordSize - saved used length
4649   //   Windows
4650   //     rbp + 6 * wordSize - input length
4651   //     rbp + 7 * wordSize - saved encryptedCounter start
4652   //     rbp + 8 * wordSize - saved used length
4653   //
4654   // Output:
4655   //   rax       - input length
4656   //
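  // Functionally this is plain CTR mode plus bookkeeping for a partially used
  // keystream block carried between calls; a byte-at-a-time sketch of the
  // contract (illustrative only, aes_encrypt_block is a hypothetical helper):
  //
  //   while (len > 0) {
  //     if (used == 16) {                                // keystream exhausted
  //       aes_encrypt_block(encCounter, counter, key);   // encCounter = E_K(counter)
  //       increment_counter(counter);
  //       used = 0;
  //     }
  //     *to++ = *from++ ^ encCounter[used++];
  //     len--;
  //   }
  //   // 'used', the counter and encCounter are written back so the next call resumes
  //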
4657   address generate_counterMode_AESCrypt_Parallel() {
4658     assert(UseAES, "need AES instructions and misaligned SSE support");
4659     __ align(CodeEntryAlignment);
4660     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4661     address start = __ pc();
4662     const Register from = c_rarg0; // source array address
4663     const Register to = c_rarg1; // destination array address
4664     const Register key = c_rarg2; // key array address
4665     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4666                                       // and updated with the incremented counter in the end
4667 #ifndef _WIN64
4668     const Register len_reg = c_rarg4;
4669     const Register saved_encCounter_start = c_rarg5;
4670     const Register used_addr = r10;
4671     const Address  used_mem(rbp, 2 * wordSize);
4672     const Register used = r11;
4673 #else
4674     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4675     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
4676     const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4677     const Register len_reg = r10; // pick the first volatile windows register
4678     const Register saved_encCounter_start = r11;
4679     const Register used_addr = r13;
4680     const Register used = r14;
4681 #endif
4682     const Register pos = rax;
4683 
4684     const int PARALLEL_FACTOR = 6;
4685     const XMMRegister xmm_counter_shuf_mask = xmm0;
4686     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
4687     const XMMRegister xmm_curr_counter = xmm2;
4688 
4689     const XMMRegister xmm_key_tmp0 = xmm3;
4690     const XMMRegister xmm_key_tmp1 = xmm4;
4691 
4692     // registers holding the four results in the parallelized loop
4693     const XMMRegister xmm_result0 = xmm5;
4694     const XMMRegister xmm_result1 = xmm6;
4695     const XMMRegister xmm_result2 = xmm7;
4696     const XMMRegister xmm_result3 = xmm8;
4697     const XMMRegister xmm_result4 = xmm9;
4698     const XMMRegister xmm_result5 = xmm10;
4699 
4700     const XMMRegister xmm_from0 = xmm11;
4701     const XMMRegister xmm_from1 = xmm12;
4702     const XMMRegister xmm_from2 = xmm13;
4703     const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on WIN64
4704     const XMMRegister xmm_from4 = xmm3;  // reuse xmm3~4; xmm_key_tmp0~1 are no longer needed once the input text is loaded
4705     const XMMRegister xmm_from5 = xmm4;
4706 
4707     // for key_128, key_192, key_256
4708     const int rounds[3] = {10, 12, 14};
4709     Label L_exit_preLoop, L_preLoop_start;
4710     Label L_multiBlock_loopTop[3];
4711     Label L_singleBlockLoopTop[3];
4712     Label L__incCounter[3][6]; //for 6 blocks
4713     Label L__incCounter_single[3]; //for single block, key128, key192, key256
4714     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
4715     Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
4716 
4717     Label L_exit;
4718 
4719     __ enter(); // required for proper stackwalking of RuntimeStub frame
4720 
4721 #ifdef _WIN64
4722     // allocate spill slots for r13, r14
4723     enum {
4724         saved_r13_offset,
4725         saved_r14_offset
4726     };
4727     __ subptr(rsp, 2 * wordSize);
4728     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
4729     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
4730 
4731     // on win64, fill len_reg from stack position
4732     __ movl(len_reg, len_mem);
4733     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4734     __ movptr(used_addr, used_mem);
4735     __ movl(used, Address(used_addr, 0));
4736 #else
4737     __ push(len_reg); // Save
4738     __ movptr(used_addr, used_mem);
4739     __ movl(used, Address(used_addr, 0));
4740 #endif
4741 
4742     __ push(rbx); // Save RBX
4743     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
4744     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
4745     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
4746     __ movptr(pos, 0);
4747 
4748     // Use the partially used encrypted counter from the last invocation
4749     __ BIND(L_preLoop_start);
4750     __ cmpptr(used, 16);
4751     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
4752       __ cmpptr(len_reg, 0);
4753       __ jcc(Assembler::lessEqual, L_exit_preLoop);
4754       __ movb(rbx, Address(saved_encCounter_start, used));
4755       __ xorb(rbx, Address(from, pos));
4756       __ movb(Address(to, pos), rbx);
4757       __ addptr(pos, 1);
4758       __ addptr(used, 1);
4759       __ subptr(len_reg, 1);
4760 
4761     __ jmp(L_preLoop_start);
4762 
4763     __ BIND(L_exit_preLoop);
4764     __ movl(Address(used_addr, 0), used);
4765 
4766     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
4767     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
4768     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4769     __ cmpl(rbx, 52);
4770     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
4771     __ cmpl(rbx, 60);
4772     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
4773 
4774 #define CTR_DoSix(opc, src_reg)                \
4775     __ opc(xmm_result0, src_reg);              \
4776     __ opc(xmm_result1, src_reg);              \
4777     __ opc(xmm_result2, src_reg);              \
4778     __ opc(xmm_result3, src_reg);              \
4779     __ opc(xmm_result4, src_reg);              \
4780     __ opc(xmm_result5, src_reg);
4781 
4782     // k == 0 :  generate code for key_128
4783     // k == 1 :  generate code for key_192
4784     // k == 2 :  generate code for key_256
4785     for (int k = 0; k < 3; ++k) {
4786       //multi blocks starts here
4787       __ align(OptoLoopAlignment);
4788       __ BIND(L_multiBlock_loopTop[k]);
4789       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
4790       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
4791       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4792 
4793       // load, then increment the counters
4794       CTR_DoSix(movdqa, xmm_curr_counter);
4795       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
4796       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
4797       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
4798       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
4799       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
4800       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
4801       CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
4802       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
4803 
4804       //load two ROUND_KEYs at a time
4805       for (int i = 1; i < rounds[k]; ) {
4806         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4807         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4808         CTR_DoSix(aesenc, xmm_key_tmp1);
4809         i++;
4810         if (i != rounds[k]) {
4811           CTR_DoSix(aesenc, xmm_key_tmp0);
4812         } else {
4813           CTR_DoSix(aesenclast, xmm_key_tmp0);
4814         }
4815         i++;
4816       }
4817 
4818       // get next PARALLEL_FACTOR blocks into xmm_result registers
4819       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4820       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4821       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4822       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4823       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4824       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4825 
4826       __ pxor(xmm_result0, xmm_from0);
4827       __ pxor(xmm_result1, xmm_from1);
4828       __ pxor(xmm_result2, xmm_from2);
4829       __ pxor(xmm_result3, xmm_from3);
4830       __ pxor(xmm_result4, xmm_from4);
4831       __ pxor(xmm_result5, xmm_from5);
4832 
4833       // store 6 results into the next 96 bytes of output
4834       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4835       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4836       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4837       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4838       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4839       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4840 
4841       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance the input/output position
4842       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4843       __ jmp(L_multiBlock_loopTop[k]);
4844 
4845       // singleBlock starts here
4846       __ align(OptoLoopAlignment);
4847       __ BIND(L_singleBlockLoopTop[k]);
4848       __ cmpptr(len_reg, 0);
4849       __ jcc(Assembler::lessEqual, L_exit);
4850       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4851       __ movdqa(xmm_result0, xmm_curr_counter);
4852       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4853       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4854       __ pxor(xmm_result0, xmm_key_tmp0);
4855       for (int i = 1; i < rounds[k]; i++) {
4856         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4857         __ aesenc(xmm_result0, xmm_key_tmp0);
4858       }
4859       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4860       __ aesenclast(xmm_result0, xmm_key_tmp0);
4861       __ cmpptr(len_reg, AESBlockSize);
4862       __ jcc(Assembler::less, L_processTail_insr[k]);
4863         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4864         __ pxor(xmm_result0, xmm_from0);
4865         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4866         __ addptr(pos, AESBlockSize);
4867         __ subptr(len_reg, AESBlockSize);
4868         __ jmp(L_singleBlockLoopTop[k]);
4869       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4870         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4871         __ testptr(len_reg, 8);
4872         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4873           __ subptr(pos,8);
4874           __ pinsrq(xmm_from0, Address(from, pos), 0);
4875         __ BIND(L_processTail_4_insr[k]);
4876         __ testptr(len_reg, 4);
4877         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4878           __ subptr(pos,4);
4879           __ pslldq(xmm_from0, 4);
4880           __ pinsrd(xmm_from0, Address(from, pos), 0);
4881         __ BIND(L_processTail_2_insr[k]);
4882         __ testptr(len_reg, 2);
4883         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4884           __ subptr(pos, 2);
4885           __ pslldq(xmm_from0, 2);
4886           __ pinsrw(xmm_from0, Address(from, pos), 0);
4887         __ BIND(L_processTail_1_insr[k]);
4888         __ testptr(len_reg, 1);
4889         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4890           __ subptr(pos, 1);
4891           __ pslldq(xmm_from0, 1);
4892           __ pinsrb(xmm_from0, Address(from, pos), 0);
4893         __ BIND(L_processTail_exit_insr[k]);
4894 
4895         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Perform pxor of the encrypted counter and plaintext Bytes.
4896         __ pxor(xmm_result0, xmm_from0);                             //    Also the encrypted counter is saved for next invocation.
4897 
4898         __ testptr(len_reg, 8);
4899         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4900           __ pextrq(Address(to, pos), xmm_result0, 0);
4901           __ psrldq(xmm_result0, 8);
4902           __ addptr(pos, 8);
4903         __ BIND(L_processTail_4_extr[k]);
4904         __ testptr(len_reg, 4);
4905         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4906           __ pextrd(Address(to, pos), xmm_result0, 0);
4907           __ psrldq(xmm_result0, 4);
4908           __ addptr(pos, 4);
4909         __ BIND(L_processTail_2_extr[k]);
4910         __ testptr(len_reg, 2);
4911         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4912           __ pextrw(Address(to, pos), xmm_result0, 0);
4913           __ psrldq(xmm_result0, 2);
4914           __ addptr(pos, 2);
4915         __ BIND(L_processTail_1_extr[k]);
4916         __ testptr(len_reg, 1);
4917         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4918           __ pextrb(Address(to, pos), xmm_result0, 0);
4919 
4920         __ BIND(L_processTail_exit_extr[k]);
4921         __ movl(Address(used_addr, 0), len_reg);
4922         __ jmp(L_exit);
4923 
4924     }
4925 
4926     __ BIND(L_exit);
4927     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4928     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4929     __ pop(rbx); // pop the saved RBX.
4930 #ifdef _WIN64
4931     __ movl(rax, len_mem);
4932     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4933     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4934     __ addptr(rsp, 2 * wordSize);
4935 #else
4936     __ pop(rax); // return 'len'
4937 #endif
4938     __ leave(); // required for proper stackwalking of RuntimeStub frame
4939     __ ret(0);
4940     return start;
4941   }
4942 
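// Helpers for the VAES CBC decrypt stub below: each call applies one AES
// decryption round (or the final round, for roundDeclast) with the given
// 512-bit round key to xmm1..xmm8, i.e. to 32 ciphertext blocks in parallel.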
4943 void roundDec(XMMRegister xmm_reg) {
4944   __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4945   __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4946   __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4947   __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4948   __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4949   __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4950   __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4951   __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4952 }
4953 
4954 void roundDeclast(XMMRegister xmm_reg) {
4955   __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4956   __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4957   __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4958   __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4959   __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4960   __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4961   __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4962   __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4963 }
4964 
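  // Loads one 128-bit round key from the Java key array, byte-swaps it into the
  // order the AES instructions expect (via the key shuffle mask), and broadcasts
  // it to all four 128-bit lanes of the destination ZMM register.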
4965   void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
4966     __ movdqu(xmmdst, Address(key, offset));
4967     if (xmm_shuf_mask != NULL) {
4968       __ pshufb(xmmdst, xmm_shuf_mask);
4969     } else {
4970       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4971     }
4972     __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
4973 
4974   }
4975 
4976   address generate_cipherBlockChaining_decryptVectorAESCrypt() {
4977     assert(VM_Version::supports_avx512_vaes(), "need AVX512 VAES instructions");
4978     __ align(CodeEntryAlignment);
4979     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4980     address start = __ pc();
4981 
4982     const Register from = c_rarg0;  // source array address
4983     const Register to = c_rarg1;  // destination array address
4984     const Register key = c_rarg2;  // key array address
4985     const Register rvec = c_rarg3;  // r byte array initialized from initvector array address
4986     // and left with the results of the last encryption block
4987 #ifndef _WIN64
4988     const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
4989 #else
4990     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
4991     const Register len_reg = r11;      // pick the volatile windows register
4992 #endif
4993 
4994     Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
4995           Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
4996 
4997     __ enter();
4998 
4999 #ifdef _WIN64
5000     // on win64, fill len_reg from stack position
5001     __ movl(len_reg, len_mem);
5002 #else
5003     __ push(len_reg); // Save
5004 #endif
5005     __ push(rbx);
5006     __ vzeroupper();
5007 
5008     // Temporary variable declaration for swapping key bytes
5009     const XMMRegister xmm_key_shuf_mask = xmm1;
5010     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
5011 
5012     // Load the expanded key length in ints: 44 implies 10 rounds, 52 implies 12 rounds, 60 implies 14 rounds
5013     const Register rounds = rbx;
5014     __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
5015 
5016     const XMMRegister IV = xmm0;
5017     // Load IV and broadcast value to 512-bits
5018     __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
5019 
5020     // Temporary variables for storing round keys
5021     const XMMRegister RK0 = xmm30;
5022     const XMMRegister RK1 = xmm9;
5023     const XMMRegister RK2 = xmm18;
5024     const XMMRegister RK3 = xmm19;
5025     const XMMRegister RK4 = xmm20;
5026     const XMMRegister RK5 = xmm21;
5027     const XMMRegister RK6 = xmm22;
5028     const XMMRegister RK7 = xmm23;
5029     const XMMRegister RK8 = xmm24;
5030     const XMMRegister RK9 = xmm25;
5031     const XMMRegister RK10 = xmm26;
5032 
5033     // Load and shuffle key
5034     // the java expanded key ordering is rotated one position from what we want
5035     // so we start from 1*16 here and hit 0*16 last
5036     ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
5037     ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
5038     ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
5039     ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
5040     ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
5041     ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
5042     ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
5043     ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
5044     ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
5045     ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
5046     ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
5047 
5048     // Variables for storing source cipher text
5049     const XMMRegister S0 = xmm10;
5050     const XMMRegister S1 = xmm11;
5051     const XMMRegister S2 = xmm12;
5052     const XMMRegister S3 = xmm13;
5053     const XMMRegister S4 = xmm14;
5054     const XMMRegister S5 = xmm15;
5055     const XMMRegister S6 = xmm16;
5056     const XMMRegister S7 = xmm17;
5057 
5058     // Variables for storing decrypted text
5059     const XMMRegister B0 = xmm1;
5060     const XMMRegister B1 = xmm2;
5061     const XMMRegister B2 = xmm3;
5062     const XMMRegister B3 = xmm4;
5063     const XMMRegister B4 = xmm5;
5064     const XMMRegister B5 = xmm6;
5065     const XMMRegister B6 = xmm7;
5066     const XMMRegister B7 = xmm8;
5067 
5068     __ cmpl(rounds, 44);
5069     __ jcc(Assembler::greater, KEY_192);
5070     __ jmp(Loop);
5071 
5072     __ BIND(KEY_192);
5073     const XMMRegister RK11 = xmm27;
5074     const XMMRegister RK12 = xmm28;
5075     ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
5076     ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
5077 
5078     __ cmpl(rounds, 52);
5079     __ jcc(Assembler::greater, KEY_256);
5080     __ jmp(Loop);
5081 
5082     __ BIND(KEY_256);
5083     const XMMRegister RK13 = xmm29;
5084     const XMMRegister RK14 = xmm31;
5085     ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
5086     ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
5087 
5088     __ BIND(Loop);
5089     __ cmpl(len_reg, 512);
5090     __ jcc(Assembler::below, Lcbc_dec_rem);
5091     __ BIND(Loop1);
5092     __ subl(len_reg, 512);
5093     __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
5094     __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
5095     __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
5096     __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
5097     __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
5098     __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
5099     __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
5100     __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
5101     __ leaq(from, Address(from, 8 * 64));
5102 
5103     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5104     __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
5105     __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
5106     __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
5107     __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
5108     __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
5109     __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
5110     __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
5111 
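    // Shift the chain of ciphertext blocks down by one 128-bit block: after these
    // valignq steps, IV and S0..S6 hold, lane for lane, the ciphertext block that
    // precedes the one being decrypted into B0..B7, i.e. exactly the value each
    // decrypted block is XORed with for CBC chaining (see Loop2 below).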
5112     __ evalignq(IV, S0, IV, 0x06);
5113     __ evalignq(S0, S1, S0, 0x06);
5114     __ evalignq(S1, S2, S1, 0x06);
5115     __ evalignq(S2, S3, S2, 0x06);
5116     __ evalignq(S3, S4, S3, 0x06);
5117     __ evalignq(S4, S5, S4, 0x06);
5118     __ evalignq(S5, S6, S5, 0x06);
5119     __ evalignq(S6, S7, S6, 0x06);
5120 
5121     roundDec(RK2);
5122     roundDec(RK3);
5123     roundDec(RK4);
5124     roundDec(RK5);
5125     roundDec(RK6);
5126     roundDec(RK7);
5127     roundDec(RK8);
5128     roundDec(RK9);
5129     roundDec(RK10);
5130 
5131     __ cmpl(rounds, 44);
5132     __ jcc(Assembler::belowEqual, L_128);
5133     roundDec(RK11);
5134     roundDec(RK12);
5135 
5136     __ cmpl(rounds, 52);
5137     __ jcc(Assembler::belowEqual, L_192);
5138     roundDec(RK13);
5139     roundDec(RK14);
5140 
5141     __ BIND(L_256);
5142     roundDeclast(RK0);
5143     __ jmp(Loop2);
5144 
5145     __ BIND(L_128);
5146     roundDeclast(RK0);
5147     __ jmp(Loop2);
5148 
5149     __ BIND(L_192);
5150     roundDeclast(RK0);
5151 
5152     __ BIND(Loop2);
5153     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5154     __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
5155     __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
5156     __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
5157     __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
5158     __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
5159     __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
5160     __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
5161     __ evmovdquq(IV, S7, Assembler::AVX_512bit);
5162 
5163     __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
5164     __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
5165     __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
5166     __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
5167     __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
5168     __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
5169     __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
5170     __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
5171     __ leaq(to, Address(to, 8 * 64));
5172     __ jmp(Loop);
5173 
5174     __ BIND(Lcbc_dec_rem);
5175     __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
5176 
5177     __ BIND(Lcbc_dec_rem_loop);
5178     __ subl(len_reg, 16);
5179     __ jcc(Assembler::carrySet, Lcbc_dec_ret);
5180 
5181     __ movdqu(S0, Address(from, 0));
5182     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5183     __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
5184     __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
5185     __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
5186     __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
5187     __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
5188     __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
5189     __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
5190     __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
5191     __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
5192     __ cmpl(rounds, 44);
5193     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5194 
5195     __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
5196     __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
5197     __ cmpl(rounds, 52);
5198     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5199 
5200     __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
5201     __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
5202 
5203     __ BIND(Lcbc_dec_rem_last);
5204     __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
5205 
5206     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5207     __ evmovdquq(IV, S0, Assembler::AVX_512bit);
5208     __ movdqu(Address(to, 0), B0);
5209     __ leaq(from, Address(from, 16));
5210     __ leaq(to, Address(to, 16));
5211     __ jmp(Lcbc_dec_rem_loop);
5212 
5213     __ BIND(Lcbc_dec_ret);
5214     __ movdqu(Address(rvec, 0), IV);
5215 
5216     // Zero out the round keys
5217     __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
5218     __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
5219     __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
5220     __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
5221     __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
5222     __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
5223     __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
5224     __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
5225     __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
5226     __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
5227     __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
5228     __ cmpl(rounds, 44);
5229     __ jcc(Assembler::belowEqual, Lcbc_exit);
5230     __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
5231     __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
5232     __ cmpl(rounds, 52);
5233     __ jcc(Assembler::belowEqual, Lcbc_exit);
5234     __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
5235     __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
5236 
5237     __ BIND(Lcbc_exit);
5238     __ vzeroupper();
5239     __ pop(rbx);
5240 #ifdef _WIN64
5241     __ movl(rax, len_mem);
5242 #else
5243     __ pop(rax); // return length
5244 #endif
5245     __ leave(); // required for proper stackwalking of RuntimeStub frame
5246     __ ret(0);
5247     return start;
5248 }
5249 
5250 // Polynomial x^128+x^127+x^126+x^121+1
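     // (This is the bit-reflected image of the GCM polynomial x^128 + x^7 + x^2 + x + 1,
     // obtained by mapping each term x^i to x^(128-i).)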
5251 address ghash_polynomial_addr() {
5252     __ align(CodeEntryAlignment);
5253     StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
5254     address start = __ pc();
5255     __ emit_data64(0x0000000000000001, relocInfo::none);
5256     __ emit_data64(0xc200000000000000, relocInfo::none);
5257     return start;
5258 }
5259 
5260 address ghash_shufflemask_addr() {
5261     __ align(CodeEntryAlignment);
5262     StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
5263     address start = __ pc();
5264     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5265     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5266     return start;
5267 }
5268 
5269 // Ghash single and multi block operations using AVX instructions
5270 address generate_avx_ghash_processBlocks() {
5271     __ align(CodeEntryAlignment);
5272 
5273     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5274     address start = __ pc();
5275 
5276     // arguments
5277     const Register state = c_rarg0;
5278     const Register htbl = c_rarg1;
5279     const Register data = c_rarg2;
5280     const Register blocks = c_rarg3;
5281     __ enter();
5282     // Save state before entering routine
5283     __ avx_ghash(state, htbl, data, blocks);
5284     __ leave(); // required for proper stackwalking of RuntimeStub frame
5285     __ ret(0);
5286     return start;
5287 }
5288 
5289   // byte swap x86 long
5290   address generate_ghash_long_swap_mask() {
5291     __ align(CodeEntryAlignment);
5292     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
5293     address start = __ pc();
5294     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
5295     __ emit_data64(0x0706050403020100, relocInfo::none );
5296     return start;
5297   }
5298 
5299   // byte swap x86 byte array
5300   address generate_ghash_byte_swap_mask() {
5301     __ align(CodeEntryAlignment);
5302     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
5303     address start = __ pc();
5304     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
5305     __ emit_data64(0x0001020304050607, relocInfo::none );
5306     return start;
5307   }
5308 
5309   /* Single and multi-block ghash operations */
5310   address generate_ghash_processBlocks() {
5311     __ align(CodeEntryAlignment);
5312     Label L_ghash_loop, L_exit;
5313     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5314     address start = __ pc();
5315 
5316     const Register state        = c_rarg0;
5317     const Register subkeyH      = c_rarg1;
5318     const Register data         = c_rarg2;
5319     const Register blocks       = c_rarg3;
5320 
5321     const XMMRegister xmm_temp0 = xmm0;
5322     const XMMRegister xmm_temp1 = xmm1;
5323     const XMMRegister xmm_temp2 = xmm2;
5324     const XMMRegister xmm_temp3 = xmm3;
5325     const XMMRegister xmm_temp4 = xmm4;
5326     const XMMRegister xmm_temp5 = xmm5;
5327     const XMMRegister xmm_temp6 = xmm6;
5328     const XMMRegister xmm_temp7 = xmm7;
5329     const XMMRegister xmm_temp8 = xmm8;
5330     const XMMRegister xmm_temp9 = xmm9;
5331     const XMMRegister xmm_temp10 = xmm10;
5332 
5333     __ enter();
5334 
5335     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
5336 
5337     __ movdqu(xmm_temp0, Address(state, 0));
5338     __ pshufb(xmm_temp0, xmm_temp10);
5339 
5340 
5341     __ BIND(L_ghash_loop);
5342     __ movdqu(xmm_temp2, Address(data, 0));
5343     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
5344 
5345     __ movdqu(xmm_temp1, Address(subkeyH, 0));
5346     __ pshufb(xmm_temp1, xmm_temp10);
5347 
5348     __ pxor(xmm_temp0, xmm_temp2);
5349 
5350     //
5351     // Multiply with the hash key
5352     //
5353     __ movdqu(xmm_temp3, xmm_temp0);
5354     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
5355     __ movdqu(xmm_temp4, xmm_temp0);
5356     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
5357 
5358     __ movdqu(xmm_temp5, xmm_temp0);
5359     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
5360     __ movdqu(xmm_temp6, xmm_temp0);
5361     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
5362 
5363     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
5364 
5365     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
5366     __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
5367     __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
5368     __ pxor(xmm_temp3, xmm_temp5);
5369     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
5370                                         // of the carry-less multiplication of
5371                                         // xmm0 by xmm1.
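     // Written out, the 128x128-bit carry-less product is
     //   (a1*x^64 + a0) * (b1*x^64 + b0)
     //     = (a1*b1)*x^128 + (a1*b0 + a0*b1)*x^64 + (a0*b0)
     // where every addition above is an XOR.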
5372 
5373     // We shift the result of the multiplication by one bit position
5374     // to the left to compensate for the fact that the bits are reversed.
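     // Each 32-bit lane is shifted left by one; the bit shifted out of the
     // lane below is reinserted via the psrld(31)/pslldq(4) pairs, and xmm9
     // carries the single bit that crosses from xmm3 into xmm6.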
5375     __ movdqu(xmm_temp7, xmm_temp3);
5376     __ movdqu(xmm_temp8, xmm_temp6);
5377     __ pslld(xmm_temp3, 1);
5378     __ pslld(xmm_temp6, 1);
5379     __ psrld(xmm_temp7, 31);
5380     __ psrld(xmm_temp8, 31);
5381     __ movdqu(xmm_temp9, xmm_temp7);
5382     __ pslldq(xmm_temp8, 4);
5383     __ pslldq(xmm_temp7, 4);
5384     __ psrldq(xmm_temp9, 12);
5385     __ por(xmm_temp3, xmm_temp7);
5386     __ por(xmm_temp6, xmm_temp8);
5387     __ por(xmm_temp6, xmm_temp9);
5388 
5389     //
5390     // First phase of the reduction
5391     //
5392     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
5393     // independently.
5394     __ movdqu(xmm_temp7, xmm_temp3);
5395     __ movdqu(xmm_temp8, xmm_temp3);
5396     __ movdqu(xmm_temp9, xmm_temp3);
5397     __ pslld(xmm_temp7, 31);    // packed left shift by 31
5398     __ pslld(xmm_temp8, 30);    // packed left shift by 30
5399     __ pslld(xmm_temp9, 25);    // packed left shift by 25
5400     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
5401     __ pxor(xmm_temp7, xmm_temp9);
5402     __ movdqu(xmm_temp8, xmm_temp7);
5403     __ pslldq(xmm_temp7, 12);
5404     __ psrldq(xmm_temp8, 4);
5405     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
5406 
5407     //
5408     // Second phase of the reduction
5409     //
5410     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
5411     // shift operations.
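     // The right-shift amounts 1, 2 and 7 correspond to the x^127, x^126 and
     // x^121 terms of the reduction polynomial noted above (128 minus the
     // exponent); the first phase effectively used their 32-bit complements
     // 31, 30 and 25.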
5412     __ movdqu(xmm_temp2, xmm_temp3);
5413     __ movdqu(xmm_temp4, xmm_temp3);
5414     __ movdqu(xmm_temp5, xmm_temp3);
5415     __ psrld(xmm_temp2, 1);     // packed right shift by 1
5416     __ psrld(xmm_temp4, 2);     // packed right shift by 2
5417     __ psrld(xmm_temp5, 7);     // packed right shift by 7
5418     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
5419     __ pxor(xmm_temp2, xmm_temp5);
5420     __ pxor(xmm_temp2, xmm_temp8);
5421     __ pxor(xmm_temp3, xmm_temp2);
5422     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
5423 
5424     __ decrement(blocks);
5425     __ jcc(Assembler::zero, L_exit);
5426     __ movdqu(xmm_temp0, xmm_temp6);
5427     __ addptr(data, 16);
5428     __ jmp(L_ghash_loop);
5429 
5430     __ BIND(L_exit);
5431     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
5432     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
5433     __ leave();
5434     __ ret(0);
5435     return start;
5436   }
5437 
5438   address base64_shuffle_addr()
5439   {
5440     __ align64();
5441     StubCodeMark mark(this, "StubRoutines", "shuffle_base64");
5442     address start = __ pc();
5443     assert(((unsigned long long)start & 0x3f) == 0,
5444            "Alignment problem (0x%08llx)", (unsigned long long)start);
5445     __ emit_data64(0x0405030401020001, relocInfo::none);
5446     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5447     __ emit_data64(0x10110f100d0e0c0d, relocInfo::none);
5448     __ emit_data64(0x1617151613141213, relocInfo::none);
5449     __ emit_data64(0x1c1d1b1c191a1819, relocInfo::none);
5450     __ emit_data64(0x222321221f201e1f, relocInfo::none);
5451     __ emit_data64(0x2829272825262425, relocInfo::none);
5452     __ emit_data64(0x2e2f2d2e2b2c2a2b, relocInfo::none);
5453     return start;
5454   }
5455 
5456   address base64_avx2_shuffle_addr()
5457   {
5458     __ align32();
5459     StubCodeMark mark(this, "StubRoutines", "avx2_shuffle_base64");
5460     address start = __ pc();
5461     __ emit_data64(0x0809070805060405, relocInfo::none);
5462     __ emit_data64(0x0e0f0d0e0b0c0a0b, relocInfo::none);
5463     __ emit_data64(0x0405030401020001, relocInfo::none);
5464     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5465     return start;
5466   }
5467 
5468   address base64_avx2_input_mask_addr()
5469   {
5470     __ align32();
5471     StubCodeMark mark(this, "StubRoutines", "avx2_input_mask_base64");
5472     address start = __ pc();
5473     __ emit_data64(0x8000000000000000, relocInfo::none);
5474     __ emit_data64(0x8000000080000000, relocInfo::none);
5475     __ emit_data64(0x8000000080000000, relocInfo::none);
5476     __ emit_data64(0x8000000080000000, relocInfo::none);
5477     return start;
5478   }
5479 
5480   address base64_avx2_lut_addr()
5481   {
5482     __ align32();
5483     StubCodeMark mark(this, "StubRoutines", "avx2_lut_base64");
5484     address start = __ pc();
5485     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5486     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5487     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5488     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5489 
5490     // URL LUT
5491     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5492     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5493     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5494     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5495     return start;
5496   }
5497 
5498   address base64_encoding_table_addr()
5499   {
5500     __ align64();
5501     StubCodeMark mark(this, "StubRoutines", "encoding_table_base64");
5502     address start = __ pc();
5503     assert(((unsigned long long)start & 0x3f) == 0, "Alignment problem (0x%08llx)", (unsigned long long)start);
5504     __ emit_data64(0x4847464544434241, relocInfo::none);
5505     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5506     __ emit_data64(0x5857565554535251, relocInfo::none);
5507     __ emit_data64(0x6665646362615a59, relocInfo::none);
5508     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5509     __ emit_data64(0x767574737271706f, relocInfo::none);
5510     __ emit_data64(0x333231307a797877, relocInfo::none);
5511     __ emit_data64(0x2f2b393837363534, relocInfo::none);
5512 
5513     // URL table
5514     __ emit_data64(0x4847464544434241, relocInfo::none);
5515     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5516     __ emit_data64(0x5857565554535251, relocInfo::none);
5517     __ emit_data64(0x6665646362615a59, relocInfo::none);
5518     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5519     __ emit_data64(0x767574737271706f, relocInfo::none);
5520     __ emit_data64(0x333231307a797877, relocInfo::none);
5521     __ emit_data64(0x5f2d393837363534, relocInfo::none);
5522     return start;
5523   }
5524 
5525   // Code for generating Base64 encoding.
5526   // Intrinsic function prototype in Base64.java:
5527   // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp,
5528   // boolean isURL) {
5529   address generate_base64_encodeBlock()
5530   {
5531     __ align(CodeEntryAlignment);
5532     StubCodeMark mark(this, "StubRoutines", "implEncode");
5533     address start = __ pc();
5534     __ enter();
5535 
5536     // Save callee-saved registers before using them
5537     __ push(r12);
5538     __ push(r13);
5539     __ push(r14);
5540     __ push(r15);
5541 
5542     // arguments
5543     const Register source = c_rarg0;       // Source Array
5544     const Register start_offset = c_rarg1; // start offset
5545     const Register end_offset = c_rarg2;   // end offset
5546     const Register dest = c_rarg3;   // destination array
5547 
5548 #ifndef _WIN64
5549     const Register dp = c_rarg4;    // Position for writing to dest array
5550     const Register isURL = c_rarg5; // Base64 or URL character set
5551 #else
5552     const Address dp_mem(rbp, 6 * wordSize); // dp is passed on the stack on Win64
5553     const Address isURL_mem(rbp, 7 * wordSize);
5554     const Register isURL = r10; // pick a volatile Windows register
5555     const Register dp = r12;
5556     __ movl(dp, dp_mem);
5557     __ movl(isURL, isURL_mem);
5558 #endif
5559 
5560     const Register length = r14;
5561     const Register encode_table = r13;
5562     Label L_process3, L_exit, L_processdata, L_vbmiLoop, L_not512, L_32byteLoop;
5563 
5564     // calculate length from offsets
5565     __ movl(length, end_offset);
5566     __ subl(length, start_offset);
5567     __ cmpl(length, 0);
5568     __ jcc(Assembler::lessEqual, L_exit);
5569 
5570     // Code for 512-bit VBMI encoding.  Encodes 48 input bytes into 64
5571     // output bytes. We read 64 input bytes and ignore the last 16, so be
5572     // sure not to read past the end of the input buffer.
5573     if (VM_Version::supports_avx512_vbmi()) {
5574       __ cmpl(length, 64); // Do not overrun input buffer.
5575       __ jcc(Assembler::below, L_not512);
5576 
5577       __ shll(isURL, 6); // index into encode table based on isURL
5578       __ lea(encode_table, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5579       __ addptr(encode_table, isURL);
5580       __ shrl(isURL, 6); // restore isURL
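     // (Each alphabet emitted by base64_encoding_table_addr() is 64 bytes,
     // so isURL << 6 selects the standard or URL table.)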
5581 
5582       __ mov64(rax, 0x3036242a1016040aull); // Shifts
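     // Each byte of this constant is a bit offset; vpmultishiftqb below pulls
     // an 8-bit field starting at that offset out of each shuffled qword, and
     // the following vpermb uses only the low 6 bits of each field to index
     // the 64-byte encode table.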
5583       __ evmovdquq(xmm3, ExternalAddress(StubRoutines::x86::base64_shuffle_addr()), Assembler::AVX_512bit, r15);
5584       __ evmovdquq(xmm2, Address(encode_table, 0), Assembler::AVX_512bit);
5585       __ evpbroadcastq(xmm1, rax, Assembler::AVX_512bit);
5586 
5587       __ align32();
5588       __ BIND(L_vbmiLoop);
5589 
5590       __ vpermb(xmm0, xmm3, Address(source, start_offset), Assembler::AVX_512bit);
5591       __ subl(length, 48);
5592 
5593       // Put the input bytes into the proper lanes for writing, then
5594       // encode them.
5595       __ evpmultishiftqb(xmm0, xmm1, xmm0, Assembler::AVX_512bit);
5596       __ vpermb(xmm0, xmm0, xmm2, Assembler::AVX_512bit);
5597 
5598       // Write to destination
5599       __ evmovdquq(Address(dest, dp), xmm0, Assembler::AVX_512bit);
5600 
5601       __ addptr(dest, 64);
5602       __ addptr(source, 48);
5603       __ cmpl(length, 64);
5604       __ jcc(Assembler::aboveEqual, L_vbmiLoop);
5605 
5606       __ vzeroupper();
5607     }
5608 
5609     __ BIND(L_not512);
5610     if (VM_Version::supports_avx2()
5611         && VM_Version::supports_avx512vlbw()) {
5612       /*
5613       ** This AVX2 encoder is based off the paper at:
5614       **      https://dl.acm.org/doi/10.1145/3132709
5615       **
5616       ** We use AVX2 SIMD instructions to encode 24 bytes into 32
5617       ** output bytes.
5618       **
5619       */
5620       // Lengths under 32 bytes are done with scalar routine
5621       __ cmpl(length, 31);
5622       __ jcc(Assembler::belowEqual, L_process3);
5623 
5624       // Set up supporting constant table data
5625       __ vmovdqu(xmm9, ExternalAddress(StubRoutines::x86::base64_avx2_shuffle_addr()), rax);
5626       // 6-bit mask for 2nd and 4th (and multiples) 6-bit values
5627       __ movl(rax, 0x0fc0fc00);
5628       __ vmovdqu(xmm1, ExternalAddress(StubRoutines::x86::base64_avx2_input_mask_addr()), rax);
5629       __ evpbroadcastd(xmm8, rax, Assembler::AVX_256bit);
5630 
5631       // Multiplication constant for "shifting" right by 6 and 10
5632       // bits
5633       __ movl(rax, 0x04000040);
5634 
5635       __ subl(length, 24);
5636       __ evpbroadcastd(xmm7, rax, Assembler::AVX_256bit);
5637 
5638       // For the first load, we mask off reading of the first 4
5639       // bytes into the register. This is so we can get 4 3-byte
5640       // chunks into each lane of the register, avoiding having to
5641       // handle end conditions.  We then shuffle these bytes into a
5642       // specific order so that manipulation is easier.
5643       //
5644       // The initial read loads the XMM register like this:
5645       //
5646       // Lower 128-bit lane:
5647       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5648       // | XX | XX | XX | XX | A0 | A1 | A2 | B0 | B1 | B2 | C0 | C1 | C2 | D0 | D1 | D2 |
5650       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5651       //
5652       // Upper 128-bit lane:
5653       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5654       // | E0 | E1 | E2 | F0 | F1 | F2 | G0 | G1 | G2 | H0 | H1 | H2 | XX | XX | XX | XX |
5656       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5657       //
5658       // Where A0 is the first input byte, B0 is the fourth, etc.
5659       // The alphabetical significance denotes the 3 bytes to be
5660       // consumed and encoded into 4 bytes.
5661       //
5662       // We then shuffle the register so each 32-bit word contains
5663       // the sequence:
5664       //    A1 A0 A2 A1, B1 B0 B2 B1, etc.
5665       // Each of these byte sequences are then manipulated into 4
5666       // 6-bit values ready for encoding.
5667       //
5668       // If we focus on one set of 3-byte chunks, changing the
5669       // nomenclature such that A0 => a, A1 => b, and A2 => c, we
5670       // shuffle such that each 24-bit chunk contains:
5671       //
5672       // b7 b6 b5 b4 b3 b2 b1 b0 | a7 a6 a5 a4 a3 a2 a1 a0 | c7 c6 c5 c4 c3 c2 c1 c0 | b7 b6 b5 b4 b3 b2 b1 b0
5673       //
5674       // Relabeling the same 32-bit word in terms of the four 6-bit output
     // values (a, b, c, d, with a being the first output value) gives:
5675       //
5676       // b3 b2 b1 b0 c5 c4 c3 c2 | c1 c0 d5 d4 d3 d2 d1 d0 | a5 a4 a3 a2 a1 a0 b5 b4 | b3 b2 b1 b0 c5 c4 c3 c2
5677       //
5678       // We first mask off all but bits 10-15 and 22-27 (a5..a0 and
5679       // c5..c0) and shift them using a vector multiplication
5680       // operation (vpmulhuw) which effectively shifts c right by 6
5681       // bits and a right by 10 bits.  We similarly mask bits 4-9
5682       // (b5..b0) and 16-21 (d5..d0) and shift them left by 4 and 8
5683       // bits respectively.  This is done using vpmullw.  We end up
5684       // with 4 6-bit values, thus splitting the 3 input bytes,
5685       // ready for encoding:
5686       //    0 0 d5..d0 0 0 c5..c0 0 0 b5..b0 0 0 a5..a0
5687       //
5688       // For translation, we recognize that there are 5 distinct
5689       // ranges of legal Base64 characters as below:
5690       //
5691       //   +-------------+-------------+------------+
5692       //   | 6-bit value | ASCII range |   offset   |
5693       //   +-------------+-------------+------------+
5694       //   |    0..25    |    A..Z     |     65     |
5695       //   |   26..51    |    a..z     |     71     |
5696       //   |   52..61    |    0..9     |     -4     |
5697       //   |     62      |   + or -    | -19 or -17 |
5698       //   |     63      |   / or _    | -16 or 32  |
5699       //   +-------------+-------------+------------+
5700       //
5701       // We note that vpshufb does a parallel lookup in a
5702       // destination register using the lower 4 bits of bytes from a
5703       // source register.  If we use a saturated subtraction and
5704       // subtract 51 from each 6-bit value, bytes from [0,51]
5705       // saturate to 0, and [52,63] map to a range of [1,12].  We
5706       // distinguish the [0,25] and [26,51] ranges by assigning a
5707       // value of 13 for all 6-bit values less than 26.  We end up
5708       // with:
5709       //
5710       //   +-------------+-------------+------------+
5711       //   | 6-bit value |   Reduced   |   offset   |
5712       //   +-------------+-------------+------------+
5713       //   |    0..25    |     13      |     65     |
5714       //   |   26..51    |      0      |     71     |
5715       //   |   52..61    |    0..9     |     -4     |
5716       //   |     62      |     11      | -19 or -17 |
5717       //   |     63      |     12      | -16 or 32  |
5718       //   +-------------+-------------+------------+
5719       //
5720       // We then use a final vpshufb to add the appropriate offset,
5721       // translating the bytes.
5722       //
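      // Worked example: the 6-bit value 0 reduces to 13, whose LUT entry is
      // 65, giving 'A' (0 + 65 = 0x41); the value 62 reduces to 11, whose
      // entry is -19 (or -17 for URL), giving '+' (or '-').
      //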
5723       // Load input bytes - only 28 bytes.  Mask the first load to
5724       // not load into the full register.
5725       __ vpmaskmovd(xmm1, xmm1, Address(source, start_offset, Address::times_1, -4), Assembler::AVX_256bit);
5726 
5727       // Move 3-byte chunks of input (12 bytes) into 16 bytes,
5728       // ordering by:
5729       //   1, 0, 2, 1; 4, 3, 5, 4; etc.  This groups 6-bit chunks
5730       //   for easy masking
5731       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5732 
5733       __ addl(start_offset, 24);
5734 
5735       // Load masking register for first and third (and multiples)
5736       // 6-bit values.
5737       __ movl(rax, 0x003f03f0);
5738       __ evpbroadcastd(xmm6, rax, Assembler::AVX_256bit);
5739       // Multiplication constant for "shifting" left by 4 and 8 bits
5740       __ movl(rax, 0x01000010);
5741       __ evpbroadcastd(xmm5, rax, Assembler::AVX_256bit);
5742 
5743       // Isolate 6-bit chunks of interest
5744       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5745 
5746       // Load constants for encoding
5747       __ movl(rax, 0x19191919);
5748       __ evpbroadcastd(xmm3, rax, Assembler::AVX_256bit);
5749       __ movl(rax, 0x33333333);
5750       __ evpbroadcastd(xmm4, rax, Assembler::AVX_256bit);
5751 
5752       // Shift output bytes 0 and 2 into proper lanes
5753       __ vpmulhuw(xmm2, xmm0, xmm7, Assembler::AVX_256bit);
5754 
5755       // Mask and shift output bytes 1 and 3 into proper lanes and
5756       // combine
5757       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5758       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5759       __ vpor(xmm0, xmm0, xmm2, Assembler::AVX_256bit);
5760 
5761       // Find out which are 0..25.  This indicates which input
5762       // values fall in the range of 'A'-'Z', which require an
5763       // additional offset (see comments above)
5764       __ vpcmpgtb(xmm2, xmm0, xmm3, Assembler::AVX_256bit);
5765       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5766       __ vpsubb(xmm1, xmm1, xmm2, Assembler::AVX_256bit);
5767 
5768       // Load the proper lookup table
5769       __ lea(r11, ExternalAddress(StubRoutines::x86::base64_avx2_lut_addr()));
5770       __ movl(r15, isURL);
5771       __ shll(r15, 5);
5772       __ vmovdqu(xmm2, Address(r11, r15));
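     // (Each offset LUT emitted by base64_avx2_lut_addr() is 32 bytes, so
     // isURL << 5 selects the standard or URL variant.)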
5773 
5774       // Shuffle the offsets based on the range calculation done
5775       // above. This allows us to add the correct offset to the
5776       // 6-bit value corresponding to the range documented above.
5777       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5778       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5779 
5780       // Store the encoded bytes
5781       __ vmovdqu(Address(dest, dp), xmm0);
5782       __ addl(dp, 32);
5783 
5784       __ cmpl(length, 31);
5785       __ jcc(Assembler::belowEqual, L_process3);
5786 
5787       __ align32();
5788       __ BIND(L_32byteLoop);
5789 
5790       // Get next 32 bytes
5791       __ vmovdqu(xmm1, Address(source, start_offset, Address::times_1, -4));
5792 
5793       __ subl(length, 24);
5794       __ addl(start_offset, 24);
5795 
5796       // This logic is identical to the above, with only constant
5797       // register loads removed.  Shuffle the input, mask off 6-bit
5798       // chunks, shift them into place, then add the offset to
5799       // encode.
5800       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5801 
5802       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5803       __ vpmulhuw(xmm10, xmm0, xmm7, Assembler::AVX_256bit);
5804       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5805       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5806       __ vpor(xmm0, xmm0, xmm10, Assembler::AVX_256bit);
5807       __ vpcmpgtb(xmm10, xmm0, xmm3, Assembler::AVX_256bit);
5808       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5809       __ vpsubb(xmm1, xmm1, xmm10, Assembler::AVX_256bit);
5810       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5811       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5812 
5813       // Store the encoded bytes
5814       __ vmovdqu(Address(dest, dp), xmm0);
5815       __ addl(dp, 32);
5816 
5817       __ cmpl(length, 31);
5818       __ jcc(Assembler::above, L_32byteLoop);
5819 
5820       __ BIND(L_process3);
5821       __ vzeroupper();
5822     } else {
5823       __ BIND(L_process3);
5824     }
5825 
5826     __ cmpl(length, 3);
5827     __ jcc(Assembler::below, L_exit);
5828 
5829     // Load the encoding table based on isURL
5830     __ lea(r11, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5831     __ movl(r15, isURL);
5832     __ shll(r15, 6);
5833     __ addptr(r11, r15);
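     // (isURL << 6 skips over the 64-byte standard alphabet to the URL one.)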
5834 
5835     __ BIND(L_processdata);
5836 
5837     // Load 3 bytes
5838     __ load_unsigned_byte(r15, Address(source, start_offset));
5839     __ load_unsigned_byte(r10, Address(source, start_offset, Address::times_1, 1));
5840     __ load_unsigned_byte(r13, Address(source, start_offset, Address::times_1, 2));
5841 
5842     // Build a 32-bit word with bytes 1, 2, 0, 1
5843     __ movl(rax, r10);
5844     __ shll(r10, 24);
5845     __ orl(rax, r10);
5846 
5847     __ subl(length, 3);
5848 
5849     __ shll(r15, 8);
5850     __ shll(r13, 16);
5851     __ orl(rax, r15);
5852 
5853     __ addl(start_offset, 3);
5854 
5855     __ orl(rax, r13);
5856     // At this point, rax contains | byte1 | byte2 | byte0 | byte1
5857     // r13 has byte2 << 16 - need low-order 6 bits to translate.
5858     // This translated byte is the fourth output byte.
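     // For reference, the four 6-bit output values are:
     //   s0 = byte0 >> 2
     //   s1 = ((byte0 & 0x03) << 4) | (byte1 >> 4)
     //   s2 = ((byte1 & 0x0f) << 2) | (byte2 >> 6)
     //   s3 = byte2 & 0x3f
     // and the shifts below pull each of them out of rax, r13 and r15.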
5859     __ shrl(r13, 16);
5860     __ andl(r13, 0x3f);
5861 
5862     // The high-order 6 bits of byte0 (held in r15 as byte0 << 8) are
5863     // translated; the translated byte is the first output byte.
5864     __ shrl(r15, 10);
5865 
5866     __ load_unsigned_byte(r13, Address(r11, r13));
5867     __ load_unsigned_byte(r15, Address(r11, r15));
5868 
5869     __ movb(Address(dest, dp, Address::times_1, 3), r13);
5870 
5871     // Extract high-order 4 bits of byte1 and low-order 2 bits of byte0.
5872     // This translated byte is the second output byte.
5873     __ shrl(rax, 4);
5874     __ movl(r10, rax);
5875     __ andl(rax, 0x3f);
5876 
5877     __ movb(Address(dest, dp, Address::times_1, 0), r15);
5878 
5879     __ load_unsigned_byte(rax, Address(r11, rax));
5880 
5881     // Extract low-order 4 bits of byte1 and high-order 2 bits of byte2.
5882     // This translated byte is the third output byte.
5883     __ shrl(r10, 18);
5884     __ andl(r10, 0x3f);
5885 
5886     __ load_unsigned_byte(r10, Address(r11, r10));
5887 
5888     __ movb(Address(dest, dp, Address::times_1, 1), rax);
5889     __ movb(Address(dest, dp, Address::times_1, 2), r10);
5890 
5891     __ addl(dp, 4);
5892     __ cmpl(length, 3);
5893     __ jcc(Assembler::aboveEqual, L_processdata);
5894 
5895     __ BIND(L_exit);
5896     __ pop(r15);
5897     __ pop(r14);
5898     __ pop(r13);
5899     __ pop(r12);
5900     __ leave();
5901     __ ret(0);
5902     return start;
5903   }
5904 
5905   // base64 AVX512vbmi tables
5906   address base64_vbmi_lookup_lo_addr() {
5907     __ align64();
5908     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64");
5909     address start = __ pc();
5910     assert(((unsigned long long)start & 0x3f) == 0,
5911            "Alignment problem (0x%08llx)", (unsigned long long)start);
5912     __ emit_data64(0x8080808080808080, relocInfo::none);
5913     __ emit_data64(0x8080808080808080, relocInfo::none);
5914     __ emit_data64(0x8080808080808080, relocInfo::none);
5915     __ emit_data64(0x8080808080808080, relocInfo::none);
5916     __ emit_data64(0x8080808080808080, relocInfo::none);
5917     __ emit_data64(0x3f8080803e808080, relocInfo::none);
5918     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5919     __ emit_data64(0x8080808080803d3c, relocInfo::none);
5920     return start;
5921   }
5922 
5923   address base64_vbmi_lookup_hi_addr() {
5924     __ align64();
5925     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64");
5926     address start = __ pc();
5927     assert(((unsigned long long)start & 0x3f) == 0,
5928            "Alignment problem (0x%08llx)", (unsigned long long)start);
5929     __ emit_data64(0x0605040302010080, relocInfo::none);
5930     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5931     __ emit_data64(0x161514131211100f, relocInfo::none);
5932     __ emit_data64(0x8080808080191817, relocInfo::none);
5933     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5934     __ emit_data64(0x2827262524232221, relocInfo::none);
5935     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5936     __ emit_data64(0x8080808080333231, relocInfo::none);
5937     return start;
5938   }
5939   address base64_vbmi_lookup_lo_url_addr() {
5940     __ align64();
5941     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64url");
5942     address start = __ pc();
5943     assert(((unsigned long long)start & 0x3f) == 0,
5944            "Alignment problem (0x%08llx)", (unsigned long long)start);
5945     __ emit_data64(0x8080808080808080, relocInfo::none);
5946     __ emit_data64(0x8080808080808080, relocInfo::none);
5947     __ emit_data64(0x8080808080808080, relocInfo::none);
5948     __ emit_data64(0x8080808080808080, relocInfo::none);
5949     __ emit_data64(0x8080808080808080, relocInfo::none);
5950     __ emit_data64(0x80803e8080808080, relocInfo::none);
5951     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5952     __ emit_data64(0x8080808080803d3c, relocInfo::none);
5953     return start;
5954   }
5955 
5956   address base64_vbmi_lookup_hi_url_addr() {
5957     __ align64();
5958     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64url");
5959     address start = __ pc();
5960     assert(((unsigned long long)start & 0x3f) == 0,
5961            "Alignment problem (0x%08llx)", (unsigned long long)start);
5962     __ emit_data64(0x0605040302010080, relocInfo::none);
5963     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5964     __ emit_data64(0x161514131211100f, relocInfo::none);
5965     __ emit_data64(0x3f80808080191817, relocInfo::none);
5966     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5967     __ emit_data64(0x2827262524232221, relocInfo::none);
5968     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5969     __ emit_data64(0x8080808080333231, relocInfo::none);
5970     return start;
5971   }
5972 
5973   address base64_vbmi_pack_vec_addr() {
5974     __ align64();
5975     StubCodeMark mark(this, "StubRoutines", "pack_vec_base64");
5976     address start = __ pc();
5977     assert(((unsigned long long)start & 0x3f) == 0,
5978            "Alignment problem (0x%08llx)", (unsigned long long)start);
5979     __ emit_data64(0x090a040506000102, relocInfo::none);
5980     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
5981     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5982     __ emit_data64(0x292a242526202122, relocInfo::none);
5983     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5984     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5985     __ emit_data64(0x0000000000000000, relocInfo::none);
5986     __ emit_data64(0x0000000000000000, relocInfo::none);
5987     return start;
5988   }
5989 
5990   address base64_vbmi_join_0_1_addr() {
5991     __ align64();
5992     StubCodeMark mark(this, "StubRoutines", "join_0_1_base64");
5993     address start = __ pc();
5994     assert(((unsigned long long)start & 0x3f) == 0,
5995            "Alignment problem (0x%08llx)", (unsigned long long)start);
5996     __ emit_data64(0x090a040506000102, relocInfo::none);
5997     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
5998     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5999     __ emit_data64(0x292a242526202122, relocInfo::none);
6000     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
6001     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
6002     __ emit_data64(0x494a444546404142, relocInfo::none);
6003     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
6004     return start;
6005   }
6006 
6007   address base64_vbmi_join_1_2_addr() {
6008     __ align64();
6009     StubCodeMark mark(this, "StubRoutines", "join_1_2_base64");
6010     address start = __ pc();
6011     assert(((unsigned long long)start & 0x3f) == 0,
6012            "Alignment problem (0x%08llx)", (unsigned long long)start);
6013     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
6014     __ emit_data64(0x292a242526202122, relocInfo::none);
6015     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
6016     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
6017     __ emit_data64(0x494a444546404142, relocInfo::none);
6018     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
6019     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
6020     __ emit_data64(0x696a646566606162, relocInfo::none);
6021     return start;
6022   }
6023 
6024   address base64_vbmi_join_2_3_addr() {
6025     __ align64();
6026     StubCodeMark mark(this, "StubRoutines", "join_2_3_base64");
6027     address start = __ pc();
6028     assert(((unsigned long long)start & 0x3f) == 0,
6029            "Alignment problem (0x%08llx)", (unsigned long long)start);
6030     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
6031     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
6032     __ emit_data64(0x494a444546404142, relocInfo::none);
6033     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
6034     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
6035     __ emit_data64(0x696a646566606162, relocInfo::none);
6036     __ emit_data64(0x767071726c6d6e68, relocInfo::none);
6037     __ emit_data64(0x7c7d7e78797a7475, relocInfo::none);
6038     return start;
6039   }
6040 
6041   address base64_decoding_table_addr() {
6042     StubCodeMark mark(this, "StubRoutines", "decoding_table_base64");
6043     address start = __ pc();
6044     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6045     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6046     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6047     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6048     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6049     __ emit_data64(0x3fffffff3effffff, relocInfo::none);
6050     __ emit_data64(0x3b3a393837363534, relocInfo::none);
6051     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
6052     __ emit_data64(0x06050403020100ff, relocInfo::none);
6053     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6054     __ emit_data64(0x161514131211100f, relocInfo::none);
6055     __ emit_data64(0xffffffffff191817, relocInfo::none);
6056     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
6057     __ emit_data64(0x2827262524232221, relocInfo::none);
6058     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6059     __ emit_data64(0xffffffffff333231, relocInfo::none);
6060     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6061     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6062     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6063     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6064     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6065     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6066     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6067     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6068     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6069     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6070     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6071     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6072     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6073     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6074     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6075     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6076 
6077     // URL table
6078     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6079     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6080     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6081     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6082     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6083     __ emit_data64(0xffff3effffffffff, relocInfo::none);
6084     __ emit_data64(0x3b3a393837363534, relocInfo::none);
6085     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
6086     __ emit_data64(0x06050403020100ff, relocInfo::none);
6087     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6088     __ emit_data64(0x161514131211100f, relocInfo::none);
6089     __ emit_data64(0x3fffffffff191817, relocInfo::none);
6090     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
6091     __ emit_data64(0x2827262524232221, relocInfo::none);
6092     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6093     __ emit_data64(0xffffffffff333231, relocInfo::none);
6094     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6095     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6096     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6097     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6098     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6099     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6100     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6101     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6102     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6103     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6104     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6105     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6106     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6107     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6108     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6109     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6110     return start;
6111   }
6112 
6113 
6114 // Code for generating Base64 decoding.
6115 //
6116 // Based on the article (and associated code) from https://arxiv.org/abs/1910.05109.
6117 //
6118 // Intrinsic function prototype in Base64.java:
6119 // private void decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME) {
6120   address generate_base64_decodeBlock() {
6121     __ align(CodeEntryAlignment);
6122     StubCodeMark mark(this, "StubRoutines", "implDecode");
6123     address start = __ pc();
6124     __ enter();
6125 
6126     // Save callee-saved registers before using them
6127     __ push(r12);
6128     __ push(r13);
6129     __ push(r14);
6130     __ push(r15);
6131     __ push(rbx);
6132 
6133     // arguments
6134     const Register source = c_rarg0; // Source Array
6135     const Register start_offset = c_rarg1; // start offset
6136     const Register end_offset = c_rarg2; // end offset
6137     const Register dest = c_rarg3; // destination array
6138     const Register isMIME = rbx;
6139 
6140 #ifndef _WIN64
6141     const Register dp = c_rarg4;  // Position for writing to dest array
6142     const Register isURL = c_rarg5;// Base64 or URL character set
6143     __ movl(isMIME, Address(rbp, 2 * wordSize));
6144 #else
6145     const Address  dp_mem(rbp, 6 * wordSize);  // dp is passed on the stack on Win64
6146     const Address isURL_mem(rbp, 7 * wordSize);
6147     const Register isURL = r10;      // pick a volatile Windows register
6148     const Register dp = r12;
6149     __ movl(dp, dp_mem);
6150     __ movl(isURL, isURL_mem);
6151     __ movl(isMIME, Address(rbp, 8 * wordSize));
6152 #endif
6153 
6154     const XMMRegister lookup_lo = xmm5;
6155     const XMMRegister lookup_hi = xmm6;
6156     const XMMRegister errorvec = xmm7;
6157     const XMMRegister pack16_op = xmm9;
6158     const XMMRegister pack32_op = xmm8;
6159     const XMMRegister input0 = xmm3;
6160     const XMMRegister input1 = xmm20;
6161     const XMMRegister input2 = xmm21;
6162     const XMMRegister input3 = xmm19;
6163     const XMMRegister join01 = xmm12;
6164     const XMMRegister join12 = xmm11;
6165     const XMMRegister join23 = xmm10;
6166     const XMMRegister translated0 = xmm2;
6167     const XMMRegister translated1 = xmm1;
6168     const XMMRegister translated2 = xmm0;
6169     const XMMRegister translated3 = xmm4;
6170 
6171     const XMMRegister merged0 = xmm2;
6172     const XMMRegister merged1 = xmm1;
6173     const XMMRegister merged2 = xmm0;
6174     const XMMRegister merged3 = xmm4;
6175     const XMMRegister merge_ab_bc0 = xmm2;
6176     const XMMRegister merge_ab_bc1 = xmm1;
6177     const XMMRegister merge_ab_bc2 = xmm0;
6178     const XMMRegister merge_ab_bc3 = xmm4;
6179 
6180     const XMMRegister pack24bits = xmm4;
6181 
6182     const Register length = r14;
6183     const Register output_size = r13;
6184     const Register output_mask = r15;
6185     const KRegister input_mask = k1;
6186 
6187     const XMMRegister input_initial_valid_b64 = xmm0;
6188     const XMMRegister tmp = xmm10;
6189     const XMMRegister mask = xmm0;
6190     const XMMRegister invalid_b64 = xmm1;
6191 
6192     Label L_process256, L_process64, L_process64Loop, L_exit, L_processdata, L_loadURL;
6193     Label L_continue, L_finalBit, L_padding, L_donePadding, L_bruteForce;
6194     Label L_forceLoop, L_bottomLoop, L_checkMIME, L_exit_no_vzero;
6195 
6196     // calculate length from offsets
6197     __ movl(length, end_offset);
6198     __ subl(length, start_offset);
6199     __ push(dest);          // Save for return value calc
6200 
6201     // If AVX512 VBMI is not supported, generate only the non-AVX decode path below
6202     if(VM_Version::supports_avx512_vbmi() &&
6203        VM_Version::supports_avx512bw()) {
6204       __ cmpl(length, 128);     // 128 bytes is the break-even point for AVX-512
6205       __ jcc(Assembler::lessEqual, L_bruteForce);
6206 
6207       __ cmpl(isMIME, 0);
6208       __ jcc(Assembler::notEqual, L_bruteForce);
6209 
6210       // Load lookup tables based on isURL
6211       __ cmpl(isURL, 0);
6212       __ jcc(Assembler::notZero, L_loadURL);
6213 
6214       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_addr()), Assembler::AVX_512bit, r13);
6215       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_addr()), Assembler::AVX_512bit, r13);
6216 
6217       __ BIND(L_continue);
6218 
6219       __ movl(r15, 0x01400140);
6220       __ evpbroadcastd(pack16_op, r15, Assembler::AVX_512bit);
6221 
6222       __ movl(r15, 0x00011000);
6223       __ evpbroadcastd(pack32_op, r15, Assembler::AVX_512bit);
6224 
6225       __ cmpl(length, 0xff);
6226       __ jcc(Assembler::lessEqual, L_process64);
6227 
6228       // load masks required for decoding data
6229       __ BIND(L_processdata);
6230       __ evmovdquq(join01, ExternalAddress(StubRoutines::x86::base64_vbmi_join_0_1_addr()), Assembler::AVX_512bit,r13);
6231       __ evmovdquq(join12, ExternalAddress(StubRoutines::x86::base64_vbmi_join_1_2_addr()), Assembler::AVX_512bit, r13);
6232       __ evmovdquq(join23, ExternalAddress(StubRoutines::x86::base64_vbmi_join_2_3_addr()), Assembler::AVX_512bit, r13);
6233 
6234       __ align32();
6235       __ BIND(L_process256);
6236       // Grab input data
6237       __ evmovdquq(input0, Address(source, start_offset, Address::times_1, 0x00), Assembler::AVX_512bit);
6238       __ evmovdquq(input1, Address(source, start_offset, Address::times_1, 0x40), Assembler::AVX_512bit);
6239       __ evmovdquq(input2, Address(source, start_offset, Address::times_1, 0x80), Assembler::AVX_512bit);
6240       __ evmovdquq(input3, Address(source, start_offset, Address::times_1, 0xc0), Assembler::AVX_512bit);
6241 
6242       // Copy the low part of the lookup table into the destination of the permutation
6243       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6244       __ evmovdquq(translated1, lookup_lo, Assembler::AVX_512bit);
6245       __ evmovdquq(translated2, lookup_lo, Assembler::AVX_512bit);
6246       __ evmovdquq(translated3, lookup_lo, Assembler::AVX_512bit);
6247 
6248       // Translate the base64 input into "decoded" bytes
6249       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6250       __ evpermt2b(translated1, input1, lookup_hi, Assembler::AVX_512bit);
6251       __ evpermt2b(translated2, input2, lookup_hi, Assembler::AVX_512bit);
6252       __ evpermt2b(translated3, input3, lookup_hi, Assembler::AVX_512bit);
6253 
6254       // OR all of the translations together to check for errors (high-order bit of byte set)
6255       __ vpternlogd(input0, 0xfe, input1, input2, Assembler::AVX_512bit);
6256 
6257       __ vpternlogd(input3, 0xfe, translated0, translated1, Assembler::AVX_512bit);
6258       __ vpternlogd(input0, 0xfe, translated2, translated3, Assembler::AVX_512bit);
6259       __ vpor(errorvec, input3, input0, Assembler::AVX_512bit);
6260 
6261       // Check if there was an error - if so, try 64-byte chunks
6262       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6263       __ kortestql(k3, k3);
6264       __ jcc(Assembler::notZero, L_process64);
6265 
6266       // The merging and shuffling happens here
6267       // Each 32-bit lane of decoded sextets looks like [00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa].
6268       // Multiplying [00cccccc] by 2^6 and adding [00dddddd] gives [0000cccc | ccdddddd].
6269       // The pack16_op is a vector of 0x01400140, so multiply D by 1 and C by 0x40
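      // e.g. with cccccc = 111111 and dddddd = 000001:
      //   0x3f * 0x40 + 0x01 = 0x0fc1 = [00001111 11000001]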
6270       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6271       __ vpmaddubsw(merge_ab_bc1, translated1, pack16_op, Assembler::AVX_512bit);
6272       __ vpmaddubsw(merge_ab_bc2, translated2, pack16_op, Assembler::AVX_512bit);
6273       __ vpmaddubsw(merge_ab_bc3, translated3, pack16_op, Assembler::AVX_512bit);
6274 
6275       // Now do the same with packed 16-bit values.
6276       // We start with [0000cccc | ccdddddd | 0000aaaa | aabbbbbb]
6277       // pack32_op is 0x00011000 (2^12, 1), so this multiplies [0000aaaa | aabbbbbb] by 2^12
6278       // and adds [0000cccc | ccdddddd] to yield [00000000 | aaaaaabb | bbbbcccc | ccdddddd]
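      // e.g. 0x0fc1 * 0x1000 + 0x0fc1 = 0x00fc1fc1, i.e. the four sextets
      // 111111 000001 111111 000001 packed into the low 24 bits.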
6279       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6280       __ vpmaddwd(merged1, merge_ab_bc1, pack32_op, Assembler::AVX_512bit);
6281       __ vpmaddwd(merged2, merge_ab_bc2, pack32_op, Assembler::AVX_512bit);
6282       __ vpmaddwd(merged3, merge_ab_bc3, pack32_op, Assembler::AVX_512bit);
6283 
6284       // The join vectors specify which byte from which vector goes into the outputs
6285       // One of every 4 bytes in the extended vector is zero, so we pack them into their
6286       // final positions in the register for storing (256 bytes in, 192 bytes out)
6287       __ evpermt2b(merged0, join01, merged1, Assembler::AVX_512bit);
6288       __ evpermt2b(merged1, join12, merged2, Assembler::AVX_512bit);
6289       __ evpermt2b(merged2, join23, merged3, Assembler::AVX_512bit);
6290 
6291       // Store result
6292       __ evmovdquq(Address(dest, dp, Address::times_1, 0x00), merged0, Assembler::AVX_512bit);
6293       __ evmovdquq(Address(dest, dp, Address::times_1, 0x40), merged1, Assembler::AVX_512bit);
6294       __ evmovdquq(Address(dest, dp, Address::times_1, 0x80), merged2, Assembler::AVX_512bit);
6295 
6296       __ addptr(source, 0x100);
6297       __ addptr(dest, 0xc0);
6298       __ subl(length, 0x100);
6299       __ cmpl(length, 64 * 4);
6300       __ jcc(Assembler::greaterEqual, L_process256);
6301 
6302       // At this point, we've decoded 64 * 4 * n bytes.
6303       // The remaining length will be <= 64 * 4 - 1, unless there was an
6304       // error decoding a 256-byte chunk, in which case the remaining
6305       // length may be arbitrarily long.
6306       //
6307       // Note that this will be the path for MIME-encoded strings.
6308 
6309       __ BIND(L_process64);
6310 
6311       __ evmovdquq(pack24bits, ExternalAddress(StubRoutines::x86::base64_vbmi_pack_vec_addr()), Assembler::AVX_512bit, r13);
6312 
6313       __ cmpl(length, 63);
6314       __ jcc(Assembler::lessEqual, L_finalBit);
6315 
6316       __ mov64(rax, 0x0000ffffffffffff);
6317       __ kmovql(k2, rax);
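      // k2 is a 48-bit store mask: each 64-byte block of base64 input
      // decodes to 48 output bytes.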
6318 
6319       __ align32();
6320       __ BIND(L_process64Loop);
6321 
6322       // Handle first 64-byte block
6323 
6324       __ evmovdquq(input0, Address(source, start_offset), Assembler::AVX_512bit);
6325       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6326       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6327 
6328       __ vpor(errorvec, translated0, input0, Assembler::AVX_512bit);
6329 
6330       // Check for error and bomb out before updating dest
6331       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6332       __ kortestql(k3, k3);
6333       __ jcc(Assembler::notZero, L_exit);
6334 
6335       // Pack output register, selecting correct byte ordering
6336       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6337       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6338       __ vpermb(merged0, pack24bits, merged0, Assembler::AVX_512bit);
6339 
6340       __ evmovdqub(Address(dest, dp), k2, merged0, true, Assembler::AVX_512bit);
6341 
6342       __ subl(length, 64);
6343       __ addptr(source, 64);
6344       __ addptr(dest, 48);
6345 
6346       __ cmpl(length, 64);
6347       __ jcc(Assembler::greaterEqual, L_process64Loop);
6348 
6349       __ cmpl(length, 0);
6350       __ jcc(Assembler::lessEqual, L_exit);
6351 
6352       __ BIND(L_finalBit);
6353       // Now have 1 to 63 bytes left to decode
6354 
6355       // Letting Java handle the final fragment would mean it calls this
6356       // routine again for every remaining 4 bytes of input, so handle the
6357       // rest here instead.
6358       __ movq(rax, -1);
6359       __ bzhiq(rax, rax, length);    // Input mask in rax
6360 
6361       __ movl(output_size, length);
6362       __ shrl(output_size, 2);   // Find (len / 4) * 3 (output length)
6363       __ lea(output_size, Address(output_size, output_size, Address::times_2, 0));
6364       // output_size in r13
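      // e.g. for length = 63 remaining characters, output_size = (63 / 4) * 3
      // = 45 bytes (before any '=' padding adjustment below).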
6365 
6366       // Strip pad characters, if any, and adjust length and mask
6367       __ cmpb(Address(source, length, Address::times_1, -1), '=');
6368       __ jcc(Assembler::equal, L_padding);
6369 
6370       __ BIND(L_donePadding);
6371 
6372       // output_mask has the low output_size bits set, i.e. (all 1s) >> (64 - output_size).
6373       __ kmovql(input_mask, rax);
6374       __ movq(output_mask, -1);
6375       __ bzhiq(output_mask, output_mask, output_size);
6376 
6377       // Load initial input with all valid base64 characters.  Will be used
6378       // in merging source bytes to avoid masking when determining if an error occurred.
6379       __ movl(rax, 0x61616161);
6380       __ evpbroadcastd(input_initial_valid_b64, rax, Assembler::AVX_512bit);
6381 
6382       // A register containing all invalid base64 decoded values
6383       __ movl(rax, 0x80808080);
6384       __ evpbroadcastd(invalid_b64, rax, Assembler::AVX_512bit);
6385 
6386       // input_mask is in k1
6387       // output_size is in r13
6388       // output_mask is in r15
6389       // zmm0 - free
6390       // zmm1 - 0x00011000
6391       // zmm2 - 0x01400140
6392       // zmm3 - errorvec
6393       // zmm4 - pack vector
6394       // zmm5 - lookup_lo
6395       // zmm6 - lookup_hi
6396       // zmm7 - errorvec
6397       // zmm8 - 0x61616161
6398       // zmm9 - 0x80808080
6399 
6400       // Load only the bytes from source, merging into our "fully-valid" register
6401       __ evmovdqub(input_initial_valid_b64, input_mask, Address(source, start_offset, Address::times_1, 0x0), true, Assembler::AVX_512bit);
6402 
6403       // Decode all bytes within our merged input
6404       __ evmovdquq(tmp, lookup_lo, Assembler::AVX_512bit);
6405       __ evpermt2b(tmp, input_initial_valid_b64, lookup_hi, Assembler::AVX_512bit);
6406       __ vporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);
6407 
6408       // Check for error.  Compare (decoded | initial) to all invalid.
6409       // If any bytes have their high-order bit set, then we have an error.
6410       __ evptestmb(k2, mask, invalid_b64, Assembler::AVX_512bit);
6411       __ kortestql(k2, k2);
6412 
6413       // If we have an error, use the brute force loop to decode what we can (4-byte chunks).
6414       __ jcc(Assembler::notZero, L_bruteForce);
6415 
6416       // Shuffle output bytes
6417       __ vpmaddubsw(tmp, tmp, pack16_op, Assembler::AVX_512bit);
6418       __ vpmaddwd(tmp, tmp, pack32_op, Assembler::AVX_512bit);
6419 
6420       __ vpermb(tmp, pack24bits, tmp, Assembler::AVX_512bit);
6421       __ kmovql(k1, output_mask);
6422       __ evmovdqub(Address(dest, dp), k1, tmp, true, Assembler::AVX_512bit);
6423 
6424       __ addptr(dest, output_size);
6425 
6426       __ BIND(L_exit);
6427       __ vzeroupper();
6428       __ pop(rax);             // Get original dest value
6429       __ subptr(dest, rax);      // Number of bytes converted
6430       __ movptr(rax, dest);
6431       __ pop(rbx);
6432       __ pop(r15);
6433       __ pop(r14);
6434       __ pop(r13);
6435       __ pop(r12);
6436       __ leave();
6437       __ ret(0);
6438 
6439       __ BIND(L_loadURL);
6440       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_url_addr()), Assembler::AVX_512bit, r13);
6441       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_url_addr()), Assembler::AVX_512bit, r13);
6442       __ jmp(L_continue);
6443 
6444       __ BIND(L_padding);
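      // Each trailing '=' means one less output byte and one less input
      // character in the load mask.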
6445       __ decrementq(output_size, 1);
6446       __ shrq(rax, 1);
6447 
6448       __ cmpb(Address(source, length, Address::times_1, -2), '=');
6449       __ jcc(Assembler::notEqual, L_donePadding);
6450 
6451       __ decrementq(output_size, 1);
6452       __ shrq(rax, 1);
6453       __ jmp(L_donePadding);
6454 
6455       __ align32();
6456       __ BIND(L_bruteForce);
6457     }   // End of if(avx512_vbmi)
6458 
6459     // Use non-AVX code to decode 4-byte chunks into 3 bytes of output
6460 
6461     // Register state (Linux):
6462     // r12-15 - saved on stack
6463     // rdi - src
6464     // rsi - sp
6465     // rdx - sl
6466     // rcx - dst
6467     // r8 - dp
6468     // r9 - isURL
6469 
6470     // Register state (Windows):
6471     // r12-15 - saved on stack
6472     // rcx - src
6473     // rdx - sp
6474     // r8 - sl
6475     // r9 - dst
6476     // r12 - dp
6477     // r10 - isURL
6478 
6479     // Registers (common):
6480     // length (r14) - bytes in src
6481 
6482     const Register decode_table = r11;
6483     const Register out_byte_count = rbx;
6484     const Register byte1 = r13;
6485     const Register byte2 = r15;
6486     const Register byte3 = WINDOWS_ONLY(r8) NOT_WINDOWS(rdx);
6487     const Register byte4 = WINDOWS_ONLY(r10) NOT_WINDOWS(r9);
6488 
6489     __ shrl(length, 2);    // Multiple of 4 bytes only - length is # 4-byte chunks
6490     __ cmpl(length, 0);
6491     __ jcc(Assembler::lessEqual, L_exit_no_vzero);
6492 
6493     __ shll(isURL, 8);    // index into decode table based on isURL
6494     __ lea(decode_table, ExternalAddress(StubRoutines::x86::base64_decoding_table_addr()));
6495     __ addptr(decode_table, isURL);
6496 
6497     __ jmp(L_bottomLoop);
6498 
6499     __ align32();
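    // Each iteration packs four decoded 6-bit values into one 24-bit group,
    // (byte1 << 18) | (byte2 << 12) | (byte3 << 6) | byte4, and stores its three
    // bytes with the most significant byte at the lowest offset.  For example,
    // "TWFu" decodes to 19, 22, 5, 46 -> 0x4D616E -> 'M', 'a', 'n'.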
6500     __ BIND(L_forceLoop);
6501     __ shll(byte1, 18);
6502     __ shll(byte2, 12);
6503     __ shll(byte3, 6);
6504     __ orl(byte1, byte2);
6505     __ orl(byte1, byte3);
6506     __ orl(byte1, byte4);
6507 
6508     __ addptr(source, 4);
6509 
6510     __ movb(Address(dest, dp, Address::times_1, 2), byte1);
6511     __ shrl(byte1, 8);
6512     __ movb(Address(dest, dp, Address::times_1, 1), byte1);
6513     __ shrl(byte1, 8);
6514     __ movb(Address(dest, dp, Address::times_1, 0), byte1);
6515 
6516     __ addptr(dest, 3);
6517     __ decrementl(length, 1);
6518     __ jcc(Assembler::zero, L_exit_no_vzero);
6519 
6520     __ BIND(L_bottomLoop);
6521     __ load_unsigned_byte(byte1, Address(source, start_offset, Address::times_1, 0x00));
6522     __ load_unsigned_byte(byte2, Address(source, start_offset, Address::times_1, 0x01));
6523     __ load_signed_byte(byte1, Address(decode_table, byte1));
6524     __ load_signed_byte(byte2, Address(decode_table, byte2));
6525     __ load_unsigned_byte(byte3, Address(source, start_offset, Address::times_1, 0x02));
6526     __ load_unsigned_byte(byte4, Address(source, start_offset, Address::times_1, 0x03));
6527     __ load_signed_byte(byte3, Address(decode_table, byte3));
6528     __ load_signed_byte(byte4, Address(decode_table, byte4));
6529 
6530     __ mov(rax, byte1);
6531     __ orl(rax, byte2);
6532     __ orl(rax, byte3);
6533     __ orl(rax, byte4);
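    // Invalid characters map to negative decode-table entries, so a negative OR result
    // means bad input and we fall through to the exit instead of looping.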
6534     __ jcc(Assembler::positive, L_forceLoop);
6535 
6536     __ BIND(L_exit_no_vzero);
6537     __ pop(rax);             // Get original dest value
6538     __ subptr(dest, rax);      // Number of bytes converted
6539     __ movptr(rax, dest);
6540     __ pop(rbx);
6541     __ pop(r15);
6542     __ pop(r14);
6543     __ pop(r13);
6544     __ pop(r12);
6545     __ leave();
6546     __ ret(0);
6547 
6548     return start;
6549   }
6550 
6551 
6552   /**
6553    *  Arguments:
6554    *
6555    * Inputs:
6556    *   c_rarg0   - int crc
6557    *   c_rarg1   - byte* buf
6558    *   c_rarg2   - int length
6559    *
6560    * Output:
6561    *       rax   - int crc result
6562    */
6563   address generate_updateBytesCRC32() {
6564     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
6565 
6566     __ align(CodeEntryAlignment);
6567     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
6568 
6569     address start = __ pc();
6570     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6571     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6572     // rscratch1: r10
6573     const Register crc   = c_rarg0;  // crc
6574     const Register buf   = c_rarg1;  // source java byte array address
6575     const Register len   = c_rarg2;  // length
6576     const Register table = c_rarg3;  // crc_table address (reuse register)
6577     const Register tmp1   = r11;
6578     const Register tmp2   = r10;
6579     assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax);
6580 
6581     BLOCK_COMMENT("Entry:");
6582     __ enter(); // required for proper stackwalking of RuntimeStub frame
6583 
6584     if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6585         VM_Version::supports_avx512bw() &&
6586         VM_Version::supports_avx512vl()) {
        // The constants used in the CRC32 algorithm require the 1's complement of the initial crc value.
        // However, the constant table for CRC32-C assumes the original crc value.  Account for this
        // difference before calling and after returning.
6590       __ lea(table, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
6591       __ notl(crc);
6592       __ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2);
6593       __ notl(crc);
6594     } else {
6595       __ kernel_crc32(crc, buf, len, table, tmp1);
6596     }
6597 
6598     __ movl(rax, crc);
6599     __ vzeroupper();
6600     __ leave(); // required for proper stackwalking of RuntimeStub frame
6601     __ ret(0);
6602 
6603     return start;
6604   }
6605 
6606   /**
6607   *  Arguments:
6608   *
6609   * Inputs:
6610   *   c_rarg0   - int crc
6611   *   c_rarg1   - byte* buf
6612   *   c_rarg2   - long length
6613   *   c_rarg3   - table_start - optional (present only when doing a library_call,
6614   *              not used by x86 algorithm)
6615   *
6616   * Output:
6617   *       rax   - int crc result
6618   */
6619   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
6620       assert(UseCRC32CIntrinsics, "need SSE4_2");
6621       __ align(CodeEntryAlignment);
6622       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
6623       address start = __ pc();
6624       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
6625       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
6626       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
6627       const Register crc = c_rarg0;  // crc
6628       const Register buf = c_rarg1;  // source java byte array address
6629       const Register len = c_rarg2;  // length
6630       const Register a = rax;
6631       const Register j = r9;
6632       const Register k = r10;
6633       const Register l = r11;
6634 #ifdef _WIN64
6635       const Register y = rdi;
6636       const Register z = rsi;
6637 #else
6638       const Register y = rcx;
6639       const Register z = r8;
6640 #endif
6641       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
6642 
6643       BLOCK_COMMENT("Entry:");
6644       __ enter(); // required for proper stackwalking of RuntimeStub frame
6645       if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6646           VM_Version::supports_avx512bw() &&
6647           VM_Version::supports_avx512vl()) {
6648         __ lea(j, ExternalAddress(StubRoutines::x86::crc32c_table_avx512_addr()));
6649         __ kernel_crc32_avx512(crc, buf, len, j, l, k);
6650       } else {
6651 #ifdef _WIN64
6652         __ push(y);
6653         __ push(z);
6654 #endif
6655         __ crc32c_ipl_alg2_alt2(crc, buf, len,
6656                                 a, j, k,
6657                                 l, y, z,
6658                                 c_farg0, c_farg1, c_farg2,
6659                                 is_pclmulqdq_supported);
6660 #ifdef _WIN64
6661         __ pop(z);
6662         __ pop(y);
6663 #endif
6664       }
6665       __ movl(rax, crc);
6666       __ vzeroupper();
6667       __ leave(); // required for proper stackwalking of RuntimeStub frame
6668       __ ret(0);
6669 
6670       return start;
6671   }
6672 
6673 
6674   /***
6675    *  Arguments:
6676    *
6677    *  Inputs:
6678    *   c_rarg0   - int   adler
6679    *   c_rarg1   - byte* buff
6680    *   c_rarg2   - int   len
6681    *
6682    * Output:
6683    *   rax   - int adler result
6684    */
6685 
6686   address generate_updateBytesAdler32() {
6687       assert(UseAdler32Intrinsics, "need AVX2");
6688 
6689       __ align(CodeEntryAlignment);
6690       StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
6691 
6692       address start = __ pc();
6693 
6694       const Register data = r9;
6695       const Register size = r10;
6696 
6697       const XMMRegister yshuf0 = xmm6;
6698       const XMMRegister yshuf1 = xmm7;
6699       assert_different_registers(c_rarg0, c_rarg1, c_rarg2, data, size);
6700 
6701       BLOCK_COMMENT("Entry:");
6702       __ enter(); // required for proper stackwalking of RuntimeStub frame
6703 
6704       __ vmovdqu(yshuf0, ExternalAddress((address) StubRoutines::x86::_adler32_shuf0_table), r9);
6705       __ vmovdqu(yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_shuf1_table), r9);
6706       __ movptr(data, c_rarg1); //data
6707       __ movl(size, c_rarg2); //length
6708       __ updateBytesAdler32(c_rarg0, data, size, yshuf0, yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_ascale_table));
6709       __ leave();
6710       __ ret(0);
6711       return start;
6712   }
6713 
6714   /**
6715    *  Arguments:
6716    *
6717    *  Input:
6718    *    c_rarg0   - x address
6719    *    c_rarg1   - x length
6720    *    c_rarg2   - y address
6721    *    c_rarg3   - y length
6722    * not Win64
6723    *    c_rarg4   - z address
6724    *    c_rarg5   - z length
6725    * Win64
6726    *    rsp+40    - z address
6727    *    rsp+48    - z length
6728    */
6729   address generate_multiplyToLen() {
6730     __ align(CodeEntryAlignment);
6731     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
6732 
6733     address start = __ pc();
6734     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6735     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6736     const Register x     = rdi;
6737     const Register xlen  = rax;
6738     const Register y     = rsi;
6739     const Register ylen  = rcx;
6740     const Register z     = r8;
6741     const Register zlen  = r11;
6742 
6743     // Next registers will be saved on stack in multiply_to_len().
6744     const Register tmp1  = r12;
6745     const Register tmp2  = r13;
6746     const Register tmp3  = r14;
6747     const Register tmp4  = r15;
6748     const Register tmp5  = rbx;
6749 
6750     BLOCK_COMMENT("Entry:");
6751     __ enter(); // required for proper stackwalking of RuntimeStub frame
6752 
6753 #ifndef _WIN64
6754     __ movptr(zlen, r9); // Save r9 in r11 - zlen
6755 #endif
6756     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
6757                        // ylen => rcx, z => r8, zlen => r11
6758                        // r9 and r10 may be used to save non-volatile registers
6759 #ifdef _WIN64
6760     // last 2 arguments (#4, #5) are on stack on Win64
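    // After enter(), the saved rbp, the return address and the four-word register-parameter
    // shadow area sit between rsp and the stack arguments, hence the 6- and 7-word offsets.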
6761     __ movptr(z, Address(rsp, 6 * wordSize));
6762     __ movptr(zlen, Address(rsp, 7 * wordSize));
6763 #endif
6764 
6765     __ movptr(xlen, rsi);
6766     __ movptr(y,    rdx);
6767     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
6768 
6769     restore_arg_regs();
6770 
6771     __ leave(); // required for proper stackwalking of RuntimeStub frame
6772     __ ret(0);
6773 
6774     return start;
6775   }
6776 
6777   /**
6778   *  Arguments:
6779   *
6780   *  Input:
6781   *    c_rarg0   - obja     address
6782   *    c_rarg1   - objb     address
  *    c_rarg2   - length   length
  *    c_rarg3   - scale    log2_array_indxscale
  *
  *  Output:
  *        rax   - >= 0: index of the first mismatch, < 0: bitwise complement of the number of remaining elements to check (the tail)
6788   */
6789   address generate_vectorizedMismatch() {
6790     __ align(CodeEntryAlignment);
6791     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
6792     address start = __ pc();
6793 
6794     BLOCK_COMMENT("Entry:");
6795     __ enter();
6796 
6797 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6798     const Register scale = c_rarg0;  //rcx, will exchange with r9
6799     const Register objb = c_rarg1;   //rdx
6800     const Register length = c_rarg2; //r8
6801     const Register obja = c_rarg3;   //r9
6802     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
6803 
6804     const Register tmp1 = r10;
6805     const Register tmp2 = r11;
6806 #endif
6807 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6808     const Register obja = c_rarg0;   //U:rdi
6809     const Register objb = c_rarg1;   //U:rsi
6810     const Register length = c_rarg2; //U:rdx
6811     const Register scale = c_rarg3;  //U:rcx
6812     const Register tmp1 = r8;
6813     const Register tmp2 = r9;
6814 #endif
6815     const Register result = rax; //return value
6816     const XMMRegister vec0 = xmm0;
6817     const XMMRegister vec1 = xmm1;
6818     const XMMRegister vec2 = xmm2;
6819 
6820     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
6821 
6822     __ vzeroupper();
6823     __ leave();
6824     __ ret(0);
6825 
6826     return start;
6827   }
6828 
6829 /**
6830    *  Arguments:
6831    *
6832   //  Input:
6833   //    c_rarg0   - x address
6834   //    c_rarg1   - x length
6835   //    c_rarg2   - z address
6836   //    c_rarg3   - z length
6837    *
6838    */
6839   address generate_squareToLen() {
6840 
6841     __ align(CodeEntryAlignment);
6842     StubCodeMark mark(this, "StubRoutines", "squareToLen");
6843 
6844     address start = __ pc();
6845     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6846     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
6847     const Register x      = rdi;
6848     const Register len    = rsi;
6849     const Register z      = r8;
6850     const Register zlen   = rcx;
6851 
    const Register tmp1      = r12;
    const Register tmp2      = r13;
    const Register tmp3      = r14;
    const Register tmp4      = r15;
    const Register tmp5      = rbx;
6857 
6858     BLOCK_COMMENT("Entry:");
6859     __ enter(); // required for proper stackwalking of RuntimeStub frame
6860 
6861     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
6862                        // zlen => rcx
6863                        // r9 and r10 may be used to save non-volatile registers
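    // setup_arg_regs left z in rdx; copy it into r8, where square_to_len() expects it.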
6864     __ movptr(r8, rdx);
6865     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
6866 
6867     restore_arg_regs();
6868 
6869     __ leave(); // required for proper stackwalking of RuntimeStub frame
6870     __ ret(0);
6871 
6872     return start;
6873   }
6874 
6875   address generate_method_entry_barrier() {
6876     __ align(CodeEntryAlignment);
6877     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
6878 
6879     Label deoptimize_label;
6880 
6881     address start = __ pc();
6882 
6883     __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
6884 
6885     BLOCK_COMMENT("Entry:");
6886     __ enter(); // save rbp
6887 
6888     // save c_rarg0, because we want to use that value.
6889     // We could do without it but then we depend on the number of slots used by pusha
6890     __ push(c_rarg0);
6891 
6892     __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
6893 
6894     __ pusha();
6895 
6896     // The method may have floats as arguments, and we must spill them before calling
6897     // the VM runtime.
6898     assert(Argument::n_float_register_parameters_j == 8, "Assumption");
6899     const int xmm_size = wordSize * 2;
6900     const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
6901     __ subptr(rsp, xmm_spill_size);
6902     __ movdqu(Address(rsp, xmm_size * 7), xmm7);
6903     __ movdqu(Address(rsp, xmm_size * 6), xmm6);
6904     __ movdqu(Address(rsp, xmm_size * 5), xmm5);
6905     __ movdqu(Address(rsp, xmm_size * 4), xmm4);
6906     __ movdqu(Address(rsp, xmm_size * 3), xmm3);
6907     __ movdqu(Address(rsp, xmm_size * 2), xmm2);
6908     __ movdqu(Address(rsp, xmm_size * 1), xmm1);
6909     __ movdqu(Address(rsp, xmm_size * 0), xmm0);
6910 
6911     __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);
6912 
6913     __ movdqu(xmm0, Address(rsp, xmm_size * 0));
6914     __ movdqu(xmm1, Address(rsp, xmm_size * 1));
6915     __ movdqu(xmm2, Address(rsp, xmm_size * 2));
6916     __ movdqu(xmm3, Address(rsp, xmm_size * 3));
6917     __ movdqu(xmm4, Address(rsp, xmm_size * 4));
6918     __ movdqu(xmm5, Address(rsp, xmm_size * 5));
6919     __ movdqu(xmm6, Address(rsp, xmm_size * 6));
6920     __ movdqu(xmm7, Address(rsp, xmm_size * 7));
6921     __ addptr(rsp, xmm_spill_size);
6922 
6923     __ cmpl(rax, 1); // 1 means deoptimize
6924     __ jcc(Assembler::equal, deoptimize_label);
6925 
6926     __ popa();
6927     __ pop(c_rarg0);
6928 
6929     __ leave();
6930 
6931     __ addptr(rsp, 1 * wordSize); // cookie
6932     __ ret(0);
6933 
6934 
6935     __ BIND(deoptimize_label);
6936 
6937     __ popa();
6938     __ pop(c_rarg0);
6939 
6940     __ leave();
6941 
    // This could be removed, but it is useful for verification: getting a SIGSEGV
    // here while the stack is still correct is valuable.
6944     __ testptr(rsp, Address(rsp, 0));
6945 
6946     __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
6947     __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point
6948 
6949     return start;
6950   }
6951 
6952    /**
6953    *  Arguments:
6954    *
6955    *  Input:
6956    *    c_rarg0   - out address
6957    *    c_rarg1   - in address
6958    *    c_rarg2   - offset
6959    *    c_rarg3   - len
6960    * not Win64
6961    *    c_rarg4   - k
6962    * Win64
6963    *    rsp+40    - k
6964    */
6965   address generate_mulAdd() {
6966     __ align(CodeEntryAlignment);
6967     StubCodeMark mark(this, "StubRoutines", "mulAdd");
6968 
6969     address start = __ pc();
6970     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6971     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6972     const Register out     = rdi;
6973     const Register in      = rsi;
6974     const Register offset  = r11;
6975     const Register len     = rcx;
6976     const Register k       = r8;
6977 
6978     // Next registers will be saved on stack in mul_add().
6979     const Register tmp1  = r12;
6980     const Register tmp2  = r13;
6981     const Register tmp3  = r14;
6982     const Register tmp4  = r15;
6983     const Register tmp5  = rbx;
6984 
6985     BLOCK_COMMENT("Entry:");
6986     __ enter(); // required for proper stackwalking of RuntimeStub frame
6987 
6988     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
6989                        // len => rcx, k => r8
6990                        // r9 and r10 may be used to save non-volatile registers
6991 #ifdef _WIN64
6992     // last argument is on stack on Win64
6993     __ movl(k, Address(rsp, 6 * wordSize));
6994 #endif
6995     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
6996     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
6997 
6998     restore_arg_regs();
6999 
7000     __ leave(); // required for proper stackwalking of RuntimeStub frame
7001     __ ret(0);
7002 
7003     return start;
7004   }
7005 
7006   address generate_bigIntegerRightShift() {
7007     __ align(CodeEntryAlignment);
7008     StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
7009 
7010     address start = __ pc();
7011     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
7012     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
7013     const Register newArr = rdi;
7014     const Register oldArr = rsi;
7015     const Register newIdx = rdx;
    const Register shiftCount = rcx;  // shiftCount is kept in rcx intentionally, since shift instructions use it (cl) implicitly.
7017     const Register totalNumIter = r8;
7018 
7019     // For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
7020     // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
7021     const Register tmp1 = r11;                    // Caller save.
7022     const Register tmp2 = rax;                    // Caller save.
7023     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
7024     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
7025     const Register tmp5 = r14;                    // Callee save.
7026     const Register tmp6 = r15;
7027 
7028     const XMMRegister x0 = xmm0;
7029     const XMMRegister x1 = xmm1;
7030     const XMMRegister x2 = xmm2;
7031 
7032     BLOCK_COMMENT("Entry:");
7033     __ enter(); // required for proper stackwalking of RuntimeStub frame
7034 
7035 #ifdef _WINDOWS
7036     setup_arg_regs(4);
7037     // For windows, since last argument is on stack, we need to move it to the appropriate register.
7038     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
7039     // Save callee save registers.
7040     __ push(tmp3);
7041     __ push(tmp4);
7042 #endif
7043     __ push(tmp5);
7044 
7045     // Rename temps used throughout the code.
7046     const Register idx = tmp1;
7047     const Register nIdx = tmp2;
7048 
7049     __ xorl(idx, idx);
7050 
7051     // Start right shift from end of the array.
7052     // For example, if #iteration = 4 and newIdx = 1
    // then dest[4] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
    // if #iteration = 4 and newIdx = 0
    // then dest[3] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
7056     __ movl(idx, totalNumIter);
7057     __ movl(nIdx, idx);
7058     __ addl(nIdx, newIdx);
7059 
7060     // If vectorization is enabled, check if the number of iterations is at least 64
    // If not, then go to ShiftTwo, processing 2 iterations at a time
7062     if (VM_Version::supports_avx512_vbmi2()) {
7063       __ cmpptr(totalNumIter, (AVX3Threshold/64));
7064       __ jcc(Assembler::less, ShiftTwo);
7065 
7066       if (AVX3Threshold < 16 * 64) {
7067         __ cmpl(totalNumIter, 16);
7068         __ jcc(Assembler::less, ShiftTwo);
7069       }
7070       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
7071       __ subl(idx, 16);
7072       __ subl(nIdx, 16);
7073       __ BIND(Shift512Loop);
7074       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 4), Assembler::AVX_512bit);
7075       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7076       __ vpshrdvd(x2, x1, x0, Assembler::AVX_512bit);
7077       __ evmovdqul(Address(newArr, nIdx, Address::times_4), x2, Assembler::AVX_512bit);
7078       __ subl(nIdx, 16);
7079       __ subl(idx, 16);
7080       __ jcc(Assembler::greaterEqual, Shift512Loop);
7081       __ addl(idx, 16);
7082       __ addl(nIdx, 16);
7083     }
7084     __ BIND(ShiftTwo);
7085     __ cmpl(idx, 2);
7086     __ jcc(Assembler::less, ShiftOne);
7087     __ subl(idx, 2);
7088     __ subl(nIdx, 2);
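    // shrd dst, src shifts dst right by cl and fills the vacated high bits from src,
    // i.e. dst = (dst >> cl) | (src << (32 - cl)).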
7089     __ BIND(ShiftTwoLoop);
7090     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 8));
7091     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7092     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7093     __ shrdl(tmp5, tmp4);
7094     __ shrdl(tmp4, tmp3);
7095     __ movl(Address(newArr, nIdx, Address::times_4, 4), tmp5);
7096     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7097     __ subl(nIdx, 2);
7098     __ subl(idx, 2);
7099     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7100     __ addl(idx, 2);
7101     __ addl(nIdx, 2);
7102 
7103     // Do the last iteration
7104     __ BIND(ShiftOne);
7105     __ cmpl(idx, 1);
7106     __ jcc(Assembler::less, Exit);
7107     __ subl(idx, 1);
7108     __ subl(nIdx, 1);
7109     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7110     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7111     __ shrdl(tmp4, tmp3);
7112     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7113     __ BIND(Exit);
7114     __ vzeroupper();
7115     // Restore callee save registers.
7116     __ pop(tmp5);
7117 #ifdef _WINDOWS
7118     __ pop(tmp4);
7119     __ pop(tmp3);
7120     restore_arg_regs();
7121 #endif
7122     __ leave(); // required for proper stackwalking of RuntimeStub frame
7123     __ ret(0);
7124     return start;
7125   }
7126 
7127    /**
7128    *  Arguments:
7129    *
7130    *  Input:
7131    *    c_rarg0   - newArr address
7132    *    c_rarg1   - oldArr address
7133    *    c_rarg2   - newIdx
7134    *    c_rarg3   - shiftCount
7135    * not Win64
7136    *    c_rarg4   - numIter
7137    * Win64
   *    rsp+40    - numIter
7139    */
7140   address generate_bigIntegerLeftShift() {
7141     __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
7143     address start = __ pc();
7144     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
7145     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
7146     const Register newArr = rdi;
7147     const Register oldArr = rsi;
7148     const Register newIdx = rdx;
    const Register shiftCount = rcx;  // shiftCount is kept in rcx intentionally, since shift instructions use it (cl) implicitly.
7150     const Register totalNumIter = r8;
7151     // For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
7152     // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
7153     const Register tmp1 = r11;                    // Caller save.
7154     const Register tmp2 = rax;                    // Caller save.
7155     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
7156     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
7157     const Register tmp5 = r14;                    // Callee save.
7158 
7159     const XMMRegister x0 = xmm0;
7160     const XMMRegister x1 = xmm1;
7161     const XMMRegister x2 = xmm2;
7162     BLOCK_COMMENT("Entry:");
7163     __ enter(); // required for proper stackwalking of RuntimeStub frame
7164 
7165 #ifdef _WINDOWS
7166     setup_arg_regs(4);
7167     // For windows, since last argument is on stack, we need to move it to the appropriate register.
7168     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
7169     // Save callee save registers.
7170     __ push(tmp3);
7171     __ push(tmp4);
7172 #endif
7173     __ push(tmp5);
7174 
7175     // Rename temps used throughout the code
7176     const Register idx = tmp1;
7177     const Register numIterTmp = tmp2;
7178 
7179     // Start idx from zero.
7180     __ xorl(idx, idx);
    // Compute an interior pointer for the new array so that we can use the same index for both the old and new arrays.
7182     __ lea(newArr, Address(newArr, newIdx, Address::times_4));
7183     __ movl(numIterTmp, totalNumIter);
7184 
7185     // If vectorization is enabled, check if the number of iterations is at least 64
7186     // If not, then go to ShiftTwo shifting two numbers at a time
7187     if (VM_Version::supports_avx512_vbmi2()) {
7188       __ cmpl(totalNumIter, (AVX3Threshold/64));
7189       __ jcc(Assembler::less, ShiftTwo);
7190 
7191       if (AVX3Threshold < 16 * 64) {
7192         __ cmpl(totalNumIter, 16);
7193         __ jcc(Assembler::less, ShiftTwo);
7194       }
7195       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
7196       __ subl(numIterTmp, 16);
7197       __ BIND(Shift512Loop);
7198       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7199       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 0x4), Assembler::AVX_512bit);
7200       __ vpshldvd(x1, x2, x0, Assembler::AVX_512bit);
7201       __ evmovdqul(Address(newArr, idx, Address::times_4), x1, Assembler::AVX_512bit);
7202       __ addl(idx, 16);
7203       __ subl(numIterTmp, 16);
7204       __ jcc(Assembler::greaterEqual, Shift512Loop);
7205       __ addl(numIterTmp, 16);
7206     }
7207     __ BIND(ShiftTwo);
7208     __ cmpl(totalNumIter, 1);
7209     __ jcc(Assembler::less, Exit);
7210     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7211     __ subl(numIterTmp, 2);
7212     __ jcc(Assembler::less, ShiftOne);
7213 
7214     __ BIND(ShiftTwoLoop);
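    // shld dst, src shifts dst left by cl and fills the vacated low bits from the high
    // bits of src, i.e. dst = (dst << cl) | (src >> (32 - cl)).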
7215     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7216     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 0x8));
7217     __ shldl(tmp3, tmp4);
7218     __ shldl(tmp4, tmp5);
7219     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7220     __ movl(Address(newArr, idx, Address::times_4, 0x4), tmp4);
7221     __ movl(tmp3, tmp5);
7222     __ addl(idx, 2);
7223     __ subl(numIterTmp, 2);
7224     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7225 
7226     // Do the last iteration
7227     __ BIND(ShiftOne);
7228     __ addl(numIterTmp, 2);
7229     __ cmpl(numIterTmp, 1);
7230     __ jcc(Assembler::less, Exit);
7231     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7232     __ shldl(tmp3, tmp4);
7233     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7234 
7235     __ BIND(Exit);
7236     __ vzeroupper();
7237     // Restore callee save registers.
7238     __ pop(tmp5);
7239 #ifdef _WINDOWS
7240     __ pop(tmp4);
7241     __ pop(tmp3);
7242     restore_arg_regs();
7243 #endif
7244     __ leave(); // required for proper stackwalking of RuntimeStub frame
7245     __ ret(0);
7246     return start;
7247   }
7248 
7249   address generate_libmExp() {
7250     StubCodeMark mark(this, "StubRoutines", "libmExp");
7251 
7252     address start = __ pc();
7253 
7254     const XMMRegister x0  = xmm0;
7255     const XMMRegister x1  = xmm1;
7256     const XMMRegister x2  = xmm2;
7257     const XMMRegister x3  = xmm3;
7258 
7259     const XMMRegister x4  = xmm4;
7260     const XMMRegister x5  = xmm5;
7261     const XMMRegister x6  = xmm6;
7262     const XMMRegister x7  = xmm7;
7263 
7264     const Register tmp   = r11;
7265 
7266     BLOCK_COMMENT("Entry:");
7267     __ enter(); // required for proper stackwalking of RuntimeStub frame
7268 
7269     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7270 
7271     __ leave(); // required for proper stackwalking of RuntimeStub frame
7272     __ ret(0);
7273 
7274     return start;
7275 
7276   }
7277 
7278   address generate_libmLog() {
7279     StubCodeMark mark(this, "StubRoutines", "libmLog");
7280 
7281     address start = __ pc();
7282 
7283     const XMMRegister x0 = xmm0;
7284     const XMMRegister x1 = xmm1;
7285     const XMMRegister x2 = xmm2;
7286     const XMMRegister x3 = xmm3;
7287 
7288     const XMMRegister x4 = xmm4;
7289     const XMMRegister x5 = xmm5;
7290     const XMMRegister x6 = xmm6;
7291     const XMMRegister x7 = xmm7;
7292 
7293     const Register tmp1 = r11;
7294     const Register tmp2 = r8;
7295 
7296     BLOCK_COMMENT("Entry:");
7297     __ enter(); // required for proper stackwalking of RuntimeStub frame
7298 
7299     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
7300 
7301     __ leave(); // required for proper stackwalking of RuntimeStub frame
7302     __ ret(0);
7303 
7304     return start;
7305 
7306   }
7307 
7308   address generate_libmLog10() {
7309     StubCodeMark mark(this, "StubRoutines", "libmLog10");
7310 
7311     address start = __ pc();
7312 
7313     const XMMRegister x0 = xmm0;
7314     const XMMRegister x1 = xmm1;
7315     const XMMRegister x2 = xmm2;
7316     const XMMRegister x3 = xmm3;
7317 
7318     const XMMRegister x4 = xmm4;
7319     const XMMRegister x5 = xmm5;
7320     const XMMRegister x6 = xmm6;
7321     const XMMRegister x7 = xmm7;
7322 
7323     const Register tmp = r11;
7324 
7325     BLOCK_COMMENT("Entry:");
7326     __ enter(); // required for proper stackwalking of RuntimeStub frame
7327 
7328     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7329 
7330     __ leave(); // required for proper stackwalking of RuntimeStub frame
7331     __ ret(0);
7332 
7333     return start;
7334 
7335   }
7336 
7337   address generate_libmPow() {
7338     StubCodeMark mark(this, "StubRoutines", "libmPow");
7339 
7340     address start = __ pc();
7341 
7342     const XMMRegister x0 = xmm0;
7343     const XMMRegister x1 = xmm1;
7344     const XMMRegister x2 = xmm2;
7345     const XMMRegister x3 = xmm3;
7346 
7347     const XMMRegister x4 = xmm4;
7348     const XMMRegister x5 = xmm5;
7349     const XMMRegister x6 = xmm6;
7350     const XMMRegister x7 = xmm7;
7351 
7352     const Register tmp1 = r8;
7353     const Register tmp2 = r9;
7354     const Register tmp3 = r10;
7355     const Register tmp4 = r11;
7356 
7357     BLOCK_COMMENT("Entry:");
7358     __ enter(); // required for proper stackwalking of RuntimeStub frame
7359 
7360     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7361 
7362     __ leave(); // required for proper stackwalking of RuntimeStub frame
7363     __ ret(0);
7364 
7365     return start;
7366 
7367   }
7368 
7369   address generate_libmSin() {
7370     StubCodeMark mark(this, "StubRoutines", "libmSin");
7371 
7372     address start = __ pc();
7373 
7374     const XMMRegister x0 = xmm0;
7375     const XMMRegister x1 = xmm1;
7376     const XMMRegister x2 = xmm2;
7377     const XMMRegister x3 = xmm3;
7378 
7379     const XMMRegister x4 = xmm4;
7380     const XMMRegister x5 = xmm5;
7381     const XMMRegister x6 = xmm6;
7382     const XMMRegister x7 = xmm7;
7383 
7384     const Register tmp1 = r8;
7385     const Register tmp2 = r9;
7386     const Register tmp3 = r10;
7387     const Register tmp4 = r11;
7388 
7389     BLOCK_COMMENT("Entry:");
7390     __ enter(); // required for proper stackwalking of RuntimeStub frame
7391 
7392 #ifdef _WIN64
7393     __ push(rsi);
7394     __ push(rdi);
7395 #endif
7396     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7397 
7398 #ifdef _WIN64
7399     __ pop(rdi);
7400     __ pop(rsi);
7401 #endif
7402 
7403     __ leave(); // required for proper stackwalking of RuntimeStub frame
7404     __ ret(0);
7405 
7406     return start;
7407 
7408   }
7409 
7410   address generate_libmCos() {
7411     StubCodeMark mark(this, "StubRoutines", "libmCos");
7412 
7413     address start = __ pc();
7414 
7415     const XMMRegister x0 = xmm0;
7416     const XMMRegister x1 = xmm1;
7417     const XMMRegister x2 = xmm2;
7418     const XMMRegister x3 = xmm3;
7419 
7420     const XMMRegister x4 = xmm4;
7421     const XMMRegister x5 = xmm5;
7422     const XMMRegister x6 = xmm6;
7423     const XMMRegister x7 = xmm7;
7424 
7425     const Register tmp1 = r8;
7426     const Register tmp2 = r9;
7427     const Register tmp3 = r10;
7428     const Register tmp4 = r11;
7429 
7430     BLOCK_COMMENT("Entry:");
7431     __ enter(); // required for proper stackwalking of RuntimeStub frame
7432 
7433 #ifdef _WIN64
7434     __ push(rsi);
7435     __ push(rdi);
7436 #endif
7437     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7438 
7439 #ifdef _WIN64
7440     __ pop(rdi);
7441     __ pop(rsi);
7442 #endif
7443 
7444     __ leave(); // required for proper stackwalking of RuntimeStub frame
7445     __ ret(0);
7446 
7447     return start;
7448 
7449   }
7450 
7451   address generate_libmTan() {
7452     StubCodeMark mark(this, "StubRoutines", "libmTan");
7453 
7454     address start = __ pc();
7455 
7456     const XMMRegister x0 = xmm0;
7457     const XMMRegister x1 = xmm1;
7458     const XMMRegister x2 = xmm2;
7459     const XMMRegister x3 = xmm3;
7460 
7461     const XMMRegister x4 = xmm4;
7462     const XMMRegister x5 = xmm5;
7463     const XMMRegister x6 = xmm6;
7464     const XMMRegister x7 = xmm7;
7465 
7466     const Register tmp1 = r8;
7467     const Register tmp2 = r9;
7468     const Register tmp3 = r10;
7469     const Register tmp4 = r11;
7470 
7471     BLOCK_COMMENT("Entry:");
7472     __ enter(); // required for proper stackwalking of RuntimeStub frame
7473 
7474 #ifdef _WIN64
7475     __ push(rsi);
7476     __ push(rdi);
7477 #endif
7478     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7479 
7480 #ifdef _WIN64
7481     __ pop(rdi);
7482     __ pop(rsi);
7483 #endif
7484 
7485     __ leave(); // required for proper stackwalking of RuntimeStub frame
7486     __ ret(0);
7487 
7488     return start;
7489 
7490   }
7491 
7492 #undef __
7493 #define __ masm->
7494 
7495   // Continuation point for throwing of implicit exceptions that are
7496   // not handled in the current activation. Fabricates an exception
7497   // oop and initiates normal exception dispatching in this
7498   // frame. Since we need to preserve callee-saved values (currently
7499   // only for C2, but done for C1 as well) we need a callee-saved oop
7500   // map and therefore have to make these stubs into RuntimeStubs
7501   // rather than BufferBlobs.  If the compiler needs all registers to
7502   // be preserved between the fault point and the exception handler
7503   // then it must assume responsibility for that in
7504   // AbstractCompiler::continuation_for_implicit_null_exception or
7505   // continuation_for_implicit_division_by_zero_exception. All other
7506   // implicit exceptions (e.g., NullPointerException or
7507   // AbstractMethodError on entry) are either at call sites or
7508   // otherwise assume that stack unwinding will be initiated, so
7509   // caller saved registers were assumed volatile in the compiler.
7510   address generate_throw_exception(const char* name,
7511                                    address runtime_entry,
7512                                    Register arg1 = noreg,
7513                                    Register arg2 = noreg) {
7514     // Information about frame layout at time of blocking runtime call.
7515     // Note that we only have to preserve callee-saved registers since
7516     // the compilers are responsible for supplying a continuation point
7517     // if they expect all registers to be preserved.
7518     enum layout {
7519       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
7520       rbp_off2,
7521       return_off,
7522       return_off2,
7523       framesize // inclusive of return address
7524     };
7525 
7526     int insts_size = 512;
7527     int locs_size  = 64;
7528 
7529     CodeBuffer code(name, insts_size, locs_size);
7530     OopMapSet* oop_maps  = new OopMapSet();
7531     MacroAssembler* masm = new MacroAssembler(&code);
7532 
7533     address start = __ pc();
7534 
7535     // This is an inlined and slightly modified version of call_VM
7536     // which has the ability to fetch the return PC out of
7537     // thread-local storage and also sets up last_Java_sp slightly
7538     // differently than the real call_VM
7539 
7540     __ enter(); // required for proper stackwalking of RuntimeStub frame
7541 
7542     assert(is_even(framesize/2), "sp not 16-byte aligned");
7543 
7544     // return address and rbp are already in place
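    // (framesize is counted in 32-bit slots, so those two words account for the -4 below)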
7545     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
7546 
7547     int frame_complete = __ pc() - start;
7548 
7549     // Set up last_Java_sp and last_Java_fp
7550     address the_pc = __ pc();
7551     __ set_last_Java_frame(rsp, rbp, the_pc);
7552     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
7553 
7554     // Call runtime
7555     if (arg1 != noreg) {
7556       assert(arg2 != c_rarg1, "clobbered");
7557       __ movptr(c_rarg1, arg1);
7558     }
7559     if (arg2 != noreg) {
7560       __ movptr(c_rarg2, arg2);
7561     }
7562     __ movptr(c_rarg0, r15_thread);
7563     BLOCK_COMMENT("call runtime_entry");
7564     __ call(RuntimeAddress(runtime_entry));
7565 
7566     // Generate oop map
7567     OopMap* map = new OopMap(framesize, 0);
7568 
7569     oop_maps->add_gc_map(the_pc - start, map);
7570 
7571     __ reset_last_Java_frame(true);
7572 
7573     __ leave(); // required for proper stackwalking of RuntimeStub frame
7574 
7575     // check for pending exceptions
7576 #ifdef ASSERT
7577     Label L;
7578     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
7579             (int32_t) NULL_WORD);
7580     __ jcc(Assembler::notEqual, L);
7581     __ should_not_reach_here();
7582     __ bind(L);
7583 #endif // ASSERT
7584     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7585 
7586 
7587     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7588     RuntimeStub* stub =
7589       RuntimeStub::new_runtime_stub(name,
7590                                     &code,
7591                                     frame_complete,
7592                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7593                                     oop_maps, false);
7594     return stub->entry_point();
7595   }
7596 
7597   void create_control_words() {
    // Round to nearest (RC = 00), all exceptions masked
7599     StubRoutines::x86::_mxcsr_std = 0x1F80;
7600   }
7601 
7602   // Initialization
7603   void generate_initial() {
7604     // Generates all stubs and initializes the entry points
7605 
    // These platform-specific settings are needed by generate_call_stub()
7607     create_control_words();
7608 
    // entry points that exist on all platforms. Note: This is code
7610     // that could be shared among different platforms - however the
7611     // benefit seems to be smaller than the disadvantage of having a
7612     // much more complicated generator structure. See also comment in
7613     // stubRoutines.hpp.
7614 
7615     StubRoutines::_forward_exception_entry = generate_forward_exception();
7616 
7617     StubRoutines::_call_stub_entry =
7618       generate_call_stub(StubRoutines::_call_stub_return_address);
7619 
7620     // is referenced by megamorphic call
7621     StubRoutines::_catch_exception_entry = generate_catch_exception();
7622 
7623     // atomic calls
7624     StubRoutines::_fence_entry                = generate_orderaccess_fence();
7625 
7626     // platform dependent
7627     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
7628 
7629     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
7630 
7631     StubRoutines::x86::_f2i_fixup             = generate_f2i_fixup();
7632     StubRoutines::x86::_f2l_fixup             = generate_f2l_fixup();
7633     StubRoutines::x86::_d2i_fixup             = generate_d2i_fixup();
7634     StubRoutines::x86::_d2l_fixup             = generate_d2l_fixup();
7635 
7636     StubRoutines::x86::_float_sign_mask       = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
7637     StubRoutines::x86::_float_sign_flip       = generate_fp_mask("float_sign_flip",  0x8000000080000000);
7638     StubRoutines::x86::_double_sign_mask      = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7639     StubRoutines::x86::_double_sign_flip      = generate_fp_mask("double_sign_flip", 0x8000000000000000);
7640 
7641     // Build this early so it's available for the interpreter.
7642     StubRoutines::_throw_StackOverflowError_entry =
7643       generate_throw_exception("StackOverflowError throw_exception",
7644                                CAST_FROM_FN_PTR(address,
7645                                                 SharedRuntime::
7646                                                 throw_StackOverflowError));
7647     StubRoutines::_throw_delayed_StackOverflowError_entry =
7648       generate_throw_exception("delayed StackOverflowError throw_exception",
7649                                CAST_FROM_FN_PTR(address,
7650                                                 SharedRuntime::
7651                                                 throw_delayed_StackOverflowError));
7652     if (UseCRC32Intrinsics) {
      // set the table address before generating the stubs that use it
7654       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
7655       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7656     }
7657 
7658     if (UseCRC32CIntrinsics) {
7659       bool supports_clmul = VM_Version::supports_clmul();
7660       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
7661       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
7662       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
7663     }
7664 
7665     if (UseAdler32Intrinsics) {
7666        StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7667     }
7668 
7669     if (UseLibmIntrinsic && InlineIntrinsics) {
7670       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
7671           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
7672           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7673         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
7674         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
7675         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
7676         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
7677         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
7678         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
7679         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
7680         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
7681         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
7682         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
7683         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
7684         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
7685         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
7686         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
7687       }
7688       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
7689         StubRoutines::_dexp = generate_libmExp();
7690       }
7691       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7692         StubRoutines::_dlog = generate_libmLog();
7693       }
7694       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
7695         StubRoutines::_dlog10 = generate_libmLog10();
7696       }
7697       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
7698         StubRoutines::_dpow = generate_libmPow();
7699       }
7700       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7701         StubRoutines::_dsin = generate_libmSin();
7702       }
7703       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7704         StubRoutines::_dcos = generate_libmCos();
7705       }
7706       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7707         StubRoutines::_dtan = generate_libmTan();
7708       }
7709     }
7710   }
7711 
7712   void generate_all() {
7713     // Generates all stubs and initializes the entry points
7714 
7715     // These entry points require SharedInfo::stack0 to be set up in
7716     // non-core builds and need to be relocatable, so they each
7717     // fabricate a RuntimeStub internally.
7718     StubRoutines::_throw_AbstractMethodError_entry =
7719       generate_throw_exception("AbstractMethodError throw_exception",
7720                                CAST_FROM_FN_PTR(address,
7721                                                 SharedRuntime::
7722                                                 throw_AbstractMethodError));
7723 
7724     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7725       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7726                                CAST_FROM_FN_PTR(address,
7727                                                 SharedRuntime::
7728                                                 throw_IncompatibleClassChangeError));
7729 
7730     StubRoutines::_throw_NullPointerException_at_call_entry =
7731       generate_throw_exception("NullPointerException at call throw_exception",
7732                                CAST_FROM_FN_PTR(address,
7733                                                 SharedRuntime::
7734                                                 throw_NullPointerException_at_call));
7735 
7736     // entry points that are platform specific
7737     StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
7738     StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000);
7739     StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7740     StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000);
7741     StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF);
7742     StubRoutines::x86::_vector_int_mask_cmp_bits = generate_vector_mask("vector_int_mask_cmp_bits", 0x0000000100000001);
7743     StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
7744     StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
7745     StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff000000ff);
7746     StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff0000ffff);
7747     StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
7748                                                                         0xFFFFFFFF, 0, 0, 0);
7749     StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
7750                                                                         0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
7751     StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x0302010003020100);
7752     StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
7753     StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x0100010001000100);
7754     StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask("vector_long_shuffle_mask", 0x0000000100000000);
7755     StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
7756     StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
7757     StubRoutines::x86::_vector_count_leading_zeros_lut = generate_count_leading_zeros_lut("count_leading_zeros_lut");
7758     StubRoutines::x86::_vector_reverse_bit_lut = generate_vector_reverse_bit_lut("reverse_bit_lut");
7759     StubRoutines::x86::_vector_reverse_byte_perm_mask_long = generate_vector_reverse_byte_perm_mask_long("perm_mask_long");
7760     StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int");
7761     StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short");
7762 
7763     if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
      // lut implementation influenced by the counting-1s algorithm from section 5-1 of Hacker's Delight.
7765       StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
7766     }
7767 
7768     // support for verify_oop (must happen after universe_init)
7769     if (VerifyOops) {
7770       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
7771     }
7772 
7773     // data cache line writeback
7774     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7775     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7776 
7777     // arraycopy stubs used by compilers
7778     generate_arraycopy_stubs();
7779 
7780     // don't bother generating these AES intrinsic stubs unless global flag is set
7781     if (UseAESIntrinsics) {
7782       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
7783       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7784       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7785       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7786       if (VM_Version::supports_avx512_vaes() &&  VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq() ) {
7787         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
7788         StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt();
7789         StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt();
7790         StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
7791         StubRoutines::x86::_ghash_poly512_addr = ghash_polynomial512_addr();
7792         StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
7793         StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7794       } else {
7795         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
7796       }
7797     }
7798 
7799     if (UseAESCTRIntrinsics) {
7800       if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512bw() && VM_Version::supports_avx512vl()) {
7801         if (StubRoutines::x86::_counter_mask_addr == NULL) {
7802           StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
7803         }
7804         StubRoutines::_counterMode_AESCrypt = generate_counterMode_VectorAESCrypt();
7805       } else {
7806         StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
7807         StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
7808       }
7809     }
7810 
7811     if (UseMD5Intrinsics) {
7812       StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
7813       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
7814     }
7815     if (UseSHA1Intrinsics) {
7816       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
7817       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
7818       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
7819       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
7820     }
7821     if (UseSHA256Intrinsics) {
7822       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
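      // Build _k256_W by duplicating each 16-byte group of _k256 into a 32-byte stride;
      // the AVX2 SHA-256 code reads the round constants replicated across both 128-bit lanes.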
7823       char* dst = (char*)StubRoutines::x86::_k256_W;
7824       char* src = (char*)StubRoutines::x86::_k256;
7825       for (int ii = 0; ii < 16; ++ii) {
7826         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
7827         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
7828       }
7829       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
7830       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
7831       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
7832       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
7833     }
7834     if (UseSHA512Intrinsics) {
7835       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
7836       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
7837       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
7838       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
7839     }
7840 
7841     // Generate GHASH intrinsics code
7842     if (UseGHASHIntrinsics) {
7843       if (StubRoutines::x86::_ghash_long_swap_mask_addr == NULL) {
7844         StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
7845       }
      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
7847       if (VM_Version::supports_avx()) {
7848         StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
7849         StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
7850         StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
7851       } else {
7852         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7853       }
7854     }
7855 
7856 
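         // Base64 encode/decode stubs: extra lookup tables are installed for the
         // AVX2 and AVX-512 VBMI paths when supported; the plain encoding and
         // decoding tables are installed unconditionally.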
7857     if (UseBASE64Intrinsics) {
7858       if (VM_Version::supports_avx2() &&
7859           VM_Version::supports_avx512bw() &&
7860           VM_Version::supports_avx512vl()) {
7861         StubRoutines::x86::_avx2_shuffle_base64 = base64_avx2_shuffle_addr();
7862         StubRoutines::x86::_avx2_input_mask_base64 = base64_avx2_input_mask_addr();
7863         StubRoutines::x86::_avx2_lut_base64 = base64_avx2_lut_addr();
7864       }
7865       StubRoutines::x86::_encoding_table_base64 = base64_encoding_table_addr();
7866       if (VM_Version::supports_avx512_vbmi()) {
7867         StubRoutines::x86::_shuffle_base64 = base64_shuffle_addr();
7868         StubRoutines::x86::_lookup_lo_base64 = base64_vbmi_lookup_lo_addr();
7869         StubRoutines::x86::_lookup_hi_base64 = base64_vbmi_lookup_hi_addr();
7870         StubRoutines::x86::_lookup_lo_base64url = base64_vbmi_lookup_lo_url_addr();
7871         StubRoutines::x86::_lookup_hi_base64url = base64_vbmi_lookup_hi_url_addr();
7872         StubRoutines::x86::_pack_vec_base64 = base64_vbmi_pack_vec_addr();
7873         StubRoutines::x86::_join_0_1_base64 = base64_vbmi_join_0_1_addr();
7874         StubRoutines::x86::_join_1_2_base64 = base64_vbmi_join_1_2_addr();
7875         StubRoutines::x86::_join_2_3_base64 = base64_vbmi_join_2_3_addr();
7876       }
7877       StubRoutines::x86::_decoding_table_base64 = base64_decoding_table_addr();
7878       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7879       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7880     }
7881 
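         // Install the nmethod entry barrier stub when the selected GC provides
         // nmethod entry barriers (bs_nm != NULL).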
7882     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7883     if (bs_nm != NULL) {
7884       StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
7885     }
7886 #ifdef COMPILER2
7887     if (UseMultiplyToLenIntrinsic) {
7888       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7889     }
7890     if (UseSquareToLenIntrinsic) {
7891       StubRoutines::_squareToLen = generate_squareToLen();
7892     }
7893     if (UseMulAddIntrinsic) {
7894       StubRoutines::_mulAdd = generate_mulAdd();
7895     }
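         // The big-integer shift intrinsics are only installed with AVX-512 VBMI2,
         // presumably because they rely on its funnel-shift instructions.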
7896     if (VM_Version::supports_avx512_vbmi2()) {
7897       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7898       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
7899     }
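         // The Montgomery multiply/square intrinsics dispatch to C++ helpers in
         // SharedRuntime rather than to generated stub code.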
7900     if (UseMontgomeryMultiplyIntrinsic) {
7901       StubRoutines::_montgomeryMultiply
7902         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
7903     }
7904     if (UseMontgomerySquareIntrinsic) {
7905       StubRoutines::_montgomerySquare
7906         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
7907     }
7908 
7909     // Look up SVML vector math stub routine addresses in the jsvml library, if present
7910     void *libjsvml = NULL;
7911     char ebuf[1024];
7912     char dll_name[JVM_MAXPATHLEN];
7913     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "jsvml")) {
7914       libjsvml = os::dll_load(dll_name, ebuf, sizeof ebuf);
7915     }
7916     if (libjsvml != NULL) {
7917       // SVML method naming convention
7918       //   All methods are named __jsvml_<op><T><N>_ha_<VV>
7919       //   Where:
7920       //      ha stands for high accuracy
7921       //      <T> optionally indicates float vs. double
7922       //              set to f for a vector float operation
7923       //              omitted for a vector double operation
7924       //      <N> is the number of elements in the vector
7925       //              1, 2, 4, 8, 16
7926       //              e.g. a 128-bit float vector has 4 float elements
7927       //      <VV> indicates the AVX/SSE level:
7928       //              z0 is AVX512, l9 is AVX2, e9 is AVX1 and ex is SSE2
7929       //      e.g. __jsvml_expf16_ha_z0 computes a 16-element vector float exp using AVX512 instructions
7930       //           __jsvml_exp8_ha_z0  computes an 8-element vector double exp using AVX512 instructions
7931 
7932       log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "jsvml" JNI_LIB_SUFFIX, p2i(libjsvml));
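           // Only the 512-bit vector size uses the AVX-512 ("z0") SVML entry points.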
7933       if (UseAVX > 2) {
7934         for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
7935           int vop = VectorSupport::VECTOR_OP_SVML_START + op;
7936           if ((!VM_Version::supports_avx512dq()) &&
7937               (vop == VectorSupport::VECTOR_OP_LOG || vop == VectorSupport::VECTOR_OP_LOG10 || vop == VectorSupport::VECTOR_OP_POW)) {
7938             continue;
7939           }
7940           snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf16_ha_z0", VectorSupport::svmlname[op]);
7941           StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libjsvml, ebuf);
7942 
7943           snprintf(ebuf, sizeof(ebuf), "__jsvml_%s8_ha_z0", VectorSupport::svmlname[op]);
7944           StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libjsvml, ebuf);
7945         }
7946       }
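           // Choose the ISA suffix (see the table above) used by the 64/128/256-bit
           // lookups below.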
7947       const char* avx_sse_str = (UseAVX >= 2) ? "l9" : ((UseAVX == 1) ? "e9" : "ex");
7948       for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
7949         int vop = VectorSupport::VECTOR_OP_SVML_START + op;
7950         if (vop == VectorSupport::VECTOR_OP_POW) {
7951           continue;
7952         }
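             // Note: the 64-bit float vector size reuses the 4-element ("f4") entry
             // point below, presumably because SVML has no 2-element float variant.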
7953         snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7954         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libjsvml, ebuf);
7955 
7956         snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7957         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libjsvml, ebuf);
7958 
7959         snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf8_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7960         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libjsvml, ebuf);
7961 
7962         snprintf(ebuf, sizeof(ebuf), "__jsvml_%s1_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7963         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libjsvml, ebuf);
7964 
7965         snprintf(ebuf, sizeof(ebuf), "__jsvml_%s2_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7966         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libjsvml, ebuf);
7967 
7968         snprintf(ebuf, sizeof(ebuf), "__jsvml_%s4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7969         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libjsvml, ebuf);
7970       }
7971     }
7972 #endif // COMPILER2
7973 
7974     if (UseVectorizedMismatchIntrinsic) {
7975       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
7976     }
7977   }
7978 
7979  public:
7980   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7981     if (all) {
7982       generate_all();
7983     } else {
7984       generate_initial();
7985     }
7986   }
7987 }; // end class declaration
7988 
7989 #define UCM_TABLE_MAX_ENTRIES 16
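     // Called during VM initialization: generates the initial stubs (all == false)
     // or the remaining stubs (all == true), creating the UnsafeCopyMemory table
     // on first use.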
7990 void StubGenerator_generate(CodeBuffer* code, bool all) {
7991   if (UnsafeCopyMemory::_table == NULL) {
7992     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7993   }
7994   StubGenerator g(code, all);
7995 }