1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "ci/ciUtilities.hpp"
  29 #include "compiler/oopMap.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/barrierSetNMethod.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "interpreter/interpreter.hpp"
  35 #include "memory/universe.hpp"
  36 #include "nativeInst_x86.hpp"
  37 #include "oops/instanceOop.hpp"
  38 #include "oops/method.hpp"
  39 #include "oops/objArrayKlass.hpp"
  40 #include "oops/oop.inline.hpp"
  41 #include "prims/methodHandles.hpp"
  42 #include "runtime/arguments.hpp"
  43 #include "runtime/frame.inline.hpp"
  44 #include "runtime/handles.inline.hpp"
  45 #include "runtime/sharedRuntime.hpp"
  46 #include "runtime/stubCodeGenerator.hpp"
  47 #include "runtime/stubRoutines.hpp"
  48 #include "runtime/thread.inline.hpp"
  49 #ifdef COMPILER2
  50 #include "opto/runtime.hpp"
  51 #endif
  52 #if INCLUDE_JVMCI
  53 #include "jvmci/jvmci_globals.hpp"
  54 #endif
  55 #if INCLUDE_ZGC
  56 #include "gc/z/zThreadLocalData.hpp"
  57 #endif
  58 
  59 // Declaration and definition of StubGenerator (no .hpp file).
  60 // For a more detailed description of the stub routine structure
  61 // see the comment in stubRoutines.hpp
  62 
  63 #define __ _masm->
  64 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
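// TIMES_OOP selects the address scale for oop-array indexing: 4-byte narrow oops
// when compressed oops are in use, 8-byte full oops otherwise.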
  65 #define a__ ((Assembler*)_masm)->
  66 
  67 #ifdef PRODUCT
  68 #define BLOCK_COMMENT(str) /* nothing */
  69 #else
  70 #define BLOCK_COMMENT(str) __ block_comment(str)
  71 #endif
  72 
  73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  74 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
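// (Bits 0-5 of MXCSR are the sticky exception status flags; 0xFFC0 keeps only the
// exception masks, rounding control and FZ/DAZ bits for comparison.)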
  75 
  76 // Stub Code definitions
  77 
  78 class StubGenerator: public StubCodeGenerator {
  79  private:
  80 
  81 #ifdef PRODUCT
  82 #define inc_counter_np(counter) ((void)0)
  83 #else
  84   void inc_counter_np_(int& counter) {
  85     // This can destroy rscratch1 if counter is far from the code cache
  86     __ incrementl(ExternalAddress((address)&counter));
  87   }
  88 #define inc_counter_np(counter) \
  89   BLOCK_COMMENT("inc_counter " #counter); \
  90   inc_counter_np_(counter);
  91 #endif
  92 
  93   // Call stubs are used to call Java from C
  94   //
  95   // Linux Arguments:
  96   //    c_rarg0:   call wrapper address                   address
  97   //    c_rarg1:   result                                 address
  98   //    c_rarg2:   result type                            BasicType
  99   //    c_rarg3:   method                                 Method*
 100   //    c_rarg4:   (interpreter) entry point              address
 101   //    c_rarg5:   parameters                             intptr_t*
 102   //    16(rbp): parameter size (in words)              int
 103   //    24(rbp): thread                                 Thread*
 104   //
 105   //     [ return_from_Java     ] <--- rsp
 106   //     [ argument word n      ]
 107   //      ...
 108   // -12 [ argument word 1      ]
 109   // -11 [ saved r15            ] <--- rsp_after_call
 110   // -10 [ saved r14            ]
 111   //  -9 [ saved r13            ]
 112   //  -8 [ saved r12            ]
 113   //  -7 [ saved rbx            ]
 114   //  -6 [ call wrapper         ]
 115   //  -5 [ result               ]
 116   //  -4 [ result type          ]
 117   //  -3 [ method               ]
 118   //  -2 [ entry point          ]
 119   //  -1 [ parameters           ]
 120   //   0 [ saved rbp            ] <--- rbp
 121   //   1 [ return address       ]
 122   //   2 [ parameter size       ]
 123   //   3 [ thread               ]
 124   //
 125   // Windows Arguments:
 126   //    c_rarg0:   call wrapper address                   address
 127   //    c_rarg1:   result                                 address
 128   //    c_rarg2:   result type                            BasicType
 129   //    c_rarg3:   method                                 Method*
 130   //    48(rbp): (interpreter) entry point              address
 131   //    56(rbp): parameters                             intptr_t*
 132   //    64(rbp): parameter size (in words)              int
 133   //    72(rbp): thread                                 Thread*
 134   //
 135   //     [ return_from_Java     ] <--- rsp
 136   //     [ argument word n      ]
 137   //      ...
 138   // -60 [ argument word 1      ]
  // -59 [ saved xmm31          ] <--- rsp_after_call
 140   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 141   // -27 [ saved xmm15          ]
 142   //     [ saved xmm7-xmm14     ]
 143   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 144   //  -7 [ saved r15            ]
 145   //  -6 [ saved r14            ]
 146   //  -5 [ saved r13            ]
 147   //  -4 [ saved r12            ]
 148   //  -3 [ saved rdi            ]
 149   //  -2 [ saved rsi            ]
 150   //  -1 [ saved rbx            ]
 151   //   0 [ saved rbp            ] <--- rbp
 152   //   1 [ return address       ]
 153   //   2 [ call wrapper         ]
 154   //   3 [ result               ]
 155   //   4 [ result type          ]
 156   //   5 [ method               ]
 157   //   6 [ entry point          ]
 158   //   7 [ parameters           ]
 159   //   8 [ parameter size       ]
 160   //   9 [ thread               ]
 161   //
  //    Windows reserves the caller's stack space for arguments 1-4.
 163   //    We spill c_rarg0-c_rarg3 to this space.
 164 
 165   // Call stub stack layout word offsets from rbp
 166   enum call_stub_layout {
 167 #ifdef _WIN64
 168     xmm_save_first     = 6,  // save from xmm6
 169     xmm_save_last      = 31, // to xmm31
 170     xmm_save_base      = -9,
 171     rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
 172     r15_off            = -7,
 173     r14_off            = -6,
 174     r13_off            = -5,
 175     r12_off            = -4,
 176     rdi_off            = -3,
 177     rsi_off            = -2,
 178     rbx_off            = -1,
 179     rbp_off            =  0,
 180     retaddr_off        =  1,
 181     call_wrapper_off   =  2,
 182     result_off         =  3,
 183     result_type_off    =  4,
 184     method_off         =  5,
 185     entry_point_off    =  6,
 186     parameters_off     =  7,
 187     parameter_size_off =  8,
 188     thread_off         =  9
 189 #else
 190     rsp_after_call_off = -12,
 191     mxcsr_off          = rsp_after_call_off,
 192     r15_off            = -11,
 193     r14_off            = -10,
 194     r13_off            = -9,
 195     r12_off            = -8,
 196     rbx_off            = -7,
 197     call_wrapper_off   = -6,
 198     result_off         = -5,
 199     result_type_off    = -4,
 200     method_off         = -3,
 201     entry_point_off    = -2,
 202     parameters_off     = -1,
 203     rbp_off            =  0,
 204     retaddr_off        =  1,
 205     parameter_size_off =  2,
 206     thread_off         =  3
 207 #endif
 208   };
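  // For example, on Linux thread_off = 3 places the thread argument at
  // 3 * wordSize = 24(rbp) and parameter_size_off = 2 puts the parameter count at
  // 16(rbp), matching the "Linux Arguments" diagram above; on Windows thread_off = 9
  // corresponds to 72(rbp).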
 209 
 210 #ifdef _WIN64
 211   Address xmm_save(int reg) {
 212     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 213     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 214   }
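  // Each XMM register occupies two stack words: e.g. xmm_save(6) resolves to
  // Address(rbp, -9 * wordSize) = rbp - 72 and xmm_save(7) to rbp - 88, matching
  // the "each xmm register takes 2 slots" note in the frame diagram above.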
 215 #endif
 216 
 217   address generate_call_stub(address& return_address) {
 218     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 219            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 220            "adjust this code");
 221     StubCodeMark mark(this, "StubRoutines", "call_stub");
 222     address start = __ pc();
 223 
 224     // same as in generate_catch_exception()!
 225     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 226 
 227     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 228     const Address result        (rbp, result_off         * wordSize);
 229     const Address result_type   (rbp, result_type_off    * wordSize);
 230     const Address method        (rbp, method_off         * wordSize);
 231     const Address entry_point   (rbp, entry_point_off    * wordSize);
 232     const Address parameters    (rbp, parameters_off     * wordSize);
 233     const Address parameter_size(rbp, parameter_size_off * wordSize);
 234 
 235     // same as in generate_catch_exception()!
 236     const Address thread        (rbp, thread_off         * wordSize);
 237 
 238     const Address r15_save(rbp, r15_off * wordSize);
 239     const Address r14_save(rbp, r14_off * wordSize);
 240     const Address r13_save(rbp, r13_off * wordSize);
 241     const Address r12_save(rbp, r12_off * wordSize);
 242     const Address rbx_save(rbp, rbx_off * wordSize);
 243 
 244     // stub code
 245     __ enter();
 246     __ subptr(rsp, -rsp_after_call_off * wordSize);
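    // rsp_after_call_off is negative, so this allocates -rsp_after_call_off words
    // (12 on Linux, 27 on Windows) below the saved rbp for the register save area
    // laid out by the call_stub_layout enum.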
 247 
 248     // save register parameters
 249 #ifndef _WIN64
 250     __ movptr(parameters,   c_rarg5); // parameters
 251     __ movptr(entry_point,  c_rarg4); // entry_point
 252 #endif
 253 
 254     __ movptr(method,       c_rarg3); // method
 255     __ movl(result_type,  c_rarg2);   // result type
 256     __ movptr(result,       c_rarg1); // result
 257     __ movptr(call_wrapper, c_rarg0); // call wrapper
 258 
 259     // save regs belonging to calling function
 260     __ movptr(rbx_save, rbx);
 261     __ movptr(r12_save, r12);
 262     __ movptr(r13_save, r13);
 263     __ movptr(r14_save, r14);
 264     __ movptr(r15_save, r15);
 265 
 266 #ifdef _WIN64
 267     int last_reg = 15;
 268     if (UseAVX > 2) {
 269       last_reg = 31;
 270     }
 271     if (VM_Version::supports_evex()) {
 272       for (int i = xmm_save_first; i <= last_reg; i++) {
 273         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 274       }
 275     } else {
 276       for (int i = xmm_save_first; i <= last_reg; i++) {
 277         __ movdqu(xmm_save(i), as_XMMRegister(i));
 278       }
 279     }
 280 
 281     const Address rdi_save(rbp, rdi_off * wordSize);
 282     const Address rsi_save(rbp, rsi_off * wordSize);
 283 
 284     __ movptr(rsi_save, rsi);
 285     __ movptr(rdi_save, rdi);
 286 #else
 287     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 288     {
 289       Label skip_ldmx;
 290       __ stmxcsr(mxcsr_save);
 291       __ movl(rax, mxcsr_save);
 292       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 293       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 294       __ cmp32(rax, mxcsr_std);
 295       __ jcc(Assembler::equal, skip_ldmx);
 296       __ ldmxcsr(mxcsr_std);
 297       __ bind(skip_ldmx);
 298     }
 299 #endif
 300 
 301     // Load up thread register
 302     __ movptr(r15_thread, thread);
 303     __ reinit_heapbase();
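    // r12 doubles as the compressed-oops heap base register in generated code;
    // the C caller's r12 was saved above, so reinit_heapbase() reloads the
    // current narrow-oop base into it.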
 304 
 305 #ifdef ASSERT
 306     // make sure we have no pending exceptions
 307     {
 308       Label L;
 309       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 310       __ jcc(Assembler::equal, L);
 311       __ stop("StubRoutines::call_stub: entered with pending exception");
 312       __ bind(L);
 313     }
 314 #endif
 315 
 316     // pass parameters if any
 317     BLOCK_COMMENT("pass parameters if any");
 318     Label parameters_done;
 319     __ movl(c_rarg3, parameter_size);
 320     __ testl(c_rarg3, c_rarg3);
 321     __ jcc(Assembler::zero, parameters_done);
 322 
 323     Label loop;
 324     __ movptr(c_rarg2, parameters);       // parameter pointer
 325     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 326     __ BIND(loop);
 327     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 328     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 329     __ decrementl(c_rarg1);             // decrement counter
 330     __ push(rax);                       // pass parameter
 331     __ jcc(Assembler::notZero, loop);
 332 
 333     // call Java function
 334     __ BIND(parameters_done);
 335     __ movptr(rbx, method);             // get Method*
 336     __ movptr(c_rarg1, entry_point);    // get entry_point
 337     __ mov(r13, rsp);                   // set sender sp
 338     BLOCK_COMMENT("call Java function");
 339     __ call(c_rarg1);
 340 
 341     BLOCK_COMMENT("call_stub_return_address:");
 342     return_address = __ pc();
 343 
 344     // store result depending on type (everything that is not
 345     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 346     __ movptr(c_rarg0, result);
 347     Label is_long, is_float, is_double, exit;
 348     __ movl(c_rarg1, result_type);
 349     __ cmpl(c_rarg1, T_OBJECT);
 350     __ jcc(Assembler::equal, is_long);
 351     __ cmpl(c_rarg1, T_LONG);
 352     __ jcc(Assembler::equal, is_long);
 353     __ cmpl(c_rarg1, T_FLOAT);
 354     __ jcc(Assembler::equal, is_float);
 355     __ cmpl(c_rarg1, T_DOUBLE);
 356     __ jcc(Assembler::equal, is_double);
 357 
 358     // handle T_INT case
 359     __ movl(Address(c_rarg0, 0), rax);
 360 
 361     __ BIND(exit);
 362 
 363     // pop parameters
 364     __ lea(rsp, rsp_after_call);
 365 
 366 #ifdef ASSERT
 367     // verify that threads correspond
 368     {
      Label L1, L2, L3;
 370       __ cmpptr(r15_thread, thread);
 371       __ jcc(Assembler::equal, L1);
 372       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 373       __ bind(L1);
 374       __ get_thread(rbx);
 375       __ cmpptr(r15_thread, thread);
 376       __ jcc(Assembler::equal, L2);
 377       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 378       __ bind(L2);
 379       __ cmpptr(r15_thread, rbx);
 380       __ jcc(Assembler::equal, L3);
 381       __ stop("StubRoutines::call_stub: threads must correspond");
 382       __ bind(L3);
 383     }
 384 #endif
 385 
 386     // restore regs belonging to calling function
 387 #ifdef _WIN64
 388     // emit the restores for xmm regs
 389     if (VM_Version::supports_evex()) {
 390       for (int i = xmm_save_first; i <= last_reg; i++) {
 391         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 392       }
 393     } else {
 394       for (int i = xmm_save_first; i <= last_reg; i++) {
 395         __ movdqu(as_XMMRegister(i), xmm_save(i));
 396       }
 397     }
 398 #endif
 399     __ movptr(r15, r15_save);
 400     __ movptr(r14, r14_save);
 401     __ movptr(r13, r13_save);
 402     __ movptr(r12, r12_save);
 403     __ movptr(rbx, rbx_save);
 404 
 405 #ifdef _WIN64
 406     __ movptr(rdi, rdi_save);
 407     __ movptr(rsi, rsi_save);
 408 #else
 409     __ ldmxcsr(mxcsr_save);
 410 #endif
 411 
 412     // restore rsp
 413     __ addptr(rsp, -rsp_after_call_off * wordSize);
 414 
 415     // return
 416     __ vzeroupper();
 417     __ pop(rbp);
 418     __ ret(0);
 419 
 420     // handle return types different from T_INT
 421     __ BIND(is_long);
 422     __ movq(Address(c_rarg0, 0), rax);
 423     __ jmp(exit);
 424 
 425     __ BIND(is_float);
 426     __ movflt(Address(c_rarg0, 0), xmm0);
 427     __ jmp(exit);
 428 
 429     __ BIND(is_double);
 430     __ movdbl(Address(c_rarg0, 0), xmm0);
 431     __ jmp(exit);
 432 
 433     return start;
 434   }
 435 
 436   // Return point for a Java call if there's an exception thrown in
 437   // Java code.  The exception is caught and transformed into a
 438   // pending exception stored in JavaThread that can be tested from
 439   // within the VM.
 440   //
  // Note: Usually the parameters are removed by the callee. If an
  // exception crosses an activation frame boundary and the callee is
  // compiled code, that is not the case, so we need to set up rsp
  // here.
 445   //
 446   // rax: exception oop
 447 
 448   address generate_catch_exception() {
 449     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 450     address start = __ pc();
 451 
 452     // same as in generate_call_stub():
 453     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 454     const Address thread        (rbp, thread_off         * wordSize);
 455 
 456 #ifdef ASSERT
 457     // verify that threads correspond
 458     {
 459       Label L1, L2, L3;
 460       __ cmpptr(r15_thread, thread);
 461       __ jcc(Assembler::equal, L1);
 462       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 463       __ bind(L1);
 464       __ get_thread(rbx);
 465       __ cmpptr(r15_thread, thread);
 466       __ jcc(Assembler::equal, L2);
 467       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 468       __ bind(L2);
 469       __ cmpptr(r15_thread, rbx);
 470       __ jcc(Assembler::equal, L3);
 471       __ stop("StubRoutines::catch_exception: threads must correspond");
 472       __ bind(L3);
 473     }
 474 #endif
 475 
 476     // set pending exception
 477     __ verify_oop(rax);
 478 
 479     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 480     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 481     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 482     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 483 
 484     // complete return to VM
 485     assert(StubRoutines::_call_stub_return_address != NULL,
 486            "_call_stub_return_address must have been generated before");
 487     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 488 
 489     return start;
 490   }
 491 
 492   // Continuation point for runtime calls returning with a pending
 493   // exception.  The pending exception check happened in the runtime
 494   // or native call stub.  The pending exception in Thread is
 495   // converted into a Java-level exception.
 496   //
 497   // Contract with Java-level exception handlers:
 498   // rax: exception
 499   // rdx: throwing pc
 500   //
 501   // NOTE: At entry of this stub, exception-pc must be on stack !!
 502 
 503   address generate_forward_exception() {
 504     StubCodeMark mark(this, "StubRoutines", "forward exception");
 505     address start = __ pc();
 506 
 507     // Upon entry, the sp points to the return address returning into
 508     // Java (interpreted or compiled) code; i.e., the return address
 509     // becomes the throwing pc.
 510     //
 511     // Arguments pushed before the runtime call are still on the stack
 512     // but the exception handler will reset the stack pointer ->
 513     // ignore them.  A potential result in registers can be ignored as
 514     // well.
 515 
 516 #ifdef ASSERT
 517     // make sure this code is only executed if there is a pending exception
 518     {
 519       Label L;
 520       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL);
 521       __ jcc(Assembler::notEqual, L);
 522       __ stop("StubRoutines::forward exception: no pending exception (1)");
 523       __ bind(L);
 524     }
 525 #endif
 526 
 527     // compute exception handler into rbx
 528     __ movptr(c_rarg0, Address(rsp, 0));
 529     BLOCK_COMMENT("call exception_handler_for_return_address");
 530     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 531                          SharedRuntime::exception_handler_for_return_address),
 532                     r15_thread, c_rarg0);
 533     __ mov(rbx, rax);
 534 
 535     // setup rax & rdx, remove return address & clear pending exception
 536     __ pop(rdx);
 537     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 538     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 539 
 540 #ifdef ASSERT
 541     // make sure exception is set
 542     {
 543       Label L;
 544       __ testptr(rax, rax);
 545       __ jcc(Assembler::notEqual, L);
 546       __ stop("StubRoutines::forward exception: no pending exception (2)");
 547       __ bind(L);
 548     }
 549 #endif
 550 
 551     // continue at exception handler (return address removed)
 552     // rax: exception
 553     // rbx: exception handler
 554     // rdx: throwing pc
 555     __ verify_oop(rax);
 556     __ jmp(rbx);
 557 
 558     return start;
 559   }
 560 
 561   // Support for intptr_t OrderAccess::fence()
 562   //
 563   // Arguments :
 564   //
 565   // Result:
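  //
  // On x86 only the StoreLoad ordering requires an explicit barrier; the
  // membar below emits a serializing instruction for it, while the other
  // orderings are already guaranteed by the hardware memory model.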
 566   address generate_orderaccess_fence() {
 567     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 568     address start = __ pc();
 569     __ membar(Assembler::StoreLoad);
 570     __ ret(0);
 571 
 572     return start;
 573   }
 574 
 575 
 576   // Support for intptr_t get_previous_sp()
 577   //
 578   // This routine is used to find the previous stack pointer for the
 579   // caller.
 580   address generate_get_previous_sp() {
 581     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
 582     address start = __ pc();
 583 
 584     __ movptr(rax, rsp);
 585     __ addptr(rax, 8); // return address is at the top of the stack.
 586     __ ret(0);
 587 
 588     return start;
 589   }
 590 
 591   //----------------------------------------------------------------------------------------------------
 592   // Support for void verify_mxcsr()
 593   //
 594   // This routine is used with -Xcheck:jni to verify that native
 595   // JNI code does not return to Java code without restoring the
 596   // MXCSR register to our expected state.
 597 
 598   address generate_verify_mxcsr() {
 599     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 600     address start = __ pc();
 601 
 602     const Address mxcsr_save(rsp, 0);
 603 
 604     if (CheckJNICalls) {
 605       Label ok_ret;
 606       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 607       __ push(rax);
 608       __ subptr(rsp, wordSize);      // allocate a temp location
 609       __ stmxcsr(mxcsr_save);
 610       __ movl(rax, mxcsr_save);
 611       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 612       __ cmp32(rax, mxcsr_std);
 613       __ jcc(Assembler::equal, ok_ret);
 614 
 615       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 616 
 617       __ ldmxcsr(mxcsr_std);
 618 
 619       __ bind(ok_ret);
 620       __ addptr(rsp, wordSize);
 621       __ pop(rax);
 622     }
 623 
 624     __ ret(0);
 625 
 626     return start;
 627   }
 628 
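  // The following *_fixup stubs are the slow path for Java float/double ->
  // int/long conversion: the x86 cvtts[sd]2si instructions return the
  // "integer indefinite" value for NaN and out-of-range inputs, so generated
  // code can call these stubs to rewrite the stacked input with the result
  // Java requires (0 for NaN, min/max for negative/positive overflow).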
 629   address generate_f2i_fixup() {
 630     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
 631     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 632 
 633     address start = __ pc();
 634 
 635     Label L;
 636 
 637     __ push(rax);
 638     __ push(c_rarg3);
 639     __ push(c_rarg2);
 640     __ push(c_rarg1);
 641 
 642     __ movl(rax, 0x7f800000);
 643     __ xorl(c_rarg3, c_rarg3);
 644     __ movl(c_rarg2, inout);
 645     __ movl(c_rarg1, c_rarg2);
 646     __ andl(c_rarg1, 0x7fffffff);
 647     __ cmpl(rax, c_rarg1); // NaN? -> 0
 648     __ jcc(Assembler::negative, L);
 649     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
 650     __ movl(c_rarg3, 0x80000000);
 651     __ movl(rax, 0x7fffffff);
 652     __ cmovl(Assembler::positive, c_rarg3, rax);
 653 
 654     __ bind(L);
 655     __ movptr(inout, c_rarg3);
 656 
 657     __ pop(c_rarg1);
 658     __ pop(c_rarg2);
 659     __ pop(c_rarg3);
 660     __ pop(rax);
 661 
 662     __ ret(0);
 663 
 664     return start;
 665   }
 666 
 667   address generate_f2l_fixup() {
 668     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
 669     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 670     address start = __ pc();
 671 
 672     Label L;
 673 
 674     __ push(rax);
 675     __ push(c_rarg3);
 676     __ push(c_rarg2);
 677     __ push(c_rarg1);
 678 
 679     __ movl(rax, 0x7f800000);
 680     __ xorl(c_rarg3, c_rarg3);
 681     __ movl(c_rarg2, inout);
 682     __ movl(c_rarg1, c_rarg2);
 683     __ andl(c_rarg1, 0x7fffffff);
 684     __ cmpl(rax, c_rarg1); // NaN? -> 0
 685     __ jcc(Assembler::negative, L);
 686     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
 687     __ mov64(c_rarg3, 0x8000000000000000);
 688     __ mov64(rax, 0x7fffffffffffffff);
 689     __ cmov(Assembler::positive, c_rarg3, rax);
 690 
 691     __ bind(L);
 692     __ movptr(inout, c_rarg3);
 693 
 694     __ pop(c_rarg1);
 695     __ pop(c_rarg2);
 696     __ pop(c_rarg3);
 697     __ pop(rax);
 698 
 699     __ ret(0);
 700 
 701     return start;
 702   }
 703 
 704   address generate_d2i_fixup() {
 705     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
 706     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 707 
 708     address start = __ pc();
 709 
 710     Label L;
 711 
 712     __ push(rax);
 713     __ push(c_rarg3);
 714     __ push(c_rarg2);
 715     __ push(c_rarg1);
 716     __ push(c_rarg0);
 717 
 718     __ movl(rax, 0x7ff00000);
 719     __ movq(c_rarg2, inout);
 720     __ movl(c_rarg3, c_rarg2);
 721     __ mov(c_rarg1, c_rarg2);
 722     __ mov(c_rarg0, c_rarg2);
 723     __ negl(c_rarg3);
 724     __ shrptr(c_rarg1, 0x20);
 725     __ orl(c_rarg3, c_rarg2);
 726     __ andl(c_rarg1, 0x7fffffff);
 727     __ xorl(c_rarg2, c_rarg2);
 728     __ shrl(c_rarg3, 0x1f);
 729     __ orl(c_rarg1, c_rarg3);
 730     __ cmpl(rax, c_rarg1);
 731     __ jcc(Assembler::negative, L); // NaN -> 0
 732     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
 733     __ movl(c_rarg2, 0x80000000);
 734     __ movl(rax, 0x7fffffff);
 735     __ cmov(Assembler::positive, c_rarg2, rax);
 736 
 737     __ bind(L);
 738     __ movptr(inout, c_rarg2);
 739 
 740     __ pop(c_rarg0);
 741     __ pop(c_rarg1);
 742     __ pop(c_rarg2);
 743     __ pop(c_rarg3);
 744     __ pop(rax);
 745 
 746     __ ret(0);
 747 
 748     return start;
 749   }
 750 
 751   address generate_d2l_fixup() {
 752     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
 753     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 754 
 755     address start = __ pc();
 756 
 757     Label L;
 758 
 759     __ push(rax);
 760     __ push(c_rarg3);
 761     __ push(c_rarg2);
 762     __ push(c_rarg1);
 763     __ push(c_rarg0);
 764 
 765     __ movl(rax, 0x7ff00000);
 766     __ movq(c_rarg2, inout);
 767     __ movl(c_rarg3, c_rarg2);
 768     __ mov(c_rarg1, c_rarg2);
 769     __ mov(c_rarg0, c_rarg2);
 770     __ negl(c_rarg3);
 771     __ shrptr(c_rarg1, 0x20);
 772     __ orl(c_rarg3, c_rarg2);
 773     __ andl(c_rarg1, 0x7fffffff);
 774     __ xorl(c_rarg2, c_rarg2);
 775     __ shrl(c_rarg3, 0x1f);
 776     __ orl(c_rarg1, c_rarg3);
 777     __ cmpl(rax, c_rarg1);
 778     __ jcc(Assembler::negative, L); // NaN -> 0
 779     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
 780     __ mov64(c_rarg2, 0x8000000000000000);
 781     __ mov64(rax, 0x7fffffffffffffff);
 782     __ cmovq(Assembler::positive, c_rarg2, rax);
 783 
 784     __ bind(L);
 785     __ movq(inout, c_rarg2);
 786 
 787     __ pop(c_rarg0);
 788     __ pop(c_rarg1);
 789     __ pop(c_rarg2);
 790     __ pop(c_rarg3);
 791     __ pop(rax);
 792 
 793     __ ret(0);
 794 
 795     return start;
 796   }
 797 
 798   address generate_iota_indices(const char *stub_name) {
 799     __ align(CodeEntryAlignment);
 800     StubCodeMark mark(this, "StubRoutines", stub_name);
 801     address start = __ pc();
 802     __ emit_data64(0x0706050403020100, relocInfo::none);
 803     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 804     __ emit_data64(0x1716151413121110, relocInfo::none);
 805     __ emit_data64(0x1F1E1D1C1B1A1918, relocInfo::none);
 806     __ emit_data64(0x2726252423222120, relocInfo::none);
 807     __ emit_data64(0x2F2E2D2C2B2A2928, relocInfo::none);
 808     __ emit_data64(0x3736353433323130, relocInfo::none);
 809     __ emit_data64(0x3F3E3D3C3B3A3938, relocInfo::none);
 810     return start;
 811   }
 812 
 813   address generate_vector_byte_shuffle_mask(const char *stub_name) {
 814     __ align(CodeEntryAlignment);
 815     StubCodeMark mark(this, "StubRoutines", stub_name);
 816     address start = __ pc();
 817     __ emit_data64(0x7070707070707070, relocInfo::none);
 818     __ emit_data64(0x7070707070707070, relocInfo::none);
 819     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 820     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 821     return start;
 822   }
 823 
 824   address generate_fp_mask(const char *stub_name, int64_t mask) {
 825     __ align(CodeEntryAlignment);
 826     StubCodeMark mark(this, "StubRoutines", stub_name);
 827     address start = __ pc();
 828 
 829     __ emit_data64( mask, relocInfo::none );
 830     __ emit_data64( mask, relocInfo::none );
 831 
 832     return start;
 833   }
 834 
 835   address generate_vector_mask(const char *stub_name, int64_t mask) {
 836     __ align(CodeEntryAlignment);
 837     StubCodeMark mark(this, "StubRoutines", stub_name);
 838     address start = __ pc();
 839 
 840     __ emit_data64(mask, relocInfo::none);
 841     __ emit_data64(mask, relocInfo::none);
 842     __ emit_data64(mask, relocInfo::none);
 843     __ emit_data64(mask, relocInfo::none);
 844     __ emit_data64(mask, relocInfo::none);
 845     __ emit_data64(mask, relocInfo::none);
 846     __ emit_data64(mask, relocInfo::none);
 847     __ emit_data64(mask, relocInfo::none);
 848 
 849     return start;
 850   }
 851 
 852   address generate_vector_byte_perm_mask(const char *stub_name) {
 853     __ align(CodeEntryAlignment);
 854     StubCodeMark mark(this, "StubRoutines", stub_name);
 855     address start = __ pc();
 856 
 857     __ emit_data64(0x0000000000000001, relocInfo::none);
 858     __ emit_data64(0x0000000000000003, relocInfo::none);
 859     __ emit_data64(0x0000000000000005, relocInfo::none);
 860     __ emit_data64(0x0000000000000007, relocInfo::none);
 861     __ emit_data64(0x0000000000000000, relocInfo::none);
 862     __ emit_data64(0x0000000000000002, relocInfo::none);
 863     __ emit_data64(0x0000000000000004, relocInfo::none);
 864     __ emit_data64(0x0000000000000006, relocInfo::none);
 865 
 866     return start;
 867   }
 868 
 869   address generate_vector_fp_mask(const char *stub_name, int64_t mask) {
 870     __ align(CodeEntryAlignment);
 871     StubCodeMark mark(this, "StubRoutines", stub_name);
 872     address start = __ pc();
 873 
 874     __ emit_data64(mask, relocInfo::none);
 875     __ emit_data64(mask, relocInfo::none);
 876     __ emit_data64(mask, relocInfo::none);
 877     __ emit_data64(mask, relocInfo::none);
 878     __ emit_data64(mask, relocInfo::none);
 879     __ emit_data64(mask, relocInfo::none);
 880     __ emit_data64(mask, relocInfo::none);
 881     __ emit_data64(mask, relocInfo::none);
 882 
 883     return start;
 884   }
 885 
 886   address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
 887                                      int32_t val0, int32_t val1, int32_t val2, int32_t val3,
 888                                      int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
 889                                      int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
 890                                      int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
 891     __ align(CodeEntryAlignment);
 892     StubCodeMark mark(this, "StubRoutines", stub_name);
 893     address start = __ pc();
 894 
 895     assert(len != Assembler::AVX_NoVec, "vector len must be specified");
 896     __ emit_data(val0, relocInfo::none, 0);
 897     __ emit_data(val1, relocInfo::none, 0);
 898     __ emit_data(val2, relocInfo::none, 0);
 899     __ emit_data(val3, relocInfo::none, 0);
 900     if (len >= Assembler::AVX_256bit) {
 901       __ emit_data(val4, relocInfo::none, 0);
 902       __ emit_data(val5, relocInfo::none, 0);
 903       __ emit_data(val6, relocInfo::none, 0);
 904       __ emit_data(val7, relocInfo::none, 0);
 905       if (len >= Assembler::AVX_512bit) {
 906         __ emit_data(val8, relocInfo::none, 0);
 907         __ emit_data(val9, relocInfo::none, 0);
 908         __ emit_data(val10, relocInfo::none, 0);
 909         __ emit_data(val11, relocInfo::none, 0);
 910         __ emit_data(val12, relocInfo::none, 0);
 911         __ emit_data(val13, relocInfo::none, 0);
 912         __ emit_data(val14, relocInfo::none, 0);
 913         __ emit_data(val15, relocInfo::none, 0);
 914       }
 915     }
 916 
 917     return start;
 918   }
 919 
 920   // Non-destructive plausibility checks for oops
 921   //
 922   // Arguments:
 923   //    all args on stack!
 924   //
 925   // Stack after saving c_rarg3:
 926   //    [tos + 0]: saved c_rarg3
 927   //    [tos + 1]: saved c_rarg2
 928   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
 929   //    [tos + 3]: saved flags
 930   //    [tos + 4]: return address
 931   //  * [tos + 5]: error message (char*)
 932   //  * [tos + 6]: object to verify (oop)
 933   //  * [tos + 7]: saved rax - saved by caller and bashed
 934   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
 935   //  * = popped on exit
 936   address generate_verify_oop() {
 937     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 938     address start = __ pc();
 939 
 940     Label exit, error;
 941 
 942     __ pushf();
 943     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 944 
 945     __ push(r12);
 946 
 947     // save c_rarg2 and c_rarg3
 948     __ push(c_rarg2);
 949     __ push(c_rarg3);
 950 
 951     enum {
 952            // After previous pushes.
 953            oop_to_verify = 6 * wordSize,
 954            saved_rax     = 7 * wordSize,
 955            saved_r10     = 8 * wordSize,
 956 
 957            // Before the call to MacroAssembler::debug(), see below.
 958            return_addr   = 16 * wordSize,
 959            error_msg     = 17 * wordSize
 960     };
 961 
 962     // get object
 963     __ movptr(rax, Address(rsp, oop_to_verify));
 964 
 965     // make sure object is 'reasonable'
 966     __ testptr(rax, rax);
 967     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
 968 
 969 #if INCLUDE_ZGC
 970     if (UseZGC) {
 971       // Check if metadata bits indicate a bad oop
 972       __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
 973       __ jcc(Assembler::notZero, error);
 974     }
 975 #endif
 976 
 977     // Check if the oop is in the right area of memory
 978     __ movptr(c_rarg2, rax);
 979     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 980     __ andptr(c_rarg2, c_rarg3);
 981     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 982     __ cmpptr(c_rarg2, c_rarg3);
 983     __ jcc(Assembler::notZero, error);
 984 
 985     // make sure klass is 'reasonable', which is not zero.
 986     __ load_klass(rax, rax, rscratch1);  // get klass
 987     __ testptr(rax, rax);
 988     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
 989 
 990     // return if everything seems ok
 991     __ bind(exit);
 992     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
 993     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
 994     __ pop(c_rarg3);                             // restore c_rarg3
 995     __ pop(c_rarg2);                             // restore c_rarg2
 996     __ pop(r12);                                 // restore r12
 997     __ popf();                                   // restore flags
 998     __ ret(4 * wordSize);                        // pop caller saved stuff
 999 
1000     // handle errors
1001     __ bind(error);
1002     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1003     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1004     __ pop(c_rarg3);                             // get saved c_rarg3 back
1005     __ pop(c_rarg2);                             // get saved c_rarg2 back
1006     __ pop(r12);                                 // get saved r12 back
1007     __ popf();                                   // get saved flags off stack --
1008                                                  // will be ignored
1009 
    __ pusha();                                  // push registers
                                                 // (rip is already pushed)
1013     // debug(char* msg, int64_t pc, int64_t regs[])
1014     // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
1015     // pushed all the registers, so now the stack looks like:
1016     //     [tos +  0] 16 saved registers
1017     //     [tos + 16] return address
1018     //   * [tos + 17] error message (char*)
1019     //   * [tos + 18] object to verify (oop)
1020     //   * [tos + 19] saved rax - saved by caller and bashed
1021     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1022     //   * = popped on exit
1023 
1024     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1025     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1026     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1027     __ mov(r12, rsp);                               // remember rsp
1028     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1029     __ andptr(rsp, -16);                            // align stack as required by ABI
1030     BLOCK_COMMENT("call MacroAssembler::debug");
1031     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1032     __ hlt();
1033     return start;
1034   }
1035 
1036   //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts.
  //
  //  Input:
  //    Rint  -  32-bit value
1042   //    Rtmp  -  scratch
1043   //
1044   void assert_clean_int(Register Rint, Register Rtmp) {
1045 #ifdef ASSERT
1046     Label L;
1047     assert_different_registers(Rtmp, Rint);
1048     __ movslq(Rtmp, Rint);
1049     __ cmpq(Rtmp, Rint);
1050     __ jcc(Assembler::equal, L);
1051     __ stop("high 32-bits of int value are not 0");
1052     __ bind(L);
1053 #endif
1054   }
1055 
1056   //  Generate overlap test for array copy stubs
1057   //
1058   //  Input:
1059   //     c_rarg0 - from
1060   //     c_rarg1 - to
1061   //     c_rarg2 - element count
1062   //
1063   //  Output:
  //     rax   - &from[element count] (first address past the source range)
1065   //
1066   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1067     assert(no_overlap_target != NULL, "must be generated");
1068     array_overlap_test(no_overlap_target, NULL, sf);
1069   }
1070   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1071     array_overlap_test(NULL, &L_no_overlap, sf);
1072   }
1073   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1074     const Register from     = c_rarg0;
1075     const Register to       = c_rarg1;
1076     const Register count    = c_rarg2;
1077     const Register end_from = rax;
1078 
1079     __ cmpptr(to, from);
1080     __ lea(end_from, Address(from, count, sf, 0));
1081     if (NOLp == NULL) {
1082       ExternalAddress no_overlap(no_overlap_target);
1083       __ jump_cc(Assembler::belowEqual, no_overlap);
1084       __ cmpptr(to, end_from);
1085       __ jump_cc(Assembler::aboveEqual, no_overlap);
1086     } else {
1087       __ jcc(Assembler::belowEqual, (*NOLp));
1088       __ cmpptr(to, end_from);
1089       __ jcc(Assembler::aboveEqual, (*NOLp));
1090     }
1091   }
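  // The test branches to the no-overlap target when 'to' <= 'from' or when 'to'
  // lies at or beyond the end of the source range (from + count * element_size);
  // otherwise execution falls through into the conjoint stub's backward-copy code.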
1092 
1093   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1094   //
1095   // Outputs:
1096   //    rdi - rcx
1097   //    rsi - rdx
1098   //    rdx - r8
1099   //    rcx - r9
1100   //
  // Registers r9 and r10 are used to save rdi and rsi, which are
  // non-volatile on Windows.  r9 and r10 should not be used by the caller.
1103   //
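  // (Win64 passes the first four integer arguments in rcx, rdx, r8 and r9, while
  // the copy stubs below are written against the System V registers rdi, rsi, rdx
  // and rcx, hence the shuffle and the need to preserve the non-volatile rdi/rsi.)
  //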
1104   DEBUG_ONLY(bool regs_in_thread;)
1105 
1106   void setup_arg_regs(int nargs = 3) {
1107     const Register saved_rdi = r9;
1108     const Register saved_rsi = r10;
1109     assert(nargs == 3 || nargs == 4, "else fix");
1110 #ifdef _WIN64
1111     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1112            "unexpected argument registers");
1113     if (nargs >= 4)
1114       __ mov(rax, r9);  // r9 is also saved_rdi
1115     __ movptr(saved_rdi, rdi);
1116     __ movptr(saved_rsi, rsi);
1117     __ mov(rdi, rcx); // c_rarg0
1118     __ mov(rsi, rdx); // c_rarg1
1119     __ mov(rdx, r8);  // c_rarg2
1120     if (nargs >= 4)
1121       __ mov(rcx, rax); // c_rarg3 (via rax)
1122 #else
1123     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1124            "unexpected argument registers");
1125 #endif
1126     DEBUG_ONLY(regs_in_thread = false;)
1127   }
1128 
1129   void restore_arg_regs() {
1130     assert(!regs_in_thread, "wrong call to restore_arg_regs");
1131     const Register saved_rdi = r9;
1132     const Register saved_rsi = r10;
1133 #ifdef _WIN64
1134     __ movptr(rdi, saved_rdi);
1135     __ movptr(rsi, saved_rsi);
1136 #endif
1137   }
1138 
1139   // This is used in places where r10 is a scratch register, and can
1140   // be adapted if r9 is needed also.
1141   void setup_arg_regs_using_thread() {
1142     const Register saved_r15 = r9;
1143 #ifdef _WIN64
1144     __ mov(saved_r15, r15);  // r15 is callee saved and needs to be restored
1145     __ get_thread(r15_thread);
1146     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1147            "unexpected argument registers");
1148     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
1149     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);
1150 
1151     __ mov(rdi, rcx); // c_rarg0
1152     __ mov(rsi, rdx); // c_rarg1
1153     __ mov(rdx, r8);  // c_rarg2
1154 #else
1155     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1156            "unexpected argument registers");
1157 #endif
1158     DEBUG_ONLY(regs_in_thread = true;)
1159   }
1160 
1161   void restore_arg_regs_using_thread() {
1162     assert(regs_in_thread, "wrong call to restore_arg_regs");
1163     const Register saved_r15 = r9;
1164 #ifdef _WIN64
1165     __ get_thread(r15_thread);
1166     __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
1167     __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
1168     __ mov(r15, saved_r15);  // r15 is callee saved and needs to be restored
1169 #endif
1170   }
1171 
1172   // Copy big chunks forward
1173   //
1174   // Inputs:
  //   end_from     - source array's end address
  //   end_to       - destination array's end address
  //   qword_count  - 64-bit element count, negative
  //   to           - scratch
  //   L_copy_bytes - entry label
  //   L_copy_8_bytes - exit label
1181   //
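  // Callers enter at L_copy_bytes with a negative qword_count that is stepped up
  // toward zero; addresses are formed as end_from/end_to + qword_count * 8, so the
  // copy proceeds forward and finishes near the end pointers.
  //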
1182   void copy_bytes_forward(Register end_from, Register end_to,
1183                              Register qword_count, Register to,
1184                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1185     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1186     Label L_loop;
1187     __ align(OptoLoopAlignment);
1188     if (UseUnalignedLoadStores) {
1189       Label L_end;
1190       __ BIND(L_loop);
1191       if (UseAVX >= 2) {
1192         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1193         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1194         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1195         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1196       } else {
1197         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1198         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1199         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1200         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1201         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1202         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1203         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1204         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1205       }
1206 
1207       __ BIND(L_copy_bytes);
1208       __ addptr(qword_count, 8);
1209       __ jcc(Assembler::lessEqual, L_loop);
1210       __ subptr(qword_count, 4);  // sub(8) and add(4)
1211       __ jccb(Assembler::greater, L_end);
1212       // Copy trailing 32 bytes
1213       if (UseAVX >= 2) {
1214         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1215         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1216       } else {
1217         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1218         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1219         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1220         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1221       }
1222       __ addptr(qword_count, 4);
1223       __ BIND(L_end);
1224       if (UseAVX >= 2) {
1225         // clean upper bits of YMM registers
1226         __ vpxor(xmm0, xmm0);
1227         __ vpxor(xmm1, xmm1);
1228       }
1229     } else {
1230       // Copy 32-bytes per iteration
1231       __ BIND(L_loop);
1232       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1233       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1234       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1235       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1236       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1237       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1238       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1239       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1240 
1241       __ BIND(L_copy_bytes);
1242       __ addptr(qword_count, 4);
1243       __ jcc(Assembler::lessEqual, L_loop);
1244     }
1245     __ subptr(qword_count, 4);
1246     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1247   }
1248 
1249   // Copy big chunks backward
1250   //
1251   // Inputs:
  //   from         - source array address
  //   dest         - destination array address
  //   qword_count  - 64-bit element count
  //   to           - scratch
  //   L_copy_bytes - entry label
  //   L_copy_8_bytes - exit label
1258   //
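  // Callers enter at L_copy_bytes with a positive qword_count that is stepped down;
  // addresses are formed as from/dest + qword_count * 8, so the copy proceeds from
  // the high end of the arrays toward their base addresses.
  //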
1259   void copy_bytes_backward(Register from, Register dest,
1260                               Register qword_count, Register to,
1261                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1262     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1263     Label L_loop;
1264     __ align(OptoLoopAlignment);
1265     if (UseUnalignedLoadStores) {
1266       Label L_end;
1267       __ BIND(L_loop);
1268       if (UseAVX >= 2) {
1269         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1270         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1271         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1272         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1273       } else {
1274         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1275         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1276         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1277         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1278         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1279         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1280         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1281         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1282       }
1283 
1284       __ BIND(L_copy_bytes);
1285       __ subptr(qword_count, 8);
1286       __ jcc(Assembler::greaterEqual, L_loop);
1287 
1288       __ addptr(qword_count, 4);  // add(8) and sub(4)
1289       __ jccb(Assembler::less, L_end);
1290       // Copy trailing 32 bytes
1291       if (UseAVX >= 2) {
1292         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1293         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1294       } else {
1295         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1296         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1297         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1298         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1299       }
1300       __ subptr(qword_count, 4);
1301       __ BIND(L_end);
1302       if (UseAVX >= 2) {
1303         // clean upper bits of YMM registers
1304         __ vpxor(xmm0, xmm0);
1305         __ vpxor(xmm1, xmm1);
1306       }
1307     } else {
1308       // Copy 32-bytes per iteration
1309       __ BIND(L_loop);
1310       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1311       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1312       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1313       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1314       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1315       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1316       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1317       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1318 
1319       __ BIND(L_copy_bytes);
1320       __ subptr(qword_count, 4);
1321       __ jcc(Assembler::greaterEqual, L_loop);
1322     }
1323     __ addptr(qword_count, 4);
1324     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1325   }
1326 
1327 #ifndef PRODUCT
    int& get_profile_ctr(int shift) {
      if (0 == shift)
        return SharedRuntime::_jbyte_array_copy_ctr;
      else if (1 == shift)
        return SharedRuntime::_jshort_array_copy_ctr;
      else if (2 == shift)
        return SharedRuntime::_jint_array_copy_ctr;
      else
        return SharedRuntime::_jlong_array_copy_ctr;
    }
1338 #endif
1339 
1340   void setup_argument_regs(BasicType type) {
1341     if (type == T_BYTE || type == T_SHORT) {
1342       setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1343                         // r9 and r10 may be used to save non-volatile registers
1344     } else {
1345       setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1346                                      // r9 is used to save r15_thread
1347     }
1348   }
1349 
1350   void restore_argument_regs(BasicType type) {
1351     if (type == T_BYTE || type == T_SHORT) {
1352       restore_arg_regs();
1353     } else {
1354       restore_arg_regs_using_thread();
1355     }
1356   }
1357 
1358 #if COMPILER2_OR_JVMCI
  // Note: The following rules apply to the AVX3-optimized arraycopy stubs:
  // - If the target supports the AVX3 features (BW+VL+F), the implementation uses 32-byte
  //   vectors (YMMs) for both the special cases (various small block sizes) and the aligned
  //   copy loop. This is the default configuration.
  // - If the copy length is above AVX3Threshold, the implementation uses 64-byte vectors (ZMMs)
  //   for the main copy loop (and subsequent tail), since the bulk of the cycles is spent there.
  // - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS is observed to perform
  //   better for disjoint copies, while for conjoint/backward copies the vector-based copy
  //   performs better.
  // - If the user sets AVX3Threshold=0, the special cases for small block sizes operate over
  //   64-byte vector registers (ZMMs).
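  //
  // For example, per the threshold[] table in generate_disjoint_copy_avx3_masked below,
  // with AVX3Threshold != 0 and MaxVectorSize == 64 a disjoint byte copy of at least
  // 4096 elements is routed to the 64-byte (ZMM) main loop, while shorter copies stay
  // on the 32-byte (YMM) path.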
1370 
1371   // Inputs:
1372   //   c_rarg0   - source array address
1373   //   c_rarg1   - destination array address
1374   //   c_rarg2   - element count, treated as ssize_t, can be zero
1375   //
1376   //
1377   // Side Effects:
1378   //   disjoint_copy_avx3_masked is set to the no-overlap entry point
1379   //   used by generate_conjoint_[byte/int/short/long]_copy().
1380   //
1381 
1382   address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1383                                              bool aligned, bool is_oop, bool dest_uninitialized) {
1384     __ align(CodeEntryAlignment);
1385     StubCodeMark mark(this, "StubRoutines", name);
1386     address start = __ pc();
1387 
1388     bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
1389     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1390     Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1391     const Register from        = rdi;  // source array address
1392     const Register to          = rsi;  // destination array address
1393     const Register count       = rdx;  // elements count
1394     const Register temp1       = r8;
1395     const Register temp2       = r11;
1396     const Register temp3       = rax;
1397     const Register temp4       = rcx;
1398     // End pointers are inclusive, and if count is not zero they point
1399     // to the last unit copied:  end_to[0] := end_from[0]
1400 
1401     __ enter(); // required for proper stackwalking of RuntimeStub frame
1402     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1403 
1404     if (entry != NULL) {
1405       *entry = __ pc();
1406        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1407       BLOCK_COMMENT("Entry:");
1408     }
1409 
1410     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1411     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1412 
1413     setup_argument_regs(type);
1414 
1415     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1416     if (dest_uninitialized) {
1417       decorators |= IS_DEST_UNINITIALIZED;
1418     }
1419     if (aligned) {
1420       decorators |= ARRAYCOPY_ALIGNED;
1421     }
1422     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1423     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1424 
1425     {
1426       // Type(shift)           byte(0), short(1), int(2),   long(3)
1427       int loop_size[]        = { 192,     96,       48,      24};
1428       int threshold[]        = { 4096,    2048,     1024,    512};
1429 
1430       // UnsafeCopyMemory page error: continue after ucm
1431       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1432       // 'from', 'to' and 'count' are now valid
1433 
1434       // temp1 holds the remaining count and temp4 holds the running count used to compute
1435       // the next address offset into the to/from addresses (temp4 * scale).
1436       __ mov64(temp4, 0);
1437       __ movq(temp1, count);
1438 
1439       // Zero length check.
1440       __ BIND(L_tail);
1441       __ cmpq(temp1, 0);
1442       __ jcc(Assembler::lessEqual, L_exit);
1443 
1444       // Special cases using 32 byte [masked] vector copy operations.
1445       __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1446                                       temp4, temp3, use64byteVector, L_entry, L_exit);
1447 
1448       // PRE-MAIN-POST loop for aligned copy.
1449       __ BIND(L_entry);
1450 
1451       if (AVX3Threshold != 0) {
1452         __ cmpq(count, threshold[shift]);
1453         if (MaxVectorSize == 64) {
1454           // Copy using 64 byte vectors.
1455           __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1456         } else {
1457           assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
1458           // REP MOVS offers a faster copy path.
1459           __ jcc(Assembler::greaterEqual, L_repmovs);
1460         }
1461       }
1462 
1463       if (MaxVectorSize < 64  || AVX3Threshold != 0) {
1464         // Partial copy to make dst address 32 byte aligned.
1465         __ movq(temp2, to);
1466         __ andq(temp2, 31);
1467         __ jcc(Assembler::equal, L_main_pre_loop);
1468 
1469         __ negptr(temp2);
1470         __ addq(temp2, 32);
1471         if (shift) {
1472           __ shrq(temp2, shift);
1473         }
1474         __ movq(temp3, temp2);
1475         __ copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
1476         __ movq(temp4, temp2);
1477         __ movq(temp1, count);
1478         __ subq(temp1, temp2);
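        // Worked example for the alignment sequence above (illustration only): for an int copy
        // (shift == 2), if 'to' is 20 bytes past a 32 byte boundary (to & 31 == 20), then
        // temp2 = 32 - 20 = 12 bytes = 3 elements. Those 3 elements are copied with a masked
        // 32 byte copy, temp4 (running count) becomes 3, temp1 (remaining count) becomes
        // count - 3, and subsequent main-loop stores are 32 byte aligned.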
1479 
1480         __ cmpq(temp1, loop_size[shift]);
1481         __ jcc(Assembler::less, L_tail);
1482 
1483         __ BIND(L_main_pre_loop);
1484         __ subq(temp1, loop_size[shift]);
1485 
1486         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1487         __ align32();
1488         __ BIND(L_main_loop);
1489            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
1490            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
1491            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128);
1492            __ addptr(temp4, loop_size[shift]);
1493            __ subq(temp1, loop_size[shift]);
1494            __ jcc(Assembler::greater, L_main_loop);
1495 
1496         __ addq(temp1, loop_size[shift]);
1497 
1498         // Tail loop.
1499         __ jmp(L_tail);
1500 
1501         __ BIND(L_repmovs);
1502           __ movq(temp2, temp1);
1503           // Swap to (RSI) and from (RDI) addresses to comply with REP MOVS semantics.
1504           __ movq(temp3, to);
1505           __ movq(to,  from);
1506           __ movq(from, temp3);
1507           // Save to/from for restoration post rep_mov.
1508           __ movq(temp1, to);
1509           __ movq(temp3, from);
1510           if (shift < 3) {
1511             __ shrq(temp2, 3-shift);     // quad word count
1512           }
1513           __ movq(temp4, temp2);         // move quad word count into temp4 (RCX).
1514           __ rep_mov();
1515           __ shlq(temp2, 3);             // convert quad words into byte count.
1516           if (shift) {
1517             __ shrq(temp2, shift);       // type specific count.
1518           }
1519           // Restore original addresses in to/from.
1520           __ movq(to, temp3);
1521           __ movq(from, temp1);
1522           __ movq(temp4, temp2);
1523           __ movq(temp1, count);
1524           __ subq(temp1, temp2);         // trailing part (less than a quad word size).
1525           __ jmp(L_tail);
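          // Worked example for the REP MOVS sequence above (illustration only): a byte copy
          // (shift == 0) of 4100 elements reaches here with temp1 == count == 4100.
          // temp2 = 4100 >> 3 = 512 qwords are copied by REP MOVS (4096 bytes), temp2 is
          // converted back to 512 << 3 = 4096 elements, so temp4 (running count) = 4096 and
          // temp1 (remaining) = 4100 - 4096 = 4, which the code at L_tail finishes with a
          // masked vector copy.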
1526       }
1527 
1528       if (MaxVectorSize > 32) {
1529         __ BIND(L_pre_main_post_64);
1530         // Partial copy to make dst address 64 byte aligned.
1531         __ movq(temp2, to);
1532         __ andq(temp2, 63);
1533         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1534 
1535         __ negptr(temp2);
1536         __ addq(temp2, 64);
1537         if (shift) {
1538           __ shrq(temp2, shift);
1539         }
1540         __ movq(temp3, temp2);
1541         __ copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
1542         __ movq(temp4, temp2);
1543         __ movq(temp1, count);
1544         __ subq(temp1, temp2);
1545 
1546         __ cmpq(temp1, loop_size[shift]);
1547         __ jcc(Assembler::less, L_tail64);
1548 
1549         __ BIND(L_main_pre_loop_64bytes);
1550         __ subq(temp1, loop_size[shift]);
1551 
1552         // Main loop with aligned copy block size of 192 bytes at
1553         // 64 byte copy granularity.
1554         __ align32();
1555         __ BIND(L_main_loop_64bytes);
1556            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
1557            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
1558            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
1559            __ addptr(temp4, loop_size[shift]);
1560            __ subq(temp1, loop_size[shift]);
1561            __ jcc(Assembler::greater, L_main_loop_64bytes);
1562 
1563         __ addq(temp1, loop_size[shift]);
1564         // Zero length check.
1565         __ jcc(Assembler::lessEqual, L_exit);
1566 
1567         __ BIND(L_tail64);
1568 
1569         // Tail handling using 64 byte [masked] vector copy operations.
1570         use64byteVector = true;
1571         __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1572                                         temp4, temp3, use64byteVector, L_entry, L_exit);
1573       }
1574       __ BIND(L_exit);
1575     }
1576 
1577     address ucme_exit_pc = __ pc();
1578     // When called from generic_arraycopy, r11 contains specific values
1579     // used during the arraycopy epilogue, so re-initialize r11 here.
1580     if (is_oop) {
1581       __ movq(r11, shift == 3 ? count : to);
1582     }
1583     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1584     restore_argument_regs(type);
1585     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1586     __ xorptr(rax, rax); // return 0
1587     __ vzeroupper();
1588     __ leave(); // required for proper stackwalking of RuntimeStub frame
1589     __ ret(0);
1590     return start;
1591   }
1592 
1593   // Inputs:
1594   //   c_rarg0   - source array address
1595   //   c_rarg1   - destination array address
1596   //   c_rarg2   - element count, treated as ssize_t, can be zero
1597   //
1598   //
1599   address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1600                                              address nooverlap_target, bool aligned, bool is_oop,
1601                                              bool dest_uninitialized) {
1602     __ align(CodeEntryAlignment);
1603     StubCodeMark mark(this, "StubRoutines", name);
1604     address start = __ pc();
1605 
1606     bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
1607 
1608     Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1609     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1610     const Register from        = rdi;  // source array address
1611     const Register to          = rsi;  // destination array address
1612     const Register count       = rdx;  // elements count
1613     const Register temp1       = r8;
1614     const Register temp2       = rcx;
1615     const Register temp3       = r11;
1616     const Register temp4       = rax;
1617     // End pointers are inclusive, and if count is not zero they point
1618     // to the last unit copied:  end_to[0] := end_from[0]
1619 
1620     __ enter(); // required for proper stackwalking of RuntimeStub frame
1621     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1622 
1623     if (entry != NULL) {
1624       *entry = __ pc();
1625        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1626       BLOCK_COMMENT("Entry:");
1627     }
1628 
1629     array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
1630 
1631     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1632     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1633 
1634     setup_argument_regs(type);
1635 
1636     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1637     if (dest_uninitialized) {
1638       decorators |= IS_DEST_UNINITIALIZED;
1639     }
1640     if (aligned) {
1641       decorators |= ARRAYCOPY_ALIGNED;
1642     }
1643     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1644     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1645     {
1646       // Type(shift)       byte(0), short(1), int(2),   long(3)
1647       int loop_size[]   = { 192,     96,       48,      24};
1648       int threshold[]   = { 4096,    2048,     1024,    512};
1649 
1650       // UnsafeCopyMemory page error: continue after ucm
1651       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1652       // 'from', 'to' and 'count' are now valid
1653 
1654       // temp1 holds remaining count.
1655       __ movq(temp1, count);
1656 
1657       // Zero length check.
1658       __ BIND(L_tail);
1659       __ cmpq(temp1, 0);
1660       __ jcc(Assembler::lessEqual, L_exit);
1661 
1662       __ mov64(temp2, 0);
1663       __ movq(temp3, temp1);
1664       // Special cases using 32 byte [masked] vector copy operations.
1665       __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1666                                                temp4, use64byteVector, L_entry, L_exit);
1667 
1668       // PRE-MAIN-POST loop for aligned copy.
1669       __ BIND(L_entry);
1670 
1671       if (MaxVectorSize > 32 && AVX3Threshold != 0) {
1672         __ cmpq(temp1, threshold[shift]);
1673         __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1674       }
1675 
1676       if (MaxVectorSize < 64  || AVX3Threshold != 0) {
1677         // Partial copy to make dst address 32 byte aligned.
1678         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1679         __ andq(temp2, 31);
1680         __ jcc(Assembler::equal, L_main_pre_loop);
1681 
1682         if (shift) {
1683           __ shrq(temp2, shift);
1684         }
1685         __ subq(temp1, temp2);
1686         __ copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
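        // Worked example for the alignment sequence above (illustration only): for an int copy
        // (shift == 2) with to == 0x1000 and 100 remaining elements, the destination end
        // address is 0x1000 + 400 = 0x1190 and 0x1190 & 31 == 16 bytes == 4 elements. The top
        // 4 elements are copied first with a masked copy, leaving 96 elements whose destination
        // end (0x1180) is 32 byte aligned for the backward main loop.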
1687 
1688         __ cmpq(temp1, loop_size[shift]);
1689         __ jcc(Assembler::less, L_tail);
1690 
1691         __ BIND(L_main_pre_loop);
1692 
1693         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1694         __ align32();
1695         __ BIND(L_main_loop);
1696            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
1697            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
1698            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192);
1699            __ subptr(temp1, loop_size[shift]);
1700            __ cmpq(temp1, loop_size[shift]);
1701            __ jcc(Assembler::greater, L_main_loop);
1702 
1703         // Tail loop.
1704         __ jmp(L_tail);
1705       }
1706 
1707       if (MaxVectorSize > 32) {
1708         __ BIND(L_pre_main_post_64);
1709         // Partial copy to make dst address 64 byte aligned.
1710         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1711         __ andq(temp2, 63);
1712         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1713 
1714         if (shift) {
1715           __ shrq(temp2, shift);
1716         }
1717         __ subq(temp1, temp2);
1718         __ copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
1719 
1720         __ cmpq(temp1, loop_size[shift]);
1721         __ jcc(Assembler::less, L_tail64);
1722 
1723         __ BIND(L_main_pre_loop_64bytes);
1724 
1725         // Main loop with aligned copy block size of 192 bytes at
1726         // 64 byte copy granularity.
1727         __ align32();
1728         __ BIND(L_main_loop_64bytes);
1729            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
1730            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
1731            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
1732            __ subq(temp1, loop_size[shift]);
1733            __ cmpq(temp1, loop_size[shift]);
1734            __ jcc(Assembler::greater, L_main_loop_64bytes);
1735 
1736         // Zero length check.
1737         __ cmpq(temp1, 0);
1738         __ jcc(Assembler::lessEqual, L_exit);
1739 
1740         __ BIND(L_tail64);
1741 
1742         // Tail handling using 64 byte [masked] vector copy operations.
1743         use64byteVector = true;
1744         __ mov64(temp2, 0);
1745         __ movq(temp3, temp1);
1746         __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1747                                                  temp4, use64byteVector, L_entry, L_exit);
1748       }
1749       __ BIND(L_exit);
1750     }
1751     address ucme_exit_pc = __ pc();
1752     // When called from generic_arraycopy, r11 contains specific values
1753     // used during the arraycopy epilogue, so re-initialize r11 here.
1754     if (is_oop) {
1755       __ movq(r11, count);
1756     }
1757     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1758     restore_argument_regs(type);
1759     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1760     __ xorptr(rax, rax); // return 0
1761     __ vzeroupper();
1762     __ leave(); // required for proper stackwalking of RuntimeStub frame
1763     __ ret(0);
1764     return start;
1765   }
1766 #endif // COMPILER2_OR_JVMCI
1767 
1768 
1769   // Arguments:
1770   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1771   //             ignored
1772   //   name    - stub name string
1773   //
1774   // Inputs:
1775   //   c_rarg0   - source array address
1776   //   c_rarg1   - destination array address
1777   //   c_rarg2   - element count, treated as ssize_t, can be zero
1778   //
1779   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1780   // we let the hardware handle it.  The one to eight bytes within words,
1781   // dwords or qwords that span cache line boundaries will still be loaded
1782   // and stored atomically.
1783   //
1784   // Side Effects:
1785   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1786   //   used by generate_conjoint_byte_copy().
1787   //
1788   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1789 #if COMPILER2_OR_JVMCI
1790     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1791        return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
1792                                                  aligned, false, false);
1793     }
1794 #endif
1795     __ align(CodeEntryAlignment);
1796     StubCodeMark mark(this, "StubRoutines", name);
1797     address start = __ pc();
1798 
1799     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1800     Label L_copy_byte, L_exit;
1801     const Register from        = rdi;  // source array address
1802     const Register to          = rsi;  // destination array address
1803     const Register count       = rdx;  // elements count
1804     const Register byte_count  = rcx;
1805     const Register qword_count = count;
1806     const Register end_from    = from; // source array end address
1807     const Register end_to      = to;   // destination array end address
1808     // End pointers are inclusive, and if count is not zero they point
1809     // to the last unit copied:  end_to[0] := end_from[0]
1810 
1811     __ enter(); // required for proper stackwalking of RuntimeStub frame
1812     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1813 
1814     if (entry != NULL) {
1815       *entry = __ pc();
1816        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1817       BLOCK_COMMENT("Entry:");
1818     }
1819 
1820     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1821                       // r9 and r10 may be used to save non-volatile registers
1822 
1823     {
1824       // UnsafeCopyMemory page error: continue after ucm
1825       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1826       // 'from', 'to' and 'count' are now valid
1827       __ movptr(byte_count, count);
1828       __ shrptr(count, 3); // count => qword_count
1829 
1830       // Copy from low to high addresses.  Use 'to' as scratch.
1831       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1832       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1833       __ negptr(qword_count); // make the count negative
1834       __ jmp(L_copy_bytes);
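      // Worked example for the setup above (illustration only): for count == 13 bytes,
      // byte_count == 13 and qword_count == 1. end_from/end_to point at the last (only) qword,
      // qword_count is negated to -1, and the qword loop below copies bytes 0..7 via
      // Address(end_from, qword_count, times_8, 8). The trailing 5 bytes are then handled by
      // the bit tests on byte_count: bit 4 copies bytes 8..11 as a dword, bit 2 is clear, and
      // bit 1 copies byte 12.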
1835 
1836       // Copy trailing qwords
1837     __ BIND(L_copy_8_bytes);
1838       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1839       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1840       __ increment(qword_count);
1841       __ jcc(Assembler::notZero, L_copy_8_bytes);
1842 
1843       // Check for and copy trailing dword
1844     __ BIND(L_copy_4_bytes);
1845       __ testl(byte_count, 4);
1846       __ jccb(Assembler::zero, L_copy_2_bytes);
1847       __ movl(rax, Address(end_from, 8));
1848       __ movl(Address(end_to, 8), rax);
1849 
1850       __ addptr(end_from, 4);
1851       __ addptr(end_to, 4);
1852 
1853       // Check for and copy trailing word
1854     __ BIND(L_copy_2_bytes);
1855       __ testl(byte_count, 2);
1856       __ jccb(Assembler::zero, L_copy_byte);
1857       __ movw(rax, Address(end_from, 8));
1858       __ movw(Address(end_to, 8), rax);
1859 
1860       __ addptr(end_from, 2);
1861       __ addptr(end_to, 2);
1862 
1863       // Check for and copy trailing byte
1864     __ BIND(L_copy_byte);
1865       __ testl(byte_count, 1);
1866       __ jccb(Assembler::zero, L_exit);
1867       __ movb(rax, Address(end_from, 8));
1868       __ movb(Address(end_to, 8), rax);
1869     }
1870   __ BIND(L_exit);
1871     address ucme_exit_pc = __ pc();
1872     restore_arg_regs();
1873     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1874     __ xorptr(rax, rax); // return 0
1875     __ vzeroupper();
1876     __ leave(); // required for proper stackwalking of RuntimeStub frame
1877     __ ret(0);
1878 
1879     {
1880       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
1881       // Copy in multi-byte chunks
1882       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1883       __ jmp(L_copy_4_bytes);
1884     }
1885     return start;
1886   }
1887 
1888   // Arguments:
1889   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1890   //             ignored
1891   //   name    - stub name string
1892   //
1893   // Inputs:
1894   //   c_rarg0   - source array address
1895   //   c_rarg1   - destination array address
1896   //   c_rarg2   - element count, treated as ssize_t, can be zero
1897   //
1898   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1899   // we let the hardware handle it.  The one to eight bytes within words,
1900   // dwords or qwords that span cache line boundaries will still be loaded
1901   // and stored atomically.
1902   //
1903   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1904                                       address* entry, const char *name) {
1905 #if COMPILER2_OR_JVMCI
1906     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1907        return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
1908                                                  nooverlap_target, aligned, false, false);
1909     }
1910 #endif
1911     __ align(CodeEntryAlignment);
1912     StubCodeMark mark(this, "StubRoutines", name);
1913     address start = __ pc();
1914 
1915     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1916     const Register from        = rdi;  // source array address
1917     const Register to          = rsi;  // destination array address
1918     const Register count       = rdx;  // elements count
1919     const Register byte_count  = rcx;
1920     const Register qword_count = count;
1921 
1922     __ enter(); // required for proper stackwalking of RuntimeStub frame
1923     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1924 
1925     if (entry != NULL) {
1926       *entry = __ pc();
1927       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1928       BLOCK_COMMENT("Entry:");
1929     }
1930 
1931     array_overlap_test(nooverlap_target, Address::times_1);
1932     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1933                       // r9 and r10 may be used to save non-volatile registers
1934 
1935     {
1936       // UnsafeCopyMemory page error: continue after ucm
1937       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1938       // 'from', 'to' and 'count' are now valid
1939       __ movptr(byte_count, count);
1940       __ shrptr(count, 3);   // count => qword_count
1941 
1942       // Copy from high to low addresses.
1943 
1944       // Check for and copy trailing byte
1945       __ testl(byte_count, 1);
1946       __ jcc(Assembler::zero, L_copy_2_bytes);
1947       __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1948       __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1949       __ decrement(byte_count); // Adjust for possible trailing word
1950 
1951       // Check for and copy trailing word
1952     __ BIND(L_copy_2_bytes);
1953       __ testl(byte_count, 2);
1954       __ jcc(Assembler::zero, L_copy_4_bytes);
1955       __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1956       __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1957 
1958       // Check for and copy trailing dword
1959     __ BIND(L_copy_4_bytes);
1960       __ testl(byte_count, 4);
1961       __ jcc(Assembler::zero, L_copy_bytes);
1962       __ movl(rax, Address(from, qword_count, Address::times_8));
1963       __ movl(Address(to, qword_count, Address::times_8), rax);
1964       __ jmp(L_copy_bytes);
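      // Worked example for the backward tail above (illustration only): for count == 15 bytes,
      // qword_count == 1. Bit 0 of byte_count copies byte 14 and byte_count is decremented to
      // 14, so bit 1 then copies the word at bytes 12..13; bit 2 copies the dword at bytes
      // 8..11 (qword_count * 8); the remaining qword (bytes 0..7) is copied backward starting
      // at L_copy_bytes.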
1965 
1966       // Copy trailing qwords
1967     __ BIND(L_copy_8_bytes);
1968       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1969       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1970       __ decrement(qword_count);
1971       __ jcc(Assembler::notZero, L_copy_8_bytes);
1972     }
1973     restore_arg_regs();
1974     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1975     __ xorptr(rax, rax); // return 0
1976     __ vzeroupper();
1977     __ leave(); // required for proper stackwalking of RuntimeStub frame
1978     __ ret(0);
1979 
1980     {
1981       // UnsafeCopyMemory page error: continue after ucm
1982       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1983       // Copy in multi-byte chunks
1984       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1985     }
1986     restore_arg_regs();
1987     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1988     __ xorptr(rax, rax); // return 0
1989     __ vzeroupper();
1990     __ leave(); // required for proper stackwalking of RuntimeStub frame
1991     __ ret(0);
1992 
1993     return start;
1994   }
1995 
1996   // Arguments:
1997   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1998   //             ignored
1999   //   name    - stub name string
2000   //
2001   // Inputs:
2002   //   c_rarg0   - source array address
2003   //   c_rarg1   - destination array address
2004   //   c_rarg2   - element count, treated as ssize_t, can be zero
2005   //
2006   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2007   // let the hardware handle it.  The two or four words within dwords
2008   // or qwords that span cache line boundaries will still be loaded
2009   // and stored atomically.
2010   //
2011   // Side Effects:
2012   //   disjoint_short_copy_entry is set to the no-overlap entry point
2013   //   used by generate_conjoint_short_copy().
2014   //
2015   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
2016 #if COMPILER2_OR_JVMCI
2017     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2018        return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
2019                                                  aligned, false, false);
2020     }
2021 #endif
2022 
2023     __ align(CodeEntryAlignment);
2024     StubCodeMark mark(this, "StubRoutines", name);
2025     address start = __ pc();
2026 
2027     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
2028     const Register from        = rdi;  // source array address
2029     const Register to          = rsi;  // destination array address
2030     const Register count       = rdx;  // elements count
2031     const Register word_count  = rcx;
2032     const Register qword_count = count;
2033     const Register end_from    = from; // source array end address
2034     const Register end_to      = to;   // destination array end address
2035     // End pointers are inclusive, and if count is not zero they point
2036     // to the last unit copied:  end_to[0] := end_from[0]
2037 
2038     __ enter(); // required for proper stackwalking of RuntimeStub frame
2039     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2040 
2041     if (entry != NULL) {
2042       *entry = __ pc();
2043       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2044       BLOCK_COMMENT("Entry:");
2045     }
2046 
2047     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2048                       // r9 and r10 may be used to save non-volatile registers
2049 
2050     {
2051       // UnsafeCopyMemory page error: continue after ucm
2052       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2053       // 'from', 'to' and 'count' are now valid
2054       __ movptr(word_count, count);
2055       __ shrptr(count, 2); // count => qword_count
2056 
2057       // Copy from low to high addresses.  Use 'to' as scratch.
2058       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2059       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2060       __ negptr(qword_count);
2061       __ jmp(L_copy_bytes);
2062 
2063       // Copy trailing qwords
2064     __ BIND(L_copy_8_bytes);
2065       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2066       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2067       __ increment(qword_count);
2068       __ jcc(Assembler::notZero, L_copy_8_bytes);
2069 
2070       // Original 'dest' is trashed, so we can't use it as a
2071       // base register for a possible trailing word copy
2072 
2073       // Check for and copy trailing dword
2074     __ BIND(L_copy_4_bytes);
2075       __ testl(word_count, 2);
2076       __ jccb(Assembler::zero, L_copy_2_bytes);
2077       __ movl(rax, Address(end_from, 8));
2078       __ movl(Address(end_to, 8), rax);
2079 
2080       __ addptr(end_from, 4);
2081       __ addptr(end_to, 4);
2082 
2083       // Check for and copy trailing word
2084     __ BIND(L_copy_2_bytes);
2085       __ testl(word_count, 1);
2086       __ jccb(Assembler::zero, L_exit);
2087       __ movw(rax, Address(end_from, 8));
2088       __ movw(Address(end_to, 8), rax);
2089     }
2090   __ BIND(L_exit);
2091     address ucme_exit_pc = __ pc();
2092     restore_arg_regs();
2093     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2094     __ xorptr(rax, rax); // return 0
2095     __ vzeroupper();
2096     __ leave(); // required for proper stackwalking of RuntimeStub frame
2097     __ ret(0);
2098 
2099     {
2100       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
2101       // Copy in multi-byte chunks
2102       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2103       __ jmp(L_copy_4_bytes);
2104     }
2105 
2106     return start;
2107   }
2108 
2109   address generate_fill(BasicType t, bool aligned, const char *name) {
2110     __ align(CodeEntryAlignment);
2111     StubCodeMark mark(this, "StubRoutines", name);
2112     address start = __ pc();
2113 
2114     BLOCK_COMMENT("Entry:");
2115 
2116     const Register to       = c_rarg0;  // destination array address
2117     const Register value    = c_rarg1;  // value
2118     const Register count    = c_rarg2;  // elements count
2119 
2120     __ enter(); // required for proper stackwalking of RuntimeStub frame
2121 
2122     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
2123 
2124     __ vzeroupper();
2125     __ leave(); // required for proper stackwalking of RuntimeStub frame
2126     __ ret(0);
2127     return start;
2128   }
2129 
2130   // Arguments:
2131   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2132   //             ignored
2133   //   name    - stub name string
2134   //
2135   // Inputs:
2136   //   c_rarg0   - source array address
2137   //   c_rarg1   - destination array address
2138   //   c_rarg2   - element count, treated as ssize_t, can be zero
2139   //
2140   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2141   // let the hardware handle it.  The two or four words within dwords
2142   // or qwords that span cache line boundaries will still be loaded
2143   // and stored atomically.
2144   //
2145   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
2146                                        address *entry, const char *name) {
2147 #if COMPILER2_OR_JVMCI
2148     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2149        return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
2150                                                  nooverlap_target, aligned, false, false);
2151     }
2152 #endif
2153     __ align(CodeEntryAlignment);
2154     StubCodeMark mark(this, "StubRoutines", name);
2155     address start = __ pc();
2156 
2157     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2158     const Register from        = rdi;  // source array address
2159     const Register to          = rsi;  // destination array address
2160     const Register count       = rdx;  // elements count
2161     const Register word_count  = rcx;
2162     const Register qword_count = count;
2163 
2164     __ enter(); // required for proper stackwalking of RuntimeStub frame
2165     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2166 
2167     if (entry != NULL) {
2168       *entry = __ pc();
2169       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2170       BLOCK_COMMENT("Entry:");
2171     }
2172 
2173     array_overlap_test(nooverlap_target, Address::times_2);
2174     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2175                       // r9 and r10 may be used to save non-volatile registers
2176 
2177     {
2178       // UnsafeCopyMemory page error: continue after ucm
2179       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2180       // 'from', 'to' and 'count' are now valid
2181       __ movptr(word_count, count);
2182       __ shrptr(count, 2); // count => qword_count
2183 
2184       // Copy from high to low addresses.  Use 'to' as scratch.
2185 
2186       // Check for and copy trailing word
2187       __ testl(word_count, 1);
2188       __ jccb(Assembler::zero, L_copy_4_bytes);
2189       __ movw(rax, Address(from, word_count, Address::times_2, -2));
2190       __ movw(Address(to, word_count, Address::times_2, -2), rax);
2191 
2192      // Check for and copy trailing dword
2193     __ BIND(L_copy_4_bytes);
2194       __ testl(word_count, 2);
2195       __ jcc(Assembler::zero, L_copy_bytes);
2196       __ movl(rax, Address(from, qword_count, Address::times_8));
2197       __ movl(Address(to, qword_count, Address::times_8), rax);
2198       __ jmp(L_copy_bytes);
2199 
2200       // Copy trailing qwords
2201     __ BIND(L_copy_8_bytes);
2202       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2203       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2204       __ decrement(qword_count);
2205       __ jcc(Assembler::notZero, L_copy_8_bytes);
2206     }
2207     restore_arg_regs();
2208     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2209     __ xorptr(rax, rax); // return 0
2210     __ vzeroupper();
2211     __ leave(); // required for proper stackwalking of RuntimeStub frame
2212     __ ret(0);
2213 
2214     {
2215       // UnsafeCopyMemory page error: continue after ucm
2216       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2217       // Copy in multi-byte chunks
2218       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2219     }
2220     restore_arg_regs();
2221     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2222     __ xorptr(rax, rax); // return 0
2223     __ vzeroupper();
2224     __ leave(); // required for proper stackwalking of RuntimeStub frame
2225     __ ret(0);
2226 
2227     return start;
2228   }
2229 
2230   // Arguments:
2231   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2232   //             ignored
2233   //   is_oop  - true => oop array, so generate store check code
2234   //   name    - stub name string
2235   //
2236   // Inputs:
2237   //   c_rarg0   - source array address
2238   //   c_rarg1   - destination array address
2239   //   c_rarg2   - element count, treated as ssize_t, can be zero
2240   //
2241   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2242   // the hardware handle it.  The two dwords within qwords that span
2243   // cache line boundaries will still be loaded and stored atomically.
2244   //
2245   // Side Effects:
2246   //   disjoint_int_copy_entry is set to the no-overlap entry point
2247   //   used by generate_conjoint_int_oop_copy().
2248   //
2249   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2250                                          const char *name, bool dest_uninitialized = false) {
2251 #if COMPILER2_OR_JVMCI
2252     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2253        return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
2254                                                  aligned, is_oop, dest_uninitialized);
2255     }
2256 #endif
2257 
2258     __ align(CodeEntryAlignment);
2259     StubCodeMark mark(this, "StubRoutines", name);
2260     address start = __ pc();
2261 
2262     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2263     const Register from        = rdi;  // source array address
2264     const Register to          = rsi;  // destination array address
2265     const Register count       = rdx;  // elements count
2266     const Register dword_count = rcx;
2267     const Register qword_count = count;
2268     const Register end_from    = from; // source array end address
2269     const Register end_to      = to;   // destination array end address
2270     // End pointers are inclusive, and if count is not zero they point
2271     // to the last unit copied:  end_to[0] := end_from[0]
2272 
2273     __ enter(); // required for proper stackwalking of RuntimeStub frame
2274     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2275 
2276     if (entry != NULL) {
2277       *entry = __ pc();
2278       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2279       BLOCK_COMMENT("Entry:");
2280     }
2281 
2282     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2283                                    // r9 is used to save r15_thread
2284 
2285     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2286     if (dest_uninitialized) {
2287       decorators |= IS_DEST_UNINITIALIZED;
2288     }
2289     if (aligned) {
2290       decorators |= ARRAYCOPY_ALIGNED;
2291     }
2292 
2293     BasicType type = is_oop ? T_OBJECT : T_INT;
2294     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2295     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2296 
2297     {
2298       // UnsafeCopyMemory page error: continue after ucm
2299       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2300       // 'from', 'to' and 'count' are now valid
2301       __ movptr(dword_count, count);
2302       __ shrptr(count, 1); // count => qword_count
2303 
2304       // Copy from low to high addresses.  Use 'to' as scratch.
2305       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2306       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2307       __ negptr(qword_count);
2308       __ jmp(L_copy_bytes);
2309 
2310       // Copy trailing qwords
2311     __ BIND(L_copy_8_bytes);
2312       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2313       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2314       __ increment(qword_count);
2315       __ jcc(Assembler::notZero, L_copy_8_bytes);
2316 
2317       // Check for and copy trailing dword
2318     __ BIND(L_copy_4_bytes);
2319       __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2320       __ jccb(Assembler::zero, L_exit);
2321       __ movl(rax, Address(end_from, 8));
2322       __ movl(Address(end_to, 8), rax);
2323     }
2324   __ BIND(L_exit);
2325     address ucme_exit_pc = __ pc();
2326     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2327     restore_arg_regs_using_thread();
2328     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2329     __ vzeroupper();
2330     __ xorptr(rax, rax); // return 0
2331     __ leave(); // required for proper stackwalking of RuntimeStub frame
2332     __ ret(0);
2333 
2334     {
2335       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
2336       // Copy in multi-byte chunks
2337       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2338       __ jmp(L_copy_4_bytes);
2339     }
2340 
2341     return start;
2342   }
2343 
2344   // Arguments:
2345   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2346   //             ignored
2347   //   is_oop  - true => oop array, so generate store check code
2348   //   name    - stub name string
2349   //
2350   // Inputs:
2351   //   c_rarg0   - source array address
2352   //   c_rarg1   - destination array address
2353   //   c_rarg2   - element count, treated as ssize_t, can be zero
2354   //
2355   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2356   // the hardware handle it.  The two dwords within qwords that span
2357   // cache line boundaries will still be loaded and stored atomically.
2358   //
2359   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2360                                          address *entry, const char *name,
2361                                          bool dest_uninitialized = false) {
2362 #if COMPILER2_OR_JVMCI
2363     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2364        return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
2365                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
2366     }
2367 #endif
2368     __ align(CodeEntryAlignment);
2369     StubCodeMark mark(this, "StubRoutines", name);
2370     address start = __ pc();
2371 
2372     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2373     const Register from        = rdi;  // source array address
2374     const Register to          = rsi;  // destination array address
2375     const Register count       = rdx;  // elements count
2376     const Register dword_count = rcx;
2377     const Register qword_count = count;
2378 
2379     __ enter(); // required for proper stackwalking of RuntimeStub frame
2380     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2381 
2382     if (entry != NULL) {
2383       *entry = __ pc();
2384        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2385       BLOCK_COMMENT("Entry:");
2386     }
2387 
2388     array_overlap_test(nooverlap_target, Address::times_4);
2389     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2390                                    // r9 is used to save r15_thread
2391 
2392     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2393     if (dest_uninitialized) {
2394       decorators |= IS_DEST_UNINITIALIZED;
2395     }
2396     if (aligned) {
2397       decorators |= ARRAYCOPY_ALIGNED;
2398     }
2399 
2400     BasicType type = is_oop ? T_OBJECT : T_INT;
2401     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2402     // no registers are destroyed by this call
2403     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2404 
2405     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2406     {
2407       // UnsafeCopyMemory page error: continue after ucm
2408       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2409       // 'from', 'to' and 'count' are now valid
2410       __ movptr(dword_count, count);
2411       __ shrptr(count, 1); // count => qword_count
2412 
2413       // Copy from high to low addresses.  Use 'to' as scratch.
2414 
2415       // Check for and copy trailing dword
2416       __ testl(dword_count, 1);
2417       __ jcc(Assembler::zero, L_copy_bytes);
2418       __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2419       __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2420       __ jmp(L_copy_bytes);
2421 
2422       // Copy trailing qwords
2423     __ BIND(L_copy_8_bytes);
2424       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2425       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2426       __ decrement(qword_count);
2427       __ jcc(Assembler::notZero, L_copy_8_bytes);
2428     }
2429     if (is_oop) {
2430       __ jmp(L_exit);
2431     }
2432     restore_arg_regs_using_thread();
2433     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2434     __ xorptr(rax, rax); // return 0
2435     __ vzeroupper();
2436     __ leave(); // required for proper stackwalking of RuntimeStub frame
2437     __ ret(0);
2438 
2439     {
2440       // UnsafeCopyMemory page error: continue after ucm
2441       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2442       // Copy in multi-byte chunks
2443       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2444     }
2445 
2446   __ BIND(L_exit);
2447     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2448     restore_arg_regs_using_thread();
2449     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2450     __ xorptr(rax, rax); // return 0
2451     __ vzeroupper();
2452     __ leave(); // required for proper stackwalking of RuntimeStub frame
2453     __ ret(0);
2454 
2455     return start;
2456   }
2457 
2458   // Arguments:
2459   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2460   //             ignored
2461   //   is_oop  - true => oop array, so generate store check code
2462   //   name    - stub name string
2463   //
2464   // Inputs:
2465   //   c_rarg0   - source array address
2466   //   c_rarg1   - destination array address
2467   //   c_rarg2   - element count, treated as ssize_t, can be zero
2468   //
2469   // Side Effects:
2470   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2471   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2472   //
2473   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2474                                           const char *name, bool dest_uninitialized = false) {
2475 #if COMPILER2_OR_JVMCI
2476     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2477        return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
2478                                                  aligned, is_oop, dest_uninitialized);
2479     }
2480 #endif
2481     __ align(CodeEntryAlignment);
2482     StubCodeMark mark(this, "StubRoutines", name);
2483     address start = __ pc();
2484 
2485     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2486     const Register from        = rdi;  // source array address
2487     const Register to          = rsi;  // destination array address
2488     const Register qword_count = rdx;  // elements count
2489     const Register end_from    = from; // source array end address
2490     const Register end_to      = rcx;  // destination array end address
2491     const Register saved_count = r11;
2492     // End pointers are inclusive, and if count is not zero they point
2493     // to the last unit copied:  end_to[0] := end_from[0]
2494 
2495     __ enter(); // required for proper stackwalking of RuntimeStub frame
2496     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2497     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2498 
2499     if (entry != NULL) {
2500       *entry = __ pc();
2501       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2502       BLOCK_COMMENT("Entry:");
2503     }
2504 
2505     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2506                                      // r9 is used to save r15_thread
2507     // 'from', 'to' and 'qword_count' are now valid
2508 
2509     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2510     if (dest_uninitialized) {
2511       decorators |= IS_DEST_UNINITIALIZED;
2512     }
2513     if (aligned) {
2514       decorators |= ARRAYCOPY_ALIGNED;
2515     }
2516 
2517     BasicType type = is_oop ? T_OBJECT : T_LONG;
2518     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2519     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2520     {
2521       // UnsafeCopyMemory page error: continue after ucm
2522       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2523 
2524       // Copy from low to high addresses.  Use 'to' as scratch.
2525       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2526       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2527       __ negptr(qword_count);
2528       __ jmp(L_copy_bytes);
2529 
2530       // Copy trailing qwords
2531     __ BIND(L_copy_8_bytes);
2532       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2533       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2534       __ increment(qword_count);
2535       __ jcc(Assembler::notZero, L_copy_8_bytes);
2536     }
2537     if (is_oop) {
2538       __ jmp(L_exit);
2539     } else {
2540       restore_arg_regs_using_thread();
2541       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2542       __ xorptr(rax, rax); // return 0
2543       __ vzeroupper();
2544       __ leave(); // required for proper stackwalking of RuntimeStub frame
2545       __ ret(0);
2546     }
2547 
2548     {
2549       // UnsafeCopyMemory page error: continue after ucm
2550       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2551       // Copy in multi-byte chunks
2552       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2553     }
2554 
2555     __ BIND(L_exit);
2556     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2557     restore_arg_regs_using_thread();
2558     if (is_oop) {
2559       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2560     } else {
2561       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2562     }
2563     __ vzeroupper();
2564     __ xorptr(rax, rax); // return 0
2565     __ leave(); // required for proper stackwalking of RuntimeStub frame
2566     __ ret(0);
2567 
2568     return start;
2569   }
2570 
2571   // Arguments:
2572   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2573   //             ignored
2574   //   is_oop  - true => oop array, so generate store check code
2575   //   name    - stub name string
2576   //
2577   // Inputs:
2578   //   c_rarg0   - source array address
2579   //   c_rarg1   - destination array address
2580   //   c_rarg2   - element count, treated as ssize_t, can be zero
2581   //
2582   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2583                                           address nooverlap_target, address *entry,
2584                                           const char *name, bool dest_uninitialized = false) {
2585 #if COMPILER2_OR_JVMCI
2586     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2587        return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
2588                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
2589     }
2590 #endif
2591     __ align(CodeEntryAlignment);
2592     StubCodeMark mark(this, "StubRoutines", name);
2593     address start = __ pc();
2594 
2595     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2596     const Register from        = rdi;  // source array address
2597     const Register to          = rsi;  // destination array address
2598     const Register qword_count = rdx;  // elements count
2599     const Register saved_count = rcx;
2600 
2601     __ enter(); // required for proper stackwalking of RuntimeStub frame
2602     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2603 
2604     if (entry != NULL) {
2605       *entry = __ pc();
2606       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2607       BLOCK_COMMENT("Entry:");
2608     }
2609 
2610     array_overlap_test(nooverlap_target, Address::times_8);
2611     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2612                                    // r9 is used to save r15_thread
2613     // 'from', 'to' and 'qword_count' are now valid
2614 
2615     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2616     if (dest_uninitialized) {
2617       decorators |= IS_DEST_UNINITIALIZED;
2618     }
2619     if (aligned) {
2620       decorators |= ARRAYCOPY_ALIGNED;
2621     }
2622 
2623     BasicType type = is_oop ? T_OBJECT : T_LONG;
2624     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2625     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2626     {
2627       // UnsafeCopyMemory page error: continue after ucm
2628       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2629 
2630       __ jmp(L_copy_bytes);
2631 
2632       // Copy trailing qwords
2633     __ BIND(L_copy_8_bytes);
2634       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2635       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2636       __ decrement(qword_count);
2637       __ jcc(Assembler::notZero, L_copy_8_bytes);
2638     }
2639     if (is_oop) {
2640       __ jmp(L_exit);
2641     } else {
2642       restore_arg_regs_using_thread();
2643       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2644       __ xorptr(rax, rax); // return 0
2645       __ vzeroupper();
2646       __ leave(); // required for proper stackwalking of RuntimeStub frame
2647       __ ret(0);
2648     }
2649     {
2650       // UnsafeCopyMemory page error: continue after ucm
2651       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2652 
2653       // Copy in multi-byte chunks
2654       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2655     }
2656     __ BIND(L_exit);
2657     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2658     restore_arg_regs_using_thread();
2659     if (is_oop) {
2660       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2661     } else {
2662       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2663     }
2664     __ vzeroupper();
2665     __ xorptr(rax, rax); // return 0
2666     __ leave(); // required for proper stackwalking of RuntimeStub frame
2667     __ ret(0);
2668 
2669     return start;
2670   }
2671 
2672 
2673   // Helper for generating a dynamic type check.
2674   // Smashes no registers.
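  // Conceptually (a simplified sketch, not the exact generated code) the check is:
  //
  //   if (*(sub_klass + super_check_offset) == super_klass)         -> L_success  // fast path
  //   else if (super_check_offset != secondary-super-cache offset)  -> miss
  //   else scan sub_klass's secondary supers for super_klass:
  //        found -> L_success, not found -> fall through to L_miss.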
2675   void generate_type_check(Register sub_klass,
2676                            Register super_check_offset,
2677                            Register super_klass,
2678                            Label& L_success) {
2679     assert_different_registers(sub_klass, super_check_offset, super_klass);
2680 
2681     BLOCK_COMMENT("type_check:");
2682 
2683     Label L_miss;
2684 
2685     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2686                                      super_check_offset);
2687     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2688 
2689     // Fall through on failure!
2690     __ BIND(L_miss);
2691   }
2692 
2693   //
2694   //  Generate checkcasting array copy stub
2695   //
2696   //  Input:
2697   //    c_rarg0   - source array address
2698   //    c_rarg1   - destination array address
2699   //    c_rarg2   - element count, treated as ssize_t, can be zero
2700   //    c_rarg3   - size_t ckoff (super_check_offset)
2701   // not Win64
2702   //    c_rarg4   - oop ckval (super_klass)
2703   // Win64
2704   //    rsp+40    - oop ckval (super_klass)
2705   //
2706   //  Output:
2707   //    rax ==  0  -  success
2708   //    rax == -1^K - failure, where K is partial transfer count
2709   //
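  //  For example (illustration only), if the stub stops after successfully copying 5 elements,
  //  it returns rax == -1 ^ 5 == ~5 == -6, and the caller can recover the partial transfer
  //  count as ~rax == 5.
  //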
2710   address generate_checkcast_copy(const char *name, address *entry,
2711                                   bool dest_uninitialized = false) {
2712 
2713     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2714 
2715     // Input registers (after setup_arg_regs)
2716     const Register from        = rdi;   // source array address
2717     const Register to          = rsi;   // destination array address
2718     const Register length      = rdx;   // elements count
2719     const Register ckoff       = rcx;   // super_check_offset
2720     const Register ckval       = r8;    // super_klass
2721 
2722     // Registers used as temps (r13, r14 are save-on-entry)
2723     const Register end_from    = from;  // source array end address
2724     const Register end_to      = r13;   // destination array end address
2725     const Register count       = rdx;   // -(count_remaining)
2726     const Register r14_length  = r14;   // saved copy of length
2727     // End pointers are inclusive, and if length is not zero they point
2728     // to the last unit copied:  end_to[0] := end_from[0]
2729 
2730     const Register rax_oop    = rax;    // actual oop copied
2731     const Register r11_klass  = r11;    // oop._klass
2732 
2733     //---------------------------------------------------------------
2734     // Assembler stub will be used for this call to arraycopy
2735     // if the two arrays are subtypes of Object[] but the
2736     // destination array type is not equal to or a supertype
2737     // of the source type.  Each element must be separately
2738     // checked.
2739 
2740     __ align(CodeEntryAlignment);
2741     StubCodeMark mark(this, "StubRoutines", name);
2742     address start = __ pc();
2743 
2744     __ enter(); // required for proper stackwalking of RuntimeStub frame
2745 
2746 #ifdef ASSERT
2747     // caller guarantees that the arrays really are different
2748     // otherwise, we would have to make conjoint checks
2749     { Label L;
2750       array_overlap_test(L, TIMES_OOP);
2751       __ stop("checkcast_copy within a single array");
2752       __ bind(L);
2753     }
2754 #endif //ASSERT
2755 
2756     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2757                        // ckoff => rcx, ckval => r8
2758                        // r9 and r10 may be used to save non-volatile registers
2759 #ifdef _WIN64
2760     // last argument (#4) is on stack on Win64
2761     __ movptr(ckval, Address(rsp, 6 * wordSize));
2762 #endif
2763 
2764     // Caller of this entry point must set up the argument registers.
2765     if (entry != NULL) {
2766       *entry = __ pc();
2767       BLOCK_COMMENT("Entry:");
2768     }
2769 
2770     // allocate spill slots for r13, r14
2771     enum {
2772       saved_r13_offset,
2773       saved_r14_offset,
2774       saved_r10_offset,
2775       saved_rbp_offset
2776     };
2777     __ subptr(rsp, saved_rbp_offset * wordSize);
2778     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2779     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2780     __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2781 
2782 #ifdef ASSERT
2783       Label L2;
2784       __ get_thread(r14);
2785       __ cmpptr(r15_thread, r14);
2786       __ jcc(Assembler::equal, L2);
2787       __ stop("StubRoutines::checkcast_copy: r15_thread is modified");
2788       __ bind(L2);
2789 #endif // ASSERT
2790 
2791     // check that int operands are properly extended to size_t
2792     assert_clean_int(length, rax);
2793     assert_clean_int(ckoff, rax);
2794 
2795 #ifdef ASSERT
2796     BLOCK_COMMENT("assert consistent ckoff/ckval");
2797     // The ckoff and ckval must be mutually consistent,
2798     // even though caller generates both.
2799     { Label L;
2800       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2801       __ cmpl(ckoff, Address(ckval, sco_offset));
2802       __ jcc(Assembler::equal, L);
2803       __ stop("super_check_offset inconsistent");
2804       __ bind(L);
2805     }
2806 #endif //ASSERT
2807 
2808     // Loop-invariant addresses.  They are exclusive end pointers.
2809     Address end_from_addr(from, length, TIMES_OOP, 0);
2810     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2811     // Loop-variant addresses.  They assume post-incremented count < 0.
2812     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2813     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2814 
2815     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2816     if (dest_uninitialized) {
2817       decorators |= IS_DEST_UNINITIALIZED;
2818     }
2819 
2820     BasicType type = T_OBJECT;
2821     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2822     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2823 
2824     // Copy from low to high addresses, indexed from the end of each array.
2825     __ lea(end_from, end_from_addr);
2826     __ lea(end_to,   end_to_addr);
2827     __ movptr(r14_length, length);        // save a copy of the length
2828     assert(length == count, "");          // else fix next line:
2829     __ negptr(count);                     // negate and test the length
2830     __ jcc(Assembler::notZero, L_load_element);
2831 
2832     // Empty array:  Nothing to do.
2833     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2834     __ jmp(L_done);
2835 
2836     // ======== begin loop ========
2837     // (Loop is rotated; its entry is L_load_element.)
2838     // Loop control:
2839     //   for (count = -count; count != 0; count++)
2840     // Base pointers src, dst are biased by 8*(count-1), to last element.
2841     __ align(OptoLoopAlignment);
2842 
2843     __ BIND(L_store_element);
2844     __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW);  // store the oop
2845     __ increment(count);               // increment the count toward zero
2846     __ jcc(Assembler::zero, L_do_card_marks);
2847 
2848     // ======== loop entry is here ========
2849     __ BIND(L_load_element);
2850     __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2851     __ testptr(rax_oop, rax_oop);
2852     __ jcc(Assembler::zero, L_store_element);
2853 
2854     __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
2855     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2856     // ======== end loop ========
2857 
2858     // It was a real error; we must depend on the caller to finish the job.
2859     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2860     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2861     // and report their number to the caller.
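    // (Illustrative note: rax is set to -1 ^ K == ~K below, so a caller can
    //  recover the partial transfer count as K = ~rax and knows that exactly
    //  the first K elements were copied.)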
2862     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2863     Label L_post_barrier;
2864     __ addptr(r14_length, count);     // K = (original - remaining) oops
2865     __ movptr(rax, r14_length);       // save the value
2866     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2867     __ jccb(Assembler::notZero, L_post_barrier);
2868     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2869 
2870     // Come here on success only.
2871     __ BIND(L_do_card_marks);
2872     __ xorptr(rax, rax);              // return 0 on success
2873 
2874     __ BIND(L_post_barrier);
2875     bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2876 
2877     // Common exit point (success or failure).
2878     __ BIND(L_done);
2879     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2880     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2881     __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2882     restore_arg_regs();
2883     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2884     __ leave(); // required for proper stackwalking of RuntimeStub frame
2885     __ ret(0);
2886 
2887     return start;
2888   }
2889 
2890   //
2891   //  Generate 'unsafe' array copy stub
2892   //  Though just as safe as the other stubs, it takes an unscaled
2893   //  size_t argument instead of an element count.
2894   //
2895   //  Input:
2896   //    c_rarg0   - source array address
2897   //    c_rarg1   - destination array address
2898   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2899   //
2900   // Examines the alignment of the operands and dispatches
2901   // to a long, int, short, or byte copy loop.
2902   //
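  //  Dispatch sketch (illustrative only; mirrors the alignment tests below):
  //
  //    size_t bits = (size_t)from | (size_t)to | size;
  //    if      ((bits & 7) == 0)  long_copy (from, to, size >> 3);
  //    else if ((bits & 3) == 0)  int_copy  (from, to, size >> 2);
  //    else if ((bits & 1) == 0)  short_copy(from, to, size >> 1);
  //    else                       byte_copy (from, to, size);
  //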
2903   address generate_unsafe_copy(const char *name,
2904                                address byte_copy_entry, address short_copy_entry,
2905                                address int_copy_entry, address long_copy_entry) {
2906 
2907     Label L_long_aligned, L_int_aligned, L_short_aligned;
2908 
2909     // Input registers (before setup_arg_regs)
2910     const Register from        = c_rarg0;  // source array address
2911     const Register to          = c_rarg1;  // destination array address
2912     const Register size        = c_rarg2;  // byte count (size_t)
2913 
2914     // Register used as a temp
2915     const Register bits        = rax;      // test copy of low bits
2916 
2917     __ align(CodeEntryAlignment);
2918     StubCodeMark mark(this, "StubRoutines", name);
2919     address start = __ pc();
2920 
2921     __ enter(); // required for proper stackwalking of RuntimeStub frame
2922 
2923     // bump this on entry, not on exit:
2924     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2925 
2926     __ mov(bits, from);
2927     __ orptr(bits, to);
2928     __ orptr(bits, size);
2929 
2930     __ testb(bits, BytesPerLong-1);
2931     __ jccb(Assembler::zero, L_long_aligned);
2932 
2933     __ testb(bits, BytesPerInt-1);
2934     __ jccb(Assembler::zero, L_int_aligned);
2935 
2936     __ testb(bits, BytesPerShort-1);
2937     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2938 
2939     __ BIND(L_short_aligned);
2940     __ shrptr(size, LogBytesPerShort); // size => short_count
2941     __ jump(RuntimeAddress(short_copy_entry));
2942 
2943     __ BIND(L_int_aligned);
2944     __ shrptr(size, LogBytesPerInt); // size => int_count
2945     __ jump(RuntimeAddress(int_copy_entry));
2946 
2947     __ BIND(L_long_aligned);
2948     __ shrptr(size, LogBytesPerLong); // size => qword_count
2949     __ jump(RuntimeAddress(long_copy_entry));
2950 
2951     return start;
2952   }
2953 
2954   // Perform range checks on the proposed arraycopy.
2955   // Kills temp, but nothing else.
2956   // Also, clean the sign bits of src_pos and dst_pos.
2957   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2958                               Register src_pos, // source position (c_rarg1)
2959                               Register dst,     // destination array oop (c_rarg2)
2960                               Register dst_pos, // destination position (c_rarg3)
2961                               Register length,
2962                               Register temp,
2963                               Label& L_failed) {
2964     BLOCK_COMMENT("arraycopy_range_checks:");
2965 
2966     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2967     __ movl(temp, length);
2968     __ addl(temp, src_pos);             // src_pos + length
2969     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2970     __ jcc(Assembler::above, L_failed);
2971 
2972     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2973     __ movl(temp, length);
2974     __ addl(temp, dst_pos);             // dst_pos + length
2975     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2976     __ jcc(Assembler::above, L_failed);
2977 
2978     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2979     // Move with sign extension can be used since they are positive.
2980     __ movslq(src_pos, src_pos);
2981     __ movslq(dst_pos, dst_pos);
2982 
2983     BLOCK_COMMENT("arraycopy_range_checks done");
2984   }
2985 
2986   //
2987   //  Generate generic array copy stubs
2988   //
2989   //  Input:
2990   //    c_rarg0    -  src oop
2991   //    c_rarg1    -  src_pos (32-bits)
2992   //    c_rarg2    -  dst oop
2993   //    c_rarg3    -  dst_pos (32-bits)
2994   // not Win64
2995   //    c_rarg4    -  element count (32-bits)
2996   // Win64
2997   //    rsp+40     -  element count (32-bits)
2998   //
2999   //  Output:
3000   //    rax ==  0  -  success
3001   //    rax == -1^K - failure, where K is partial transfer count
3002   //
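  //  Overall dispatch sketch (illustrative only, not generated code):
  //
  //    if (any of the argument checks (1)..(8) listed below fails)      return -1;
  //    if (src is an objArray) {
  //      if (dst is not an objArray)                                    return -1;
  //      if (src array type == dst array type or is a subtype of it)    jump to oop_copy_entry;
  //      else            jump to checkcast_copy_entry (per-element type checks);
  //    } else {                                       // primitive (type) arrays
  //      if (src klass != dst klass or src is not an array)             return -1;
  //      jump to byte/short/int/long copy based on log2_element_size;
  //    }
  //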
3003   address generate_generic_copy(const char *name,
3004                                 address byte_copy_entry, address short_copy_entry,
3005                                 address int_copy_entry, address oop_copy_entry,
3006                                 address long_copy_entry, address checkcast_copy_entry) {
3007 
3008     Label L_failed, L_failed_0, L_objArray;
3009     Label L_copy_shorts, L_copy_ints, L_copy_longs;
3010 
3011     // Input registers
3012     const Register src        = c_rarg0;  // source array oop
3013     const Register src_pos    = c_rarg1;  // source position
3014     const Register dst        = c_rarg2;  // destination array oop
3015     const Register dst_pos    = c_rarg3;  // destination position
3016 #ifndef _WIN64
3017     const Register length     = c_rarg4;
3018     const Register rklass_tmp = r9;  // load_klass
3019 #else
3020     const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
3021     const Register rklass_tmp = rdi;  // load_klass
3022 #endif
3023 
3024     { int modulus = CodeEntryAlignment;
3025       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
3026       int advance = target - (__ offset() % modulus);
3027       if (advance < 0)  advance += modulus;
3028       if (advance > 0)  __ nop(advance);
3029     }
3030     StubCodeMark mark(this, "StubRoutines", name);
3031 
3032     // Short-hop target to L_failed.  Makes for denser prologue code.
3033     __ BIND(L_failed_0);
3034     __ jmp(L_failed);
3035     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
3036 
3037     __ align(CodeEntryAlignment);
3038     address start = __ pc();
3039 
3040     __ enter(); // required for proper stackwalking of RuntimeStub frame
3041 
3042 #ifdef _WIN64
3043     __ push(rklass_tmp); // rdi is callee-save on Windows
3044 #endif
3045 
3046     // bump this on entry, not on exit:
3047     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
3048 
3049     //-----------------------------------------------------------------------
3050     // Assembler stub will be used for this call to arraycopy
3051     // if the following conditions are met:
3052     //
3053     // (1) src and dst must not be null.
3054     // (2) src_pos must not be negative.
3055     // (3) dst_pos must not be negative.
3056     // (4) length  must not be negative.
3057     // (5) src klass and dst klass should be the same and not NULL.
3058     // (6) src and dst should be arrays.
3059     // (7) src_pos + length must not exceed length of src.
3060     // (8) dst_pos + length must not exceed length of dst.
3061     //
3062 
3063     //  if (src == NULL) return -1;
3064     __ testptr(src, src);         // src oop
3065     size_t j1off = __ offset();
3066     __ jccb(Assembler::zero, L_failed_0);
3067 
3068     //  if (src_pos < 0) return -1;
3069     __ testl(src_pos, src_pos); // src_pos (32-bits)
3070     __ jccb(Assembler::negative, L_failed_0);
3071 
3072     //  if (dst == NULL) return -1;
3073     __ testptr(dst, dst);         // dst oop
3074     __ jccb(Assembler::zero, L_failed_0);
3075 
3076     //  if (dst_pos < 0) return -1;
3077     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3078     size_t j4off = __ offset();
3079     __ jccb(Assembler::negative, L_failed_0);
3080 
3081     // The first four tests are very dense code,
3082     // but not quite dense enough to put four
3083     // jumps in a 16-byte instruction fetch buffer.
3084     // That's good, because some branch predictors
3085     // do not like jumps so close together.
3086     // Make sure of this.
3087     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
3088 
3089     // registers used as temp
3090     const Register r11_length    = r11; // elements count to copy
3091     const Register r10_src_klass = r10; // array klass
3092 
3093     //  if (length < 0) return -1;
3094     __ movl(r11_length, length);        // length (elements count, 32-bits value)
3095     __ testl(r11_length, r11_length);
3096     __ jccb(Assembler::negative, L_failed_0);
3097 
3098     __ load_klass(r10_src_klass, src, rklass_tmp);
3099 #ifdef ASSERT
3100     //  assert(src->klass() != NULL);
3101     {
3102       BLOCK_COMMENT("assert klasses not null {");
3103       Label L1, L2;
3104       __ testptr(r10_src_klass, r10_src_klass);
3105       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
3106       __ bind(L1);
3107       __ stop("broken null klass");
3108       __ bind(L2);
3109       __ load_klass(rax, dst, rklass_tmp);
3110       __ cmpq(rax, 0);
3111       __ jcc(Assembler::equal, L1);     // this would be broken also
3112       BLOCK_COMMENT("} assert klasses not null done");
3113     }
3114 #endif
3115 
3116     // Load layout helper (32-bits)
3117     //
3118     //  |array_tag|     | header_size | element_type |     |log2_element_size|
3119     // 32        30    24            16              8     2                 0
3120     //
3121     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3122     //
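    // Illustrative decoding of the fields used below (shifts/masks from Klass):
    //   header_size_in_bytes = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
    //   log2_element_size    =  lh & _lh_log2_element_size_mask;
    //   is_array             =  lh < _lh_neutral_value;   // array layout helpers are negative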
3123 
3124     const int lh_offset = in_bytes(Klass::layout_helper_offset());
3125 
3126     // Handle objArrays completely differently...
3127     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3128     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3129     __ jcc(Assembler::equal, L_objArray);
3130 
3131     //  if (src->klass() != dst->klass()) return -1;
3132     __ load_klass(rax, dst, rklass_tmp);
3133     __ cmpq(r10_src_klass, rax);
3134     __ jcc(Assembler::notEqual, L_failed);
3135 
3136     const Register rax_lh = rax;  // layout helper
3137     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3138 
3139     //  if (!src->is_Array()) return -1;
3140     __ cmpl(rax_lh, Klass::_lh_neutral_value);
3141     __ jcc(Assembler::greaterEqual, L_failed);
3142 
3143     // At this point, it is known to be a typeArray (array_tag 0x3).
3144 #ifdef ASSERT
3145     {
3146       BLOCK_COMMENT("assert primitive array {");
3147       Label L;
3148       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
3149       __ jcc(Assembler::greaterEqual, L);
3150       __ stop("must be a primitive array");
3151       __ bind(L);
3152       BLOCK_COMMENT("} assert primitive array done");
3153     }
3154 #endif
3155 
3156     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3157                            r10, L_failed);
3158 
3159     // TypeArrayKlass
3160     //
3161     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3162     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3163     //
3164 
3165     const Register r10_offset = r10;    // array offset
3166     const Register rax_elsize = rax_lh; // element size
3167 
3168     __ movl(r10_offset, rax_lh);
3169     __ shrl(r10_offset, Klass::_lh_header_size_shift);
3170     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
3171     __ addptr(src, r10_offset);           // src array offset
3172     __ addptr(dst, r10_offset);           // dst array offset
3173     BLOCK_COMMENT("choose copy loop based on element size");
3174     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3175 
3176 #ifdef _WIN64
3177     __ pop(rklass_tmp); // Restore callee-save rdi
3178 #endif
3179 
3180     // next registers should be set before the jump to corresponding stub
3181     const Register from     = c_rarg0;  // source array address
3182     const Register to       = c_rarg1;  // destination array address
3183     const Register count    = c_rarg2;  // elements count
3184 
3185     // 'from', 'to', 'count' registers should be set in such order
3186     // since they are the same as 'src', 'src_pos', 'dst'.
3187 
3188     __ cmpl(rax_elsize, 0);
3189     __ jccb(Assembler::notEqual, L_copy_shorts);
3190     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3191     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3192     __ movl2ptr(count, r11_length); // length
3193     __ jump(RuntimeAddress(byte_copy_entry));
3194 
3195   __ BIND(L_copy_shorts);
3196     __ cmpl(rax_elsize, LogBytesPerShort);
3197     __ jccb(Assembler::notEqual, L_copy_ints);
3198     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3199     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3200     __ movl2ptr(count, r11_length); // length
3201     __ jump(RuntimeAddress(short_copy_entry));
3202 
3203   __ BIND(L_copy_ints);
3204     __ cmpl(rax_elsize, LogBytesPerInt);
3205     __ jccb(Assembler::notEqual, L_copy_longs);
3206     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3207     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3208     __ movl2ptr(count, r11_length); // length
3209     __ jump(RuntimeAddress(int_copy_entry));
3210 
3211   __ BIND(L_copy_longs);
3212 #ifdef ASSERT
3213     {
3214       BLOCK_COMMENT("assert long copy {");
3215       Label L;
3216       __ cmpl(rax_elsize, LogBytesPerLong);
3217       __ jcc(Assembler::equal, L);
3218       __ stop("must be long copy, but elsize is wrong");
3219       __ bind(L);
3220       BLOCK_COMMENT("} assert long copy done");
3221     }
3222 #endif
3223     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3224     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3225     __ movl2ptr(count, r11_length); // length
3226     __ jump(RuntimeAddress(long_copy_entry));
3227 
3228     // ObjArrayKlass
3229   __ BIND(L_objArray);
3230     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3231 
3232     Label L_plain_copy, L_checkcast_copy;
3233     //  test array classes for subtyping
3234     __ load_klass(rax, dst, rklass_tmp);
3235     __ cmpq(r10_src_klass, rax); // usual case is exact equality
3236     __ jcc(Assembler::notEqual, L_checkcast_copy);
3237 
3238     // Identically typed arrays can be copied without element-wise checks.
3239     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3240                            r10, L_failed);
3241 
3242     __ lea(from, Address(src, src_pos, TIMES_OOP,
3243                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3244     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3245                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3246     __ movl2ptr(count, r11_length); // length
3247   __ BIND(L_plain_copy);
3248 #ifdef _WIN64
3249     __ pop(rklass_tmp); // Restore callee-save rdi
3250 #endif
3251     __ jump(RuntimeAddress(oop_copy_entry));
3252 
3253   __ BIND(L_checkcast_copy);
3254     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3255     {
3256       // Before looking at dst.length, make sure dst is also an objArray.
3257       __ cmpl(Address(rax, lh_offset), objArray_lh);
3258       __ jcc(Assembler::notEqual, L_failed);
3259 
3260       // It is safe to examine both src.length and dst.length.
3261       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3262                              rax, L_failed);
3263 
3264       const Register r11_dst_klass = r11;
3265       __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3266 
3267       // Marshal the base address arguments now, freeing registers.
3268       __ lea(from, Address(src, src_pos, TIMES_OOP,
3269                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3270       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3271                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3272       __ movl(count, length);           // length (reloaded)
3273       Register sco_temp = c_rarg3;      // this register is free now
3274       assert_different_registers(from, to, count, sco_temp,
3275                                  r11_dst_klass, r10_src_klass);
3276       assert_clean_int(count, sco_temp);
3277 
3278       // Generate the type check.
3279       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3280       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3281       assert_clean_int(sco_temp, rax);
3282       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3283 
3284       // Fetch destination element klass from the ObjArrayKlass header.
3285       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3286       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3287       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3288       assert_clean_int(sco_temp, rax);
3289 
3290 #ifdef _WIN64
3291       __ pop(rklass_tmp); // Restore callee-save rdi
3292 #endif
3293 
3294       // the checkcast_copy loop needs two extra arguments:
3295       assert(c_rarg3 == sco_temp, "#3 already in place");
3296       // Set up arguments for checkcast_copy_entry.
3297       setup_arg_regs(4);
3298       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3299       __ jump(RuntimeAddress(checkcast_copy_entry));
3300     }
3301 
3302   __ BIND(L_failed);
3303 #ifdef _WIN64
3304     __ pop(rklass_tmp); // Restore callee-save rdi
3305 #endif
3306     __ xorptr(rax, rax);
3307     __ notptr(rax); // return -1
3308     __ leave();   // required for proper stackwalking of RuntimeStub frame
3309     __ ret(0);
3310 
3311     return start;
3312   }
3313 
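  // Data cache writeback stub: writes back (flushes) the cache line containing
  // the address passed in c_rarg0. cache_wb() below emits whichever cache-line
  // writeback instruction (e.g. clwb/clflushopt/clflush) the current CPU supports.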
3314   address generate_data_cache_writeback() {
3315     const Register src        = c_rarg0;  // source address
3316 
3317     __ align(CodeEntryAlignment);
3318 
3319     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
3320 
3321     address start = __ pc();
3322     __ enter();
3323     __ cache_wb(Address(src, 0));
3324     __ leave();
3325     __ ret(0);
3326 
3327     return start;
3328   }
3329 
3330   address generate_data_cache_writeback_sync() {
3331     const Register is_pre    = c_rarg0;  // pre or post sync
3332 
3333     __ align(CodeEntryAlignment);
3334 
3335     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
3336 
3337     // pre wbsync is a no-op
3338     // post wbsync translates to an sfence
3339 
3340     Label skip;
3341     address start = __ pc();
3342     __ enter();
3343     __ cmpl(is_pre, 0);
3344     __ jcc(Assembler::notEqual, skip);
3345     __ cache_wbsync(false);
3346     __ bind(skip);
3347     __ leave();
3348     __ ret(0);
3349 
3350     return start;
3351   }
3352 
3353   void generate_arraycopy_stubs() {
3354     address entry;
3355     address entry_jbyte_arraycopy;
3356     address entry_jshort_arraycopy;
3357     address entry_jint_arraycopy;
3358     address entry_oop_arraycopy;
3359     address entry_jlong_arraycopy;
3360     address entry_checkcast_arraycopy;
3361 
3362     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
3363                                                                            "jbyte_disjoint_arraycopy");
3364     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3365                                                                            "jbyte_arraycopy");
3366 
3367     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3368                                                                             "jshort_disjoint_arraycopy");
3369     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3370                                                                             "jshort_arraycopy");
3371 
3372     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
3373                                                                               "jint_disjoint_arraycopy");
3374     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
3375                                                                               &entry_jint_arraycopy, "jint_arraycopy");
3376 
3377     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
3378                                                                                "jlong_disjoint_arraycopy");
3379     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
3380                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
3381 
3382 
3383     if (UseCompressedOops) {
3384       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
3385                                                                               "oop_disjoint_arraycopy");
3386       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
3387                                                                               &entry_oop_arraycopy, "oop_arraycopy");
3388       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
3389                                                                                      "oop_disjoint_arraycopy_uninit",
3390                                                                                      /*dest_uninitialized*/true);
3391       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
3392                                                                                      NULL, "oop_arraycopy_uninit",
3393                                                                                      /*dest_uninitialized*/true);
3394     } else {
3395       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
3396                                                                                "oop_disjoint_arraycopy");
3397       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
3398                                                                                &entry_oop_arraycopy, "oop_arraycopy");
3399       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
3400                                                                                       "oop_disjoint_arraycopy_uninit",
3401                                                                                       /*dest_uninitialized*/true);
3402       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
3403                                                                                       NULL, "oop_arraycopy_uninit",
3404                                                                                       /*dest_uninitialized*/true);
3405     }
3406 
3407     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3408     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3409                                                                         /*dest_uninitialized*/true);
3410 
3411     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3412                                                               entry_jbyte_arraycopy,
3413                                                               entry_jshort_arraycopy,
3414                                                               entry_jint_arraycopy,
3415                                                               entry_jlong_arraycopy);
3416     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3417                                                                entry_jbyte_arraycopy,
3418                                                                entry_jshort_arraycopy,
3419                                                                entry_jint_arraycopy,
3420                                                                entry_oop_arraycopy,
3421                                                                entry_jlong_arraycopy,
3422                                                                entry_checkcast_arraycopy);
3423 
3424     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3425     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3426     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3427     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3428     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3429     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3430 
3431     // We don't generate specialized code for HeapWord-aligned source
3432     // arrays, so just use the code we've already generated
3433     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3434     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3435 
3436     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3437     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3438 
3439     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3440     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3441 
3442     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3443     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3444 
3445     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3446     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3447 
3448     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3449     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3450   }
3451 
3452   // AES intrinsic stubs
3453   enum {AESBlockSize = 16};
3454 
3455   address generate_key_shuffle_mask() {
3456     __ align(16);
3457     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3458     address start = __ pc();
3459     __ emit_data64( 0x0405060700010203, relocInfo::none );
3460     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
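    // (Illustrative note: used with pshufb, the two qwords above byte-swap each
    //  32-bit lane, e.g. bytes {b3,b2,b1,b0} -> {b0,b1,b2,b3}, so the big-endian
    //  ints of the Java-expanded key end up in the byte order AES-NI expects.)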
3461     return start;
3462   }
3463 
3464   address generate_counter_shuffle_mask() {
3465     __ align(16);
3466     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3467     address start = __ pc();
3468     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3469     __ emit_data64(0x0001020304050607, relocInfo::none);
3470     return start;
3471   }
3472 
3473   // Utility routine for loading a 128-bit key word in little-endian format;
3474   // the shuffle mask can optionally be supplied already loaded in an XMM register.
3475   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3476     __ movdqu(xmmdst, Address(key, offset));
3477     if (xmm_shuf_mask != NULL) {
3478       __ pshufb(xmmdst, xmm_shuf_mask);
3479     } else {
3480       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3481     }
3482   }
3483 
3484   // Utility routine to increment the 128-bit counter (the IV in CTR mode)
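  //
  // Equivalent 128-bit increment (illustrative sketch of the sequence below):
  //
  //   uint64_t lo = xmmdst[63:0], hi = xmmdst[127:64];
  //   lo += inc_delta;
  //   if (lo < (uint64_t)inc_delta) hi += 1;   // propagate carry into the high qword
  //   xmmdst = {lo, hi};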
3485   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3486     __ pextrq(reg, xmmdst, 0x0);
3487     __ addq(reg, inc_delta);
3488     __ pinsrq(xmmdst, reg, 0x0);
3489     __ jcc(Assembler::carryClear, next_block); // jump if no carry
3490     __ pextrq(reg, xmmdst, 0x01); // Carry
3491     __ addq(reg, 0x01);
3492     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3493     __ BIND(next_block);          // next instruction
3494   }
3495 
3496   // Arguments:
3497   //
3498   // Inputs:
3499   //   c_rarg0   - source byte array address
3500   //   c_rarg1   - destination byte array address
3501   //   c_rarg2   - K (key) in little endian int array
3502   //
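  //  Round-structure sketch (illustrative only; a key length of 44/52/60 ints
  //  selects 10/12/14 rounds):
  //
  //    state  = input ^ round_key[0];
  //    for (int r = 1; r < rounds; r++)  state = aesenc(state, round_key[r]);
  //    output = aesenclast(state, round_key[rounds]);
  //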
3503   address generate_aescrypt_encryptBlock() {
3504     assert(UseAES, "need AES instructions and misaligned SSE support");
3505     __ align(CodeEntryAlignment);
3506     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3507     Label L_doLast;
3508     address start = __ pc();
3509 
3510     const Register from        = c_rarg0;  // source array address
3511     const Register to          = c_rarg1;  // destination array address
3512     const Register key         = c_rarg2;  // key array address
3513     const Register keylen      = rax;
3514 
3515     const XMMRegister xmm_result = xmm0;
3516     const XMMRegister xmm_key_shuf_mask = xmm1;
3517     // On win64 xmm6-xmm15 must be preserved so don't use them.
3518     const XMMRegister xmm_temp1  = xmm2;
3519     const XMMRegister xmm_temp2  = xmm3;
3520     const XMMRegister xmm_temp3  = xmm4;
3521     const XMMRegister xmm_temp4  = xmm5;
3522 
3523     __ enter(); // required for proper stackwalking of RuntimeStub frame
3524 
3525     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3526     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3527 
3528     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3529     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3530 
3531     // For encryption, the java expanded key ordering is just what we need
3532     // we don't know if the key is aligned, hence not using load-execute form
3533 
3534     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3535     __ pxor(xmm_result, xmm_temp1);
3536 
3537     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3538     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3539     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3540     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3541 
3542     __ aesenc(xmm_result, xmm_temp1);
3543     __ aesenc(xmm_result, xmm_temp2);
3544     __ aesenc(xmm_result, xmm_temp3);
3545     __ aesenc(xmm_result, xmm_temp4);
3546 
3547     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3548     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3549     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3550     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3551 
3552     __ aesenc(xmm_result, xmm_temp1);
3553     __ aesenc(xmm_result, xmm_temp2);
3554     __ aesenc(xmm_result, xmm_temp3);
3555     __ aesenc(xmm_result, xmm_temp4);
3556 
3557     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3558     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3559 
3560     __ cmpl(keylen, 44);
3561     __ jccb(Assembler::equal, L_doLast);
3562 
3563     __ aesenc(xmm_result, xmm_temp1);
3564     __ aesenc(xmm_result, xmm_temp2);
3565 
3566     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3567     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3568 
3569     __ cmpl(keylen, 52);
3570     __ jccb(Assembler::equal, L_doLast);
3571 
3572     __ aesenc(xmm_result, xmm_temp1);
3573     __ aesenc(xmm_result, xmm_temp2);
3574 
3575     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3576     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3577 
3578     __ BIND(L_doLast);
3579     __ aesenc(xmm_result, xmm_temp1);
3580     __ aesenclast(xmm_result, xmm_temp2);
3581     __ movdqu(Address(to, 0), xmm_result);        // store the result
3582     __ xorptr(rax, rax); // return 0
3583     __ leave(); // required for proper stackwalking of RuntimeStub frame
3584     __ ret(0);
3585 
3586     return start;
3587   }
3588 
3589 
3590   // Arguments:
3591   //
3592   // Inputs:
3593   //   c_rarg0   - source byte array address
3594   //   c_rarg1   - destination byte array address
3595   //   c_rarg2   - K (key) in little endian int array
3596   //
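  //  Round-structure sketch (illustrative only; rounds = 10/12/14 as for encryption,
  //  but the Java-expanded key is rotated one position, so key offset 0x00 comes last):
  //
  //    state  = input ^ round_key[1];                                  // offset 0x10
  //    for (int r = 2; r <= rounds; r++)  state = aesdec(state, round_key[r]);
  //    output = aesdeclast(state, round_key[0]);                       // offset 0x00
  //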
3597   address generate_aescrypt_decryptBlock() {
3598     assert(UseAES, "need AES instructions and misaligned SSE support");
3599     __ align(CodeEntryAlignment);
3600     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3601     Label L_doLast;
3602     address start = __ pc();
3603 
3604     const Register from        = c_rarg0;  // source array address
3605     const Register to          = c_rarg1;  // destination array address
3606     const Register key         = c_rarg2;  // key array address
3607     const Register keylen      = rax;
3608 
3609     const XMMRegister xmm_result = xmm0;
3610     const XMMRegister xmm_key_shuf_mask = xmm1;
3611     // On win64 xmm6-xmm15 must be preserved so don't use them.
3612     const XMMRegister xmm_temp1  = xmm2;
3613     const XMMRegister xmm_temp2  = xmm3;
3614     const XMMRegister xmm_temp3  = xmm4;
3615     const XMMRegister xmm_temp4  = xmm5;
3616 
3617     __ enter(); // required for proper stackwalking of RuntimeStub frame
3618 
3619     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3620     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3621 
3622     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3623     __ movdqu(xmm_result, Address(from, 0));
3624 
3625     // for decryption, the Java-expanded key ordering is rotated one position from what we want
3626     // so we start from 0x10 here and hit 0x00 last
3627     // we don't know if the key is aligned, hence not using load-execute form
3628     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3629     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3630     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3631     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3632 
3633     __ pxor  (xmm_result, xmm_temp1);
3634     __ aesdec(xmm_result, xmm_temp2);
3635     __ aesdec(xmm_result, xmm_temp3);
3636     __ aesdec(xmm_result, xmm_temp4);
3637 
3638     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3639     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3640     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3641     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3642 
3643     __ aesdec(xmm_result, xmm_temp1);
3644     __ aesdec(xmm_result, xmm_temp2);
3645     __ aesdec(xmm_result, xmm_temp3);
3646     __ aesdec(xmm_result, xmm_temp4);
3647 
3648     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3649     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3650     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3651 
3652     __ cmpl(keylen, 44);
3653     __ jccb(Assembler::equal, L_doLast);
3654 
3655     __ aesdec(xmm_result, xmm_temp1);
3656     __ aesdec(xmm_result, xmm_temp2);
3657 
3658     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3659     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3660 
3661     __ cmpl(keylen, 52);
3662     __ jccb(Assembler::equal, L_doLast);
3663 
3664     __ aesdec(xmm_result, xmm_temp1);
3665     __ aesdec(xmm_result, xmm_temp2);
3666 
3667     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3668     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3669 
3670     __ BIND(L_doLast);
3671     __ aesdec(xmm_result, xmm_temp1);
3672     __ aesdec(xmm_result, xmm_temp2);
3673 
3674     // for decryption the aesdeclast operation is always on key+0x00
3675     __ aesdeclast(xmm_result, xmm_temp3);
3676     __ movdqu(Address(to, 0), xmm_result);  // store the result
3677     __ xorptr(rax, rax); // return 0
3678     __ leave(); // required for proper stackwalking of RuntimeStub frame
3679     __ ret(0);
3680 
3681     return start;
3682   }
3683 
3684 
3685   // Arguments:
3686   //
3687   // Inputs:
3688   //   c_rarg0   - source byte array address
3689   //   c_rarg1   - destination byte array address
3690   //   c_rarg2   - K (key) in little endian int array
3691   //   c_rarg3   - r vector byte array address
3692   //   c_rarg4   - input length
3693   //
3694   // Output:
3695   //   rax       - input length
3696   //
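  //  CBC-encrypt recurrence implemented below (illustrative sketch):
  //
  //    r = rvec;                                    // 16-byte IV / running r vector
  //    for (pos = 0; pos < len; pos += 16) {
  //      r = AES_encrypt(plaintext[pos..pos+15] ^ r, key);
  //      ciphertext[pos..pos+15] = r;
  //    }
  //    rvec = r;                                    // written back for the next call
  //    return len;
  //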
3697   address generate_cipherBlockChaining_encryptAESCrypt() {
3698     assert(UseAES, "need AES instructions and misaligned SSE support");
3699     __ align(CodeEntryAlignment);
3700     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3701     address start = __ pc();
3702 
3703     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3704     const Register from        = c_rarg0;  // source array address
3705     const Register to          = c_rarg1;  // destination array address
3706     const Register key         = c_rarg2;  // key array address
3707     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3708                                            // and left with the results of the last encryption block
3709 #ifndef _WIN64
3710     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3711 #else
3712     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3713     const Register len_reg     = r11;      // pick the volatile windows register
3714 #endif
3715     const Register pos         = rax;
3716 
3717     // xmm register assignments for the loops below
3718     const XMMRegister xmm_result = xmm0;
3719     const XMMRegister xmm_temp   = xmm1;
3720     // keys 0-10 preloaded into xmm2-xmm12
3721     const int XMM_REG_NUM_KEY_FIRST = 2;
3722     const int XMM_REG_NUM_KEY_LAST  = 15;
3723     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3724     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3725     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3726     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3727     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3728 
3729     __ enter(); // required for proper stackwalking of RuntimeStub frame
3730 
3731 #ifdef _WIN64
3732     // on win64, fill len_reg from stack position
3733     __ movl(len_reg, len_mem);
3734 #else
3735     __ push(len_reg); // Save
3736 #endif
3737 
3738     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3739     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3740     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3741     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3742       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3743       offset += 0x10;
3744     }
3745     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3746 
3747     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array: 44=128, 52=192, 60=256)
3748     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3749     __ cmpl(rax, 44);
3750     __ jcc(Assembler::notEqual, L_key_192_256);
3751 
3752     // 128-bit code follows here
3753     __ movptr(pos, 0);
3754     __ align(OptoLoopAlignment);
3755 
3756     __ BIND(L_loopTop_128);
3757     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3758     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3759     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3760     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3761       __ aesenc(xmm_result, as_XMMRegister(rnum));
3762     }
3763     __ aesenclast(xmm_result, xmm_key10);
3764     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3765     // no need to store r to memory until we exit
3766     __ addptr(pos, AESBlockSize);
3767     __ subptr(len_reg, AESBlockSize);
3768     __ jcc(Assembler::notEqual, L_loopTop_128);
3769 
3770     __ BIND(L_exit);
3771     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3772 
3773 #ifdef _WIN64
3774     __ movl(rax, len_mem);
3775 #else
3776     __ pop(rax); // return length
3777 #endif
3778     __ leave(); // required for proper stackwalking of RuntimeStub frame
3779     __ ret(0);
3780 
3781     __ BIND(L_key_192_256);
3782     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3783     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3784     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3785     __ cmpl(rax, 52);
3786     __ jcc(Assembler::notEqual, L_key_256);
3787 
3788     // 192-bit code follows here (could be changed to use more xmm registers)
3789     __ movptr(pos, 0);
3790     __ align(OptoLoopAlignment);
3791 
3792     __ BIND(L_loopTop_192);
3793     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3794     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3795     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3796     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3797       __ aesenc(xmm_result, as_XMMRegister(rnum));
3798     }
3799     __ aesenclast(xmm_result, xmm_key12);
3800     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3801     // no need to store r to memory until we exit
3802     __ addptr(pos, AESBlockSize);
3803     __ subptr(len_reg, AESBlockSize);
3804     __ jcc(Assembler::notEqual, L_loopTop_192);
3805     __ jmp(L_exit);
3806 
3807     __ BIND(L_key_256);
3808     // 256-bit code follows here (could be changed to use more xmm registers)
3809     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3810     __ movptr(pos, 0);
3811     __ align(OptoLoopAlignment);
3812 
3813     __ BIND(L_loopTop_256);
3814     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3815     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3816     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3817     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3818       __ aesenc(xmm_result, as_XMMRegister(rnum));
3819     }
3820     load_key(xmm_temp, key, 0xe0);
3821     __ aesenclast(xmm_result, xmm_temp);
3822     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3823     // no need to store r to memory until we exit
3824     __ addptr(pos, AESBlockSize);
3825     __ subptr(len_reg, AESBlockSize);
3826     __ jcc(Assembler::notEqual, L_loopTop_256);
3827     __ jmp(L_exit);
3828 
3829     return start;
3830   }
3831 
3832   // Safefetch stubs.
3833   void generate_safefetch(const char* name, int size, address* entry,
3834                           address* fault_pc, address* continuation_pc) {
3835     // safefetch signatures:
3836     //   int      SafeFetch32(int*      adr, int      errValue);
3837     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3838     //
3839     // arguments:
3840     //   c_rarg0 = adr
3841     //   c_rarg1 = errValue
3842     //
3843     // result:
3844     //   rax      = *adr or errValue
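    //
    // (Illustrative note: if the load at *fault_pc faults, the VM's signal handler
    //  redirects execution to *continuation_pc; errValue is still in c_rarg1 at
    //  that point, so the common tail below returns c_rarg1 in either case.)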
3845 
3846     StubCodeMark mark(this, "StubRoutines", name);
3847 
3848     // Entry point, pc or function descriptor.
3849     *entry = __ pc();
3850 
3851     // Load *adr into c_rarg1, may fault.
3852     *fault_pc = __ pc();
3853     switch (size) {
3854       case 4:
3855         // int32_t
3856         __ movl(c_rarg1, Address(c_rarg0, 0));
3857         break;
3858       case 8:
3859         // int64_t
3860         __ movq(c_rarg1, Address(c_rarg0, 0));
3861         break;
3862       default:
3863         ShouldNotReachHere();
3864     }
3865 
3866     // return errValue or *adr
3867     *continuation_pc = __ pc();
3868     __ movq(rax, c_rarg1);
3869     __ ret(0);
3870   }
3871 
3872   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3873   // to hide instruction latency
3874   //
3875   // Arguments:
3876   //
3877   // Inputs:
3878   //   c_rarg0   - source byte array address
3879   //   c_rarg1   - destination byte array address
3880   //   c_rarg2   - K (key) in little endian int array
3881   //   c_rarg3   - r vector byte array address
3882   //   c_rarg4   - input length
3883   //
3884   // Output:
3885   //   rax       - input length
3886   //
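  //  CBC-decrypt recurrence (illustrative sketch):
  //
  //    prev = rvec;                                 // IV / last cipher block seen
  //    for (pos = 0; pos < len; pos += 16) {
  //      plaintext[pos..pos+15] = AES_decrypt(ciphertext[pos..pos+15], key) ^ prev;
  //      prev = ciphertext[pos..pos+15];
  //    }
  //    rvec = prev;  return len;
  //
  //  Unlike encryption, each block depends only on stored ciphertext, so the loop
  //  below decrypts PARALLEL_FACTOR (4) blocks per iteration to hide latency.
  //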
3887   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3888     assert(UseAES, "need AES instructions and misaligned SSE support");
3889     __ align(CodeEntryAlignment);
3890     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3891     address start = __ pc();
3892 
3893     const Register from        = c_rarg0;  // source array address
3894     const Register to          = c_rarg1;  // destination array address
3895     const Register key         = c_rarg2;  // key array address
3896     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3897                                            // and left with the results of the last encryption block
3898 #ifndef _WIN64
3899     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3900 #else
3901     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3902     const Register len_reg     = r11;      // pick the volatile windows register
3903 #endif
3904     const Register pos         = rax;
3905 
3906     const int PARALLEL_FACTOR = 4;
3907     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3908 
3909     Label L_exit;
3910     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3911     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3912     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3913     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3914     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3915 
3916     // keys 0-10 preloaded into xmm5-xmm15
3917     const int XMM_REG_NUM_KEY_FIRST = 5;
3918     const int XMM_REG_NUM_KEY_LAST  = 15;
3919     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3920     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3921 
3922     __ enter(); // required for proper stackwalking of RuntimeStub frame
3923 
3924 #ifdef _WIN64
3925     // on win64, fill len_reg from stack position
3926     __ movl(len_reg, len_mem);
3927 #else
3928     __ push(len_reg); // Save
3929 #endif
3930     __ push(rbx);
3931     // the java expanded key ordering is rotated one position from what we want
3932     // so we start from 0x10 here and hit 0x00 last
3933     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3934     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3935     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3936     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3937       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3938       offset += 0x10;
3939     }
3940     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3941 
3942     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3943 
3944     // registers holding the four results in the parallelized loop
3945     const XMMRegister xmm_result0 = xmm0;
3946     const XMMRegister xmm_result1 = xmm2;
3947     const XMMRegister xmm_result2 = xmm3;
3948     const XMMRegister xmm_result3 = xmm4;
3949 
3950     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3951 
3952     __ xorptr(pos, pos);
3953 
3954     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array: 44=128, 52=192, 60=256)
3955     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3956     __ cmpl(rbx, 52);
3957     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3958     __ cmpl(rbx, 60);
3959     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3960 
3961 #define DoFour(opc, src_reg)           \
3962   __ opc(xmm_result0, src_reg);         \
3963   __ opc(xmm_result1, src_reg);         \
3964   __ opc(xmm_result2, src_reg);         \
3965   __ opc(xmm_result3, src_reg);         \
3966 
3967     for (int k = 0; k < 3; ++k) {
3968       __ BIND(L_multiBlock_loopTopHead[k]);
3969       if (k != 0) {
3970         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3971         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3972       }
3973       if (k == 1) {
3974         __ subptr(rsp, 6 * wordSize);
3975         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3976         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3977         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3978         load_key(xmm1, key, 0xc0);  // 0xc0;
3979         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3980       } else if (k == 2) {
3981         __ subptr(rsp, 10 * wordSize);
3982         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
        load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3984         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3985         load_key(xmm1, key, 0xe0);  // 0xe0;
3986         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3987         load_key(xmm15, key, 0xb0); // 0xb0;
3988         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3989         load_key(xmm1, key, 0xc0);  // 0xc0;
3990         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3991       }
3992       __ align(OptoLoopAlignment);
3993       __ BIND(L_multiBlock_loopTop[k]);
3994       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3995       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3996 
3997       if  (k != 0) {
3998         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3999         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
4000       }
4001 
4002       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
4003       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4004       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4005       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4006 
4007       DoFour(pxor, xmm_key_first);
4008       if (k == 0) {
4009         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
4010           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4011         }
4012         DoFour(aesdeclast, xmm_key_last);
4013       } else if (k == 1) {
4014         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
4015           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4016         }
4017         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
4018         DoFour(aesdec, xmm1);  // key : 0xc0
4019         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
4020         DoFour(aesdeclast, xmm_key_last);
4021       } else if (k == 2) {
4022         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
4023           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4024         }
4025         DoFour(aesdec, xmm1);  // key : 0xc0
4026         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
4027         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
4028         DoFour(aesdec, xmm15);  // key : 0xd0
4029         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
4030         DoFour(aesdec, xmm1);  // key : 0xe0
4031         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
4032         DoFour(aesdeclast, xmm_key_last);
4033       }
4034 
4035       // for each result, xor with the r vector of previous cipher block
4036       __ pxor(xmm_result0, xmm_prev_block_cipher);
4037       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4038       __ pxor(xmm_result1, xmm_prev_block_cipher);
4039       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4040       __ pxor(xmm_result2, xmm_prev_block_cipher);
4041       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4042       __ pxor(xmm_result3, xmm_prev_block_cipher);
4043       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
4044       if (k != 0) {
4045         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
4046       }
4047 
4048       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
4049       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4050       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4051       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4052 
4053       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
4054       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
4055       __ jmp(L_multiBlock_loopTop[k]);
4056 
4057       // registers used in the non-parallelized loops
4058       // xmm register assignments for the loops below
4059       const XMMRegister xmm_result = xmm0;
4060       const XMMRegister xmm_prev_block_cipher_save = xmm2;
4061       const XMMRegister xmm_key11 = xmm3;
4062       const XMMRegister xmm_key12 = xmm4;
4063       const XMMRegister key_tmp = xmm4;
4064 
4065       __ BIND(L_singleBlock_loopTopHead[k]);
4066       if (k == 1) {
4067         __ addptr(rsp, 6 * wordSize);
4068       } else if (k == 2) {
4069         __ addptr(rsp, 10 * wordSize);
4070       }
4071       __ cmpptr(len_reg, 0); // any blocks left??
4072       __ jcc(Assembler::equal, L_exit);
4073       __ BIND(L_singleBlock_loopTopHead2[k]);
4074       if (k == 1) {
        load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
        load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
4077       }
4078       if (k == 2) {
        load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
4080       }
4081       __ align(OptoLoopAlignment);
4082       __ BIND(L_singleBlock_loopTop[k]);
4083       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
4084       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
4085       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
4086       for (int rnum = 1; rnum <= 9 ; rnum++) {
4087           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4088       }
4089       if (k == 1) {
4090         __ aesdec(xmm_result, xmm_key11);
4091         __ aesdec(xmm_result, xmm_key12);
4092       }
4093       if (k == 2) {
4094         __ aesdec(xmm_result, xmm_key11);
4095         load_key(key_tmp, key, 0xc0);
4096         __ aesdec(xmm_result, key_tmp);
4097         load_key(key_tmp, key, 0xd0);
4098         __ aesdec(xmm_result, key_tmp);
4099         load_key(key_tmp, key, 0xe0);
4100         __ aesdec(xmm_result, key_tmp);
4101       }
4102 
4103       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
4104       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
4105       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
4106       // no need to store r to memory until we exit
4107       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
4108       __ addptr(pos, AESBlockSize);
4109       __ subptr(len_reg, AESBlockSize);
4110       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
4111       if (k != 2) {
4112         __ jmp(L_exit);
4113       }
4114     } //for 128/192/256
4115 
4116     __ BIND(L_exit);
4117     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
4118     __ pop(rbx);
4119 #ifdef _WIN64
4120     __ movl(rax, len_mem);
4121 #else
4122     __ pop(rax); // return length
4123 #endif
4124     __ leave(); // required for proper stackwalking of RuntimeStub frame
4125     __ ret(0);
4126     return start;
4127 }
4128 
4129   address generate_electronicCodeBook_encryptAESCrypt() {
4130     __ align(CodeEntryAlignment);
4131     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_encryptAESCrypt");
4132     address start = __ pc();
4133     const Register from = c_rarg0;  // source array address
4134     const Register to = c_rarg1;  // destination array address
4135     const Register key = c_rarg2;  // key array address
4136     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4137     __ enter(); // required for proper stackwalking of RuntimeStub frame
4138     __ aesecb_encrypt(from, to, key, len);
4139     __ leave(); // required for proper stackwalking of RuntimeStub frame
4140     __ ret(0);
4141     return start;
4142  }
4143 
4144   address generate_electronicCodeBook_decryptAESCrypt() {
4145     __ align(CodeEntryAlignment);
4146     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_decryptAESCrypt");
4147     address start = __ pc();
4148     const Register from = c_rarg0;  // source array address
4149     const Register to = c_rarg1;  // destination array address
4150     const Register key = c_rarg2;  // key array address
4151     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4152     __ enter(); // required for proper stackwalking of RuntimeStub frame
4153     __ aesecb_decrypt(from, to, key, len);
4154     __ leave(); // required for proper stackwalking of RuntimeStub frame
4155     __ ret(0);
4156     return start;
4157   }
4158 
  // ofs and limit are used for multi-block byte arrays.
4160   // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
4161   address generate_md5_implCompress(bool multi_block, const char *name) {
4162     __ align(CodeEntryAlignment);
4163     StubCodeMark mark(this, "StubRoutines", name);
4164     address start = __ pc();
4165 
4166     const Register buf_param = r15;
4167     const Address state_param(rsp, 0 * wordSize);
4168     const Address ofs_param  (rsp, 1 * wordSize    );
4169     const Address limit_param(rsp, 1 * wordSize + 4);
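    // spill layout (set up by the subptr below): state pointer at [rsp + 0],
    // ofs at [rsp + 8] and limit at [rsp + 12] packed into the second word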
4170 
4171     __ enter();
4172     __ push(rbx);
4173     __ push(rdi);
4174     __ push(rsi);
4175     __ push(r15);
4176     __ subptr(rsp, 2 * wordSize);
4177 
4178     __ movptr(buf_param, c_rarg0);
4179     __ movptr(state_param, c_rarg1);
4180     if (multi_block) {
4181       __ movl(ofs_param, c_rarg2);
4182       __ movl(limit_param, c_rarg3);
4183     }
4184     __ fast_md5(buf_param, state_param, ofs_param, limit_param, multi_block);
4185 
4186     __ addptr(rsp, 2 * wordSize);
4187     __ pop(r15);
4188     __ pop(rsi);
4189     __ pop(rdi);
4190     __ pop(rbx);
4191     __ leave();
4192     __ ret(0);
4193     return start;
4194   }
4195 
4196   address generate_upper_word_mask() {
4197     __ align64();
4198     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
4199     address start = __ pc();
4200     __ emit_data64(0x0000000000000000, relocInfo::none);
4201     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
4202     return start;
4203   }
4204 
4205   address generate_shuffle_byte_flip_mask() {
4206     __ align64();
4207     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
4208     address start = __ pc();
4209     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4210     __ emit_data64(0x0001020304050607, relocInfo::none);
4211     return start;
4212   }
4213 
  // ofs and limit are used for multi-block byte arrays.
4215   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4216   address generate_sha1_implCompress(bool multi_block, const char *name) {
4217     __ align(CodeEntryAlignment);
4218     StubCodeMark mark(this, "StubRoutines", name);
4219     address start = __ pc();
4220 
4221     Register buf = c_rarg0;
4222     Register state = c_rarg1;
4223     Register ofs = c_rarg2;
4224     Register limit = c_rarg3;
4225 
4226     const XMMRegister abcd = xmm0;
4227     const XMMRegister e0 = xmm1;
4228     const XMMRegister e1 = xmm2;
4229     const XMMRegister msg0 = xmm3;
4230 
4231     const XMMRegister msg1 = xmm4;
4232     const XMMRegister msg2 = xmm5;
4233     const XMMRegister msg3 = xmm6;
4234     const XMMRegister shuf_mask = xmm7;
4235 
4236     __ enter();
4237 
4238     __ subptr(rsp, 4 * wordSize);
4239 
4240     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
4241       buf, state, ofs, limit, rsp, multi_block);
4242 
4243     __ addptr(rsp, 4 * wordSize);
4244 
4245     __ leave();
4246     __ ret(0);
4247     return start;
4248   }
4249 
4250   address generate_pshuffle_byte_flip_mask() {
4251     __ align64();
4252     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
4253     address start = __ pc();
4254     __ emit_data64(0x0405060700010203, relocInfo::none);
4255     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4256 
4257     if (VM_Version::supports_avx2()) {
4258       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
4259       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4260       // _SHUF_00BA
4261       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4262       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4263       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4264       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4265       // _SHUF_DC00
4266       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4267       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4268       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4269       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4270     }
4271 
4272     return start;
4273   }
4274 
  // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
4276   address generate_pshuffle_byte_flip_mask_sha512() {
4277     __ align32();
4278     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
4279     address start = __ pc();
4280     if (VM_Version::supports_avx2()) {
4281       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
4282       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4283       __ emit_data64(0x1011121314151617, relocInfo::none);
4284       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
4285       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
4286       __ emit_data64(0x0000000000000000, relocInfo::none);
4287       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4288       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4289     }
4290 
4291     return start;
4292   }
4293 
  // ofs and limit are used for multi-block byte arrays.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4296   address generate_sha256_implCompress(bool multi_block, const char *name) {
4297     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
4298     __ align(CodeEntryAlignment);
4299     StubCodeMark mark(this, "StubRoutines", name);
4300     address start = __ pc();
4301 
4302     Register buf = c_rarg0;
4303     Register state = c_rarg1;
4304     Register ofs = c_rarg2;
4305     Register limit = c_rarg3;
4306 
4307     const XMMRegister msg = xmm0;
4308     const XMMRegister state0 = xmm1;
4309     const XMMRegister state1 = xmm2;
4310     const XMMRegister msgtmp0 = xmm3;
4311 
4312     const XMMRegister msgtmp1 = xmm4;
4313     const XMMRegister msgtmp2 = xmm5;
4314     const XMMRegister msgtmp3 = xmm6;
4315     const XMMRegister msgtmp4 = xmm7;
4316 
4317     const XMMRegister shuf_mask = xmm8;
4318 
4319     __ enter();
4320 
4321     __ subptr(rsp, 4 * wordSize);
4322 
4323     if (VM_Version::supports_sha()) {
4324       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4325         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4326     } else if (VM_Version::supports_avx2()) {
4327       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4328         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4329     }
4330     __ addptr(rsp, 4 * wordSize);
4331     __ vzeroupper();
4332     __ leave();
4333     __ ret(0);
4334     return start;
4335   }
4336 
4337   address generate_sha512_implCompress(bool multi_block, const char *name) {
4338     assert(VM_Version::supports_avx2(), "");
4339     assert(VM_Version::supports_bmi2(), "");
4340     __ align(CodeEntryAlignment);
4341     StubCodeMark mark(this, "StubRoutines", name);
4342     address start = __ pc();
4343 
4344     Register buf = c_rarg0;
4345     Register state = c_rarg1;
4346     Register ofs = c_rarg2;
4347     Register limit = c_rarg3;
4348 
4349     const XMMRegister msg = xmm0;
4350     const XMMRegister state0 = xmm1;
4351     const XMMRegister state1 = xmm2;
4352     const XMMRegister msgtmp0 = xmm3;
4353     const XMMRegister msgtmp1 = xmm4;
4354     const XMMRegister msgtmp2 = xmm5;
4355     const XMMRegister msgtmp3 = xmm6;
4356     const XMMRegister msgtmp4 = xmm7;
4357 
4358     const XMMRegister shuf_mask = xmm8;
4359 
4360     __ enter();
4361 
4362     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4363     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4364 
4365     __ vzeroupper();
4366     __ leave();
4367     __ ret(0);
4368     return start;
4369   }
4370 
4371   address ghash_polynomial512_addr() {
4372     __ align(CodeEntryAlignment);
4373     StubCodeMark mark(this, "StubRoutines", "_ghash_poly512_addr");
4374     address start = __ pc();
4375     __ emit_data64(0x00000001C2000000, relocInfo::none); // POLY for reduction
4376     __ emit_data64(0xC200000000000000, relocInfo::none);
4377     __ emit_data64(0x00000001C2000000, relocInfo::none);
4378     __ emit_data64(0xC200000000000000, relocInfo::none);
4379     __ emit_data64(0x00000001C2000000, relocInfo::none);
4380     __ emit_data64(0xC200000000000000, relocInfo::none);
4381     __ emit_data64(0x00000001C2000000, relocInfo::none);
4382     __ emit_data64(0xC200000000000000, relocInfo::none);
4383     __ emit_data64(0x0000000000000001, relocInfo::none); // POLY
4384     __ emit_data64(0xC200000000000000, relocInfo::none);
4385     __ emit_data64(0x0000000000000001, relocInfo::none); // TWOONE
4386     __ emit_data64(0x0000000100000000, relocInfo::none);
4387     return start;
4388 }
4389 
4390   // Vector AES Galois Counter Mode implementation. Parameters:
4391   // Windows regs            |  Linux regs
4392   // in = c_rarg0 (rcx)      |  c_rarg0 (rsi)
4393   // len = c_rarg1 (rdx)     |  c_rarg1 (rdi)
4394   // ct = c_rarg2 (r8)       |  c_rarg2 (rdx)
4395   // out = c_rarg3 (r9)      |  c_rarg3 (rcx)
4396   // key = r10               |  c_rarg4 (r8)
4397   // state = r13             |  c_rarg5 (r9)
4398   // subkeyHtbl = r14        |  r11
4399   // counter = rsi           |  r12
4400   // return - number of processed bytes
4401   address generate_galoisCounterMode_AESCrypt() {
4402     __ align(CodeEntryAlignment);
4403     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
4404     address start = __ pc();
4405     const Register in = c_rarg0;
4406     const Register len = c_rarg1;
4407     const Register ct = c_rarg2;
4408     const Register out = c_rarg3;
    // the counter byte array (loaded further below) is updated with the incremented counter in the end
4410 #ifndef _WIN64
4411     const Register key = c_rarg4;
4412     const Register state = c_rarg5;
4413     const Address subkeyH_mem(rbp, 2 * wordSize);
4414     const Register subkeyHtbl = r11;
4415     const Address avx512_subkeyH_mem(rbp, 3 * wordSize);
4416     const Register avx512_subkeyHtbl = r13;
4417     const Address counter_mem(rbp, 4 * wordSize);
4418     const Register counter = r12;
4419 #else
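    // on Win64 the fifth and later arguments are on the caller's stack: after enter(),
    // rbp + 2..5 * wordSize is the 32-byte home area for the four register arguments,
    // so the fifth argument (key) is found at rbp + 6 * wordSize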
4420     const Address key_mem(rbp, 6 * wordSize);
4421     const Register key = r10;
4422     const Address state_mem(rbp, 7 * wordSize);
4423     const Register state = r13;
4424     const Address subkeyH_mem(rbp, 8 * wordSize);
4425     const Register subkeyHtbl = r14;
4426     const Address avx512_subkeyH_mem(rbp, 9 * wordSize);
4427     const Register avx512_subkeyHtbl = r12;
4428     const Address counter_mem(rbp, 10 * wordSize);
4429     const Register counter = rsi;
4430 #endif
4431     __ enter();
4432    // Save state before entering routine
4433     __ push(r12);
4434     __ push(r13);
4435     __ push(r14);
4436     __ push(r15);
4437     __ push(rbx);
4438 #ifdef _WIN64
    // on win64, save rsi and load key and state from their stack positions
4440     __ push(rsi);
4441     __ movptr(key, key_mem);
4442     __ movptr(state, state_mem);
4443 #endif
4444     __ movptr(subkeyHtbl, subkeyH_mem);
4445     __ movptr(avx512_subkeyHtbl, avx512_subkeyH_mem);
4446     __ movptr(counter, counter_mem);
4447 
4448     __ aesgcm_encrypt(in, len, ct, out, key, state, subkeyHtbl, avx512_subkeyHtbl, counter);
4449 
4450     // Restore state before leaving routine
4451 #ifdef _WIN64
4452     __ pop(rsi);
4453 #endif
4454     __ pop(rbx);
4455     __ pop(r15);
4456     __ pop(r14);
4457     __ pop(r13);
4458     __ pop(r12);
4459 
4460     __ leave(); // required for proper stackwalking of RuntimeStub frame
4461     __ ret(0);
4462      return start;
4463   }
4464 
  // This mask is used for incrementing counter values (linc0, linc4, etc.)
4466   address counter_mask_addr() {
4467     __ align64();
4468     StubCodeMark mark(this, "StubRoutines", "counter_mask_addr");
4469     address start = __ pc();
4470     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);//lbswapmask
4471     __ emit_data64(0x0001020304050607, relocInfo::none);
4472     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4473     __ emit_data64(0x0001020304050607, relocInfo::none);
4474     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4475     __ emit_data64(0x0001020304050607, relocInfo::none);
4476     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4477     __ emit_data64(0x0001020304050607, relocInfo::none);
4478     __ emit_data64(0x0000000000000000, relocInfo::none);//linc0 = counter_mask_addr+64
4479     __ emit_data64(0x0000000000000000, relocInfo::none);
4480     __ emit_data64(0x0000000000000001, relocInfo::none);//counter_mask_addr() + 80
4481     __ emit_data64(0x0000000000000000, relocInfo::none);
4482     __ emit_data64(0x0000000000000002, relocInfo::none);
4483     __ emit_data64(0x0000000000000000, relocInfo::none);
4484     __ emit_data64(0x0000000000000003, relocInfo::none);
4485     __ emit_data64(0x0000000000000000, relocInfo::none);
4486     __ emit_data64(0x0000000000000004, relocInfo::none);//linc4 = counter_mask_addr() + 128
4487     __ emit_data64(0x0000000000000000, relocInfo::none);
4488     __ emit_data64(0x0000000000000004, relocInfo::none);
4489     __ emit_data64(0x0000000000000000, relocInfo::none);
4490     __ emit_data64(0x0000000000000004, relocInfo::none);
4491     __ emit_data64(0x0000000000000000, relocInfo::none);
4492     __ emit_data64(0x0000000000000004, relocInfo::none);
4493     __ emit_data64(0x0000000000000000, relocInfo::none);
4494     __ emit_data64(0x0000000000000008, relocInfo::none);//linc8 = counter_mask_addr() + 192
4495     __ emit_data64(0x0000000000000000, relocInfo::none);
4496     __ emit_data64(0x0000000000000008, relocInfo::none);
4497     __ emit_data64(0x0000000000000000, relocInfo::none);
4498     __ emit_data64(0x0000000000000008, relocInfo::none);
4499     __ emit_data64(0x0000000000000000, relocInfo::none);
4500     __ emit_data64(0x0000000000000008, relocInfo::none);
4501     __ emit_data64(0x0000000000000000, relocInfo::none);
4502     __ emit_data64(0x0000000000000020, relocInfo::none);//linc32 = counter_mask_addr() + 256
4503     __ emit_data64(0x0000000000000000, relocInfo::none);
4504     __ emit_data64(0x0000000000000020, relocInfo::none);
4505     __ emit_data64(0x0000000000000000, relocInfo::none);
4506     __ emit_data64(0x0000000000000020, relocInfo::none);
4507     __ emit_data64(0x0000000000000000, relocInfo::none);
4508     __ emit_data64(0x0000000000000020, relocInfo::none);
4509     __ emit_data64(0x0000000000000000, relocInfo::none);
4510     __ emit_data64(0x0000000000000010, relocInfo::none);//linc16 = counter_mask_addr() + 320
4511     __ emit_data64(0x0000000000000000, relocInfo::none);
4512     __ emit_data64(0x0000000000000010, relocInfo::none);
4513     __ emit_data64(0x0000000000000000, relocInfo::none);
4514     __ emit_data64(0x0000000000000010, relocInfo::none);
4515     __ emit_data64(0x0000000000000000, relocInfo::none);
4516     __ emit_data64(0x0000000000000010, relocInfo::none);
4517     __ emit_data64(0x0000000000000000, relocInfo::none);
4518     return start;
4519   }
4520 
4521  // Vector AES Counter implementation
4522   address generate_counterMode_VectorAESCrypt()  {
4523     __ align(CodeEntryAlignment);
4524     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4525     address start = __ pc();
4526     const Register from = c_rarg0; // source array address
4527     const Register to = c_rarg1; // destination array address
4528     const Register key = c_rarg2; // key array address r8
4529     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4530     // and updated with the incremented counter in the end
4531 #ifndef _WIN64
4532     const Register len_reg = c_rarg4;
4533     const Register saved_encCounter_start = c_rarg5;
4534     const Register used_addr = r10;
4535     const Address  used_mem(rbp, 2 * wordSize);
4536     const Register used = r11;
4537 #else
4538     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4539     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
4540     const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4541     const Register len_reg = r10; // pick the first volatile windows register
4542     const Register saved_encCounter_start = r11;
4543     const Register used_addr = r13;
4544     const Register used = r14;
4545 #endif
4546     __ enter();
4547    // Save state before entering routine
4548     __ push(r12);
4549     __ push(r13);
4550     __ push(r14);
4551     __ push(r15);
4552 #ifdef _WIN64
4553     // on win64, fill len_reg from stack position
4554     __ movl(len_reg, len_mem);
4555     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4556     __ movptr(used_addr, used_mem);
4557     __ movl(used, Address(used_addr, 0));
4558 #else
4559     __ push(len_reg); // Save
4560     __ movptr(used_addr, used_mem);
4561     __ movl(used, Address(used_addr, 0));
4562 #endif
4563     __ push(rbx);
4564     __ aesctr_encrypt(from, to, key, counter, len_reg, used, used_addr, saved_encCounter_start);
4565     // Restore state before leaving routine
4566     __ pop(rbx);
4567 #ifdef _WIN64
4568     __ movl(rax, len_mem); // return length
4569 #else
4570     __ pop(rax); // return length
4571 #endif
4572     __ pop(r15);
4573     __ pop(r14);
4574     __ pop(r13);
4575     __ pop(r12);
4576 
4577     __ leave(); // required for proper stackwalking of RuntimeStub frame
4578     __ ret(0);
4579     return start;
4580   }
4581 
4582   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
4583   // to hide instruction latency
4584   //
4585   // Arguments:
4586   //
4587   // Inputs:
4588   //   c_rarg0   - source byte array address
4589   //   c_rarg1   - destination byte array address
4590   //   c_rarg2   - K (key) in little endian int array
4591   //   c_rarg3   - counter vector byte array address
4592   //   Linux
4593   //     c_rarg4   -          input length
4594   //     c_rarg5   -          saved encryptedCounter start
4595   //     rbp + 6 * wordSize - saved used length
4596   //   Windows
4597   //     rbp + 6 * wordSize - input length
4598   //     rbp + 7 * wordSize - saved encryptedCounter start
4599   //     rbp + 8 * wordSize - saved used length
4600   //
4601   // Output:
4602   //   rax       - input length
4603   //
4604   address generate_counterMode_AESCrypt_Parallel() {
4605     assert(UseAES, "need AES instructions and misaligned SSE support");
4606     __ align(CodeEntryAlignment);
4607     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4608     address start = __ pc();
4609     const Register from = c_rarg0; // source array address
4610     const Register to = c_rarg1; // destination array address
4611     const Register key = c_rarg2; // key array address
4612     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4613                                       // and updated with the incremented counter in the end
4614 #ifndef _WIN64
4615     const Register len_reg = c_rarg4;
4616     const Register saved_encCounter_start = c_rarg5;
4617     const Register used_addr = r10;
4618     const Address  used_mem(rbp, 2 * wordSize);
4619     const Register used = r11;
4620 #else
4621     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
    const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4624     const Register len_reg = r10; // pick the first volatile windows register
4625     const Register saved_encCounter_start = r11;
4626     const Register used_addr = r13;
4627     const Register used = r14;
4628 #endif
4629     const Register pos = rax;
4630 
4631     const int PARALLEL_FACTOR = 6;
4632     const XMMRegister xmm_counter_shuf_mask = xmm0;
4633     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
4634     const XMMRegister xmm_curr_counter = xmm2;
4635 
4636     const XMMRegister xmm_key_tmp0 = xmm3;
4637     const XMMRegister xmm_key_tmp1 = xmm4;
4638 
4639     // registers holding the four results in the parallelized loop
4640     const XMMRegister xmm_result0 = xmm5;
4641     const XMMRegister xmm_result1 = xmm6;
4642     const XMMRegister xmm_result2 = xmm7;
4643     const XMMRegister xmm_result3 = xmm8;
4644     const XMMRegister xmm_result4 = xmm9;
4645     const XMMRegister xmm_result5 = xmm10;
4646 
4647     const XMMRegister xmm_from0 = xmm11;
4648     const XMMRegister xmm_from1 = xmm12;
4649     const XMMRegister xmm_from2 = xmm13;
    const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; it must be preserved on Win64
    const XMMRegister xmm_from4 = xmm3;  // reuse xmm3 and xmm4, since xmm_key_tmp0/1 are no longer needed once the input text is loaded
4652     const XMMRegister xmm_from5 = xmm4;
4653 
4654     //for key_128, key_192, key_256
4655     const int rounds[3] = {10, 12, 14};
4656     Label L_exit_preLoop, L_preLoop_start;
4657     Label L_multiBlock_loopTop[3];
4658     Label L_singleBlockLoopTop[3];
4659     Label L__incCounter[3][6]; //for 6 blocks
4660     Label L__incCounter_single[3]; //for single block, key128, key192, key256
4661     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
4662     Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
4663 
4664     Label L_exit;
4665 
4666     __ enter(); // required for proper stackwalking of RuntimeStub frame
4667 
4668 #ifdef _WIN64
4669     // allocate spill slots for r13, r14
4670     enum {
4671         saved_r13_offset,
4672         saved_r14_offset
4673     };
4674     __ subptr(rsp, 2 * wordSize);
4675     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
4676     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
4677 
4678     // on win64, fill len_reg from stack position
4679     __ movl(len_reg, len_mem);
4680     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4681     __ movptr(used_addr, used_mem);
4682     __ movl(used, Address(used_addr, 0));
4683 #else
4684     __ push(len_reg); // Save
4685     __ movptr(used_addr, used_mem);
4686     __ movl(used, Address(used_addr, 0));
4687 #endif
4688 
4689     __ push(rbx); // Save RBX
4690     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
4691     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
4692     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
4693     __ movptr(pos, 0);
4694 
    // Use the partially used encrypted counter from the last invocation
4696     __ BIND(L_preLoop_start);
4697     __ cmpptr(used, 16);
4698     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
4699       __ cmpptr(len_reg, 0);
4700       __ jcc(Assembler::lessEqual, L_exit_preLoop);
4701       __ movb(rbx, Address(saved_encCounter_start, used));
4702       __ xorb(rbx, Address(from, pos));
4703       __ movb(Address(to, pos), rbx);
4704       __ addptr(pos, 1);
4705       __ addptr(used, 1);
4706       __ subptr(len_reg, 1);
4707 
4708     __ jmp(L_preLoop_start);
4709 
4710     __ BIND(L_exit_preLoop);
4711     __ movl(Address(used_addr, 0), used);
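    // 'used' is written back so a later invocation can keep consuming the same saved
    // keystream block; from this point on only whole blocks and the final tail are processed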
4712 
4713     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
4714     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
4715     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4716     __ cmpl(rbx, 52);
4717     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
4718     __ cmpl(rbx, 60);
4719     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
4720 
4721 #define CTR_DoSix(opc, src_reg)                \
4722     __ opc(xmm_result0, src_reg);              \
4723     __ opc(xmm_result1, src_reg);              \
4724     __ opc(xmm_result2, src_reg);              \
4725     __ opc(xmm_result3, src_reg);              \
4726     __ opc(xmm_result4, src_reg);              \
4727     __ opc(xmm_result5, src_reg);
4728 
4729     // k == 0 :  generate code for key_128
4730     // k == 1 :  generate code for key_192
4731     // k == 2 :  generate code for key_256
4732     for (int k = 0; k < 3; ++k) {
4733       //multi blocks starts here
4734       __ align(OptoLoopAlignment);
4735       __ BIND(L_multiBlock_loopTop[k]);
4736       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
4737       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
4738       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4739 
4740       //load, then increase counters
4741       CTR_DoSix(movdqa, xmm_curr_counter);
4742       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
4743       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
4744       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
4745       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
4746       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
4747       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
4748       CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
4749       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
4750 
4751       //load two ROUND_KEYs at a time
4752       for (int i = 1; i < rounds[k]; ) {
4753         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4754         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4755         CTR_DoSix(aesenc, xmm_key_tmp1);
4756         i++;
4757         if (i != rounds[k]) {
4758           CTR_DoSix(aesenc, xmm_key_tmp0);
4759         } else {
4760           CTR_DoSix(aesenclast, xmm_key_tmp0);
4761         }
4762         i++;
4763       }
4764 
4765       // get next PARALLEL_FACTOR blocks into xmm_result registers
4766       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4767       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4768       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4769       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4770       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4771       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4772 
4773       __ pxor(xmm_result0, xmm_from0);
4774       __ pxor(xmm_result1, xmm_from1);
4775       __ pxor(xmm_result2, xmm_from2);
4776       __ pxor(xmm_result3, xmm_from3);
4777       __ pxor(xmm_result4, xmm_from4);
4778       __ pxor(xmm_result5, xmm_from5);
4779 
      // store 6 results into the next 96 bytes of output
4781       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4782       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4783       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4784       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4785       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4786       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4787 
      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance the position past the 6 processed blocks
4789       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4790       __ jmp(L_multiBlock_loopTop[k]);
4791 
4792       // singleBlock starts here
4793       __ align(OptoLoopAlignment);
4794       __ BIND(L_singleBlockLoopTop[k]);
4795       __ cmpptr(len_reg, 0);
4796       __ jcc(Assembler::lessEqual, L_exit);
4797       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4798       __ movdqa(xmm_result0, xmm_curr_counter);
4799       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4800       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4801       __ pxor(xmm_result0, xmm_key_tmp0);
4802       for (int i = 1; i < rounds[k]; i++) {
4803         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4804         __ aesenc(xmm_result0, xmm_key_tmp0);
4805       }
4806       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4807       __ aesenclast(xmm_result0, xmm_key_tmp0);
4808       __ cmpptr(len_reg, AESBlockSize);
4809       __ jcc(Assembler::less, L_processTail_insr[k]);
4810         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4811         __ pxor(xmm_result0, xmm_from0);
4812         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4813         __ addptr(pos, AESBlockSize);
4814         __ subptr(len_reg, AESBlockSize);
4815         __ jmp(L_singleBlockLoopTop[k]);
4816       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4817         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4818         __ testptr(len_reg, 8);
4819         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4820           __ subptr(pos,8);
4821           __ pinsrq(xmm_from0, Address(from, pos), 0);
4822         __ BIND(L_processTail_4_insr[k]);
4823         __ testptr(len_reg, 4);
4824         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4825           __ subptr(pos,4);
4826           __ pslldq(xmm_from0, 4);
4827           __ pinsrd(xmm_from0, Address(from, pos), 0);
4828         __ BIND(L_processTail_2_insr[k]);
4829         __ testptr(len_reg, 2);
4830         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4831           __ subptr(pos, 2);
4832           __ pslldq(xmm_from0, 2);
4833           __ pinsrw(xmm_from0, Address(from, pos), 0);
4834         __ BIND(L_processTail_1_insr[k]);
4835         __ testptr(len_reg, 1);
4836         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4837           __ subptr(pos, 1);
4838           __ pslldq(xmm_from0, 1);
4839           __ pinsrb(xmm_from0, Address(from, pos), 0);
4840         __ BIND(L_processTail_exit_insr[k]);
4841 
4842         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Perform pxor of the encrypted counter and plaintext Bytes.
4843         __ pxor(xmm_result0, xmm_from0);                             //    Also the encrypted counter is saved for next invocation.
4844 
4845         __ testptr(len_reg, 8);
4846         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4847           __ pextrq(Address(to, pos), xmm_result0, 0);
4848           __ psrldq(xmm_result0, 8);
4849           __ addptr(pos, 8);
4850         __ BIND(L_processTail_4_extr[k]);
4851         __ testptr(len_reg, 4);
4852         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4853           __ pextrd(Address(to, pos), xmm_result0, 0);
4854           __ psrldq(xmm_result0, 4);
4855           __ addptr(pos, 4);
4856         __ BIND(L_processTail_2_extr[k]);
4857         __ testptr(len_reg, 2);
4858         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4859           __ pextrw(Address(to, pos), xmm_result0, 0);
4860           __ psrldq(xmm_result0, 2);
4861           __ addptr(pos, 2);
4862         __ BIND(L_processTail_1_extr[k]);
4863         __ testptr(len_reg, 1);
4864         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4865           __ pextrb(Address(to, pos), xmm_result0, 0);
4866 
4867         __ BIND(L_processTail_exit_extr[k]);
4868         __ movl(Address(used_addr, 0), len_reg);
4869         __ jmp(L_exit);
4870 
4871     }
4872 
4873     __ BIND(L_exit);
4874     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4875     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4876     __ pop(rbx); // pop the saved RBX.
4877 #ifdef _WIN64
4878     __ movl(rax, len_mem);
4879     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4880     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4881     __ addptr(rsp, 2 * wordSize);
4882 #else
4883     __ pop(rax); // return 'len'
4884 #endif
4885     __ leave(); // required for proper stackwalking of RuntimeStub frame
4886     __ ret(0);
4887     return start;
4888   }
4889 
4890 void roundDec(XMMRegister xmm_reg) {
4891   __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4892   __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4893   __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4894   __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4895   __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4896   __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4897   __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4898   __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4899 }
4900 
4901 void roundDeclast(XMMRegister xmm_reg) {
4902   __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4903   __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4904   __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4905   __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4906   __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4907   __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4908   __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4909   __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4910 }
4911 
4912   void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
4913     __ movdqu(xmmdst, Address(key, offset));
4914     if (xmm_shuf_mask != NULL) {
4915       __ pshufb(xmmdst, xmm_shuf_mask);
4916     } else {
4917       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4918     }
4919     __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
4920 
4921   }
4922 
4923 address generate_cipherBlockChaining_decryptVectorAESCrypt() {
    assert(VM_Version::supports_avx512_vaes(), "need vector AES (VAES) instruction support");
4925     __ align(CodeEntryAlignment);
4926     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4927     address start = __ pc();
4928 
4929     const Register from = c_rarg0;  // source array address
4930     const Register to = c_rarg1;  // destination array address
4931     const Register key = c_rarg2;  // key array address
4932     const Register rvec = c_rarg3;  // r byte array initialized from initvector array address
4933     // and left with the results of the last encryption block
4934 #ifndef _WIN64
4935     const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
4936 #else
4937     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
4938     const Register len_reg = r11;      // pick the volatile windows register
4939 #endif
4940 
4941     Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
4942           Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
4943 
4944     __ enter();
4945 
4946 #ifdef _WIN64
4947   // on win64, fill len_reg from stack position
4948     __ movl(len_reg, len_mem);
4949 #else
4950     __ push(len_reg); // Save
4951 #endif
4952     __ push(rbx);
4953     __ vzeroupper();
4954 
4955     // Temporary variable declaration for swapping key bytes
4956     const XMMRegister xmm_key_shuf_mask = xmm1;
4957     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4958 
    // Determine the expanded key length in ints: 44 for 10 rounds (128-bit), 52 for 12 rounds (192-bit), 60 for 14 rounds (256-bit)
4960     const Register rounds = rbx;
4961     __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4962 
4963     const XMMRegister IV = xmm0;
4964     // Load IV and broadcast value to 512-bits
4965     __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
4966 
4967     // Temporary variables for storing round keys
4968     const XMMRegister RK0 = xmm30;
4969     const XMMRegister RK1 = xmm9;
4970     const XMMRegister RK2 = xmm18;
4971     const XMMRegister RK3 = xmm19;
4972     const XMMRegister RK4 = xmm20;
4973     const XMMRegister RK5 = xmm21;
4974     const XMMRegister RK6 = xmm22;
4975     const XMMRegister RK7 = xmm23;
4976     const XMMRegister RK8 = xmm24;
4977     const XMMRegister RK9 = xmm25;
4978     const XMMRegister RK10 = xmm26;
4979 
4980      // Load and shuffle key
4981     // the java expanded key ordering is rotated one position from what we want
4982     // so we start from 1*16 here and hit 0*16 last
4983     ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
4984     ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
4985     ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
4986     ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
4987     ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
4988     ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
4989     ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
4990     ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
4991     ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
4992     ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
4993     ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
4994 
4995     // Variables for storing source cipher text
4996     const XMMRegister S0 = xmm10;
4997     const XMMRegister S1 = xmm11;
4998     const XMMRegister S2 = xmm12;
4999     const XMMRegister S3 = xmm13;
5000     const XMMRegister S4 = xmm14;
5001     const XMMRegister S5 = xmm15;
5002     const XMMRegister S6 = xmm16;
5003     const XMMRegister S7 = xmm17;
5004 
5005     // Variables for storing decrypted text
5006     const XMMRegister B0 = xmm1;
5007     const XMMRegister B1 = xmm2;
5008     const XMMRegister B2 = xmm3;
5009     const XMMRegister B3 = xmm4;
5010     const XMMRegister B4 = xmm5;
5011     const XMMRegister B5 = xmm6;
5012     const XMMRegister B6 = xmm7;
5013     const XMMRegister B7 = xmm8;
5014 
5015     __ cmpl(rounds, 44);
5016     __ jcc(Assembler::greater, KEY_192);
5017     __ jmp(Loop);
5018 
5019     __ BIND(KEY_192);
5020     const XMMRegister RK11 = xmm27;
5021     const XMMRegister RK12 = xmm28;
5022     ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
5023     ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
5024 
5025     __ cmpl(rounds, 52);
5026     __ jcc(Assembler::greater, KEY_256);
5027     __ jmp(Loop);
5028 
5029     __ BIND(KEY_256);
5030     const XMMRegister RK13 = xmm29;
5031     const XMMRegister RK14 = xmm31;
5032     ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
5033     ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
5034 
5035     __ BIND(Loop);
5036     __ cmpl(len_reg, 512);
5037     __ jcc(Assembler::below, Lcbc_dec_rem);
5038     __ BIND(Loop1);
5039     __ subl(len_reg, 512);
5040     __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
5041     __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
5042     __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
5043     __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
5044     __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
5045     __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
5046     __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
5047     __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
5048     __ leaq(from, Address(from, 8 * 64));
5049 
5050     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5051     __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
5052     __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
5053     __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
5054     __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
5055     __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
5056     __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
5057     __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
5058 
5059     __ evalignq(IV, S0, IV, 0x06);
5060     __ evalignq(S0, S1, S0, 0x06);
5061     __ evalignq(S1, S2, S1, 0x06);
5062     __ evalignq(S2, S3, S2, 0x06);
5063     __ evalignq(S3, S4, S3, 0x06);
5064     __ evalignq(S4, S5, S4, 0x06);
5065     __ evalignq(S5, S6, S5, 0x06);
5066     __ evalignq(S6, S7, S6, 0x06);
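    // the evalignq chain shifts the ciphertext stream right by one 128-bit block: each lane of
    // IV, S0..S6 now holds the ciphertext block preceding the corresponding lane of B0..B7
    // (IV contributes the block that precedes S0), ready for the CBC xor in Loop2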
5067 
5068     roundDec(RK2);
5069     roundDec(RK3);
5070     roundDec(RK4);
5071     roundDec(RK5);
5072     roundDec(RK6);
5073     roundDec(RK7);
5074     roundDec(RK8);
5075     roundDec(RK9);
5076     roundDec(RK10);
5077 
5078     __ cmpl(rounds, 44);
5079     __ jcc(Assembler::belowEqual, L_128);
5080     roundDec(RK11);
5081     roundDec(RK12);
5082 
5083     __ cmpl(rounds, 52);
5084     __ jcc(Assembler::belowEqual, L_192);
5085     roundDec(RK13);
5086     roundDec(RK14);
5087 
5088     __ BIND(L_256);
5089     roundDeclast(RK0);
5090     __ jmp(Loop2);
5091 
5092     __ BIND(L_128);
5093     roundDeclast(RK0);
5094     __ jmp(Loop2);
5095 
5096     __ BIND(L_192);
5097     roundDeclast(RK0);
5098 
5099     __ BIND(Loop2);
5100     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5101     __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
5102     __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
5103     __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
5104     __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
5105     __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
5106     __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
5107     __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
5108     __ evmovdquq(IV, S7, Assembler::AVX_512bit);
5109 
5110     __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
5111     __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
5112     __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
5113     __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
5114     __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
5115     __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
5116     __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
5117     __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
5118     __ leaq(to, Address(to, 8 * 64));
5119     __ jmp(Loop);
5120 
5121     __ BIND(Lcbc_dec_rem);
5122     __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
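    // move the most recent previous-ciphertext block (the top 128-bit lane of IV) into lane 0
    // for the single-block remainder loop below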
5123 
5124     __ BIND(Lcbc_dec_rem_loop);
5125     __ subl(len_reg, 16);
5126     __ jcc(Assembler::carrySet, Lcbc_dec_ret);
5127 
5128     __ movdqu(S0, Address(from, 0));
5129     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5130     __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
5131     __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
5132     __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
5133     __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
5134     __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
5135     __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
5136     __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
5137     __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
5138     __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
5139     __ cmpl(rounds, 44);
5140     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5141 
5142     __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
5143     __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
5144     __ cmpl(rounds, 52);
5145     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5146 
5147     __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
5148     __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
5149 
5150     __ BIND(Lcbc_dec_rem_last);
5151     __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
5152 
5153     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5154     __ evmovdquq(IV, S0, Assembler::AVX_512bit);
5155     __ movdqu(Address(to, 0), B0);
5156     __ leaq(from, Address(from, 16));
5157     __ leaq(to, Address(to, 16));
5158     __ jmp(Lcbc_dec_rem_loop);
5159 
5160     __ BIND(Lcbc_dec_ret);
5161     __ movdqu(Address(rvec, 0), IV);
5162 
5163     // Zero out the round keys
5164     __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
5165     __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
5166     __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
5167     __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
5168     __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
5169     __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
5170     __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
5171     __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
5172     __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
5173     __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
5174     __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
5175     __ cmpl(rounds, 44);
5176     __ jcc(Assembler::belowEqual, Lcbc_exit);
5177     __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
5178     __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
5179     __ cmpl(rounds, 52);
5180     __ jcc(Assembler::belowEqual, Lcbc_exit);
5181     __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
5182     __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
5183 
5184     __ BIND(Lcbc_exit);
5185     __ pop(rbx);
5186 #ifdef _WIN64
5187     __ movl(rax, len_mem);
5188 #else
5189     __ pop(rax); // return length
5190 #endif
5191     __ leave(); // required for proper stackwalking of RuntimeStub frame
5192     __ ret(0);
5193     return start;
5194 }
5195 
5196 // Polynomial x^128+x^127+x^126+x^121+1
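// (the bit-reflected form of the GCM field polynomial x^128 + x^7 + x^2 + x + 1; the two
// quadwords emitted below encode it for the reflected carry-less-multiply reduction)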
5197 address ghash_polynomial_addr() {
5198     __ align(CodeEntryAlignment);
5199     StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
5200     address start = __ pc();
5201     __ emit_data64(0x0000000000000001, relocInfo::none);
5202     __ emit_data64(0xc200000000000000, relocInfo::none);
5203     return start;
5204 }
5205 
5206 address ghash_shufflemask_addr() {
5207     __ align(CodeEntryAlignment);
5208     StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
5209     address start = __ pc();
5210     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5211     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5212     return start;
5213 }
5214 
5215 // Ghash single and multi block operations using AVX instructions
5216 address generate_avx_ghash_processBlocks() {
5217     __ align(CodeEntryAlignment);
5218 
5219     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5220     address start = __ pc();
5221 
5222     // arguments
5223     const Register state = c_rarg0;
5224     const Register htbl = c_rarg1;
5225     const Register data = c_rarg2;
5226     const Register blocks = c_rarg3;
5227     __ enter();
5228    // Save state before entering routine
5229     __ avx_ghash(state, htbl, data, blocks);
5230     __ leave(); // required for proper stackwalking of RuntimeStub frame
5231     __ ret(0);
5232     return start;
5233 }
5234 
5235   // byte swap x86 long
5236   address generate_ghash_long_swap_mask() {
5237     __ align(CodeEntryAlignment);
5238     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
5239     address start = __ pc();
5240     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
5241     __ emit_data64(0x0706050403020100, relocInfo::none );
5242   return start;
5243   }
5244 
5245   // byte swap x86 byte array
5246   address generate_ghash_byte_swap_mask() {
5247     __ align(CodeEntryAlignment);
5248     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
5249     address start = __ pc();
5250     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
5251     __ emit_data64(0x0001020304050607, relocInfo::none );
5252     return start;
5253   }
5254 
5255   /* Single and multi-block ghash operations */
5256   address generate_ghash_processBlocks() {
5257     __ align(CodeEntryAlignment);
5258     Label L_ghash_loop, L_exit;
5259     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5260     address start = __ pc();
5261 
5262     const Register state        = c_rarg0;
5263     const Register subkeyH      = c_rarg1;
5264     const Register data         = c_rarg2;
5265     const Register blocks       = c_rarg3;
5266 
5267     const XMMRegister xmm_temp0 = xmm0;
5268     const XMMRegister xmm_temp1 = xmm1;
5269     const XMMRegister xmm_temp2 = xmm2;
5270     const XMMRegister xmm_temp3 = xmm3;
5271     const XMMRegister xmm_temp4 = xmm4;
5272     const XMMRegister xmm_temp5 = xmm5;
5273     const XMMRegister xmm_temp6 = xmm6;
5274     const XMMRegister xmm_temp7 = xmm7;
5275     const XMMRegister xmm_temp8 = xmm8;
5276     const XMMRegister xmm_temp9 = xmm9;
5277     const XMMRegister xmm_temp10 = xmm10;
5278 
5279     __ enter();
5280 
5281     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
5282 
5283     __ movdqu(xmm_temp0, Address(state, 0));
5284     __ pshufb(xmm_temp0, xmm_temp10);
5285 
5286 
5287     __ BIND(L_ghash_loop);
5288     __ movdqu(xmm_temp2, Address(data, 0));
5289     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
5290 
5291     __ movdqu(xmm_temp1, Address(subkeyH, 0));
5292     __ pshufb(xmm_temp1, xmm_temp10);
5293 
5294     __ pxor(xmm_temp0, xmm_temp2);
5295 
5296     //
5297     // Multiply with the hash key
5298     //
5299     __ movdqu(xmm_temp3, xmm_temp0);
5300     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
5301     __ movdqu(xmm_temp4, xmm_temp0);
5302     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
5303 
5304     __ movdqu(xmm_temp5, xmm_temp0);
5305     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
5306     __ movdqu(xmm_temp6, xmm_temp0);
5307     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
5308 
5309     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
5310 
5311     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
5312     __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
5313     __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
5314     __ pxor(xmm_temp3, xmm_temp5);
5315     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
5316                                         // of the carry-less multiplication of
5317                                         // xmm0 by xmm1.
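         // Note: in polynomial terms the four pclmulqdq products above are
         // the schoolbook expansion
         //   (a1*x^64 + a0) * (b1*x^64 + b0)
         //     = a1*b1*x^128 + (a1*b0 xor a0*b1)*x^64 + a0*b0
         // xmm4 held the middle term; the two 64-bit byte shifts just above
         // fold its high half into xmm6 and its low half into the upper
         // half of xmm3.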
5318 
5319     // We shift the result of the multiplication by one bit position
5320     // to the left to compensate for the fact that the bits are reversed.
5321     __ movdqu(xmm_temp7, xmm_temp3);
5322     __ movdqu(xmm_temp8, xmm_temp6);
5323     __ pslld(xmm_temp3, 1);
5324     __ pslld(xmm_temp6, 1);
5325     __ psrld(xmm_temp7, 31);
5326     __ psrld(xmm_temp8, 31);
5327     __ movdqu(xmm_temp9, xmm_temp7);
5328     __ pslldq(xmm_temp8, 4);
5329     __ pslldq(xmm_temp7, 4);
5330     __ psrldq(xmm_temp9, 12);
5331     __ por(xmm_temp3, xmm_temp7);
5332     __ por(xmm_temp6, xmm_temp8);
5333     __ por(xmm_temp6, xmm_temp9);
5334 
5335     //
5336     // First phase of the reduction
5337     //
5338     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
5339     // independently.
5340     __ movdqu(xmm_temp7, xmm_temp3);
5341     __ movdqu(xmm_temp8, xmm_temp3);
5342     __ movdqu(xmm_temp9, xmm_temp3);
5343     __ pslld(xmm_temp7, 31);    // packed left shift, << 31
5344     __ pslld(xmm_temp8, 30);    // packed left shift, << 30
5345     __ pslld(xmm_temp9, 25);    // packed left shift, << 25
5346     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
5347     __ pxor(xmm_temp7, xmm_temp9);
5348     __ movdqu(xmm_temp8, xmm_temp7);
5349     __ pslldq(xmm_temp7, 12);
5350     __ psrldq(xmm_temp8, 4);
5351     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
5352 
5353     //
5354     // Second phase of the reduction
5355     //
5356     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
5357     // shift operations.
5358     __ movdqu(xmm_temp2, xmm_temp3);
5359     __ movdqu(xmm_temp4, xmm_temp3);
5360     __ movdqu(xmm_temp5, xmm_temp3);
5361     __ psrld(xmm_temp2, 1);     // packed right shift, >> 1
5362     __ psrld(xmm_temp4, 2);     // packed right shift, >> 2
5363     __ psrld(xmm_temp5, 7);     // packed right shift, >> 7
5364     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
5365     __ pxor(xmm_temp2, xmm_temp5);
5366     __ pxor(xmm_temp2, xmm_temp8);
5367     __ pxor(xmm_temp3, xmm_temp2);
5368     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
5369 
5370     __ decrement(blocks);
5371     __ jcc(Assembler::zero, L_exit);
5372     __ movdqu(xmm_temp0, xmm_temp6);
5373     __ addptr(data, 16);
5374     __ jmp(L_ghash_loop);
5375 
5376     __ BIND(L_exit);
5377     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
5378     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
5379     __ leave();
5380     __ ret(0);
5381     return start;
5382   }
5383 
5384   address base64_shuffle_addr()
5385   {
5386     __ align64();
5387     StubCodeMark mark(this, "StubRoutines", "shuffle_base64");
5388     address start = __ pc();
5389     assert(((unsigned long long)start & 0x3f) == 0,
5390            "Alignment problem (0x%08llx)", (unsigned long long)start);
5391     __ emit_data64(0x0405030401020001, relocInfo::none);
5392     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5393     __ emit_data64(0x10110f100d0e0c0d, relocInfo::none);
5394     __ emit_data64(0x1617151613141213, relocInfo::none);
5395     __ emit_data64(0x1c1d1b1c191a1819, relocInfo::none);
5396     __ emit_data64(0x222321221f201e1f, relocInfo::none);
5397     __ emit_data64(0x2829272825262425, relocInfo::none);
5398     __ emit_data64(0x2e2f2d2e2b2c2a2b, relocInfo::none);
5399     return start;
5400   }
5401 
5402   address base64_avx2_shuffle_addr()
5403   {
5404     __ align32();
5405     StubCodeMark mark(this, "StubRoutines", "avx2_shuffle_base64");
5406     address start = __ pc();
5407     __ emit_data64(0x0809070805060405, relocInfo::none);
5408     __ emit_data64(0x0e0f0d0e0b0c0a0b, relocInfo::none);
5409     __ emit_data64(0x0405030401020001, relocInfo::none);
5410     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5411     return start;
5412   }
5413 
5414   address base64_avx2_input_mask_addr()
5415   {
5416     __ align32();
5417     StubCodeMark mark(this, "StubRoutines", "avx2_input_mask_base64");
5418     address start = __ pc();
5419     __ emit_data64(0x8000000000000000, relocInfo::none);
5420     __ emit_data64(0x8000000080000000, relocInfo::none);
5421     __ emit_data64(0x8000000080000000, relocInfo::none);
5422     __ emit_data64(0x8000000080000000, relocInfo::none);
5423     return start;
5424   }
5425 
5426   address base64_avx2_lut_addr()
5427   {
5428     __ align32();
5429     StubCodeMark mark(this, "StubRoutines", "avx2_lut_base64");
5430     address start = __ pc();
5431     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5432     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5433     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5434     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5435 
5436     // URL LUT
5437     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5438     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5439     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5440     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5441     return start;
5442   }
5443 
5444   address base64_encoding_table_addr()
5445   {
5446     __ align64();
5447     StubCodeMark mark(this, "StubRoutines", "encoding_table_base64");
5448     address start = __ pc();
5449     assert(((unsigned long long)start & 0x3f) == 0, "Alignment problem (0x%08llx)", (unsigned long long)start);
5450     __ emit_data64(0x4847464544434241, relocInfo::none);
5451     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5452     __ emit_data64(0x5857565554535251, relocInfo::none);
5453     __ emit_data64(0x6665646362615a59, relocInfo::none);
5454     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5455     __ emit_data64(0x767574737271706f, relocInfo::none);
5456     __ emit_data64(0x333231307a797877, relocInfo::none);
5457     __ emit_data64(0x2f2b393837363534, relocInfo::none);
5458 
5459     // URL table
5460     __ emit_data64(0x4847464544434241, relocInfo::none);
5461     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5462     __ emit_data64(0x5857565554535251, relocInfo::none);
5463     __ emit_data64(0x6665646362615a59, relocInfo::none);
5464     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5465     __ emit_data64(0x767574737271706f, relocInfo::none);
5466     __ emit_data64(0x333231307a797877, relocInfo::none);
5467     __ emit_data64(0x5f2d393837363534, relocInfo::none);
5468     return start;
5469   }
5470 
5471   // Code for generating Base64 encoding.
5472   // Intrinsic function prototype in Base64.java:
5473   // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp,
5474   // boolean isURL) {
5475   address generate_base64_encodeBlock()
5476   {
5477     __ align(CodeEntryAlignment);
5478     StubCodeMark mark(this, "StubRoutines", "implEncode");
5479     address start = __ pc();
5480     __ enter();
5481 
5482     // Save callee-saved registers before using them
5483     __ push(r12);
5484     __ push(r13);
5485     __ push(r14);
5486     __ push(r15);
5487 
5488     // arguments
5489     const Register source = c_rarg0;       // Source Array
5490     const Register start_offset = c_rarg1; // start offset
5491     const Register end_offset = c_rarg2;   // end offset
5492     const Register dest = c_rarg3;   // destination array
5493 
5494 #ifndef _WIN64
5495     const Register dp = c_rarg4;    // Position for writing to dest array
5496     const Register isURL = c_rarg5; // Base64 or URL character set
5497 #else
5498     const Address dp_mem(rbp, 6 * wordSize); // dp is passed on the stack on Win64
5499     const Address isURL_mem(rbp, 7 * wordSize);
5500     const Register isURL = r10; // pick the volatile windows register
5501     const Register dp = r12;
5502     __ movl(dp, dp_mem);
5503     __ movl(isURL, isURL_mem);
5504 #endif
5505 
5506     const Register length = r14;
5507     const Register encode_table = r13;
5508     Label L_process3, L_exit, L_processdata, L_vbmiLoop, L_not512, L_32byteLoop;
5509 
5510     // calculate length from offsets
5511     __ movl(length, end_offset);
5512     __ subl(length, start_offset);
5513     __ cmpl(length, 0);
5514     __ jcc(Assembler::lessEqual, L_exit);
5515 
5516     // Code for 512-bit VBMI encoding.  Encodes 48 input bytes into 64
5517     // output bytes. We read 64 input bytes and ignore the last 16, so be
5518     // sure not to read past the end of the input buffer.
5519     if (VM_Version::supports_avx512_vbmi()) {
5520       __ cmpl(length, 64); // Do not overrun input buffer.
5521       __ jcc(Assembler::below, L_not512);
5522 
5523       __ shll(isURL, 6); // index into decode table based on isURL
5524       __ lea(encode_table, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5525       __ addptr(encode_table, isURL);
5526       __ shrl(isURL, 6); // restore isURL
5527 
5528       __ mov64(rax, 0x3036242a1016040aull); // Shifts
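           // Note on the constant above: its control bytes are the bit
           // offsets 10, 4, 22, 16 (and the same plus 32 for the second
           // 3-byte group in each lane).  vpmultishiftqb extracts an 8-bit
           // field starting at each offset; together with the byte shuffle
           // done by vpermb below, this isolates the four 6-bit values of
           // every 3-byte group.  The two garbage high bits of each
           // extracted byte are ignored by the vpermb table lookup.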
5529       __ evmovdquq(xmm3, ExternalAddress(StubRoutines::x86::base64_shuffle_addr()), Assembler::AVX_512bit, r15);
5530       __ evmovdquq(xmm2, Address(encode_table, 0), Assembler::AVX_512bit);
5531       __ evpbroadcastq(xmm1, rax, Assembler::AVX_512bit);
5532 
5533       __ align32();
5534       __ BIND(L_vbmiLoop);
5535 
5536       __ vpermb(xmm0, xmm3, Address(source, start_offset), Assembler::AVX_512bit);
5537       __ subl(length, 48);
5538 
5539       // Put the input bytes into the proper lanes for writing, then
5540       // encode them.
5541       __ evpmultishiftqb(xmm0, xmm1, xmm0, Assembler::AVX_512bit);
5542       __ vpermb(xmm0, xmm0, xmm2, Assembler::AVX_512bit);
5543 
5544       // Write to destination
5545       __ evmovdquq(Address(dest, dp), xmm0, Assembler::AVX_512bit);
5546 
5547       __ addptr(dest, 64);
5548       __ addptr(source, 48);
5549       __ cmpl(length, 64);
5550       __ jcc(Assembler::aboveEqual, L_vbmiLoop);
5551 
5552       __ vzeroupper();
5553     }
5554 
5555     __ BIND(L_not512);
5556     if (VM_Version::supports_avx2()
5557         && VM_Version::supports_avx512vlbw()) {
5558       /*
5559       ** This AVX2 encoder is based on the paper at:
5560       **      https://dl.acm.org/doi/10.1145/3132709
5561       **
5562       ** We use AVX2 SIMD instructions to encode 24 bytes into 32
5563       ** output bytes.
5564       **
5565       */
5566       // Lengths under 32 bytes are done with the scalar routine
5567       __ cmpl(length, 31);
5568       __ jcc(Assembler::belowEqual, L_process3);
5569 
5570       // Set up supporting constant table data
5571       __ vmovdqu(xmm9, ExternalAddress(StubRoutines::x86::base64_avx2_shuffle_addr()), rax);
5572       // 6-bit mask for 2nd and 4th (and multiples) 6-bit values
5573       __ movl(rax, 0x0fc0fc00);
5574       __ vmovdqu(xmm1, ExternalAddress(StubRoutines::x86::base64_avx2_input_mask_addr()), rax);
5575       __ evpbroadcastd(xmm8, rax, Assembler::AVX_256bit);
5576 
5577       // Multiplication constant for "shifting" right by 6 and 10
5578       // bits
5579       __ movl(rax, 0x04000040);
5580 
5581       __ subl(length, 24);
5582       __ evpbroadcastd(xmm7, rax, Assembler::AVX_256bit);
5583 
5584       // For the first load, we mask off reading of the first 4
5585       // bytes into the register. This is so we can get 4 3-byte
5586       // chunks into each lane of the register, avoiding having to
5587       // handle end conditions.  We then shuffle these bytes into a
5588       // specific order so that manipulation is easier.
5589       //
5590       // The initial read loads the XMM register like this:
5591       //
5592       // Lower 128-bit lane:
5593       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5594       // | XX | XX | XX | XX | A0 | A1 | A2 | B0 | B1 | B2 | C0 | C1
5595       // | C2 | D0 | D1 | D2 |
5596       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5597       //
5598       // Upper 128-bit lane:
5599       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5600       // | E0 | E1 | E2 | F0 | F1 | F2 | G0 | G1 | G2 | H0 | H1 | H2
5601       // | XX | XX | XX | XX |
5602       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5603       //
5604       // Where A0 is the first input byte, B0 is the fourth, etc.
5605       // Each letter denotes a 3-byte group that is consumed and
5606       // encoded into 4 output bytes.
5607       //
5608       // We then shuffle the register so each 32-bit word contains
5609       // the sequence:
5610       //    A1 A0 A2 A1, B1, B0, B2, B1, etc.
5611       // Each of these byte sequences is then manipulated into 4
5612       // 6-bit values ready for encoding.
5613       //
5614       // If we focus on one set of 3-byte chunks, changing the
5615       // nomenclature such that A0 => a, A1 => b, and A2 => c, we
5616       // shuffle such that each 24-bit chunk contains:
5617       //
5618       // b7 b6 b5 b4 b3 b2 b1 b0 | a7 a6 a5 a4 a3 a2 a1 a0 | c7 c6
5619       // c5 c4 c3 c2 c1 c0 | b7 b6 b5 b4 b3 b2 b1 b0
5620       // Relabeling the same bits by the four 6-bit output values (a..d) to be extracted:
5621       // b3 b2 b1 b0 c5 c4 c3 c2 | c1 c0 d5 d4 d3 d2 d1 d0 | a5 a4
5622       // a3 a2 a1 a0 b5 b4 | b3 b2 b1 b0 c5 c4 c3 c2
5623       //
5624       // We first AND off all but bits 4-9 and 16-21 (c5..c0 and
5625       // a5..a0) and shift them using a vector multiplication
5626       // operation (vpmulhuw) which effectively shifts c right by 6
5627       // bits and a right by 10 bits.  We similarly mask bits 10-15
5628       // (d5..d0) and 22-27 (b5..b0) and shift them left by 8 and 4
5629       // bits respectively.  This is done using vpmullw.  We end up
5630       // with 4 6-bit values, thus splitting the 3 input bytes,
5631       // ready for encoding:
5632       //    0 0 d5..d0 0 0 c5..c0 0 0 b5..b0 0 0 a5..a0
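           // (The multiplications stand in for shifts: vpmulhuw keeps only
           // the high 16 bits of each 16-bit product, so multiplying by
           // 0x0040 or 0x0400 is a logical right shift by 10 or 6, while
           // vpmullw keeps the low 16 bits, so multiplying by 0x0010 or
           // 0x0100 is a left shift by 4 or 8.)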
5633       //
5634       // For translation, we recognize that there are 5 distinct
5635       // ranges of legal Base64 characters as below:
5636       //
5637       //   +-------------+-------------+------------+
5638       //   | 6-bit value | ASCII range |   offset   |
5639       //   +-------------+-------------+------------+
5640       //   |    0..25    |    A..Z     |     65     |
5641       //   |   26..51    |    a..z     |     71     |
5642       //   |   52..61    |    0..9     |     -4     |
5643       //   |     62      |   + or -    | -19 or -17 |
5644       //   |     63      |   / or _    | -16 or 32  |
5645       //   +-------------+-------------+------------+
5646       //
5647       // We note that vpshufb does a parallel lookup in a
5648       // destination register using the lower 4 bits of bytes from a
5649       // source register.  If we use a saturated subtraction and
5650       // subtract 51 from each 6-bit value, bytes from [0,51]
5651       // saturate to 0, and [52,63] map to a range of [1,12].  We
5652       // distinguish the [0,25] and [26,51] ranges by adding 1 to every
5653       // value greater than 25 (a byte compare produces an all-ones mask
5654       // for those lanes, which is then subtracted).  We end up with:
5655       //
5656       //   +-------------+-------------+------------+
5657       //   | 6-bit value |   Reduced   |   offset   |
5658       //   +-------------+-------------+------------+
5659       //   |    0..25    |      0      |     65     |
5660       //   |   26..51    |      1      |     71     |
5661       //   |   52..61    |    2..11    |     -4     |
5662       //   |     62      |     12      | -19 or -17 |
5663       //   |     63      |     13      | -16 or 32  |
5664       //   +-------------+-------------+------------+
5665       //
5666       // We then use a final vpshufb to add the appropriate offset,
5667       // translating the bytes.
5668       //
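           // Worked example: the 6-bit value 33 saturating-subtracts 51 to
           // give 0, is greater than 25 so 1 is added, and index 1 selects
           // offset 71 from the lookup table, giving 33 + 71 = 104 = 'h'.
           // Likewise 62 reduces to index 12, selecting -19 ('+') or
           // -17 ('-') depending on the character set.
           //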
5669       // Load input bytes - only 28 bytes.  Mask the first load to
5670       // not load into the full register.
5671       __ vpmaskmovd(xmm1, xmm1, Address(source, start_offset, Address::times_1, -4), Assembler::AVX_256bit);
5672 
5673       // Move 3-byte chunks of input (12 bytes) into 16 bytes,
5674       // ordering by:
5675       //   1, 0, 2, 1; 4, 3, 5, 4; etc.  This groups 6-bit chunks
5676       //   for easy masking
5677       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5678 
5679       __ addl(start_offset, 24);
5680 
5681       // Load masking register for first and third (and multiples)
5682       // 6-bit values.
5683       __ movl(rax, 0x003f03f0);
5684       __ evpbroadcastd(xmm6, rax, Assembler::AVX_256bit);
5685       // Multiplication constant for "shifting" left by 4 and 8 bits
5686       __ movl(rax, 0x01000010);
5687       __ evpbroadcastd(xmm5, rax, Assembler::AVX_256bit);
5688 
5689       // Isolate 6-bit chunks of interest
5690       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5691 
5692       // Load constants for encoding
5693       __ movl(rax, 0x19191919);
5694       __ evpbroadcastd(xmm3, rax, Assembler::AVX_256bit);
5695       __ movl(rax, 0x33333333);
5696       __ evpbroadcastd(xmm4, rax, Assembler::AVX_256bit);
5697 
5698       // Shift output bytes 0 and 2 into proper lanes
5699       __ vpmulhuw(xmm2, xmm0, xmm7, Assembler::AVX_256bit);
5700 
5701       // Mask and shift output bytes 1 and 3 into proper lanes and
5702       // combine
5703       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5704       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5705       __ vpor(xmm0, xmm0, xmm2, Assembler::AVX_256bit);
5706 
5707       // Find out which are 0..25.  This indicates which input
5708       // values fall in the range of 'A'-'Z', which require an
5709       // additional offset (see comments above)
5710       __ vpcmpgtb(xmm2, xmm0, xmm3, Assembler::AVX_256bit);
5711       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5712       __ vpsubb(xmm1, xmm1, xmm2, Assembler::AVX_256bit);
5713 
5714       // Load the proper lookup table
5715       __ lea(r11, ExternalAddress(StubRoutines::x86::base64_avx2_lut_addr()));
5716       __ movl(r15, isURL);
5717       __ shll(r15, 5);
5718       __ vmovdqu(xmm2, Address(r11, r15));
5719 
5720       // Shuffle the offsets based on the range calculation done
5721       // above. This allows us to add the correct offset to the
5722       // 6-bit value corresponding to the range documented above.
5723       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5724       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5725 
5726       // Store the encoded bytes
5727       __ vmovdqu(Address(dest, dp), xmm0);
5728       __ addl(dp, 32);
5729 
5730       __ cmpl(length, 31);
5731       __ jcc(Assembler::belowEqual, L_process3);
5732 
5733       __ align32();
5734       __ BIND(L_32byteLoop);
5735 
5736       // Get next 32 bytes
5737       __ vmovdqu(xmm1, Address(source, start_offset, Address::times_1, -4));
5738 
5739       __ subl(length, 24);
5740       __ addl(start_offset, 24);
5741 
5742       // This logic is identical to the above, with only constant
5743       // register loads removed.  Shuffle the input, mask off 6-bit
5744       // chunks, shift them into place, then add the offset to
5745       // encode.
5746       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5747 
5748       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5749       __ vpmulhuw(xmm10, xmm0, xmm7, Assembler::AVX_256bit);
5750       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5751       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5752       __ vpor(xmm0, xmm0, xmm10, Assembler::AVX_256bit);
5753       __ vpcmpgtb(xmm10, xmm0, xmm3, Assembler::AVX_256bit);
5754       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5755       __ vpsubb(xmm1, xmm1, xmm10, Assembler::AVX_256bit);
5756       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5757       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5758 
5759       // Store the encoded bytes
5760       __ vmovdqu(Address(dest, dp), xmm0);
5761       __ addl(dp, 32);
5762 
5763       __ cmpl(length, 31);
5764       __ jcc(Assembler::above, L_32byteLoop);
5765 
5766       __ BIND(L_process3);
5767       __ vzeroupper();
5768     } else {
5769       __ BIND(L_process3);
5770     }
5771 
5772     __ cmpl(length, 3);
5773     __ jcc(Assembler::below, L_exit);
5774 
5775     // Load the encoding table based on isURL
5776     __ lea(r11, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5777     __ movl(r15, isURL);
5778     __ shll(r15, 6);
5779     __ addptr(r11, r15);
5780 
5781     __ BIND(L_processdata);
5782 
5783     // Load 3 bytes
5784     __ load_unsigned_byte(r15, Address(source, start_offset));
5785     __ load_unsigned_byte(r10, Address(source, start_offset, Address::times_1, 1));
5786     __ load_unsigned_byte(r13, Address(source, start_offset, Address::times_1, 2));
5787 
5788     // Build a 32-bit word with bytes 1, 2, 0, 1
5789     __ movl(rax, r10);
5790     __ shll(r10, 24);
5791     __ orl(rax, r10);
5792 
5793     __ subl(length, 3);
5794 
5795     __ shll(r15, 8);
5796     __ shll(r13, 16);
5797     __ orl(rax, r15);
5798 
5799     __ addl(start_offset, 3);
5800 
5801     __ orl(rax, r13);
5802     // At this point, rax contains | byte1 | byte2 | byte0 | byte1
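         // In this combined layout the four 6-bit indices are each a single
         // shift and mask away: bits 10..15 index the first output
         // character, bits 4..9 the second, bits 22..27 the third and
         // bits 16..21 the fourth (the first and fourth are taken from the
         // equivalent fields still held in r15 and r13).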
5803     // r13 has byte2 << 16 - need low-order 6 bits to translate.
5804     // This translated byte is the fourth output byte.
5805     __ shrl(r13, 16);
5806     __ andl(r13, 0x3f);
5807 
5808     // The high-order 6 bits of byte0 (held in r15) are translated.
5809     // The translated byte is the first output byte.
5810     __ shrl(r15, 10);
5811 
5812     __ load_unsigned_byte(r13, Address(r11, r13));
5813     __ load_unsigned_byte(r15, Address(r11, r15));
5814 
5815     __ movb(Address(dest, dp, Address::times_1, 3), r13);
5816 
5817     // Extract high-order 4 bits of byte1 and low-order 2 bits of byte0.
5818     // This translated byte is the second output byte.
5819     __ shrl(rax, 4);
5820     __ movl(r10, rax);
5821     __ andl(rax, 0x3f);
5822 
5823     __ movb(Address(dest, dp, Address::times_1, 0), r15);
5824 
5825     __ load_unsigned_byte(rax, Address(r11, rax));
5826 
5827     // Extract low-order 4 bits of byte1 and high-order 2 bits of byte2.
5828     // This translated byte is the third output byte.
5829     __ shrl(r10, 18);
5830     __ andl(r10, 0x3f);
5831 
5832     __ load_unsigned_byte(r10, Address(r11, r10));
5833 
5834     __ movb(Address(dest, dp, Address::times_1, 1), rax);
5835     __ movb(Address(dest, dp, Address::times_1, 2), r10);
5836 
5837     __ addl(dp, 4);
5838     __ cmpl(length, 3);
5839     __ jcc(Assembler::aboveEqual, L_processdata);
5840 
5841     __ BIND(L_exit);
5842     __ pop(r15);
5843     __ pop(r14);
5844     __ pop(r13);
5845     __ pop(r12);
5846     __ leave();
5847     __ ret(0);
5848     return start;
5849   }
5850 
5851   // base64 AVX512vbmi tables
5852   address base64_vbmi_lookup_lo_addr() {
5853     __ align64();
5854     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64");
5855     address start = __ pc();
5856     assert(((unsigned long long)start & 0x3f) == 0,
5857            "Alignment problem (0x%08llx)", (unsigned long long)start);
5858     __ emit_data64(0x8080808080808080, relocInfo::none);
5859     __ emit_data64(0x8080808080808080, relocInfo::none);
5860     __ emit_data64(0x8080808080808080, relocInfo::none);
5861     __ emit_data64(0x8080808080808080, relocInfo::none);
5862     __ emit_data64(0x8080808080808080, relocInfo::none);
5863     __ emit_data64(0x3f8080803e808080, relocInfo::none);
5864     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5865     __ emit_data64(0x8080808080803d3c, relocInfo::none);
5866     return start;
5867   }
5868 
5869   address base64_vbmi_lookup_hi_addr() {
5870     __ align64();
5871     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64");
5872     address start = __ pc();
5873     assert(((unsigned long long)start & 0x3f) == 0,
5874            "Alignment problem (0x%08llx)", (unsigned long long)start);
5875     __ emit_data64(0x0605040302010080, relocInfo::none);
5876     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5877     __ emit_data64(0x161514131211100f, relocInfo::none);
5878     __ emit_data64(0x8080808080191817, relocInfo::none);
5879     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5880     __ emit_data64(0x2827262524232221, relocInfo::none);
5881     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5882     __ emit_data64(0x8080808080333231, relocInfo::none);
5883     return start;
5884   }
5885   address base64_vbmi_lookup_lo_url_addr() {
5886     __ align64();
5887     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64url");
5888     address start = __ pc();
5889     assert(((unsigned long long)start & 0x3f) == 0,
5890            "Alignment problem (0x%08llx)", (unsigned long long)start);
5891     __ emit_data64(0x8080808080808080, relocInfo::none);
5892     __ emit_data64(0x8080808080808080, relocInfo::none);
5893     __ emit_data64(0x8080808080808080, relocInfo::none);
5894     __ emit_data64(0x8080808080808080, relocInfo::none);
5895     __ emit_data64(0x8080808080808080, relocInfo::none);
5896     __ emit_data64(0x80803e8080808080, relocInfo::none);
5897     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5898     __ emit_data64(0x8080808080803d3c, relocInfo::none);
5899     return start;
5900   }
5901 
5902   address base64_vbmi_lookup_hi_url_addr() {
5903     __ align64();
5904     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64url");
5905     address start = __ pc();
5906     assert(((unsigned long long)start & 0x3f) == 0,
5907            "Alignment problem (0x%08llx)", (unsigned long long)start);
5908     __ emit_data64(0x0605040302010080, relocInfo::none);
5909     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5910     __ emit_data64(0x161514131211100f, relocInfo::none);
5911     __ emit_data64(0x3f80808080191817, relocInfo::none);
5912     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5913     __ emit_data64(0x2827262524232221, relocInfo::none);
5914     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5915     __ emit_data64(0x8080808080333231, relocInfo::none);
5916     return start;
5917   }
5918 
5919   address base64_vbmi_pack_vec_addr() {
5920     __ align64();
5921     StubCodeMark mark(this, "StubRoutines", "pack_vec_base64");
5922     address start = __ pc();
5923     assert(((unsigned long long)start & 0x3f) == 0,
5924            "Alignment problem (0x%08llx)", (unsigned long long)start);
5925     __ emit_data64(0x090a040506000102, relocInfo::none);
5926     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
5927     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5928     __ emit_data64(0x292a242526202122, relocInfo::none);
5929     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5930     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5931     __ emit_data64(0x0000000000000000, relocInfo::none);
5932     __ emit_data64(0x0000000000000000, relocInfo::none);
5933     return start;
5934   }
5935 
5936   address base64_vbmi_join_0_1_addr() {
5937     __ align64();
5938     StubCodeMark mark(this, "StubRoutines", "join_0_1_base64");
5939     address start = __ pc();
5940     assert(((unsigned long long)start & 0x3f) == 0,
5941            "Alignment problem (0x%08llx)", (unsigned long long)start);
5942     __ emit_data64(0x090a040506000102, relocInfo::none);
5943     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
5944     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5945     __ emit_data64(0x292a242526202122, relocInfo::none);
5946     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5947     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5948     __ emit_data64(0x494a444546404142, relocInfo::none);
5949     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
5950     return start;
5951   }
5952 
5953   address base64_vbmi_join_1_2_addr() {
5954     __ align64();
5955     StubCodeMark mark(this, "StubRoutines", "join_1_2_base64");
5956     address start = __ pc();
5957     assert(((unsigned long long)start & 0x3f) == 0,
5958            "Alignment problem (0x%08llx)", (unsigned long long)start);
5959     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5960     __ emit_data64(0x292a242526202122, relocInfo::none);
5961     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5962     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5963     __ emit_data64(0x494a444546404142, relocInfo::none);
5964     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
5965     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
5966     __ emit_data64(0x696a646566606162, relocInfo::none);
5967     return start;
5968   }
5969 
5970   address base64_vbmi_join_2_3_addr() {
5971     __ align64();
5972     StubCodeMark mark(this, "StubRoutines", "join_2_3_base64");
5973     address start = __ pc();
5974     assert(((unsigned long long)start & 0x3f) == 0,
5975            "Alignment problem (0x%08llx)", (unsigned long long)start);
5976     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5977     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5978     __ emit_data64(0x494a444546404142, relocInfo::none);
5979     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
5980     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
5981     __ emit_data64(0x696a646566606162, relocInfo::none);
5982     __ emit_data64(0x767071726c6d6e68, relocInfo::none);
5983     __ emit_data64(0x7c7d7e78797a7475, relocInfo::none);
5984     return start;
5985   }
5986 
5987   address base64_decoding_table_addr() {
5988     StubCodeMark mark(this, "StubRoutines", "decoding_table_base64");
5989     address start = __ pc();
5990     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5991     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5992     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5993     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5994     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5995     __ emit_data64(0x3fffffff3effffff, relocInfo::none);
5996     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5997     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
5998     __ emit_data64(0x06050403020100ff, relocInfo::none);
5999     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6000     __ emit_data64(0x161514131211100f, relocInfo::none);
6001     __ emit_data64(0xffffffffff191817, relocInfo::none);
6002     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
6003     __ emit_data64(0x2827262524232221, relocInfo::none);
6004     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6005     __ emit_data64(0xffffffffff333231, relocInfo::none);
6006     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6007     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6008     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6009     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6010     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6011     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6012     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6013     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6014     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6015     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6016     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6017     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6018     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6019     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6020     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6021     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6022 
6023     // URL table
6024     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6025     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6026     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6027     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6028     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6029     __ emit_data64(0xffff3effffffffff, relocInfo::none);
6030     __ emit_data64(0x3b3a393837363534, relocInfo::none);
6031     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
6032     __ emit_data64(0x06050403020100ff, relocInfo::none);
6033     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6034     __ emit_data64(0x161514131211100f, relocInfo::none);
6035     __ emit_data64(0x3fffffffff191817, relocInfo::none);
6036     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
6037     __ emit_data64(0x2827262524232221, relocInfo::none);
6038     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6039     __ emit_data64(0xffffffffff333231, relocInfo::none);
6040     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6041     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6042     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6043     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6044     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6045     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6046     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6047     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6048     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6049     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6050     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6051     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6052     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6053     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6054     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6055     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6056     return start;
6057   }
6058 
6059 
6060 // Code for generating Base64 decoding.
6061 //
6062 // Based on the article (and associated code) from https://arxiv.org/abs/1910.05109.
6063 //
6064 // Intrinsic function prototype in Base64.java:
6065 // private void decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, isMIME) {
6066   address generate_base64_decodeBlock() {
6067     __ align(CodeEntryAlignment);
6068     StubCodeMark mark(this, "StubRoutines", "implDecode");
6069     address start = __ pc();
6070     __ enter();
6071 
6072     // Save callee-saved registers before using them
6073     __ push(r12);
6074     __ push(r13);
6075     __ push(r14);
6076     __ push(r15);
6077     __ push(rbx);
6078 
6079     // arguments
6080     const Register source = c_rarg0; // Source Array
6081     const Register start_offset = c_rarg1; // start offset
6082     const Register end_offset = c_rarg2; // end offset
6083     const Register dest = c_rarg3; // destination array
6084     const Register isMIME = rbx;
6085 
6086 #ifndef _WIN64
6087     const Register dp = c_rarg4;  // Position for writing to dest array
6088     const Register isURL = c_rarg5;// Base64 or URL character set
6089     __ movl(isMIME, Address(rbp, 2 * wordSize));
6090 #else
6091     const Address  dp_mem(rbp, 6 * wordSize);  // dp is passed on the stack on Win64
6092     const Address isURL_mem(rbp, 7 * wordSize);
6093     const Register isURL = r10;      // pick the volatile windows register
6094     const Register dp = r12;
6095     __ movl(dp, dp_mem);
6096     __ movl(isURL, isURL_mem);
6097     __ movl(isMIME, Address(rbp, 8 * wordSize));
6098 #endif
6099 
6100     const XMMRegister lookup_lo = xmm5;
6101     const XMMRegister lookup_hi = xmm6;
6102     const XMMRegister errorvec = xmm7;
6103     const XMMRegister pack16_op = xmm9;
6104     const XMMRegister pack32_op = xmm8;
6105     const XMMRegister input0 = xmm3;
6106     const XMMRegister input1 = xmm20;
6107     const XMMRegister input2 = xmm21;
6108     const XMMRegister input3 = xmm19;
6109     const XMMRegister join01 = xmm12;
6110     const XMMRegister join12 = xmm11;
6111     const XMMRegister join23 = xmm10;
6112     const XMMRegister translated0 = xmm2;
6113     const XMMRegister translated1 = xmm1;
6114     const XMMRegister translated2 = xmm0;
6115     const XMMRegister translated3 = xmm4;
6116 
6117     const XMMRegister merged0 = xmm2;
6118     const XMMRegister merged1 = xmm1;
6119     const XMMRegister merged2 = xmm0;
6120     const XMMRegister merged3 = xmm4;
6121     const XMMRegister merge_ab_bc0 = xmm2;
6122     const XMMRegister merge_ab_bc1 = xmm1;
6123     const XMMRegister merge_ab_bc2 = xmm0;
6124     const XMMRegister merge_ab_bc3 = xmm4;
6125 
6126     const XMMRegister pack24bits = xmm4;
6127 
6128     const Register length = r14;
6129     const Register output_size = r13;
6130     const Register output_mask = r15;
6131     const KRegister input_mask = k1;
6132 
6133     const XMMRegister input_initial_valid_b64 = xmm0;
6134     const XMMRegister tmp = xmm10;
6135     const XMMRegister mask = xmm0;
6136     const XMMRegister invalid_b64 = xmm1;
6137 
6138     Label L_process256, L_process64, L_process64Loop, L_exit, L_processdata, L_loadURL;
6139     Label L_continue, L_finalBit, L_padding, L_donePadding, L_bruteForce;
6140     Label L_forceLoop, L_bottomLoop, L_checkMIME, L_exit_no_vzero;
6141 
6142     // calculate length from offsets
6143     __ movl(length, end_offset);
6144     __ subl(length, start_offset);
6145     __ push(dest);          // Save for return value calc
6146 
6147     // If AVX512 VBMI is not supported, just generate the non-AVX code
6148     if(VM_Version::supports_avx512_vbmi() &&
6149        VM_Version::supports_avx512bw()) {
6150       __ cmpl(length, 128);     // 128-bytes is break-even for AVX-512
6151       __ jcc(Assembler::lessEqual, L_bruteForce);
6152 
6153       __ cmpl(isMIME, 0);
6154       __ jcc(Assembler::notEqual, L_bruteForce);
6155 
6156       // Load lookup tables based on isURL
6157       __ cmpl(isURL, 0);
6158       __ jcc(Assembler::notZero, L_loadURL);
6159 
6160       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_addr()), Assembler::AVX_512bit, r13);
6161       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_addr()), Assembler::AVX_512bit, r13);
6162 
6163       __ BIND(L_continue);
6164 
6165       __ movl(r15, 0x01400140);
6166       __ evpbroadcastd(pack16_op, r15, Assembler::AVX_512bit);
6167 
6168       __ movl(r15, 0x00011000);
6169       __ evpbroadcastd(pack32_op, r15, Assembler::AVX_512bit);
6170 
6171       __ cmpl(length, 0xff);
6172       __ jcc(Assembler::lessEqual, L_process64);
6173 
6174       // load masks required for decoding data
6175       __ BIND(L_processdata);
6176       __ evmovdquq(join01, ExternalAddress(StubRoutines::x86::base64_vbmi_join_0_1_addr()), Assembler::AVX_512bit,r13);
6177       __ evmovdquq(join12, ExternalAddress(StubRoutines::x86::base64_vbmi_join_1_2_addr()), Assembler::AVX_512bit, r13);
6178       __ evmovdquq(join23, ExternalAddress(StubRoutines::x86::base64_vbmi_join_2_3_addr()), Assembler::AVX_512bit, r13);
6179 
6180       __ align32();
6181       __ BIND(L_process256);
6182       // Grab input data
6183       __ evmovdquq(input0, Address(source, start_offset, Address::times_1, 0x00), Assembler::AVX_512bit);
6184       __ evmovdquq(input1, Address(source, start_offset, Address::times_1, 0x40), Assembler::AVX_512bit);
6185       __ evmovdquq(input2, Address(source, start_offset, Address::times_1, 0x80), Assembler::AVX_512bit);
6186       __ evmovdquq(input3, Address(source, start_offset, Address::times_1, 0xc0), Assembler::AVX_512bit);
6187 
6188       // Copy the low part of the lookup table into the destination of the permutation
6189       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6190       __ evmovdquq(translated1, lookup_lo, Assembler::AVX_512bit);
6191       __ evmovdquq(translated2, lookup_lo, Assembler::AVX_512bit);
6192       __ evmovdquq(translated3, lookup_lo, Assembler::AVX_512bit);
6193 
6194       // Translate the base64 input into "decoded" bytes
6195       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6196       __ evpermt2b(translated1, input1, lookup_hi, Assembler::AVX_512bit);
6197       __ evpermt2b(translated2, input2, lookup_hi, Assembler::AVX_512bit);
6198       __ evpermt2b(translated3, input3, lookup_hi, Assembler::AVX_512bit);
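           // Note: evpermt2b treats {lookup_lo, lookup_hi} as one 128-byte
           // table indexed by the low 7 bits of each input character, e.g.
           // 'A' (0x41) selects lookup_hi[1] = 0 and 'B' selects 1, while
           // characters outside the alphabet map to 0x80 (high bit set),
           // which is what the error check below detects.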
6199 
6200       // OR all of the translations together to check for errors (high-order bit of byte set)
6201       __ vpternlogd(input0, 0xfe, input1, input2, Assembler::AVX_512bit);
6202 
6203       __ vpternlogd(input3, 0xfe, translated0, translated1, Assembler::AVX_512bit);
6204       __ vpternlogd(input0, 0xfe, translated2, translated3, Assembler::AVX_512bit);
6205       __ vpor(errorvec, input3, input0, Assembler::AVX_512bit);
6206 
6207       // Check if there was an error - if so, try 64-byte chunks
6208       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6209       __ kortestql(k3, k3);
6210       __ jcc(Assembler::notZero, L_process64);
6211 
6212       // The merging and shuffling happens here
6213       // We multiply each byte pair [00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa]
6214       // Multiply [00cccccc] by 2^6 added to [00dddddd] to get [0000cccc | ccdddddd]
6215       // The pack16_op is a vector of 0x01400140, so multiply D by 1 and C by 0x40
6216       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6217       __ vpmaddubsw(merge_ab_bc1, translated1, pack16_op, Assembler::AVX_512bit);
6218       __ vpmaddubsw(merge_ab_bc2, translated2, pack16_op, Assembler::AVX_512bit);
6219       __ vpmaddubsw(merge_ab_bc3, translated3, pack16_op, Assembler::AVX_512bit);
6220 
6221       // Now do the same with packed 16-bit values.
6222       // We start with [0000cccc | ccdddddd | 0000aaaa | aabbbbbb]
6223       // pack32_op is 0x00011000 (2^12, 1), so this multiplies [0000aaaa | aabbbbbb] by 2^12
6224       // and adds [0000cccc | ccdddddd] to yield [00000000 | aaaaaabb | bbbbcccc | ccdddddd]
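           // Worked example: for the input "TWFu" the decoded 6-bit values
           // are a=19, b=22, c=5, d=46, and the two multiply-add steps give
           // (19<<18)|(22<<12)|(5<<6)|46 = 0x004d616e, i.e. the bytes of
           // "Man" once the join step below drops the zero byte of each
           // dword and puts the remaining three bytes into output order.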
6225       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6226       __ vpmaddwd(merged1, merge_ab_bc1, pack32_op, Assembler::AVX_512bit);
6227       __ vpmaddwd(merged2, merge_ab_bc2, pack32_op, Assembler::AVX_512bit);
6228       __ vpmaddwd(merged3, merge_ab_bc3, pack32_op, Assembler::AVX_512bit);
6229 
6230       // The join vectors specify which byte from which vector goes into the outputs
6231       // One of every 4 bytes in the extended vector is zero, so we pack them into their
6232       // final positions in the register for storing (256 bytes in, 192 bytes out)
6233       __ evpermt2b(merged0, join01, merged1, Assembler::AVX_512bit);
6234       __ evpermt2b(merged1, join12, merged2, Assembler::AVX_512bit);
6235       __ evpermt2b(merged2, join23, merged3, Assembler::AVX_512bit);
6236 
6237       // Store result
6238       __ evmovdquq(Address(dest, dp, Address::times_1, 0x00), merged0, Assembler::AVX_512bit);
6239       __ evmovdquq(Address(dest, dp, Address::times_1, 0x40), merged1, Assembler::AVX_512bit);
6240       __ evmovdquq(Address(dest, dp, Address::times_1, 0x80), merged2, Assembler::AVX_512bit);
6241 
6242       __ addptr(source, 0x100);
6243       __ addptr(dest, 0xc0);
6244       __ subl(length, 0x100);
6245       __ cmpl(length, 64 * 4);
6246       __ jcc(Assembler::greaterEqual, L_process256);
6247 
6248       // At this point, we've decoded 64 * 4 * n bytes.
6249       // The remaining length will be <= 64 * 4 - 1.
6250       // UNLESS there was an error decoding the first 256-byte chunk.  In this
6251       // case, the length will be arbitrarily long.
6252       //
6253       // Note that this will be the path for MIME-encoded strings.
6254 
6255       __ BIND(L_process64);
6256 
6257       __ evmovdquq(pack24bits, ExternalAddress(StubRoutines::x86::base64_vbmi_pack_vec_addr()), Assembler::AVX_512bit, r13);
6258 
6259       __ cmpl(length, 63);
6260       __ jcc(Assembler::lessEqual, L_finalBit);
6261 
6262       __ align32();
6263       __ BIND(L_process64Loop);
6264 
6265       // Handle first 64-byte block
6266 
6267       __ evmovdquq(input0, Address(source, start_offset), Assembler::AVX_512bit);
6268       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6269       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6270 
6271       __ vpor(errorvec, translated0, input0, Assembler::AVX_512bit);
6272 
6273       // Check for error and bomb out before updating dest
6274       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6275       __ kortestql(k3, k3);
6276       __ jcc(Assembler::notZero, L_exit);
6277 
6278       // Pack output register, selecting correct byte ordering
6279       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6280       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6281       __ vpermb(merged0, pack24bits, merged0, Assembler::AVX_512bit);
6282 
6283       __ evmovdquq(Address(dest, dp), merged0, Assembler::AVX_512bit);
6284 
6285       __ subl(length, 64);
6286       __ addptr(source, 64);
6287       __ addptr(dest, 48);
6288 
6289       __ cmpl(length, 64);
6290       __ jcc(Assembler::greaterEqual, L_process64Loop);
6291 
6292       __ cmpl(length, 0);
6293       __ jcc(Assembler::lessEqual, L_exit);
6294 
6295       __ BIND(L_finalBit);
6296       // Now have 1 to 63 bytes left to decode
6297 
6298       // Letting Java take care of the final fragment would mean it
6299       // calls back into this routine for every remaining 4 bytes of
6300       // input, so handle the rest here instead.
6301       __ movq(rax, -1);
6302       __ bzhiq(rax, rax, length);    // Input mask in rax
6303 
6304       __ movl(output_size, length);
6305       __ shrl(output_size, 2);   // Find (len / 4) * 3 (output length)
6306       __ lea(output_size, Address(output_size, output_size, Address::times_2, 0));
6307       // output_size in r13
6308 
6309       // Strip pad characters, if any, and adjust length and mask
6310       __ cmpb(Address(source, length, Address::times_1, -1), '=');
6311       __ jcc(Assembler::equal, L_padding);
6312 
6313       __ BIND(L_donePadding);
6314 
6315       // output_mask gets its low output_size bits set, i.e. (all 1s) >> (64 - output_size).
6316       __ kmovql(input_mask, rax);
6317       __ movq(output_mask, -1);
6318       __ bzhiq(output_mask, output_mask, output_size);
6319 
6320       // Load initial input with all valid base64 characters.  Will be used
6321       // in merging source bytes to avoid masking when determining if an error occurred.
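           // ('a' decodes to 26 without setting the error bit, so lanes past
           // the end of the input never trip the error check; the masked
           // store below keeps them out of the output.)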
6322       __ movl(rax, 0x61616161);
6323       __ evpbroadcastd(input_initial_valid_b64, rax, Assembler::AVX_512bit);
6324 
6325       // A register containing all invalid base64 decoded values
6326       __ movl(rax, 0x80808080);
6327       __ evpbroadcastd(invalid_b64, rax, Assembler::AVX_512bit);
6328 
6329       // input_mask is in k1
6330       // output_size is in r13
6331       // output_mask is in r15
6332       // zmm0 - free
6333       // zmm1 - 0x00011000
6334       // zmm2 - 0x01400140
6335       // zmm3 - errorvec
6336       // zmm4 - pack vector
6337       // zmm5 - lookup_lo
6338       // zmm6 - lookup_hi
6339       // zmm7 - errorvec
6340       // zmm8 - 0x61616161
6341       // zmm9 - 0x80808080
6342 
6343       // Load only the bytes from source, merging into our "fully-valid" register
6344       __ evmovdqub(input_initial_valid_b64, input_mask, Address(source, start_offset, Address::times_1, 0x0), true, Assembler::AVX_512bit);
6345 
6346       // Decode all bytes within our merged input
6347       __ evmovdquq(tmp, lookup_lo, Assembler::AVX_512bit);
6348       __ evpermt2b(tmp, input_initial_valid_b64, lookup_hi, Assembler::AVX_512bit);
6349       __ vporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);
6350 
6351       // Check for error.  Compare (decoded | initial) to all invalid.
6352       // If any bytes have their high-order bit set, then we have an error.
6353       __ evptestmb(k2, mask, invalid_b64, Assembler::AVX_512bit);
6354       __ kortestql(k2, k2);
6355 
6356       // If we have an error, use the brute force loop to decode what we can (4-byte chunks).
6357       __ jcc(Assembler::notZero, L_bruteForce);
6358 
6359       // Shuffle output bytes
6360       __ vpmaddubsw(tmp, tmp, pack16_op, Assembler::AVX_512bit);
6361       __ vpmaddwd(tmp, tmp, pack32_op, Assembler::AVX_512bit);
6362 
6363       __ vpermb(tmp, pack24bits, tmp, Assembler::AVX_512bit);
6364       __ kmovql(k1, output_mask);
6365       __ evmovdqub(Address(dest, dp), k1, tmp, true, Assembler::AVX_512bit);
6366 
6367       __ addptr(dest, output_size);
6368 
6369       __ BIND(L_exit);
6370       __ vzeroupper();
6371       __ pop(rax);             // Get original dest value
6372       __ subptr(dest, rax);      // Number of bytes converted
6373       __ movptr(rax, dest);
6374       __ pop(rbx);
6375       __ pop(r15);
6376       __ pop(r14);
6377       __ pop(r13);
6378       __ pop(r12);
6379       __ leave();
6380       __ ret(0);
6381 
6382       __ BIND(L_loadURL);
6383       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_url_addr()), Assembler::AVX_512bit, r13);
6384       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_url_addr()), Assembler::AVX_512bit, r13);
6385       __ jmp(L_continue);
6386 
6387       __ BIND(L_padding);
6388       __ decrementq(output_size, 1);
6389       __ shrq(rax, 1);
6390 
6391       __ cmpb(Address(source, length, Address::times_1, -2), '=');
6392       __ jcc(Assembler::notEqual, L_donePadding);
6393 
6394       __ decrementq(output_size, 1);
6395       __ shrq(rax, 1);
6396       __ jmp(L_donePadding);
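           // Each '=' pad character drops one byte from the expected output
           // and clears the top bit of the input mask, so the pad byte
           // itself is never loaded or decoded.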
6397 
6398       __ align32();
6399       __ BIND(L_bruteForce);
6400     }   // End of if(avx512_vbmi)
6401 
6402     // Use non-AVX code to decode 4-byte chunks into 3 bytes of output
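         // The decoding table holds two 256-byte maps, standard base64
         // followed by base64url, which is why isURL is scaled by 256 below.
         // Valid characters map to their 6-bit value and everything else to
         // 0xff; the entries are loaded sign-extended, so OR-ing the four
         // looked-up values and testing the sign flags any invalid character
         // in a chunk.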
6403 
6404     // Register state (Linux):
6405     // r12-15 - saved on stack
6406     // rdi - src
6407     // rsi - sp
6408     // rdx - sl
6409     // rcx - dst
6410     // r8 - dp
6411     // r9 - isURL
6412 
6413     // Register state (Windows):
6414     // r12-15 - saved on stack
6415     // rcx - src
6416     // rdx - sp
6417     // r8 - sl
6418     // r9 - dst
6419     // r12 - dp
6420     // r10 - isURL
6421 
6422     // Registers (common):
6423     // length (r14) - bytes in src
6424 
6425     const Register decode_table = r11;
6426     const Register out_byte_count = rbx;
6427     const Register byte1 = r13;
6428     const Register byte2 = r15;
6429     const Register byte3 = WINDOWS_ONLY(r8) NOT_WINDOWS(rdx);
6430     const Register byte4 = WINDOWS_ONLY(r10) NOT_WINDOWS(r9);
6431 
6432     __ shrl(length, 2);    // Multiple of 4 bytes only - length is # 4-byte chunks
6433     __ cmpl(length, 0);
6434     __ jcc(Assembler::lessEqual, L_exit_no_vzero);
6435 
6436     __ shll(isURL, 8);    // index into decode table based on isURL
6437     __ lea(decode_table, ExternalAddress(StubRoutines::x86::base64_decoding_table_addr()));
6438     __ addptr(decode_table, isURL);
6439 
6440     __ jmp(L_bottomLoop);
6441 
6442     __ align32();
6443     __ BIND(L_forceLoop);
6444     __ shll(byte1, 18);
6445     __ shll(byte2, 12);
6446     __ shll(byte3, 6);
6447     __ orl(byte1, byte2);
6448     __ orl(byte1, byte3);
6449     __ orl(byte1, byte4);
6450 
6451     __ addptr(source, 4);
6452 
6453     __ movb(Address(dest, dp, Address::times_1, 2), byte1);
6454     __ shrl(byte1, 8);
6455     __ movb(Address(dest, dp, Address::times_1, 1), byte1);
6456     __ shrl(byte1, 8);
6457     __ movb(Address(dest, dp, Address::times_1, 0), byte1);
6458 
6459     __ addptr(dest, 3);
6460     __ decrementl(length, 1);
6461     __ jcc(Assembler::zero, L_exit_no_vzero);
6462 
6463     __ BIND(L_bottomLoop);
6464     __ load_unsigned_byte(byte1, Address(source, start_offset, Address::times_1, 0x00));
6465     __ load_unsigned_byte(byte2, Address(source, start_offset, Address::times_1, 0x01));
6466     __ load_signed_byte(byte1, Address(decode_table, byte1));
6467     __ load_signed_byte(byte2, Address(decode_table, byte2));
6468     __ load_unsigned_byte(byte3, Address(source, start_offset, Address::times_1, 0x02));
6469     __ load_unsigned_byte(byte4, Address(source, start_offset, Address::times_1, 0x03));
6470     __ load_signed_byte(byte3, Address(decode_table, byte3));
6471     __ load_signed_byte(byte4, Address(decode_table, byte4));
6472 
6473     __ mov(rax, byte1);
6474     __ orl(rax, byte2);
6475     __ orl(rax, byte3);
6476     __ orl(rax, byte4);
6477     __ jcc(Assembler::positive, L_forceLoop);
6478 
6479     __ BIND(L_exit_no_vzero);
6480     __ pop(rax);             // Get original dest value
6481     __ subptr(dest, rax);      // Number of bytes converted
6482     __ movptr(rax, dest);
6483     __ pop(rbx);
6484     __ pop(r15);
6485     __ pop(r14);
6486     __ pop(r13);
6487     __ pop(r12);
6488     __ leave();
6489     __ ret(0);
6490 
6491     return start;
6492   }
6493 
6494 
6495   /**
6496    *  Arguments:
6497    *
6498    * Inputs:
6499    *   c_rarg0   - int crc
6500    *   c_rarg1   - byte* buf
6501    *   c_rarg2   - int length
6502    *
6503    * Output:
6504    *       rax   - int crc result
6505    */
6506   address generate_updateBytesCRC32() {
6507     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
6508 
6509     __ align(CodeEntryAlignment);
6510     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
6511 
6512     address start = __ pc();
6513     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6514     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6515     // rscratch1: r10
6516     const Register crc   = c_rarg0;  // crc
6517     const Register buf   = c_rarg1;  // source java byte array address
6518     const Register len   = c_rarg2;  // length
6519     const Register table = c_rarg3;  // crc_table address (reuse register)
6520     const Register tmp1   = r11;
6521     const Register tmp2   = r10;
6522     assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax);
6523 
6524     BLOCK_COMMENT("Entry:");
6525     __ enter(); // required for proper stackwalking of RuntimeStub frame
6526 
6527     if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6528         VM_Version::supports_avx512bw() &&
6529         VM_Version::supports_avx512vl()) {
6530       __ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2);
6531     } else {
6532       __ kernel_crc32(crc, buf, len, table, tmp1);
6533     }
6534 
6535     __ movl(rax, crc);
6536     __ vzeroupper();
6537     __ leave(); // required for proper stackwalking of RuntimeStub frame
6538     __ ret(0);
6539 
6540     return start;
6541   }
6542 
6543   /**
6544   *  Arguments:
6545   *
6546   * Inputs:
6547   *   c_rarg0   - int crc
6548   *   c_rarg1   - byte* buf
6549   *   c_rarg2   - long length
6550   *   c_rarg3   - table_start - optional (present only when doing a library_call,
6551   *              not used by x86 algorithm)
6552   *
6553   * Output:
6554   *       rax   - int crc result
6555   */
6556   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
6557       assert(UseCRC32CIntrinsics, "need SSE4_2");
6558       __ align(CodeEntryAlignment);
6559       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
6560       address start = __ pc();
6561       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
6562       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
6563       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
6564       const Register crc = c_rarg0;  // crc
6565       const Register buf = c_rarg1;  // source java byte array address
6566       const Register len = c_rarg2;  // length
6567       const Register a = rax;
6568       const Register j = r9;
6569       const Register k = r10;
6570       const Register l = r11;
6571 #ifdef _WIN64
6572       const Register y = rdi;
6573       const Register z = rsi;
6574 #else
6575       const Register y = rcx;
6576       const Register z = r8;
6577 #endif
6578       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
6579 
6580       BLOCK_COMMENT("Entry:");
6581       __ enter(); // required for proper stackwalking of RuntimeStub frame
6582 #ifdef _WIN64
6583       __ push(y);
6584       __ push(z);
6585 #endif
6586       __ crc32c_ipl_alg2_alt2(crc, buf, len,
6587                               a, j, k,
6588                               l, y, z,
6589                               c_farg0, c_farg1, c_farg2,
6590                               is_pclmulqdq_supported);
6591       __ movl(rax, crc);
6592 #ifdef _WIN64
6593       __ pop(z);
6594       __ pop(y);
6595 #endif
6596       __ vzeroupper();
6597       __ leave(); // required for proper stackwalking of RuntimeStub frame
6598       __ ret(0);
6599 
6600       return start;
6601   }
6602 
6603 
6604   /***
6605    *  Arguments:
6606    *
6607    *  Inputs:
6608    *   c_rarg0   - int   adler
6609    *   c_rarg1   - byte* buff
6610    *   c_rarg2   - int   len
6611    *
6612    * Output:
6613    *   rax   - int adler result
6614    */
6615 
6616   address generate_updateBytesAdler32() {
6617       assert(UseAdler32Intrinsics, "need AVX2");
6618 
6619       __ align(CodeEntryAlignment);
6620       StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
6621 
6622       address start = __ pc();
6623 
6624       const Register data = r9;
6625       const Register size = r10;
6626 
6627       const XMMRegister yshuf0 = xmm6;
6628       const XMMRegister yshuf1 = xmm7;
6629       assert_different_registers(c_rarg0, c_rarg1, c_rarg2, data, size);
6630 
6631       BLOCK_COMMENT("Entry:");
6632       __ enter(); // required for proper stackwalking of RuntimeStub frame
6633 
6634       __ vmovdqu(yshuf0, ExternalAddress((address) StubRoutines::x86::_adler32_shuf0_table), r9);
6635       __ vmovdqu(yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_shuf1_table), r9);
6636       __ movptr(data, c_rarg1); //data
6637       __ movl(size, c_rarg2); //length
6638       __ updateBytesAdler32(c_rarg0, data, size, yshuf0, yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_ascale_table));
6639       __ leave();
6640       __ ret(0);
6641       return start;
6642   }
6643 
6644   /**
6645    *  Arguments:
6646    *
6647    *  Input:
6648    *    c_rarg0   - x address
6649    *    c_rarg1   - x length
6650    *    c_rarg2   - y address
6651    *    c_rarg3   - y length
6652    * not Win64
6653    *    c_rarg4   - z address
6654    *    c_rarg5   - z length
6655    * Win64
6656    *    rsp+40    - z address
6657    *    rsp+48    - z length
6658    */
6659   address generate_multiplyToLen() {
6660     __ align(CodeEntryAlignment);
6661     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
6662 
6663     address start = __ pc();
6664     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6665     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6666     const Register x     = rdi;
6667     const Register xlen  = rax;
6668     const Register y     = rsi;
6669     const Register ylen  = rcx;
6670     const Register z     = r8;
6671     const Register zlen  = r11;
6672 
6673     // Next registers will be saved on stack in multiply_to_len().
6674     const Register tmp1  = r12;
6675     const Register tmp2  = r13;
6676     const Register tmp3  = r14;
6677     const Register tmp4  = r15;
6678     const Register tmp5  = rbx;
6679 
6680     BLOCK_COMMENT("Entry:");
6681     __ enter(); // required for proper stackwalking of RuntimeStub frame
6682 
6683 #ifndef _WIN64
6684     __ movptr(zlen, r9); // Save r9 in r11 - zlen
6685 #endif
6686     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
6687                        // ylen => rcx, z => r8, zlen => r11
6688                        // r9 and r10 may be used to save non-volatile registers
6689 #ifdef _WIN64
6690     // last 2 arguments (#4, #5) are on stack on Win64
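         // 6 * wordSize skips the return address, the rbp saved by enter(),
         // and the 32-byte register-parameter shadow area of the Win64 ABI.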
6691     __ movptr(z, Address(rsp, 6 * wordSize));
6692     __ movptr(zlen, Address(rsp, 7 * wordSize));
6693 #endif
6694 
6695     __ movptr(xlen, rsi);
6696     __ movptr(y,    rdx);
6697     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
6698 
6699     restore_arg_regs();
6700 
6701     __ leave(); // required for proper stackwalking of RuntimeStub frame
6702     __ ret(0);
6703 
6704     return start;
6705   }
6706 
6707   /**
6708   *  Arguments:
6709   *
6710   *  Input:
6711   *    c_rarg0   - obja     address
6712   *    c_rarg1   - objb     address
6713   *    c_rarg2   - length   length of the region to compare
6714   *    c_rarg3   - scale    log2 of the array index scale
6715   *
6716   *  Output:
6717   *        rax   - int; >= 0: index of the first mismatch, < 0: bitwise complement of the tail length
6718   */
6719   address generate_vectorizedMismatch() {
6720     __ align(CodeEntryAlignment);
6721     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
6722     address start = __ pc();
6723 
6724     BLOCK_COMMENT("Entry:");
6725     __ enter();
6726 
6727 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6728     const Register scale = c_rarg0;  //rcx, will exchange with r9
6729     const Register objb = c_rarg1;   //rdx
6730     const Register length = c_rarg2; //r8
6731     const Register obja = c_rarg3;   //r9
6732     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
6733 
6734     const Register tmp1 = r10;
6735     const Register tmp2 = r11;
6736 #endif
6737 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6738     const Register obja = c_rarg0;   //U:rdi
6739     const Register objb = c_rarg1;   //U:rsi
6740     const Register length = c_rarg2; //U:rdx
6741     const Register scale = c_rarg3;  //U:rcx
6742     const Register tmp1 = r8;
6743     const Register tmp2 = r9;
6744 #endif
6745     const Register result = rax; //return value
6746     const XMMRegister vec0 = xmm0;
6747     const XMMRegister vec1 = xmm1;
6748     const XMMRegister vec2 = xmm2;
6749 
6750     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
6751 
6752     __ vzeroupper();
6753     __ leave();
6754     __ ret(0);
6755 
6756     return start;
6757   }
6758 
6759   /**
6760    *  Arguments:
6761    *
6762    *  Input:
6763    *    c_rarg0   - x address
6764    *    c_rarg1   - x length
6765    *    c_rarg2   - z address
6766    *    c_rarg3   - z length
6767    *
6768    */
6769   address generate_squareToLen() {
6770 
6771     __ align(CodeEntryAlignment);
6772     StubCodeMark mark(this, "StubRoutines", "squareToLen");
6773 
6774     address start = __ pc();
6775     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6776     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
6777     const Register x      = rdi;
6778     const Register len    = rsi;
6779     const Register z      = r8;
6780     const Register zlen   = rcx;
6781 
6782     const Register tmp1      = r12;
6783     const Register tmp2      = r13;
6784     const Register tmp3      = r14;
6785     const Register tmp4      = r15;
6786     const Register tmp5      = rbx;
6787 
6788     BLOCK_COMMENT("Entry:");
6789     __ enter(); // required for proper stackwalking of RuntimeStub frame
6790 
6791     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
6792                        // zlen => rcx
6793                        // r9 and r10 may be used to save non-volatile registers
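         // setup_arg_regs left z (the third argument) in rdx; move it into r8,
         // which is the register this stub uses for z.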
6794     __ movptr(r8, rdx);
6795     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
6796 
6797     restore_arg_regs();
6798 
6799     __ leave(); // required for proper stackwalking of RuntimeStub frame
6800     __ ret(0);
6801 
6802     return start;
6803   }
6804 
6805   address generate_method_entry_barrier() {
6806     __ align(CodeEntryAlignment);
6807     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
6808 
6809     Label deoptimize_label;
6810 
6811     address start = __ pc();
6812 
6813     __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
6814 
6815     BLOCK_COMMENT("Entry:");
6816     __ enter(); // save rbp
6817 
6818     // save c_rarg0, because we want to use that value.
6819     // We could do without it but then we depend on the number of slots used by pusha
6820     __ push(c_rarg0);
6821 
6822     __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
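         // Stack layout at this point (from low to high address):
         //   [rsp + 0 * wordSize]  saved c_rarg0
         //   [rsp + 1 * wordSize]  rbp saved by enter()
         //   [rsp + 2 * wordSize]  cookie (-1), overwritten with the new rsp when deoptimizing
         //   [rsp + 3 * wordSize]  return address into the nmethod being checked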
6823 
6824     __ pusha();
6825 
6826     // The method may have floats as arguments, and we must spill them before calling
6827     // the VM runtime.
6828     assert(Argument::n_float_register_parameters_j == 8, "Assumption");
6829     const int xmm_size = wordSize * 2;
6830     const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
6831     __ subptr(rsp, xmm_spill_size);
6832     __ movdqu(Address(rsp, xmm_size * 7), xmm7);
6833     __ movdqu(Address(rsp, xmm_size * 6), xmm6);
6834     __ movdqu(Address(rsp, xmm_size * 5), xmm5);
6835     __ movdqu(Address(rsp, xmm_size * 4), xmm4);
6836     __ movdqu(Address(rsp, xmm_size * 3), xmm3);
6837     __ movdqu(Address(rsp, xmm_size * 2), xmm2);
6838     __ movdqu(Address(rsp, xmm_size * 1), xmm1);
6839     __ movdqu(Address(rsp, xmm_size * 0), xmm0);
6840 
6841     __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);
6842 
6843     __ movdqu(xmm0, Address(rsp, xmm_size * 0));
6844     __ movdqu(xmm1, Address(rsp, xmm_size * 1));
6845     __ movdqu(xmm2, Address(rsp, xmm_size * 2));
6846     __ movdqu(xmm3, Address(rsp, xmm_size * 3));
6847     __ movdqu(xmm4, Address(rsp, xmm_size * 4));
6848     __ movdqu(xmm5, Address(rsp, xmm_size * 5));
6849     __ movdqu(xmm6, Address(rsp, xmm_size * 6));
6850     __ movdqu(xmm7, Address(rsp, xmm_size * 7));
6851     __ addptr(rsp, xmm_spill_size);
6852 
6853     __ cmpl(rax, 1); // 1 means deoptimize
6854     __ jcc(Assembler::equal, deoptimize_label);
6855 
6856     __ popa();
6857     __ pop(c_rarg0);
6858 
6859     __ leave();
6860 
6861     __ addptr(rsp, 1 * wordSize); // cookie
6862     __ ret(0);
6863 
6864 
6865     __ BIND(deoptimize_label);
6866 
6867     __ popa();
6868     __ pop(c_rarg0);
6869 
6870     __ leave();
6871 
6872     // This check could be removed, but it is useful for verification purposes:
6873     // getting a SIGSEGV here while the stack is still correct is valuable.
6874     __ testptr(rsp, Address(rsp, 0));
6875 
6876     __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
6877     __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point
6878 
6879     return start;
6880   }
6881 
6882   /**
6883    *  Arguments:
6884    *
6885    *  Input:
6886    *    c_rarg0   - out address
6887    *    c_rarg1   - in address
6888    *    c_rarg2   - offset
6889    *    c_rarg3   - len
6890    * not Win64
6891    *    c_rarg4   - k
6892    * Win64
6893    *    rsp+40    - k
6894    */
6895   address generate_mulAdd() {
6896     __ align(CodeEntryAlignment);
6897     StubCodeMark mark(this, "StubRoutines", "mulAdd");
6898 
6899     address start = __ pc();
6900     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6901     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6902     const Register out     = rdi;
6903     const Register in      = rsi;
6904     const Register offset  = r11;
6905     const Register len     = rcx;
6906     const Register k       = r8;
6907 
6908     // Next registers will be saved on stack in mul_add().
6909     const Register tmp1  = r12;
6910     const Register tmp2  = r13;
6911     const Register tmp3  = r14;
6912     const Register tmp4  = r15;
6913     const Register tmp5  = rbx;
6914 
6915     BLOCK_COMMENT("Entry:");
6916     __ enter(); // required for proper stackwalking of RuntimeStub frame
6917 
6918     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
6919                        // len => rcx, k => r8
6920                        // r9 and r10 may be used to save non-volatile registers
6921 #ifdef _WIN64
6922     // last argument is on stack on Win64
6923     __ movl(k, Address(rsp, 6 * wordSize));
6924 #endif
6925     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
6926     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
6927 
6928     restore_arg_regs();
6929 
6930     __ leave(); // required for proper stackwalking of RuntimeStub frame
6931     __ ret(0);
6932 
6933     return start;
6934   }
6935 
6936   address generate_bigIntegerRightShift() {
6937     __ align(CodeEntryAlignment);
6938     StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
6939 
6940     address start = __ pc();
6941     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
6942     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
6943     const Register newArr = rdi;
6944     const Register oldArr = rsi;
6945     const Register newIdx = rdx;
6946     const Register shiftCount = rcx;  // shiftCount is deliberately in rcx, since rcx is used implicitly by the shift instructions.
6947     const Register totalNumIter = r8;
6948 
6949     // For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
6950     // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
6951     const Register tmp1 = r11;                    // Caller save.
6952     const Register tmp2 = rax;                    // Caller save.
6953     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
6954     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
6955     const Register tmp5 = r14;                    // Callee save.
6956     const Register tmp6 = r15;
6957 
6958     const XMMRegister x0 = xmm0;
6959     const XMMRegister x1 = xmm1;
6960     const XMMRegister x2 = xmm2;
6961 
6962     BLOCK_COMMENT("Entry:");
6963     __ enter(); // required for proper stackwalking of RuntimeStub frame
6964 
6965 #ifdef _WINDOWS
6966     setup_arg_regs(4);
6967     // On Windows the last argument is passed on the stack, so move it into the appropriate register.
6968     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
6969     // Save callee save registers.
6970     __ push(tmp3);
6971     __ push(tmp4);
6972 #endif
6973     __ push(tmp5);
6974 
6975     // Rename temps used throughout the code.
6976     const Register idx = tmp1;
6977     const Register nIdx = tmp2;
6978 
6979     __ xorl(idx, idx);
6980 
6981     // Start the right shift from the end of the array.
6982     // For example, if #iterations = 4 and newIdx = 1
6983     // then dest[4] = src[4] >> shiftCount  | src[3] << (32 - shiftCount)
6984     // if #iterations = 4 and newIdx = 0
6985     // then dest[3] = src[4] >> shiftCount  | src[3] << (32 - shiftCount)
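         // Each output limb is therefore a right funnel shift of two adjacent
         // source limbs, which is what shrdl (scalar path) and vpshrdvd
         // (AVX-512 path) compute below.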
6986     __ movl(idx, totalNumIter);
6987     __ movl(nIdx, idx);
6988     __ addl(nIdx, newIdx);
6989 
6990     // If vectorization is enabled, check whether the number of iterations is at least AVX3Threshold / 64.
6991     // If not, go to ShiftTwo, which processes 2 iterations at a time.
6992     if (VM_Version::supports_avx512_vbmi2()) {
6993       __ cmpptr(totalNumIter, (AVX3Threshold/64));
6994       __ jcc(Assembler::less, ShiftTwo);
6995 
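           // The 512-bit loop consumes 16 ints per iteration, so it also needs
           // totalNumIter >= 16; the explicit check below is only required when
           // AVX3Threshold / 64 is smaller than 16.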
6996       if (AVX3Threshold < 16 * 64) {
6997         __ cmpl(totalNumIter, 16);
6998         __ jcc(Assembler::less, ShiftTwo);
6999       }
7000       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
7001       __ subl(idx, 16);
7002       __ subl(nIdx, 16);
7003       __ BIND(Shift512Loop);
7004       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 4), Assembler::AVX_512bit);
7005       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7006       __ vpshrdvd(x2, x1, x0, Assembler::AVX_512bit);
7007       __ evmovdqul(Address(newArr, nIdx, Address::times_4), x2, Assembler::AVX_512bit);
7008       __ subl(nIdx, 16);
7009       __ subl(idx, 16);
7010       __ jcc(Assembler::greaterEqual, Shift512Loop);
7011       __ addl(idx, 16);
7012       __ addl(nIdx, 16);
7013     }
7014     __ BIND(ShiftTwo);
7015     __ cmpl(idx, 2);
7016     __ jcc(Assembler::less, ShiftOne);
7017     __ subl(idx, 2);
7018     __ subl(nIdx, 2);
7019     __ BIND(ShiftTwoLoop);
7020     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 8));
7021     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7022     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7023     __ shrdl(tmp5, tmp4);
7024     __ shrdl(tmp4, tmp3);
7025     __ movl(Address(newArr, nIdx, Address::times_4, 4), tmp5);
7026     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7027     __ subl(nIdx, 2);
7028     __ subl(idx, 2);
7029     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7030     __ addl(idx, 2);
7031     __ addl(nIdx, 2);
7032 
7033     // Do the last iteration
7034     __ BIND(ShiftOne);
7035     __ cmpl(idx, 1);
7036     __ jcc(Assembler::less, Exit);
7037     __ subl(idx, 1);
7038     __ subl(nIdx, 1);
7039     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7040     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7041     __ shrdl(tmp4, tmp3);
7042     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7043     __ BIND(Exit);
7044     // Restore callee save registers.
7045     __ pop(tmp5);
7046 #ifdef _WINDOWS
7047     __ pop(tmp4);
7048     __ pop(tmp3);
7049     restore_arg_regs();
7050 #endif
7051     __ leave(); // required for proper stackwalking of RuntimeStub frame
7052     __ ret(0);
7053     return start;
7054   }
7055 
7056   /**
7057    *  Arguments:
7058    *
7059    *  Input:
7060    *    c_rarg0   - newArr address
7061    *    c_rarg1   - oldArr address
7062    *    c_rarg2   - newIdx
7063    *    c_rarg3   - shiftCount
7064    * not Win64
7065    *    c_rarg4   - numIter
7066    * Win64
7067    *    rsp+40    - numIter
7068    */
7069   address generate_bigIntegerLeftShift() {
7070     __ align(CodeEntryAlignment);
7071     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
7072     address start = __ pc();
7073     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
7074     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
7075     const Register newArr = rdi;
7076     const Register oldArr = rsi;
7077     const Register newIdx = rdx;
7078     const Register shiftCount = rcx;  // shiftCount is deliberately in rcx, since rcx is used implicitly by the shift instructions.
7079     const Register totalNumIter = r8;
7080     // For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
7081     // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
7082     const Register tmp1 = r11;                    // Caller save.
7083     const Register tmp2 = rax;                    // Caller save.
7084     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
7085     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
7086     const Register tmp5 = r14;                    // Callee save.
7087 
7088     const XMMRegister x0 = xmm0;
7089     const XMMRegister x1 = xmm1;
7090     const XMMRegister x2 = xmm2;
7091     BLOCK_COMMENT("Entry:");
7092     __ enter(); // required for proper stackwalking of RuntimeStub frame
7093 
7094 #ifdef _WINDOWS
7095     setup_arg_regs(4);
7096     // On Windows the last argument is passed on the stack, so move it into the appropriate register.
7097     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
7098     // Save callee save registers.
7099     __ push(tmp3);
7100     __ push(tmp4);
7101 #endif
7102     __ push(tmp5);
7103 
7104     // Rename temps used throughout the code
7105     const Register idx = tmp1;
7106     const Register numIterTmp = tmp2;
7107 
7108     // Start idx from zero.
7109     __ xorl(idx, idx);
7110     // Compute interior pointer for new array. We do this so that we can use same index for both old and new arrays.
7111     __ lea(newArr, Address(newArr, newIdx, Address::times_4));
7112     __ movl(numIterTmp, totalNumIter);
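         // Each output limb is a left funnel shift of two adjacent source limbs:
         //   newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
         // implemented with shldl on the scalar path and vpshldvd on the AVX-512 path.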
7113 
7114     // If vectorization is enabled, check whether the number of iterations is at least AVX3Threshold / 64.
7115     // If not, go to ShiftTwo, which shifts two numbers at a time.
7116     if (VM_Version::supports_avx512_vbmi2()) {
7117       __ cmpl(totalNumIter, (AVX3Threshold/64));
7118       __ jcc(Assembler::less, ShiftTwo);
7119 
7120       if (AVX3Threshold < 16 * 64) {
7121         __ cmpl(totalNumIter, 16);
7122         __ jcc(Assembler::less, ShiftTwo);
7123       }
7124       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
7125       __ subl(numIterTmp, 16);
7126       __ BIND(Shift512Loop);
7127       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7128       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 0x4), Assembler::AVX_512bit);
7129       __ vpshldvd(x1, x2, x0, Assembler::AVX_512bit);
7130       __ evmovdqul(Address(newArr, idx, Address::times_4), x1, Assembler::AVX_512bit);
7131       __ addl(idx, 16);
7132       __ subl(numIterTmp, 16);
7133       __ jcc(Assembler::greaterEqual, Shift512Loop);
7134       __ addl(numIterTmp, 16);
7135     }
7136     __ BIND(ShiftTwo);
7137     __ cmpl(totalNumIter, 1);
7138     __ jcc(Assembler::less, Exit);
7139     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7140     __ subl(numIterTmp, 2);
7141     __ jcc(Assembler::less, ShiftOne);
7142 
7143     __ BIND(ShiftTwoLoop);
7144     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7145     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 0x8));
7146     __ shldl(tmp3, tmp4);
7147     __ shldl(tmp4, tmp5);
7148     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7149     __ movl(Address(newArr, idx, Address::times_4, 0x4), tmp4);
7150     __ movl(tmp3, tmp5);
7151     __ addl(idx, 2);
7152     __ subl(numIterTmp, 2);
7153     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7154 
7155     // Do the last iteration
7156     __ BIND(ShiftOne);
7157     __ addl(numIterTmp, 2);
7158     __ cmpl(numIterTmp, 1);
7159     __ jcc(Assembler::less, Exit);
7160     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7161     __ shldl(tmp3, tmp4);
7162     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7163 
7164     __ BIND(Exit);
7165     // Restore callee save registers.
7166     __ pop(tmp5);
7167 #ifdef _WINDOWS
7168     __ pop(tmp4);
7169     __ pop(tmp3);
7170     restore_arg_regs();
7171 #endif
7172     __ leave(); // required for proper stackwalking of RuntimeStub frame
7173     __ ret(0);
7174     return start;
7175   }
7176 
7177   address generate_libmExp() {
7178     StubCodeMark mark(this, "StubRoutines", "libmExp");
7179 
7180     address start = __ pc();
7181 
7182     const XMMRegister x0  = xmm0;
7183     const XMMRegister x1  = xmm1;
7184     const XMMRegister x2  = xmm2;
7185     const XMMRegister x3  = xmm3;
7186 
7187     const XMMRegister x4  = xmm4;
7188     const XMMRegister x5  = xmm5;
7189     const XMMRegister x6  = xmm6;
7190     const XMMRegister x7  = xmm7;
7191 
7192     const Register tmp   = r11;
7193 
7194     BLOCK_COMMENT("Entry:");
7195     __ enter(); // required for proper stackwalking of RuntimeStub frame
7196 
7197     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7198 
7199     __ leave(); // required for proper stackwalking of RuntimeStub frame
7200     __ ret(0);
7201 
7202     return start;
7203 
7204   }
7205 
7206   address generate_libmLog() {
7207     StubCodeMark mark(this, "StubRoutines", "libmLog");
7208 
7209     address start = __ pc();
7210 
7211     const XMMRegister x0 = xmm0;
7212     const XMMRegister x1 = xmm1;
7213     const XMMRegister x2 = xmm2;
7214     const XMMRegister x3 = xmm3;
7215 
7216     const XMMRegister x4 = xmm4;
7217     const XMMRegister x5 = xmm5;
7218     const XMMRegister x6 = xmm6;
7219     const XMMRegister x7 = xmm7;
7220 
7221     const Register tmp1 = r11;
7222     const Register tmp2 = r8;
7223 
7224     BLOCK_COMMENT("Entry:");
7225     __ enter(); // required for proper stackwalking of RuntimeStub frame
7226 
7227     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
7228 
7229     __ leave(); // required for proper stackwalking of RuntimeStub frame
7230     __ ret(0);
7231 
7232     return start;
7233 
7234   }
7235 
7236   address generate_libmLog10() {
7237     StubCodeMark mark(this, "StubRoutines", "libmLog10");
7238 
7239     address start = __ pc();
7240 
7241     const XMMRegister x0 = xmm0;
7242     const XMMRegister x1 = xmm1;
7243     const XMMRegister x2 = xmm2;
7244     const XMMRegister x3 = xmm3;
7245 
7246     const XMMRegister x4 = xmm4;
7247     const XMMRegister x5 = xmm5;
7248     const XMMRegister x6 = xmm6;
7249     const XMMRegister x7 = xmm7;
7250 
7251     const Register tmp = r11;
7252 
7253     BLOCK_COMMENT("Entry:");
7254     __ enter(); // required for proper stackwalking of RuntimeStub frame
7255 
7256     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7257 
7258     __ leave(); // required for proper stackwalking of RuntimeStub frame
7259     __ ret(0);
7260 
7261     return start;
7262 
7263   }
7264 
7265   address generate_libmPow() {
7266     StubCodeMark mark(this, "StubRoutines", "libmPow");
7267 
7268     address start = __ pc();
7269 
7270     const XMMRegister x0 = xmm0;
7271     const XMMRegister x1 = xmm1;
7272     const XMMRegister x2 = xmm2;
7273     const XMMRegister x3 = xmm3;
7274 
7275     const XMMRegister x4 = xmm4;
7276     const XMMRegister x5 = xmm5;
7277     const XMMRegister x6 = xmm6;
7278     const XMMRegister x7 = xmm7;
7279 
7280     const Register tmp1 = r8;
7281     const Register tmp2 = r9;
7282     const Register tmp3 = r10;
7283     const Register tmp4 = r11;
7284 
7285     BLOCK_COMMENT("Entry:");
7286     __ enter(); // required for proper stackwalking of RuntimeStub frame
7287 
7288     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7289 
7290     __ leave(); // required for proper stackwalking of RuntimeStub frame
7291     __ ret(0);
7292 
7293     return start;
7294 
7295   }
7296 
7297   address generate_libmSin() {
7298     StubCodeMark mark(this, "StubRoutines", "libmSin");
7299 
7300     address start = __ pc();
7301 
7302     const XMMRegister x0 = xmm0;
7303     const XMMRegister x1 = xmm1;
7304     const XMMRegister x2 = xmm2;
7305     const XMMRegister x3 = xmm3;
7306 
7307     const XMMRegister x4 = xmm4;
7308     const XMMRegister x5 = xmm5;
7309     const XMMRegister x6 = xmm6;
7310     const XMMRegister x7 = xmm7;
7311 
7312     const Register tmp1 = r8;
7313     const Register tmp2 = r9;
7314     const Register tmp3 = r10;
7315     const Register tmp4 = r11;
7316 
7317     BLOCK_COMMENT("Entry:");
7318     __ enter(); // required for proper stackwalking of RuntimeStub frame
7319 
7320 #ifdef _WIN64
7321     __ push(rsi);
7322     __ push(rdi);
7323 #endif
7324     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7325 
7326 #ifdef _WIN64
7327     __ pop(rdi);
7328     __ pop(rsi);
7329 #endif
7330 
7331     __ leave(); // required for proper stackwalking of RuntimeStub frame
7332     __ ret(0);
7333 
7334     return start;
7335 
7336   }
7337 
7338   address generate_libmCos() {
7339     StubCodeMark mark(this, "StubRoutines", "libmCos");
7340 
7341     address start = __ pc();
7342 
7343     const XMMRegister x0 = xmm0;
7344     const XMMRegister x1 = xmm1;
7345     const XMMRegister x2 = xmm2;
7346     const XMMRegister x3 = xmm3;
7347 
7348     const XMMRegister x4 = xmm4;
7349     const XMMRegister x5 = xmm5;
7350     const XMMRegister x6 = xmm6;
7351     const XMMRegister x7 = xmm7;
7352 
7353     const Register tmp1 = r8;
7354     const Register tmp2 = r9;
7355     const Register tmp3 = r10;
7356     const Register tmp4 = r11;
7357 
7358     BLOCK_COMMENT("Entry:");
7359     __ enter(); // required for proper stackwalking of RuntimeStub frame
7360 
7361 #ifdef _WIN64
7362     __ push(rsi);
7363     __ push(rdi);
7364 #endif
7365     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7366 
7367 #ifdef _WIN64
7368     __ pop(rdi);
7369     __ pop(rsi);
7370 #endif
7371 
7372     __ leave(); // required for proper stackwalking of RuntimeStub frame
7373     __ ret(0);
7374 
7375     return start;
7376 
7377   }
7378 
7379   address generate_libmTan() {
7380     StubCodeMark mark(this, "StubRoutines", "libmTan");
7381 
7382     address start = __ pc();
7383 
7384     const XMMRegister x0 = xmm0;
7385     const XMMRegister x1 = xmm1;
7386     const XMMRegister x2 = xmm2;
7387     const XMMRegister x3 = xmm3;
7388 
7389     const XMMRegister x4 = xmm4;
7390     const XMMRegister x5 = xmm5;
7391     const XMMRegister x6 = xmm6;
7392     const XMMRegister x7 = xmm7;
7393 
7394     const Register tmp1 = r8;
7395     const Register tmp2 = r9;
7396     const Register tmp3 = r10;
7397     const Register tmp4 = r11;
7398 
7399     BLOCK_COMMENT("Entry:");
7400     __ enter(); // required for proper stackwalking of RuntimeStub frame
7401 
7402 #ifdef _WIN64
7403     __ push(rsi);
7404     __ push(rdi);
7405 #endif
7406     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7407 
7408 #ifdef _WIN64
7409     __ pop(rdi);
7410     __ pop(rsi);
7411 #endif
7412 
7413     __ leave(); // required for proper stackwalking of RuntimeStub frame
7414     __ ret(0);
7415 
7416     return start;
7417 
7418   }
7419 
7420 #undef __
7421 #define __ masm->
7422 
7423   // Continuation point for throwing of implicit exceptions that are
7424   // not handled in the current activation. Fabricates an exception
7425   // oop and initiates normal exception dispatching in this
7426   // frame. Since we need to preserve callee-saved values (currently
7427   // only for C2, but done for C1 as well) we need a callee-saved oop
7428   // map and therefore have to make these stubs into RuntimeStubs
7429   // rather than BufferBlobs.  If the compiler needs all registers to
7430   // be preserved between the fault point and the exception handler
7431   // then it must assume responsibility for that in
7432   // AbstractCompiler::continuation_for_implicit_null_exception or
7433   // continuation_for_implicit_division_by_zero_exception. All other
7434   // implicit exceptions (e.g., NullPointerException or
7435   // AbstractMethodError on entry) are either at call sites or
7436   // otherwise assume that stack unwinding will be initiated, so
7437   // caller saved registers were assumed volatile in the compiler.
7438   address generate_throw_exception(const char* name,
7439                                    address runtime_entry,
7440                                    Register arg1 = noreg,
7441                                    Register arg2 = noreg) {
7442     // Information about frame layout at time of blocking runtime call.
7443     // Note that we only have to preserve callee-saved registers since
7444     // the compilers are responsible for supplying a continuation point
7445     // if they expect all registers to be preserved.
7446     enum layout {
7447       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
7448       rbp_off2,
7449       return_off,
7450       return_off2,
7451       framesize // inclusive of return address
7452     };
7453 
7454     int insts_size = 512;
7455     int locs_size  = 64;
7456 
7457     CodeBuffer code(name, insts_size, locs_size);
7458     OopMapSet* oop_maps  = new OopMapSet();
7459     MacroAssembler* masm = new MacroAssembler(&code);
7460 
7461     address start = __ pc();
7462 
7463     // This is an inlined and slightly modified version of call_VM
7464     // which has the ability to fetch the return PC out of
7465     // thread-local storage and also sets up last_Java_sp slightly
7466     // differently than the real call_VM
7467 
7468     __ enter(); // required for proper stackwalking of RuntimeStub frame
7469 
7470     assert(is_even(framesize/2), "sp not 16-byte aligned");
7471 
7472     // return address and rbp are already in place
7473     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
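         // framesize is counted in 32-bit slots; the return address and the rbp
         // pushed by enter() already account for 4 of them, hence framesize - 4.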
7474 
7475     int frame_complete = __ pc() - start;
7476 
7477     // Set up last_Java_sp and last_Java_fp
7478     address the_pc = __ pc();
7479     __ set_last_Java_frame(rsp, rbp, the_pc);
7480     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
7481 
7482     // Call runtime
7483     if (arg1 != noreg) {
7484       assert(arg2 != c_rarg1, "clobbered");
7485       __ movptr(c_rarg1, arg1);
7486     }
7487     if (arg2 != noreg) {
7488       __ movptr(c_rarg2, arg2);
7489     }
7490     __ movptr(c_rarg0, r15_thread);
7491     BLOCK_COMMENT("call runtime_entry");
7492     __ call(RuntimeAddress(runtime_entry));
7493 
7494     // Generate oop map
7495     OopMap* map = new OopMap(framesize, 0);
7496 
7497     oop_maps->add_gc_map(the_pc - start, map);
7498 
7499     __ reset_last_Java_frame(true);
7500 
7501     __ leave(); // required for proper stackwalking of RuntimeStub frame
7502 
7503     // check for pending exceptions
7504 #ifdef ASSERT
7505     Label L;
7506     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
7507             (int32_t) NULL_WORD);
7508     __ jcc(Assembler::notEqual, L);
7509     __ should_not_reach_here();
7510     __ bind(L);
7511 #endif // ASSERT
7512     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7513 
7514 
7515     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7516     RuntimeStub* stub =
7517       RuntimeStub::new_runtime_stub(name,
7518                                     &code,
7519                                     frame_complete,
7520                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7521                                     oop_maps, false);
7522     return stub->entry_point();
7523   }
7524 
7525   void create_control_words() {
7526     // MXCSR: round to nearest, all exceptions masked
7527     StubRoutines::x86::_mxcsr_std = 0x1F80;
7528   }
7529 
7530   // Initialization
7531   void generate_initial() {
7532     // Generates all stubs and initializes the entry points
7533 
7534     // These platform-specific settings are needed by generate_call_stub()
7535     create_control_words();
7536 
7537     // Entry points that exist on all platforms. Note: this is code
7538     // that could be shared among different platforms - however, the
7539     // benefit seems to be smaller than the disadvantage of having a
7540     // much more complicated generator structure. See also the comment in
7541     // stubRoutines.hpp.
7542 
7543     StubRoutines::_forward_exception_entry = generate_forward_exception();
7544 
7545     StubRoutines::_call_stub_entry =
7546       generate_call_stub(StubRoutines::_call_stub_return_address);
7547 
7548     // is referenced by megamorphic call
7549     StubRoutines::_catch_exception_entry = generate_catch_exception();
7550 
7551     // atomic calls
7552     StubRoutines::_fence_entry                = generate_orderaccess_fence();
7553 
7554     // platform dependent
7555     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
7556 
7557     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
7558 
7559     StubRoutines::x86::_f2i_fixup             = generate_f2i_fixup();
7560     StubRoutines::x86::_f2l_fixup             = generate_f2l_fixup();
7561     StubRoutines::x86::_d2i_fixup             = generate_d2i_fixup();
7562     StubRoutines::x86::_d2l_fixup             = generate_d2l_fixup();
7563 
7564     StubRoutines::x86::_float_sign_mask       = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
7565     StubRoutines::x86::_float_sign_flip       = generate_fp_mask("float_sign_flip",  0x8000000080000000);
7566     StubRoutines::x86::_double_sign_mask      = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7567     StubRoutines::x86::_double_sign_flip      = generate_fp_mask("double_sign_flip", 0x8000000000000000);
7568 
7569     // Build this early so it's available for the interpreter.
7570     StubRoutines::_throw_StackOverflowError_entry =
7571       generate_throw_exception("StackOverflowError throw_exception",
7572                                CAST_FROM_FN_PTR(address,
7573                                                 SharedRuntime::
7574                                                 throw_StackOverflowError));
7575     StubRoutines::_throw_delayed_StackOverflowError_entry =
7576       generate_throw_exception("delayed StackOverflowError throw_exception",
7577                                CAST_FROM_FN_PTR(address,
7578                                                 SharedRuntime::
7579                                                 throw_delayed_StackOverflowError));
7580     if (UseCRC32Intrinsics) {
7581       // set table address before stub generation which use it
7582       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
7583       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7584     }
7585 
7586     if (UseCRC32CIntrinsics) {
7587       bool supports_clmul = VM_Version::supports_clmul();
7588       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
7589       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
7590       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
7591     }
7592 
7593     if (UseAdler32Intrinsics) {
7594        StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7595     }
7596 
7597     if (UseLibmIntrinsic && InlineIntrinsics) {
7598       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
7599           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
7600           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7601         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
7602         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
7603         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
7604         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
7605         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
7606         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
7607         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
7608         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
7609         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
7610         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
7611         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
7612         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
7613         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
7614         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
7615       }
7616       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
7617         StubRoutines::_dexp = generate_libmExp();
7618       }
7619       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7620         StubRoutines::_dlog = generate_libmLog();
7621       }
7622       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
7623         StubRoutines::_dlog10 = generate_libmLog10();
7624       }
7625       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
7626         StubRoutines::_dpow = generate_libmPow();
7627       }
7628       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7629         StubRoutines::_dsin = generate_libmSin();
7630       }
7631       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7632         StubRoutines::_dcos = generate_libmCos();
7633       }
7634       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7635         StubRoutines::_dtan = generate_libmTan();
7636       }
7637     }
7638 
7639     // Safefetch stubs.
7640     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7641                                                        &StubRoutines::_safefetch32_fault_pc,
7642                                                        &StubRoutines::_safefetch32_continuation_pc);
7643     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7644                                                        &StubRoutines::_safefetchN_fault_pc,
7645                                                        &StubRoutines::_safefetchN_continuation_pc);
7646   }
7647 
7648   void generate_all() {
7649     // Generates all stubs and initializes the entry points
7650 
7651     // These entry points require SharedInfo::stack0 to be set up in
7652     // non-core builds and need to be relocatable, so they each
7653     // fabricate a RuntimeStub internally.
7654     StubRoutines::_throw_AbstractMethodError_entry =
7655       generate_throw_exception("AbstractMethodError throw_exception",
7656                                CAST_FROM_FN_PTR(address,
7657                                                 SharedRuntime::
7658                                                 throw_AbstractMethodError));
7659 
7660     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7661       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7662                                CAST_FROM_FN_PTR(address,
7663                                                 SharedRuntime::
7664                                                 throw_IncompatibleClassChangeError));
7665 
7666     StubRoutines::_throw_NullPointerException_at_call_entry =
7667       generate_throw_exception("NullPointerException at call throw_exception",
7668                                CAST_FROM_FN_PTR(address,
7669                                                 SharedRuntime::
7670                                                 throw_NullPointerException_at_call));
7671 
7672     // entry points that are platform specific
7673     StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
7674     StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000);
7675     StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7676     StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000);
7677     StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF);
7678     StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
7679     StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
7680     StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff000000ff);
7681     StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff0000ffff);
7682     StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
7683                                                                         0xFFFFFFFF, 0, 0, 0);
7684     StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
7685                                                                         0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
7686     StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x0302010003020100);
7687     StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
7688     StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x0100010001000100);
7689     StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask("vector_long_shuffle_mask", 0x0000000100000000);
7690     StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
7691     StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
7692 
7693     // support for verify_oop (must happen after universe_init)
7694     if (VerifyOops) {
7695       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
7696     }
7697 
7698     // data cache line writeback
7699     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7700     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7701 
7702     // arraycopy stubs used by compilers
7703     generate_arraycopy_stubs();
7704 
7705     // don't bother generating these AES intrinsic stubs unless global flag is set
7706     if (UseAESIntrinsics) {
7707       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
7708       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7709       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7710       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7711       if (VM_Version::supports_avx512_vaes() &&  VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq() ) {
7712         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
7713         StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt();
7714         StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt();
7715         StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
7716         StubRoutines::x86::_ghash_poly512_addr = ghash_polynomial512_addr();
7717         StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
7718         StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7719       } else {
7720         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
7721       }
7722     }
7723 
7724     if (UseAESCTRIntrinsics) {
7725       if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512bw() && VM_Version::supports_avx512vl()) {
7726         if (StubRoutines::x86::_counter_mask_addr == NULL) {
7727           StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
7728         }
7729         StubRoutines::_counterMode_AESCrypt = generate_counterMode_VectorAESCrypt();
7730       } else {
7731         StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
7732         StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
7733       }
7734     }
7735 
7736     if (UseMD5Intrinsics) {
7737       StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
7738       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
7739     }
7740     if (UseSHA1Intrinsics) {
7741       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
7742       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
7743       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
7744       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
7745     }
7746     if (UseSHA256Intrinsics) {
7747       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
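           // Build _k256_W by duplicating each 16-byte block of _k256 into both
           // halves of a 32-byte slot, so the AVX2 SHA-256 code can read the
           // round constants with full-width 256-bit loads.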
7748       char* dst = (char*)StubRoutines::x86::_k256_W;
7749       char* src = (char*)StubRoutines::x86::_k256;
7750       for (int ii = 0; ii < 16; ++ii) {
7751         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
7752         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
7753       }
7754       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
7755       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
7756       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
7757       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
7758     }
7759     if (UseSHA512Intrinsics) {
7760       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
7761       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
7762       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
7763       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
7764     }
7765 
7766     // Generate GHASH intrinsics code
7767     if (UseGHASHIntrinsics) {
7768       if (StubRoutines::x86::_ghash_long_swap_mask_addr == NULL) {
7769         StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
7770       }
7771       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
7772       if (VM_Version::supports_avx()) {
7773         StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
7774         StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
7775         StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
7776       } else {
7777         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7778       }
7779     }
7780 
7781 
7782     if (UseBASE64Intrinsics) {
7783       if (VM_Version::supports_avx2() &&
7784          VM_Version::supports_avx512bw() &&
7785          VM_Version::supports_avx512vl()) {
7786         StubRoutines::x86::_avx2_shuffle_base64 = base64_avx2_shuffle_addr();
7787         StubRoutines::x86::_avx2_input_mask_base64 = base64_avx2_input_mask_addr();
7788         StubRoutines::x86::_avx2_lut_base64 = base64_avx2_lut_addr();
7789       }
7790       StubRoutines::x86::_encoding_table_base64 = base64_encoding_table_addr();
7791       if (VM_Version::supports_avx512_vbmi()) {
7792         StubRoutines::x86::_shuffle_base64 = base64_shuffle_addr();
7793         StubRoutines::x86::_lookup_lo_base64 = base64_vbmi_lookup_lo_addr();
7794         StubRoutines::x86::_lookup_hi_base64 = base64_vbmi_lookup_hi_addr();
7795         StubRoutines::x86::_lookup_lo_base64url = base64_vbmi_lookup_lo_url_addr();
7796         StubRoutines::x86::_lookup_hi_base64url = base64_vbmi_lookup_hi_url_addr();
7797         StubRoutines::x86::_pack_vec_base64 = base64_vbmi_pack_vec_addr();
7798         StubRoutines::x86::_join_0_1_base64 = base64_vbmi_join_0_1_addr();
7799         StubRoutines::x86::_join_1_2_base64 = base64_vbmi_join_1_2_addr();
7800         StubRoutines::x86::_join_2_3_base64 = base64_vbmi_join_2_3_addr();
7801       }
7802       StubRoutines::x86::_decoding_table_base64 = base64_decoding_table_addr();
7803       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7804       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7805     }
7806 
7807     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7808     if (bs_nm != NULL) {
7809       StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
7810     }
7811 #ifdef COMPILER2
7812     if (UseMultiplyToLenIntrinsic) {
7813       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7814     }
7815     if (UseSquareToLenIntrinsic) {
7816       StubRoutines::_squareToLen = generate_squareToLen();
7817     }
7818     if (UseMulAddIntrinsic) {
7819       StubRoutines::_mulAdd = generate_mulAdd();
7820     }
7821     if (VM_Version::supports_avx512_vbmi2()) {
7822       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7823       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
7824     }
7825     if (UseMontgomeryMultiplyIntrinsic) {
7826       StubRoutines::_montgomeryMultiply
7827         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
7828     }
7829     if (UseMontgomerySquareIntrinsic) {
7830       StubRoutines::_montgomerySquare
7831         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
7832     }
7833 
7834     // Get svml stub routine addresses
7835     void *libsvml = NULL;
7836     char ebuf[1024];
7837     char dll_name[JVM_MAXPATHLEN];
7838     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "svml")) {
7839       libsvml = os::dll_load(dll_name, ebuf, sizeof ebuf);
7840     }
7841     if (libsvml != NULL) {
7842       // SVML method naming convention
7843       //   All the methods are named as __svml_op<T><N>_ha_<VV>
7844       //   Where:
7845       //      ha stands for high accuracy
7846       //      <T> is optional to indicate float/double
7847       //              Set to f for vector float operation
7848       //              Omitted for vector double operation
7849       //      <N> is the number of elements in the vector
7850       //              1, 2, 4, 8, 16
7851       //              e.g. 128 bit float vector has 4 float elements
7852       //      <VV> indicates the avx/sse level:
7853       //              z0 is AVX512, l9 is AVX2, e9 is AVX1 and ex is for SSE2
7854       //      e.g. __svml_expf16_ha_z0 is the method for computing 16 element vector float exp using AVX 512 insns
7855       //           __svml_exp8_ha_z0 is the method for computing 8 element vector double exp using AVX 512 insns
7856 
7857       log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "svml" JNI_LIB_SUFFIX, p2i(libsvml));
7858       if (UseAVX > 2) {
7859         for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
7860           int vop = VectorSupport::VECTOR_OP_SVML_START + op;
7861           if ((!VM_Version::supports_avx512dq()) &&
7862               (vop == VectorSupport::VECTOR_OP_LOG || vop == VectorSupport::VECTOR_OP_LOG10 || vop == VectorSupport::VECTOR_OP_POW)) {
7863             continue;
7864           }
7865           snprintf(ebuf, sizeof(ebuf), "__svml_%sf16_ha_z0", VectorSupport::svmlname[op]);
7866           StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libsvml, ebuf);
7867 
7868           snprintf(ebuf, sizeof(ebuf), "__svml_%s8_ha_z0", VectorSupport::svmlname[op]);
7869           StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libsvml, ebuf);
7870         }
7871       }
7872       const char* avx_sse_str = (UseAVX >= 2) ? "l9" : ((UseAVX == 1) ? "e9" : "ex");
7873       for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
7874         int vop = VectorSupport::VECTOR_OP_SVML_START + op;
7875         if (vop == VectorSupport::VECTOR_OP_POW) {
7876           continue;
7877         }
7878         snprintf(ebuf, sizeof(ebuf), "__svml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7879         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsvml, ebuf);
7880 
7881         snprintf(ebuf, sizeof(ebuf), "__svml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7882         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsvml, ebuf);
7883 
7884         snprintf(ebuf, sizeof(ebuf), "__svml_%sf8_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7885         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libsvml, ebuf);
7886 
7887         snprintf(ebuf, sizeof(ebuf), "__svml_%s1_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7888         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsvml, ebuf);
7889 
7890         snprintf(ebuf, sizeof(ebuf), "__svml_%s2_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7891         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsvml, ebuf);
7892 
7893         snprintf(ebuf, sizeof(ebuf), "__svml_%s4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7894         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libsvml, ebuf);
7895       }
7896     }
7897 #endif // COMPILER2
7898 
7899     if (UseVectorizedMismatchIntrinsic) {
7900       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
7901     }
7902   }
7903 
7904  public:
7905   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7906     if (all) {
7907       generate_all();
7908     } else {
7909       generate_initial();
7910     }
7911   }
7912 }; // end class declaration
7913 
7914 #define UCM_TABLE_MAX_ENTRIES 16
7915 void StubGenerator_generate(CodeBuffer* code, bool all) {
7916   if (UnsafeCopyMemory::_table == NULL) {
7917     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7918   }
7919   StubGenerator g(code, all);
7920 }