1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "ci/ciUtilities.hpp"
  29 #include "compiler/oopMap.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/barrierSetNMethod.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "interpreter/interpreter.hpp"
  35 #include "memory/universe.hpp"
  36 #include "nativeInst_x86.hpp"
  37 #include "oops/instanceOop.hpp"
  38 #include "oops/method.hpp"
  39 #include "oops/objArrayKlass.hpp"
  40 #include "oops/oop.inline.hpp"
  41 #include "prims/methodHandles.hpp"
  42 #include "runtime/arguments.hpp"
  43 #include "runtime/frame.inline.hpp"
  44 #include "runtime/handles.inline.hpp"
  45 #include "runtime/sharedRuntime.hpp"
  46 #include "runtime/stubCodeGenerator.hpp"
  47 #include "runtime/stubRoutines.hpp"
  48 #include "runtime/thread.inline.hpp"
  49 #ifdef COMPILER2
  50 #include "opto/runtime.hpp"
  51 #endif
  52 #if INCLUDE_JVMCI
  53 #include "jvmci/jvmci_globals.hpp"
  54 #endif
  55 #if INCLUDE_ZGC
  56 #include "gc/z/zThreadLocalData.hpp"
  57 #endif
  58 
  59 // Declaration and definition of StubGenerator (no .hpp file).
  60 // For a more detailed description of the stub routine structure
  61 // see the comment in stubRoutines.hpp
  62 
  63 #define __ _masm->
  64 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  65 #define a__ ((Assembler*)_masm)->
  66 
  67 #ifdef PRODUCT
  68 #define BLOCK_COMMENT(str) /* nothing */
  69 #else
  70 #define BLOCK_COMMENT(str) __ block_comment(str)
  71 #endif
  72 
  73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  74 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
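     // The low six MXCSR bits (0..5) are the sticky exception status flags, which
     // ordinary FP arithmetic sets as a side effect; masking with 0xFFC0 keeps only
     // the control bits (exception masks, rounding control, FTZ/DAZ) for the
     // comparisons below.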
  75 
  76 // Stub Code definitions
  77 
  78 class StubGenerator: public StubCodeGenerator {
  79  private:
  80 
  81 #ifdef PRODUCT
  82 #define inc_counter_np(counter) ((void)0)
  83 #else
  84   void inc_counter_np_(int& counter) {
  85     // This can destroy rscratch1 if counter is far from the code cache
  86     __ incrementl(ExternalAddress((address)&counter));
  87   }
  88 #define inc_counter_np(counter) \
  89   BLOCK_COMMENT("inc_counter " #counter); \
  90   inc_counter_np_(counter);
  91 #endif
  92 
  93   // Call stubs are used to call Java from C
  94   //
  95   // Linux Arguments:
  96   //    c_rarg0:   call wrapper address                   address
  97   //    c_rarg1:   result                                 address
  98   //    c_rarg2:   result type                            BasicType
  99   //    c_rarg3:   method                                 Method*
 100   //    c_rarg4:   (interpreter) entry point              address
 101   //    c_rarg5:   parameters                             intptr_t*
 102   //    16(rbp): parameter size (in words)              int
 103   //    24(rbp): thread                                 Thread*
 104   //
 105   //     [ return_from_Java     ] <--- rsp
 106   //     [ argument word n      ]
 107   //      ...
 108   // -12 [ argument word 1      ]
 109   // -11 [ saved r15            ] <--- rsp_after_call
 110   // -10 [ saved r14            ]
 111   //  -9 [ saved r13            ]
 112   //  -8 [ saved r12            ]
 113   //  -7 [ saved rbx            ]
 114   //  -6 [ call wrapper         ]
 115   //  -5 [ result               ]
 116   //  -4 [ result type          ]
 117   //  -3 [ method               ]
 118   //  -2 [ entry point          ]
 119   //  -1 [ parameters           ]
 120   //   0 [ saved rbp            ] <--- rbp
 121   //   1 [ return address       ]
 122   //   2 [ parameter size       ]
 123   //   3 [ thread               ]
 124   //
 125   // Windows Arguments:
 126   //    c_rarg0:   call wrapper address                   address
 127   //    c_rarg1:   result                                 address
 128   //    c_rarg2:   result type                            BasicType
 129   //    c_rarg3:   method                                 Method*
 130   //    48(rbp): (interpreter) entry point              address
 131   //    56(rbp): parameters                             intptr_t*
 132   //    64(rbp): parameter size (in words)              int
 133   //    72(rbp): thread                                 Thread*
 134   //
 135   //     [ return_from_Java     ] <--- rsp
 136   //     [ argument word n      ]
 137   //      ...
 138   // -60 [ argument word 1      ]
 139   // -59 [ saved xmm31          ] <--- rsp_after_call
 140   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 141   // -27 [ saved xmm15          ]
 142   //     [ saved xmm7-xmm14     ]
 143   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 144   //  -7 [ saved r15            ]
 145   //  -6 [ saved r14            ]
 146   //  -5 [ saved r13            ]
 147   //  -4 [ saved r12            ]
 148   //  -3 [ saved rdi            ]
 149   //  -2 [ saved rsi            ]
 150   //  -1 [ saved rbx            ]
 151   //   0 [ saved rbp            ] <--- rbp
 152   //   1 [ return address       ]
 153   //   2 [ call wrapper         ]
 154   //   3 [ result               ]
 155   //   4 [ result type          ]
 156   //   5 [ method               ]
 157   //   6 [ entry point          ]
 158   //   7 [ parameters           ]
 159   //   8 [ parameter size       ]
 160   //   9 [ thread               ]
 161   //
 162   //    Windows reserves the caller's stack space for arguments 1-4.
 163   //    We spill c_rarg0-c_rarg3 to this space.
 164 
 165   // Call stub stack layout word offsets from rbp
 166   enum call_stub_layout {
 167 #ifdef _WIN64
 168     xmm_save_first     = 6,  // save from xmm6
 169     xmm_save_last      = 31, // to xmm31
 170     xmm_save_base      = -9,
 171     rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59
 172     r15_off            = -7,
 173     r14_off            = -6,
 174     r13_off            = -5,
 175     r12_off            = -4,
 176     rdi_off            = -3,
 177     rsi_off            = -2,
 178     rbx_off            = -1,
 179     rbp_off            =  0,
 180     retaddr_off        =  1,
 181     call_wrapper_off   =  2,
 182     result_off         =  3,
 183     result_type_off    =  4,
 184     method_off         =  5,
 185     entry_point_off    =  6,
 186     parameters_off     =  7,
 187     parameter_size_off =  8,
 188     thread_off         =  9
 189 #else
 190     rsp_after_call_off = -12,
 191     mxcsr_off          = rsp_after_call_off,
 192     r15_off            = -11,
 193     r14_off            = -10,
 194     r13_off            = -9,
 195     r12_off            = -8,
 196     rbx_off            = -7,
 197     call_wrapper_off   = -6,
 198     result_off         = -5,
 199     result_type_off    = -4,
 200     method_off         = -3,
 201     entry_point_off    = -2,
 202     parameters_off     = -1,
 203     rbp_off            =  0,
 204     retaddr_off        =  1,
 205     parameter_size_off =  2,
 206     thread_off         =  3
 207 #endif
 208   };
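       // The offsets above are machine-word offsets from rbp of the call stub frame;
       // they become Address(rbp, off * wordSize) operands in generate_call_stub()
       // and generate_catch_exception() below.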
 209 
 210 #ifdef _WIN64
 211   Address xmm_save(int reg) {
 212     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 213     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 214   }
 215 #endif
 216 
 217   address generate_call_stub(address& return_address) {
 218     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 219            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 220            "adjust this code");
 221     StubCodeMark mark(this, "StubRoutines", "call_stub");
 222     address start = __ pc();
 223 
 224     // same as in generate_catch_exception()!
 225     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 226 
 227     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 228     const Address result        (rbp, result_off         * wordSize);
 229     const Address result_type   (rbp, result_type_off    * wordSize);
 230     const Address method        (rbp, method_off         * wordSize);
 231     const Address entry_point   (rbp, entry_point_off    * wordSize);
 232     const Address parameters    (rbp, parameters_off     * wordSize);
 233     const Address parameter_size(rbp, parameter_size_off * wordSize);
 234 
 235     // same as in generate_catch_exception()!
 236     const Address thread        (rbp, thread_off         * wordSize);
 237 
 238     const Address r15_save(rbp, r15_off * wordSize);
 239     const Address r14_save(rbp, r14_off * wordSize);
 240     const Address r13_save(rbp, r13_off * wordSize);
 241     const Address r12_save(rbp, r12_off * wordSize);
 242     const Address rbx_save(rbp, rbx_off * wordSize);
 243 
 244     // stub code
 245     __ enter();
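         // rsp_after_call_off is negative, so the subtraction below reserves the
         // register save area laid out above (|rsp_after_call_off| words below rbp).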
 246     __ subptr(rsp, -rsp_after_call_off * wordSize);
 247 
 248     // save register parameters
 249 #ifndef _WIN64
 250     __ movptr(parameters,   c_rarg5); // parameters
 251     __ movptr(entry_point,  c_rarg4); // entry_point
 252 #endif
 253 
 254     __ movptr(method,       c_rarg3); // method
 255     __ movl(result_type,  c_rarg2);   // result type
 256     __ movptr(result,       c_rarg1); // result
 257     __ movptr(call_wrapper, c_rarg0); // call wrapper
 258 
 259     // save regs belonging to calling function
 260     __ movptr(rbx_save, rbx);
 261     __ movptr(r12_save, r12);
 262     __ movptr(r13_save, r13);
 263     __ movptr(r14_save, r14);
 264     __ movptr(r15_save, r15);
 265 
 266 #ifdef _WIN64
 267     int last_reg = 15;
 268     if (UseAVX > 2) {
 269       last_reg = 31;
 270     }
 271     if (VM_Version::supports_evex()) {
 272       for (int i = xmm_save_first; i <= last_reg; i++) {
 273         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 274       }
 275     } else {
 276       for (int i = xmm_save_first; i <= last_reg; i++) {
 277         __ movdqu(xmm_save(i), as_XMMRegister(i));
 278       }
 279     }
 280 
 281     const Address rdi_save(rbp, rdi_off * wordSize);
 282     const Address rsi_save(rbp, rsi_off * wordSize);
 283 
 284     __ movptr(rsi_save, rsi);
 285     __ movptr(rdi_save, rdi);
 286 #else
 287     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 288     {
 289       Label skip_ldmx;
 290       __ stmxcsr(mxcsr_save);
 291       __ movl(rax, mxcsr_save);
 292       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 293       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 294       __ cmp32(rax, mxcsr_std);
 295       __ jcc(Assembler::equal, skip_ldmx);
 296       __ ldmxcsr(mxcsr_std);
 297       __ bind(skip_ldmx);
 298     }
 299 #endif
 300 
 301     // Load up thread register
 302     __ movptr(r15_thread, thread);
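         // r12 was saved above but doubles as the compressed-oops heap base
         // register, so reload the heap base before calling into Java.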
 303     __ reinit_heapbase();
 304 
 305 #ifdef ASSERT
 306     // make sure we have no pending exceptions
 307     {
 308       Label L;
 309       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 310       __ jcc(Assembler::equal, L);
 311       __ stop("StubRoutines::call_stub: entered with pending exception");
 312       __ bind(L);
 313     }
 314 #endif
 315 
 316     // pass parameters if any
 317     BLOCK_COMMENT("pass parameters if any");
 318     Label parameters_done;
 319     __ movl(c_rarg3, parameter_size);
 320     __ testl(c_rarg3, c_rarg3);
 321     __ jcc(Assembler::zero, parameters_done);
 322 
 323     Label loop;
 324     __ movptr(c_rarg2, parameters);       // parameter pointer
 325     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 326     __ BIND(loop);
 327     __ movptr(rax, Address(c_rarg2, 0));  // get parameter
 328     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 329     __ decrementl(c_rarg1);             // decrement counter
 330     __ push(rax);                       // pass parameter
 331     __ jcc(Assembler::notZero, loop);
 332 
 333     // call Java function
 334     __ BIND(parameters_done);
 335     __ movptr(rbx, method);             // get Method*
 336     __ movptr(c_rarg1, entry_point);    // get entry_point
 337     __ mov(r13, rsp);                   // set sender sp
 338     BLOCK_COMMENT("call Java function");
 339     __ call(c_rarg1);
 340 
 341     BLOCK_COMMENT("call_stub_return_address:");
 342     return_address = __ pc();
 343 
 344     // store result depending on type (everything that is not
 345     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 346     __ movptr(c_rarg0, result);
 347     Label is_long, is_float, is_double, exit;
 348     __ movl(c_rarg1, result_type);
 349     __ cmpl(c_rarg1, T_OBJECT);
 350     __ jcc(Assembler::equal, is_long);
 351     __ cmpl(c_rarg1, T_LONG);
 352     __ jcc(Assembler::equal, is_long);
 353     __ cmpl(c_rarg1, T_FLOAT);
 354     __ jcc(Assembler::equal, is_float);
 355     __ cmpl(c_rarg1, T_DOUBLE);
 356     __ jcc(Assembler::equal, is_double);
 357 
 358     // handle T_INT case
 359     __ movl(Address(c_rarg0, 0), rax);
 360 
 361     __ BIND(exit);
 362 
 363     // pop parameters
 364     __ lea(rsp, rsp_after_call);
 365 
 366 #ifdef ASSERT
 367     // verify that threads correspond
 368     {
 369       Label L1, L2, L3;
 370       __ cmpptr(r15_thread, thread);
 371       __ jcc(Assembler::equal, L1);
 372       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 373       __ bind(L1);
 374       __ get_thread(rbx);
 375       __ cmpptr(r15_thread, thread);
 376       __ jcc(Assembler::equal, L2);
 377       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 378       __ bind(L2);
 379       __ cmpptr(r15_thread, rbx);
 380       __ jcc(Assembler::equal, L3);
 381       __ stop("StubRoutines::call_stub: threads must correspond");
 382       __ bind(L3);
 383     }
 384 #endif
 385 
 386     // restore regs belonging to calling function
 387 #ifdef _WIN64
 388     // emit the restores for xmm regs
 389     if (VM_Version::supports_evex()) {
 390       for (int i = xmm_save_first; i <= last_reg; i++) {
 391         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 392       }
 393     } else {
 394       for (int i = xmm_save_first; i <= last_reg; i++) {
 395         __ movdqu(as_XMMRegister(i), xmm_save(i));
 396       }
 397     }
 398 #endif
 399     __ movptr(r15, r15_save);
 400     __ movptr(r14, r14_save);
 401     __ movptr(r13, r13_save);
 402     __ movptr(r12, r12_save);
 403     __ movptr(rbx, rbx_save);
 404 
 405 #ifdef _WIN64
 406     __ movptr(rdi, rdi_save);
 407     __ movptr(rsi, rsi_save);
 408 #else
 409     __ ldmxcsr(mxcsr_save);
 410 #endif
 411 
 412     // restore rsp
 413     __ addptr(rsp, -rsp_after_call_off * wordSize);
 414 
 415     // return
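         // vzeroupper clears the upper YMM/ZMM state to avoid AVX/SSE transition
         // penalties in the (possibly SSE-only) caller.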
 416     __ vzeroupper();
 417     __ pop(rbp);
 418     __ ret(0);
 419 
 420     // handle return types different from T_INT
 421     __ BIND(is_long);
 422     __ movq(Address(c_rarg0, 0), rax);
 423     __ jmp(exit);
 424 
 425     __ BIND(is_float);
 426     __ movflt(Address(c_rarg0, 0), xmm0);
 427     __ jmp(exit);
 428 
 429     __ BIND(is_double);
 430     __ movdbl(Address(c_rarg0, 0), xmm0);
 431     __ jmp(exit);
 432 
 433     return start;
 434   }
 435 
 436   // Return point for a Java call if there's an exception thrown in
 437   // Java code.  The exception is caught and transformed into a
 438   // pending exception stored in JavaThread that can be tested from
 439   // within the VM.
 440   //
 441   // Note: Usually the parameters are removed by the callee. In case
 442   // of an exception crossing an activation frame boundary, that is
 443   // not the case if the callee is compiled code => need to setup the
 444   // rsp.
 445   //
 446   // rax: exception oop
 447 
 448   address generate_catch_exception() {
 449     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 450     address start = __ pc();
 451 
 452     // same as in generate_call_stub():
 453     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 454     const Address thread        (rbp, thread_off         * wordSize);
 455 
 456 #ifdef ASSERT
 457     // verify that threads correspond
 458     {
 459       Label L1, L2, L3;
 460       __ cmpptr(r15_thread, thread);
 461       __ jcc(Assembler::equal, L1);
 462       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 463       __ bind(L1);
 464       __ get_thread(rbx);
 465       __ cmpptr(r15_thread, thread);
 466       __ jcc(Assembler::equal, L2);
 467       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 468       __ bind(L2);
 469       __ cmpptr(r15_thread, rbx);
 470       __ jcc(Assembler::equal, L3);
 471       __ stop("StubRoutines::catch_exception: threads must correspond");
 472       __ bind(L3);
 473     }
 474 #endif
 475 
 476     // set pending exception
 477     __ verify_oop(rax);
 478 
 479     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 480     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 481     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 482     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 483 
 484     // complete return to VM
 485     assert(StubRoutines::_call_stub_return_address != NULL,
 486            "_call_stub_return_address must have been generated before");
 487     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 488 
 489     return start;
 490   }
 491 
 492   // Continuation point for runtime calls returning with a pending
 493   // exception.  The pending exception check happened in the runtime
 494   // or native call stub.  The pending exception in Thread is
 495   // converted into a Java-level exception.
 496   //
 497   // Contract with Java-level exception handlers:
 498   // rax: exception
 499   // rdx: throwing pc
 500   //
 501   // NOTE: At entry of this stub, exception-pc must be on stack !!
 502 
 503   address generate_forward_exception() {
 504     StubCodeMark mark(this, "StubRoutines", "forward exception");
 505     address start = __ pc();
 506 
 507     // Upon entry, the sp points to the return address returning into
 508     // Java (interpreted or compiled) code; i.e., the return address
 509     // becomes the throwing pc.
 510     //
 511     // Arguments pushed before the runtime call are still on the stack
 512     // but the exception handler will reset the stack pointer ->
 513     // ignore them.  A potential result in registers can be ignored as
 514     // well.
 515 
 516 #ifdef ASSERT
 517     // make sure this code is only executed if there is a pending exception
 518     {
 519       Label L;
 520       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
 521       __ jcc(Assembler::notEqual, L);
 522       __ stop("StubRoutines::forward exception: no pending exception (1)");
 523       __ bind(L);
 524     }
 525 #endif
 526 
 527     // compute exception handler into rbx
 528     __ movptr(c_rarg0, Address(rsp, 0));
 529     BLOCK_COMMENT("call exception_handler_for_return_address");
 530     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 531                          SharedRuntime::exception_handler_for_return_address),
 532                     r15_thread, c_rarg0);
 533     __ mov(rbx, rax);
 534 
 535     // setup rax & rdx, remove return address & clear pending exception
 536     __ pop(rdx);
 537     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 538     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 539 
 540 #ifdef ASSERT
 541     // make sure exception is set
 542     {
 543       Label L;
 544       __ testptr(rax, rax);
 545       __ jcc(Assembler::notEqual, L);
 546       __ stop("StubRoutines::forward exception: no pending exception (2)");
 547       __ bind(L);
 548     }
 549 #endif
 550 
 551     // continue at exception handler (return address removed)
 552     // rax: exception
 553     // rbx: exception handler
 554     // rdx: throwing pc
 555     __ verify_oop(rax);
 556     __ jmp(rbx);
 557 
 558     return start;
 559   }
 560 
 561   // Support for intptr_t OrderAccess::fence()
 562   //
 563   // Arguments :
 564   //
 565   // Result:
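       //
       // On x86 only StoreLoad ordering needs an explicit fence; the membar below is
       // typically emitted as a serializing locked instruction (or an mfence).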
 566   address generate_orderaccess_fence() {
 567     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 568     address start = __ pc();
 569     __ membar(Assembler::StoreLoad);
 570     __ ret(0);
 571 
 572     return start;
 573   }
 574 
 575 
 576   // Support for intptr_t get_previous_sp()
 577   //
 578   // This routine is used to find the previous stack pointer for the
 579   // caller.
 580   address generate_get_previous_sp() {
 581     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
 582     address start = __ pc();
 583 
 584     __ movptr(rax, rsp);
 585     __ addptr(rax, 8); // return address is at the top of the stack.
 586     __ ret(0);
 587 
 588     return start;
 589   }
 590 
 591   //----------------------------------------------------------------------------------------------------
 592   // Support for void verify_mxcsr()
 593   //
 594   // This routine is used with -Xcheck:jni to verify that native
 595   // JNI code does not return to Java code without restoring the
 596   // MXCSR register to our expected state.
 597 
 598   address generate_verify_mxcsr() {
 599     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 600     address start = __ pc();
 601 
 602     const Address mxcsr_save(rsp, 0);
 603 
 604     if (CheckJNICalls) {
 605       Label ok_ret;
 606       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 607       __ push(rax);
 608       __ subptr(rsp, wordSize);      // allocate a temp location
 609       __ stmxcsr(mxcsr_save);
 610       __ movl(rax, mxcsr_save);
 611       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 612       __ cmp32(rax, mxcsr_std);
 613       __ jcc(Assembler::equal, ok_ret);
 614 
 615       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 616 
 617       __ ldmxcsr(mxcsr_std);
 618 
 619       __ bind(ok_ret);
 620       __ addptr(rsp, wordSize);
 621       __ pop(rax);
 622     }
 623 
 624     __ ret(0);
 625 
 626     return start;
 627   }
 628 
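       // The f2i/f2l/d2i/d2l fixup stubs below patch up the result of a cvttss2si /
       // cvttsd2si that produced the "integer indefinite" value (0x80000000 or
       // 0x8000000000000000) because the input was a NaN or out of range.  They
       // re-read the original floating-point bits from the caller's stack slot and
       // store the Java result back into it: 0 for NaN, min/max jint (or jlong) for
       // negative/positive overflow.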
 629   address generate_f2i_fixup() {
 630     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
 631     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 632 
 633     address start = __ pc();
 634 
 635     Label L;
 636 
 637     __ push(rax);
 638     __ push(c_rarg3);
 639     __ push(c_rarg2);
 640     __ push(c_rarg1);
 641 
 642     __ movl(rax, 0x7f800000);
 643     __ xorl(c_rarg3, c_rarg3);
 644     __ movl(c_rarg2, inout);
 645     __ movl(c_rarg1, c_rarg2);
 646     __ andl(c_rarg1, 0x7fffffff);
 647     __ cmpl(rax, c_rarg1); // NaN? -> 0
 648     __ jcc(Assembler::negative, L);
 649     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
 650     __ movl(c_rarg3, 0x80000000);
 651     __ movl(rax, 0x7fffffff);
 652     __ cmovl(Assembler::positive, c_rarg3, rax);
 653 
 654     __ bind(L);
 655     __ movptr(inout, c_rarg3);
 656 
 657     __ pop(c_rarg1);
 658     __ pop(c_rarg2);
 659     __ pop(c_rarg3);
 660     __ pop(rax);
 661 
 662     __ ret(0);
 663 
 664     return start;
 665   }
 666 
 667   address generate_f2l_fixup() {
 668     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
 669     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 670     address start = __ pc();
 671 
 672     Label L;
 673 
 674     __ push(rax);
 675     __ push(c_rarg3);
 676     __ push(c_rarg2);
 677     __ push(c_rarg1);
 678 
 679     __ movl(rax, 0x7f800000);
 680     __ xorl(c_rarg3, c_rarg3);
 681     __ movl(c_rarg2, inout);
 682     __ movl(c_rarg1, c_rarg2);
 683     __ andl(c_rarg1, 0x7fffffff);
 684     __ cmpl(rax, c_rarg1); // NaN? -> 0
 685     __ jcc(Assembler::negative, L);
 686     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
 687     __ mov64(c_rarg3, 0x8000000000000000);
 688     __ mov64(rax, 0x7fffffffffffffff);
 689     __ cmov(Assembler::positive, c_rarg3, rax);
 690 
 691     __ bind(L);
 692     __ movptr(inout, c_rarg3);
 693 
 694     __ pop(c_rarg1);
 695     __ pop(c_rarg2);
 696     __ pop(c_rarg3);
 697     __ pop(rax);
 698 
 699     __ ret(0);
 700 
 701     return start;
 702   }
 703 
 704   address generate_d2i_fixup() {
 705     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
 706     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 707 
 708     address start = __ pc();
 709 
 710     Label L;
 711 
 712     __ push(rax);
 713     __ push(c_rarg3);
 714     __ push(c_rarg2);
 715     __ push(c_rarg1);
 716     __ push(c_rarg0);
 717 
 718     __ movl(rax, 0x7ff00000);
 719     __ movq(c_rarg2, inout);
 720     __ movl(c_rarg3, c_rarg2);
 721     __ mov(c_rarg1, c_rarg2);
 722     __ mov(c_rarg0, c_rarg2);
 723     __ negl(c_rarg3);
 724     __ shrptr(c_rarg1, 0x20);
 725     __ orl(c_rarg3, c_rarg2);
 726     __ andl(c_rarg1, 0x7fffffff);
 727     __ xorl(c_rarg2, c_rarg2);
 728     __ shrl(c_rarg3, 0x1f);
 729     __ orl(c_rarg1, c_rarg3);
 730     __ cmpl(rax, c_rarg1);
 731     __ jcc(Assembler::negative, L); // NaN -> 0
 732     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
 733     __ movl(c_rarg2, 0x80000000);
 734     __ movl(rax, 0x7fffffff);
 735     __ cmov(Assembler::positive, c_rarg2, rax);
 736 
 737     __ bind(L);
 738     __ movptr(inout, c_rarg2);
 739 
 740     __ pop(c_rarg0);
 741     __ pop(c_rarg1);
 742     __ pop(c_rarg2);
 743     __ pop(c_rarg3);
 744     __ pop(rax);
 745 
 746     __ ret(0);
 747 
 748     return start;
 749   }
 750 
 751   address generate_d2l_fixup() {
 752     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
 753     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 754 
 755     address start = __ pc();
 756 
 757     Label L;
 758 
 759     __ push(rax);
 760     __ push(c_rarg3);
 761     __ push(c_rarg2);
 762     __ push(c_rarg1);
 763     __ push(c_rarg0);
 764 
 765     __ movl(rax, 0x7ff00000);
 766     __ movq(c_rarg2, inout);
 767     __ movl(c_rarg3, c_rarg2);
 768     __ mov(c_rarg1, c_rarg2);
 769     __ mov(c_rarg0, c_rarg2);
 770     __ negl(c_rarg3);
 771     __ shrptr(c_rarg1, 0x20);
 772     __ orl(c_rarg3, c_rarg2);
 773     __ andl(c_rarg1, 0x7fffffff);
 774     __ xorl(c_rarg2, c_rarg2);
 775     __ shrl(c_rarg3, 0x1f);
 776     __ orl(c_rarg1, c_rarg3);
 777     __ cmpl(rax, c_rarg1);
 778     __ jcc(Assembler::negative, L); // NaN -> 0
 779     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
 780     __ mov64(c_rarg2, 0x8000000000000000);
 781     __ mov64(rax, 0x7fffffffffffffff);
 782     __ cmovq(Assembler::positive, c_rarg2, rax);
 783 
 784     __ bind(L);
 785     __ movq(inout, c_rarg2);
 786 
 787     __ pop(c_rarg0);
 788     __ pop(c_rarg1);
 789     __ pop(c_rarg2);
 790     __ pop(c_rarg3);
 791     __ pop(rax);
 792 
 793     __ ret(0);
 794 
 795     return start;
 796   }
 797 
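       // 64 consecutive byte values 0x00..0x3F, used as an "iota" index vector by
       // vector intrinsics (e.g. for byte shuffles and lane index generation).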
 798   address generate_iota_indices(const char *stub_name) {
 799     __ align(CodeEntryAlignment);
 800     StubCodeMark mark(this, "StubRoutines", stub_name);
 801     address start = __ pc();
 802     __ emit_data64(0x0706050403020100, relocInfo::none);
 803     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 804     __ emit_data64(0x1716151413121110, relocInfo::none);
 805     __ emit_data64(0x1F1E1D1C1B1A1918, relocInfo::none);
 806     __ emit_data64(0x2726252423222120, relocInfo::none);
 807     __ emit_data64(0x2F2E2D2C2B2A2928, relocInfo::none);
 808     __ emit_data64(0x3736353433323130, relocInfo::none);
 809     __ emit_data64(0x3F3E3D3C3B3A3938, relocInfo::none);
 810     return start;
 811   }
 812 
 813   address generate_vector_byte_shuffle_mask(const char *stub_name) {
 814     __ align(CodeEntryAlignment);
 815     StubCodeMark mark(this, "StubRoutines", stub_name);
 816     address start = __ pc();
 817     __ emit_data64(0x7070707070707070, relocInfo::none);
 818     __ emit_data64(0x7070707070707070, relocInfo::none);
 819     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 820     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 821     return start;
 822   }
 823 
 824   address generate_fp_mask(const char *stub_name, int64_t mask) {
 825     __ align(CodeEntryAlignment);
 826     StubCodeMark mark(this, "StubRoutines", stub_name);
 827     address start = __ pc();
 828 
 829     __ emit_data64( mask, relocInfo::none );
 830     __ emit_data64( mask, relocInfo::none );
 831 
 832     return start;
 833   }
 834 
 835   address generate_vector_mask(const char *stub_name, int64_t mask) {
 836     __ align(CodeEntryAlignment);
 837     StubCodeMark mark(this, "StubRoutines", stub_name);
 838     address start = __ pc();
 839 
 840     __ emit_data64(mask, relocInfo::none);
 841     __ emit_data64(mask, relocInfo::none);
 842     __ emit_data64(mask, relocInfo::none);
 843     __ emit_data64(mask, relocInfo::none);
 844     __ emit_data64(mask, relocInfo::none);
 845     __ emit_data64(mask, relocInfo::none);
 846     __ emit_data64(mask, relocInfo::none);
 847     __ emit_data64(mask, relocInfo::none);
 848 
 849     return start;
 850   }
 851 
 852   address generate_vector_byte_perm_mask(const char *stub_name) {
 853     __ align(CodeEntryAlignment);
 854     StubCodeMark mark(this, "StubRoutines", stub_name);
 855     address start = __ pc();
 856 
 857     __ emit_data64(0x0000000000000001, relocInfo::none);
 858     __ emit_data64(0x0000000000000003, relocInfo::none);
 859     __ emit_data64(0x0000000000000005, relocInfo::none);
 860     __ emit_data64(0x0000000000000007, relocInfo::none);
 861     __ emit_data64(0x0000000000000000, relocInfo::none);
 862     __ emit_data64(0x0000000000000002, relocInfo::none);
 863     __ emit_data64(0x0000000000000004, relocInfo::none);
 864     __ emit_data64(0x0000000000000006, relocInfo::none);
 865 
 866     return start;
 867   }
 868 
 869   address generate_vector_fp_mask(const char *stub_name, int64_t mask) {
 870     __ align(CodeEntryAlignment);
 871     StubCodeMark mark(this, "StubRoutines", stub_name);
 872     address start = __ pc();
 873 
 874     __ emit_data64(mask, relocInfo::none);
 875     __ emit_data64(mask, relocInfo::none);
 876     __ emit_data64(mask, relocInfo::none);
 877     __ emit_data64(mask, relocInfo::none);
 878     __ emit_data64(mask, relocInfo::none);
 879     __ emit_data64(mask, relocInfo::none);
 880     __ emit_data64(mask, relocInfo::none);
 881     __ emit_data64(mask, relocInfo::none);
 882 
 883     return start;
 884   }
 885 
 886   address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
 887                                      int32_t val0, int32_t val1, int32_t val2, int32_t val3,
 888                                      int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
 889                                      int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
 890                                      int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
 891     __ align(CodeEntryAlignment);
 892     StubCodeMark mark(this, "StubRoutines", stub_name);
 893     address start = __ pc();
 894 
 895     assert(len != Assembler::AVX_NoVec, "vector len must be specified");
 896     __ emit_data(val0, relocInfo::none, 0);
 897     __ emit_data(val1, relocInfo::none, 0);
 898     __ emit_data(val2, relocInfo::none, 0);
 899     __ emit_data(val3, relocInfo::none, 0);
 900     if (len >= Assembler::AVX_256bit) {
 901       __ emit_data(val4, relocInfo::none, 0);
 902       __ emit_data(val5, relocInfo::none, 0);
 903       __ emit_data(val6, relocInfo::none, 0);
 904       __ emit_data(val7, relocInfo::none, 0);
 905       if (len >= Assembler::AVX_512bit) {
 906         __ emit_data(val8, relocInfo::none, 0);
 907         __ emit_data(val9, relocInfo::none, 0);
 908         __ emit_data(val10, relocInfo::none, 0);
 909         __ emit_data(val11, relocInfo::none, 0);
 910         __ emit_data(val12, relocInfo::none, 0);
 911         __ emit_data(val13, relocInfo::none, 0);
 912         __ emit_data(val14, relocInfo::none, 0);
 913         __ emit_data(val15, relocInfo::none, 0);
 914       }
 915     }
 916 
 917     return start;
 918   }
 919 
 920   // Non-destructive plausibility checks for oops
 921   //
 922   // Arguments:
 923   //    all args on stack!
 924   //
 925   // Stack after saving c_rarg3:
 926   //    [tos + 0]: saved c_rarg3
 927   //    [tos + 1]: saved c_rarg2
 928   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
 929   //    [tos + 3]: saved flags
 930   //    [tos + 4]: return address
 931   //  * [tos + 5]: error message (char*)
 932   //  * [tos + 6]: object to verify (oop)
 933   //  * [tos + 7]: saved rax - saved by caller and bashed
 934   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
 935   //  * = popped on exit
 936   address generate_verify_oop() {
 937     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 938     address start = __ pc();
 939 
 940     Label exit, error;
 941 
 942     __ pushf();
 943     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 944 
 945     __ push(r12);
 946 
 947     // save c_rarg2 and c_rarg3
 948     __ push(c_rarg2);
 949     __ push(c_rarg3);
 950 
 951     enum {
 952            // After previous pushes.
 953            oop_to_verify = 6 * wordSize,
 954            saved_rax     = 7 * wordSize,
 955            saved_r10     = 8 * wordSize,
 956 
 957            // Before the call to MacroAssembler::debug(), see below.
 958            return_addr   = 16 * wordSize,
 959            error_msg     = 17 * wordSize
 960     };
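         // oop_to_verify == 6 * wordSize: the four words pushed above (c_rarg3,
         // c_rarg2, r12, flags) plus the return address and the error message sit
         // below the oop on the stack.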
 961 
 962     // get object
 963     __ movptr(rax, Address(rsp, oop_to_verify));
 964 
 965     // make sure object is 'reasonable'
 966     __ testptr(rax, rax);
 967     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
 968 
 969 #if INCLUDE_ZGC
 970     if (UseZGC) {
 971       // Check if metadata bits indicate a bad oop
 972       __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
 973       __ jcc(Assembler::notZero, error);
 974     }
 975 #endif
 976 
 977     // Check if the oop is in the right area of memory
 978     __ movptr(c_rarg2, rax);
 979     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 980     __ andptr(c_rarg2, c_rarg3);
 981     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 982     __ cmpptr(c_rarg2, c_rarg3);
 983     __ jcc(Assembler::notZero, error);
 984 
 985     // make sure klass is 'reasonable', which is not zero.
 986     __ load_klass(rax, rax, rscratch1);  // get klass
 987     __ testptr(rax, rax);
 988     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
 989 
 990     // return if everything seems ok
 991     __ bind(exit);
 992     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
 993     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
 994     __ pop(c_rarg3);                             // restore c_rarg3
 995     __ pop(c_rarg2);                             // restore c_rarg2
 996     __ pop(r12);                                 // restore r12
 997     __ popf();                                   // restore flags
 998     __ ret(4 * wordSize);                        // pop caller saved stuff
 999 
1000     // handle errors
1001     __ bind(error);
1002     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1003     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1004     __ pop(c_rarg3);                             // get saved c_rarg3 back
1005     __ pop(c_rarg2);                             // get saved c_rarg2 back
1006     __ pop(r12);                                 // get saved r12 back
1007     __ popf();                                   // get saved flags off stack --
1008                                                  // will be ignored
1009 
1010     __ pusha();                                  // push registers
1011                                                  // (rip is already
1012                                                  // pushed)
1013     // debug(char* msg, int64_t pc, int64_t regs[])
1014     // We've popped the registers we'd saved (c_rarg3, c_rarg2, r12 and flags), and
1015     // pushed all the registers, so now the stack looks like:
1016     //     [tos +  0] 16 saved registers
1017     //     [tos + 16] return address
1018     //   * [tos + 17] error message (char*)
1019     //   * [tos + 18] object to verify (oop)
1020     //   * [tos + 19] saved rax - saved by caller and bashed
1021     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1022     //   * = popped on exit
1023 
1024     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1025     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1026     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1027     __ mov(r12, rsp);                               // remember rsp
1028     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1029     __ andptr(rsp, -16);                            // align stack as required by ABI
1030     BLOCK_COMMENT("call MacroAssembler::debug");
1031     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1032     __ hlt();
1033     return start;
1034   }
1035 
1036   //
1037   // Verify that a register contains a clean 32-bit positive value
1038   // (high 32 bits are 0) so it can be used in 64-bit shifts.
1039   //
1040   //  Input:
1041   //    Rint  -  32-bit value
1042   //    Rtmp  -  scratch
1043   //
1044   void assert_clean_int(Register Rint, Register Rtmp) {
1045 #ifdef ASSERT
1046     Label L;
1047     assert_different_registers(Rtmp, Rint);
1048     __ movslq(Rtmp, Rint);
1049     __ cmpq(Rtmp, Rint);
1050     __ jcc(Assembler::equal, L);
1051     __ stop("high 32-bits of int value are not 0");
1052     __ bind(L);
1053 #endif
1054   }
1055 
1056   //  Generate overlap test for array copy stubs
1057   //
1058   //  Input:
1059   //     c_rarg0 - from
1060   //     c_rarg1 - to
1061   //     c_rarg2 - element count
1062   //
1063   //  Output:
1064   //     rax   - &from[element count], i.e. one past the last source element
1065   //
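       //  Control falls through to the conjoint (backward) copy code only when
       //  from < to < from + count * element_size, i.e. when a forward copy would
       //  overwrite source elements that have not yet been copied.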
1066   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1067     assert(no_overlap_target != NULL, "must be generated");
1068     array_overlap_test(no_overlap_target, NULL, sf);
1069   }
1070   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1071     array_overlap_test(NULL, &L_no_overlap, sf);
1072   }
1073   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1074     const Register from     = c_rarg0;
1075     const Register to       = c_rarg1;
1076     const Register count    = c_rarg2;
1077     const Register end_from = rax;
1078 
1079     __ cmpptr(to, from);
1080     __ lea(end_from, Address(from, count, sf, 0));
1081     if (NOLp == NULL) {
1082       ExternalAddress no_overlap(no_overlap_target);
1083       __ jump_cc(Assembler::belowEqual, no_overlap);
1084       __ cmpptr(to, end_from);
1085       __ jump_cc(Assembler::aboveEqual, no_overlap);
1086     } else {
1087       __ jcc(Assembler::belowEqual, (*NOLp));
1088       __ cmpptr(to, end_from);
1089       __ jcc(Assembler::aboveEqual, (*NOLp));
1090     }
1091   }
1092 
1093   // Shuffle first three (or four) arg regs on Windows into Linux/Solaris locations.
1094   //
1095   // Outputs:
1096   //    rdi - rcx
1097   //    rsi - rdx
1098   //    rdx - r8
1099   //    rcx - r9
1100   //
1101   // Registers r9 and r10 are used to save rdi and rsi, which are non-volatile
1102   // on Windows.  r9 and r10 should not be used by the caller.
1103   //
1104   DEBUG_ONLY(bool regs_in_thread;)
1105 
1106   void setup_arg_regs(int nargs = 3) {
1107     const Register saved_rdi = r9;
1108     const Register saved_rsi = r10;
1109     assert(nargs == 3 || nargs == 4, "else fix");
1110 #ifdef _WIN64
1111     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1112            "unexpected argument registers");
1113     if (nargs >= 4)
1114       __ mov(rax, r9);  // r9 is also saved_rdi
1115     __ movptr(saved_rdi, rdi);
1116     __ movptr(saved_rsi, rsi);
1117     __ mov(rdi, rcx); // c_rarg0
1118     __ mov(rsi, rdx); // c_rarg1
1119     __ mov(rdx, r8);  // c_rarg2
1120     if (nargs >= 4)
1121       __ mov(rcx, rax); // c_rarg3 (via rax)
1122 #else
1123     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1124            "unexpected argument registers");
1125 #endif
1126     DEBUG_ONLY(regs_in_thread = false;)
1127   }
1128 
1129   void restore_arg_regs() {
1130     assert(!regs_in_thread, "wrong call to restore_arg_regs");
1131     const Register saved_rdi = r9;
1132     const Register saved_rsi = r10;
1133 #ifdef _WIN64
1134     __ movptr(rdi, saved_rdi);
1135     __ movptr(rsi, saved_rsi);
1136 #endif
1137   }
1138 
1139   // This is used in places where r10 is a scratch register, and can
1140   // be adapted if r9 is needed also.
1141   void setup_arg_regs_using_thread() {
1142     const Register saved_r15 = r9;
1143 #ifdef _WIN64
1144     __ mov(saved_r15, r15);  // r15 is callee saved and needs to be restored
1145     __ get_thread(r15_thread);
1146     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1147            "unexpected argument registers");
1148     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
1149     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);
1150 
1151     __ mov(rdi, rcx); // c_rarg0
1152     __ mov(rsi, rdx); // c_rarg1
1153     __ mov(rdx, r8);  // c_rarg2
1154 #else
1155     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1156            "unexpected argument registers");
1157 #endif
1158     DEBUG_ONLY(regs_in_thread = true;)
1159   }
1160 
1161   void restore_arg_regs_using_thread() {
1162     assert(regs_in_thread, "wrong call to restore_arg_regs");
1163     const Register saved_r15 = r9;
1164 #ifdef _WIN64
1165     __ get_thread(r15_thread);
1166     __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
1167     __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
1168     __ mov(r15, saved_r15);  // r15 is callee saved and needs to be restored
1169 #endif
1170   }
1171 
1172   // Copy big chunks forward
1173   //
1174   // Inputs:
1175   //   end_from     - source array end address
1176   //   end_to       - destination array end address
1177   //   qword_count  - 64-bit element count, negative
1178   //   to           - scratch
1179   //   L_copy_bytes - entry label
1180   //   L_copy_8_bytes  - exit label
1181   //
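       //   Note: qword_count is negative on entry and is advanced toward zero; the
       //   loads and stores below are addressed relative to the array end addresses,
       //   which is why all displacements are negative or zero.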
1182   void copy_bytes_forward(Register end_from, Register end_to,
1183                              Register qword_count, Register to,
1184                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1185     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1186     Label L_loop;
1187     __ align(OptoLoopAlignment);
1188     if (UseUnalignedLoadStores) {
1189       Label L_end;
1190       __ BIND(L_loop);
1191       if (UseAVX >= 2) {
1192         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1193         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1194         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1195         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1196       } else {
1197         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1198         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1199         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1200         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1201         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1202         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1203         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1204         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1205       }
1206 
1207       __ BIND(L_copy_bytes);
1208       __ addptr(qword_count, 8);
1209       __ jcc(Assembler::lessEqual, L_loop);
1210       __ subptr(qword_count, 4);  // sub(8) and add(4)
1211       __ jccb(Assembler::greater, L_end);
1212       // Copy trailing 32 bytes
1213       if (UseAVX >= 2) {
1214         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1215         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1216       } else {
1217         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1218         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1219         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1220         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1221       }
1222       __ addptr(qword_count, 4);
1223       __ BIND(L_end);
1224       if (UseAVX >= 2) {
1225         // clean upper bits of YMM registers
1226         __ vpxor(xmm0, xmm0);
1227         __ vpxor(xmm1, xmm1);
1228       }
1229     } else {
1230       // Copy 32-bytes per iteration
1231       __ BIND(L_loop);
1232       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1233       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1234       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1235       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1236       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1237       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1238       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1239       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1240 
1241       __ BIND(L_copy_bytes);
1242       __ addptr(qword_count, 4);
1243       __ jcc(Assembler::lessEqual, L_loop);
1244     }
1245     __ subptr(qword_count, 4);
1246     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1247   }
1248 
1249   // Copy big chunks backward
1250   //
1251   // Inputs:
1252   //   from         - source array address
1253   //   dest         - destination array address
1254   //   qword_count  - 64-bit element count
1255   //   to           - scratch
1256   //   L_copy_bytes - entry label
1257   //   L_copy_8_bytes  - exit label
1258   //
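       //   Note: here qword_count is positive and counts down toward zero; from and
       //   dest are the array base addresses, so the displacements below are
       //   non-negative.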
1259   void copy_bytes_backward(Register from, Register dest,
1260                               Register qword_count, Register to,
1261                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1262     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1263     Label L_loop;
1264     __ align(OptoLoopAlignment);
1265     if (UseUnalignedLoadStores) {
1266       Label L_end;
1267       __ BIND(L_loop);
1268       if (UseAVX >= 2) {
1269         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1270         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1271         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1272         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1273       } else {
1274         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1275         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1276         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1277         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1278         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1279         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1280         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1281         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1282       }
1283 
1284       __ BIND(L_copy_bytes);
1285       __ subptr(qword_count, 8);
1286       __ jcc(Assembler::greaterEqual, L_loop);
1287 
1288       __ addptr(qword_count, 4);  // add(8) and sub(4)
1289       __ jccb(Assembler::less, L_end);
1290       // Copy trailing 32 bytes
1291       if (UseAVX >= 2) {
1292         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1293         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1294       } else {
1295         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1296         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1297         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1298         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1299       }
1300       __ subptr(qword_count, 4);
1301       __ BIND(L_end);
1302       if (UseAVX >= 2) {
1303         // clean upper bits of YMM registers
1304         __ vpxor(xmm0, xmm0);
1305         __ vpxor(xmm1, xmm1);
1306       }
1307     } else {
1308       // Copy 32-bytes per iteration
1309       __ BIND(L_loop);
1310       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1311       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1312       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1313       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1314       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1315       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1316       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1317       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1318 
1319       __ BIND(L_copy_bytes);
1320       __ subptr(qword_count, 4);
1321       __ jcc(Assembler::greaterEqual, L_loop);
1322     }
1323     __ addptr(qword_count, 4);
1324     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1325   }
1326 
1327 #ifndef PRODUCT
1328     int& get_profile_ctr(int shift) {
1329       if (0 == shift)
1330         return SharedRuntime::_jbyte_array_copy_ctr;
1331       else if (1 == shift)
1332         return SharedRuntime::_jshort_array_copy_ctr;
1333       else if (2 == shift)
1334         return SharedRuntime::_jint_array_copy_ctr;
1335       else
1336         return SharedRuntime::_jlong_array_copy_ctr;
1337     }
1338 #endif
1339 
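       // Byte and short copies keep rdi/rsi in r9/r10; the wider element types use
       // the thread-local save slots instead, since r10 may be needed as a scratch
       // register in their copy loops (see setup_arg_regs_using_thread above).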
1340   void setup_argument_regs(BasicType type) {
1341     if (type == T_BYTE || type == T_SHORT) {
1342       setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1343                         // r9 and r10 may be used to save non-volatile registers
1344     } else {
1345       setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1346                                      // r9 is used to save r15_thread
1347     }
1348   }
1349 
1350   void restore_argument_regs(BasicType type) {
1351     if (type == T_BYTE || type == T_SHORT) {
1352       restore_arg_regs();
1353     } else {
1354       restore_arg_regs_using_thread();
1355     }
1356   }
1357 
1358 #if COMPILER2_OR_JVMCI
1359   // Note: The following rules apply to the AVX3 optimized arraycopy stubs:
1360   // - If the target supports AVX3 features (BW+VL+F), the implementation uses 32 byte vectors (YMMs)
1361   //   for both the special cases (various small block sizes) and the aligned copy loop. This is the
1362   //   default configuration.
1363   // - If the copy length is above AVX3Threshold, the implementation uses 64 byte vectors (ZMMs)
1364   //   for the main copy loop (and its subsequent tail), since the bulk of the cycles will be consumed there.
1365   // - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVs is seen to give
1366   //   better performance for disjoint copies. For conjoint/backward copies, vector based
1367   //   copy performs better.
1368   // - If the user sets AVX3Threshold=0, the special cases for small block sizes also operate on
1369   //   64 byte vector registers (ZMMs).
1370 
1371   // Inputs:
1372   //   c_rarg0   - source array address
1373   //   c_rarg1   - destination array address
1374   //   c_rarg2   - element count, treated as ssize_t, can be zero
1375   //
1376   //
1377   // Side Effects:
1378   //   disjoint_copy_avx3_masked is set to the no-overlap entry point
1379   //   used by generate_conjoint_[byte/int/short/long]_copy().
1380   //
1381 
1382   address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1383                                              bool aligned, bool is_oop, bool dest_uninitialized) {
1384     __ align(CodeEntryAlignment);
1385     StubCodeMark mark(this, "StubRoutines", name);
1386     address start = __ pc();
1387 
1388     bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
1389     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1390     Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1391     const Register from        = rdi;  // source array address
1392     const Register to          = rsi;  // destination array address
1393     const Register count       = rdx;  // elements count
1394     const Register temp1       = r8;
1395     const Register temp2       = r11;
1396     const Register temp3       = rax;
1397     const Register temp4       = rcx;
1398     // End pointers are inclusive, and if count is not zero they point
1399     // to the last unit copied:  end_to[0] := end_from[0]
1400 
1401     __ enter(); // required for proper stackwalking of RuntimeStub frame
1402     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1403 
1404     if (entry != NULL) {
1405       *entry = __ pc();
1406        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1407       BLOCK_COMMENT("Entry:");
1408     }
1409 
1410     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1411     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1412 
1413     setup_argument_regs(type);
1414 
1415     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1416     if (dest_uninitialized) {
1417       decorators |= IS_DEST_UNINITIALIZED;
1418     }
1419     if (aligned) {
1420       decorators |= ARRAYCOPY_ALIGNED;
1421     }
1422     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1423     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1424 
1425     {
1426       // Type(shift)           byte(0), short(1), int(2),   long(3)
1427       int loop_size[]        = { 192,     96,       48,      24};
1428       int threshold[]        = { 4096,    2048,     1024,    512};
1429 
1430       // UnsafeCopyMemory page error: continue after ucm
1431       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1432       // 'from', 'to' and 'count' are now valid
1433 
1434       // temp1 holds the remaining element count and temp4 holds the running count used to
1435       // compute the next address offset into the to/from addresses (temp4 * scale).
1436       __ mov64(temp4, 0);
1437       __ movq(temp1, count);
1438 
1439       // Zero length check.
1440       __ BIND(L_tail);
1441       __ cmpq(temp1, 0);
1442       __ jcc(Assembler::lessEqual, L_exit);
1443 
1444       // Special cases using 32 byte [masked] vector copy operations.
1445       __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1446                                       temp4, temp3, use64byteVector, L_entry, L_exit);
1447 
1448       // PRE-MAIN-POST loop for aligned copy.
1449       __ BIND(L_entry);
1450 
1451       if (AVX3Threshold != 0) {
1452         __ cmpq(count, threshold[shift]);
1453         if (MaxVectorSize == 64) {
1454           // Copy using 64 byte vectors.
1455           __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1456         } else {
1457           assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
1458         // REP MOVS offers a faster copy path.
1459           __ jcc(Assembler::greaterEqual, L_repmovs);
1460         }
1461       }
1462 
1463       if (MaxVectorSize < 64  || AVX3Threshold != 0) {
1464         // Partial copy to make dst address 32 byte aligned.
1465         __ movq(temp2, to);
1466         __ andq(temp2, 31);
1467         __ jcc(Assembler::equal, L_main_pre_loop);
1468 
1469         __ negptr(temp2);
1470         __ addq(temp2, 32);
1471         if (shift) {
1472           __ shrq(temp2, shift);
1473         }
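             // At this point temp2 holds the number of elements needed to bring 'to' up to the
             // next 32 byte boundary. E.g. if 'to' is at offset 24 within a 32 byte block and this
             // is an int copy (shift == 2), temp2 = (32 - 24) >> 2 = 2 elements.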
1474         __ movq(temp3, temp2);
1475         __ copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
1476         __ movq(temp4, temp2);
1477         __ movq(temp1, count);
1478         __ subq(temp1, temp2);
1479 
1480         __ cmpq(temp1, loop_size[shift]);
1481         __ jcc(Assembler::less, L_tail);
1482 
1483         __ BIND(L_main_pre_loop);
1484         __ subq(temp1, loop_size[shift]);
1485 
1486         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1487         __ align32();
1488         __ BIND(L_main_loop);
1489            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
1490            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
1491            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128);
1492            __ addptr(temp4, loop_size[shift]);
1493            __ subq(temp1, loop_size[shift]);
1494            __ jcc(Assembler::greater, L_main_loop);
1495 
1496         __ addq(temp1, loop_size[shift]);
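             // temp1 was biased down by loop_size before entering the loop, so the 'greater'
             // test above only continues while a full 192 byte block remains; adding loop_size
             // back restores the true remaining element count for the tail handling at L_tail.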
1497 
1498         // Tail loop.
1499         __ jmp(L_tail);
1500 
1501         __ BIND(L_repmovs);
1502           __ movq(temp2, temp1);
1503           // Swap to (RSI) and from (RDI) addresses to comply with REP MOVS semantics.
1504           __ movq(temp3, to);
1505           __ movq(to,  from);
1506           __ movq(from, temp3);
1507           // Save to/from for restoration post rep_mov.
1508           __ movq(temp1, to);
1509           __ movq(temp3, from);
1510           if (shift < 3) {
1511             __ shrq(temp2, 3-shift);     // quad word count
1512           }
1513           __ movq(temp4, temp2);         // move quad word count into temp4 (RCX).
1514           __ rep_mov();
1515           __ shlq(temp2, 3);             // convert quad words into byte count.
1516           if (shift) {
1517             __ shrq(temp2, shift);       // type specific count.
1518           }
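               // Note: temp2 initially held the element count; the shrq by (3 - shift) above
               // turned it into the quadword count moved by rep_mov(), and the shlq(3)/shrq(shift)
               // pair converts it back into elements so the sub-quadword tail can be handled
               // below. E.g. for shorts (shift == 1): count >> 2 quadwords, then back to
               // (quadwords << 3) >> 1 elements.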
1519           // Restore original addresses in to/from.
1520           __ movq(to, temp3);
1521           __ movq(from, temp1);
1522           __ movq(temp4, temp2);
1523           __ movq(temp1, count);
1524           __ subq(temp1, temp2);         // trailing part (less than a quadword's worth).
1525           __ jmp(L_tail);
1526       }
1527 
1528       if (MaxVectorSize > 32) {
1529         __ BIND(L_pre_main_post_64);
1530         // Partial copy to make dst address 64 byte aligned.
1531         __ movq(temp2, to);
1532         __ andq(temp2, 63);
1533         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1534 
1535         __ negptr(temp2);
1536         __ addq(temp2, 64);
1537         if (shift) {
1538           __ shrq(temp2, shift);
1539         }
1540         __ movq(temp3, temp2);
1541         __ copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
1542         __ movq(temp4, temp2);
1543         __ movq(temp1, count);
1544         __ subq(temp1, temp2);
1545 
1546         __ cmpq(temp1, loop_size[shift]);
1547         __ jcc(Assembler::less, L_tail64);
1548 
1549         __ BIND(L_main_pre_loop_64bytes);
1550         __ subq(temp1, loop_size[shift]);
1551 
1552         // Main loop with aligned copy block size of 192 bytes at
1553         // 64 byte copy granularity.
1554         __ align32();
1555         __ BIND(L_main_loop_64bytes);
1556            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
1557            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
1558            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
1559            __ addptr(temp4, loop_size[shift]);
1560            __ subq(temp1, loop_size[shift]);
1561            __ jcc(Assembler::greater, L_main_loop_64bytes);
1562 
1563         __ addq(temp1, loop_size[shift]);
1564         // Zero length check (uses the flags set by the addq above).
1565         __ jcc(Assembler::lessEqual, L_exit);
1566 
1567         __ BIND(L_tail64);
1568 
1569         // Tail handling using 64 byte [masked] vector copy operations.
1570         use64byteVector = true;
1571         __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1572                                         temp4, temp3, use64byteVector, L_entry, L_exit);
1573       }
1574       __ BIND(L_exit);
1575     }
1576 
1577     address ucme_exit_pc = __ pc();
1578     // When called from generic_arraycopy, r11 contains specific values used during
1579     // the arraycopy epilogue, so re-initialize r11 here.
1580     if (is_oop) {
1581       __ movq(r11, shift == 3 ? count : to);
1582     }
1583     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1584     restore_argument_regs(type);
1585     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1586     __ xorptr(rax, rax); // return 0
1587     __ vzeroupper();
1588     __ leave(); // required for proper stackwalking of RuntimeStub frame
1589     __ ret(0);
1590     return start;
1591   }
1592 
1593   // Inputs:
1594   //   c_rarg0   - source array address
1595   //   c_rarg1   - destination array address
1596   //   c_rarg2   - element count, treated as ssize_t, can be zero
1597   //
1598   //
1599   address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1600                                              address nooverlap_target, bool aligned, bool is_oop,
1601                                              bool dest_uninitialized) {
1602     __ align(CodeEntryAlignment);
1603     StubCodeMark mark(this, "StubRoutines", name);
1604     address start = __ pc();
1605 
1606     bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
1607 
1608     Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1609     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1610     const Register from        = rdi;  // source array address
1611     const Register to          = rsi;  // destination array address
1612     const Register count       = rdx;  // elements count
1613     const Register temp1       = r8;
1614     const Register temp2       = rcx;
1615     const Register temp3       = r11;
1616     const Register temp4       = rax;
1617     // End pointers are inclusive, and if count is not zero they point
1618     // to the last unit copied:  end_to[0] := end_from[0]
1619 
1620     __ enter(); // required for proper stackwalking of RuntimeStub frame
1621     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1622 
1623     if (entry != NULL) {
1624       *entry = __ pc();
1625        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1626       BLOCK_COMMENT("Entry:");
1627     }
1628 
1629     array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
1630 
1631     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1632     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1633 
1634     setup_argument_regs(type);
1635 
1636     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1637     if (dest_uninitialized) {
1638       decorators |= IS_DEST_UNINITIALIZED;
1639     }
1640     if (aligned) {
1641       decorators |= ARRAYCOPY_ALIGNED;
1642     }
1643     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1644     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1645     {
1646       // Type(shift)       byte(0), short(1), int(2),   long(3)
1647       int loop_size[]   = { 192,     96,       48,      24};
1648       int threshold[]   = { 4096,    2048,     1024,    512};
1649 
1650       // UnsafeCopyMemory page error: continue after ucm
1651       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1652       // 'from', 'to' and 'count' are now valid
1653 
1654       // temp1 holds remaining count.
1655       __ movq(temp1, count);
1656 
1657       // Zero length check.
1658       __ BIND(L_tail);
1659       __ cmpq(temp1, 0);
1660       __ jcc(Assembler::lessEqual, L_exit);
1661 
1662       __ mov64(temp2, 0);
1663       __ movq(temp3, temp1);
1664       // Special cases using 32 byte [masked] vector copy operations.
1665       __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1666                                                temp4, use64byteVector, L_entry, L_exit);
1667 
1668       // PRE-MAIN-POST loop for aligned copy.
1669       __ BIND(L_entry);
1670 
1671       if (MaxVectorSize > 32 && AVX3Threshold != 0) {
1672         __ cmpq(temp1, threshold[shift]);
1673         __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1674       }
1675 
1676       if (MaxVectorSize < 64  || AVX3Threshold != 0) {
1677         // Partial copy to make dst address 32 byte aligned.
1678         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1679         __ andq(temp2, 31);
1680         __ jcc(Assembler::equal, L_main_pre_loop);
1681 
1682         if (shift) {
1683           __ shrq(temp2, shift);
1684         }
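             // For this backward copy the partial step aligns the end address (to + count*scale)
             // to a 32 byte boundary: temp2 holds the trailing elements past that boundary,
             // which are copied first and subtracted from the remaining count.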
1685         __ subq(temp1, temp2);
1686         __ copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
1687 
1688         __ cmpq(temp1, loop_size[shift]);
1689         __ jcc(Assembler::less, L_tail);
1690 
1691         __ BIND(L_main_pre_loop);
1692 
1693         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1694         __ align32();
1695         __ BIND(L_main_loop);
1696            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
1697            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
1698            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192);
1699            __ subptr(temp1, loop_size[shift]);
1700            __ cmpq(temp1, loop_size[shift]);
1701            __ jcc(Assembler::greater, L_main_loop);
1702 
1703         // Tail loop.
1704         __ jmp(L_tail);
1705       }
1706 
1707       if (MaxVectorSize > 32) {
1708         __ BIND(L_pre_main_post_64);
1709         // Partial copy to make dst address 64 byte aligned.
1710         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1711         __ andq(temp2, 63);
1712         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1713 
1714         if (shift) {
1715           __ shrq(temp2, shift);
1716         }
1717         __ subq(temp1, temp2);
1718         __ copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
1719 
1720         __ cmpq(temp1, loop_size[shift]);
1721         __ jcc(Assembler::less, L_tail64);
1722 
1723         __ BIND(L_main_pre_loop_64bytes);
1724 
1725         // Main loop with aligned copy block size of 192 bytes at
1726         // 64 byte copy granularity.
1727         __ align32();
1728         __ BIND(L_main_loop_64bytes);
1729            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
1730            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
1731            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
1732            __ subq(temp1, loop_size[shift]);
1733            __ cmpq(temp1, loop_size[shift]);
1734            __ jcc(Assembler::greater, L_main_loop_64bytes);
1735 
1736         // Zero length check.
1737         __ cmpq(temp1, 0);
1738         __ jcc(Assembler::lessEqual, L_exit);
1739 
1740         __ BIND(L_tail64);
1741 
1742         // Tail handling using 64 byte [masked] vector copy operations.
1743         use64byteVector = true;
1744         __ mov64(temp2, 0);
1745         __ movq(temp3, temp1);
1746         __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1747                                                  temp4, use64byteVector, L_entry, L_exit);
1748       }
1749       __ BIND(L_exit);
1750     }
1751     address ucme_exit_pc = __ pc();
1752     // When called from generic_arraycopy, r11 contains specific values used during
1753     // the arraycopy epilogue, so re-initialize r11 here.
1754     if (is_oop) {
1755       __ movq(r11, count);
1756     }
1757     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1758     restore_argument_regs(type);
1759     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1760     __ xorptr(rax, rax); // return 0
1761     __ vzeroupper();
1762     __ leave(); // required for proper stackwalking of RuntimeStub frame
1763     __ ret(0);
1764     return start;
1765   }
1766 #endif // COMPILER2_OR_JVMCI
1767 
1768 
1769   // Arguments:
1770   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1771   //             ignored
1772   //   name    - stub name string
1773   //
1774   // Inputs:
1775   //   c_rarg0   - source array address
1776   //   c_rarg1   - destination array address
1777   //   c_rarg2   - element count, treated as ssize_t, can be zero
1778   //
1779   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1780   // we let the hardware handle it.  The one to eight bytes within words,
1781   // dwords or qwords that span cache line boundaries will still be loaded
1782   // and stored atomically.
1783   //
1784   // Side Effects:
1785   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1786   //   used by generate_conjoint_byte_copy().
1787   //
1788   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1789 #if COMPILER2_OR_JVMCI
1790     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1791        return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
1792                                                  aligned, false, false);
1793     }
1794 #endif
1795     __ align(CodeEntryAlignment);
1796     StubCodeMark mark(this, "StubRoutines", name);
1797     address start = __ pc();
1798 
1799     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1800     Label L_copy_byte, L_exit;
1801     const Register from        = rdi;  // source array address
1802     const Register to          = rsi;  // destination array address
1803     const Register count       = rdx;  // elements count
1804     const Register byte_count  = rcx;
1805     const Register qword_count = count;
1806     const Register end_from    = from; // source array end address
1807     const Register end_to      = to;   // destination array end address
1808     // End pointers are inclusive, and if count is not zero they point
1809     // to the last unit copied:  end_to[0] := end_from[0]
1810 
1811     __ enter(); // required for proper stackwalking of RuntimeStub frame
1812     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1813 
1814     if (entry != NULL) {
1815       *entry = __ pc();
1816        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1817       BLOCK_COMMENT("Entry:");
1818     }
1819 
1820     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1821                       // r9 and r10 may be used to save non-volatile registers
1822 
1823     {
1824       // UnsafeCopyMemory page error: continue after ucm
1825       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1826       // 'from', 'to' and 'count' are now valid
1827       __ movptr(byte_count, count);
1828       __ shrptr(count, 3); // count => qword_count
1829 
1830       // Copy from low to high addresses.  Use 'to' as scratch.
1831       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1832       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1833       __ negptr(qword_count); // make the count negative
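           // qword_count now runs from -count up to zero; together with the end pointers set
           // above, Address(end_from, qword_count, times_8, 8) walks the arrays forward and
           // the qword loop terminates when the incremented count reaches zero.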
1834       __ jmp(L_copy_bytes);
1835 
1836       // Copy trailing qwords
1837     __ BIND(L_copy_8_bytes);
1838       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1839       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1840       __ increment(qword_count);
1841       __ jcc(Assembler::notZero, L_copy_8_bytes);
1842 
1843       // Check for and copy trailing dword
1844     __ BIND(L_copy_4_bytes);
1845       __ testl(byte_count, 4);
1846       __ jccb(Assembler::zero, L_copy_2_bytes);
1847       __ movl(rax, Address(end_from, 8));
1848       __ movl(Address(end_to, 8), rax);
1849 
1850       __ addptr(end_from, 4);
1851       __ addptr(end_to, 4);
1852 
1853       // Check for and copy trailing word
1854     __ BIND(L_copy_2_bytes);
1855       __ testl(byte_count, 2);
1856       __ jccb(Assembler::zero, L_copy_byte);
1857       __ movw(rax, Address(end_from, 8));
1858       __ movw(Address(end_to, 8), rax);
1859 
1860       __ addptr(end_from, 2);
1861       __ addptr(end_to, 2);
1862 
1863       // Check for and copy trailing byte
1864     __ BIND(L_copy_byte);
1865       __ testl(byte_count, 1);
1866       __ jccb(Assembler::zero, L_exit);
1867       __ movb(rax, Address(end_from, 8));
1868       __ movb(Address(end_to, 8), rax);
1869     }
1870   __ BIND(L_exit);
1871     address ucme_exit_pc = __ pc();
1872     restore_arg_regs();
1873     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1874     __ xorptr(rax, rax); // return 0
1875     __ vzeroupper();
1876     __ leave(); // required for proper stackwalking of RuntimeStub frame
1877     __ ret(0);
1878 
1879     {
1880       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
1881       // Copy in multi-byte chunks
1882       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1883       __ jmp(L_copy_4_bytes);
1884     }
1885     return start;
1886   }
1887 
1888   // Arguments:
1889   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1890   //             ignored
1891   //   name    - stub name string
1892   //
1893   // Inputs:
1894   //   c_rarg0   - source array address
1895   //   c_rarg1   - destination array address
1896   //   c_rarg2   - element count, treated as ssize_t, can be zero
1897   //
1898   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1899   // we let the hardware handle it.  The one to eight bytes within words,
1900   // dwords or qwords that span cache line boundaries will still be loaded
1901   // and stored atomically.
1902   //
1903   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1904                                       address* entry, const char *name) {
1905 #if COMPILER2_OR_JVMCI
1906     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1907        return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
1908                                                  nooverlap_target, aligned, false, false);
1909     }
1910 #endif
1911     __ align(CodeEntryAlignment);
1912     StubCodeMark mark(this, "StubRoutines", name);
1913     address start = __ pc();
1914 
1915     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1916     const Register from        = rdi;  // source array address
1917     const Register to          = rsi;  // destination array address
1918     const Register count       = rdx;  // elements count
1919     const Register byte_count  = rcx;
1920     const Register qword_count = count;
1921 
1922     __ enter(); // required for proper stackwalking of RuntimeStub frame
1923     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1924 
1925     if (entry != NULL) {
1926       *entry = __ pc();
1927       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1928       BLOCK_COMMENT("Entry:");
1929     }
1930 
1931     array_overlap_test(nooverlap_target, Address::times_1);
1932     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1933                       // r9 and r10 may be used to save non-volatile registers
1934 
1935     {
1936       // UnsafeCopyMemory page error: continue after ucm
1937       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1938       // 'from', 'to' and 'count' are now valid
1939       __ movptr(byte_count, count);
1940       __ shrptr(count, 3);   // count => qword_count
1941 
1942       // Copy from high to low addresses.
1943 
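           // The trailing byte, word and dword (at most 7 bytes total) are peeled off the
           // high end first, leaving a qword aligned remainder for copy_bytes_backward.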
1944       // Check for and copy trailing byte
1945       __ testl(byte_count, 1);
1946       __ jcc(Assembler::zero, L_copy_2_bytes);
1947       __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1948       __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1949       __ decrement(byte_count); // Adjust for possible trailing word
1950 
1951       // Check for and copy trailing word
1952     __ BIND(L_copy_2_bytes);
1953       __ testl(byte_count, 2);
1954       __ jcc(Assembler::zero, L_copy_4_bytes);
1955       __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1956       __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1957 
1958       // Check for and copy trailing dword
1959     __ BIND(L_copy_4_bytes);
1960       __ testl(byte_count, 4);
1961       __ jcc(Assembler::zero, L_copy_bytes);
1962       __ movl(rax, Address(from, qword_count, Address::times_8));
1963       __ movl(Address(to, qword_count, Address::times_8), rax);
1964       __ jmp(L_copy_bytes);
1965 
1966       // Copy trailing qwords
1967     __ BIND(L_copy_8_bytes);
1968       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1969       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1970       __ decrement(qword_count);
1971       __ jcc(Assembler::notZero, L_copy_8_bytes);
1972     }
1973     restore_arg_regs();
1974     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1975     __ xorptr(rax, rax); // return 0
1976     __ vzeroupper();
1977     __ leave(); // required for proper stackwalking of RuntimeStub frame
1978     __ ret(0);
1979 
1980     {
1981       // UnsafeCopyMemory page error: continue after ucm
1982       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1983       // Copy in multi-byte chunks
1984       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1985     }
1986     restore_arg_regs();
1987     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1988     __ xorptr(rax, rax); // return 0
1989     __ vzeroupper();
1990     __ leave(); // required for proper stackwalking of RuntimeStub frame
1991     __ ret(0);
1992 
1993     return start;
1994   }
1995 
1996   // Arguments:
1997   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1998   //             ignored
1999   //   name    - stub name string
2000   //
2001   // Inputs:
2002   //   c_rarg0   - source array address
2003   //   c_rarg1   - destination array address
2004   //   c_rarg2   - element count, treated as ssize_t, can be zero
2005   //
2006   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2007   // let the hardware handle it.  The two or four words within dwords
2008   // or qwords that span cache line boundaries will still be loaded
2009   // and stored atomically.
2010   //
2011   // Side Effects:
2012   //   disjoint_short_copy_entry is set to the no-overlap entry point
2013   //   used by generate_conjoint_short_copy().
2014   //
2015   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
2016 #if COMPILER2_OR_JVMCI
2017     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2018        return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
2019                                                  aligned, false, false);
2020     }
2021 #endif
2022 
2023     __ align(CodeEntryAlignment);
2024     StubCodeMark mark(this, "StubRoutines", name);
2025     address start = __ pc();
2026 
2027     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
2028     const Register from        = rdi;  // source array address
2029     const Register to          = rsi;  // destination array address
2030     const Register count       = rdx;  // elements count
2031     const Register word_count  = rcx;
2032     const Register qword_count = count;
2033     const Register end_from    = from; // source array end address
2034     const Register end_to      = to;   // destination array end address
2035     // End pointers are inclusive, and if count is not zero they point
2036     // to the last unit copied:  end_to[0] := end_from[0]
2037 
2038     __ enter(); // required for proper stackwalking of RuntimeStub frame
2039     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2040 
2041     if (entry != NULL) {
2042       *entry = __ pc();
2043       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2044       BLOCK_COMMENT("Entry:");
2045     }
2046 
2047     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2048                       // r9 and r10 may be used to save non-volatile registers
2049 
2050     {
2051       // UnsafeCopyMemory page error: continue after ucm
2052       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2053       // 'from', 'to' and 'count' are now valid
2054       __ movptr(word_count, count);
2055       __ shrptr(count, 2); // count => qword_count
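           // Four 2-byte elements per qword: word_count keeps the original element count for
           // the dword/word tails, while count (qword_count) now holds whole 8-byte qwords.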
2056 
2057       // Copy from low to high addresses.  Use 'to' as scratch.
2058       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2059       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2060       __ negptr(qword_count);
2061       __ jmp(L_copy_bytes);
2062 
2063       // Copy trailing qwords
2064     __ BIND(L_copy_8_bytes);
2065       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2066       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2067       __ increment(qword_count);
2068       __ jcc(Assembler::notZero, L_copy_8_bytes);
2069 
2070       // Original 'dest' is trashed, so we can't use it as a
2071       // base register for a possible trailing word copy
2072 
2073       // Check for and copy trailing dword
2074     __ BIND(L_copy_4_bytes);
2075       __ testl(word_count, 2);
2076       __ jccb(Assembler::zero, L_copy_2_bytes);
2077       __ movl(rax, Address(end_from, 8));
2078       __ movl(Address(end_to, 8), rax);
2079 
2080       __ addptr(end_from, 4);
2081       __ addptr(end_to, 4);
2082 
2083       // Check for and copy trailing word
2084     __ BIND(L_copy_2_bytes);
2085       __ testl(word_count, 1);
2086       __ jccb(Assembler::zero, L_exit);
2087       __ movw(rax, Address(end_from, 8));
2088       __ movw(Address(end_to, 8), rax);
2089     }
2090   __ BIND(L_exit);
2091     address ucme_exit_pc = __ pc();
2092     restore_arg_regs();
2093     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2094     __ xorptr(rax, rax); // return 0
2095     __ vzeroupper();
2096     __ leave(); // required for proper stackwalking of RuntimeStub frame
2097     __ ret(0);
2098 
2099     {
2100       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
2101       // Copy in multi-byte chunks
2102       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2103       __ jmp(L_copy_4_bytes);
2104     }
2105 
2106     return start;
2107   }
2108 
2109   address generate_fill(BasicType t, bool aligned, const char *name) {
2110     __ align(CodeEntryAlignment);
2111     StubCodeMark mark(this, "StubRoutines", name);
2112     address start = __ pc();
2113 
2114     BLOCK_COMMENT("Entry:");
2115 
2116     const Register to       = c_rarg0;  // destination array address
2117     const Register value    = c_rarg1;  // value
2118     const Register count    = c_rarg2;  // elements count
2119 
2120     __ enter(); // required for proper stackwalking of RuntimeStub frame
2121 
2122     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
2123 
2124     __ vzeroupper();
2125     __ leave(); // required for proper stackwalking of RuntimeStub frame
2126     __ ret(0);
2127     return start;
2128   }
2129 
2130   // Arguments:
2131   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2132   //             ignored
2133   //   name    - stub name string
2134   //
2135   // Inputs:
2136   //   c_rarg0   - source array address
2137   //   c_rarg1   - destination array address
2138   //   c_rarg2   - element count, treated as ssize_t, can be zero
2139   //
2140   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2141   // let the hardware handle it.  The two or four words within dwords
2142   // or qwords that span cache line boundaries will still be loaded
2143   // and stored atomically.
2144   //
2145   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
2146                                        address *entry, const char *name) {
2147 #if COMPILER2_OR_JVMCI
2148     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2149        return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
2150                                                  nooverlap_target, aligned, false, false);
2151     }
2152 #endif
2153     __ align(CodeEntryAlignment);
2154     StubCodeMark mark(this, "StubRoutines", name);
2155     address start = __ pc();
2156 
2157     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2158     const Register from        = rdi;  // source array address
2159     const Register to          = rsi;  // destination array address
2160     const Register count       = rdx;  // elements count
2161     const Register word_count  = rcx;
2162     const Register qword_count = count;
2163 
2164     __ enter(); // required for proper stackwalking of RuntimeStub frame
2165     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2166 
2167     if (entry != NULL) {
2168       *entry = __ pc();
2169       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2170       BLOCK_COMMENT("Entry:");
2171     }
2172 
2173     array_overlap_test(nooverlap_target, Address::times_2);
2174     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2175                       // r9 and r10 may be used to save non-volatile registers
2176 
2177     {
2178       // UnsafeCopyMemory page error: continue after ucm
2179       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2180       // 'from', 'to' and 'count' are now valid
2181       __ movptr(word_count, count);
2182       __ shrptr(count, 2); // count => qword_count
2183 
2184       // Copy from high to low addresses.  Use 'to' as scratch.
2185 
2186       // Check for and copy trailing word
2187       __ testl(word_count, 1);
2188       __ jccb(Assembler::zero, L_copy_4_bytes);
2189       __ movw(rax, Address(from, word_count, Address::times_2, -2));
2190       __ movw(Address(to, word_count, Address::times_2, -2), rax);
2191 
2192       // Check for and copy trailing dword
2193     __ BIND(L_copy_4_bytes);
2194       __ testl(word_count, 2);
2195       __ jcc(Assembler::zero, L_copy_bytes);
2196       __ movl(rax, Address(from, qword_count, Address::times_8));
2197       __ movl(Address(to, qword_count, Address::times_8), rax);
2198       __ jmp(L_copy_bytes);
2199 
2200       // Copy trailing qwords
2201     __ BIND(L_copy_8_bytes);
2202       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2203       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2204       __ decrement(qword_count);
2205       __ jcc(Assembler::notZero, L_copy_8_bytes);
2206     }
2207     restore_arg_regs();
2208     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2209     __ xorptr(rax, rax); // return 0
2210     __ vzeroupper();
2211     __ leave(); // required for proper stackwalking of RuntimeStub frame
2212     __ ret(0);
2213 
2214     {
2215       // UnsafeCopyMemory page error: continue after ucm
2216       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2217       // Copy in multi-byte chunks
2218       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2219     }
2220     restore_arg_regs();
2221     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2222     __ xorptr(rax, rax); // return 0
2223     __ vzeroupper();
2224     __ leave(); // required for proper stackwalking of RuntimeStub frame
2225     __ ret(0);
2226 
2227     return start;
2228   }
2229 
2230   // Arguments:
2231   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2232   //             ignored
2233   //   is_oop  - true => oop array, so generate store check code
2234   //   name    - stub name string
2235   //
2236   // Inputs:
2237   //   c_rarg0   - source array address
2238   //   c_rarg1   - destination array address
2239   //   c_rarg2   - element count, treated as ssize_t, can be zero
2240   //
2241   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2242   // the hardware handle it.  The two dwords within qwords that span
2243   // cache line boundaries will still be loaded and stored atomically.
2244   //
2245   // Side Effects:
2246   //   disjoint_int_copy_entry is set to the no-overlap entry point
2247   //   used by generate_conjoint_int_oop_copy().
2248   //
2249   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2250                                          const char *name, bool dest_uninitialized = false) {
2251 #if COMPILER2_OR_JVMCI
2252     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2253        return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
2254                                                  aligned, is_oop, dest_uninitialized);
2255     }
2256 #endif
2257 
2258     __ align(CodeEntryAlignment);
2259     StubCodeMark mark(this, "StubRoutines", name);
2260     address start = __ pc();
2261 
2262     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2263     const Register from        = rdi;  // source array address
2264     const Register to          = rsi;  // destination array address
2265     const Register count       = rdx;  // elements count
2266     const Register dword_count = rcx;
2267     const Register qword_count = count;
2268     const Register end_from    = from; // source array end address
2269     const Register end_to      = to;   // destination array end address
2270     // End pointers are inclusive, and if count is not zero they point
2271     // to the last unit copied:  end_to[0] := end_from[0]
2272 
2273     __ enter(); // required for proper stackwalking of RuntimeStub frame
2274     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2275 
2276     if (entry != NULL) {
2277       *entry = __ pc();
2278       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2279       BLOCK_COMMENT("Entry:");
2280     }
2281 
2282     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2283                                    // r9 is used to save r15_thread
2284 
2285     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2286     if (dest_uninitialized) {
2287       decorators |= IS_DEST_UNINITIALIZED;
2288     }
2289     if (aligned) {
2290       decorators |= ARRAYCOPY_ALIGNED;
2291     }
2292 
2293     BasicType type = is_oop ? T_OBJECT : T_INT;
2294     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2295     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2296 
2297     {
2298       // UnsafeCopyMemory page error: continue after ucm
2299       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2300       // 'from', 'to' and 'count' are now valid
2301       __ movptr(dword_count, count);
2302       __ shrptr(count, 1); // count => qword_count
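           // Two 4-byte elements per qword: dword_count keeps the original element count for
           // the trailing dword check and for the arraycopy epilogue below.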
2303 
2304       // Copy from low to high addresses.  Use 'to' as scratch.
2305       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2306       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2307       __ negptr(qword_count);
2308       __ jmp(L_copy_bytes);
2309 
2310       // Copy trailing qwords
2311     __ BIND(L_copy_8_bytes);
2312       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2313       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2314       __ increment(qword_count);
2315       __ jcc(Assembler::notZero, L_copy_8_bytes);
2316 
2317       // Check for and copy trailing dword
2318     __ BIND(L_copy_4_bytes);
2319       __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2320       __ jccb(Assembler::zero, L_exit);
2321       __ movl(rax, Address(end_from, 8));
2322       __ movl(Address(end_to, 8), rax);
2323     }
2324   __ BIND(L_exit);
2325     address ucme_exit_pc = __ pc();
2326     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2327     restore_arg_regs_using_thread();
2328     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2329     __ vzeroupper();
2330     __ xorptr(rax, rax); // return 0
2331     __ leave(); // required for proper stackwalking of RuntimeStub frame
2332     __ ret(0);
2333 
2334     {
2335       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
2336       // Copy in multi-byte chunks
2337       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2338       __ jmp(L_copy_4_bytes);
2339     }
2340 
2341     return start;
2342   }
2343 
2344   // Arguments:
2345   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2346   //             ignored
2347   //   is_oop  - true => oop array, so generate store check code
2348   //   name    - stub name string
2349   //
2350   // Inputs:
2351   //   c_rarg0   - source array address
2352   //   c_rarg1   - destination array address
2353   //   c_rarg2   - element count, treated as ssize_t, can be zero
2354   //
2355   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2356   // the hardware handle it.  The two dwords within qwords that span
2357   // cache line boundaries will still be loaded and stored atomically.
2358   //
2359   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2360                                          address *entry, const char *name,
2361                                          bool dest_uninitialized = false) {
2362 #if COMPILER2_OR_JVMCI
2363     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2364        return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
2365                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
2366     }
2367 #endif
2368     __ align(CodeEntryAlignment);
2369     StubCodeMark mark(this, "StubRoutines", name);
2370     address start = __ pc();
2371 
2372     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2373     const Register from        = rdi;  // source array address
2374     const Register to          = rsi;  // destination array address
2375     const Register count       = rdx;  // elements count
2376     const Register dword_count = rcx;
2377     const Register qword_count = count;
2378 
2379     __ enter(); // required for proper stackwalking of RuntimeStub frame
2380     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2381 
2382     if (entry != NULL) {
2383       *entry = __ pc();
2384        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2385       BLOCK_COMMENT("Entry:");
2386     }
2387 
2388     array_overlap_test(nooverlap_target, Address::times_4);
2389     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2390                                    // r9 is used to save r15_thread
2391 
2392     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2393     if (dest_uninitialized) {
2394       decorators |= IS_DEST_UNINITIALIZED;
2395     }
2396     if (aligned) {
2397       decorators |= ARRAYCOPY_ALIGNED;
2398     }
2399 
2400     BasicType type = is_oop ? T_OBJECT : T_INT;
2401     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2402     // no registers are destroyed by this call
2403     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2404 
2405     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2406     {
2407       // UnsafeCopyMemory page error: continue after ucm
2408       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2409       // 'from', 'to' and 'count' are now valid
2410       __ movptr(dword_count, count);
2411       __ shrptr(count, 1); // count => qword_count
2412 
2413       // Copy from high to low addresses.  Use 'to' as scratch.
2414 
2415       // Check for and copy trailing dword
2416       __ testl(dword_count, 1);
2417       __ jcc(Assembler::zero, L_copy_bytes);
2418       __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2419       __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2420       __ jmp(L_copy_bytes);
2421 
2422       // Copy trailing qwords
2423     __ BIND(L_copy_8_bytes);
2424       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2425       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2426       __ decrement(qword_count);
2427       __ jcc(Assembler::notZero, L_copy_8_bytes);
2428     }
2429     if (is_oop) {
2430       __ jmp(L_exit);
2431     }
2432     restore_arg_regs_using_thread();
2433     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2434     __ xorptr(rax, rax); // return 0
2435     __ vzeroupper();
2436     __ leave(); // required for proper stackwalking of RuntimeStub frame
2437     __ ret(0);
2438 
2439     {
2440       // UnsafeCopyMemory page error: continue after ucm
2441       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2442       // Copy in multi-byte chunks
2443       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2444     }
2445 
2446   __ BIND(L_exit);
2447     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2448     restore_arg_regs_using_thread();
2449     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2450     __ xorptr(rax, rax); // return 0
2451     __ vzeroupper();
2452     __ leave(); // required for proper stackwalking of RuntimeStub frame
2453     __ ret(0);
2454 
2455     return start;
2456   }
2457 
2458   // Arguments:
2459   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2460   //             ignored
2461   //   is_oop  - true => oop array, so generate store check code
2462   //   name    - stub name string
2463   //
2464   // Inputs:
2465   //   c_rarg0   - source array address
2466   //   c_rarg1   - destination array address
2467   //   c_rarg2   - element count, treated as ssize_t, can be zero
2468   //
2469   // Side Effects:
2470   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2471   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2472   //
2473   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2474                                           const char *name, bool dest_uninitialized = false) {
2475 #if COMPILER2_OR_JVMCI
2476     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2477        return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
2478                                                  aligned, is_oop, dest_uninitialized);
2479     }
2480 #endif
2481     __ align(CodeEntryAlignment);
2482     StubCodeMark mark(this, "StubRoutines", name);
2483     address start = __ pc();
2484 
2485     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2486     const Register from        = rdi;  // source array address
2487     const Register to          = rsi;  // destination array address
2488     const Register qword_count = rdx;  // elements count
2489     const Register end_from    = from; // source array end address
2490     const Register end_to      = rcx;  // destination array end address
2491     const Register saved_count = r11;
2492     // End pointers are inclusive, and if count is not zero they point
2493     // to the last unit copied:  end_to[0] := end_from[0]
2494 
2495     __ enter(); // required for proper stackwalking of RuntimeStub frame
2496     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2497     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2498 
2499     if (entry != NULL) {
2500       *entry = __ pc();
2501       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2502       BLOCK_COMMENT("Entry:");
2503     }
2504 
2505     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2506                                      // r9 is used to save r15_thread
2507     // 'from', 'to' and 'qword_count' are now valid
2508 
2509     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2510     if (dest_uninitialized) {
2511       decorators |= IS_DEST_UNINITIALIZED;
2512     }
2513     if (aligned) {
2514       decorators |= ARRAYCOPY_ALIGNED;
2515     }
2516 
2517     BasicType type = is_oop ? T_OBJECT : T_LONG;
2518     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2519     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2520     {
2521       // UnsafeCopyMemory page error: continue after ucm
2522       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2523 
2524       // Copy from low to high addresses.  Use 'to' as scratch.
2525       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2526       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2527       __ negptr(qword_count);
2528       __ jmp(L_copy_bytes);
2529 
2530       // Copy trailing qwords
2531     __ BIND(L_copy_8_bytes);
2532       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2533       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2534       __ increment(qword_count);
2535       __ jcc(Assembler::notZero, L_copy_8_bytes);
2536     }
2537     if (is_oop) {
2538       __ jmp(L_exit);
2539     } else {
2540       restore_arg_regs_using_thread();
2541       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2542       __ xorptr(rax, rax); // return 0
2543       __ vzeroupper();
2544       __ leave(); // required for proper stackwalking of RuntimeStub frame
2545       __ ret(0);
2546     }
2547 
2548     {
2549       // UnsafeCopyMemory page error: continue after ucm
2550       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2551       // Copy in multi-byte chunks
2552       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2553     }
2554 
2555     __ BIND(L_exit);
2556     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2557     restore_arg_regs_using_thread();
2558     if (is_oop) {
2559       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2560     } else {
2561       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2562     }
2563     __ vzeroupper();
2564     __ xorptr(rax, rax); // return 0
2565     __ leave(); // required for proper stackwalking of RuntimeStub frame
2566     __ ret(0);
2567 
2568     return start;
2569   }
2570 
2571   // Arguments:
2572   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2573   //             ignored
2574   //   is_oop  - true => oop array, so generate store check code
2575   //   name    - stub name string
2576   //
2577   // Inputs:
2578   //   c_rarg0   - source array address
2579   //   c_rarg1   - destination array address
2580   //   c_rarg2   - element count, treated as ssize_t, can be zero
2581   //
2582   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2583                                           address nooverlap_target, address *entry,
2584                                           const char *name, bool dest_uninitialized = false) {
2585 #if COMPILER2_OR_JVMCI
2586     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2587        return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
2588                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
2589     }
2590 #endif
2591     __ align(CodeEntryAlignment);
2592     StubCodeMark mark(this, "StubRoutines", name);
2593     address start = __ pc();
2594 
2595     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2596     const Register from        = rdi;  // source array address
2597     const Register to          = rsi;  // destination array address
2598     const Register qword_count = rdx;  // elements count
2599     const Register saved_count = rcx;
2600 
2601     __ enter(); // required for proper stackwalking of RuntimeStub frame
2602     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2603 
2604     if (entry != NULL) {
2605       *entry = __ pc();
2606       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2607       BLOCK_COMMENT("Entry:");
2608     }
2609 
2610     array_overlap_test(nooverlap_target, Address::times_8);
2611     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2612                                    // r9 is used to save r15_thread
2613     // 'from', 'to' and 'qword_count' are now valid
2614 
2615     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2616     if (dest_uninitialized) {
2617       decorators |= IS_DEST_UNINITIALIZED;
2618     }
2619     if (aligned) {
2620       decorators |= ARRAYCOPY_ALIGNED;
2621     }
2622 
2623     BasicType type = is_oop ? T_OBJECT : T_LONG;
2624     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2625     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2626     {
2627       // UnsafeCopyMemory page error: continue after ucm
2628       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2629 
2630       __ jmp(L_copy_bytes);
2631 
2632       // Copy trailing qwords
2633     __ BIND(L_copy_8_bytes);
2634       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2635       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2636       __ decrement(qword_count);
2637       __ jcc(Assembler::notZero, L_copy_8_bytes);
2638     }
2639     if (is_oop) {
2640       __ jmp(L_exit);
2641     } else {
2642       restore_arg_regs_using_thread();
2643       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2644       __ xorptr(rax, rax); // return 0
2645       __ vzeroupper();
2646       __ leave(); // required for proper stackwalking of RuntimeStub frame
2647       __ ret(0);
2648     }
2649     {
2650       // UnsafeCopyMemory page error: continue after ucm
2651       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2652 
2653       // Copy in multi-byte chunks
2654       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2655     }
2656     __ BIND(L_exit);
2657     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2658     restore_arg_regs_using_thread();
2659     if (is_oop) {
2660       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2661     } else {
2662       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2663     }
2664     __ vzeroupper();
2665     __ xorptr(rax, rax); // return 0
2666     __ leave(); // required for proper stackwalking of RuntimeStub frame
2667     __ ret(0);
2668 
2669     return start;
2670   }
2671 
2672 
2673   // Helper for generating a dynamic type check.
2674   // Smashes no registers.
2675   void generate_type_check(Register sub_klass,
2676                            Register super_check_offset,
2677                            Register super_klass,
2678                            Label& L_success) {
2679     assert_different_registers(sub_klass, super_check_offset, super_klass);
2680 
2681     BLOCK_COMMENT("type_check:");
2682 
2683     Label L_miss;
2684 
2685     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2686                                      super_check_offset);
2687     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2688 
2689     // Fall through on failure!
2690     __ BIND(L_miss);
2691   }
2692 
2693   //
2694   //  Generate checkcasting array copy stub
2695   //
2696   //  Input:
2697   //    c_rarg0   - source array address
2698   //    c_rarg1   - destination array address
2699   //    c_rarg2   - element count, treated as ssize_t, can be zero
2700   //    c_rarg3   - size_t ckoff (super_check_offset)
2701   // not Win64
2702   //    c_rarg4   - oop ckval (super_klass)
2703   // Win64
2704   //    rsp+40    - oop ckval (super_klass)
2705   //
2706   //  Output:
2707   //    rax ==  0  -  success
2708   //    rax == -1^K - failure, where K is partial transfer count
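       //                  (equivalently rax == ~K, the bitwise complement of the number of
       //                  elements already copied)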
2709   //
2710   address generate_checkcast_copy(const char *name, address *entry,
2711                                   bool dest_uninitialized = false) {
2712 
2713     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2714 
2715     // Input registers (after setup_arg_regs)
2716     const Register from        = rdi;   // source array address
2717     const Register to          = rsi;   // destination array address
2718     const Register length      = rdx;   // elements count
2719     const Register ckoff       = rcx;   // super_check_offset
2720     const Register ckval       = r8;    // super_klass
2721 
2722     // Registers used as temps (r13, r14 are save-on-entry)
2723     const Register end_from    = from;  // source array end address
2724     const Register end_to      = r13;   // destination array end address
2725     const Register count       = rdx;   // -(count_remaining)
2726     const Register r14_length  = r14;   // saved copy of length
2727     // End pointers are inclusive, and if length is not zero they point
2728     // to the last unit copied:  end_to[0] := end_from[0]
2729 
2730     const Register rax_oop    = rax;    // actual oop copied
2731     const Register r11_klass  = r11;    // oop._klass
2732 
2733     //---------------------------------------------------------------
2734     // Assembler stub will be used for this call to arraycopy
2735     // if the two arrays are subtypes of Object[] but the
2736     // destination array type is not equal to or a supertype
2737     // of the source type.  Each element must be separately
2738     // checked.
2739 
2740     __ align(CodeEntryAlignment);
2741     StubCodeMark mark(this, "StubRoutines", name);
2742     address start = __ pc();
2743 
2744     __ enter(); // required for proper stackwalking of RuntimeStub frame
2745 
2746 #ifdef ASSERT
2747     // caller guarantees that the arrays really are different
2748     // otherwise, we would have to make conjoint checks
2749     { Label L;
2750       array_overlap_test(L, TIMES_OOP);
2751       __ stop("checkcast_copy within a single array");
2752       __ bind(L);
2753     }
2754 #endif //ASSERT
2755 
2756     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2757                        // ckoff => rcx, ckval => r8
2758                        // r9 and r10 may be used to save non-volatile registers
2759 #ifdef _WIN64
2760     // last argument (#4) is on stack on Win64
2761     __ movptr(ckval, Address(rsp, 6 * wordSize));
2762 #endif
2763 
2764     // Caller of this entry point must set up the argument registers.
2765     if (entry != NULL) {
2766       *entry = __ pc();
2767       BLOCK_COMMENT("Entry:");
2768     }
2769 
2770     // allocate spill slots for r13, r14
2771     enum {
2772       saved_r13_offset,
2773       saved_r14_offset,
2774       saved_r10_offset,
2775       saved_rbp_offset
2776     };
2777     __ subptr(rsp, saved_rbp_offset * wordSize);
2778     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2779     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2780     __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2781 
2782 #ifdef ASSERT
2783       Label L2;
2784       __ get_thread(r14);
2785       __ cmpptr(r15_thread, r14);
2786       __ jcc(Assembler::equal, L2);
2787       __ stop("StubRoutines::checkcast_copy: r15_thread is modified by call");
2788       __ bind(L2);
2789 #endif // ASSERT
2790 
2791     // check that int operands are properly extended to size_t
2792     assert_clean_int(length, rax);
2793     assert_clean_int(ckoff, rax);
2794 
2795 #ifdef ASSERT
2796     BLOCK_COMMENT("assert consistent ckoff/ckval");
2797     // The ckoff and ckval must be mutually consistent,
2798     // even though caller generates both.
2799     { Label L;
2800       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2801       __ cmpl(ckoff, Address(ckval, sco_offset));
2802       __ jcc(Assembler::equal, L);
2803       __ stop("super_check_offset inconsistent");
2804       __ bind(L);
2805     }
2806 #endif //ASSERT
2807 
2808     // Loop-invariant addresses.  They are exclusive end pointers.
2809     Address end_from_addr(from, length, TIMES_OOP, 0);
2810     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2811     // Loop-variant addresses.  They assume post-incremented count < 0.
2812     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2813     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2814 
2815     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2816     if (dest_uninitialized) {
2817       decorators |= IS_DEST_UNINITIALIZED;
2818     }
2819 
2820     BasicType type = T_OBJECT;
2821     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2822     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2823 
2824     // Copy from low to high addresses, indexed from the end of each array.
2825     __ lea(end_from, end_from_addr);
2826     __ lea(end_to,   end_to_addr);
2827     __ movptr(r14_length, length);        // save a copy of the length
2828     assert(length == count, "");          // else fix next line:
2829     __ negptr(count);                     // negate and test the length
2830     __ jcc(Assembler::notZero, L_load_element);
2831 
2832     // Empty array:  Nothing to do.
2833     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2834     __ jmp(L_done);
2835 
2836     // ======== begin loop ========
2837     // (Loop is rotated; its entry is L_load_element.)
2838     // Loop control:
2839     //   for (count = -count; count != 0; count++)
2840     // Base pointers src, dst are biased by 8*(count-1), to the last element.
2841     __ align(OptoLoopAlignment);
2842 
2843     __ BIND(L_store_element);
2844     __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW);  // store the oop
2845     __ increment(count);               // increment the count toward zero
2846     __ jcc(Assembler::zero, L_do_card_marks);
2847 
2848     // ======== loop entry is here ========
2849     __ BIND(L_load_element);
2850     __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2851     __ testptr(rax_oop, rax_oop);
2852     __ jcc(Assembler::zero, L_store_element);
2853 
2854     __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
2855     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2856     // ======== end loop ========
2857 
2858     // It was a real error; we must depend on the caller to finish the job.
2859     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2860     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2861     // and report their number to the caller.
2862     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2863     Label L_post_barrier;
2864     __ addptr(r14_length, count);     // K = (original - remaining) oops
2865     __ movptr(rax, r14_length);       // save the value
2866     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2867     __ jccb(Assembler::notZero, L_post_barrier);
2868     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2869 
2870     // Come here on success only.
2871     __ BIND(L_do_card_marks);
2872     __ xorptr(rax, rax);              // return 0 on success
2873 
2874     __ BIND(L_post_barrier);
2875     bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2876 
2877     // Common exit point (success or failure).
2878     __ BIND(L_done);
2879     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2880     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2881     __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2882     restore_arg_regs();
2883     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2884     __ leave(); // required for proper stackwalking of RuntimeStub frame
2885     __ ret(0);
2886 
2887     return start;
2888   }
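       // Rough sketch of the stub above (pseudocode, not generated code), where
       // 'ckval' is the destination element klass supplied by the caller:
       //
       //   for (K = 0; K < length; K++) {
       //     oop o = from[K];
       //     if (o != NULL && !is_subtype_of(o->klass(), ckval))  return ~K;  // -1^K
       //     to[K] = o;                           // plus the usual GC barriers
       //   }
       //   return 0;                              // all 'length' elements copied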
2889 
2890   //
2891   //  Generate 'unsafe' array copy stub
2892   //  Though just as safe as the other stubs, it takes an unscaled
2893   //  size_t argument instead of an element count.
2894   //
2895   //  Input:
2896   //    c_rarg0   - source array address
2897   //    c_rarg1   - destination array address
2898   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2899   //
2900   // Examines the alignment of the operands and dispatches
2901   // to a long, int, short, or byte copy loop.
2902   //
2903   address generate_unsafe_copy(const char *name,
2904                                address byte_copy_entry, address short_copy_entry,
2905                                address int_copy_entry, address long_copy_entry) {
2906 
2907     Label L_long_aligned, L_int_aligned, L_short_aligned;
2908 
2909     // Input registers (before setup_arg_regs)
2910     const Register from        = c_rarg0;  // source array address
2911     const Register to          = c_rarg1;  // destination array address
2912     const Register size        = c_rarg2;  // byte count (size_t)
2913 
2914     // Register used as a temp
2915     const Register bits        = rax;      // test copy of low bits
2916 
2917     __ align(CodeEntryAlignment);
2918     StubCodeMark mark(this, "StubRoutines", name);
2919     address start = __ pc();
2920 
2921     __ enter(); // required for proper stackwalking of RuntimeStub frame
2922 
2923     // bump this on entry, not on exit:
2924     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2925 
2926     __ mov(bits, from);
2927     __ orptr(bits, to);
2928     __ orptr(bits, size);
2929 
2930     __ testb(bits, BytesPerLong-1);
2931     __ jccb(Assembler::zero, L_long_aligned);
2932 
2933     __ testb(bits, BytesPerInt-1);
2934     __ jccb(Assembler::zero, L_int_aligned);
2935 
2936     __ testb(bits, BytesPerShort-1);
2937     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2938 
2939     __ BIND(L_short_aligned);
2940     __ shrptr(size, LogBytesPerShort); // size => short_count
2941     __ jump(RuntimeAddress(short_copy_entry));
2942 
2943     __ BIND(L_int_aligned);
2944     __ shrptr(size, LogBytesPerInt); // size => int_count
2945     __ jump(RuntimeAddress(int_copy_entry));
2946 
2947     __ BIND(L_long_aligned);
2948     __ shrptr(size, LogBytesPerLong); // size => qword_count
2949     __ jump(RuntimeAddress(long_copy_entry));
2950 
2951     return start;
2952   }
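       // Rough equivalent of the dispatch above (pseudocode):
       //
       //   bits = from | to | size;
       //   if      ((bits & (BytesPerLong  - 1)) == 0)  long_copy (size >> LogBytesPerLong);
       //   else if ((bits & (BytesPerInt   - 1)) == 0)  int_copy  (size >> LogBytesPerInt);
       //   else if ((bits & (BytesPerShort - 1)) == 0)  short_copy(size >> LogBytesPerShort);
       //   else                                         byte_copy (size);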
2953 
2954   // Perform range checks on the proposed arraycopy.
2955   // Kills temp, but nothing else.
2956   // Also, clean the sign bits of src_pos and dst_pos.
2957   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2958                               Register src_pos, // source position (c_rarg1)
2959                               Register dst,     // destination array oop (c_rarg2)
2960                               Register dst_pos, // destination position (c_rarg3)
2961                               Register length,
2962                               Register temp,
2963                               Label& L_failed) {
2964     BLOCK_COMMENT("arraycopy_range_checks:");
2965 
2966     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2967     __ movl(temp, length);
2968     __ addl(temp, src_pos);             // src_pos + length
2969     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2970     __ jcc(Assembler::above, L_failed);
2971 
2972     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2973     __ movl(temp, length);
2974     __ addl(temp, dst_pos);             // dst_pos + length
2975     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2976     __ jcc(Assembler::above, L_failed);
2977 
2978     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2979     // Move with sign extension can be used since they are positive.
2980     __ movslq(src_pos, src_pos);
2981     __ movslq(dst_pos, dst_pos);
2982 
2983     BLOCK_COMMENT("arraycopy_range_checks done");
2984   }
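       // Pseudocode for the checks above.  The caller (generic_arraycopy, below)
       // has already verified that src_pos, dst_pos and length are non-negative,
       // so the unsigned 'above' compares also reject sums that overflow into
       // the 32-bit sign bit:
       //
       //   if ((juint)(src_pos + length) > (juint)src->length())  goto L_failed;
       //   if ((juint)(dst_pos + length) > (juint)dst->length())  goto L_failed;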
2985 
2986   //
2987   //  Generate generic array copy stubs
2988   //
2989   //  Input:
2990   //    c_rarg0    -  src oop
2991   //    c_rarg1    -  src_pos (32-bits)
2992   //    c_rarg2    -  dst oop
2993   //    c_rarg3    -  dst_pos (32-bits)
2994   // not Win64
2995   //    c_rarg4    -  element count (32-bits)
2996   // Win64
2997   //    rsp+40     -  element count (32-bits)
2998   //
2999   //  Output:
3000   //    rax ==  0  -  success
3001   //    rax == -1^K - failure, where K is partial transfer count
3002   //
3003   address generate_generic_copy(const char *name,
3004                                 address byte_copy_entry, address short_copy_entry,
3005                                 address int_copy_entry, address oop_copy_entry,
3006                                 address long_copy_entry, address checkcast_copy_entry) {
3007 
3008     Label L_failed, L_failed_0, L_objArray;
3009     Label L_copy_shorts, L_copy_ints, L_copy_longs;
3010 
3011     // Input registers
3012     const Register src        = c_rarg0;  // source array oop
3013     const Register src_pos    = c_rarg1;  // source position
3014     const Register dst        = c_rarg2;  // destination array oop
3015     const Register dst_pos    = c_rarg3;  // destination position
3016 #ifndef _WIN64
3017     const Register length     = c_rarg4;
3018     const Register rklass_tmp = r9;  // load_klass
3019 #else
3020     const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
3021     const Register rklass_tmp = rdi;  // load_klass
3022 #endif
3023 
3024     { int modulus = CodeEntryAlignment;
3025       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
3026       int advance = target - (__ offset() % modulus);
3027       if (advance < 0)  advance += modulus;
3028       if (advance > 0)  __ nop(advance);
3029     }
3030     StubCodeMark mark(this, "StubRoutines", name);
3031 
3032     // Short-hop target to L_failed.  Makes for denser prologue code.
3033     __ BIND(L_failed_0);
3034     __ jmp(L_failed);
3035     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
3036 
3037     __ align(CodeEntryAlignment);
3038     address start = __ pc();
3039 
3040     __ enter(); // required for proper stackwalking of RuntimeStub frame
3041 
3042 #ifdef _WIN64
3043     __ push(rklass_tmp); // rdi is callee-save on Windows
3044 #endif
3045 
3046     // bump this on entry, not on exit:
3047     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
3048 
3049     //-----------------------------------------------------------------------
3050     // Assembler stub will be used for this call to arraycopy
3051     // if the following conditions are met:
3052     //
3053     // (1) src and dst must not be null.
3054     // (2) src_pos must not be negative.
3055     // (3) dst_pos must not be negative.
3056     // (4) length  must not be negative.
3057     // (5) src klass and dst klass should be the same and not NULL.
3058     // (6) src and dst should be arrays.
3059     // (7) src_pos + length must not exceed length of src.
3060     // (8) dst_pos + length must not exceed length of dst.
3061     //
3062 
3063     //  if (src == NULL) return -1;
3064     __ testptr(src, src);         // src oop
3065     size_t j1off = __ offset();
3066     __ jccb(Assembler::zero, L_failed_0);
3067 
3068     //  if (src_pos < 0) return -1;
3069     __ testl(src_pos, src_pos); // src_pos (32-bits)
3070     __ jccb(Assembler::negative, L_failed_0);
3071 
3072     //  if (dst == NULL) return -1;
3073     __ testptr(dst, dst);         // dst oop
3074     __ jccb(Assembler::zero, L_failed_0);
3075 
3076     //  if (dst_pos < 0) return -1;
3077     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3078     size_t j4off = __ offset();
3079     __ jccb(Assembler::negative, L_failed_0);
3080 
3081     // The first four tests are very dense code,
3082     // but not quite dense enough to put four
3083     // jumps in a 16-byte instruction fetch buffer.
3084     // That's good, because some branch predictors
3085     // do not like jumps so close together.
3086     // Make sure of this.
3087     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
3088 
3089     // registers used as temp
3090     const Register r11_length    = r11; // elements count to copy
3091     const Register r10_src_klass = r10; // array klass
3092 
3093     //  if (length < 0) return -1;
3094     __ movl(r11_length, length);        // length (elements count, 32-bits value)
3095     __ testl(r11_length, r11_length);
3096     __ jccb(Assembler::negative, L_failed_0);
3097 
3098     __ load_klass(r10_src_klass, src, rklass_tmp);
3099 #ifdef ASSERT
3100     //  assert(src->klass() != NULL);
3101     {
3102       BLOCK_COMMENT("assert klasses not null {");
3103       Label L1, L2;
3104       __ testptr(r10_src_klass, r10_src_klass);
3105       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
3106       __ bind(L1);
3107       __ stop("broken null klass");
3108       __ bind(L2);
3109       __ load_klass(rax, dst, rklass_tmp);
3110       __ cmpq(rax, 0);
3111       __ jcc(Assembler::equal, L1);     // this would be broken also
3112       BLOCK_COMMENT("} assert klasses not null done");
3113     }
3114 #endif
3115 
3116     // Load layout helper (32-bits)
3117     //
3118     //  |array_tag|     | header_size | element_type |     |log2_element_size|
3119     // 32        30    24            16              8     2                 0
3120     //
3121     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3122     //
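         // Sketch of the decoding done further below for a typeArray layout helper 'lh':
         //   array_offset      = (lh >> _lh_header_size_shift) & _lh_header_size_mask;  // offset of element 0
         //   log2_element_size =  lh & _lh_log2_element_size_mask;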
3123 
3124     const int lh_offset = in_bytes(Klass::layout_helper_offset());
3125 
3126     // Handle objArrays completely differently...
3127     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3128     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3129     __ jcc(Assembler::equal, L_objArray);
3130 
3131     //  if (src->klass() != dst->klass()) return -1;
3132     __ load_klass(rax, dst, rklass_tmp);
3133     __ cmpq(r10_src_klass, rax);
3134     __ jcc(Assembler::notEqual, L_failed);
3135 
3136     const Register rax_lh = rax;  // layout helper
3137     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3138 
3139     //  if (!src->is_Array()) return -1;
3140     __ cmpl(rax_lh, Klass::_lh_neutral_value);
3141     __ jcc(Assembler::greaterEqual, L_failed);
3142 
3143     // At this point, it is known to be a typeArray (array_tag 0x3).
3144 #ifdef ASSERT
3145     {
3146       BLOCK_COMMENT("assert primitive array {");
3147       Label L;
3148       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
3149       __ jcc(Assembler::greaterEqual, L);
3150       __ stop("must be a primitive array");
3151       __ bind(L);
3152       BLOCK_COMMENT("} assert primitive array done");
3153     }
3154 #endif
3155 
3156     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3157                            r10, L_failed);
3158 
3159     // TypeArrayKlass
3160     //
3161     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3162     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3163     //
3164 
3165     const Register r10_offset = r10;    // array offset
3166     const Register rax_elsize = rax_lh; // element size
3167 
3168     __ movl(r10_offset, rax_lh);
3169     __ shrl(r10_offset, Klass::_lh_header_size_shift);
3170     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
3171     __ addptr(src, r10_offset);           // src array offset
3172     __ addptr(dst, r10_offset);           // dst array offset
3173     BLOCK_COMMENT("choose copy loop based on element size");
3174     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3175 
3176 #ifdef _WIN64
3177     __ pop(rklass_tmp); // Restore callee-save rdi
3178 #endif
3179 
3180     // next registers should be set before the jump to corresponding stub
3181     const Register from     = c_rarg0;  // source array address
3182     const Register to       = c_rarg1;  // destination array address
3183     const Register count    = c_rarg2;  // elements count
3184 
3185     // The 'from', 'to' and 'count' registers must be set in this order
3186     // since they alias 'src', 'src_pos' and 'dst', respectively.
3187 
3188     __ cmpl(rax_elsize, 0);
3189     __ jccb(Assembler::notEqual, L_copy_shorts);
3190     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3191     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3192     __ movl2ptr(count, r11_length); // length
3193     __ jump(RuntimeAddress(byte_copy_entry));
3194 
3195   __ BIND(L_copy_shorts);
3196     __ cmpl(rax_elsize, LogBytesPerShort);
3197     __ jccb(Assembler::notEqual, L_copy_ints);
3198     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3199     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3200     __ movl2ptr(count, r11_length); // length
3201     __ jump(RuntimeAddress(short_copy_entry));
3202 
3203   __ BIND(L_copy_ints);
3204     __ cmpl(rax_elsize, LogBytesPerInt);
3205     __ jccb(Assembler::notEqual, L_copy_longs);
3206     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3207     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3208     __ movl2ptr(count, r11_length); // length
3209     __ jump(RuntimeAddress(int_copy_entry));
3210 
3211   __ BIND(L_copy_longs);
3212 #ifdef ASSERT
3213     {
3214       BLOCK_COMMENT("assert long copy {");
3215       Label L;
3216       __ cmpl(rax_elsize, LogBytesPerLong);
3217       __ jcc(Assembler::equal, L);
3218       __ stop("must be long copy, but elsize is wrong");
3219       __ bind(L);
3220       BLOCK_COMMENT("} assert long copy done");
3221     }
3222 #endif
3223     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3224     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3225     __ movl2ptr(count, r11_length); // length
3226     __ jump(RuntimeAddress(long_copy_entry));
3227 
3228     // ObjArrayKlass
3229   __ BIND(L_objArray);
3230     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3231 
3232     Label L_plain_copy, L_checkcast_copy;
3233     //  test array classes for subtyping
3234     __ load_klass(rax, dst, rklass_tmp);
3235     __ cmpq(r10_src_klass, rax); // usual case is exact equality
3236     __ jcc(Assembler::notEqual, L_checkcast_copy);
3237 
3238     // Identically typed arrays can be copied without element-wise checks.
3239     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3240                            r10, L_failed);
3241 
3242     __ lea(from, Address(src, src_pos, TIMES_OOP,
3243                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3244     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3245                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3246     __ movl2ptr(count, r11_length); // length
3247   __ BIND(L_plain_copy);
3248 #ifdef _WIN64
3249     __ pop(rklass_tmp); // Restore callee-save rdi
3250 #endif
3251     __ jump(RuntimeAddress(oop_copy_entry));
3252 
3253   __ BIND(L_checkcast_copy);
3254     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3255     {
3256       // Before looking at dst.length, make sure dst is also an objArray.
3257       __ cmpl(Address(rax, lh_offset), objArray_lh);
3258       __ jcc(Assembler::notEqual, L_failed);
3259 
3260       // It is safe to examine both src.length and dst.length.
3261       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3262                              rax, L_failed);
3263 
3264       const Register r11_dst_klass = r11;
3265       __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3266 
3267       // Marshal the base address arguments now, freeing registers.
3268       __ lea(from, Address(src, src_pos, TIMES_OOP,
3269                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3270       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3271                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3272       __ movl(count, length);           // length (reloaded)
3273       Register sco_temp = c_rarg3;      // this register is free now
3274       assert_different_registers(from, to, count, sco_temp,
3275                                  r11_dst_klass, r10_src_klass);
3276       assert_clean_int(count, sco_temp);
3277 
3278       // Generate the type check.
3279       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3280       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3281       assert_clean_int(sco_temp, rax);
3282       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3283 
3284       // Fetch destination element klass from the ObjArrayKlass header.
3285       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3286       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3287       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3288       assert_clean_int(sco_temp, rax);
3289 
3290 #ifdef _WIN64
3291       __ pop(rklass_tmp); // Restore callee-save rdi
3292 #endif
3293 
3294       // the checkcast_copy loop needs two extra arguments:
3295       assert(c_rarg3 == sco_temp, "#3 already in place");
3296       // Set up arguments for checkcast_copy_entry.
3297       setup_arg_regs(4);
3298       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3299       __ jump(RuntimeAddress(checkcast_copy_entry));
3300     }
3301 
3302   __ BIND(L_failed);
3303 #ifdef _WIN64
3304     __ pop(rklass_tmp); // Restore callee-save rdi
3305 #endif
3306     __ xorptr(rax, rax);
3307     __ notptr(rax); // return -1
3308     __ leave();   // required for proper stackwalking of RuntimeStub frame
3309     __ ret(0);
3310 
3311     return start;
3312   }
3313 
3314   address generate_data_cache_writeback() {
3315     const Register src        = c_rarg0;  // source address
3316 
3317     __ align(CodeEntryAlignment);
3318 
3319     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
3320 
3321     address start = __ pc();
3322     __ enter();
3323     __ cache_wb(Address(src, 0));
3324     __ leave();
3325     __ ret(0);
3326 
3327     return start;
3328   }
3329 
3330   address generate_data_cache_writeback_sync() {
3331     const Register is_pre    = c_rarg0;  // pre or post sync
3332 
3333     __ align(CodeEntryAlignment);
3334 
3335     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
3336 
3337     // pre wbsync is a no-op
3338     // post wbsync translates to an sfence
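         // Rough equivalent, assuming cache_wbsync(false) emits the post-sync fence:
         //   void wbsync(bool is_pre) { if (!is_pre) sfence(); }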
3339 
3340     Label skip;
3341     address start = __ pc();
3342     __ enter();
3343     __ cmpl(is_pre, 0);
3344     __ jcc(Assembler::notEqual, skip);
3345     __ cache_wbsync(false);
3346     __ bind(skip);
3347     __ leave();
3348     __ ret(0);
3349 
3350     return start;
3351   }
3352 
3353   void generate_arraycopy_stubs() {
3354     address entry;
3355     address entry_jbyte_arraycopy;
3356     address entry_jshort_arraycopy;
3357     address entry_jint_arraycopy;
3358     address entry_oop_arraycopy;
3359     address entry_jlong_arraycopy;
3360     address entry_checkcast_arraycopy;
3361 
3362     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
3363                                                                            "jbyte_disjoint_arraycopy");
3364     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3365                                                                            "jbyte_arraycopy");
3366 
3367     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3368                                                                             "jshort_disjoint_arraycopy");
3369     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3370                                                                             "jshort_arraycopy");
3371 
3372     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
3373                                                                               "jint_disjoint_arraycopy");
3374     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
3375                                                                               &entry_jint_arraycopy, "jint_arraycopy");
3376 
3377     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
3378                                                                                "jlong_disjoint_arraycopy");
3379     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
3380                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
3381 
3382 
3383     if (UseCompressedOops) {
3384       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
3385                                                                               "oop_disjoint_arraycopy");
3386       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
3387                                                                               &entry_oop_arraycopy, "oop_arraycopy");
3388       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
3389                                                                                      "oop_disjoint_arraycopy_uninit",
3390                                                                                      /*dest_uninitialized*/true);
3391       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
3392                                                                                      NULL, "oop_arraycopy_uninit",
3393                                                                                      /*dest_uninitialized*/true);
3394     } else {
3395       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
3396                                                                                "oop_disjoint_arraycopy");
3397       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
3398                                                                                &entry_oop_arraycopy, "oop_arraycopy");
3399       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
3400                                                                                       "oop_disjoint_arraycopy_uninit",
3401                                                                                       /*dest_uninitialized*/true);
3402       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
3403                                                                                       NULL, "oop_arraycopy_uninit",
3404                                                                                       /*dest_uninitialized*/true);
3405     }
3406 
3407     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3408     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3409                                                                         /*dest_uninitialized*/true);
3410 
3411     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3412                                                               entry_jbyte_arraycopy,
3413                                                               entry_jshort_arraycopy,
3414                                                               entry_jint_arraycopy,
3415                                                               entry_jlong_arraycopy);
3416     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3417                                                                entry_jbyte_arraycopy,
3418                                                                entry_jshort_arraycopy,
3419                                                                entry_jint_arraycopy,
3420                                                                entry_oop_arraycopy,
3421                                                                entry_jlong_arraycopy,
3422                                                                entry_checkcast_arraycopy);
3423 
3424     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3425     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3426     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3427     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3428     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3429     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3430 
3431     // We don't generate specialized code for HeapWord-aligned source
3432     // arrays, so just use the code we've already generated
3433     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3434     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3435 
3436     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3437     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3438 
3439     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3440     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3441 
3442     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3443     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3444 
3445     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3446     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3447 
3448     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3449     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3450   }
3451 
3452   // AES intrinsic stubs
3453   enum {AESBlockSize = 16};
3454 
3455   address generate_key_shuffle_mask() {
3456     __ align(16);
3457     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3458     address start = __ pc();
3459     __ emit_data64( 0x0405060700010203, relocInfo::none );
3460     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3461     return start;
3462   }
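       // Note: a pshufb with the mask above (see load_key() below) swaps the byte
       // order within each 32-bit word of the expanded key.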
3463 
3464   address generate_counter_shuffle_mask() {
3465     __ align(16);
3466     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3467     address start = __ pc();
3468     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3469     __ emit_data64(0x0001020304050607, relocInfo::none);
3470     return start;
3471   }
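       // Note: a pshufb with the mask above reverses all 16 bytes of a 128-bit
       // counter block (big-endian <-> little-endian), for use by the CTR-mode stubs.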
3472 
3473   // Utility routine for loading a 128-bit key word in little endian format.
3474   // Can optionally specify that the shuffle mask is already in an xmm register.
3475   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3476     __ movdqu(xmmdst, Address(key, offset));
3477     if (xmm_shuf_mask != NULL) {
3478       __ pshufb(xmmdst, xmm_shuf_mask);
3479     } else {
3480       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3481     }
3482   }
3483 
3484   // Utility routine for increasing the 128-bit counter (the iv in CTR mode)
3485   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3486     __ pextrq(reg, xmmdst, 0x0);
3487     __ addq(reg, inc_delta);
3488     __ pinsrq(xmmdst, reg, 0x0);
3489     __ jcc(Assembler::carryClear, next_block); // jump if no carry
3490     __ pextrq(reg, xmmdst, 0x01); // Carry
3491     __ addq(reg, 0x01);
3492     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3493     __ BIND(next_block);          // next instruction
3494   }
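       // Sketch of the helper above, treating xmmdst as two 64-bit halves:
       //   xmmdst[63:0] += inc_delta;
       //   if (no carry)  goto next_block;
       //   xmmdst[127:64] += 1;            // propagate the carry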
3495 
3496   // Arguments:
3497   //
3498   // Inputs:
3499   //   c_rarg0   - source byte array address
3500   //   c_rarg1   - destination byte array address
3501   //   c_rarg2   - K (key) in little endian int array
3502   //
3503   address generate_aescrypt_encryptBlock() {
3504     assert(UseAES, "need AES instructions and misaligned SSE support");
3505     __ align(CodeEntryAlignment);
3506     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3507     Label L_doLast;
3508     address start = __ pc();
3509 
3510     const Register from        = c_rarg0;  // source array address
3511     const Register to          = c_rarg1;  // destination array address
3512     const Register key         = c_rarg2;  // key array address
3513     const Register keylen      = rax;
3514 
3515     const XMMRegister xmm_result = xmm0;
3516     const XMMRegister xmm_key_shuf_mask = xmm1;
3517     // On win64 xmm6-xmm15 must be preserved so don't use them.
3518     const XMMRegister xmm_temp1  = xmm2;
3519     const XMMRegister xmm_temp2  = xmm3;
3520     const XMMRegister xmm_temp3  = xmm4;
3521     const XMMRegister xmm_temp4  = xmm5;
3522 
3523     __ enter(); // required for proper stackwalking of RuntimeStub frame
3524 
3525     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3526     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3527 
3528     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3529     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3530 
3531     // For encryption, the java expanded key ordering is just what we need
3532     // we don't know if the key is aligned, hence not using load-execute form
3533 
3534     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3535     __ pxor(xmm_result, xmm_temp1);
3536 
3537     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3538     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3539     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3540     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3541 
3542     __ aesenc(xmm_result, xmm_temp1);
3543     __ aesenc(xmm_result, xmm_temp2);
3544     __ aesenc(xmm_result, xmm_temp3);
3545     __ aesenc(xmm_result, xmm_temp4);
3546 
3547     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3548     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3549     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3550     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3551 
3552     __ aesenc(xmm_result, xmm_temp1);
3553     __ aesenc(xmm_result, xmm_temp2);
3554     __ aesenc(xmm_result, xmm_temp3);
3555     __ aesenc(xmm_result, xmm_temp4);
3556 
3557     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3558     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3559 
3560     __ cmpl(keylen, 44);
3561     __ jccb(Assembler::equal, L_doLast);
3562 
3563     __ aesenc(xmm_result, xmm_temp1);
3564     __ aesenc(xmm_result, xmm_temp2);
3565 
3566     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3567     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3568 
3569     __ cmpl(keylen, 52);
3570     __ jccb(Assembler::equal, L_doLast);
3571 
3572     __ aesenc(xmm_result, xmm_temp1);
3573     __ aesenc(xmm_result, xmm_temp2);
3574 
3575     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3576     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3577 
3578     __ BIND(L_doLast);
3579     __ aesenc(xmm_result, xmm_temp1);
3580     __ aesenclast(xmm_result, xmm_temp2);
3581     __ movdqu(Address(to, 0), xmm_result);        // store the result
3582     __ xorptr(rax, rax); // return 0
3583     __ leave(); // required for proper stackwalking of RuntimeStub frame
3584     __ ret(0);
3585 
3586     return start;
3587   }
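       // Round structure implemented above (Nr = keylen/4 - 1 = 10, 12 or 14):
       //   state      = plaintext ^ rk[0];
       //   for (r = 1; r < Nr; r++)  state = AESENC(state, rk[r]);
       //   ciphertext = AESENCLAST(state, rk[Nr]);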
3588 
3589 
3590   // Arguments:
3591   //
3592   // Inputs:
3593   //   c_rarg0   - source byte array address
3594   //   c_rarg1   - destination byte array address
3595   //   c_rarg2   - K (key) in little endian int array
3596   //
3597   address generate_aescrypt_decryptBlock() {
3598     assert(UseAES, "need AES instructions and misaligned SSE support");
3599     __ align(CodeEntryAlignment);
3600     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3601     Label L_doLast;
3602     address start = __ pc();
3603 
3604     const Register from        = c_rarg0;  // source array address
3605     const Register to          = c_rarg1;  // destination array address
3606     const Register key         = c_rarg2;  // key array address
3607     const Register keylen      = rax;
3608 
3609     const XMMRegister xmm_result = xmm0;
3610     const XMMRegister xmm_key_shuf_mask = xmm1;
3611     // On win64 xmm6-xmm15 must be preserved so don't use them.
3612     const XMMRegister xmm_temp1  = xmm2;
3613     const XMMRegister xmm_temp2  = xmm3;
3614     const XMMRegister xmm_temp3  = xmm4;
3615     const XMMRegister xmm_temp4  = xmm5;
3616 
3617     __ enter(); // required for proper stackwalking of RuntimeStub frame
3618 
3619     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3620     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3621 
3622     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3623     __ movdqu(xmm_result, Address(from, 0));
3624 
3625     // for decryption java expanded key ordering is rotated one position from what we want
3626     // so we start from 0x10 here and hit 0x00 last
3627     // we don't know if the key is aligned, hence not using load-execute form
3628     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3629     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3630     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3631     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3632 
3633     __ pxor  (xmm_result, xmm_temp1);
3634     __ aesdec(xmm_result, xmm_temp2);
3635     __ aesdec(xmm_result, xmm_temp3);
3636     __ aesdec(xmm_result, xmm_temp4);
3637 
3638     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3639     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3640     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3641     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3642 
3643     __ aesdec(xmm_result, xmm_temp1);
3644     __ aesdec(xmm_result, xmm_temp2);
3645     __ aesdec(xmm_result, xmm_temp3);
3646     __ aesdec(xmm_result, xmm_temp4);
3647 
3648     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3649     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3650     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3651 
3652     __ cmpl(keylen, 44);
3653     __ jccb(Assembler::equal, L_doLast);
3654 
3655     __ aesdec(xmm_result, xmm_temp1);
3656     __ aesdec(xmm_result, xmm_temp2);
3657 
3658     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3659     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3660 
3661     __ cmpl(keylen, 52);
3662     __ jccb(Assembler::equal, L_doLast);
3663 
3664     __ aesdec(xmm_result, xmm_temp1);
3665     __ aesdec(xmm_result, xmm_temp2);
3666 
3667     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3668     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3669 
3670     __ BIND(L_doLast);
3671     __ aesdec(xmm_result, xmm_temp1);
3672     __ aesdec(xmm_result, xmm_temp2);
3673 
3674     // for decryption the aesdeclast operation is always on key+0x00
3675     __ aesdeclast(xmm_result, xmm_temp3);
3676     __ movdqu(Address(to, 0), xmm_result);  // store the result
3677     __ xorptr(rax, rax); // return 0
3678     __ leave(); // required for proper stackwalking of RuntimeStub frame
3679     __ ret(0);
3680 
3681     return start;
3682   }
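       // Decryption mirrors the round structure above using AESDEC/AESDECLAST, but
       // the Java-expanded key is rotated by one slot: the rounds consume the key
       // words at offsets 0x10 and up, and the final AESDECLAST always uses the
       // word at offset 0x00.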
3683 
3684 
3685   // Arguments:
3686   //
3687   // Inputs:
3688   //   c_rarg0   - source byte array address
3689   //   c_rarg1   - destination byte array address
3690   //   c_rarg2   - K (key) in little endian int array
3691   //   c_rarg3   - r vector byte array address
3692   //   c_rarg4   - input length
3693   //
3694   // Output:
3695   //   rax       - input length
3696   //
3697   address generate_cipherBlockChaining_encryptAESCrypt() {
3698     assert(UseAES, "need AES instructions and misaligned SSE support");
3699     __ align(CodeEntryAlignment);
3700     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3701     address start = __ pc();
3702 
3703     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3704     const Register from        = c_rarg0;  // source array address
3705     const Register to          = c_rarg1;  // destination array address
3706     const Register key         = c_rarg2;  // key array address
3707     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3708                                            // and left with the results of the last encryption block
3709 #ifndef _WIN64
3710     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3711 #else
3712     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3713     const Register len_reg     = r11;      // pick the volatile windows register
3714 #endif
3715     const Register pos         = rax;
3716 
3717     // xmm register assignments for the loops below
3718     const XMMRegister xmm_result = xmm0;
3719     const XMMRegister xmm_temp   = xmm1;
3720     // keys 0-10 preloaded into xmm2-xmm12
3721     const int XMM_REG_NUM_KEY_FIRST = 2;
3722     const int XMM_REG_NUM_KEY_LAST  = 15;
3723     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3724     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3725     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3726     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3727     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3728 
3729     __ enter(); // required for proper stackwalking of RuntimeStub frame
3730 
3731 #ifdef _WIN64
3732     // on win64, fill len_reg from stack position
3733     __ movl(len_reg, len_mem);
3734 #else
3735     __ push(len_reg); // Save
3736 #endif
3737 
3738     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3739     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3740     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3741     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3742       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3743       offset += 0x10;
3744     }
3745     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3746 
3747     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3748     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3749     __ cmpl(rax, 44);
3750     __ jcc(Assembler::notEqual, L_key_192_256);
3751 
3752     // 128 bit code follows here
3753     __ movptr(pos, 0);
3754     __ align(OptoLoopAlignment);
3755 
3756     __ BIND(L_loopTop_128);
3757     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3758     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3759     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3760     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3761       __ aesenc(xmm_result, as_XMMRegister(rnum));
3762     }
3763     __ aesenclast(xmm_result, xmm_key10);
3764     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3765     // no need to store r to memory until we exit
3766     __ addptr(pos, AESBlockSize);
3767     __ subptr(len_reg, AESBlockSize);
3768     __ jcc(Assembler::notEqual, L_loopTop_128);
3769 
3770     __ BIND(L_exit);
3771     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3772 
3773 #ifdef _WIN64
3774     __ movl(rax, len_mem);
3775 #else
3776     __ pop(rax); // return length
3777 #endif
3778     __ leave(); // required for proper stackwalking of RuntimeStub frame
3779     __ ret(0);
3780 
3781     __ BIND(L_key_192_256);
3782     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3783     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3784     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3785     __ cmpl(rax, 52);
3786     __ jcc(Assembler::notEqual, L_key_256);
3787 
3788     // 192-bit code follows here (could be changed to use more xmm registers)
3789     __ movptr(pos, 0);
3790     __ align(OptoLoopAlignment);
3791 
3792     __ BIND(L_loopTop_192);
3793     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3794     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3795     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3796     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3797       __ aesenc(xmm_result, as_XMMRegister(rnum));
3798     }
3799     __ aesenclast(xmm_result, xmm_key12);
3800     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3801     // no need to store r to memory until we exit
3802     __ addptr(pos, AESBlockSize);
3803     __ subptr(len_reg, AESBlockSize);
3804     __ jcc(Assembler::notEqual, L_loopTop_192);
3805     __ jmp(L_exit);
3806 
3807     __ BIND(L_key_256);
3808     // 256-bit code follows here (could be changed to use more xmm registers)
3809     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3810     __ movptr(pos, 0);
3811     __ align(OptoLoopAlignment);
3812 
3813     __ BIND(L_loopTop_256);
3814     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3815     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3816     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3817     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3818       __ aesenc(xmm_result, as_XMMRegister(rnum));
3819     }
3820     load_key(xmm_temp, key, 0xe0);
3821     __ aesenclast(xmm_result, xmm_temp);
3822     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3823     // no need to store r to memory until we exit
3824     __ addptr(pos, AESBlockSize);
3825     __ subptr(len_reg, AESBlockSize);
3826     __ jcc(Assembler::notEqual, L_loopTop_256);
3827     __ jmp(L_exit);
3828 
3829     return start;
3830   }
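       // CBC recurrence implemented by the three loops above, with r initialized
       // from rvec (the IV) and written back to rvec on exit:
       //   for each 16-byte block i:  r = AES_encrypt(key, p[i] ^ r);  c[i] = r;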
3831 
3832   // Safefetch stubs.
3833   void generate_safefetch(const char* name, int size, address* entry,
3834                           address* fault_pc, address* continuation_pc) {
3835     // safefetch signatures:
3836     //   int      SafeFetch32(int*      adr, int      errValue);
3837     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3838     //
3839     // arguments:
3840     //   c_rarg0 = adr
3841     //   c_rarg1 = errValue
3842     //
3843     // result:
3844     //   rax      = *adr or errValue
3845 
3846     StubCodeMark mark(this, "StubRoutines", name);
3847 
3848     // Entry point, pc or function descriptor.
3849     *entry = __ pc();
3850 
3851     // Load *adr into c_rarg1, may fault.
3852     *fault_pc = __ pc();
3853     switch (size) {
3854       case 4:
3855         // int32_t
3856         __ movl(c_rarg1, Address(c_rarg0, 0));
3857         break;
3858       case 8:
3859         // int64_t
3860         __ movq(c_rarg1, Address(c_rarg0, 0));
3861         break;
3862       default:
3863         ShouldNotReachHere();
3864     }
3865 
3866     // return errValue or *adr
3867     *continuation_pc = __ pc();
3868     __ movq(rax, c_rarg1);
3869     __ ret(0);
3870   }
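       // If the load at *fault_pc faults, the VM's signal handler resumes the stub
       // at *continuation_pc with c_rarg1 still holding errValue, so the caller
       // observes errValue instead of a crash.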
3871 
3872   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3873   // to hide instruction latency
3874   //
3875   // Arguments:
3876   //
3877   // Inputs:
3878   //   c_rarg0   - source byte array address
3879   //   c_rarg1   - destination byte array address
3880   //   c_rarg2   - K (key) in little endian int array
3881   //   c_rarg3   - r vector byte array address
3882   //   c_rarg4   - input length
3883   //
3884   // Output:
3885   //   rax       - input length
3886   //
3887   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3888     assert(UseAES, "need AES instructions and misaligned SSE support");
3889     __ align(CodeEntryAlignment);
3890     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3891     address start = __ pc();
3892 
3893     const Register from        = c_rarg0;  // source array address
3894     const Register to          = c_rarg1;  // destination array address
3895     const Register key         = c_rarg2;  // key array address
3896     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3897                                            // and left with the results of the last encryption block
3898 #ifndef _WIN64
3899     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3900 #else
3901     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3902     const Register len_reg     = r11;      // pick the volatile windows register
3903 #endif
3904     const Register pos         = rax;
3905 
3906     const int PARALLEL_FACTOR = 4;
3907     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3908 
3909     Label L_exit;
3910     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3911     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3912     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3913     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3914     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3915 
3916     // keys 0-10 preloaded into xmm5-xmm15
3917     const int XMM_REG_NUM_KEY_FIRST = 5;
3918     const int XMM_REG_NUM_KEY_LAST  = 15;
3919     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3920     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3921 
3922     __ enter(); // required for proper stackwalking of RuntimeStub frame
3923 
3924 #ifdef _WIN64
3925     // on win64, fill len_reg from stack position
3926     __ movl(len_reg, len_mem);
3927 #else
3928     __ push(len_reg); // Save
3929 #endif
3930     __ push(rbx);
3931     // the java expanded key ordering is rotated one position from what we want
3932     // so we start from 0x10 here and hit 0x00 last
3933     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3934     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3935     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3936     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3937       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3938       offset += 0x10;
3939     }
3940     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3941 
3942     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3943 
3944     // registers holding the four results in the parallelized loop
3945     const XMMRegister xmm_result0 = xmm0;
3946     const XMMRegister xmm_result1 = xmm2;
3947     const XMMRegister xmm_result2 = xmm3;
3948     const XMMRegister xmm_result3 = xmm4;
3949 
3950     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3951 
3952     __ xorptr(pos, pos);
3953 
3954     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3955     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3956     __ cmpl(rbx, 52);
3957     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3958     __ cmpl(rbx, 60);
3959     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3960 
3961 #define DoFour(opc, src_reg)           \
3962   __ opc(xmm_result0, src_reg);         \
3963   __ opc(xmm_result1, src_reg);         \
3964   __ opc(xmm_result2, src_reg);         \
3965   __ opc(xmm_result3, src_reg);         \
3966 
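         // CBC decryption recurrence implemented by the 4-way unrolled loop below
         // (xmm_prev_block_cipher starts out as the IV loaded from rvec):
         //   p[i] = AES_decrypt(key, c[i]) ^ c[i-1],   with c[-1] = IV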
3967     for (int k = 0; k < 3; ++k) {
3968       __ BIND(L_multiBlock_loopTopHead[k]);
3969       if (k != 0) {
3970         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3971         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3972       }
3973       if (k == 1) {
3974         __ subptr(rsp, 6 * wordSize);
3975         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3976         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3977         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3978         load_key(xmm1, key, 0xc0);  // 0xc0;
3979         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3980       } else if (k == 2) {
3981         __ subptr(rsp, 10 * wordSize);
3982         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3983         load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3984         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3985         load_key(xmm1, key, 0xe0);  // 0xe0;
3986         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3987         load_key(xmm15, key, 0xb0); // 0xb0;
3988         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3989         load_key(xmm1, key, 0xc0);  // 0xc0;
3990         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3991       }
3992       __ align(OptoLoopAlignment);
3993       __ BIND(L_multiBlock_loopTop[k]);
3994       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3995       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3996 
3997       if  (k != 0) {
3998         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3999         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
4000       }
4001 
4002       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
4003       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4004       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4005       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4006 
4007       DoFour(pxor, xmm_key_first);
4008       if (k == 0) {
4009         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
4010           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4011         }
4012         DoFour(aesdeclast, xmm_key_last);
4013       } else if (k == 1) {
4014         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
4015           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4016         }
4017         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
4018         DoFour(aesdec, xmm1);  // key : 0xc0
4019         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
4020         DoFour(aesdeclast, xmm_key_last);
4021       } else if (k == 2) {
4022         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
4023           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4024         }
4025         DoFour(aesdec, xmm1);  // key : 0xc0
4026         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
4027         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
4028         DoFour(aesdec, xmm15);  // key : 0xd0
4029         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
4030         DoFour(aesdec, xmm1);  // key : 0xe0
4031         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
4032         DoFour(aesdeclast, xmm_key_last);
4033       }
4034 
4035       // for each result, xor with the r vector of previous cipher block
4036       __ pxor(xmm_result0, xmm_prev_block_cipher);
4037       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4038       __ pxor(xmm_result1, xmm_prev_block_cipher);
4039       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4040       __ pxor(xmm_result2, xmm_prev_block_cipher);
4041       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4042       __ pxor(xmm_result3, xmm_prev_block_cipher);
4043       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
4044       if (k != 0) {
4045         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
4046       }
4047 
4048       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
4049       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4050       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4051       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4052 
4053       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
4054       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
4055       __ jmp(L_multiBlock_loopTop[k]);
4056 
4057       // registers used in the non-parallelized loops
4058       // xmm register assignments for the loops below
4059       const XMMRegister xmm_result = xmm0;
4060       const XMMRegister xmm_prev_block_cipher_save = xmm2;
4061       const XMMRegister xmm_key11 = xmm3;
4062       const XMMRegister xmm_key12 = xmm4;
4063       const XMMRegister key_tmp = xmm4;
4064 
4065       __ BIND(L_singleBlock_loopTopHead[k]);
4066       if (k == 1) {
4067         __ addptr(rsp, 6 * wordSize);
4068       } else if (k == 2) {
4069         __ addptr(rsp, 10 * wordSize);
4070       }
4071       __ cmpptr(len_reg, 0); // any blocks left??
4072       __ jcc(Assembler::equal, L_exit);
4073       __ BIND(L_singleBlock_loopTopHead2[k]);
4074       if (k == 1) {
4075         load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
4076         load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
4077       }
4078       if (k == 2) {
4079         load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
4080       }
4081       __ align(OptoLoopAlignment);
4082       __ BIND(L_singleBlock_loopTop[k]);
4083       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
4084       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
4085       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
4086       for (int rnum = 1; rnum <= 9 ; rnum++) {
4087           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4088       }
4089       if (k == 1) {
4090         __ aesdec(xmm_result, xmm_key11);
4091         __ aesdec(xmm_result, xmm_key12);
4092       }
4093       if (k == 2) {
4094         __ aesdec(xmm_result, xmm_key11);
4095         load_key(key_tmp, key, 0xc0);
4096         __ aesdec(xmm_result, key_tmp);
4097         load_key(key_tmp, key, 0xd0);
4098         __ aesdec(xmm_result, key_tmp);
4099         load_key(key_tmp, key, 0xe0);
4100         __ aesdec(xmm_result, key_tmp);
4101       }
4102 
4103       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
4104       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
4105       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
4106       // no need to store r to memory until we exit
4107       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
4108       __ addptr(pos, AESBlockSize);
4109       __ subptr(len_reg, AESBlockSize);
4110       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
4111       if (k != 2) {
4112         __ jmp(L_exit);
4113       }
4114     } //for 128/192/256
4115 
4116     __ BIND(L_exit);
4117     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
4118     __ pop(rbx);
4119 #ifdef _WIN64
4120     __ movl(rax, len_mem);
4121 #else
4122     __ pop(rax); // return length
4123 #endif
4124     __ leave(); // required for proper stackwalking of RuntimeStub frame
4125     __ ret(0);
4126     return start;
4127 }
4128 
4129   address generate_electronicCodeBook_encryptAESCrypt() {
4130     __ align(CodeEntryAlignment);
4131     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_encryptAESCrypt");
4132     address start = __ pc();
4133     const Register from = c_rarg0;  // source array address
4134     const Register to = c_rarg1;  // destination array address
4135     const Register key = c_rarg2;  // key array address
4136     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4137     __ enter(); // required for proper stackwalking of RuntimeStub frame
4138     __ aesecb_encrypt(from, to, key, len);
4139     __ leave(); // required for proper stackwalking of RuntimeStub frame
4140     __ ret(0);
4141     return start;
4142  }
4143 
4144   address generate_electronicCodeBook_decryptAESCrypt() {
4145     __ align(CodeEntryAlignment);
4146     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_decryptAESCrypt");
4147     address start = __ pc();
4148     const Register from = c_rarg0;  // source array address
4149     const Register to = c_rarg1;  // destination array address
4150     const Register key = c_rarg2;  // key array address
4151     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4152     __ enter(); // required for proper stackwalking of RuntimeStub frame
4153     __ aesecb_decrypt(from, to, key, len);
4154     __ leave(); // required for proper stackwalking of RuntimeStub frame
4155     __ ret(0);
4156     return start;
4157   }
4158 
4159   // ofs and limit are used for the multi-block byte array.
4160   // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
4161   address generate_md5_implCompress(bool multi_block, const char *name) {
4162     __ align(CodeEntryAlignment);
4163     StubCodeMark mark(this, "StubRoutines", name);
4164     address start = __ pc();
4165 
4166     const Register buf_param = r15;
4167     const Address state_param(rsp, 0 * wordSize);
4168     const Address ofs_param  (rsp, 1 * wordSize    );
4169     const Address limit_param(rsp, 1 * wordSize + 4);
4170 
4171     __ enter();
4172     __ push(rbx);
4173     __ push(rdi);
4174     __ push(rsi);
4175     __ push(r15);
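         // reserve two stack slots: the state pointer is stored at [rsp], ofs and limit are packed into the second slot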
4176     __ subptr(rsp, 2 * wordSize);
4177 
4178     __ movptr(buf_param, c_rarg0);
4179     __ movptr(state_param, c_rarg1);
4180     if (multi_block) {
4181       __ movl(ofs_param, c_rarg2);
4182       __ movl(limit_param, c_rarg3);
4183     }
4184     __ fast_md5(buf_param, state_param, ofs_param, limit_param, multi_block);
4185 
4186     __ addptr(rsp, 2 * wordSize);
4187     __ pop(r15);
4188     __ pop(rsi);
4189     __ pop(rdi);
4190     __ pop(rbx);
4191     __ leave();
4192     __ ret(0);
4193     return start;
4194   }
4195 
4196   address generate_upper_word_mask() {
4197     __ align64();
4198     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
4199     address start = __ pc();
4200     __ emit_data64(0x0000000000000000, relocInfo::none);
4201     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
4202     return start;
4203   }
4204 
4205   address generate_shuffle_byte_flip_mask() {
4206     __ align64();
4207     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
4208     address start = __ pc();
4209     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4210     __ emit_data64(0x0001020304050607, relocInfo::none);
4211     return start;
4212   }
4213 
4214   // ofs and limit are used for the multi-block byte array.
4215   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4216   address generate_sha1_implCompress(bool multi_block, const char *name) {
4217     __ align(CodeEntryAlignment);
4218     StubCodeMark mark(this, "StubRoutines", name);
4219     address start = __ pc();
4220 
4221     Register buf = c_rarg0;
4222     Register state = c_rarg1;
4223     Register ofs = c_rarg2;
4224     Register limit = c_rarg3;
4225 
4226     const XMMRegister abcd = xmm0;
4227     const XMMRegister e0 = xmm1;
4228     const XMMRegister e1 = xmm2;
4229     const XMMRegister msg0 = xmm3;
4230 
4231     const XMMRegister msg1 = xmm4;
4232     const XMMRegister msg2 = xmm5;
4233     const XMMRegister msg3 = xmm6;
4234     const XMMRegister shuf_mask = xmm7;
4235 
4236     __ enter();
4237 
4238     __ subptr(rsp, 4 * wordSize);
4239 
4240     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
4241       buf, state, ofs, limit, rsp, multi_block);
4242 
4243     __ addptr(rsp, 4 * wordSize);
4244 
4245     __ leave();
4246     __ ret(0);
4247     return start;
4248   }
4249 
4250   address generate_pshuffle_byte_flip_mask() {
4251     __ align64();
4252     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
4253     address start = __ pc();
4254     __ emit_data64(0x0405060700010203, relocInfo::none);
4255     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4256 
4257     if (VM_Version::supports_avx2()) {
4258       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
4259       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4260       // _SHUF_00BA
4261       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4262       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4263       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4264       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4265       // _SHUF_DC00
4266       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4267       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4268       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4269       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4270     }
4271 
4272     return start;
4273   }
4274 
4275   //Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
4276   address generate_pshuffle_byte_flip_mask_sha512() {
4277     __ align32();
4278     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
4279     address start = __ pc();
4280     if (VM_Version::supports_avx2()) {
4281       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
4282       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4283       __ emit_data64(0x1011121314151617, relocInfo::none);
4284       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
4285       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
4286       __ emit_data64(0x0000000000000000, relocInfo::none);
4287       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4288       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4289     }
4290 
4291     return start;
4292   }
4293 
4294 // ofs and limit are used for the multi-block byte array.
4295 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4296   address generate_sha256_implCompress(bool multi_block, const char *name) {
4297     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
4298     __ align(CodeEntryAlignment);
4299     StubCodeMark mark(this, "StubRoutines", name);
4300     address start = __ pc();
4301 
4302     Register buf = c_rarg0;
4303     Register state = c_rarg1;
4304     Register ofs = c_rarg2;
4305     Register limit = c_rarg3;
4306 
4307     const XMMRegister msg = xmm0;
4308     const XMMRegister state0 = xmm1;
4309     const XMMRegister state1 = xmm2;
4310     const XMMRegister msgtmp0 = xmm3;
4311 
4312     const XMMRegister msgtmp1 = xmm4;
4313     const XMMRegister msgtmp2 = xmm5;
4314     const XMMRegister msgtmp3 = xmm6;
4315     const XMMRegister msgtmp4 = xmm7;
4316 
4317     const XMMRegister shuf_mask = xmm8;
4318 
4319     __ enter();
4320 
4321     __ subptr(rsp, 4 * wordSize);
4322 
4323     if (VM_Version::supports_sha()) {
4324       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4325         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4326     } else if (VM_Version::supports_avx2()) {
4327       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4328         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4329     }
4330     __ addptr(rsp, 4 * wordSize);
4331     __ vzeroupper();
4332     __ leave();
4333     __ ret(0);
4334     return start;
4335   }
4336 
4337   address generate_sha512_implCompress(bool multi_block, const char *name) {
4338     assert(VM_Version::supports_avx2(), "");
4339     assert(VM_Version::supports_bmi2(), "");
4340     __ align(CodeEntryAlignment);
4341     StubCodeMark mark(this, "StubRoutines", name);
4342     address start = __ pc();
4343 
4344     Register buf = c_rarg0;
4345     Register state = c_rarg1;
4346     Register ofs = c_rarg2;
4347     Register limit = c_rarg3;
4348 
4349     const XMMRegister msg = xmm0;
4350     const XMMRegister state0 = xmm1;
4351     const XMMRegister state1 = xmm2;
4352     const XMMRegister msgtmp0 = xmm3;
4353     const XMMRegister msgtmp1 = xmm4;
4354     const XMMRegister msgtmp2 = xmm5;
4355     const XMMRegister msgtmp3 = xmm6;
4356     const XMMRegister msgtmp4 = xmm7;
4357 
4358     const XMMRegister shuf_mask = xmm8;
4359 
4360     __ enter();
4361 
4362     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4363     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4364 
4365     __ vzeroupper();
4366     __ leave();
4367     __ ret(0);
4368     return start;
4369   }
4370 
4371   // This mask is used for incrementing the counter value (linc0, linc4, etc.)
4372   address counter_mask_addr() {
4373     __ align64();
4374     StubCodeMark mark(this, "StubRoutines", "counter_mask_addr");
4375     address start = __ pc();
4376     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);//lbswapmask
4377     __ emit_data64(0x0001020304050607, relocInfo::none);
4378     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4379     __ emit_data64(0x0001020304050607, relocInfo::none);
4380     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4381     __ emit_data64(0x0001020304050607, relocInfo::none);
4382     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4383     __ emit_data64(0x0001020304050607, relocInfo::none);
4384     __ emit_data64(0x0000000000000000, relocInfo::none);//linc0 = counter_mask_addr+64
4385     __ emit_data64(0x0000000000000000, relocInfo::none);
4386     __ emit_data64(0x0000000000000001, relocInfo::none);//counter_mask_addr() + 80
4387     __ emit_data64(0x0000000000000000, relocInfo::none);
4388     __ emit_data64(0x0000000000000002, relocInfo::none);
4389     __ emit_data64(0x0000000000000000, relocInfo::none);
4390     __ emit_data64(0x0000000000000003, relocInfo::none);
4391     __ emit_data64(0x0000000000000000, relocInfo::none);
4392     __ emit_data64(0x0000000000000004, relocInfo::none);//linc4 = counter_mask_addr() + 128
4393     __ emit_data64(0x0000000000000000, relocInfo::none);
4394     __ emit_data64(0x0000000000000004, relocInfo::none);
4395     __ emit_data64(0x0000000000000000, relocInfo::none);
4396     __ emit_data64(0x0000000000000004, relocInfo::none);
4397     __ emit_data64(0x0000000000000000, relocInfo::none);
4398     __ emit_data64(0x0000000000000004, relocInfo::none);
4399     __ emit_data64(0x0000000000000000, relocInfo::none);
4400     __ emit_data64(0x0000000000000008, relocInfo::none);//linc8 = counter_mask_addr() + 192
4401     __ emit_data64(0x0000000000000000, relocInfo::none);
4402     __ emit_data64(0x0000000000000008, relocInfo::none);
4403     __ emit_data64(0x0000000000000000, relocInfo::none);
4404     __ emit_data64(0x0000000000000008, relocInfo::none);
4405     __ emit_data64(0x0000000000000000, relocInfo::none);
4406     __ emit_data64(0x0000000000000008, relocInfo::none);
4407     __ emit_data64(0x0000000000000000, relocInfo::none);
4408     __ emit_data64(0x0000000000000020, relocInfo::none);//linc32 = counter_mask_addr() + 256
4409     __ emit_data64(0x0000000000000000, relocInfo::none);
4410     __ emit_data64(0x0000000000000020, relocInfo::none);
4411     __ emit_data64(0x0000000000000000, relocInfo::none);
4412     __ emit_data64(0x0000000000000020, relocInfo::none);
4413     __ emit_data64(0x0000000000000000, relocInfo::none);
4414     __ emit_data64(0x0000000000000020, relocInfo::none);
4415     __ emit_data64(0x0000000000000000, relocInfo::none);
4416     __ emit_data64(0x0000000000000010, relocInfo::none);//linc16 = counter_mask_addr() + 320
4417     __ emit_data64(0x0000000000000000, relocInfo::none);
4418     __ emit_data64(0x0000000000000010, relocInfo::none);
4419     __ emit_data64(0x0000000000000000, relocInfo::none);
4420     __ emit_data64(0x0000000000000010, relocInfo::none);
4421     __ emit_data64(0x0000000000000000, relocInfo::none);
4422     __ emit_data64(0x0000000000000010, relocInfo::none);
4423     __ emit_data64(0x0000000000000000, relocInfo::none);
4424     return start;
4425   }
4426 
4427  // Vector AES Counter implementation
4428   address generate_counterMode_VectorAESCrypt()  {
4429     __ align(CodeEntryAlignment);
4430     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4431     address start = __ pc();
4432     const Register from = c_rarg0; // source array address
4433     const Register to = c_rarg1; // destination array address
4434     const Register key = c_rarg2; // key array address r8
4435     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4436     // and updated with the incremented counter in the end
4437 #ifndef _WIN64
4438     const Register len_reg = c_rarg4;
4439     const Register saved_encCounter_start = c_rarg5;
4440     const Register used_addr = r10;
4441     const Address  used_mem(rbp, 2 * wordSize);
4442     const Register used = r11;
4443 #else
4444     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4445     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
4446     const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4447     const Register len_reg = r10; // pick the first volatile windows register
4448     const Register saved_encCounter_start = r11;
4449     const Register used_addr = r13;
4450     const Register used = r14;
4451 #endif
4452     __ enter();
4453    // Save state before entering routine
4454     __ push(r12);
4455     __ push(r13);
4456     __ push(r14);
4457     __ push(r15);
4458 #ifdef _WIN64
4459     // on win64, fill len_reg from stack position
4460     __ movl(len_reg, len_mem);
4461     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4462     __ movptr(used_addr, used_mem);
4463     __ movl(used, Address(used_addr, 0));
4464 #else
4465     __ push(len_reg); // Save
4466     __ movptr(used_addr, used_mem);
4467     __ movl(used, Address(used_addr, 0));
4468 #endif
4469     __ push(rbx);
4470     __ aesctr_encrypt(from, to, key, counter, len_reg, used, used_addr, saved_encCounter_start);
4471     // Restore state before leaving routine
4472     __ pop(rbx);
4473 #ifdef _WIN64
4474     __ movl(rax, len_mem); // return length
4475 #else
4476     __ pop(rax); // return length
4477 #endif
4478     __ pop(r15);
4479     __ pop(r14);
4480     __ pop(r13);
4481     __ pop(r12);
4482 
4483     __ leave(); // required for proper stackwalking of RuntimeStub frame
4484     __ ret(0);
4485     return start;
4486   }
4487 
4488   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
4489   // to hide instruction latency
4490   //
4491   // Arguments:
4492   //
4493   // Inputs:
4494   //   c_rarg0   - source byte array address
4495   //   c_rarg1   - destination byte array address
4496   //   c_rarg2   - K (key) in little endian int array
4497   //   c_rarg3   - counter vector byte array address
4498   //   Linux
4499   //     c_rarg4   -          input length
4500   //     c_rarg5   -          saved encryptedCounter start
4501   //     rbp + 6 * wordSize - saved used length
4502   //   Windows
4503   //     rbp + 6 * wordSize - input length
4504   //     rbp + 7 * wordSize - saved encryptedCounter start
4505   //     rbp + 8 * wordSize - saved used length
4506   //
4507   // Output:
4508   //   rax       - input length
4509   //
4510   address generate_counterMode_AESCrypt_Parallel() {
4511     assert(UseAES, "need AES instructions and misaligned SSE support");
4512     __ align(CodeEntryAlignment);
4513     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4514     address start = __ pc();
4515     const Register from = c_rarg0; // source array address
4516     const Register to = c_rarg1; // destination array address
4517     const Register key = c_rarg2; // key array address
4518     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4519                                       // and updated with the incremented counter in the end
4520 #ifndef _WIN64
4521     const Register len_reg = c_rarg4;
4522     const Register saved_encCounter_start = c_rarg5;
4523     const Register used_addr = r10;
4524     const Address  used_mem(rbp, 2 * wordSize);
4525     const Register used = r11;
4526 #else
4527     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4528     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
4529     const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4530     const Register len_reg = r10; // pick the first volatile windows register
4531     const Register saved_encCounter_start = r11;
4532     const Register used_addr = r13;
4533     const Register used = r14;
4534 #endif
4535     const Register pos = rax;
4536 
4537     const int PARALLEL_FACTOR = 6;
4538     const XMMRegister xmm_counter_shuf_mask = xmm0;
4539     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
4540     const XMMRegister xmm_curr_counter = xmm2;
4541 
4542     const XMMRegister xmm_key_tmp0 = xmm3;
4543     const XMMRegister xmm_key_tmp1 = xmm4;
4544 
4545     // registers holding the four results in the parallelized loop
4546     const XMMRegister xmm_result0 = xmm5;
4547     const XMMRegister xmm_result1 = xmm6;
4548     const XMMRegister xmm_result2 = xmm7;
4549     const XMMRegister xmm_result3 = xmm8;
4550     const XMMRegister xmm_result4 = xmm9;
4551     const XMMRegister xmm_result5 = xmm10;
4552 
4553     const XMMRegister xmm_from0 = xmm11;
4554     const XMMRegister xmm_from1 = xmm12;
4555     const XMMRegister xmm_from2 = xmm13;
4556     const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on WIN64.
4557     const XMMRegister xmm_from4 = xmm3;  // reuse xmm3~4, because xmm_key_tmp0~1 are no longer needed once the input text is loaded
4558     const XMMRegister xmm_from5 = xmm4;
4559 
4560     //for key_128, key_192, key_256
4561     const int rounds[3] = {10, 12, 14};
4562     Label L_exit_preLoop, L_preLoop_start;
4563     Label L_multiBlock_loopTop[3];
4564     Label L_singleBlockLoopTop[3];
4565     Label L__incCounter[3][6]; //for 6 blocks
4566     Label L__incCounter_single[3]; //for single block, key128, key192, key256
4567     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
4568     Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
4569 
4570     Label L_exit;
4571 
4572     __ enter(); // required for proper stackwalking of RuntimeStub frame
4573 
4574 #ifdef _WIN64
4575     // allocate spill slots for r13, r14
4576     enum {
4577         saved_r13_offset,
4578         saved_r14_offset
4579     };
4580     __ subptr(rsp, 2 * wordSize);
4581     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
4582     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
4583 
4584     // on win64, fill len_reg from stack position
4585     __ movl(len_reg, len_mem);
4586     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4587     __ movptr(used_addr, used_mem);
4588     __ movl(used, Address(used_addr, 0));
4589 #else
4590     __ push(len_reg); // Save
4591     __ movptr(used_addr, used_mem);
4592     __ movl(used, Address(used_addr, 0));
4593 #endif
4594 
4595     __ push(rbx); // Save RBX
4596     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
4597     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
4598     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
4599     __ movptr(pos, 0);
4600 
4601     // Use the partially used encrypted counter from the last invocation
4602     __ BIND(L_preLoop_start);
4603     __ cmpptr(used, 16);
4604     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
4605       __ cmpptr(len_reg, 0);
4606       __ jcc(Assembler::lessEqual, L_exit_preLoop);
4607       __ movb(rbx, Address(saved_encCounter_start, used));
4608       __ xorb(rbx, Address(from, pos));
4609       __ movb(Address(to, pos), rbx);
4610       __ addptr(pos, 1);
4611       __ addptr(used, 1);
4612       __ subptr(len_reg, 1);
4613 
4614     __ jmp(L_preLoop_start);
4615 
4616     __ BIND(L_exit_preLoop);
4617     __ movl(Address(used_addr, 0), used);
4618 
4619     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
4620     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
4621     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4622     __ cmpl(rbx, 52);
4623     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
4624     __ cmpl(rbx, 60);
4625     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
4626 
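     // Apply the given packed instruction to the six parallel result registers.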
4627 #define CTR_DoSix(opc, src_reg)                \
4628     __ opc(xmm_result0, src_reg);              \
4629     __ opc(xmm_result1, src_reg);              \
4630     __ opc(xmm_result2, src_reg);              \
4631     __ opc(xmm_result3, src_reg);              \
4632     __ opc(xmm_result4, src_reg);              \
4633     __ opc(xmm_result5, src_reg);
4634 
4635     // k == 0 :  generate code for key_128
4636     // k == 1 :  generate code for key_192
4637     // k == 2 :  generate code for key_256
4638     for (int k = 0; k < 3; ++k) {
4639       // multi-block loop starts here
4640       __ align(OptoLoopAlignment);
4641       __ BIND(L_multiBlock_loopTop[k]);
4642       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
4643       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
4644       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4645 
4646       //load, then increase counters
4647       CTR_DoSix(movdqa, xmm_curr_counter);
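           // xmm_result0 keeps the current counter value; results 1-5 become counter+1 .. counter+5, and xmm_curr_counter advances by 6 for the next iteration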
4648       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
4649       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
4650       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
4651       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
4652       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
4653       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
4654       CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
4655       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
4656 
4657       //load two ROUND_KEYs at a time
4658       for (int i = 1; i < rounds[k]; ) {
4659         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4660         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4661         CTR_DoSix(aesenc, xmm_key_tmp1);
4662         i++;
4663         if (i != rounds[k]) {
4664           CTR_DoSix(aesenc, xmm_key_tmp0);
4665         } else {
4666           CTR_DoSix(aesenclast, xmm_key_tmp0);
4667         }
4668         i++;
4669       }
4670 
4671       // get next PARALLEL_FACTOR blocks into xmm_result registers
4672       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4673       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4674       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4675       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4676       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4677       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4678 
4679       __ pxor(xmm_result0, xmm_from0);
4680       __ pxor(xmm_result1, xmm_from1);
4681       __ pxor(xmm_result2, xmm_from2);
4682       __ pxor(xmm_result3, xmm_from3);
4683       __ pxor(xmm_result4, xmm_from4);
4684       __ pxor(xmm_result5, xmm_from5);
4685 
4686       // store 6 results into the next 96 bytes of output
4687       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4688       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4689       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4690       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4691       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4692       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4693 
4694       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
4695       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4696       __ jmp(L_multiBlock_loopTop[k]);
4697 
4698       // singleBlock starts here
4699       __ align(OptoLoopAlignment);
4700       __ BIND(L_singleBlockLoopTop[k]);
4701       __ cmpptr(len_reg, 0);
4702       __ jcc(Assembler::lessEqual, L_exit);
4703       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4704       __ movdqa(xmm_result0, xmm_curr_counter);
4705       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4706       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4707       __ pxor(xmm_result0, xmm_key_tmp0);
4708       for (int i = 1; i < rounds[k]; i++) {
4709         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4710         __ aesenc(xmm_result0, xmm_key_tmp0);
4711       }
4712       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4713       __ aesenclast(xmm_result0, xmm_key_tmp0);
4714       __ cmpptr(len_reg, AESBlockSize);
4715       __ jcc(Assembler::less, L_processTail_insr[k]);
4716         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4717         __ pxor(xmm_result0, xmm_from0);
4718         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4719         __ addptr(pos, AESBlockSize);
4720         __ subptr(len_reg, AESBlockSize);
4721         __ jmp(L_singleBlockLoopTop[k]);
4722       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4723         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4724         __ testptr(len_reg, 8);
4725         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4726           __ subptr(pos,8);
4727           __ pinsrq(xmm_from0, Address(from, pos), 0);
4728         __ BIND(L_processTail_4_insr[k]);
4729         __ testptr(len_reg, 4);
4730         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4731           __ subptr(pos,4);
4732           __ pslldq(xmm_from0, 4);
4733           __ pinsrd(xmm_from0, Address(from, pos), 0);
4734         __ BIND(L_processTail_2_insr[k]);
4735         __ testptr(len_reg, 2);
4736         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4737           __ subptr(pos, 2);
4738           __ pslldq(xmm_from0, 2);
4739           __ pinsrw(xmm_from0, Address(from, pos), 0);
4740         __ BIND(L_processTail_1_insr[k]);
4741         __ testptr(len_reg, 1);
4742         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4743           __ subptr(pos, 1);
4744           __ pslldq(xmm_from0, 1);
4745           __ pinsrb(xmm_from0, Address(from, pos), 0);
4746         __ BIND(L_processTail_exit_insr[k]);
4747 
4748         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Perform pxor of the encrypted counter and plaintext Bytes.
4749         __ pxor(xmm_result0, xmm_from0);                             //    Also the encrypted counter is saved for next invocation.
4750 
4751         __ testptr(len_reg, 8);
4752         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4753           __ pextrq(Address(to, pos), xmm_result0, 0);
4754           __ psrldq(xmm_result0, 8);
4755           __ addptr(pos, 8);
4756         __ BIND(L_processTail_4_extr[k]);
4757         __ testptr(len_reg, 4);
4758         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4759           __ pextrd(Address(to, pos), xmm_result0, 0);
4760           __ psrldq(xmm_result0, 4);
4761           __ addptr(pos, 4);
4762         __ BIND(L_processTail_2_extr[k]);
4763         __ testptr(len_reg, 2);
4764         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4765           __ pextrw(Address(to, pos), xmm_result0, 0);
4766           __ psrldq(xmm_result0, 2);
4767           __ addptr(pos, 2);
4768         __ BIND(L_processTail_1_extr[k]);
4769         __ testptr(len_reg, 1);
4770         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4771           __ pextrb(Address(to, pos), xmm_result0, 0);
4772 
4773         __ BIND(L_processTail_exit_extr[k]);
4774         __ movl(Address(used_addr, 0), len_reg);
4775         __ jmp(L_exit);
4776 
4777     }
4778 
4779     __ BIND(L_exit);
4780     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4781     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4782     __ pop(rbx); // pop the saved RBX.
4783 #ifdef _WIN64
4784     __ movl(rax, len_mem);
4785     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4786     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4787     __ addptr(rsp, 2 * wordSize);
4788 #else
4789     __ pop(rax); // return 'len'
4790 #endif
4791     __ leave(); // required for proper stackwalking of RuntimeStub frame
4792     __ ret(0);
4793     return start;
4794   }
4795 
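     // Apply one AES decryption round with the given 512-bit round key to the eight block groups held in xmm1..xmm8.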
4796 void roundDec(XMMRegister xmm_reg) {
4797   __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4798   __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4799   __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4800   __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4801   __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4802   __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4803   __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4804   __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4805 }
4806 
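     // Apply the final AES decryption round (vaesdeclast) with the given round key to the eight block groups held in xmm1..xmm8.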
4807 void roundDeclast(XMMRegister xmm_reg) {
4808   __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4809   __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4810   __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4811   __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4812   __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4813   __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4814   __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4815   __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4816 }
4817 
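       // Load a 128-bit round key, shuffle its bytes into the order expected by the AES instructions, and broadcast it to all four 128-bit lanes of the destination register.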
4818   void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
4819     __ movdqu(xmmdst, Address(key, offset));
4820     if (xmm_shuf_mask != NULL) {
4821       __ pshufb(xmmdst, xmm_shuf_mask);
4822     } else {
4823       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4824     }
4825     __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
4826 
4827   }
4828 
4829 address generate_cipherBlockChaining_decryptVectorAESCrypt() {
4830     assert(VM_Version::supports_avx512_vaes(), "need AES instructions and misaligned SSE support");
4831     __ align(CodeEntryAlignment);
4832     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4833     address start = __ pc();
4834 
4835     const Register from = c_rarg0;  // source array address
4836     const Register to = c_rarg1;  // destination array address
4837     const Register key = c_rarg2;  // key array address
4838     const Register rvec = c_rarg3;  // r byte array initialized from initvector array address
4839     // and left with the results of the last encryption block
4840 #ifndef _WIN64
4841     const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
4842 #else
4843     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
4844     const Register len_reg = r11;      // pick the volatile windows register
4845 #endif
4846 
4847     Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
4848           Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
4849 
4850     __ enter();
4851 
4852 #ifdef _WIN64
4853   // on win64, fill len_reg from stack position
4854     __ movl(len_reg, len_mem);
4855 #else
4856     __ push(len_reg); // Save
4857 #endif
4858     __ push(rbx);
4859     __ vzeroupper();
4860 
4861     // Temporary variable declaration for swapping key bytes
4862     const XMMRegister xmm_key_shuf_mask = xmm1;
4863     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4864 
4865     // Load the expanded key length in ints, which determines the number of rounds: 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds
4866     const Register rounds = rbx;
4867     __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4868 
4869     const XMMRegister IV = xmm0;
4870     // Load IV and broadcast value to 512-bits
4871     __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
4872 
4873     // Temporary variables for storing round keys
4874     const XMMRegister RK0 = xmm30;
4875     const XMMRegister RK1 = xmm9;
4876     const XMMRegister RK2 = xmm18;
4877     const XMMRegister RK3 = xmm19;
4878     const XMMRegister RK4 = xmm20;
4879     const XMMRegister RK5 = xmm21;
4880     const XMMRegister RK6 = xmm22;
4881     const XMMRegister RK7 = xmm23;
4882     const XMMRegister RK8 = xmm24;
4883     const XMMRegister RK9 = xmm25;
4884     const XMMRegister RK10 = xmm26;
4885 
4886      // Load and shuffle key
4887     // the java expanded key ordering is rotated one position from what we want
4888     // so we start from 1*16 here and hit 0*16 last
4889     ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
4890     ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
4891     ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
4892     ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
4893     ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
4894     ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
4895     ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
4896     ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
4897     ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
4898     ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
4899     ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
4900 
4901     // Variables for storing source cipher text
4902     const XMMRegister S0 = xmm10;
4903     const XMMRegister S1 = xmm11;
4904     const XMMRegister S2 = xmm12;
4905     const XMMRegister S3 = xmm13;
4906     const XMMRegister S4 = xmm14;
4907     const XMMRegister S5 = xmm15;
4908     const XMMRegister S6 = xmm16;
4909     const XMMRegister S7 = xmm17;
4910 
4911     // Variables for storing decrypted text
4912     const XMMRegister B0 = xmm1;
4913     const XMMRegister B1 = xmm2;
4914     const XMMRegister B2 = xmm3;
4915     const XMMRegister B3 = xmm4;
4916     const XMMRegister B4 = xmm5;
4917     const XMMRegister B5 = xmm6;
4918     const XMMRegister B6 = xmm7;
4919     const XMMRegister B7 = xmm8;
4920 
4921     __ cmpl(rounds, 44);
4922     __ jcc(Assembler::greater, KEY_192);
4923     __ jmp(Loop);
4924 
4925     __ BIND(KEY_192);
4926     const XMMRegister RK11 = xmm27;
4927     const XMMRegister RK12 = xmm28;
4928     ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
4929     ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
4930 
4931     __ cmpl(rounds, 52);
4932     __ jcc(Assembler::greater, KEY_256);
4933     __ jmp(Loop);
4934 
4935     __ BIND(KEY_256);
4936     const XMMRegister RK13 = xmm29;
4937     const XMMRegister RK14 = xmm31;
4938     ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
4939     ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
4940 
4941     __ BIND(Loop);
4942     __ cmpl(len_reg, 512);
4943     __ jcc(Assembler::below, Lcbc_dec_rem);
4944     __ BIND(Loop1);
4945     __ subl(len_reg, 512);
4946     __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
4947     __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
4948     __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
4949     __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
4950     __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
4951     __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
4952     __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
4953     __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
4954     __ leaq(from, Address(from, 8 * 64));
4955 
4956     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4957     __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
4958     __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
4959     __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
4960     __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
4961     __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
4962     __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
4963     __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
4964 
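         // Build the previous-ciphertext vectors for the CBC XOR below: IV and S0..S6 each become the last block of their original contents followed by the first three blocks of the next 64-byte group.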
4965     __ evalignq(IV, S0, IV, 0x06);
4966     __ evalignq(S0, S1, S0, 0x06);
4967     __ evalignq(S1, S2, S1, 0x06);
4968     __ evalignq(S2, S3, S2, 0x06);
4969     __ evalignq(S3, S4, S3, 0x06);
4970     __ evalignq(S4, S5, S4, 0x06);
4971     __ evalignq(S5, S6, S5, 0x06);
4972     __ evalignq(S6, S7, S6, 0x06);
4973 
4974     roundDec(RK2);
4975     roundDec(RK3);
4976     roundDec(RK4);
4977     roundDec(RK5);
4978     roundDec(RK6);
4979     roundDec(RK7);
4980     roundDec(RK8);
4981     roundDec(RK9);
4982     roundDec(RK10);
4983 
4984     __ cmpl(rounds, 44);
4985     __ jcc(Assembler::belowEqual, L_128);
4986     roundDec(RK11);
4987     roundDec(RK12);
4988 
4989     __ cmpl(rounds, 52);
4990     __ jcc(Assembler::belowEqual, L_192);
4991     roundDec(RK13);
4992     roundDec(RK14);
4993 
4994     __ BIND(L_256);
4995     roundDeclast(RK0);
4996     __ jmp(Loop2);
4997 
4998     __ BIND(L_128);
4999     roundDeclast(RK0);
5000     __ jmp(Loop2);
5001 
5002     __ BIND(L_192);
5003     roundDeclast(RK0);
5004 
5005     __ BIND(Loop2);
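         // XOR each decrypted group with its previous-ciphertext vector; S7 (the last four ciphertext blocks of this chunk) becomes the chaining value for the next iteration.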
5006     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5007     __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
5008     __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
5009     __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
5010     __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
5011     __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
5012     __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
5013     __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
5014     __ evmovdquq(IV, S7, Assembler::AVX_512bit);
5015 
5016     __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
5017     __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
5018     __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
5019     __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
5020     __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
5021     __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
5022     __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
5023     __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
5024     __ leaq(to, Address(to, 8 * 64));
5025     __ jmp(Loop);
5026 
5027     __ BIND(Lcbc_dec_rem);
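         // Bring the most recent ciphertext block (the high 128-bit lane of IV) down to the low lane for the 16-byte remainder loop.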
5028     __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
5029 
5030     __ BIND(Lcbc_dec_rem_loop);
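         // Process one 16-byte block per iteration; a borrow from the subtraction means fewer than 16 bytes remain.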
5031     __ subl(len_reg, 16);
5032     __ jcc(Assembler::carrySet, Lcbc_dec_ret);
5033 
5034     __ movdqu(S0, Address(from, 0));
5035     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5036     __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
5037     __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
5038     __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
5039     __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
5040     __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
5041     __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
5042     __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
5043     __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
5044     __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
5045     __ cmpl(rounds, 44);
5046     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5047 
5048     __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
5049     __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
5050     __ cmpl(rounds, 52);
5051     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5052 
5053     __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
5054     __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
5055 
5056     __ BIND(Lcbc_dec_rem_last);
5057     __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
5058 
5059     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5060     __ evmovdquq(IV, S0, Assembler::AVX_512bit);
5061     __ movdqu(Address(to, 0), B0);
5062     __ leaq(from, Address(from, 16));
5063     __ leaq(to, Address(to, 16));
5064     __ jmp(Lcbc_dec_rem_loop);
5065 
5066     __ BIND(Lcbc_dec_ret);
5067     __ movdqu(Address(rvec, 0), IV);
5068 
5069     // Zero out the round keys
5070     __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
5071     __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
5072     __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
5073     __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
5074     __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
5075     __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
5076     __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
5077     __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
5078     __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
5079     __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
5080     __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
5081     __ cmpl(rounds, 44);
5082     __ jcc(Assembler::belowEqual, Lcbc_exit);
5083     __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
5084     __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
5085     __ cmpl(rounds, 52);
5086     __ jcc(Assembler::belowEqual, Lcbc_exit);
5087     __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
5088     __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
5089 
5090     __ BIND(Lcbc_exit);
5091     __ pop(rbx);
5092 #ifdef _WIN64
5093     __ movl(rax, len_mem);
5094 #else
5095     __ pop(rax); // return length
5096 #endif
5097     __ leave(); // required for proper stackwalking of RuntimeStub frame
5098     __ ret(0);
5099     return start;
5100 }
5101 
5102 // Polynomial x^128+x^127+x^126+x^121+1
5103 address ghash_polynomial_addr() {
5104     __ align(CodeEntryAlignment);
5105     StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
5106     address start = __ pc();
5107     __ emit_data64(0x0000000000000001, relocInfo::none);
5108     __ emit_data64(0xc200000000000000, relocInfo::none);
5109     return start;
5110 }
5111 
5112 address ghash_shufflemask_addr() {
5113     __ align(CodeEntryAlignment);
5114     StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
5115     address start = __ pc();
5116     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5117     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5118     return start;
5119 }
5120 
5121 // Ghash single and multi block operations using AVX instructions
5122 address generate_avx_ghash_processBlocks() {
5123     __ align(CodeEntryAlignment);
5124 
5125     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5126     address start = __ pc();
5127 
5128     // arguments
5129     const Register state = c_rarg0;
5130     const Register htbl = c_rarg1;
5131     const Register data = c_rarg2;
5132     const Register blocks = c_rarg3;
5133     __ enter();
5134    // Save state before entering routine
5135     __ avx_ghash(state, htbl, data, blocks);
5136     __ leave(); // required for proper stackwalking of RuntimeStub frame
5137     __ ret(0);
5138     return start;
5139 }
5140 
5141   // byte swap x86 long
5142   address generate_ghash_long_swap_mask() {
5143     __ align(CodeEntryAlignment);
5144     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
5145     address start = __ pc();
5146     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
5147     __ emit_data64(0x0706050403020100, relocInfo::none );
5148   return start;
5149   }
5150 
5151   // byte swap x86 byte array
5152   address generate_ghash_byte_swap_mask() {
5153     __ align(CodeEntryAlignment);
5154     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
5155     address start = __ pc();
5156     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
5157     __ emit_data64(0x0001020304050607, relocInfo::none );
5158   return start;
5159   }
5160 
5161   /* Single and multi-block ghash operations */
5162   address generate_ghash_processBlocks() {
5163     __ align(CodeEntryAlignment);
5164     Label L_ghash_loop, L_exit;
5165     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5166     address start = __ pc();
5167 
5168     const Register state        = c_rarg0;
5169     const Register subkeyH      = c_rarg1;
5170     const Register data         = c_rarg2;
5171     const Register blocks       = c_rarg3;
5172 
5173     const XMMRegister xmm_temp0 = xmm0;
5174     const XMMRegister xmm_temp1 = xmm1;
5175     const XMMRegister xmm_temp2 = xmm2;
5176     const XMMRegister xmm_temp3 = xmm3;
5177     const XMMRegister xmm_temp4 = xmm4;
5178     const XMMRegister xmm_temp5 = xmm5;
5179     const XMMRegister xmm_temp6 = xmm6;
5180     const XMMRegister xmm_temp7 = xmm7;
5181     const XMMRegister xmm_temp8 = xmm8;
5182     const XMMRegister xmm_temp9 = xmm9;
5183     const XMMRegister xmm_temp10 = xmm10;
5184 
5185     __ enter();
5186 
5187     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
5188 
5189     __ movdqu(xmm_temp0, Address(state, 0));
5190     __ pshufb(xmm_temp0, xmm_temp10);
5191 
5192 
5193     __ BIND(L_ghash_loop);
5194     __ movdqu(xmm_temp2, Address(data, 0));
5195     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
5196 
5197     __ movdqu(xmm_temp1, Address(subkeyH, 0));
5198     __ pshufb(xmm_temp1, xmm_temp10);
5199 
5200     __ pxor(xmm_temp0, xmm_temp2);
5201 
5202     //
5203     // Multiply with the hash key
5204     //
5205     __ movdqu(xmm_temp3, xmm_temp0);
5206     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
5207     __ movdqu(xmm_temp4, xmm_temp0);
5208     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
5209 
5210     __ movdqu(xmm_temp5, xmm_temp0);
5211     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
5212     __ movdqu(xmm_temp6, xmm_temp0);
5213     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
5214 
5215     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
5216 
5217     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
5218     __ psrldq(xmm_temp4, 8);    // shift xmm4 by 64 bits to the right
5219     __ pslldq(xmm_temp5, 8);    // shift xmm5 by 64 bits to the left
5220     __ pxor(xmm_temp3, xmm_temp5);
5221     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
5222                                         // of the carry-less multiplication of
5223                                         // xmm0 by xmm1.
5224 
5225     // We shift the result of the multiplication by one bit position
5226     // to the left to cope for the fact that the bits are reversed.
5227     __ movdqu(xmm_temp7, xmm_temp3);
5228     __ movdqu(xmm_temp8, xmm_temp6);
5229     __ pslld(xmm_temp3, 1);
5230     __ pslld(xmm_temp6, 1);
5231     __ psrld(xmm_temp7, 31);
5232     __ psrld(xmm_temp8, 31);
5233     __ movdqu(xmm_temp9, xmm_temp7);
5234     __ pslldq(xmm_temp8, 4);
5235     __ pslldq(xmm_temp7, 4);
5236     __ psrldq(xmm_temp9, 12);
5237     __ por(xmm_temp3, xmm_temp7);
5238     __ por(xmm_temp6, xmm_temp8);
5239     __ por(xmm_temp6, xmm_temp9);
5240 
5241     //
5242     // First phase of the reduction
5243     //
5244     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
5245     // independently.
5246     __ movdqu(xmm_temp7, xmm_temp3);
5247     __ movdqu(xmm_temp8, xmm_temp3);
5248     __ movdqu(xmm_temp9, xmm_temp3);
5249     __ pslld(xmm_temp7, 31);    // packed left shift by 31
5250     __ pslld(xmm_temp8, 30);    // packed left shift by 30
5251     __ pslld(xmm_temp9, 25);    // packed left shift by 25
5252     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
5253     __ pxor(xmm_temp7, xmm_temp9);
5254     __ movdqu(xmm_temp8, xmm_temp7);
5255     __ pslldq(xmm_temp7, 12);
5256     __ psrldq(xmm_temp8, 4);
5257     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
5258 
5259     //
5260     // Second phase of the reduction
5261     //
5262     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
5263     // shift operations.
5264     __ movdqu(xmm_temp2, xmm_temp3);
5265     __ movdqu(xmm_temp4, xmm_temp3);
5266     __ movdqu(xmm_temp5, xmm_temp3);
5267     __ psrld(xmm_temp2, 1);     // packed right shift by 1
5268     __ psrld(xmm_temp4, 2);     // packed right shift by 2
5269     __ psrld(xmm_temp5, 7);     // packed right shift by 7
5270     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
5271     __ pxor(xmm_temp2, xmm_temp5);
5272     __ pxor(xmm_temp2, xmm_temp8);
5273     __ pxor(xmm_temp3, xmm_temp2);
5274     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
5275 
5276     __ decrement(blocks);
5277     __ jcc(Assembler::zero, L_exit);
5278     __ movdqu(xmm_temp0, xmm_temp6);
5279     __ addptr(data, 16);
5280     __ jmp(L_ghash_loop);
5281 
5282     __ BIND(L_exit);
5283     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
5284     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
5285     __ leave();
5286     __ ret(0);
5287     return start;
5288   }
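
  // Scalar reference for a single GHASH block update (a sketch kept for
  // documentation only; it is not called by the generated stub).  This is the
  // right-shifting GF(2^128) multiply from NIST SP 800-38D, reducing by the
  // GHASH polynomial x^128 + x^7 + x^2 + x + 1 (the constant R = 0xe1 followed
  // by 120 zero bits).  The stub above computes the same product with
  // pclmulqdq on byte-reflected operands.
  static void ghash_reference_block(unsigned char state[16],
                                    const unsigned char subkeyH[16],
                                    const unsigned char data[16]) {
    unsigned char x[16];
    unsigned char v[16];
    unsigned char z[16] = { 0 };
    for (int i = 0; i < 16; i++) {
      x[i] = (unsigned char)(state[i] ^ data[i]);    // X = state ^ data
      v[i] = subkeyH[i];                             // V = H
    }
    for (int i = 0; i < 128; i++) {
      if ((x[i / 8] >> (7 - (i % 8))) & 1) {         // bit i of X, MSB first
        for (int j = 0; j < 16; j++) {
          z[j] ^= v[j];                              // Z ^= V
        }
      }
      bool carry = (v[15] & 1) != 0;
      for (int j = 15; j > 0; j--) {                 // V >>= 1 (as a 128-bit value)
        v[j] = (unsigned char)((v[j] >> 1) | (v[j - 1] << 7));
      }
      v[0] = (unsigned char)(v[0] >> 1);
      if (carry) {
        v[0] ^= 0xe1;                                // reduce: V ^= R
      }
    }
    for (int i = 0; i < 16; i++) {
      state[i] = z[i];                               // new state = Z
    }
  }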
5289 
5290   address base64_shuffle_addr()
5291   {
5292     __ align64();
5293     StubCodeMark mark(this, "StubRoutines", "shuffle_base64");
5294     address start = __ pc();
5295     assert(((unsigned long long)start & 0x3f) == 0,
5296            "Alignment problem (0x%08llx)", (unsigned long long)start);
5297     __ emit_data64(0x0405030401020001, relocInfo::none);
5298     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5299     __ emit_data64(0x10110f100d0e0c0d, relocInfo::none);
5300     __ emit_data64(0x1617151613141213, relocInfo::none);
5301     __ emit_data64(0x1c1d1b1c191a1819, relocInfo::none);
5302     __ emit_data64(0x222321221f201e1f, relocInfo::none);
5303     __ emit_data64(0x2829272825262425, relocInfo::none);
5304     __ emit_data64(0x2e2f2d2e2b2c2a2b, relocInfo::none);
5305     return start;
5306   }
5307 
5308   address base64_avx2_shuffle_addr()
5309   {
5310     __ align32();
5311     StubCodeMark mark(this, "StubRoutines", "avx2_shuffle_base64");
5312     address start = __ pc();
5313     __ emit_data64(0x0809070805060405, relocInfo::none);
5314     __ emit_data64(0x0e0f0d0e0b0c0a0b, relocInfo::none);
5315     __ emit_data64(0x0405030401020001, relocInfo::none);
5316     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5317     return start;
5318   }
5319 
5320   address base64_avx2_input_mask_addr()
5321   {
5322     __ align32();
5323     StubCodeMark mark(this, "StubRoutines", "avx2_input_mask_base64");
5324     address start = __ pc();
5325     __ emit_data64(0x8000000000000000, relocInfo::none);
5326     __ emit_data64(0x8000000080000000, relocInfo::none);
5327     __ emit_data64(0x8000000080000000, relocInfo::none);
5328     __ emit_data64(0x8000000080000000, relocInfo::none);
5329     return start;
5330   }
5331 
5332   address base64_avx2_lut_addr()
5333   {
5334     __ align32();
5335     StubCodeMark mark(this, "StubRoutines", "avx2_lut_base64");
5336     address start = __ pc();
5337     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5338     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5339     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5340     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5341 
5342     // URL LUT
5343     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5344     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5345     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5346     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5347     return start;
5348   }
5349 
5350   address base64_encoding_table_addr()
5351   {
5352     __ align64();
5353     StubCodeMark mark(this, "StubRoutines", "encoding_table_base64");
5354     address start = __ pc();
5355     assert(((unsigned long long)start & 0x3f) == 0, "Alignment problem (0x%08llx)", (unsigned long long)start);
5356     __ emit_data64(0x4847464544434241, relocInfo::none);
5357     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5358     __ emit_data64(0x5857565554535251, relocInfo::none);
5359     __ emit_data64(0x6665646362615a59, relocInfo::none);
5360     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5361     __ emit_data64(0x767574737271706f, relocInfo::none);
5362     __ emit_data64(0x333231307a797877, relocInfo::none);
5363     __ emit_data64(0x2f2b393837363534, relocInfo::none);
5364 
5365     // URL table
5366     __ emit_data64(0x4847464544434241, relocInfo::none);
5367     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5368     __ emit_data64(0x5857565554535251, relocInfo::none);
5369     __ emit_data64(0x6665646362615a59, relocInfo::none);
5370     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5371     __ emit_data64(0x767574737271706f, relocInfo::none);
5372     __ emit_data64(0x333231307a797877, relocInfo::none);
5373     __ emit_data64(0x5f2d393837363534, relocInfo::none);
5374     return start;
5375   }
5376 
5377   // Code for generating Base64 encoding.
5378   // Intrinsic function prototype in Base64.java:
5379   // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp,
5380   // boolean isURL) {
5381   address generate_base64_encodeBlock()
5382   {
5383     __ align(CodeEntryAlignment);
5384     StubCodeMark mark(this, "StubRoutines", "implEncode");
5385     address start = __ pc();
5386     __ enter();
5387 
5388     // Save callee-saved registers before using them
5389     __ push(r12);
5390     __ push(r13);
5391     __ push(r14);
5392     __ push(r15);
5393 
5394     // arguments
5395     const Register source = c_rarg0;       // Source Array
5396     const Register start_offset = c_rarg1; // start offset
5397     const Register end_offset = c_rarg2;   // end offset
5398     const Register dest = c_rarg3;   // destination array
5399 
5400 #ifndef _WIN64
5401     const Register dp = c_rarg4;    // Position for writing to dest array
5402     const Register isURL = c_rarg5; // Base64 or URL character set
5403 #else
5404     const Address dp_mem(rbp, 6 * wordSize); // dp is passed on the stack on Win64
5405     const Address isURL_mem(rbp, 7 * wordSize);
5406     const Register isURL = r10; // pick a volatile Windows register
5407     const Register dp = r12;
5408     __ movl(dp, dp_mem);
5409     __ movl(isURL, isURL_mem);
5410 #endif
5411 
5412     const Register length = r14;
5413     const Register encode_table = r13;
5414     Label L_process3, L_exit, L_processdata, L_vbmiLoop, L_not512, L_32byteLoop;
5415 
5416     // calculate length from offsets
5417     __ movl(length, end_offset);
5418     __ subl(length, start_offset);
5419     __ cmpl(length, 0);
5420     __ jcc(Assembler::lessEqual, L_exit);
5421 
5422     // Code for 512-bit VBMI encoding.  Encodes 48 input bytes into 64
5423     // output bytes. We read 64 input bytes and ignore the last 16, so be
5424     // sure not to read past the end of the input buffer.
5425     if (VM_Version::supports_avx512_vbmi()) {
5426       __ cmpl(length, 64); // Do not overrun input buffer.
5427       __ jcc(Assembler::below, L_not512);
5428 
5429       __ shll(isURL, 6); // index into the encoding table based on isURL
5430       __ lea(encode_table, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5431       __ addptr(encode_table, isURL);
5432       __ shrl(isURL, 6); // restore isURL
5433 
5434       __ mov64(rax, 0x3036242a1016040aull); // Shifts
5435       __ evmovdquq(xmm3, ExternalAddress(StubRoutines::x86::base64_shuffle_addr()), Assembler::AVX_512bit, r15);
5436       __ evmovdquq(xmm2, Address(encode_table, 0), Assembler::AVX_512bit);
5437       __ evpbroadcastq(xmm1, rax, Assembler::AVX_512bit);
5438 
5439       __ align32();
5440       __ BIND(L_vbmiLoop);
5441 
5442       __ vpermb(xmm0, xmm3, Address(source, start_offset), Assembler::AVX_512bit);
5443       __ subl(length, 48);
5444 
5445       // Put the input bytes into the proper lanes for writing, then
5446       // encode them.
5447       __ evpmultishiftqb(xmm0, xmm1, xmm0, Assembler::AVX_512bit);
5448       __ vpermb(xmm0, xmm0, xmm2, Assembler::AVX_512bit);
5449 
5450       // Write to destination
5451       __ evmovdquq(Address(dest, dp), xmm0, Assembler::AVX_512bit);
5452 
5453       __ addptr(dest, 64);
5454       __ addptr(source, 48);
5455       __ cmpl(length, 64);
5456       __ jcc(Assembler::aboveEqual, L_vbmiLoop);
5457 
5458       __ vzeroupper();
5459     }
5460 
5461     __ BIND(L_not512);
5462     if (VM_Version::supports_avx2()
5463         && VM_Version::supports_avx512vlbw()) {
5464       /*
5465       ** This AVX2 encoder is based on the paper at:
5466       **      https://dl.acm.org/doi/10.1145/3132709
5467       **
5468       ** We use AVX2 SIMD instructions to encode 24 bytes into 32
5469       ** output bytes.
5470       **
5471       */
5472       // Lengths under 32 bytes are done with the scalar routine
5473       __ cmpl(length, 31);
5474       __ jcc(Assembler::belowEqual, L_process3);
5475 
5476       // Set up supporting constant table data
5477       __ vmovdqu(xmm9, ExternalAddress(StubRoutines::x86::base64_avx2_shuffle_addr()), rax);
5478       // 6-bit mask for 2nd and 4th (and multiples) 6-bit values
5479       __ movl(rax, 0x0fc0fc00);
5480       __ vmovdqu(xmm1, ExternalAddress(StubRoutines::x86::base64_avx2_input_mask_addr()), rax);
5481       __ evpbroadcastd(xmm8, rax, Assembler::AVX_256bit);
5482 
5483       // Multiplication constant for "shifting" right by 6 and 10
5484       // bits
5485       __ movl(rax, 0x04000040);
5486 
5487       __ subl(length, 24);
5488       __ evpbroadcastd(xmm7, rax, Assembler::AVX_256bit);
5489 
5490       // For the first load, we mask off reading of the first 4
5491       // bytes into the register. This is so we can get 4 3-byte
5492       // chunks into each lane of the register, avoiding having to
5493       // handle end conditions.  We then shuffle these bytes into a
5494       // specific order so that manipulation is easier.
5495       //
5496       // The initial read loads the XMM register like this:
5497       //
5498       // Lower 128-bit lane:
5499       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5500       // | XX | XX | XX | XX | A0 | A1 | A2 | B0 | B1 | B2 | C0 | C1
5501       // | C2 | D0 | D1 | D2 |
5502       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5503       //
5504       // Upper 128-bit lane:
5505       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5506       // | E0 | E1 | E2 | F0 | F1 | F2 | G0 | G1 | G2 | H0 | H1 | H2
5507       // | XX | XX | XX | XX |
5508       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5509       //
5510       // Where A0 is the first input byte, B0 is the fourth, etc.
5511       // The alphabetical significance denotes the 3 bytes to be
5512       // consumed and encoded into 4 bytes.
5513       //
5514       // We then shuffle the register so each 32-bit word contains
5515       // the sequence:
5516       //    A1 A0 A2 A1, B1, B0, B2, B1, etc.
5517       // Each of these byte sequences are then manipulated into 4
5518       // 6-bit values ready for encoding.
5519       //
5520       // If we focus on one set of 3-byte chunks, changing the
5521       // nomenclature such that A0 => a, A1 => b, and A2 => c, we
5522       // shuffle such that each 24-bit chunk contains:
5523       //
5524       // b7 b6 b5 b4 b3 b2 b1 b0 | a7 a6 a5 a4 a3 a2 a1 a0 | c7 c6
5525       // c5 c4 c3 c2 c1 c0 | b7 b6 b5 b4 b3 b2 b1 b0
5526       // Reusing a..d to name the four 6-bit output values (a = high 6 bits of byte A0, d = low 6 bits of A2), the same word holds:
5527       // b3 b2 b1 b0 c5 c4 c3 c2 | c1 c0 d5 d4 d3 d2 d1 d0 | a5 a4
5528       // a3 a2 a1 a0 b5 b4 | b3 b2 b1 b0 c5 c4 c3 c2
5529       //
5530       // We first mask off all but bits 4-9 and 16-21 (c5..c0 and
5531       // a5..a0) and shift them using a vector multiplication
5532       // operation (vpmulhuw) which effectively shifts c right by 6
5533       // bits and a right by 10 bits.  We similarly mask bits 10-15
5534       // (d5..d0) and 22-27 (b5..b0) and shift them left by 8 and 4
5535       // bits respectively.  This is done using vpmullw.  We end up
5536       // with 4 6-bit values, thus splitting the 3 input bytes,
5537       // ready for encoding:
5538       //    0 0 d5..d0 0 0 c5..c0 0 0 b5..b0 0 0 a5..a0
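      //
      // For example (not part of the algorithm): the input bytes "Man" =
      // 0x4d,0x61,0x6e carry the 24 bits 010011 010110 000101 101110, i.e. the
      // 6-bit values a=19, b=22, c=5, d=46, which encode to "TWFu".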
5539       //
5540       // For translation, we recognize that there are 5 distinct
5541       // ranges of legal Base64 characters as below:
5542       //
5543       //   +-------------+-------------+------------+
5544       //   | 6-bit value | ASCII range |   offset   |
5545       //   +-------------+-------------+------------+
5546       //   |    0..25    |    A..Z     |     65     |
5547       //   |   26..51    |    a..z     |     71     |
5548       //   |   52..61    |    0..9     |     -4     |
5549       //   |     62      |   + or -    | -19 or -17 |
5550       //   |     63      |   / or _    | -16 or 32  |
5551       //   +-------------+-------------+------------+
5552       //
5553       // We note that vpshufb does a parallel lookup in a
5554       // destination register using the lower 4 bits of bytes from a
5555       // source register.  If we use a saturated subtraction and
5556       // subtract 51 from each 6-bit value, bytes from [0,51]
5557       // saturate to 0, and [52,63] map to a range of [1,12].  We
5558       // then add 1 (by subtracting the -1 compare mask) to the
5559       // reduced value of every input greater than 25, separating
5560       // the [0,25] and [26,51] ranges.  We end up with:
5561       //
5562       //   +-------------+-------------+------------+
5563       //   | 6-bit value |   Reduced   |   offset   |
5564       //   +-------------+-------------+------------+
5565       //   |    0..25    |      0      |     65     |
5566       //   |   26..51    |      1      |     71     |
5567       //   |   52..61    |    2..11    |     -4     |
5568       //   |     62      |     12      | -19 or -17 |
5569       //   |     63      |     13      | -16 or 32  |
5570       //   +-------------+-------------+------------+
5571       //
5572       // We then use a final vpshufb to add the appropriate offset,
5573       // translating the bytes.
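      //
      // For example: 6-bit value 0 reduces to index 0, whose table entry is 65,
      // giving 0 + 65 = 'A'; with the URL alphabet, value 63 reduces to index 13,
      // whose entry is 32, giving 63 + 32 = '_'.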
5574       //
5575       // Load input bytes - only 28 bytes.  Mask the first load to
5576       // not load into the full register.
5577       __ vpmaskmovd(xmm1, xmm1, Address(source, start_offset, Address::times_1, -4), Assembler::AVX_256bit);
5578 
5579       // Move 3-byte chunks of input (12 bytes) into 16 bytes,
5580       // ordering by:
5581       //   1, 0, 2, 1; 4, 3, 5, 4; etc.  This groups 6-bit chunks
5582       //   for easy masking
5583       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5584 
5585       __ addl(start_offset, 24);
5586 
5587       // Load masking register for first and third (and multiples)
5588       // 6-bit values.
5589       __ movl(rax, 0x003f03f0);
5590       __ evpbroadcastd(xmm6, rax, Assembler::AVX_256bit);
5591       // Multiplication constant for "shifting" left by 4 and 8 bits
5592       __ movl(rax, 0x01000010);
5593       __ evpbroadcastd(xmm5, rax, Assembler::AVX_256bit);
5594 
5595       // Isolate 6-bit chunks of interest
5596       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5597 
5598       // Load constants for encoding
5599       __ movl(rax, 0x19191919);
5600       __ evpbroadcastd(xmm3, rax, Assembler::AVX_256bit);
5601       __ movl(rax, 0x33333333);
5602       __ evpbroadcastd(xmm4, rax, Assembler::AVX_256bit);
5603 
5604       // Shift output bytes 0 and 2 into proper lanes
5605       __ vpmulhuw(xmm2, xmm0, xmm7, Assembler::AVX_256bit);
5606 
5607       // Mask and shift output bytes 1 and 3 into proper lanes and
5608       // combine
5609       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5610       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5611       __ vpor(xmm0, xmm0, xmm2, Assembler::AVX_256bit);
5612 
5613       // Determine which values are greater than 25.  Those get 1 added
5614       // to their reduced index (by subtracting the -1 compare mask),
5615       // which separates the 0..25 ('A'-'Z') range from the rest (see above)
5616       __ vpcmpgtb(xmm2, xmm0, xmm3, Assembler::AVX_256bit);
5617       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5618       __ vpsubb(xmm1, xmm1, xmm2, Assembler::AVX_256bit);
5619 
5620       // Load the proper lookup table
5621       __ lea(r11, ExternalAddress(StubRoutines::x86::base64_avx2_lut_addr()));
5622       __ movl(r15, isURL);
5623       __ shll(r15, 5);
5624       __ vmovdqu(xmm2, Address(r11, r15));
5625 
5626       // Shuffle the offsets based on the range calculation done
5627       // above. This allows us to add the correct offset to the
5628       // 6-bit value corresponding to the range documented above.
5629       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5630       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5631 
5632       // Store the encoded bytes
5633       __ vmovdqu(Address(dest, dp), xmm0);
5634       __ addl(dp, 32);
5635 
5636       __ cmpl(length, 31);
5637       __ jcc(Assembler::belowEqual, L_process3);
5638 
5639       __ align32();
5640       __ BIND(L_32byteLoop);
5641 
5642       // Get next 32 bytes
5643       __ vmovdqu(xmm1, Address(source, start_offset, Address::times_1, -4));
5644 
5645       __ subl(length, 24);
5646       __ addl(start_offset, 24);
5647 
5648       // This logic is identical to the above, with only constant
5649       // register loads removed.  Shuffle the input, mask off 6-bit
5650       // chunks, shift them into place, then add the offset to
5651       // encode.
5652       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5653 
5654       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5655       __ vpmulhuw(xmm10, xmm0, xmm7, Assembler::AVX_256bit);
5656       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5657       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5658       __ vpor(xmm0, xmm0, xmm10, Assembler::AVX_256bit);
5659       __ vpcmpgtb(xmm10, xmm0, xmm3, Assembler::AVX_256bit);
5660       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5661       __ vpsubb(xmm1, xmm1, xmm10, Assembler::AVX_256bit);
5662       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5663       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5664 
5665       // Store the encoded bytes
5666       __ vmovdqu(Address(dest, dp), xmm0);
5667       __ addl(dp, 32);
5668 
5669       __ cmpl(length, 31);
5670       __ jcc(Assembler::above, L_32byteLoop);
5671 
5672       __ BIND(L_process3);
5673       __ vzeroupper();
5674     } else {
5675       __ BIND(L_process3);
5676     }
5677 
5678     __ cmpl(length, 3);
5679     __ jcc(Assembler::below, L_exit);
5680 
5681     // Load the encoding table based on isURL
5682     __ lea(r11, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5683     __ movl(r15, isURL);
5684     __ shll(r15, 6);
5685     __ addptr(r11, r15);
5686 
5687     __ BIND(L_processdata);
5688 
5689     // Load 3 bytes
5690     __ load_unsigned_byte(r15, Address(source, start_offset));
5691     __ load_unsigned_byte(r10, Address(source, start_offset, Address::times_1, 1));
5692     __ load_unsigned_byte(r13, Address(source, start_offset, Address::times_1, 2));
5693 
5694     // Build a 32-bit word with bytes 1, 2, 0, 1
5695     __ movl(rax, r10);
5696     __ shll(r10, 24);
5697     __ orl(rax, r10);
5698 
5699     __ subl(length, 3);
5700 
5701     __ shll(r15, 8);
5702     __ shll(r13, 16);
5703     __ orl(rax, r15);
5704 
5705     __ addl(start_offset, 3);
5706 
5707     __ orl(rax, r13);
5708     // At this point, rax contains | byte1 | byte2 | byte0 | byte1
5709     // r13 has byte2 << 16 - need low-order 6 bits to translate.
5710     // This translated byte is the fourth output byte.
5711     __ shrl(r13, 16);
5712     __ andl(r13, 0x3f);
5713 
5714     // The high-order 6 bits of r15 (byte0) are translated.
5715     // The translated byte is the first output byte.
5716     __ shrl(r15, 10);
5717 
5718     __ load_unsigned_byte(r13, Address(r11, r13));
5719     __ load_unsigned_byte(r15, Address(r11, r15));
5720 
5721     __ movb(Address(dest, dp, Address::times_1, 3), r13);
5722 
5723     // Extract high-order 4 bits of byte1 and low-order 2 bits of byte0.
5724     // This translated byte is the second output byte.
5725     __ shrl(rax, 4);
5726     __ movl(r10, rax);
5727     __ andl(rax, 0x3f);
5728 
5729     __ movb(Address(dest, dp, Address::times_1, 0), r15);
5730 
5731     __ load_unsigned_byte(rax, Address(r11, rax));
5732 
5733     // Extract low-order 4 bits of byte1 and high-order 2 bits of byte2.
5734     // This translated byte is the third output byte.
5735     __ shrl(r10, 18);
5736     __ andl(r10, 0x3f);
5737 
5738     __ load_unsigned_byte(r10, Address(r11, r10));
5739 
5740     __ movb(Address(dest, dp, Address::times_1, 1), rax);
5741     __ movb(Address(dest, dp, Address::times_1, 2), r10);
5742 
5743     __ addl(dp, 4);
5744     __ cmpl(length, 3);
5745     __ jcc(Assembler::aboveEqual, L_processdata);
5746 
5747     __ BIND(L_exit);
5748     __ pop(r15);
5749     __ pop(r14);
5750     __ pop(r13);
5751     __ pop(r12);
5752     __ leave();
5753     __ ret(0);
5754     return start;
5755   }
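
  // Scalar reference for the 3-byte -> 4-character step performed by the
  // L_processdata tail above (a sketch for documentation only; it is not
  // called by the generated stub).  'table' is one of the two 64-entry
  // alphabets emitted by base64_encoding_table_addr().
  static void base64_encode_3_bytes_reference(const unsigned char* src,
                                              unsigned char* dst,
                                              const unsigned char* table) {
    unsigned int b0 = src[0], b1 = src[1], b2 = src[2];
    dst[0] = table[b0 >> 2];                            // high 6 bits of byte0
    dst[1] = table[((b0 & 0x03) << 4) | (b1 >> 4)];     // low 2 bits of byte0, high 4 bits of byte1
    dst[2] = table[((b1 & 0x0f) << 2) | (b2 >> 6)];     // low 4 bits of byte1, high 2 bits of byte2
    dst[3] = table[b2 & 0x3f];                          // low 6 bits of byte2
  }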
5756 
5757   // base64 AVX512vbmi tables
5758   address base64_vbmi_lookup_lo_addr() {
5759     __ align64();
5760     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64");
5761     address start = __ pc();
5762     assert(((unsigned long long)start & 0x3f) == 0,
5763            "Alignment problem (0x%08llx)", (unsigned long long)start);
5764     __ emit_data64(0x8080808080808080, relocInfo::none);
5765     __ emit_data64(0x8080808080808080, relocInfo::none);
5766     __ emit_data64(0x8080808080808080, relocInfo::none);
5767     __ emit_data64(0x8080808080808080, relocInfo::none);
5768     __ emit_data64(0x8080808080808080, relocInfo::none);
5769     __ emit_data64(0x3f8080803e808080, relocInfo::none);
5770     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5771     __ emit_data64(0x8080808080803d3c, relocInfo::none);
5772     return start;
5773   }
5774 
5775   address base64_vbmi_lookup_hi_addr() {
5776     __ align64();
5777     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64");
5778     address start = __ pc();
5779     assert(((unsigned long long)start & 0x3f) == 0,
5780            "Alignment problem (0x%08llx)", (unsigned long long)start);
5781     __ emit_data64(0x0605040302010080, relocInfo::none);
5782     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5783     __ emit_data64(0x161514131211100f, relocInfo::none);
5784     __ emit_data64(0x8080808080191817, relocInfo::none);
5785     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5786     __ emit_data64(0x2827262524232221, relocInfo::none);
5787     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5788     __ emit_data64(0x8080808080333231, relocInfo::none);
5789     return start;
5790   }
5791   address base64_vbmi_lookup_lo_url_addr() {
5792     __ align64();
5793     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64url");
5794     address start = __ pc();
5795     assert(((unsigned long long)start & 0x3f) == 0,
5796            "Alignment problem (0x%08llx)", (unsigned long long)start);
5797     __ emit_data64(0x8080808080808080, relocInfo::none);
5798     __ emit_data64(0x8080808080808080, relocInfo::none);
5799     __ emit_data64(0x8080808080808080, relocInfo::none);
5800     __ emit_data64(0x8080808080808080, relocInfo::none);
5801     __ emit_data64(0x8080808080808080, relocInfo::none);
5802     __ emit_data64(0x80803e8080808080, relocInfo::none);
5803     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5804     __ emit_data64(0x8080808080803d3c, relocInfo::none);
5805     return start;
5806   }
5807 
5808   address base64_vbmi_lookup_hi_url_addr() {
5809     __ align64();
5810     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64url");
5811     address start = __ pc();
5812     assert(((unsigned long long)start & 0x3f) == 0,
5813            "Alignment problem (0x%08llx)", (unsigned long long)start);
5814     __ emit_data64(0x0605040302010080, relocInfo::none);
5815     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5816     __ emit_data64(0x161514131211100f, relocInfo::none);
5817     __ emit_data64(0x3f80808080191817, relocInfo::none);
5818     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5819     __ emit_data64(0x2827262524232221, relocInfo::none);
5820     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5821     __ emit_data64(0x8080808080333231, relocInfo::none);
5822     return start;
5823   }
5824 
5825   address base64_vbmi_pack_vec_addr() {
5826     __ align64();
5827     StubCodeMark mark(this, "StubRoutines", "pack_vec_base64");
5828     address start = __ pc();
5829     assert(((unsigned long long)start & 0x3f) == 0,
5830            "Alignment problem (0x%08llx)", (unsigned long long)start);
5831     __ emit_data64(0x090a040506000102, relocInfo::none);
5832     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
5833     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5834     __ emit_data64(0x292a242526202122, relocInfo::none);
5835     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5836     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5837     __ emit_data64(0x0000000000000000, relocInfo::none);
5838     __ emit_data64(0x0000000000000000, relocInfo::none);
5839     return start;
5840   }
5841 
5842   address base64_vbmi_join_0_1_addr() {
5843     __ align64();
5844     StubCodeMark mark(this, "StubRoutines", "join_0_1_base64");
5845     address start = __ pc();
5846     assert(((unsigned long long)start & 0x3f) == 0,
5847            "Alignment problem (0x%08llx)", (unsigned long long)start);
5848     __ emit_data64(0x090a040506000102, relocInfo::none);
5849     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
5850     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5851     __ emit_data64(0x292a242526202122, relocInfo::none);
5852     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5853     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5854     __ emit_data64(0x494a444546404142, relocInfo::none);
5855     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
5856     return start;
5857   }
5858 
5859   address base64_vbmi_join_1_2_addr() {
5860     __ align64();
5861     StubCodeMark mark(this, "StubRoutines", "join_1_2_base64");
5862     address start = __ pc();
5863     assert(((unsigned long long)start & 0x3f) == 0,
5864            "Alignment problem (0x%08llx)", (unsigned long long)start);
5865     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5866     __ emit_data64(0x292a242526202122, relocInfo::none);
5867     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5868     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5869     __ emit_data64(0x494a444546404142, relocInfo::none);
5870     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
5871     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
5872     __ emit_data64(0x696a646566606162, relocInfo::none);
5873     return start;
5874   }
5875 
5876   address base64_vbmi_join_2_3_addr() {
5877     __ align64();
5878     StubCodeMark mark(this, "StubRoutines", "join_2_3_base64");
5879     address start = __ pc();
5880     assert(((unsigned long long)start & 0x3f) == 0,
5881            "Alignment problem (0x%08llx)", (unsigned long long)start);
5882     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5883     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5884     __ emit_data64(0x494a444546404142, relocInfo::none);
5885     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
5886     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
5887     __ emit_data64(0x696a646566606162, relocInfo::none);
5888     __ emit_data64(0x767071726c6d6e68, relocInfo::none);
5889     __ emit_data64(0x7c7d7e78797a7475, relocInfo::none);
5890     return start;
5891   }
5892 
5893   address base64_decoding_table_addr() {
5894     StubCodeMark mark(this, "StubRoutines", "decoding_table_base64");
5895     address start = __ pc();
5896     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5897     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5898     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5899     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5900     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5901     __ emit_data64(0x3fffffff3effffff, relocInfo::none);
5902     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5903     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
5904     __ emit_data64(0x06050403020100ff, relocInfo::none);
5905     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5906     __ emit_data64(0x161514131211100f, relocInfo::none);
5907     __ emit_data64(0xffffffffff191817, relocInfo::none);
5908     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
5909     __ emit_data64(0x2827262524232221, relocInfo::none);
5910     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5911     __ emit_data64(0xffffffffff333231, relocInfo::none);
5912     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5913     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5914     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5915     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5916     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5917     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5918     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5919     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5920     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5921     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5922     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5923     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5924     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5925     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5926     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5927     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5928 
5929     // URL table
5930     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5931     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5932     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5933     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5934     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5935     __ emit_data64(0xffff3effffffffff, relocInfo::none);
5936     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5937     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
5938     __ emit_data64(0x06050403020100ff, relocInfo::none);
5939     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5940     __ emit_data64(0x161514131211100f, relocInfo::none);
5941     __ emit_data64(0x3fffffffff191817, relocInfo::none);
5942     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
5943     __ emit_data64(0x2827262524232221, relocInfo::none);
5944     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5945     __ emit_data64(0xffffffffff333231, relocInfo::none);
5946     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5947     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5948     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5949     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5950     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5951     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5952     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5953     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5954     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5955     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5956     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5957     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5958     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5959     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5960     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5961     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5962     return start;
5963   }
5964 
5965 
5966   // Code for generating Base64 decoding.
5967   //
5968   // Based on the article (and associated code) from https://arxiv.org/abs/1910.05109.
5969   //
5970   // Intrinsic function prototype in Base64.java:
5971   // private void decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME) {
5972   address generate_base64_decodeBlock() {
5973     __ align(CodeEntryAlignment);
5974     StubCodeMark mark(this, "StubRoutines", "implDecode");
5975     address start = __ pc();
5976     __ enter();
5977 
5978     // Save callee-saved registers before using them
5979     __ push(r12);
5980     __ push(r13);
5981     __ push(r14);
5982     __ push(r15);
5983     __ push(rbx);
5984 
5985     // arguments
5986     const Register source = c_rarg0; // Source Array
5987     const Register start_offset = c_rarg1; // start offset
5988     const Register end_offset = c_rarg2; // end offset
5989     const Register dest = c_rarg3; // destination array
5990     const Register isMIME = rbx;
5991 
5992 #ifndef _WIN64
5993     const Register dp = c_rarg4;  // Position for writing to dest array
5994     const Register isURL = c_rarg5;// Base64 or URL character set
5995     __ movl(isMIME, Address(rbp, 2 * wordSize));
5996 #else
5997     const Address  dp_mem(rbp, 6 * wordSize);  // dp is passed on the stack on Win64
5998     const Address isURL_mem(rbp, 7 * wordSize);
5999     const Register isURL = r10;      // pick a volatile Windows register
6000     const Register dp = r12;
6001     __ movl(dp, dp_mem);
6002     __ movl(isURL, isURL_mem);
6003     __ movl(isMIME, Address(rbp, 8 * wordSize));
6004 #endif
6005 
6006     const XMMRegister lookup_lo = xmm5;
6007     const XMMRegister lookup_hi = xmm6;
6008     const XMMRegister errorvec = xmm7;
6009     const XMMRegister pack16_op = xmm9;
6010     const XMMRegister pack32_op = xmm8;
6011     const XMMRegister input0 = xmm3;
6012     const XMMRegister input1 = xmm20;
6013     const XMMRegister input2 = xmm21;
6014     const XMMRegister input3 = xmm19;
6015     const XMMRegister join01 = xmm12;
6016     const XMMRegister join12 = xmm11;
6017     const XMMRegister join23 = xmm10;
6018     const XMMRegister translated0 = xmm2;
6019     const XMMRegister translated1 = xmm1;
6020     const XMMRegister translated2 = xmm0;
6021     const XMMRegister translated3 = xmm4;
6022 
6023     const XMMRegister merged0 = xmm2;
6024     const XMMRegister merged1 = xmm1;
6025     const XMMRegister merged2 = xmm0;
6026     const XMMRegister merged3 = xmm4;
6027     const XMMRegister merge_ab_bc0 = xmm2;
6028     const XMMRegister merge_ab_bc1 = xmm1;
6029     const XMMRegister merge_ab_bc2 = xmm0;
6030     const XMMRegister merge_ab_bc3 = xmm4;
6031 
6032     const XMMRegister pack24bits = xmm4;
6033 
6034     const Register length = r14;
6035     const Register output_size = r13;
6036     const Register output_mask = r15;
6037     const KRegister input_mask = k1;
6038 
6039     const XMMRegister input_initial_valid_b64 = xmm0;
6040     const XMMRegister tmp = xmm10;
6041     const XMMRegister mask = xmm0;
6042     const XMMRegister invalid_b64 = xmm1;
6043 
6044     Label L_process256, L_process64, L_process64Loop, L_exit, L_processdata, L_loadURL;
6045     Label L_continue, L_finalBit, L_padding, L_donePadding, L_bruteForce;
6046     Label L_forceLoop, L_bottomLoop, L_checkMIME, L_exit_no_vzero;
6047 
6048     // calculate length from offsets
6049     __ movl(length, end_offset);
6050     __ subl(length, start_offset);
6051     __ push(dest);          // Save for return value calc
6052 
6053     // If AVX512 VBMI is not supported, only the non-AVX scalar code below is generated
6054     if (VM_Version::supports_avx512_vbmi() &&
6055        VM_Version::supports_avx512bw()) {
6056       __ cmpl(length, 128);     // 128 bytes is the break-even point for AVX-512
6057       __ jcc(Assembler::lessEqual, L_bruteForce);
6058 
6059       __ cmpl(isMIME, 0);
6060       __ jcc(Assembler::notEqual, L_bruteForce);
6061 
6062       // Load lookup tables based on isURL
6063       __ cmpl(isURL, 0);
6064       __ jcc(Assembler::notZero, L_loadURL);
6065 
6066       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_addr()), Assembler::AVX_512bit, r13);
6067       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_addr()), Assembler::AVX_512bit, r13);
6068 
6069       __ BIND(L_continue);
6070 
6071       __ movl(r15, 0x01400140);
6072       __ evpbroadcastd(pack16_op, r15, Assembler::AVX_512bit);
6073 
6074       __ movl(r15, 0x00011000);
6075       __ evpbroadcastd(pack32_op, r15, Assembler::AVX_512bit);
6076 
6077       __ cmpl(length, 0xff);
6078       __ jcc(Assembler::lessEqual, L_process64);
6079 
6080       // load masks required for decoding data
6081       __ BIND(L_processdata);
6082       __ evmovdquq(join01, ExternalAddress(StubRoutines::x86::base64_vbmi_join_0_1_addr()), Assembler::AVX_512bit,r13);
6083       __ evmovdquq(join12, ExternalAddress(StubRoutines::x86::base64_vbmi_join_1_2_addr()), Assembler::AVX_512bit, r13);
6084       __ evmovdquq(join23, ExternalAddress(StubRoutines::x86::base64_vbmi_join_2_3_addr()), Assembler::AVX_512bit, r13);
6085 
6086       __ align32();
6087       __ BIND(L_process256);
6088       // Grab input data
6089       __ evmovdquq(input0, Address(source, start_offset, Address::times_1, 0x00), Assembler::AVX_512bit);
6090       __ evmovdquq(input1, Address(source, start_offset, Address::times_1, 0x40), Assembler::AVX_512bit);
6091       __ evmovdquq(input2, Address(source, start_offset, Address::times_1, 0x80), Assembler::AVX_512bit);
6092       __ evmovdquq(input3, Address(source, start_offset, Address::times_1, 0xc0), Assembler::AVX_512bit);
6093 
6094       // Copy the low part of the lookup table into the destination of the permutation
6095       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6096       __ evmovdquq(translated1, lookup_lo, Assembler::AVX_512bit);
6097       __ evmovdquq(translated2, lookup_lo, Assembler::AVX_512bit);
6098       __ evmovdquq(translated3, lookup_lo, Assembler::AVX_512bit);
6099 
6100       // Translate the base64 input into "decoded" bytes
6101       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6102       __ evpermt2b(translated1, input1, lookup_hi, Assembler::AVX_512bit);
6103       __ evpermt2b(translated2, input2, lookup_hi, Assembler::AVX_512bit);
6104       __ evpermt2b(translated3, input3, lookup_hi, Assembler::AVX_512bit);
6105 
6106       // OR all of the translations together to check for errors (high-order bit of byte set)
6107       __ vpternlogd(input0, 0xfe, input1, input2, Assembler::AVX_512bit);
6108 
6109       __ vpternlogd(input3, 0xfe, translated0, translated1, Assembler::AVX_512bit);
6110       __ vpternlogd(input0, 0xfe, translated2, translated3, Assembler::AVX_512bit);
6111       __ vpor(errorvec, input3, input0, Assembler::AVX_512bit);
6112 
6113       // Check if there was an error - if so, try 64-byte chunks
6114       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6115       __ kortestql(k3, k3);
6116       __ jcc(Assembler::notZero, L_process64);
6117 
6118       // The merging and shuffling happens here
6119       // Each dword of decoded 6-bit values is [00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa].
6120       // vpmaddubsw multiplies [00cccccc] by 2^6 and adds [00dddddd] to get [0000cccc | ccdddddd]
6121       // (and likewise for the a/b pair); pack16_op is a vector of 0x01400140, so multiply D by 1 and C by 0x40
6122       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6123       __ vpmaddubsw(merge_ab_bc1, translated1, pack16_op, Assembler::AVX_512bit);
6124       __ vpmaddubsw(merge_ab_bc2, translated2, pack16_op, Assembler::AVX_512bit);
6125       __ vpmaddubsw(merge_ab_bc3, translated3, pack16_op, Assembler::AVX_512bit);
6126 
6127       // Now do the same with packed 16-bit values.
6128       // We start with [0000cccc | ccdddddd | 0000aaaa | aabbbbbb]
6129       // pack32_op is 0x00011000 (2^12, 1), so this multiplies [0000aaaa | aabbbbbb] by 2^12
6130       // and adds [0000cccc | ccdddddd] to yield [00000000 | aaaaaabb | bbbbcccc | ccdddddd]
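      //
      // Worked example (for reference): decoding "TWFu", the translated 6-bit
      // values are [19, 22, 5, 46]; vpmaddubsw yields the words 19*64+22 = 0x04d6
      // and 5*64+46 = 0x016e, and vpmaddwd yields the dword 0x004d616e, whose low
      // three bytes are 'n','a','M'; the join/permute below then picks them out
      // in 2,1,0 order to store "Man".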
6131       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6132       __ vpmaddwd(merged1, merge_ab_bc1, pack32_op, Assembler::AVX_512bit);
6133       __ vpmaddwd(merged2, merge_ab_bc2, pack32_op, Assembler::AVX_512bit);
6134       __ vpmaddwd(merged3, merge_ab_bc3, pack32_op, Assembler::AVX_512bit);
6135 
6136       // The join vectors specify which byte from which vector goes into the outputs
6137       // One of every 4 bytes in the extended vector is zero, so we pack them into their
6138       // final positions in the register for storing (256 bytes in, 192 bytes out)
6139       __ evpermt2b(merged0, join01, merged1, Assembler::AVX_512bit);
6140       __ evpermt2b(merged1, join12, merged2, Assembler::AVX_512bit);
6141       __ evpermt2b(merged2, join23, merged3, Assembler::AVX_512bit);
6142 
6143       // Store result
6144       __ evmovdquq(Address(dest, dp, Address::times_1, 0x00), merged0, Assembler::AVX_512bit);
6145       __ evmovdquq(Address(dest, dp, Address::times_1, 0x40), merged1, Assembler::AVX_512bit);
6146       __ evmovdquq(Address(dest, dp, Address::times_1, 0x80), merged2, Assembler::AVX_512bit);
6147 
6148       __ addptr(source, 0x100);
6149       __ addptr(dest, 0xc0);
6150       __ subl(length, 0x100);
6151       __ cmpl(length, 64 * 4);
6152       __ jcc(Assembler::greaterEqual, L_process256);
6153 
6154       // At this point, we've decoded 64 * 4 * n bytes.
6155       // The remaining length will be <= 64 * 4 - 1, unless there was an error
6156       // decoding the first 256-byte chunk, in which case the remaining length
6157       // may be arbitrarily long.
6158       //
6159       // Note that this will be the path for MIME-encoded strings.
6160 
6161       __ BIND(L_process64);
6162 
6163       __ evmovdquq(pack24bits, ExternalAddress(StubRoutines::x86::base64_vbmi_pack_vec_addr()), Assembler::AVX_512bit, r13);
6164 
6165       __ cmpl(length, 63);
6166       __ jcc(Assembler::lessEqual, L_finalBit);
6167 
6168       __ mov64(rax, 0x0000ffffffffffff);
6169       __ kmovql(k2, rax);
6170 
6171       __ align32();
6172       __ BIND(L_process64Loop);
6173 
6174       // Handle first 64-byte block
6175 
6176       __ evmovdquq(input0, Address(source, start_offset), Assembler::AVX_512bit);
6177       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6178       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6179 
6180       __ vpor(errorvec, translated0, input0, Assembler::AVX_512bit);
6181 
6182       // Check for error and bomb out before updating dest
6183       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6184       __ kortestql(k3, k3);
6185       __ jcc(Assembler::notZero, L_exit);
6186 
6187       // Pack output register, selecting correct byte ordering
6188       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6189       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6190       __ vpermb(merged0, pack24bits, merged0, Assembler::AVX_512bit);
6191 
6192       __ evmovdqub(Address(dest, dp), k2, merged0, true, Assembler::AVX_512bit);
6193 
6194       __ subl(length, 64);
6195       __ addptr(source, 64);
6196       __ addptr(dest, 48);
6197 
6198       __ cmpl(length, 64);
6199       __ jcc(Assembler::greaterEqual, L_process64Loop);
6200 
6201       __ cmpl(length, 0);
6202       __ jcc(Assembler::lessEqual, L_exit);
6203 
6204       __ BIND(L_finalBit);
6205       // Now have 1 to 63 bytes left to decode
6206 
6207       // We could let Java take care of the final fragment, but it would then
6208       // call this routine again for every 4 bytes of input data, so handle
6209       // the remainder here instead.
6210       __ movq(rax, -1);
6211       __ bzhiq(rax, rax, length);    // Input mask in rax
6212 
6213       __ movl(output_size, length);
6214       __ shrl(output_size, 2);   // Find (len / 4) * 3 (output length)
6215       __ lea(output_size, Address(output_size, output_size, Address::times_2, 0));
6216       // output_size in r13
6217 
6218       // Strip pad characters, if any, and adjust length and mask
6219       __ cmpb(Address(source, length, Address::times_1, -1), '=');
6220       __ jcc(Assembler::equal, L_padding);
6221 
6222       __ BIND(L_donePadding);
6223 
6224       // The output mask has the low output_size bits set (one bit per output byte).
6225       __ kmovql(input_mask, rax);
6226       __ movq(output_mask, -1);
6227       __ bzhiq(output_mask, output_mask, output_size);
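
      // For example (a sketch): with 20 bytes of input remaining, output_size is
      // (20 / 4) * 3 = 15, the input mask has the low 20 bits set, and the output
      // mask has the low 15 bits set (assuming no '=' padding was stripped above).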
6228 
6229       // Load initial input with all valid base64 characters.  Will be used
6230       // in merging source bytes to avoid masking when determining if an error occurred.
6231       __ movl(rax, 0x61616161);
6232       __ evpbroadcastd(input_initial_valid_b64, rax, Assembler::AVX_512bit);
6233 
6234       // A register containing all invalid base64 decoded values
6235       __ movl(rax, 0x80808080);
6236       __ evpbroadcastd(invalid_b64, rax, Assembler::AVX_512bit);
6237 
6238       // input_mask is in k1
6239       // output_size is in r13
6240       // output_mask is in r15
6241       // zmm0 - initially-valid input (0x61616161 broadcast), then the merged input
6242       // zmm1 - invalid_b64 (0x80808080 broadcast)
6243       // zmm2, zmm3 - free at this point
6244       // zmm4 - pack24bits permute vector
6245       // zmm5 - lookup_lo
6246       // zmm6 - lookup_hi
6247       // zmm7 - errorvec
6248       // zmm8 - pack32_op (0x00011000)
6249       // zmm9 - pack16_op (0x01400140)
6250       // zmm10 - tmp
6251 
6252       // Load only the bytes from source, merging into our "fully-valid" register
6253       __ evmovdqub(input_initial_valid_b64, input_mask, Address(source, start_offset, Address::times_1, 0x0), true, Assembler::AVX_512bit);
6254 
6255       // Decode all bytes within our merged input
6256       __ evmovdquq(tmp, lookup_lo, Assembler::AVX_512bit);
6257       __ evpermt2b(tmp, input_initial_valid_b64, lookup_hi, Assembler::AVX_512bit);
6258       __ vporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);
6259 
6260       // Check for error.  Compare (decoded | initial) to all invalid.
6261       // If any bytes have their high-order bit set, then we have an error.
6262       __ evptestmb(k2, mask, invalid_b64, Assembler::AVX_512bit);
6263       __ kortestql(k2, k2);
6264 
6265       // If we have an error, use the brute force loop to decode what we can (4-byte chunks).
6266       __ jcc(Assembler::notZero, L_bruteForce);
6267 
6268       // Shuffle output bytes
6269       __ vpmaddubsw(tmp, tmp, pack16_op, Assembler::AVX_512bit);
6270       __ vpmaddwd(tmp, tmp, pack32_op, Assembler::AVX_512bit);
6271 
6272       __ vpermb(tmp, pack24bits, tmp, Assembler::AVX_512bit);
6273       __ kmovql(k1, output_mask);
6274       __ evmovdqub(Address(dest, dp), k1, tmp, true, Assembler::AVX_512bit);
6275 
6276       __ addptr(dest, output_size);
6277 
6278       __ BIND(L_exit);
6279       __ vzeroupper();
6280       __ pop(rax);             // Get original dest value
6281       __ subptr(dest, rax);      // Number of bytes converted
6282       __ movptr(rax, dest);
6283       __ pop(rbx);
6284       __ pop(r15);
6285       __ pop(r14);
6286       __ pop(r13);
6287       __ pop(r12);
6288       __ leave();
6289       __ ret(0);
6290 
6291       __ BIND(L_loadURL);
6292       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_url_addr()), Assembler::AVX_512bit, r13);
6293       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_url_addr()), Assembler::AVX_512bit, r13);
6294       __ jmp(L_continue);
6295 
6296       __ BIND(L_padding);
6297       __ decrementq(output_size, 1);
6298       __ shrq(rax, 1);
6299 
6300       __ cmpb(Address(source, length, Address::times_1, -2), '=');
6301       __ jcc(Assembler::notEqual, L_donePadding);
6302 
6303       __ decrementq(output_size, 1);
6304       __ shrq(rax, 1);
6305       __ jmp(L_donePadding);
6306 
6307       __ align32();
6308       __ BIND(L_bruteForce);
6309     }   // End of if(avx512_vbmi)
6310 
6311     // Use non-AVX code to decode 4-byte chunks into 3 bytes of output
6312 
6313     // Register state (Linux):
6314     // r12-15 - saved on stack
6315     // rdi - src
6316     // rsi - sp
6317     // rdx - sl
6318     // rcx - dst
6319     // r8 - dp
6320     // r9 - isURL
6321 
6322     // Register state (Windows):
6323     // r12-15 - saved on stack
6324     // rcx - src
6325     // rdx - sp
6326     // r8 - sl
6327     // r9 - dst
6328     // r12 - dp
6329     // r10 - isURL
6330 
6331     // Registers (common):
6332     // length (r14) - bytes in src
6333 
6334     const Register decode_table = r11;
6335     const Register out_byte_count = rbx;
6336     const Register byte1 = r13;
6337     const Register byte2 = r15;
6338     const Register byte3 = WINDOWS_ONLY(r8) NOT_WINDOWS(rdx);
6339     const Register byte4 = WINDOWS_ONLY(r10) NOT_WINDOWS(r9);
6340 
6341     __ shrl(length, 2);    // Multiple of 4 bytes only - length is # 4-byte chunks
6342     __ cmpl(length, 0);
6343     __ jcc(Assembler::lessEqual, L_exit_no_vzero);
6344 
6345     __ shll(isURL, 8);    // index into decode table based on isURL
6346     __ lea(decode_table, ExternalAddress(StubRoutines::x86::base64_decoding_table_addr()));
6347     __ addptr(decode_table, isURL);
6348 
6349     __ jmp(L_bottomLoop);
6350 
6351     __ align32();
6352     __ BIND(L_forceLoop);
6353     __ shll(byte1, 18);
6354     __ shll(byte2, 12);
6355     __ shll(byte3, 6);
6356     __ orl(byte1, byte2);
6357     __ orl(byte1, byte3);
6358     __ orl(byte1, byte4);
6359 
6360     __ addptr(source, 4);
6361 
6362     __ movb(Address(dest, dp, Address::times_1, 2), byte1);
6363     __ shrl(byte1, 8);
6364     __ movb(Address(dest, dp, Address::times_1, 1), byte1);
6365     __ shrl(byte1, 8);
6366     __ movb(Address(dest, dp, Address::times_1, 0), byte1);
6367 
6368     __ addptr(dest, 3);
6369     __ decrementl(length, 1);
6370     __ jcc(Assembler::zero, L_exit_no_vzero);
6371 
6372     __ BIND(L_bottomLoop);
6373     __ load_unsigned_byte(byte1, Address(source, start_offset, Address::times_1, 0x00));
6374     __ load_unsigned_byte(byte2, Address(source, start_offset, Address::times_1, 0x01));
6375     __ load_signed_byte(byte1, Address(decode_table, byte1));
6376     __ load_signed_byte(byte2, Address(decode_table, byte2));
6377     __ load_unsigned_byte(byte3, Address(source, start_offset, Address::times_1, 0x02));
6378     __ load_unsigned_byte(byte4, Address(source, start_offset, Address::times_1, 0x03));
6379     __ load_signed_byte(byte3, Address(decode_table, byte3));
6380     __ load_signed_byte(byte4, Address(decode_table, byte4));
6381 
6382     __ mov(rax, byte1);
6383     __ orl(rax, byte2);
6384     __ orl(rax, byte3);
6385     __ orl(rax, byte4);
6386     __ jcc(Assembler::positive, L_forceLoop);
6387 
6388     __ BIND(L_exit_no_vzero);
6389     __ pop(rax);             // Get original dest value
6390     __ subptr(dest, rax);      // Number of bytes converted
6391     __ movptr(rax, dest);
6392     __ pop(rbx);
6393     __ pop(r15);
6394     __ pop(r14);
6395     __ pop(r13);
6396     __ pop(r12);
6397     __ leave();
6398     __ ret(0);
6399 
6400     return start;
6401   }
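
  // Scalar reference for the 4-character -> 3-byte step performed by the
  // L_forceLoop / L_bottomLoop code above (a sketch for documentation only;
  // it is not called by the generated stub).  'table' is one of the two
  // 256-entry maps emitted by base64_decoding_table_addr(), which holds 0xff
  // (-1) for bytes outside the alphabet.  Returns false on an illegal character.
  static bool base64_decode_4_chars_reference(const unsigned char* src,
                                              unsigned char* dst,
                                              const signed char* table) {
    int b1 = table[src[0]];
    int b2 = table[src[1]];
    int b3 = table[src[2]];
    int b4 = table[src[3]];
    if ((b1 | b2 | b3 | b4) < 0) {
      return false;                                  // some byte decoded to -1
    }
    unsigned int combined = (b1 << 18) | (b2 << 12) | (b3 << 6) | b4;
    dst[0] = (unsigned char)(combined >> 16);
    dst[1] = (unsigned char)(combined >> 8);
    dst[2] = (unsigned char)(combined >> 0);
    return true;
  }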
6402 
6403 
6404   /**
6405    *  Arguments:
6406    *
6407    * Inputs:
6408    *   c_rarg0   - int crc
6409    *   c_rarg1   - byte* buf
6410    *   c_rarg2   - int length
6411    *
6412    * Output:
6413    *       rax   - int crc result
6414    */
6415   address generate_updateBytesCRC32() {
6416     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
6417 
6418     __ align(CodeEntryAlignment);
6419     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
6420 
6421     address start = __ pc();
6422     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6423     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6424     // rscratch1: r10
6425     const Register crc   = c_rarg0;  // crc
6426     const Register buf   = c_rarg1;  // source java byte array address
6427     const Register len   = c_rarg2;  // length
6428     const Register table = c_rarg3;  // crc_table address (reuse register)
6429     const Register tmp1   = r11;
6430     const Register tmp2   = r10;
6431     assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax);
6432 
6433     BLOCK_COMMENT("Entry:");
6434     __ enter(); // required for proper stackwalking of RuntimeStub frame
6435 
6436     if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6437         VM_Version::supports_avx512bw() &&
6438         VM_Version::supports_avx512vl()) {
6439         // The constants used in the CRC32 algorithm require the one's complement of the initial crc value.
6440         // However, the constant table for CRC32-C assumes the original crc value.  Account for this
6441         // difference before calling and after returning.
6442       __ lea(table, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
6443       __ notl(crc);
6444       __ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2);
6445       __ notl(crc);
6446     } else {
6447       __ kernel_crc32(crc, buf, len, table, tmp1);
6448     }
6449 
6450     __ movl(rax, crc);
6451     __ vzeroupper();
6452     __ leave(); // required for proper stackwalking of RuntimeStub frame
6453     __ ret(0);
6454 
6455     return start;
6456   }
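
  // Bit-at-a-time CRC-32 reference (zlib polynomial, reflected constant
  // 0xedb88320), shown as a sketch of the checksum the accelerated kernels
  // above compute; it is not called by the generated stub.  Note the same
  // pre/post one's-complement of the crc value that the AVX-512 path applies
  // explicitly.
  static unsigned int crc32_reference(unsigned int crc, const unsigned char* buf, int len) {
    crc = ~crc;
    for (int i = 0; i < len; i++) {
      crc ^= buf[i];
      for (int bit = 0; bit < 8; bit++) {
        crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320u : 0u);
      }
    }
    return ~crc;
  }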
6457 
6458   /**
6459   *  Arguments:
6460   *
6461   * Inputs:
6462   *   c_rarg0   - int crc
6463   *   c_rarg1   - byte* buf
6464   *   c_rarg2   - long length
6465   *   c_rarg3   - table_start - optional (present only when doing a library_call,
6466   *              not used by x86 algorithm)
6467   *
6468   * Output:
6469   *       rax   - int crc result
6470   */
6471   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
6472       assert(UseCRC32CIntrinsics, "need SSE4_2");
6473       __ align(CodeEntryAlignment);
6474       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
6475       address start = __ pc();
6476       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
6477       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
6478       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
6479       const Register crc = c_rarg0;  // crc
6480       const Register buf = c_rarg1;  // source java byte array address
6481       const Register len = c_rarg2;  // length
6482       const Register a = rax;
6483       const Register j = r9;
6484       const Register k = r10;
6485       const Register l = r11;
6486 #ifdef _WIN64
6487       const Register y = rdi;
6488       const Register z = rsi;
6489 #else
6490       const Register y = rcx;
6491       const Register z = r8;
6492 #endif
6493       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
6494 
6495       BLOCK_COMMENT("Entry:");
6496       __ enter(); // required for proper stackwalking of RuntimeStub frame
6497       if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6498           VM_Version::supports_avx512bw() &&
6499           VM_Version::supports_avx512vl()) {
6500         __ lea(j, ExternalAddress(StubRoutines::x86::crc32c_table_avx512_addr()));
6501         __ kernel_crc32_avx512(crc, buf, len, j, l, k);
6502       } else {
6503 #ifdef _WIN64
6504         __ push(y);
6505         __ push(z);
6506 #endif
6507         __ crc32c_ipl_alg2_alt2(crc, buf, len,
6508                                 a, j, k,
6509                                 l, y, z,
6510                                 c_farg0, c_farg1, c_farg2,
6511                                 is_pclmulqdq_supported);
6512 #ifdef _WIN64
6513         __ pop(z);
6514         __ pop(y);
6515 #endif
6516       }
6517       __ movl(rax, crc);
6518       __ vzeroupper();
6519       __ leave(); // required for proper stackwalking of RuntimeStub frame
6520       __ ret(0);
6521 
6522       return start;
6523   }
6524 
6525 
6526   /***
6527    *  Arguments:
6528    *
6529    *  Inputs:
6530    *   c_rarg0   - int   adler
6531    *   c_rarg1   - byte* buff
6532    *   c_rarg2   - int   len
6533    *
6534    * Output:
6535    *   rax   - int adler result
6536    */
6537 
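  // For reference, a scalar sketch (illustration only, not generated code) of the
  // Adler-32 update this stub accelerates, following the RFC 1950 definition:
  //
  //   uint32_t adler32_scalar(uint32_t adler, const uint8_t* buf, int len) {
  //     uint32_t a = adler & 0xffff, b = adler >> 16;
  //     for (int i = 0; i < len; i++) {
  //       a = (a + buf[i]) % 65521;
  //       b = (b + a) % 65521;
  //     }
  //     return (b << 16) | a;
  //   }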
6538   address generate_updateBytesAdler32() {
6539       assert(UseAdler32Intrinsics, "need AVX2");
6540 
6541       __ align(CodeEntryAlignment);
6542       StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
6543 
6544       address start = __ pc();
6545 
6546       const Register data = r9;
6547       const Register size = r10;
6548 
6549       const XMMRegister yshuf0 = xmm6;
6550       const XMMRegister yshuf1 = xmm7;
6551       assert_different_registers(c_rarg0, c_rarg1, c_rarg2, data, size);
6552 
6553       BLOCK_COMMENT("Entry:");
6554       __ enter(); // required for proper stackwalking of RuntimeStub frame
6555 
6556       __ vmovdqu(yshuf0, ExternalAddress((address) StubRoutines::x86::_adler32_shuf0_table), r9);
6557       __ vmovdqu(yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_shuf1_table), r9);
6558       __ movptr(data, c_rarg1); //data
6559       __ movl(size, c_rarg2); //length
6560       __ updateBytesAdler32(c_rarg0, data, size, yshuf0, yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_ascale_table));
6561       __ leave();
6562       __ ret(0);
6563       return start;
6564   }
6565 
6566   /**
6567    *  Arguments:
6568    *
6569    *  Input:
6570    *    c_rarg0   - x address
6571    *    c_rarg1   - x length
6572    *    c_rarg2   - y address
6573    *    c_rarg3   - y length
6574    * not Win64
6575    *    c_rarg4   - z address
6576    *    c_rarg5   - z length
6577    * Win64
6578    *    rsp+40    - z address
6579    *    rsp+48    - z length
6580    */
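  // A scalar sketch (illustration only, not generated code) of what the stub computes:
  // the schoolbook product of two big-endian int magnitudes x and y into z, mirroring
  // BigInteger.multiplyToLen. The generated code computes the same result with wider
  // multiplies; x, y and z are arrays of uint32_t (Java ints), most significant int first.
  //
  //   // assuming z[] is zero-initialized and zlen == xlen + ylen
  //   for (int i = xlen - 1; i >= 0; i--) {
  //     uint64_t carry = 0;
  //     for (int j = ylen - 1, k = ylen + i; j >= 0; j--, k--) {
  //       uint64_t product = (uint64_t) y[j] * x[i] + z[k] + carry;
  //       z[k] = (uint32_t) product;
  //       carry = product >> 32;
  //     }
  //     z[i] = (uint32_t) carry;
  //   }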
6581   address generate_multiplyToLen() {
6582     __ align(CodeEntryAlignment);
6583     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
6584 
6585     address start = __ pc();
6586     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6587     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6588     const Register x     = rdi;
6589     const Register xlen  = rax;
6590     const Register y     = rsi;
6591     const Register ylen  = rcx;
6592     const Register z     = r8;
6593     const Register zlen  = r11;
6594 
6595     // The following registers will be saved on the stack in multiply_to_len().
6596     const Register tmp1  = r12;
6597     const Register tmp2  = r13;
6598     const Register tmp3  = r14;
6599     const Register tmp4  = r15;
6600     const Register tmp5  = rbx;
6601 
6602     BLOCK_COMMENT("Entry:");
6603     __ enter(); // required for proper stackwalking of RuntimeStub frame
6604 
6605 #ifndef _WIN64
6606     __ movptr(zlen, r9); // Save r9 in r11 - zlen
6607 #endif
6608     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
6609                        // ylen => rcx, z => r8, zlen => r11
6610                        // r9 and r10 may be used to save non-volatile registers
6611 #ifdef _WIN64
6612     // last 2 arguments (#4, #5) are on stack on Win64
6613     __ movptr(z, Address(rsp, 6 * wordSize));
6614     __ movptr(zlen, Address(rsp, 7 * wordSize));
6615 #endif
6616 
6617     __ movptr(xlen, rsi);
6618     __ movptr(y,    rdx);
6619     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
6620 
6621     restore_arg_regs();
6622 
6623     __ leave(); // required for proper stackwalking of RuntimeStub frame
6624     __ ret(0);
6625 
6626     return start;
6627   }
6628 
6629   /**
6630   *  Arguments:
6631   *
6632   *  Input:
6633   *    c_rarg0   - obja     address
6634   *    c_rarg1   - objb     address
6635   *    c_rarg2   - length   length
6636   *    c_rarg3   - scale    log2_array_indexscale
6637   *
6638   *  Output:
6639   *        rax   - int; >= 0: index of first mismatch, < 0: bitwise complement of tail
6640   */
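  // Semantics, mirroring jdk.internal.util.ArraysSupport.vectorizedMismatch (sketch for
  // illustration only): compare the two regions element by element at the given scale;
  // return the index of the first mismatch, or, if the part that was checked matches,
  // the bitwise complement of the number of tail elements left for the caller to check.
  //
  //   // conceptually (the stub may stop early and report the unchecked tail):
  //   for (int64_t i = 0; i < length; i++) {
  //     if (memcmp((char*)obja + (i << scale), (char*)objb + (i << scale), (size_t)1 << scale) != 0)
  //       return i;         // index of the first mismatching element
  //   }
  //   return ~tail;         // no mismatch found; tail == elements left unchecked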
6641   address generate_vectorizedMismatch() {
6642     __ align(CodeEntryAlignment);
6643     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
6644     address start = __ pc();
6645 
6646     BLOCK_COMMENT("Entry:");
6647     __ enter();
6648 
6649 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6650     const Register scale = c_rarg0;  //rcx, will exchange with r9
6651     const Register objb = c_rarg1;   //rdx
6652     const Register length = c_rarg2; //r8
6653     const Register obja = c_rarg3;   //r9
6654     __ xchgq(obja, scale);  // now obja and scale contain the correct contents
6655 
6656     const Register tmp1 = r10;
6657     const Register tmp2 = r11;
6658 #endif
6659 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6660     const Register obja = c_rarg0;   //U:rdi
6661     const Register objb = c_rarg1;   //U:rsi
6662     const Register length = c_rarg2; //U:rdx
6663     const Register scale = c_rarg3;  //U:rcx
6664     const Register tmp1 = r8;
6665     const Register tmp2 = r9;
6666 #endif
6667     const Register result = rax; //return value
6668     const XMMRegister vec0 = xmm0;
6669     const XMMRegister vec1 = xmm1;
6670     const XMMRegister vec2 = xmm2;
6671 
6672     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
6673 
6674     __ vzeroupper();
6675     __ leave();
6676     __ ret(0);
6677 
6678     return start;
6679   }
6680 
6681   /**
6682    *  Arguments:
6683    *
6684    *  Input:
6685    *    c_rarg0   - x address
6686    *    c_rarg1   - x length
6687    *    c_rarg2   - z address
6688    *    c_rarg3   - z length
6689    *
6690    */
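  // Semantically (illustration only) this computes z = x * x, the magnitude square used by
  // BigInteger.squareToLen; the result equals multiply_to_len(x, len, x, len, z, zlen, ...),
  // but a dedicated squaring routine is generated.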
6691   address generate_squareToLen() {
6692 
6693     __ align(CodeEntryAlignment);
6694     StubCodeMark mark(this, "StubRoutines", "squareToLen");
6695 
6696     address start = __ pc();
6697     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6698     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
6699     const Register x      = rdi;
6700     const Register len    = rsi;
6701     const Register z      = r8;
6702     const Register zlen   = rcx;
6703 
6704     const Register tmp1      = r12;
6705     const Register tmp2      = r13;
6706     const Register tmp3      = r14;
6707     const Register tmp4      = r15;
6708     const Register tmp5      = rbx;
6709 
6710     BLOCK_COMMENT("Entry:");
6711     __ enter(); // required for proper stackwalking of RuntimeStub frame
6712 
6713     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
6714                        // zlen => rcx
6715                        // r9 and r10 may be used to save non-volatile registers
6716     __ movptr(r8, rdx);
6717     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
6718 
6719     restore_arg_regs();
6720 
6721     __ leave(); // required for proper stackwalking of RuntimeStub frame
6722     __ ret(0);
6723 
6724     return start;
6725   }
6726 
6727   address generate_method_entry_barrier() {
6728     __ align(CodeEntryAlignment);
6729     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
6730 
6731     Label deoptimize_label;
6732 
6733     address start = __ pc();
6734 
6735     __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
6736 
6737     BLOCK_COMMENT("Entry:");
6738     __ enter(); // save rbp
6739 
6740     // save c_rarg0, because we want to use that value.
6741     // We could do without it but then we depend on the number of slots used by pusha
6742     __ push(c_rarg0);
6743 
6744     __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
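    // Stack layout at this point (illustration):
    //   rsp + 0*wordSize: saved c_rarg0
    //   rsp + 1*wordSize: saved rbp (pushed by enter())
    //   rsp + 2*wordSize: cookie (-1)
    //   rsp + 3*wordSize: return address into the nmethod  <- c_rarg0 now points here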
6745 
6746     __ pusha();
6747 
6748     // The method may have floats as arguments, and we must spill them before calling
6749     // the VM runtime.
6750     assert(Argument::n_float_register_parameters_j == 8, "Assumption");
6751     const int xmm_size = wordSize * 2;
6752     const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
6753     __ subptr(rsp, xmm_spill_size);
6754     __ movdqu(Address(rsp, xmm_size * 7), xmm7);
6755     __ movdqu(Address(rsp, xmm_size * 6), xmm6);
6756     __ movdqu(Address(rsp, xmm_size * 5), xmm5);
6757     __ movdqu(Address(rsp, xmm_size * 4), xmm4);
6758     __ movdqu(Address(rsp, xmm_size * 3), xmm3);
6759     __ movdqu(Address(rsp, xmm_size * 2), xmm2);
6760     __ movdqu(Address(rsp, xmm_size * 1), xmm1);
6761     __ movdqu(Address(rsp, xmm_size * 0), xmm0);
6762 
6763     __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);
6764 
6765     __ movdqu(xmm0, Address(rsp, xmm_size * 0));
6766     __ movdqu(xmm1, Address(rsp, xmm_size * 1));
6767     __ movdqu(xmm2, Address(rsp, xmm_size * 2));
6768     __ movdqu(xmm3, Address(rsp, xmm_size * 3));
6769     __ movdqu(xmm4, Address(rsp, xmm_size * 4));
6770     __ movdqu(xmm5, Address(rsp, xmm_size * 5));
6771     __ movdqu(xmm6, Address(rsp, xmm_size * 6));
6772     __ movdqu(xmm7, Address(rsp, xmm_size * 7));
6773     __ addptr(rsp, xmm_spill_size);
6774 
6775     __ cmpl(rax, 1); // 1 means deoptimize
6776     __ jcc(Assembler::equal, deoptimize_label);
6777 
6778     __ popa();
6779     __ pop(c_rarg0);
6780 
6781     __ leave();
6782 
6783     __ addptr(rsp, 1 * wordSize); // cookie
6784     __ ret(0);
6785 
6786 
6787     __ BIND(deoptimize_label);
6788 
6789     __ popa();
6790     __ pop(c_rarg0);
6791 
6792     __ leave();
6793 
6794     // This can be taken out, but is good for verification purposes. Getting a SIGSEGV
6795     // here while still having a correct stack is valuable.
6796     __ testptr(rsp, Address(rsp, 0));
6797 
6798     __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
6799     __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be the caller's verified_entry_point
6800 
6801     return start;
6802   }
6803 
6804    /**
6805    *  Arguments:
6806    *
6807    *  Input:
6808    *    c_rarg0   - out address
6809    *    c_rarg1   - in address
6810    *    c_rarg2   - offset
6811    *    c_rarg3   - len
6812    * not Win64
6813    *    c_rarg4   - k
6814    * Win64
6815    *    rsp+40    - k
6816    */
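  // A scalar sketch (illustration only, not generated code) of the primitive this stub
  // mirrors, BigInteger.implMulAdd: multiply the int magnitude in[0..len-1] by the 32-bit
  // value k and add the product into out[] at the appropriate position, propagating
  // carries; the final carry is the int return value (rax). The exact offset adjustment
  // relative to the output length is handled outside this loop.
  //
  //   uint64_t carry = 0;
  //   for (int j = len - 1; j >= 0; j--) {
  //     uint64_t product = (uint64_t) in[j] * k + out[offset] + carry;
  //     out[offset--] = (uint32_t) product;
  //     carry = product >> 32;
  //   }
  //   return (uint32_t) carry;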
6817   address generate_mulAdd() {
6818     __ align(CodeEntryAlignment);
6819     StubCodeMark mark(this, "StubRoutines", "mulAdd");
6820 
6821     address start = __ pc();
6822     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6823     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6824     const Register out     = rdi;
6825     const Register in      = rsi;
6826     const Register offset  = r11;
6827     const Register len     = rcx;
6828     const Register k       = r8;
6829 
6830     // The following registers will be saved on the stack in mul_add().
6831     const Register tmp1  = r12;
6832     const Register tmp2  = r13;
6833     const Register tmp3  = r14;
6834     const Register tmp4  = r15;
6835     const Register tmp5  = rbx;
6836 
6837     BLOCK_COMMENT("Entry:");
6838     __ enter(); // required for proper stackwalking of RuntimeStub frame
6839 
6840     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
6841                        // len => rcx, k => r8
6842                        // r9 and r10 may be used to save non-volatile registers
6843 #ifdef _WIN64
6844     // last argument is on stack on Win64
6845     __ movl(k, Address(rsp, 6 * wordSize));
6846 #endif
6847     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
6848     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
6849 
6850     restore_arg_regs();
6851 
6852     __ leave(); // required for proper stackwalking of RuntimeStub frame
6853     __ ret(0);
6854 
6855     return start;
6856   }
6857 
6858   address generate_bigIntegerRightShift() {
6859     __ align(CodeEntryAlignment);
6860     StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
6861 
6862     address start = __ pc();
6863     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
6864     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
6865     const Register newArr = rdi;
6866     const Register oldArr = rsi;
6867     const Register newIdx = rdx;
6868     const Register shiftCount = rcx;  // It was intentional to have shiftCount in rcx since it is used implicitly for shift.
6869     const Register totalNumIter = r8;
6870 
6871     // For Windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
6872     // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
6873     const Register tmp1 = r11;                    // Caller save.
6874     const Register tmp2 = rax;                    // Caller save.
6875     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
6876     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
6877     const Register tmp5 = r14;                    // Callee save.
6878     const Register tmp6 = r15;
6879 
6880     const XMMRegister x0 = xmm0;
6881     const XMMRegister x1 = xmm1;
6882     const XMMRegister x2 = xmm2;
6883 
6884     BLOCK_COMMENT("Entry:");
6885     __ enter(); // required for proper stackwalking of RuntimeStub frame
6886 
6887 #ifdef _WINDOWS
6888     setup_arg_regs(4);
6889     // For Windows, since the last argument is on the stack, we need to move it to the appropriate register.
6890     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
6891     // Save callee save registers.
6892     __ push(tmp3);
6893     __ push(tmp4);
6894 #endif
6895     __ push(tmp5);
6896 
6897     // Rename temps used throughout the code.
6898     const Register idx = tmp1;
6899     const Register nIdx = tmp2;
6900 
6901     __ xorl(idx, idx);
6902 
6903     // Start right shift from end of the array.
6904     // For example, if #iteration = 4 and newIdx = 1
6905     // then dest[4] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
6906     // if #iteration = 4 and newIdx = 0
6907     // then dest[3] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
6908     __ movl(idx, totalNumIter);
6909     __ movl(nIdx, idx);
6910     __ addl(nIdx, newIdx);
6911 
6912     // If vectorization is enabled, check if the number of iterations is at least 64.
6913     // If not, then go to ShiftTwo, processing 2 iterations at a time.
6914     if (VM_Version::supports_avx512_vbmi2()) {
6915       __ cmpptr(totalNumIter, (AVX3Threshold/64));
6916       __ jcc(Assembler::less, ShiftTwo);
6917 
6918       if (AVX3Threshold < 16 * 64) {
6919         __ cmpl(totalNumIter, 16);
6920         __ jcc(Assembler::less, ShiftTwo);
6921       }
6922       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
6923       __ subl(idx, 16);
6924       __ subl(nIdx, 16);
6925       __ BIND(Shift512Loop);
6926       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 4), Assembler::AVX_512bit);
6927       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
6928       __ vpshrdvd(x2, x1, x0, Assembler::AVX_512bit);
6929       __ evmovdqul(Address(newArr, nIdx, Address::times_4), x2, Assembler::AVX_512bit);
6930       __ subl(nIdx, 16);
6931       __ subl(idx, 16);
6932       __ jcc(Assembler::greaterEqual, Shift512Loop);
6933       __ addl(idx, 16);
6934       __ addl(nIdx, 16);
6935     }
6936     __ BIND(ShiftTwo);
6937     __ cmpl(idx, 2);
6938     __ jcc(Assembler::less, ShiftOne);
6939     __ subl(idx, 2);
6940     __ subl(nIdx, 2);
6941     __ BIND(ShiftTwoLoop);
6942     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 8));
6943     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
6944     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
6945     __ shrdl(tmp5, tmp4);
6946     __ shrdl(tmp4, tmp3);
6947     __ movl(Address(newArr, nIdx, Address::times_4, 4), tmp5);
6948     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
6949     __ subl(nIdx, 2);
6950     __ subl(idx, 2);
6951     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
6952     __ addl(idx, 2);
6953     __ addl(nIdx, 2);
6954 
6955     // Do the last iteration
6956     __ BIND(ShiftOne);
6957     __ cmpl(idx, 1);
6958     __ jcc(Assembler::less, Exit);
6959     __ subl(idx, 1);
6960     __ subl(nIdx, 1);
6961     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
6962     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
6963     __ shrdl(tmp4, tmp3);
6964     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
6965     __ BIND(Exit);
6966     // Restore callee save registers.
6967     __ pop(tmp5);
6968 #ifdef _WINDOWS
6969     __ pop(tmp4);
6970     __ pop(tmp3);
6971     restore_arg_regs();
6972 #endif
6973     __ leave(); // required for proper stackwalking of RuntimeStub frame
6974     __ ret(0);
6975     return start;
6976   }
6977 
6978    /**
6979    *  Arguments:
6980    *
6981    *  Input:
6982    *    c_rarg0   - newArr address
6983    *    c_rarg1   - oldArr address
6984    *    c_rarg2   - newIdx
6985    *    c_rarg3   - shiftCount
6986    * not Win64
6987    *    c_rarg4   - numIter
6988    * Win64
6989    *    rsp+40   - numIter
6990    */
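  // For illustration only: each step of the left-shift worker produces
  //   newArr[newIdx + i] = oldArr[i] << shiftCount | oldArr[i + 1] >> (32 - shiftCount)
  // mirroring BigInteger.shiftLeftImplWorker; the vectorized path handles 16 ints per
  // iteration, the scalar tail two at a time (shldl) and then one.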
6991   address generate_bigIntegerLeftShift() {
6992     __ align(CodeEntryAlignment);
6993     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
6994     address start = __ pc();
6995     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
6996     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
6997     const Register newArr = rdi;
6998     const Register oldArr = rsi;
6999     const Register newIdx = rdx;
7000     const Register shiftCount = rcx;  // It was intentional to have shiftCount in rcx since it is used implicitly for shift.
7001     const Register totalNumIter = r8;
7002     // For Windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
7003     // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
7004     const Register tmp1 = r11;                    // Caller save.
7005     const Register tmp2 = rax;                    // Caller save.
7006     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
7007     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
7008     const Register tmp5 = r14;                    // Callee save.
7009 
7010     const XMMRegister x0 = xmm0;
7011     const XMMRegister x1 = xmm1;
7012     const XMMRegister x2 = xmm2;
7013     BLOCK_COMMENT("Entry:");
7014     __ enter(); // required for proper stackwalking of RuntimeStub frame
7015 
7016 #ifdef _WINDOWS
7017     setup_arg_regs(4);
7018     // For Windows, since the last argument is on the stack, we need to move it to the appropriate register.
7019     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
7020     // Save callee save registers.
7021     __ push(tmp3);
7022     __ push(tmp4);
7023 #endif
7024     __ push(tmp5);
7025 
7026     // Rename temps used throughout the code
7027     const Register idx = tmp1;
7028     const Register numIterTmp = tmp2;
7029 
7030     // Start idx from zero.
7031     __ xorl(idx, idx);
7032     // Compute interior pointer for new array. We do this so that we can use same index for both old and new arrays.
7033     __ lea(newArr, Address(newArr, newIdx, Address::times_4));
7034     __ movl(numIterTmp, totalNumIter);
7035 
7036     // If vectorization is enabled, check if the number of iterations is at least 64.
7037     // If not, then go to ShiftTwo, shifting two numbers at a time.
7038     if (VM_Version::supports_avx512_vbmi2()) {
7039       __ cmpl(totalNumIter, (AVX3Threshold/64));
7040       __ jcc(Assembler::less, ShiftTwo);
7041 
7042       if (AVX3Threshold < 16 * 64) {
7043         __ cmpl(totalNumIter, 16);
7044         __ jcc(Assembler::less, ShiftTwo);
7045       }
7046       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
7047       __ subl(numIterTmp, 16);
7048       __ BIND(Shift512Loop);
7049       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7050       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 0x4), Assembler::AVX_512bit);
7051       __ vpshldvd(x1, x2, x0, Assembler::AVX_512bit);
7052       __ evmovdqul(Address(newArr, idx, Address::times_4), x1, Assembler::AVX_512bit);
7053       __ addl(idx, 16);
7054       __ subl(numIterTmp, 16);
7055       __ jcc(Assembler::greaterEqual, Shift512Loop);
7056       __ addl(numIterTmp, 16);
7057     }
7058     __ BIND(ShiftTwo);
7059     __ cmpl(totalNumIter, 1);
7060     __ jcc(Assembler::less, Exit);
7061     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7062     __ subl(numIterTmp, 2);
7063     __ jcc(Assembler::less, ShiftOne);
7064 
7065     __ BIND(ShiftTwoLoop);
7066     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7067     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 0x8));
7068     __ shldl(tmp3, tmp4);
7069     __ shldl(tmp4, tmp5);
7070     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7071     __ movl(Address(newArr, idx, Address::times_4, 0x4), tmp4);
7072     __ movl(tmp3, tmp5);
7073     __ addl(idx, 2);
7074     __ subl(numIterTmp, 2);
7075     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7076 
7077     // Do the last iteration
7078     __ BIND(ShiftOne);
7079     __ addl(numIterTmp, 2);
7080     __ cmpl(numIterTmp, 1);
7081     __ jcc(Assembler::less, Exit);
7082     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7083     __ shldl(tmp3, tmp4);
7084     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7085 
7086     __ BIND(Exit);
7087     // Restore callee save registers.
7088     __ pop(tmp5);
7089 #ifdef _WINDOWS
7090     __ pop(tmp4);
7091     __ pop(tmp3);
7092     restore_arg_regs();
7093 #endif
7094     __ leave(); // required for proper stackwalking of RuntimeStub frame
7095     __ ret(0);
7096     return start;
7097   }
7098 
7099   address generate_libmExp() {
7100     StubCodeMark mark(this, "StubRoutines", "libmExp");
7101 
7102     address start = __ pc();
7103 
7104     const XMMRegister x0  = xmm0;
7105     const XMMRegister x1  = xmm1;
7106     const XMMRegister x2  = xmm2;
7107     const XMMRegister x3  = xmm3;
7108 
7109     const XMMRegister x4  = xmm4;
7110     const XMMRegister x5  = xmm5;
7111     const XMMRegister x6  = xmm6;
7112     const XMMRegister x7  = xmm7;
7113 
7114     const Register tmp   = r11;
7115 
7116     BLOCK_COMMENT("Entry:");
7117     __ enter(); // required for proper stackwalking of RuntimeStub frame
7118 
7119     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7120 
7121     __ leave(); // required for proper stackwalking of RuntimeStub frame
7122     __ ret(0);
7123 
7124     return start;
7125 
7126   }
7127 
7128   address generate_libmLog() {
7129     StubCodeMark mark(this, "StubRoutines", "libmLog");
7130 
7131     address start = __ pc();
7132 
7133     const XMMRegister x0 = xmm0;
7134     const XMMRegister x1 = xmm1;
7135     const XMMRegister x2 = xmm2;
7136     const XMMRegister x3 = xmm3;
7137 
7138     const XMMRegister x4 = xmm4;
7139     const XMMRegister x5 = xmm5;
7140     const XMMRegister x6 = xmm6;
7141     const XMMRegister x7 = xmm7;
7142 
7143     const Register tmp1 = r11;
7144     const Register tmp2 = r8;
7145 
7146     BLOCK_COMMENT("Entry:");
7147     __ enter(); // required for proper stackwalking of RuntimeStub frame
7148 
7149     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
7150 
7151     __ leave(); // required for proper stackwalking of RuntimeStub frame
7152     __ ret(0);
7153 
7154     return start;
7155 
7156   }
7157 
7158   address generate_libmLog10() {
7159     StubCodeMark mark(this, "StubRoutines", "libmLog10");
7160 
7161     address start = __ pc();
7162 
7163     const XMMRegister x0 = xmm0;
7164     const XMMRegister x1 = xmm1;
7165     const XMMRegister x2 = xmm2;
7166     const XMMRegister x3 = xmm3;
7167 
7168     const XMMRegister x4 = xmm4;
7169     const XMMRegister x5 = xmm5;
7170     const XMMRegister x6 = xmm6;
7171     const XMMRegister x7 = xmm7;
7172 
7173     const Register tmp = r11;
7174 
7175     BLOCK_COMMENT("Entry:");
7176     __ enter(); // required for proper stackwalking of RuntimeStub frame
7177 
7178     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7179 
7180     __ leave(); // required for proper stackwalking of RuntimeStub frame
7181     __ ret(0);
7182 
7183     return start;
7184 
7185   }
7186 
7187   address generate_libmPow() {
7188     StubCodeMark mark(this, "StubRoutines", "libmPow");
7189 
7190     address start = __ pc();
7191 
7192     const XMMRegister x0 = xmm0;
7193     const XMMRegister x1 = xmm1;
7194     const XMMRegister x2 = xmm2;
7195     const XMMRegister x3 = xmm3;
7196 
7197     const XMMRegister x4 = xmm4;
7198     const XMMRegister x5 = xmm5;
7199     const XMMRegister x6 = xmm6;
7200     const XMMRegister x7 = xmm7;
7201 
7202     const Register tmp1 = r8;
7203     const Register tmp2 = r9;
7204     const Register tmp3 = r10;
7205     const Register tmp4 = r11;
7206 
7207     BLOCK_COMMENT("Entry:");
7208     __ enter(); // required for proper stackwalking of RuntimeStub frame
7209 
7210     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7211 
7212     __ leave(); // required for proper stackwalking of RuntimeStub frame
7213     __ ret(0);
7214 
7215     return start;
7216 
7217   }
7218 
7219   address generate_libmSin() {
7220     StubCodeMark mark(this, "StubRoutines", "libmSin");
7221 
7222     address start = __ pc();
7223 
7224     const XMMRegister x0 = xmm0;
7225     const XMMRegister x1 = xmm1;
7226     const XMMRegister x2 = xmm2;
7227     const XMMRegister x3 = xmm3;
7228 
7229     const XMMRegister x4 = xmm4;
7230     const XMMRegister x5 = xmm5;
7231     const XMMRegister x6 = xmm6;
7232     const XMMRegister x7 = xmm7;
7233 
7234     const Register tmp1 = r8;
7235     const Register tmp2 = r9;
7236     const Register tmp3 = r10;
7237     const Register tmp4 = r11;
7238 
7239     BLOCK_COMMENT("Entry:");
7240     __ enter(); // required for proper stackwalking of RuntimeStub frame
7241 
7242 #ifdef _WIN64
7243     __ push(rsi);
7244     __ push(rdi);
7245 #endif
7246     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7247 
7248 #ifdef _WIN64
7249     __ pop(rdi);
7250     __ pop(rsi);
7251 #endif
7252 
7253     __ leave(); // required for proper stackwalking of RuntimeStub frame
7254     __ ret(0);
7255 
7256     return start;
7257 
7258   }
7259 
7260   address generate_libmCos() {
7261     StubCodeMark mark(this, "StubRoutines", "libmCos");
7262 
7263     address start = __ pc();
7264 
7265     const XMMRegister x0 = xmm0;
7266     const XMMRegister x1 = xmm1;
7267     const XMMRegister x2 = xmm2;
7268     const XMMRegister x3 = xmm3;
7269 
7270     const XMMRegister x4 = xmm4;
7271     const XMMRegister x5 = xmm5;
7272     const XMMRegister x6 = xmm6;
7273     const XMMRegister x7 = xmm7;
7274 
7275     const Register tmp1 = r8;
7276     const Register tmp2 = r9;
7277     const Register tmp3 = r10;
7278     const Register tmp4 = r11;
7279 
7280     BLOCK_COMMENT("Entry:");
7281     __ enter(); // required for proper stackwalking of RuntimeStub frame
7282 
7283 #ifdef _WIN64
7284     __ push(rsi);
7285     __ push(rdi);
7286 #endif
7287     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7288 
7289 #ifdef _WIN64
7290     __ pop(rdi);
7291     __ pop(rsi);
7292 #endif
7293 
7294     __ leave(); // required for proper stackwalking of RuntimeStub frame
7295     __ ret(0);
7296 
7297     return start;
7298 
7299   }
7300 
7301   address generate_libmTan() {
7302     StubCodeMark mark(this, "StubRoutines", "libmTan");
7303 
7304     address start = __ pc();
7305 
7306     const XMMRegister x0 = xmm0;
7307     const XMMRegister x1 = xmm1;
7308     const XMMRegister x2 = xmm2;
7309     const XMMRegister x3 = xmm3;
7310 
7311     const XMMRegister x4 = xmm4;
7312     const XMMRegister x5 = xmm5;
7313     const XMMRegister x6 = xmm6;
7314     const XMMRegister x7 = xmm7;
7315 
7316     const Register tmp1 = r8;
7317     const Register tmp2 = r9;
7318     const Register tmp3 = r10;
7319     const Register tmp4 = r11;
7320 
7321     BLOCK_COMMENT("Entry:");
7322     __ enter(); // required for proper stackwalking of RuntimeStub frame
7323 
7324 #ifdef _WIN64
7325     __ push(rsi);
7326     __ push(rdi);
7327 #endif
7328     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7329 
7330 #ifdef _WIN64
7331     __ pop(rdi);
7332     __ pop(rsi);
7333 #endif
7334 
7335     __ leave(); // required for proper stackwalking of RuntimeStub frame
7336     __ ret(0);
7337 
7338     return start;
7339 
7340   }
7341 
7342 #undef __
7343 #define __ masm->
7344 
7345   // Continuation point for throwing of implicit exceptions that are
7346   // not handled in the current activation. Fabricates an exception
7347   // oop and initiates normal exception dispatching in this
7348   // frame. Since we need to preserve callee-saved values (currently
7349   // only for C2, but done for C1 as well) we need a callee-saved oop
7350   // map and therefore have to make these stubs into RuntimeStubs
7351   // rather than BufferBlobs.  If the compiler needs all registers to
7352   // be preserved between the fault point and the exception handler
7353   // then it must assume responsibility for that in
7354   // AbstractCompiler::continuation_for_implicit_null_exception or
7355   // continuation_for_implicit_division_by_zero_exception. All other
7356   // implicit exceptions (e.g., NullPointerException or
7357   // AbstractMethodError on entry) are either at call sites or
7358   // otherwise assume that stack unwinding will be initiated, so
7359   // caller saved registers were assumed volatile in the compiler.
7360   address generate_throw_exception(const char* name,
7361                                    address runtime_entry,
7362                                    Register arg1 = noreg,
7363                                    Register arg2 = noreg) {
7364     // Information about frame layout at time of blocking runtime call.
7365     // Note that we only have to preserve callee-saved registers since
7366     // the compilers are responsible for supplying a continuation point
7367     // if they expect all registers to be preserved.
7368     enum layout {
7369       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
7370       rbp_off2,
7371       return_off,
7372       return_off2,
7373       framesize // inclusive of return address
7374     };
7375 
7376     int insts_size = 512;
7377     int locs_size  = 64;
7378 
7379     CodeBuffer code(name, insts_size, locs_size);
7380     OopMapSet* oop_maps  = new OopMapSet();
7381     MacroAssembler* masm = new MacroAssembler(&code);
7382 
7383     address start = __ pc();
7384 
7385     // This is an inlined and slightly modified version of call_VM
7386     // which has the ability to fetch the return PC out of
7387     // thread-local storage and also sets up last_Java_sp slightly
7388     // differently than the real call_VM
7389 
7390     __ enter(); // required for proper stackwalking of RuntimeStub frame
7391 
7392     assert(is_even(framesize/2), "sp not 16-byte aligned");
7393 
7394     // return address and rbp are already in place
7395     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
7396 
7397     int frame_complete = __ pc() - start;
7398 
7399     // Set up last_Java_sp and last_Java_fp
7400     address the_pc = __ pc();
7401     __ set_last_Java_frame(rsp, rbp, the_pc);
7402     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
7403 
7404     // Call runtime
7405     if (arg1 != noreg) {
7406       assert(arg2 != c_rarg1, "clobbered");
7407       __ movptr(c_rarg1, arg1);
7408     }
7409     if (arg2 != noreg) {
7410       __ movptr(c_rarg2, arg2);
7411     }
7412     __ movptr(c_rarg0, r15_thread);
7413     BLOCK_COMMENT("call runtime_entry");
7414     __ call(RuntimeAddress(runtime_entry));
7415 
7416     // Generate oop map
7417     OopMap* map = new OopMap(framesize, 0);
7418 
7419     oop_maps->add_gc_map(the_pc - start, map);
7420 
7421     __ reset_last_Java_frame(true);
7422 
7423     __ leave(); // required for proper stackwalking of RuntimeStub frame
7424 
7425     // check for pending exceptions
7426 #ifdef ASSERT
7427     Label L;
7428     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
7429             (int32_t) NULL_WORD);
7430     __ jcc(Assembler::notEqual, L);
7431     __ should_not_reach_here();
7432     __ bind(L);
7433 #endif // ASSERT
7434     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7435 
7436 
7437     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7438     RuntimeStub* stub =
7439       RuntimeStub::new_runtime_stub(name,
7440                                     &code,
7441                                     frame_complete,
7442                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7443                                     oop_maps, false);
7444     return stub->entry_point();
7445   }
7446 
7447   void create_control_words() {
7448     // Round to nearest, 64-bit mode, exceptions masked
7449     StubRoutines::x86::_mxcsr_std = 0x1F80;
7450   }
7451 
7452   // Initialization
7453   void generate_initial() {
7454     // Generates all stubs and initializes the entry points
7455 
7456     // These platform-specific settings are needed by generate_call_stub()
7457     create_control_words();
7458 
7459     // entry points that exist in all platforms. Note: This is code
7460     // that could be shared among different platforms - however the
7461     // benefit seems to be smaller than the disadvantage of having a
7462     // much more complicated generator structure. See also comment in
7463     // stubRoutines.hpp.
7464 
7465     StubRoutines::_forward_exception_entry = generate_forward_exception();
7466 
7467     StubRoutines::_call_stub_entry =
7468       generate_call_stub(StubRoutines::_call_stub_return_address);
7469 
7470     // is referenced by megamorphic call
7471     StubRoutines::_catch_exception_entry = generate_catch_exception();
7472 
7473     // atomic calls
7474     StubRoutines::_fence_entry                = generate_orderaccess_fence();
7475 
7476     // platform dependent
7477     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
7478 
7479     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
7480 
7481     StubRoutines::x86::_f2i_fixup             = generate_f2i_fixup();
7482     StubRoutines::x86::_f2l_fixup             = generate_f2l_fixup();
7483     StubRoutines::x86::_d2i_fixup             = generate_d2i_fixup();
7484     StubRoutines::x86::_d2l_fixup             = generate_d2l_fixup();
7485 
7486     StubRoutines::x86::_float_sign_mask       = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
7487     StubRoutines::x86::_float_sign_flip       = generate_fp_mask("float_sign_flip",  0x8000000080000000);
7488     StubRoutines::x86::_double_sign_mask      = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7489     StubRoutines::x86::_double_sign_flip      = generate_fp_mask("double_sign_flip", 0x8000000000000000);
7490 
7491     // Build this early so it's available for the interpreter.
7492     StubRoutines::_throw_StackOverflowError_entry =
7493       generate_throw_exception("StackOverflowError throw_exception",
7494                                CAST_FROM_FN_PTR(address,
7495                                                 SharedRuntime::
7496                                                 throw_StackOverflowError));
7497     StubRoutines::_throw_delayed_StackOverflowError_entry =
7498       generate_throw_exception("delayed StackOverflowError throw_exception",
7499                                CAST_FROM_FN_PTR(address,
7500                                                 SharedRuntime::
7501                                                 throw_delayed_StackOverflowError));
7502     if (UseCRC32Intrinsics) {
7503       // set table address before stub generation which uses it
7504       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
7505       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7506     }
7507 
7508     if (UseCRC32CIntrinsics) {
7509       bool supports_clmul = VM_Version::supports_clmul();
7510       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
7511       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
7512       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
7513     }
7514 
7515     if (UseAdler32Intrinsics) {
7516        StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7517     }
7518 
7519     if (UseLibmIntrinsic && InlineIntrinsics) {
7520       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
7521           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
7522           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7523         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
7524         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
7525         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
7526         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
7527         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
7528         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
7529         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
7530         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
7531         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
7532         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
7533         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
7534         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
7535         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
7536         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
7537       }
7538       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
7539         StubRoutines::_dexp = generate_libmExp();
7540       }
7541       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7542         StubRoutines::_dlog = generate_libmLog();
7543       }
7544       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
7545         StubRoutines::_dlog10 = generate_libmLog10();
7546       }
7547       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
7548         StubRoutines::_dpow = generate_libmPow();
7549       }
7550       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7551         StubRoutines::_dsin = generate_libmSin();
7552       }
7553       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7554         StubRoutines::_dcos = generate_libmCos();
7555       }
7556       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7557         StubRoutines::_dtan = generate_libmTan();
7558       }
7559     }
7560 
7561     // Safefetch stubs.
7562     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7563                                                        &StubRoutines::_safefetch32_fault_pc,
7564                                                        &StubRoutines::_safefetch32_continuation_pc);
7565     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7566                                                        &StubRoutines::_safefetchN_fault_pc,
7567                                                        &StubRoutines::_safefetchN_continuation_pc);
7568   }
7569 
7570   void generate_all() {
7571     // Generates all stubs and initializes the entry points
7572 
7573     // These entry points require SharedInfo::stack0 to be set up in
7574     // non-core builds and need to be relocatable, so they each
7575     // fabricate a RuntimeStub internally.
7576     StubRoutines::_throw_AbstractMethodError_entry =
7577       generate_throw_exception("AbstractMethodError throw_exception",
7578                                CAST_FROM_FN_PTR(address,
7579                                                 SharedRuntime::
7580                                                 throw_AbstractMethodError));
7581 
7582     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7583       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7584                                CAST_FROM_FN_PTR(address,
7585                                                 SharedRuntime::
7586                                                 throw_IncompatibleClassChangeError));
7587 
7588     StubRoutines::_throw_NullPointerException_at_call_entry =
7589       generate_throw_exception("NullPointerException at call throw_exception",
7590                                CAST_FROM_FN_PTR(address,
7591                                                 SharedRuntime::
7592                                                 throw_NullPointerException_at_call));
7593 
7594     // entry points that are platform specific
7595     StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
7596     StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000);
7597     StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7598     StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000);
7599     StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF);
7600     StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
7601     StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
7602     StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff000000ff);
7603     StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff0000ffff);
7604     StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
7605                                                                         0xFFFFFFFF, 0, 0, 0);
7606     StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
7607                                                                         0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
7608     StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x0302010003020100);
7609     StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
7610     StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x0100010001000100);
7611     StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask("vector_long_shuffle_mask", 0x0000000100000000);
7612     StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
7613     StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
7614 
7615     // support for verify_oop (must happen after universe_init)
7616     if (VerifyOops) {
7617       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
7618     }
7619 
7620     // data cache line writeback
7621     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7622     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7623 
7624     // arraycopy stubs used by compilers
7625     generate_arraycopy_stubs();
7626 
7627     // don't bother generating these AES intrinsic stubs unless global flag is set
7628     if (UseAESIntrinsics) {
7629       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
7630       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7631       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7632       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7633       if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
7634         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
7635         StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt();
7636         StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt();
7637       } else {
7638         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
7639       }
7640     }
7641     if (UseAESCTRIntrinsics) {
7642       if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512bw() && VM_Version::supports_avx512vl()) {
7643         StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
7644         StubRoutines::_counterMode_AESCrypt = generate_counterMode_VectorAESCrypt();
7645       } else {
7646         StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
7647         StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
7648       }
7649     }
7650 
7651     if (UseMD5Intrinsics) {
7652       StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
7653       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
7654     }
7655     if (UseSHA1Intrinsics) {
7656       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
7657       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
7658       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
7659       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
7660     }
7661     if (UseSHA256Intrinsics) {
7662       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
7663       char* dst = (char*)StubRoutines::x86::_k256_W;
7664       char* src = (char*)StubRoutines::x86::_k256;
7665       for (int ii = 0; ii < 16; ++ii) {
7666         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
7667         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
7668       }
7669       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
7670       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
7671       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
7672       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
7673     }
7674     if (UseSHA512Intrinsics) {
7675       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
7676       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
7677       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
7678       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
7679     }
7680 
7681     // Generate GHASH intrinsics code
7682     if (UseGHASHIntrinsics) {
7683       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
7684       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
7685       if (VM_Version::supports_avx()) {
7686         StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
7687         StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
7688         StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
7689       } else {
7690         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7691       }
7692     }
7693 
7694 
7695     if (UseBASE64Intrinsics) {
7696       if (VM_Version::supports_avx2() &&
7697          VM_Version::supports_avx512bw() &&
7698          VM_Version::supports_avx512vl()) {
7699         StubRoutines::x86::_avx2_shuffle_base64 = base64_avx2_shuffle_addr();
7700         StubRoutines::x86::_avx2_input_mask_base64 = base64_avx2_input_mask_addr();
7701         StubRoutines::x86::_avx2_lut_base64 = base64_avx2_lut_addr();
7702       }
7703       StubRoutines::x86::_encoding_table_base64 = base64_encoding_table_addr();
7704       if (VM_Version::supports_avx512_vbmi()) {
7705         StubRoutines::x86::_shuffle_base64 = base64_shuffle_addr();
7706         StubRoutines::x86::_lookup_lo_base64 = base64_vbmi_lookup_lo_addr();
7707         StubRoutines::x86::_lookup_hi_base64 = base64_vbmi_lookup_hi_addr();
7708         StubRoutines::x86::_lookup_lo_base64url = base64_vbmi_lookup_lo_url_addr();
7709         StubRoutines::x86::_lookup_hi_base64url = base64_vbmi_lookup_hi_url_addr();
7710         StubRoutines::x86::_pack_vec_base64 = base64_vbmi_pack_vec_addr();
7711         StubRoutines::x86::_join_0_1_base64 = base64_vbmi_join_0_1_addr();
7712         StubRoutines::x86::_join_1_2_base64 = base64_vbmi_join_1_2_addr();
7713         StubRoutines::x86::_join_2_3_base64 = base64_vbmi_join_2_3_addr();
7714       }
7715       StubRoutines::x86::_decoding_table_base64 = base64_decoding_table_addr();
7716       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7717       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7718     }
7719 
7720     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7721     if (bs_nm != NULL) {
7722       StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
7723     }
7724 #ifdef COMPILER2
7725     if (UseMultiplyToLenIntrinsic) {
7726       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7727     }
7728     if (UseSquareToLenIntrinsic) {
7729       StubRoutines::_squareToLen = generate_squareToLen();
7730     }
7731     if (UseMulAddIntrinsic) {
7732       StubRoutines::_mulAdd = generate_mulAdd();
7733     }
7734     if (VM_Version::supports_avx512_vbmi2()) {
7735       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7736       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
7737     }
7738     if (UseMontgomeryMultiplyIntrinsic) {
7739       StubRoutines::_montgomeryMultiply
7740         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
7741     }
7742     if (UseMontgomerySquareIntrinsic) {
7743       StubRoutines::_montgomerySquare
7744         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
7745     }
7746 
7747     // Get svml stub routine addresses
7748     void *libjsvml = NULL;
7749     char ebuf[1024];
7750     char dll_name[JVM_MAXPATHLEN];
7751     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "jsvml")) {
7752       libjsvml = os::dll_load(dll_name, ebuf, sizeof ebuf);
7753     }
7754     if (libjsvml != NULL) {
7755       // SVML method naming convention
7756       //   All the methods are named as __jsvml_op<T><N>_ha_<VV>
7757       //   Where:
7758       //      ha stands for high accuracy
7759       //      <T> is optional to indicate float/double
7760       //              Set to f for vector float operation
7761       //              Omitted for vector double operation
7762       //      <N> is the number of elements in the vector
7763       //              1, 2, 4, 8, 16
7764       //              e.g. 128 bit float vector has 4 float elements
7765       //      <VV> indicates the avx/sse level:
7766       //              z0 is AVX512, l9 is AVX2, e9 is AVX1 and ex is for SSE2
7767       //      e.g. __jsvml_expf16_ha_z0 is the method for computing 16 element vector float exp using AVX 512 insns
7768       //           __jsvml_exp8_ha_z0 is the method for computing 8 element vector double exp using AVX 512 insns
7769 
7770       log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "jsvml" JNI_LIB_SUFFIX, p2i(libjsvml));
7771       if (UseAVX > 2) {
7772         for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
7773           int vop = VectorSupport::VECTOR_OP_SVML_START + op;
7774           if ((!VM_Version::supports_avx512dq()) &&
7775               (vop == VectorSupport::VECTOR_OP_LOG || vop == VectorSupport::VECTOR_OP_LOG10 || vop == VectorSupport::VECTOR_OP_POW)) {
7776             continue;
7777           }
7778           snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf16_ha_z0", VectorSupport::svmlname[op]);
7779           StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libjsvml, ebuf);
7780 
7781           snprintf(ebuf, sizeof(ebuf), "__jsvml_%s8_ha_z0", VectorSupport::svmlname[op]);
7782           StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libjsvml, ebuf);
7783         }
7784       }
7785       const char* avx_sse_str = (UseAVX >= 2) ? "l9" : ((UseAVX == 1) ? "e9" : "ex");
7786       for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
7787         int vop = VectorSupport::VECTOR_OP_SVML_START + op;
7788         if (vop == VectorSupport::VECTOR_OP_POW) {
7789           continue;
7790         }
7791         snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7792         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libjsvml, ebuf);
7793 
7794         snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7795         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libjsvml, ebuf);
7796 
7797         snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf8_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7798         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libjsvml, ebuf);
7799 
7800         snprintf(ebuf, sizeof(ebuf), "__jsvml_%s1_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7801         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libjsvml, ebuf);
7802 
7803         snprintf(ebuf, sizeof(ebuf), "__jsvml_%s2_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7804         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libjsvml, ebuf);
7805 
7806         snprintf(ebuf, sizeof(ebuf), "__jsvml_%s4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7807         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libjsvml, ebuf);
7808       }
7809     }
7810 #endif // COMPILER2
7811 
7812     if (UseVectorizedMismatchIntrinsic) {
7813       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
7814     }
7815   }
7816 
7817  public:
7818   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7819     if (all) {
7820       generate_all();
7821     } else {
7822       generate_initial();
7823     }
7824   }
7825 }; // end class declaration
7826 
7827 #define UCM_TABLE_MAX_ENTRIES 16
7828 void StubGenerator_generate(CodeBuffer* code, bool all) {
7829   if (UnsafeCopyMemory::_table == NULL) {
7830     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7831   }
7832   StubGenerator g(code, all);
7833 }