1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "ci/ciUtilities.hpp"
  29 #include "compiler/oopMap.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/barrierSetNMethod.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "interpreter/interpreter.hpp"
  35 #include "memory/universe.hpp"
  36 #include "nativeInst_x86.hpp"
  37 #include "oops/instanceOop.hpp"
  38 #include "oops/method.hpp"
  39 #include "oops/objArrayKlass.hpp"
  40 #include "oops/oop.inline.hpp"
  41 #include "prims/jvmtiExport.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/arguments.hpp"
  44 #include "runtime/continuation.hpp"
  45 #include "runtime/frame.inline.hpp"
  46 #include "runtime/handles.inline.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubCodeGenerator.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "runtime/thread.inline.hpp"
  51 #ifdef COMPILER2
  52 #include "opto/runtime.hpp"
  53 #endif
  54 #if INCLUDE_JVMCI
  55 #include "jvmci/jvmci_globals.hpp"
  56 #endif
  57 #if INCLUDE_ZGC
  58 #include "gc/z/zThreadLocalData.hpp"
  59 #endif
  60 #if INCLUDE_JFR
  61 #include "jfr/support/jfrIntrinsics.hpp"
  62 #endif
  63 
  64 // Declaration and definition of StubGenerator (no .hpp file).
  65 // For a more detailed description of the stub routine structure
  66 // see the comment in stubRoutines.hpp
  67 
  68 #define __ _masm->
  69 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  70 #define a__ ((Assembler*)_masm)->
  71 
  72 #ifdef PRODUCT
  73 #define BLOCK_COMMENT(str) /* nothing */
  74 #else
  75 #define BLOCK_COMMENT(str) __ block_comment(str)
  76 #endif
  77 
  78 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  79 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
  80 
  81 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots);
  82 void fill_continuation_entry(MacroAssembler* masm);
  83 void continuation_enter_cleanup(MacroAssembler* masm);
  84 
  85 // Stub Code definitions
  86 
  87 class StubGenerator: public StubCodeGenerator {
  88  private:
  89 
  90 #ifdef PRODUCT
  91 #define inc_counter_np(counter) ((void)0)
  92 #else
  93   void inc_counter_np_(int& counter) {
  94     // This can destroy rscratch1 if counter is far from the code cache
  95     __ incrementl(ExternalAddress((address)&counter));
  96   }
  97 #define inc_counter_np(counter) \
  98   BLOCK_COMMENT("inc_counter " #counter); \
  99   inc_counter_np_(counter);
 100 #endif
 101 
 102   // Call stubs are used to call Java from C
 103   //
 104   // Linux Arguments:
 105   //    c_rarg0:   call wrapper address                   address
 106   //    c_rarg1:   result                                 address
 107   //    c_rarg2:   result type                            BasicType
 108   //    c_rarg3:   method                                 Method*
 109   //    c_rarg4:   (interpreter) entry point              address
 110   //    c_rarg5:   parameters                             intptr_t*
 111   //    16(rbp): parameter size (in words)              int
 112   //    24(rbp): thread                                 Thread*
 113   //
 114   //     [ return_from_Java     ] <--- rsp
 115   //     [ argument word n      ]
 116   //      ...
 117   // -12 [ argument word 1      ]
 118   // -11 [ saved r15            ] <--- rsp_after_call
 119   // -10 [ saved r14            ]
 120   //  -9 [ saved r13            ]
 121   //  -8 [ saved r12            ]
 122   //  -7 [ saved rbx            ]
 123   //  -6 [ call wrapper         ]
 124   //  -5 [ result               ]
 125   //  -4 [ result type          ]
 126   //  -3 [ method               ]
 127   //  -2 [ entry point          ]
 128   //  -1 [ parameters           ]
 129   //   0 [ saved rbp            ] <--- rbp
 130   //   1 [ return address       ]
 131   //   2 [ parameter size       ]
 132   //   3 [ thread               ]
 133   //
 134   // Windows Arguments:
 135   //    c_rarg0:   call wrapper address                   address
 136   //    c_rarg1:   result                                 address
 137   //    c_rarg2:   result type                            BasicType
 138   //    c_rarg3:   method                                 Method*
 139   //    48(rbp): (interpreter) entry point              address
 140   //    56(rbp): parameters                             intptr_t*
 141   //    64(rbp): parameter size (in words)              int
 142   //    72(rbp): thread                                 Thread*
 143   //
 144   //     [ return_from_Java     ] <--- rsp
 145   //     [ argument word n      ]
 146   //      ...
 147   // -60 [ argument word 1      ]
 148   // -59 [ saved xmm31          ] <--- rsp after_call
 149   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 150   // -27 [ saved xmm15          ]
 151   //     [ saved xmm7-xmm14     ]
 152   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 153   //  -7 [ saved r15            ]
 154   //  -6 [ saved r14            ]
 155   //  -5 [ saved r13            ]
 156   //  -4 [ saved r12            ]
 157   //  -3 [ saved rdi            ]
 158   //  -2 [ saved rsi            ]
 159   //  -1 [ saved rbx            ]
 160   //   0 [ saved rbp            ] <--- rbp
 161   //   1 [ return address       ]
 162   //   2 [ call wrapper         ]
 163   //   3 [ result               ]
 164   //   4 [ result type          ]
 165   //   5 [ method               ]
 166   //   6 [ entry point          ]
 167   //   7 [ parameters           ]
 168   //   8 [ parameter size       ]
 169   //   9 [ thread               ]
 170   //
 171   //    Windows reserves the callers stack space for arguments 1-4.
 172   //    We spill c_rarg0-c_rarg3 to this space.
 173 
 174   // Call stub stack layout word offsets from rbp
 175   enum call_stub_layout {
 176 #ifdef _WIN64
 177     xmm_save_first     = 6,  // save from xmm6
 178     xmm_save_last      = 31, // to xmm31
 179     xmm_save_base      = -9,
 180     rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
 181     r15_off            = -7,
 182     r14_off            = -6,
 183     r13_off            = -5,
 184     r12_off            = -4,
 185     rdi_off            = -3,
 186     rsi_off            = -2,
 187     rbx_off            = -1,
 188     rbp_off            =  0,
 189     retaddr_off        =  1,
 190     call_wrapper_off   =  2,
 191     result_off         =  3,
 192     result_type_off    =  4,
 193     method_off         =  5,
 194     entry_point_off    =  6,
 195     parameters_off     =  7,
 196     parameter_size_off =  8,
 197     thread_off         =  9
 198 #else
 199     rsp_after_call_off = -12,
 200     mxcsr_off          = rsp_after_call_off,
 201     r15_off            = -11,
 202     r14_off            = -10,
 203     r13_off            = -9,
 204     r12_off            = -8,
 205     rbx_off            = -7,
 206     call_wrapper_off   = -6,
 207     result_off         = -5,
 208     result_type_off    = -4,
 209     method_off         = -3,
 210     entry_point_off    = -2,
 211     parameters_off     = -1,
 212     rbp_off            =  0,
 213     retaddr_off        =  1,
 214     parameter_size_off =  2,
 215     thread_off         =  3
 216 #endif
 217   };
 218 
 219 #ifdef _WIN64
 220   Address xmm_save(int reg) {
 221     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 222     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 223   }
 224 #endif
 225 
 226   address generate_call_stub(address& return_address) {
 227     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 228            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 229            "adjust this code");
 230     StubCodeMark mark(this, "StubRoutines", "call_stub");
 231     address start = __ pc();
 232 
 233     // same as in generate_catch_exception()!
 234     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 235 
 236     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 237     const Address result        (rbp, result_off         * wordSize);
 238     const Address result_type   (rbp, result_type_off    * wordSize);
 239     const Address method        (rbp, method_off         * wordSize);
 240     const Address entry_point   (rbp, entry_point_off    * wordSize);
 241     const Address parameters    (rbp, parameters_off     * wordSize);
 242     const Address parameter_size(rbp, parameter_size_off * wordSize);
 243 
 244     // same as in generate_catch_exception()!
 245     const Address thread        (rbp, thread_off         * wordSize);
 246 
 247     const Address r15_save(rbp, r15_off * wordSize);
 248     const Address r14_save(rbp, r14_off * wordSize);
 249     const Address r13_save(rbp, r13_off * wordSize);
 250     const Address r12_save(rbp, r12_off * wordSize);
 251     const Address rbx_save(rbp, rbx_off * wordSize);
 252 
 253     // stub code
 254     __ enter();
 255     __ subptr(rsp, -rsp_after_call_off * wordSize);
 256 
 257     // save register parameters
 258 #ifndef _WIN64
 259     __ movptr(parameters,   c_rarg5); // parameters
 260     __ movptr(entry_point,  c_rarg4); // entry_point
 261 #endif
 262 
 263     __ movptr(method,       c_rarg3); // method
 264     __ movl(result_type,  c_rarg2);   // result type
 265     __ movptr(result,       c_rarg1); // result
 266     __ movptr(call_wrapper, c_rarg0); // call wrapper
 267 
 268     // save regs belonging to calling function
 269     __ movptr(rbx_save, rbx);
 270     __ movptr(r12_save, r12);
 271     __ movptr(r13_save, r13);
 272     __ movptr(r14_save, r14);
 273     __ movptr(r15_save, r15);
 274 
 275 #ifdef _WIN64
 276     int last_reg = 15;
 277     if (UseAVX > 2) {
 278       last_reg = 31;
 279     }
 280     if (VM_Version::supports_evex()) {
 281       for (int i = xmm_save_first; i <= last_reg; i++) {
 282         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 283       }
 284     } else {
 285       for (int i = xmm_save_first; i <= last_reg; i++) {
 286         __ movdqu(xmm_save(i), as_XMMRegister(i));
 287       }
 288     }
 289 
 290     const Address rdi_save(rbp, rdi_off * wordSize);
 291     const Address rsi_save(rbp, rsi_off * wordSize);
 292 
 293     __ movptr(rsi_save, rsi);
 294     __ movptr(rdi_save, rdi);
 295 #else
 296     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 297     {
 298       Label skip_ldmx;
 299       __ stmxcsr(mxcsr_save);
 300       __ movl(rax, mxcsr_save);
 301       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 302       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 303       __ cmp32(rax, mxcsr_std);
 304       __ jcc(Assembler::equal, skip_ldmx);
 305       __ ldmxcsr(mxcsr_std);
 306       __ bind(skip_ldmx);
 307     }
 308 #endif
 309 
 310     // Load up thread register
 311     __ movptr(r15_thread, thread);
 312     __ reinit_heapbase();
 313 
 314 #ifdef ASSERT
 315     // make sure we have no pending exceptions
 316     {
 317       Label L;
 318       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 319       __ jcc(Assembler::equal, L);
 320       __ stop("StubRoutines::call_stub: entered with pending exception");
 321       __ bind(L);
 322     }
 323 #endif
 324 
 325     // pass parameters if any
 326     BLOCK_COMMENT("pass parameters if any");
 327     Label parameters_done;
 328     __ movl(c_rarg3, parameter_size);
 329     __ testl(c_rarg3, c_rarg3);
 330     __ jcc(Assembler::zero, parameters_done);
 331 
 332     Label loop;
 333     __ movptr(c_rarg2, parameters);       // parameter pointer
 334     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 335     __ BIND(loop);
 336     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 337     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 338     __ decrementl(c_rarg1);             // decrement counter
 339     __ push(rax);                       // pass parameter
 340     __ jcc(Assembler::notZero, loop);
 341 
 342     // call Java function
 343     __ BIND(parameters_done);
 344     __ movptr(rbx, method);             // get Method*
 345     __ movptr(c_rarg1, entry_point);    // get entry_point
 346     __ mov(r13, rsp);                   // set sender sp
 347     BLOCK_COMMENT("call Java function");
 348     __ call(c_rarg1);
 349 
 350     BLOCK_COMMENT("call_stub_return_address:");
 351     return_address = __ pc();
 352 
 353     // store result depending on type (everything that is not
 354     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 355     __ movptr(c_rarg0, result);
 356     Label is_long, is_float, is_double, exit;
 357     __ movl(c_rarg1, result_type);
 358     __ cmpl(c_rarg1, T_OBJECT);
 359     __ jcc(Assembler::equal, is_long);
 360     __ cmpl(c_rarg1, T_LONG);
 361     __ jcc(Assembler::equal, is_long);
 362     __ cmpl(c_rarg1, T_FLOAT);
 363     __ jcc(Assembler::equal, is_float);
 364     __ cmpl(c_rarg1, T_DOUBLE);
 365     __ jcc(Assembler::equal, is_double);
 366 
 367     // handle T_INT case
 368     __ movl(Address(c_rarg0, 0), rax);
 369 
 370     __ BIND(exit);
 371 
 372     // pop parameters
 373     __ lea(rsp, rsp_after_call);
 374 
 375 #ifdef ASSERT
 376     // verify that threads correspond
 377     {
 378      Label L1, L2, L3;
 379       __ cmpptr(r15_thread, thread);
 380       __ jcc(Assembler::equal, L1);
 381       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 382       __ bind(L1);
 383       __ get_thread(rbx);
 384       __ cmpptr(r15_thread, thread);
 385       __ jcc(Assembler::equal, L2);
 386       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 387       __ bind(L2);
 388       __ cmpptr(r15_thread, rbx);
 389       __ jcc(Assembler::equal, L3);
 390       __ stop("StubRoutines::call_stub: threads must correspond");
 391       __ bind(L3);
 392     }
 393 #endif
 394 
 395     __ pop_cont_fastpath(r15_thread);
 396 
 397     // restore regs belonging to calling function
 398 #ifdef _WIN64
 399     // emit the restores for xmm regs
 400     if (VM_Version::supports_evex()) {
 401       for (int i = xmm_save_first; i <= last_reg; i++) {
 402         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 403       }
 404     } else {
 405       for (int i = xmm_save_first; i <= last_reg; i++) {
 406         __ movdqu(as_XMMRegister(i), xmm_save(i));
 407       }
 408     }
 409 #endif
 410     __ movptr(r15, r15_save);
 411     __ movptr(r14, r14_save);
 412     __ movptr(r13, r13_save);
 413     __ movptr(r12, r12_save);
 414     __ movptr(rbx, rbx_save);
 415 
 416 #ifdef _WIN64
 417     __ movptr(rdi, rdi_save);
 418     __ movptr(rsi, rsi_save);
 419 #else
 420     __ ldmxcsr(mxcsr_save);
 421 #endif
 422 
 423     // restore rsp
 424     __ addptr(rsp, -rsp_after_call_off * wordSize);
 425 
 426     // return
 427     __ vzeroupper();
 428     __ pop(rbp);
 429     __ ret(0);
 430 
 431     // handle return types different from T_INT
 432     __ BIND(is_long);
 433     __ movq(Address(c_rarg0, 0), rax);
 434     __ jmp(exit);
 435 
 436     __ BIND(is_float);
 437     __ movflt(Address(c_rarg0, 0), xmm0);
 438     __ jmp(exit);
 439 
 440     __ BIND(is_double);
 441     __ movdbl(Address(c_rarg0, 0), xmm0);
 442     __ jmp(exit);
 443 
 444     return start;
 445   }
 446 
 447   // Return point for a Java call if there's an exception thrown in
 448   // Java code.  The exception is caught and transformed into a
 449   // pending exception stored in JavaThread that can be tested from
 450   // within the VM.
 451   //
 452   // Note: Usually the parameters are removed by the callee. In case
 453   // of an exception crossing an activation frame boundary, that is
 454   // not the case if the callee is compiled code => need to setup the
 455   // rsp.
 456   //
 457   // rax: exception oop
 458 
 459   address generate_catch_exception() {
 460     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 461     address start = __ pc();
 462 
 463     // same as in generate_call_stub():
 464     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 465     const Address thread        (rbp, thread_off         * wordSize);
 466 
 467 #ifdef ASSERT
 468     // verify that threads correspond
 469     {
 470       Label L1, L2, L3;
 471       __ cmpptr(r15_thread, thread);
 472       __ jcc(Assembler::equal, L1);
 473       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 474       __ bind(L1);
 475       __ get_thread(rbx);
 476       __ cmpptr(r15_thread, thread);
 477       __ jcc(Assembler::equal, L2);
 478       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 479       __ bind(L2);
 480       __ cmpptr(r15_thread, rbx);
 481       __ jcc(Assembler::equal, L3);
 482       __ stop("StubRoutines::catch_exception: threads must correspond");
 483       __ bind(L3);
 484     }
 485 #endif
 486 
 487     // set pending exception
 488     __ verify_oop(rax);
 489 
 490     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 491     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 492     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 493     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 494 
 495     // complete return to VM
 496     assert(StubRoutines::_call_stub_return_address != NULL,
 497            "_call_stub_return_address must have been generated before");
 498     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 499 
 500     return start;
 501   }
 502 
 503   // Continuation point for runtime calls returning with a pending
 504   // exception.  The pending exception check happened in the runtime
 505   // or native call stub.  The pending exception in Thread is
 506   // converted into a Java-level exception.
 507   //
 508   // Contract with Java-level exception handlers:
 509   // rax: exception
 510   // rdx: throwing pc
 511   //
 512   // NOTE: At entry of this stub, exception-pc must be on stack !!
 513 
 514   address generate_forward_exception() {
 515     StubCodeMark mark(this, "StubRoutines", "forward exception");
 516     address start = __ pc();
 517 
 518     // Upon entry, the sp points to the return address returning into
 519     // Java (interpreted or compiled) code; i.e., the return address
 520     // becomes the throwing pc.
 521     //
 522     // Arguments pushed before the runtime call are still on the stack
 523     // but the exception handler will reset the stack pointer ->
 524     // ignore them.  A potential result in registers can be ignored as
 525     // well.
 526 
 527 #ifdef ASSERT
 528     // make sure this code is only executed if there is a pending exception
 529     {
 530       Label L;
 531       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL);
 532       __ jcc(Assembler::notEqual, L);
 533       __ stop("StubRoutines::forward exception: no pending exception (1)");
 534       __ bind(L);
 535     }
 536 #endif
 537 
 538     // compute exception handler into rbx
 539     __ movptr(c_rarg0, Address(rsp, 0));
 540     BLOCK_COMMENT("call exception_handler_for_return_address");
 541     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 542                          SharedRuntime::exception_handler_for_return_address),
 543                     r15_thread, c_rarg0);
 544     __ mov(rbx, rax);
 545 
 546     // setup rax & rdx, remove return address & clear pending exception
 547     __ pop(rdx);
 548     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 549     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 550 
 551 #ifdef ASSERT
 552     // make sure exception is set
 553     {
 554       Label L;
 555       __ testptr(rax, rax);
 556       __ jcc(Assembler::notEqual, L);
 557       __ stop("StubRoutines::forward exception: no pending exception (2)");
 558       __ bind(L);
 559     }
 560 #endif
 561 
 562     // continue at exception handler (return address removed)
 563     // rax: exception
 564     // rbx: exception handler
 565     // rdx: throwing pc
 566     __ verify_oop(rax);
 567     __ jmp(rbx);
 568 
 569     return start;
 570   }
 571 
 572   // Support for intptr_t OrderAccess::fence()
 573   //
 574   // Arguments :
 575   //
 576   // Result:
 577   address generate_orderaccess_fence() {
 578     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 579     address start = __ pc();
 580     __ membar(Assembler::StoreLoad);
 581     __ ret(0);
 582 
 583     return start;
 584   }
 585 
 586 
 587   // Support for intptr_t get_previous_sp()
 588   //
 589   // This routine is used to find the previous stack pointer for the
 590   // caller.
 591   address generate_get_previous_sp() {
 592     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
 593     address start = __ pc();
 594 
 595     __ movptr(rax, rsp);
 596     __ addptr(rax, 8); // return address is at the top of the stack.
 597     __ ret(0);
 598 
 599     return start;
 600   }
 601 
 602   //----------------------------------------------------------------------------------------------------
 603   // Support for void verify_mxcsr()
 604   //
 605   // This routine is used with -Xcheck:jni to verify that native
 606   // JNI code does not return to Java code without restoring the
 607   // MXCSR register to our expected state.
 608 
 609   address generate_verify_mxcsr() {
 610     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 611     address start = __ pc();
 612 
 613     const Address mxcsr_save(rsp, 0);
 614 
 615     if (CheckJNICalls) {
 616       Label ok_ret;
 617       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 618       __ push(rax);
 619       __ subptr(rsp, wordSize);      // allocate a temp location
 620       __ stmxcsr(mxcsr_save);
 621       __ movl(rax, mxcsr_save);
 622       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 623       __ cmp32(rax, mxcsr_std);
 624       __ jcc(Assembler::equal, ok_ret);
 625 
 626       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 627 
 628       __ ldmxcsr(mxcsr_std);
 629 
 630       __ bind(ok_ret);
 631       __ addptr(rsp, wordSize);
 632       __ pop(rax);
 633     }
 634 
 635     __ ret(0);
 636 
 637     return start;
 638   }
 639 
 640   address generate_f2i_fixup() {
 641     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
 642     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 643 
 644     address start = __ pc();
 645 
 646     Label L;
 647 
 648     __ push(rax);
 649     __ push(c_rarg3);
 650     __ push(c_rarg2);
 651     __ push(c_rarg1);
 652 
 653     __ movl(rax, 0x7f800000);
 654     __ xorl(c_rarg3, c_rarg3);
 655     __ movl(c_rarg2, inout);
 656     __ movl(c_rarg1, c_rarg2);
 657     __ andl(c_rarg1, 0x7fffffff);
 658     __ cmpl(rax, c_rarg1); // NaN? -> 0
 659     __ jcc(Assembler::negative, L);
 660     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
 661     __ movl(c_rarg3, 0x80000000);
 662     __ movl(rax, 0x7fffffff);
 663     __ cmovl(Assembler::positive, c_rarg3, rax);
 664 
 665     __ bind(L);
 666     __ movptr(inout, c_rarg3);
 667 
 668     __ pop(c_rarg1);
 669     __ pop(c_rarg2);
 670     __ pop(c_rarg3);
 671     __ pop(rax);
 672 
 673     __ ret(0);
 674 
 675     return start;
 676   }
 677 
 678   address generate_f2l_fixup() {
 679     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
 680     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 681     address start = __ pc();
 682 
 683     Label L;
 684 
 685     __ push(rax);
 686     __ push(c_rarg3);
 687     __ push(c_rarg2);
 688     __ push(c_rarg1);
 689 
 690     __ movl(rax, 0x7f800000);
 691     __ xorl(c_rarg3, c_rarg3);
 692     __ movl(c_rarg2, inout);
 693     __ movl(c_rarg1, c_rarg2);
 694     __ andl(c_rarg1, 0x7fffffff);
 695     __ cmpl(rax, c_rarg1); // NaN? -> 0
 696     __ jcc(Assembler::negative, L);
 697     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
 698     __ mov64(c_rarg3, 0x8000000000000000);
 699     __ mov64(rax, 0x7fffffffffffffff);
 700     __ cmov(Assembler::positive, c_rarg3, rax);
 701 
 702     __ bind(L);
 703     __ movptr(inout, c_rarg3);
 704 
 705     __ pop(c_rarg1);
 706     __ pop(c_rarg2);
 707     __ pop(c_rarg3);
 708     __ pop(rax);
 709 
 710     __ ret(0);
 711 
 712     return start;
 713   }
 714 
 715   address generate_d2i_fixup() {
 716     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
 717     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 718 
 719     address start = __ pc();
 720 
 721     Label L;
 722 
 723     __ push(rax);
 724     __ push(c_rarg3);
 725     __ push(c_rarg2);
 726     __ push(c_rarg1);
 727     __ push(c_rarg0);
 728 
 729     __ movl(rax, 0x7ff00000);
 730     __ movq(c_rarg2, inout);
 731     __ movl(c_rarg3, c_rarg2);
 732     __ mov(c_rarg1, c_rarg2);
 733     __ mov(c_rarg0, c_rarg2);
 734     __ negl(c_rarg3);
 735     __ shrptr(c_rarg1, 0x20);
 736     __ orl(c_rarg3, c_rarg2);
 737     __ andl(c_rarg1, 0x7fffffff);
 738     __ xorl(c_rarg2, c_rarg2);
 739     __ shrl(c_rarg3, 0x1f);
 740     __ orl(c_rarg1, c_rarg3);
 741     __ cmpl(rax, c_rarg1);
 742     __ jcc(Assembler::negative, L); // NaN -> 0
 743     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
 744     __ movl(c_rarg2, 0x80000000);
 745     __ movl(rax, 0x7fffffff);
 746     __ cmov(Assembler::positive, c_rarg2, rax);
 747 
 748     __ bind(L);
 749     __ movptr(inout, c_rarg2);
 750 
 751     __ pop(c_rarg0);
 752     __ pop(c_rarg1);
 753     __ pop(c_rarg2);
 754     __ pop(c_rarg3);
 755     __ pop(rax);
 756 
 757     __ ret(0);
 758 
 759     return start;
 760   }
 761 
 762   address generate_d2l_fixup() {
 763     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
 764     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 765 
 766     address start = __ pc();
 767 
 768     Label L;
 769 
 770     __ push(rax);
 771     __ push(c_rarg3);
 772     __ push(c_rarg2);
 773     __ push(c_rarg1);
 774     __ push(c_rarg0);
 775 
 776     __ movl(rax, 0x7ff00000);
 777     __ movq(c_rarg2, inout);
 778     __ movl(c_rarg3, c_rarg2);
 779     __ mov(c_rarg1, c_rarg2);
 780     __ mov(c_rarg0, c_rarg2);
 781     __ negl(c_rarg3);
 782     __ shrptr(c_rarg1, 0x20);
 783     __ orl(c_rarg3, c_rarg2);
 784     __ andl(c_rarg1, 0x7fffffff);
 785     __ xorl(c_rarg2, c_rarg2);
 786     __ shrl(c_rarg3, 0x1f);
 787     __ orl(c_rarg1, c_rarg3);
 788     __ cmpl(rax, c_rarg1);
 789     __ jcc(Assembler::negative, L); // NaN -> 0
 790     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
 791     __ mov64(c_rarg2, 0x8000000000000000);
 792     __ mov64(rax, 0x7fffffffffffffff);
 793     __ cmovq(Assembler::positive, c_rarg2, rax);
 794 
 795     __ bind(L);
 796     __ movq(inout, c_rarg2);
 797 
 798     __ pop(c_rarg0);
 799     __ pop(c_rarg1);
 800     __ pop(c_rarg2);
 801     __ pop(c_rarg3);
 802     __ pop(rax);
 803 
 804     __ ret(0);
 805 
 806     return start;
 807   }
 808 
 809   address generate_iota_indices(const char *stub_name) {
 810     __ align(CodeEntryAlignment);
 811     StubCodeMark mark(this, "StubRoutines", stub_name);
 812     address start = __ pc();
 813     __ emit_data64(0x0706050403020100, relocInfo::none);
 814     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 815     __ emit_data64(0x1716151413121110, relocInfo::none);
 816     __ emit_data64(0x1F1E1D1C1B1A1918, relocInfo::none);
 817     __ emit_data64(0x2726252423222120, relocInfo::none);
 818     __ emit_data64(0x2F2E2D2C2B2A2928, relocInfo::none);
 819     __ emit_data64(0x3736353433323130, relocInfo::none);
 820     __ emit_data64(0x3F3E3D3C3B3A3938, relocInfo::none);
 821     return start;
 822   }
 823 
 824   address generate_vector_byte_shuffle_mask(const char *stub_name) {
 825     __ align(CodeEntryAlignment);
 826     StubCodeMark mark(this, "StubRoutines", stub_name);
 827     address start = __ pc();
 828     __ emit_data64(0x7070707070707070, relocInfo::none);
 829     __ emit_data64(0x7070707070707070, relocInfo::none);
 830     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 831     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 832     return start;
 833   }
 834 
 835   address generate_fp_mask(const char *stub_name, int64_t mask) {
 836     __ align(CodeEntryAlignment);
 837     StubCodeMark mark(this, "StubRoutines", stub_name);
 838     address start = __ pc();
 839 
 840     __ emit_data64( mask, relocInfo::none );
 841     __ emit_data64( mask, relocInfo::none );
 842 
 843     return start;
 844   }
 845 
 846   address generate_vector_mask(const char *stub_name, int64_t mask) {
 847     __ align(CodeEntryAlignment);
 848     StubCodeMark mark(this, "StubRoutines", stub_name);
 849     address start = __ pc();
 850 
 851     __ emit_data64(mask, relocInfo::none);
 852     __ emit_data64(mask, relocInfo::none);
 853     __ emit_data64(mask, relocInfo::none);
 854     __ emit_data64(mask, relocInfo::none);
 855     __ emit_data64(mask, relocInfo::none);
 856     __ emit_data64(mask, relocInfo::none);
 857     __ emit_data64(mask, relocInfo::none);
 858     __ emit_data64(mask, relocInfo::none);
 859 
 860     return start;
 861   }
 862 
 863   address generate_vector_byte_perm_mask(const char *stub_name) {
 864     __ align(CodeEntryAlignment);
 865     StubCodeMark mark(this, "StubRoutines", stub_name);
 866     address start = __ pc();
 867 
 868     __ emit_data64(0x0000000000000001, relocInfo::none);
 869     __ emit_data64(0x0000000000000003, relocInfo::none);
 870     __ emit_data64(0x0000000000000005, relocInfo::none);
 871     __ emit_data64(0x0000000000000007, relocInfo::none);
 872     __ emit_data64(0x0000000000000000, relocInfo::none);
 873     __ emit_data64(0x0000000000000002, relocInfo::none);
 874     __ emit_data64(0x0000000000000004, relocInfo::none);
 875     __ emit_data64(0x0000000000000006, relocInfo::none);
 876 
 877     return start;
 878   }
 879 
 880   address generate_vector_fp_mask(const char *stub_name, int64_t mask) {
 881     __ align(CodeEntryAlignment);
 882     StubCodeMark mark(this, "StubRoutines", stub_name);
 883     address start = __ pc();
 884 
 885     __ emit_data64(mask, relocInfo::none);
 886     __ emit_data64(mask, relocInfo::none);
 887     __ emit_data64(mask, relocInfo::none);
 888     __ emit_data64(mask, relocInfo::none);
 889     __ emit_data64(mask, relocInfo::none);
 890     __ emit_data64(mask, relocInfo::none);
 891     __ emit_data64(mask, relocInfo::none);
 892     __ emit_data64(mask, relocInfo::none);
 893 
 894     return start;
 895   }
 896 
 897   address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
 898                                      int32_t val0, int32_t val1, int32_t val2, int32_t val3,
 899                                      int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
 900                                      int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
 901                                      int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
 902     __ align(CodeEntryAlignment);
 903     StubCodeMark mark(this, "StubRoutines", stub_name);
 904     address start = __ pc();
 905 
 906     assert(len != Assembler::AVX_NoVec, "vector len must be specified");
 907     __ emit_data(val0, relocInfo::none, 0);
 908     __ emit_data(val1, relocInfo::none, 0);
 909     __ emit_data(val2, relocInfo::none, 0);
 910     __ emit_data(val3, relocInfo::none, 0);
 911     if (len >= Assembler::AVX_256bit) {
 912       __ emit_data(val4, relocInfo::none, 0);
 913       __ emit_data(val5, relocInfo::none, 0);
 914       __ emit_data(val6, relocInfo::none, 0);
 915       __ emit_data(val7, relocInfo::none, 0);
 916       if (len >= Assembler::AVX_512bit) {
 917         __ emit_data(val8, relocInfo::none, 0);
 918         __ emit_data(val9, relocInfo::none, 0);
 919         __ emit_data(val10, relocInfo::none, 0);
 920         __ emit_data(val11, relocInfo::none, 0);
 921         __ emit_data(val12, relocInfo::none, 0);
 922         __ emit_data(val13, relocInfo::none, 0);
 923         __ emit_data(val14, relocInfo::none, 0);
 924         __ emit_data(val15, relocInfo::none, 0);
 925       }
 926     }
 927 
 928     return start;
 929   }
 930 
 931   // Non-destructive plausibility checks for oops
 932   //
 933   // Arguments:
 934   //    all args on stack!
 935   //
 936   // Stack after saving c_rarg3:
 937   //    [tos + 0]: saved c_rarg3
 938   //    [tos + 1]: saved c_rarg2
 939   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
 940   //    [tos + 3]: saved flags
 941   //    [tos + 4]: return address
 942   //  * [tos + 5]: error message (char*)
 943   //  * [tos + 6]: object to verify (oop)
 944   //  * [tos + 7]: saved rax - saved by caller and bashed
 945   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
 946   //  * = popped on exit
 947   address generate_verify_oop() {
 948     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 949     address start = __ pc();
 950 
 951     Label exit, error;
 952 
 953     __ pushf();
 954     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 955 
 956     __ push(r12);
 957 
 958     // save c_rarg2 and c_rarg3
 959     __ push(c_rarg2);
 960     __ push(c_rarg3);
 961 
 962     enum {
 963            // After previous pushes.
 964            oop_to_verify = 6 * wordSize,
 965            saved_rax     = 7 * wordSize,
 966            saved_r10     = 8 * wordSize,
 967 
 968            // Before the call to MacroAssembler::debug(), see below.
 969            return_addr   = 16 * wordSize,
 970            error_msg     = 17 * wordSize
 971     };
 972 
 973     // get object
 974     __ movptr(rax, Address(rsp, oop_to_verify));
 975 
 976     // make sure object is 'reasonable'
 977     __ testptr(rax, rax);
 978     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
 979 
 980 #if INCLUDE_ZGC
 981     if (UseZGC) {
 982       // Check if metadata bits indicate a bad oop
 983       __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
 984       __ jcc(Assembler::notZero, error);
 985     }
 986 #endif
 987 
 988     // Check if the oop is in the right area of memory
 989     __ movptr(c_rarg2, rax);
 990     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 991     __ andptr(c_rarg2, c_rarg3);
 992     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 993     __ cmpptr(c_rarg2, c_rarg3);
 994     __ jcc(Assembler::notZero, error);
 995 
 996     // make sure klass is 'reasonable', which is not zero.
 997     __ load_klass(rax, rax, rscratch1);  // get klass
 998     __ testptr(rax, rax);
 999     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1000 
1001     // return if everything seems ok
1002     __ bind(exit);
1003     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1004     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1005     __ pop(c_rarg3);                             // restore c_rarg3
1006     __ pop(c_rarg2);                             // restore c_rarg2
1007     __ pop(r12);                                 // restore r12
1008     __ popf();                                   // restore flags
1009     __ ret(4 * wordSize);                        // pop caller saved stuff
1010 
1011     // handle errors
1012     __ bind(error);
1013     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1014     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1015     __ pop(c_rarg3);                             // get saved c_rarg3 back
1016     __ pop(c_rarg2);                             // get saved c_rarg2 back
1017     __ pop(r12);                                 // get saved r12 back
1018     __ popf();                                   // get saved flags off stack --
1019                                                  // will be ignored
1020 
1021     __ pusha();                                  // push registers
1022                                                  // (rip is already
1023                                                  // already pushed)
1024     // debug(char* msg, int64_t pc, int64_t regs[])
1025     // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
1026     // pushed all the registers, so now the stack looks like:
1027     //     [tos +  0] 16 saved registers
1028     //     [tos + 16] return address
1029     //   * [tos + 17] error message (char*)
1030     //   * [tos + 18] object to verify (oop)
1031     //   * [tos + 19] saved rax - saved by caller and bashed
1032     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1033     //   * = popped on exit
1034 
1035     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1036     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1037     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1038     __ mov(r12, rsp);                               // remember rsp
1039     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1040     __ andptr(rsp, -16);                            // align stack as required by ABI
1041     BLOCK_COMMENT("call MacroAssembler::debug");
1042     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1043     __ hlt();
1044     return start;
1045   }
1046 
1047   //
1048   // Verify that a register contains clean 32-bits positive value
1049   // (high 32-bits are 0) so it could be used in 64-bits shifts.
1050   //
1051   //  Input:
1052   //    Rint  -  32-bits value
1053   //    Rtmp  -  scratch
1054   //
1055   void assert_clean_int(Register Rint, Register Rtmp) {
1056 #ifdef ASSERT
1057     Label L;
1058     assert_different_registers(Rtmp, Rint);
1059     __ movslq(Rtmp, Rint);
1060     __ cmpq(Rtmp, Rint);
1061     __ jcc(Assembler::equal, L);
1062     __ stop("high 32-bits of int value are not 0");
1063     __ bind(L);
1064 #endif
1065   }
1066 
1067   //  Generate overlap test for array copy stubs
1068   //
1069   //  Input:
1070   //     c_rarg0 - from
1071   //     c_rarg1 - to
1072   //     c_rarg2 - element count
1073   //
1074   //  Output:
1075   //     rax   - &from[element count - 1]
1076   //
1077   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1078     assert(no_overlap_target != NULL, "must be generated");
1079     array_overlap_test(no_overlap_target, NULL, sf);
1080   }
1081   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1082     array_overlap_test(NULL, &L_no_overlap, sf);
1083   }
1084   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1085     const Register from     = c_rarg0;
1086     const Register to       = c_rarg1;
1087     const Register count    = c_rarg2;
1088     const Register end_from = rax;
1089 
1090     __ cmpptr(to, from);
1091     __ lea(end_from, Address(from, count, sf, 0));
1092     if (NOLp == NULL) {
1093       ExternalAddress no_overlap(no_overlap_target);
1094       __ jump_cc(Assembler::belowEqual, no_overlap);
1095       __ cmpptr(to, end_from);
1096       __ jump_cc(Assembler::aboveEqual, no_overlap);
1097     } else {
1098       __ jcc(Assembler::belowEqual, (*NOLp));
1099       __ cmpptr(to, end_from);
1100       __ jcc(Assembler::aboveEqual, (*NOLp));
1101     }
1102   }
1103 
1104   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1105   //
1106   // Outputs:
1107   //    rdi - rcx
1108   //    rsi - rdx
1109   //    rdx - r8
1110   //    rcx - r9
1111   //
1112   // Registers r9 and r10 are used to save rdi and rsi on Windows, which latter
1113   // are non-volatile.  r9 and r10 should not be used by the caller.
1114   //
1115   DEBUG_ONLY(bool regs_in_thread;)
1116 
1117   void setup_arg_regs(int nargs = 3) {
1118     const Register saved_rdi = r9;
1119     const Register saved_rsi = r10;
1120     assert(nargs == 3 || nargs == 4, "else fix");
1121 #ifdef _WIN64
1122     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1123            "unexpected argument registers");
1124     if (nargs >= 4)
1125       __ mov(rax, r9);  // r9 is also saved_rdi
1126     __ movptr(saved_rdi, rdi);
1127     __ movptr(saved_rsi, rsi);
1128     __ mov(rdi, rcx); // c_rarg0
1129     __ mov(rsi, rdx); // c_rarg1
1130     __ mov(rdx, r8);  // c_rarg2
1131     if (nargs >= 4)
1132       __ mov(rcx, rax); // c_rarg3 (via rax)
1133 #else
1134     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1135            "unexpected argument registers");
1136 #endif
1137     DEBUG_ONLY(regs_in_thread = false;)
1138   }
1139 
1140   void restore_arg_regs() {
1141     assert(!regs_in_thread, "wrong call to restore_arg_regs");
1142     const Register saved_rdi = r9;
1143     const Register saved_rsi = r10;
1144 #ifdef _WIN64
1145     __ movptr(rdi, saved_rdi);
1146     __ movptr(rsi, saved_rsi);
1147 #endif
1148   }
1149 
1150   // This is used in places where r10 is a scratch register, and can
1151   // be adapted if r9 is needed also.
1152   void setup_arg_regs_using_thread() {
1153     const Register saved_r15 = r9;
1154 #ifdef _WIN64
1155     __ mov(saved_r15, r15);  // r15 is callee saved and needs to be restored
1156     __ get_thread(r15_thread);
1157     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1158            "unexpected argument registers");
1159     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
1160     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);
1161 
1162     __ mov(rdi, rcx); // c_rarg0
1163     __ mov(rsi, rdx); // c_rarg1
1164     __ mov(rdx, r8);  // c_rarg2
1165 #else
1166     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1167            "unexpected argument registers");
1168 #endif
1169     DEBUG_ONLY(regs_in_thread = true;)
1170   }
1171 
1172   void restore_arg_regs_using_thread() {
1173     assert(regs_in_thread, "wrong call to restore_arg_regs");
1174     const Register saved_r15 = r9;
1175 #ifdef _WIN64
1176     __ get_thread(r15_thread);
1177     __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
1178     __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
1179     __ mov(r15, saved_r15);  // r15 is callee saved and needs to be restored
1180 #endif
1181   }
1182 
1183   // Copy big chunks forward
1184   //
1185   // Inputs:
1186   //   end_from     - source arrays end address
1187   //   end_to       - destination array end address
1188   //   qword_count  - 64-bits element count, negative
1189   //   to           - scratch
1190   //   L_copy_bytes - entry label
1191   //   L_copy_8_bytes  - exit  label
1192   //
1193   void copy_bytes_forward(Register end_from, Register end_to,
1194                              Register qword_count, Register to,
1195                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1196     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1197     Label L_loop;
1198     __ align(OptoLoopAlignment);
1199     if (UseUnalignedLoadStores) {
1200       Label L_end;
1201       __ BIND(L_loop);
1202       if (UseAVX >= 2) {
1203         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1204         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1205         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1206         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1207       } else {
1208         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1209         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1210         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1211         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1212         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1213         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1214         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1215         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1216       }
1217 
1218       __ BIND(L_copy_bytes);
1219       __ addptr(qword_count, 8);
1220       __ jcc(Assembler::lessEqual, L_loop);
1221       __ subptr(qword_count, 4);  // sub(8) and add(4)
1222       __ jccb(Assembler::greater, L_end);
1223       // Copy trailing 32 bytes
1224       if (UseAVX >= 2) {
1225         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1226         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1227       } else {
1228         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1229         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1230         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1231         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1232       }
1233       __ addptr(qword_count, 4);
1234       __ BIND(L_end);
1235       if (UseAVX >= 2) {
1236         // clean upper bits of YMM registers
1237         __ vpxor(xmm0, xmm0);
1238         __ vpxor(xmm1, xmm1);
1239       }
1240     } else {
1241       // Copy 32-bytes per iteration
1242       __ BIND(L_loop);
1243       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1244       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1245       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1246       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1247       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1248       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1249       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1250       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1251 
1252       __ BIND(L_copy_bytes);
1253       __ addptr(qword_count, 4);
1254       __ jcc(Assembler::lessEqual, L_loop);
1255     }
1256     __ subptr(qword_count, 4);
1257     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1258   }
1259 
1260   // Copy big chunks backward
1261   //
1262   // Inputs:
1263   //   from         - source arrays address
1264   //   dest         - destination array address
1265   //   qword_count  - 64-bits element count
1266   //   to           - scratch
1267   //   L_copy_bytes - entry label
1268   //   L_copy_8_bytes  - exit  label
1269   //
1270   void copy_bytes_backward(Register from, Register dest,
1271                               Register qword_count, Register to,
1272                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1273     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1274     Label L_loop;
1275     __ align(OptoLoopAlignment);
1276     if (UseUnalignedLoadStores) {
1277       Label L_end;
1278       __ BIND(L_loop);
1279       if (UseAVX >= 2) {
1280         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1281         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1282         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1283         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1284       } else {
1285         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1286         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1287         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1288         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1289         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1290         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1291         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1292         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1293       }
1294 
1295       __ BIND(L_copy_bytes);
1296       __ subptr(qword_count, 8);
1297       __ jcc(Assembler::greaterEqual, L_loop);
1298 
1299       __ addptr(qword_count, 4);  // add(8) and sub(4)
1300       __ jccb(Assembler::less, L_end);
1301       // Copy trailing 32 bytes
1302       if (UseAVX >= 2) {
1303         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1304         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1305       } else {
1306         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1307         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1308         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1309         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1310       }
1311       __ subptr(qword_count, 4);
1312       __ BIND(L_end);
1313       if (UseAVX >= 2) {
1314         // clean upper bits of YMM registers
1315         __ vpxor(xmm0, xmm0);
1316         __ vpxor(xmm1, xmm1);
1317       }
1318     } else {
1319       // Copy 32-bytes per iteration
1320       __ BIND(L_loop);
1321       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1322       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1323       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1324       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1325       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1326       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1327       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1328       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1329 
1330       __ BIND(L_copy_bytes);
1331       __ subptr(qword_count, 4);
1332       __ jcc(Assembler::greaterEqual, L_loop);
1333     }
1334     __ addptr(qword_count, 4);
1335     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1336   }
1337 
1338 #ifndef PRODUCT
1339     int& get_profile_ctr(int shift) {
1340       if ( 0 == shift)
1341         return SharedRuntime::_jbyte_array_copy_ctr;
1342       else if(1 == shift)
1343         return SharedRuntime::_jshort_array_copy_ctr;
1344       else if(2 == shift)
1345         return SharedRuntime::_jint_array_copy_ctr;
1346       else
1347         return SharedRuntime::_jlong_array_copy_ctr;
1348     }
1349 #endif
1350 
1351   void setup_argument_regs(BasicType type) {
1352     if (type == T_BYTE || type == T_SHORT) {
1353       setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1354                         // r9 and r10 may be used to save non-volatile registers
1355     } else {
1356       setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1357                                      // r9 is used to save r15_thread
1358     }
1359   }
1360 
1361   void restore_argument_regs(BasicType type) {
1362     if (type == T_BYTE || type == T_SHORT) {
1363       restore_arg_regs();
1364     } else {
1365       restore_arg_regs_using_thread();
1366     }
1367   }
1368 
1369 #if COMPILER2_OR_JVMCI
1370   // Note: Following rules apply to AVX3 optimized arraycopy stubs:-
1371   // - If target supports AVX3 features (BW+VL+F) then implementation uses 32 byte vectors (YMMs)
1372   //   for both special cases (various small block sizes) and aligned copy loop. This is the
1373   //   default configuration.
1374   // - If copy length is above AVX3Threshold, then implementation use 64 byte vectors (ZMMs)
1375   //   for main copy loop (and subsequent tail) since bulk of the cycles will be consumed in it.
1376   // - If user forces MaxVectorSize=32 then above 4096 bytes its seen that REP MOVs shows a
1377   //   better performance for disjoint copies. For conjoint/backward copy vector based
1378   //   copy performs better.
1379   // - If user sets AVX3Threshold=0, then special cases for small blocks sizes operate over
1380   //   64 byte vector registers (ZMMs).
1381 
1382   // Inputs:
1383   //   c_rarg0   - source array address
1384   //   c_rarg1   - destination array address
1385   //   c_rarg2   - element count, treated as ssize_t, can be zero
1386   //
1387   //
1388   // Side Effects:
1389   //   disjoint_copy_avx3_masked is set to the no-overlap entry point
1390   //   used by generate_conjoint_[byte/int/short/long]_copy().
1391   //
1392 
1393   address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1394                                              bool aligned, bool is_oop, bool dest_uninitialized) {
1395     __ align(CodeEntryAlignment);
1396     StubCodeMark mark(this, "StubRoutines", name);
1397     address start = __ pc();
1398 
1399     bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
1400     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1401     Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1402     const Register from        = rdi;  // source array address
1403     const Register to          = rsi;  // destination array address
1404     const Register count       = rdx;  // elements count
1405     const Register temp1       = r8;
1406     const Register temp2       = r11;
1407     const Register temp3       = rax;
1408     const Register temp4       = rcx;
1409     // End pointers are inclusive, and if count is not zero they point
1410     // to the last unit copied:  end_to[0] := end_from[0]
1411 
1412     __ enter(); // required for proper stackwalking of RuntimeStub frame
1413     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1414 
1415     if (entry != NULL) {
1416       *entry = __ pc();
1417        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1418       BLOCK_COMMENT("Entry:");
1419     }
1420 
1421     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1422     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1423 
1424     setup_argument_regs(type);
1425 
1426     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1427     if (dest_uninitialized) {
1428       decorators |= IS_DEST_UNINITIALIZED;
1429     }
1430     if (aligned) {
1431       decorators |= ARRAYCOPY_ALIGNED;
1432     }
1433     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1434     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1435 
1436     {
1437       // Type(shift)           byte(0), short(1), int(2),   long(3)
1438       int loop_size[]        = { 192,     96,       48,      24};
1439       int threshold[]        = { 4096,    2048,     1024,    512};
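           // For every element size these tables describe the same byte quantities:
           // loop_size[shift] is one 192 byte main-loop block (e.g. 24 longs * 8 bytes)
           // and threshold[shift] is 4096 bytes (e.g. 512 longs * 8 bytes).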
1440 
1441       // UnsafeCopyMemory page error: continue after ucm
1442       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1443       // 'from', 'to' and 'count' are now valid
1444 
1445       // temp1 holds remaining count and temp4 holds running count used to compute
1446       // next address offset for start of to/from addresses (temp4 * scale).
1447       __ mov64(temp4, 0);
1448       __ movq(temp1, count);
1449 
1450       // Zero length check.
1451       __ BIND(L_tail);
1452       __ cmpq(temp1, 0);
1453       __ jcc(Assembler::lessEqual, L_exit);
1454 
1455       // Special cases using 32 byte [masked] vector copy operations.
1456       __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1457                                       temp4, temp3, use64byteVector, L_entry, L_exit);
1458 
1459       // PRE-MAIN-POST loop for aligned copy.
1460       __ BIND(L_entry);
1461 
1462       if (AVX3Threshold != 0) {
1463         __ cmpq(count, threshold[shift]);
1464         if (MaxVectorSize == 64) {
1465           // Copy using 64 byte vectors.
1466           __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1467         } else {
1468           assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
1469         // REP MOVS offers a faster copy path.
1470           __ jcc(Assembler::greaterEqual, L_repmovs);
1471         }
1472       }
1473 
1474       if (MaxVectorSize < 64  || AVX3Threshold != 0) {
1475         // Partial copy to make dst address 32 byte aligned.
1476         __ movq(temp2, to);
1477         __ andq(temp2, 31);
1478         __ jcc(Assembler::equal, L_main_pre_loop);
1479 
1480         __ negptr(temp2);
1481         __ addq(temp2, 32);
1482         if (shift) {
1483           __ shrq(temp2, shift);
1484         }
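             // temp2 now holds the number of leading elements needed to reach a 32 byte
             // aligned destination, e.g. for an int copy (shift == 2) with (to & 31) == 24
             // this is (32 - 24) >> 2 == 2 elements.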
1485         __ movq(temp3, temp2);
1486         __ copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
1487         __ movq(temp4, temp2);
1488         __ movq(temp1, count);
1489         __ subq(temp1, temp2);
1490 
1491         __ cmpq(temp1, loop_size[shift]);
1492         __ jcc(Assembler::less, L_tail);
1493 
1494         __ BIND(L_main_pre_loop);
1495         __ subq(temp1, loop_size[shift]);
1496 
1497         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1498         __ align32();
1499         __ BIND(L_main_loop);
1500            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
1501            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
1502            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128);
1503            __ addptr(temp4, loop_size[shift]);
1504            __ subq(temp1, loop_size[shift]);
1505            __ jcc(Assembler::greater, L_main_loop);
1506 
1507         __ addq(temp1, loop_size[shift]);
1508 
1509         // Tail loop.
1510         __ jmp(L_tail);
1511 
1512         __ BIND(L_repmovs);
1513           __ movq(temp2, temp1);
1514           // Swap to (RSI) and from (RDI) addresses to comply with REP MOVS semantics.
1515           __ movq(temp3, to);
1516           __ movq(to,  from);
1517           __ movq(from, temp3);
1518           // Save to/from for restoration post rep_mov.
1519           __ movq(temp1, to);
1520           __ movq(temp3, from);
1521           if (shift < 3) {
1522             __ shrq(temp2, 3-shift);     // quad word count
1523           }
1524           __ movq(temp4, temp2);         // move quad word count into temp4 (RCX).
1525           __ rep_mov();
1526           __ shlq(temp2, 3);             // convert quad words into byte count.
1527           if (shift) {
1528             __ shrq(temp2, shift);       // type specific count.
1529           }
1530           // Restore original addresses in to/from.
1531           __ movq(to, temp3);
1532           __ movq(from, temp1);
1533           __ movq(temp4, temp2);
1534           __ movq(temp1, count);
1535           __ subq(temp1, temp2);         // trailing part (less than a quad word).
1536           __ jmp(L_tail);
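               // Worked example (shift == 1, i.e. shorts): for N remaining elements temp2
               // becomes N / 4 quad words, REP MOVS copies that many quad words, temp2 is
               // converted back to (N / 4) * 4 elements, and temp1 = N - temp2 leaves the
               // remaining 0..3 elements for the tail handling above.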
1537       }
1538 
1539       if (MaxVectorSize > 32) {
1540         __ BIND(L_pre_main_post_64);
1541         // Partial copy to make dst address 64 byte aligned.
1542         __ movq(temp2, to);
1543         __ andq(temp2, 63);
1544         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1545 
1546         __ negptr(temp2);
1547         __ addq(temp2, 64);
1548         if (shift) {
1549           __ shrq(temp2, shift);
1550         }
1551         __ movq(temp3, temp2);
1552         __ copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
1553         __ movq(temp4, temp2);
1554         __ movq(temp1, count);
1555         __ subq(temp1, temp2);
1556 
1557         __ cmpq(temp1, loop_size[shift]);
1558         __ jcc(Assembler::less, L_tail64);
1559 
1560         __ BIND(L_main_pre_loop_64bytes);
1561         __ subq(temp1, loop_size[shift]);
1562 
1563         // Main loop with aligned copy block size of 192 bytes at
1564         // 64 byte copy granularity.
1565         __ align32();
1566         __ BIND(L_main_loop_64bytes);
1567            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
1568            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
1569            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
1570            __ addptr(temp4, loop_size[shift]);
1571            __ subq(temp1, loop_size[shift]);
1572            __ jcc(Assembler::greater, L_main_loop_64bytes);
1573 
1574         __ addq(temp1, loop_size[shift]);
1575         // Zero length check.
1576         __ jcc(Assembler::lessEqual, L_exit);
1577 
1578         __ BIND(L_tail64);
1579 
1580         // Tail handling using 64 byte [masked] vector copy operations.
1581         use64byteVector = true;
1582         __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1583                                         temp4, temp3, use64byteVector, L_entry, L_exit);
1584       }
1585       __ BIND(L_exit);
1586     }
1587 
1588     address ucme_exit_pc = __ pc();
1589     // When called from generic_arraycopy, r11 contains specific values
1590     // used during the arraycopy epilogue, so re-initialize r11 here.
1591     if (is_oop) {
1592       __ movq(r11, shift == 3 ? count : to);
1593     }
1594     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1595     restore_argument_regs(type);
1596     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1597     __ xorptr(rax, rax); // return 0
1598     __ vzeroupper();
1599     __ leave(); // required for proper stackwalking of RuntimeStub frame
1600     __ ret(0);
1601     return start;
1602   }
1603 
1604   // Inputs:
1605   //   c_rarg0   - source array address
1606   //   c_rarg1   - destination array address
1607   //   c_rarg2   - element count, treated as ssize_t, can be zero
1608   //
1609   //
1610   address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1611                                              address nooverlap_target, bool aligned, bool is_oop,
1612                                              bool dest_uninitialized) {
1613     __ align(CodeEntryAlignment);
1614     StubCodeMark mark(this, "StubRoutines", name);
1615     address start = __ pc();
1616 
1617     bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
1618 
1619     Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1620     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1621     const Register from        = rdi;  // source array address
1622     const Register to          = rsi;  // destination array address
1623     const Register count       = rdx;  // elements count
1624     const Register temp1       = r8;
1625     const Register temp2       = rcx;
1626     const Register temp3       = r11;
1627     const Register temp4       = rax;
1628     // End pointers are inclusive, and if count is not zero they point
1629     // to the last unit copied:  end_to[0] := end_from[0]
1630 
1631     __ enter(); // required for proper stackwalking of RuntimeStub frame
1632     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1633 
1634     if (entry != NULL) {
1635       *entry = __ pc();
1636        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1637       BLOCK_COMMENT("Entry:");
1638     }
1639 
1640     array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
1641 
1642     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1643     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1644 
1645     setup_argument_regs(type);
1646 
1647     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1648     if (dest_uninitialized) {
1649       decorators |= IS_DEST_UNINITIALIZED;
1650     }
1651     if (aligned) {
1652       decorators |= ARRAYCOPY_ALIGNED;
1653     }
1654     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1655     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1656     {
1657       // Type(shift)       byte(0), short(1), int(2),   long(3)
1658       int loop_size[]   = { 192,     96,       48,      24};
1659       int threshold[]   = { 4096,    2048,     1024,    512};
1660 
1661       // UnsafeCopyMemory page error: continue after ucm
1662       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1663       // 'from', 'to' and 'count' are now valid
1664 
1665       // temp1 holds remaining count.
1666       __ movq(temp1, count);
1667 
1668       // Zero length check.
1669       __ BIND(L_tail);
1670       __ cmpq(temp1, 0);
1671       __ jcc(Assembler::lessEqual, L_exit);
1672 
1673       __ mov64(temp2, 0);
1674       __ movq(temp3, temp1);
1675       // Special cases using 32 byte [masked] vector copy operations.
1676       __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1677                                                temp4, use64byteVector, L_entry, L_exit);
1678 
1679       // PRE-MAIN-POST loop for aligned copy.
1680       __ BIND(L_entry);
1681 
1682       if (MaxVectorSize > 32 && AVX3Threshold != 0) {
1683         __ cmpq(temp1, threshold[shift]);
1684         __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1685       }
1686 
1687       if (MaxVectorSize < 64  || AVX3Threshold != 0) {
1688         // Partial copy to make the destination end address 32 byte aligned.
1689         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1690         __ andq(temp2, 31);
1691         __ jcc(Assembler::equal, L_main_pre_loop);
1692 
1693         if (shift) {
1694           __ shrq(temp2, shift);
1695         }
1696         __ subq(temp1, temp2);
1697         __ copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
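             // Because the copy runs from high to low addresses, temp2 is the element count
             // of this partial copy at the high end, e.g. for an int copy (shift == 2) with
             // the destination end address misaligned by 24 bytes it is 24 >> 2 == 6 elements.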
1698 
1699         __ cmpq(temp1, loop_size[shift]);
1700         __ jcc(Assembler::less, L_tail);
1701 
1702         __ BIND(L_main_pre_loop);
1703 
1704         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1705         __ align32();
1706         __ BIND(L_main_loop);
1707            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
1708            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
1709            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192);
1710            __ subptr(temp1, loop_size[shift]);
1711            __ cmpq(temp1, loop_size[shift]);
1712            __ jcc(Assembler::greater, L_main_loop);
1713 
1714         // Tail loop.
1715         __ jmp(L_tail);
1716       }
1717 
1718       if (MaxVectorSize > 32) {
1719         __ BIND(L_pre_main_post_64);
1720         // Partial copy to make the destination end address 64 byte aligned.
1721         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1722         __ andq(temp2, 63);
1723         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1724 
1725         if (shift) {
1726           __ shrq(temp2, shift);
1727         }
1728         __ subq(temp1, temp2);
1729         __ copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
1730 
1731         __ cmpq(temp1, loop_size[shift]);
1732         __ jcc(Assembler::less, L_tail64);
1733 
1734         __ BIND(L_main_pre_loop_64bytes);
1735 
1736         // Main loop with aligned copy block size of 192 bytes at
1737         // 64 byte copy granularity.
1738         __ align32();
1739         __ BIND(L_main_loop_64bytes);
1740            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
1741            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
1742            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
1743            __ subq(temp1, loop_size[shift]);
1744            __ cmpq(temp1, loop_size[shift]);
1745            __ jcc(Assembler::greater, L_main_loop_64bytes);
1746 
1747         // Zero length check.
1748         __ cmpq(temp1, 0);
1749         __ jcc(Assembler::lessEqual, L_exit);
1750 
1751         __ BIND(L_tail64);
1752 
1753         // Tail handling using 64 byte [masked] vector copy operations.
1754         use64byteVector = true;
1755         __ mov64(temp2, 0);
1756         __ movq(temp3, temp1);
1757         __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1758                                                  temp4, use64byteVector, L_entry, L_exit);
1759       }
1760       __ BIND(L_exit);
1761     }
1762     address ucme_exit_pc = __ pc();
1763     // When called from generic_arraycopy, r11 contains specific values
1764     // used during the arraycopy epilogue, so re-initialize r11 here.
1765     if (is_oop) {
1766       __ movq(r11, count);
1767     }
1768     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1769     restore_argument_regs(type);
1770     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1771     __ xorptr(rax, rax); // return 0
1772     __ vzeroupper();
1773     __ leave(); // required for proper stackwalking of RuntimeStub frame
1774     __ ret(0);
1775     return start;
1776   }
1777 #endif // COMPILER2_OR_JVMCI
1778 
1779 
1780   // Arguments:
1781   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1782   //             ignored
1783   //   name    - stub name string
1784   //
1785   // Inputs:
1786   //   c_rarg0   - source array address
1787   //   c_rarg1   - destination array address
1788   //   c_rarg2   - element count, treated as ssize_t, can be zero
1789   //
1790   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1791   // we let the hardware handle it.  The one to eight bytes within words,
1792   // dwords or qwords that span cache line boundaries will still be loaded
1793   // and stored atomically.
1794   //
1795   // Side Effects:
1796   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1797   //   used by generate_conjoint_byte_copy().
1798   //
1799   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1800 #if COMPILER2_OR_JVMCI
1801     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1802        return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
1803                                                  aligned, false, false);
1804     }
1805 #endif
1806     __ align(CodeEntryAlignment);
1807     StubCodeMark mark(this, "StubRoutines", name);
1808     address start = __ pc();
1809 
1810     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1811     Label L_copy_byte, L_exit;
1812     const Register from        = rdi;  // source array address
1813     const Register to          = rsi;  // destination array address
1814     const Register count       = rdx;  // elements count
1815     const Register byte_count  = rcx;
1816     const Register qword_count = count;
1817     const Register end_from    = from; // source array end address
1818     const Register end_to      = to;   // destination array end address
1819     // End pointers are inclusive, and if count is not zero they point
1820     // to the last unit copied:  end_to[0] := end_from[0]
1821 
1822     __ enter(); // required for proper stackwalking of RuntimeStub frame
1823     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1824 
1825     if (entry != NULL) {
1826       *entry = __ pc();
1827        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1828       BLOCK_COMMENT("Entry:");
1829     }
1830 
1831     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1832                       // r9 and r10 may be used to save non-volatile registers
1833 
1834     {
1835       // UnsafeCopyMemory page error: continue after ucm
1836       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1837       // 'from', 'to' and 'count' are now valid
1838       __ movptr(byte_count, count);
1839       __ shrptr(count, 3); // count => qword_count
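           // byte_count keeps the original length; after the qword main copy, bits 2, 1
           // and 0 of byte_count select the trailing dword, word and byte copies below
           // (e.g. byte_count == 13 copies one qword, then a trailing dword and byte).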
1840 
1841       // Copy from low to high addresses.  Use 'to' as scratch.
1842       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1843       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1844       __ negptr(qword_count); // make the count negative
1845       __ jmp(L_copy_bytes);
1846 
1847       // Copy trailing qwords
1848     __ BIND(L_copy_8_bytes);
1849       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1850       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1851       __ increment(qword_count);
1852       __ jcc(Assembler::notZero, L_copy_8_bytes);
1853 
1854       // Check for and copy trailing dword
1855     __ BIND(L_copy_4_bytes);
1856       __ testl(byte_count, 4);
1857       __ jccb(Assembler::zero, L_copy_2_bytes);
1858       __ movl(rax, Address(end_from, 8));
1859       __ movl(Address(end_to, 8), rax);
1860 
1861       __ addptr(end_from, 4);
1862       __ addptr(end_to, 4);
1863 
1864       // Check for and copy trailing word
1865     __ BIND(L_copy_2_bytes);
1866       __ testl(byte_count, 2);
1867       __ jccb(Assembler::zero, L_copy_byte);
1868       __ movw(rax, Address(end_from, 8));
1869       __ movw(Address(end_to, 8), rax);
1870 
1871       __ addptr(end_from, 2);
1872       __ addptr(end_to, 2);
1873 
1874       // Check for and copy trailing byte
1875     __ BIND(L_copy_byte);
1876       __ testl(byte_count, 1);
1877       __ jccb(Assembler::zero, L_exit);
1878       __ movb(rax, Address(end_from, 8));
1879       __ movb(Address(end_to, 8), rax);
1880     }
1881   __ BIND(L_exit);
1882     address ucme_exit_pc = __ pc();
1883     restore_arg_regs();
1884     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1885     __ xorptr(rax, rax); // return 0
1886     __ vzeroupper();
1887     __ leave(); // required for proper stackwalking of RuntimeStub frame
1888     __ ret(0);
1889 
1890     {
1891       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
1892       // Copy in multi-bytes chunks
1893       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1894       __ jmp(L_copy_4_bytes);
1895     }
1896     return start;
1897   }
1898 
1899   // Fast memory copying for continuations
1900   // See:
1901   // - Intel 64 and IA-32 Architectures Optimization Reference Manual: (https://software.intel.com/sites/default/files/managed/9e/bc/64-ia-32-architectures-optimization-manual.pdf)
1902   //   - 2.7.6 REP String Enhancement
1903   //   - 3.7.5 REP Prefix and Data Movement
1904   //   - 3.7.6 Enhanced REP MOVSB and STOSB Operation
1905   //   - 8.1 GENERAL PREFETCH CODING GUIDELINES
1906   //   - 8.4.1.2 Streaming Non-temporal Stores, 8.4.1.3 Memory Type and Non-temporal Stores
1907   //   - 8.5 MEMORY OPTIMIZATION USING PREFETCH, 8.5.6 Software Prefetch Scheduling Distance, 8.5.7 Software Prefetch Concatenation
1908   //   - 14.3, MIXING AVX CODE WITH SSE CODE + https://software.intel.com/en-us/articles/intel-avx-state-transitions-migrating-sse-code-to-avx
1909   // - Optimizing subroutines in assembly language, 17.9 Moving blocks of data https://www.agner.org/optimize/optimizing_assembly.pdf
1910   // - StackOverflow
1911   //   - https://stackoverflow.com/q/26246040/750563 What's missing/sub-optimal in this memcpy implementation?
1912   //   - https://stackoverflow.com/q/43343231/750563 Enhanced REP MOVSB for memcpy
1913   //   - https://stackoverflow.com/q/33902068/750563 What setup does REP do?
1914   //   - https://stackoverflow.com/q/8858778/750563  Why are complicated memcpy/memset superior?
1915   //   - https://stackoverflow.com/q/1715224/750563  Very fast memcpy for image processing?
1916   //   - https://stackoverflow.com/q/17312823/750563 When program will benefit from prefetch & non-temporal load/store?
1917   //   - https://stackoverflow.com/q/40096894/750563 Do current x86 architectures support non-temporal loads (from “normal” memory)?
1918   //   - https://stackoverflow.com/q/32103968/750563 Non-temporal loads and the hardware prefetcher, do they work together?
1919   // - https://docs.roguewave.com/threadspotter/2011.2/manual_html_linux/manual_html/ch05s03.html Non-Temporal Data
1920   // - https://blogs.fau.de/hager/archives/2103 A case for the non-temporal store
1921   // - https://vgatherps.github.io/2018-09-02-nontemporal/ Optimizing Cache Usage With Nontemporal Accesses
1922   // - https://www.reddit.com/r/cpp/comments/9ccb88/optimizing_cache_usage_with_nontemporal_accesses/
1923   // - https://lwn.net/Articles/255364/ Memory part 5: What programmers can do
1924   // - https://software.intel.com/en-us/forums/intel-isa-extensions/topic/597075 Do Non-Temporal Loads Prefetch?
1925   // - https://software.intel.com/en-us/forums/intel-fortran-compiler/topic/275765#comment-1551057 Time to revisit REP;MOVS
1926   // - https://lemire.me/blog/2018/09/07/avx-512-when-and-how-to-use-these-new-instructions/ AVX-512: when and how to use these new instructions (explains AVX3Threshold)
1927   // - https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html Gathering Intel on Intel AVX-512 Transitions
1928 
1929 
1930   // Used by continuations to copy from stack
1931   // Arguments:
1932   //   name - stub name string
1933   //   nt   -  use non-temporal stores
1934   //
1935   // Inputs:
1936   //   c_rarg0   - source array address       -- 16-byte aligned
1937   //   c_rarg1   - destination array address  --  8-byte aligned
1938   //   c_rarg2   - element count, in qwords (8 bytes), >= 2
1939   //
1940   address generate_disjoint_word_copy_up(bool nt, const char *name) {
1941     const bool align = nt;
1942 
1943     __ align(CodeEntryAlignment);
1944     StubCodeMark mark(this, "StubRoutines", name);
1945     address start = __ pc();
1946 
1947     Label L_copy_bytes, L_copy_8_bytes, L_loop, L_end, L_exit;
1948     const Register from        = rdi;  // source array address
1949     const Register to          = rsi;  // destination array address
1950     const Register count       = rdx;  // elements count
1951     const Register qword_count = count;
1952     const Register end_from    = from; // source array end address
1953     const Register end_to      = to;   // destination array end address
1954     const Register alignment   = rcx;
1955 
1956     // End pointers are inclusive, and if count is not zero they point
1957     // to the last unit copied:  end_to[0] := end_from[0]
1958 
1959     __ enter(); // required for proper stackwalking of RuntimeStub frame
1960     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1961 
1962     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1963                       // r9 and r10 may be used to save non-volatile registers
1964 
1965     // Copy from low to high addresses.
1966     // By pointing to the end and negating qword_count we:
1967     // 1. only update count, not from/to; 2. don't need another register to hold total count; 3. can jcc right after addptr without cmpptr
1968 
1969     // __ movptr(alignment, to);
1970     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1971     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1972     __ negptr(qword_count); // make the count negative
1973     // Address(end_from/to, qword_count, Address::times_8) now points 8 bytes *below* the original from/to,
1974     // i.e. orig to == Address(end_to, qword_count, Address::times_8, 8)
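         // For example, with qword_count == 4: end_from == from + 24 and qword_count == -4,
         // so Address(end_from, qword_count, Address::times_8, 8) == from + 24 - 32 + 8 == from.
         // The copy loops below advance by stepping qword_count back up towards zero.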
1975 
1976     // Copy in multi-bytes chunks
1977 
1978     if (UseUnalignedLoadStores) {
1979       if (align) { // align target
1980         NearLabel L_aligned_128, L_aligned_256, L_aligned_512;
1981 
1982         __ lea(alignment, Address(end_to, qword_count, Address::times_8, 8)); // == original to
1983         __ negptr(alignment); // we align by copying from the beginning of to, making it effectively larger
1984 
1985         __ testl(alignment, 8);
1986         __ jccb(Assembler::zero, L_aligned_128);
1987         __ increment(qword_count);
1988         // no need to test because we know qword_count >= 2
1989         __ movq(rax, Address(end_from, qword_count, Address::times_8, -0));
1990         __ movqa(Address(end_to, qword_count, Address::times_8, -0), rax, nt);
1991         __ bind(L_aligned_128);
1992 
1993         if (UseAVX >= 2) {
1994           __ testl(alignment, 16);
1995           __ jccb(Assembler::zero, L_aligned_256);
1996           __ cmpptr(qword_count, -2);
1997           if (UseAVX > 2) {
1998             __ jcc(Assembler::greater, L_copy_8_bytes);
1999           } else {
2000             __ jccb(Assembler::greater, L_copy_8_bytes);
2001           }
2002           __ addptr(qword_count, 2);
2003           __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -8));
2004           __ movdqa(Address(end_to, qword_count, Address::times_8, -8), xmm0, nt);
2005           __ bind(L_aligned_256);
2006           // we can move from SSE to AVX without penalty, but not the other way around
2007         }
2008 
2009         if (UseAVX > 2) {
2010           __ testl(alignment, 32);
2011           __ jccb(Assembler::zero, L_aligned_512);
2012           __ addptr(qword_count, 4);
2013           __ jccb(Assembler::less, L_end);
2014           __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
2015           __ vmovdqa(Address(end_to, qword_count, Address::times_8, -24), xmm0, nt);
2016           __ bind(L_aligned_512);
2017         }
2018       }
2019 
2020       // Copy 64-bytes per iteration
2021       if (UseAVX > 2) {
2022         Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
2023 
2024         __ BIND(L_copy_bytes);
2025         __ cmpptr(qword_count, (-1 * AVX3Threshold / 8));
2026         __ jccb(Assembler::less, L_above_threshold);
2027         __ jmpb(L_below_threshold);
2028 
2029         __ align(OptoLoopAlignment);
2030         __ bind(L_loop_avx512);
2031         __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
2032         __ evmovdqa(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit, nt);
2033         __ bind(L_above_threshold);
2034         __ addptr(qword_count, 8);
2035         __ jcc(Assembler::lessEqual, L_loop_avx512);
2036         __ jmpb(L_32_byte_head);
2037 
2038         __ bind(L_loop_avx2);
2039         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
2040         __ vmovdqa(Address(end_to, qword_count, Address::times_8, -56), xmm0, nt);
2041         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
2042         __ vmovdqa(Address(end_to, qword_count, Address::times_8, -24), xmm1, nt);
2043         __ bind(L_below_threshold);
2044         __ addptr(qword_count, 8);
2045         __ jcc(Assembler::lessEqual, L_loop_avx2);
2046 
2047         __ bind(L_32_byte_head);
2048         __ subptr(qword_count, 4);  // sub(8) and add(4)
2049         __ jccb(Assembler::greater, L_end);
2050       } else {
2051         __ jmp(L_copy_bytes);
2052         __ align(OptoLoopAlignment);
2053         __ BIND(L_loop);
2054         if (UseAVX == 2) {
2055           __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
2056           __ vmovdqa(Address(end_to, qword_count, Address::times_8, -56), xmm0, nt);
2057           __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
2058           __ vmovdqa(Address(end_to, qword_count, Address::times_8, -24), xmm1, nt);
2059         } else {
2060           __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
2061           __ movdqa(Address(end_to, qword_count, Address::times_8, -56), xmm0, nt);
2062           __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
2063           __ movdqa(Address(end_to, qword_count, Address::times_8, -40), xmm1, nt);
2064           __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
2065           __ movdqa(Address(end_to, qword_count, Address::times_8, -24), xmm2, nt);
2066           __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
2067           __ movdqa(Address(end_to, qword_count, Address::times_8, - 8), xmm3, nt);
2068         }
2069 
2070         __ BIND(L_copy_bytes);
2071         __ addptr(qword_count, 8);
2072         __ jcc(Assembler::lessEqual, L_loop);
2073         __ subptr(qword_count, 4);  // sub(8) and add(4); we added the extra 8 at the end of the loop; we'll subtract the extra 4 right before "copy trailing qwords"
2074         __ jccb(Assembler::greater, L_end);
2075       }
2076       // Copy trailing 32 bytes
2077       if (UseAVX >= 2) {
2078         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
2079         __ vmovdqa(Address(end_to, qword_count, Address::times_8, -24), xmm0, nt);
2080       } else {
2081         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
2082         __ movdqa(Address(end_to, qword_count, Address::times_8, -24), xmm0, nt);
2083         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
2084         __ movdqa(Address(end_to, qword_count, Address::times_8, - 8), xmm1, nt);
2085       }
2086       __ addptr(qword_count, 4);
2087     } else {
2088       // Copy 32-bytes per iteration
2089       __ jmp(L_copy_bytes);
2090       __ align(OptoLoopAlignment);
2091       __ BIND(L_loop);
2092       __ movq(rax, Address(end_from, qword_count, Address::times_8, -24));
2093       __ movqa(Address(end_to, qword_count, Address::times_8, -24), rax, nt);
2094       __ movq(rax, Address(end_from, qword_count, Address::times_8, -16));
2095       __ movqa(Address(end_to, qword_count, Address::times_8, -16), rax, nt);
2096       __ movq(rax, Address(end_from, qword_count, Address::times_8, - 8));
2097       __ movqa(Address(end_to, qword_count, Address::times_8, - 8), rax, nt);
2098       __ movq(rax, Address(end_from, qword_count, Address::times_8, - 0));
2099       __ movqa(Address(end_to, qword_count, Address::times_8, - 0), rax, nt);
2100 
2101       __ BIND(L_copy_bytes);
2102       __ addptr(qword_count, 4);
2103       __ jcc(Assembler::lessEqual, L_loop);
2104     }
2105     __ BIND(L_end);
2106     __ subptr(qword_count, 4);
2107     __ jccb(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
2108 
2109     __ BIND(L_exit);
2110     restore_arg_regs();
2111     __ xorptr(rax, rax); // return 0
2112     __ vzeroupper();
2113     __ leave(); // required for proper stackwalking of RuntimeStub frame
2114     __ ret(0);
2115 
2116     // Copy trailing qwords
2117     __ BIND(L_copy_8_bytes);
2118     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2119     __ movqa(Address(end_to, qword_count, Address::times_8, 8), rax, nt);
2120     __ increment(qword_count);
2121     __ jcc(Assembler::notZero, L_copy_8_bytes);
2122     __ jmp(L_exit);
2123 
2124     return start;
2125   }
2126 
2127   // Used by continuations to copy to stack
2128   // Arguments:
2129   //   name    - stub name string
2130   //   nt_mode - 0 - none, 1 - use non-temporal prefetches, 2 - use non-temporal loads
2131   //
2132   // Inputs:
2133   //   c_rarg0   - source array address      --  8-byte aligned
2134   //   c_rarg1   - destination array address -- 16-byte aligned
2135   //   c_rarg2   - element count, in qwords (8 bytes), >= 2
2136   //
2137   address generate_disjoint_word_copy_down(int nt_mode, const char *name) {
2138     const bool prefetchnt = (nt_mode == 1);
2139     const bool nt         = (nt_mode == 2);
2140     const bool align      = nt;
2141 
2142     __ align(CodeEntryAlignment);
2143     StubCodeMark mark(this, "StubRoutines", name);
2144     address start = __ pc();
2145 
2146     Label L_copy_bytes, L_copy_8_bytes, L_loop, L_end, L_exit;
2147     const Register from        = rdi;  // source array address
2148     const Register to          = rsi;  // destination array address
2149     const Register count       = rdx;  // elements count
2150     const Register qword_count = count;
2151     const Register alignment   = rcx; // rbx causes trouble
2152 
2153     __ enter(); // required for proper stackwalking of RuntimeStub frame
2154     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2155 
2156     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2157                       // r9 and r10 may be used to save non-volatile registers
2158 
2159     // Copy from high to low addresses.
2160 
2161     // Copy in multi-bytes chunks
2162 
2163     if (UseUnalignedLoadStores) {
2164       if (align) { // align source (only useful for nt)
2165         NearLabel L_aligned_128, L_aligned_256, L_aligned_512;
2166 
2167         __ lea(alignment, Address(from, qword_count, Address::times_8, 0)); // address just past the source (from + count*8)
2168 
2169         __ testl(alignment, 8);
2170         __ jccb(Assembler::zero, L_aligned_128);
2171         __ decrement(qword_count);
2172         // no need to test because we know qword_count >= 2
2173         __ movdqa(xmm0, Address(from, qword_count, Address::times_8, 0), nt); // no 8-byte nt load
2174         __ psrldq(xmm0, 8); // movlhps(xmm0, xmm0);
2175         __ movdq(rax, xmm0);
2176         // __ movq(rax, Address(from, qword_count, Address::times_8, 0));
2177         __ movq(Address(to, qword_count, Address::times_8, 0), rax);
2178         __ bind(L_aligned_128);
2179 
2180         if (UseAVX >= 2) {
2181           __ testl(alignment, 16);
2182           __ jccb(Assembler::zero, L_aligned_256);
2183           __ cmpptr(qword_count, 2);
2184           if (UseAVX > 2) {
2185             __ jcc(Assembler::less, L_copy_8_bytes);
2186           } else {
2187             __ jccb(Assembler::less, L_copy_8_bytes);
2188           }
2189           __ subptr(qword_count, 2);
2190           __ movdqa(xmm0, Address(from, qword_count, Address::times_8, 0), nt);
2191           __ movdqu(Address(to, qword_count, Address::times_8, 0), xmm0);
2192           __ bind(L_aligned_256);
2193           // we can move from SSE to AVX without penalty, but not the other way around
2194         }
2195 
2196         if (UseAVX > 2) {
2197           __ testl(alignment, 32);
2198           __ jccb(Assembler::zero, L_aligned_512);
2199           __ subptr(qword_count, 4);
2200           __ jccb(Assembler::less, L_end);
2201           __ vmovdqa(xmm0, Address(from, qword_count, Address::times_8, 0), nt);
2202           __ vmovdqu(Address(to, qword_count, Address::times_8, 0), xmm0);
2203           __ bind(L_aligned_512);
2204         }
2205       }
2206 
2207       // Copy 64-bytes per iteration
2208       const int prefetch_distance = 2 * 64; // prefetch 2 iterations (2 * 64 bytes) ahead
2209       if (UseAVX > 2) {
2210         Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold;
2211 
2212         __ BIND(L_copy_bytes);
2213         __ cmpptr(qword_count, (AVX3Threshold / 8));
2214         __ jccb(Assembler::greater, L_above_threshold);
2215         __ jmpb(L_below_threshold);
2216 
2217         __ align(OptoLoopAlignment);
2218         __ BIND(L_loop_avx512);
2219         if (prefetchnt) {
2220           __ prefetchnta(Address(from, qword_count, Address::times_8, -prefetch_distance));
2221         }
2222         __ evmovdqa(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit, nt);
2223         __ evmovdqul(Address(to, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
2224         __ bind(L_above_threshold);
2225         __ subptr(qword_count, 8);
2226         __ jcc(Assembler::greaterEqual, L_loop_avx512);
2227         __ jmpb(L_32_byte_head);
2228 
2229         __ bind(L_loop_avx2);
2230         if (prefetchnt) {
2231           __ prefetchnta(Address(from, qword_count, Address::times_8, -prefetch_distance));
2232         }
2233         __ vmovdqa(xmm0, Address(from, qword_count, Address::times_8, 32), nt);
2234         __ vmovdqu(Address(to, qword_count, Address::times_8, 32), xmm0);
2235         __ vmovdqa(xmm1, Address(from, qword_count, Address::times_8, 0), nt);
2236         __ vmovdqu(Address(to, qword_count, Address::times_8, 0), xmm1);
2237         __ bind(L_below_threshold);
2238         __ subptr(qword_count, 8);
2239         __ jcc(Assembler::greaterEqual, L_loop_avx2);
2240 
2241         __ bind(L_32_byte_head);
2242         __ addptr(qword_count, 4);  // add(8) and sub(4)
2243         __ jccb(Assembler::less, L_end);
2244       } else {
2245         __ jmp(L_copy_bytes);
2246         __ align(OptoLoopAlignment);
2247         __ BIND(L_loop);
2248         if (prefetchnt) {
2249           __ prefetchnta(Address(from, qword_count, Address::times_8, -prefetch_distance));
2250         }
2251         if (UseAVX == 2) {
2252           __ vmovdqa(xmm0, Address(from, qword_count, Address::times_8, 32), nt);
2253           __ vmovdqu(Address(to, qword_count, Address::times_8, 32), xmm0);
2254           __ vmovdqa(xmm1, Address(from, qword_count, Address::times_8,  0), nt);
2255           __ vmovdqu(Address(to, qword_count, Address::times_8,  0), xmm1);
2256         } else {
2257           __ movdqa(xmm0, Address(from, qword_count, Address::times_8, 48), nt);
2258           __ movdqu(Address(to, qword_count, Address::times_8, 48), xmm0);
2259           __ movdqa(xmm1, Address(from, qword_count, Address::times_8, 32), nt);
2260           __ movdqu(Address(to, qword_count, Address::times_8, 32), xmm1);
2261           __ movdqa(xmm2, Address(from, qword_count, Address::times_8, 16), nt);
2262           __ movdqu(Address(to, qword_count, Address::times_8, 16), xmm2);
2263           __ movdqa(xmm3, Address(from, qword_count, Address::times_8,  0), nt);
2264           __ movdqu(Address(to, qword_count, Address::times_8,  0), xmm3);
2265         }
2266 
2267         __ BIND(L_copy_bytes);
2268         __ subptr(qword_count, 8);
2269         __ jcc(Assembler::greaterEqual, L_loop);
2270 
2271         __ addptr(qword_count, 4);  // add(8) and sub(4)
2272         __ jccb(Assembler::less, L_end);
2273       }
2274       // Copy trailing 32 bytes
2275       if (UseAVX >= 2) {
2276         __ vmovdqa(xmm0, Address(from, qword_count, Address::times_8, 0), nt);
2277         __ vmovdqu(Address(to, qword_count, Address::times_8, 0), xmm0);
2278       } else {
2279         __ movdqa(xmm0, Address(from, qword_count, Address::times_8, 16), nt);
2280         __ movdqu(Address(to, qword_count, Address::times_8, 16), xmm0);
2281         __ movdqa(xmm1, Address(from, qword_count, Address::times_8,  0), nt);
2282         __ movdqu(Address(to, qword_count, Address::times_8,  0), xmm1);
2283       }
2284       __ subptr(qword_count, 4);
2285     } else {
2286       // Copy 32-bytes per iteration
2287       const int prefetch_distance = 4 * 32; // prefetch 4 iterations (4 * 32 bytes) ahead
2288       __ jmp(L_copy_bytes);
2289       __ align(OptoLoopAlignment);
2290       __ BIND(L_loop);
2291       if (prefetchnt) {
2292         __ prefetchnta(Address(from, qword_count, Address::times_8, -prefetch_distance));
2293       }
2294       __ movq(rax, Address(from, qword_count, Address::times_8, 24));
2295       __ movq(Address(to, qword_count, Address::times_8, 24), rax);
2296       __ movq(rax, Address(from, qword_count, Address::times_8, 16));
2297       __ movq(Address(to, qword_count, Address::times_8, 16), rax);
2298       __ movq(rax, Address(from, qword_count, Address::times_8,  8));
2299       __ movq(Address(to, qword_count, Address::times_8,  8), rax);
2300       __ movq(rax, Address(from, qword_count, Address::times_8,  0));
2301       __ movq(Address(to, qword_count, Address::times_8,  0), rax);
2302 
2303       __ BIND(L_copy_bytes);
2304       __ subptr(qword_count, 4);
2305       __ jcc(Assembler::greaterEqual, L_loop);
2306     }
2307     __ BIND(L_end);
2308     __ addptr(qword_count, 4);
2309     __ jccb(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
2310 
2311     __ BIND(L_exit);
2312     restore_arg_regs();
2313     __ xorptr(rax, rax); // return 0
2314     __ vzeroupper();
2315     __ leave(); // required for proper stackwalking of RuntimeStub frame
2316     __ ret(0);
2317 
2318     // Copy trailing qwords
2319     __ BIND(L_copy_8_bytes);
2320     if (nt) {
2321       __ prefetchnta(Address(from, qword_count, Address::times_8, -8));
2322     }
2323     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2324     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2325     __ decrement(qword_count);
2326     __ jcc(Assembler::notZero, L_copy_8_bytes);
2327     __ jmp(L_exit);
2328 
2329     return start;
2330   }
2331 
2332   // Arguments:
2333   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2334   //             ignored
2335   //   name    - stub name string
2336   //
2337   // Inputs:
2338   //   c_rarg0   - source array address
2339   //   c_rarg1   - destination array address
2340   //   c_rarg2   - element count, treated as ssize_t, can be zero
2341   //
2342   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
2343   // we let the hardware handle it.  The one to eight bytes within words,
2344   // dwords or qwords that span cache line boundaries will still be loaded
2345   // and stored atomically.
2346   //
2347   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
2348                                       address* entry, const char *name) {
2349 #if COMPILER2_OR_JVMCI
2350     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2351        return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
2352                                                  nooverlap_target, aligned, false, false);
2353     }
2354 #endif
2355     __ align(CodeEntryAlignment);
2356     StubCodeMark mark(this, "StubRoutines", name);
2357     address start = __ pc();
2358 
2359     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
2360     const Register from        = rdi;  // source array address
2361     const Register to          = rsi;  // destination array address
2362     const Register count       = rdx;  // elements count
2363     const Register byte_count  = rcx;
2364     const Register qword_count = count;
2365 
2366     __ enter(); // required for proper stackwalking of RuntimeStub frame
2367     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2368 
2369     if (entry != NULL) {
2370       *entry = __ pc();
2371       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2372       BLOCK_COMMENT("Entry:");
2373     }
2374 
2375     array_overlap_test(nooverlap_target, Address::times_1);
2376     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2377                       // r9 and r10 may be used to save non-volatile registers
2378 
2379     {
2380       // UnsafeCopyMemory page error: continue after ucm
2381       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2382       // 'from', 'to' and 'count' are now valid
2383       __ movptr(byte_count, count);
2384       __ shrptr(count, 3);   // count => qword_count
2385 
2386       // Copy from high to low addresses.
2387 
2388       // Check for and copy trailing byte
2389       __ testl(byte_count, 1);
2390       __ jcc(Assembler::zero, L_copy_2_bytes);
2391       __ movb(rax, Address(from, byte_count, Address::times_1, -1));
2392       __ movb(Address(to, byte_count, Address::times_1, -1), rax);
2393       __ decrement(byte_count); // Adjust for possible trailing word
2394 
2395       // Check for and copy trailing word
2396     __ BIND(L_copy_2_bytes);
2397       __ testl(byte_count, 2);
2398       __ jcc(Assembler::zero, L_copy_4_bytes);
2399       __ movw(rax, Address(from, byte_count, Address::times_1, -2));
2400       __ movw(Address(to, byte_count, Address::times_1, -2), rax);
2401 
2402       // Check for and copy trailing dword
2403     __ BIND(L_copy_4_bytes);
2404       __ testl(byte_count, 4);
2405       __ jcc(Assembler::zero, L_copy_bytes);
2406       __ movl(rax, Address(from, qword_count, Address::times_8));
2407       __ movl(Address(to, qword_count, Address::times_8), rax);
2408       __ jmp(L_copy_bytes);
2409 
2410       // Copy trailing qwords
2411     __ BIND(L_copy_8_bytes);
2412       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2413       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2414       __ decrement(qword_count);
2415       __ jcc(Assembler::notZero, L_copy_8_bytes);
2416     }
2417     restore_arg_regs();
2418     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
2419     __ xorptr(rax, rax); // return 0
2420     __ vzeroupper();
2421     __ leave(); // required for proper stackwalking of RuntimeStub frame
2422     __ ret(0);
2423 
2424     {
2425       // UnsafeCopyMemory page error: continue after ucm
2426       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2427       // Copy in multi-bytes chunks
2428       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2429     }
2430     restore_arg_regs();
2431     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
2432     __ xorptr(rax, rax); // return 0
2433     __ vzeroupper();
2434     __ leave(); // required for proper stackwalking of RuntimeStub frame
2435     __ ret(0);
2436 
2437     return start;
2438   }
2439 
2440   // Arguments:
2441   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2442   //             ignored
2443   //   name    - stub name string
2444   //
2445   // Inputs:
2446   //   c_rarg0   - source array address
2447   //   c_rarg1   - destination array address
2448   //   c_rarg2   - element count, treated as ssize_t, can be zero
2449   //
2450   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2451   // let the hardware handle it.  The two or four words within dwords
2452   // or qwords that span cache line boundaries will still be loaded
2453   // and stored atomically.
2454   //
2455   // Side Effects:
2456   //   disjoint_short_copy_entry is set to the no-overlap entry point
2457   //   used by generate_conjoint_short_copy().
2458   //
2459   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
2460 #if COMPILER2_OR_JVMCI
2461     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2462        return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
2463                                                  aligned, false, false);
2464     }
2465 #endif
2466 
2467     __ align(CodeEntryAlignment);
2468     StubCodeMark mark(this, "StubRoutines", name);
2469     address start = __ pc();
2470 
2471     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
2472     const Register from        = rdi;  // source array address
2473     const Register to          = rsi;  // destination array address
2474     const Register count       = rdx;  // elements count
2475     const Register word_count  = rcx;
2476     const Register qword_count = count;
2477     const Register end_from    = from; // source array end address
2478     const Register end_to      = to;   // destination array end address
2479     // End pointers are inclusive, and if count is not zero they point
2480     // to the last unit copied:  end_to[0] := end_from[0]
2481 
2482     __ enter(); // required for proper stackwalking of RuntimeStub frame
2483     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2484 
2485     if (entry != NULL) {
2486       *entry = __ pc();
2487       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2488       BLOCK_COMMENT("Entry:");
2489     }
2490 
2491     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2492                       // r9 and r10 may be used to save non-volatile registers
2493 
2494     {
2495       // UnsafeCopyMemory page error: continue after ucm
2496       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2497       // 'from', 'to' and 'count' are now valid
2498       __ movptr(word_count, count);
2499       __ shrptr(count, 2); // count => qword_count
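           // word_count keeps the original length in shorts; after the qword main copy,
           // bits 1 and 0 of word_count select the trailing dword and word copies below
           // (e.g. word_count == 7 copies one qword, then a trailing dword and word).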
2500 
2501       // Copy from low to high addresses.  Use 'to' as scratch.
2502       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2503       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2504       __ negptr(qword_count);
2505       __ jmp(L_copy_bytes);
2506 
2507       // Copy trailing qwords
2508     __ BIND(L_copy_8_bytes);
2509       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2510       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2511       __ increment(qword_count);
2512       __ jcc(Assembler::notZero, L_copy_8_bytes);
2513 
2514       // Original 'dest' is trashed, so we can't use it as a
2515       // base register for a possible trailing word copy
2516 
2517       // Check for and copy trailing dword
2518     __ BIND(L_copy_4_bytes);
2519       __ testl(word_count, 2);
2520       __ jccb(Assembler::zero, L_copy_2_bytes);
2521       __ movl(rax, Address(end_from, 8));
2522       __ movl(Address(end_to, 8), rax);
2523 
2524       __ addptr(end_from, 4);
2525       __ addptr(end_to, 4);
2526 
2527       // Check for and copy trailing word
2528     __ BIND(L_copy_2_bytes);
2529       __ testl(word_count, 1);
2530       __ jccb(Assembler::zero, L_exit);
2531       __ movw(rax, Address(end_from, 8));
2532       __ movw(Address(end_to, 8), rax);
2533     }
2534   __ BIND(L_exit);
2535     address ucme_exit_pc = __ pc();
2536     restore_arg_regs();
2537     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2538     __ xorptr(rax, rax); // return 0
2539     __ vzeroupper();
2540     __ leave(); // required for proper stackwalking of RuntimeStub frame
2541     __ ret(0);
2542 
2543     {
2544       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
2545       // Copy in multi-bytes chunks
2546       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2547       __ jmp(L_copy_4_bytes);
2548     }
2549 
2550     return start;
2551   }
2552 
2553   address generate_fill(BasicType t, bool aligned, const char *name) {
2554     __ align(CodeEntryAlignment);
2555     StubCodeMark mark(this, "StubRoutines", name);
2556     address start = __ pc();
2557 
2558     BLOCK_COMMENT("Entry:");
2559 
2560     const Register to       = c_rarg0;  // destination array address
2561     const Register value    = c_rarg1;  // value
2562     const Register count    = c_rarg2;  // elements count
2563 
2564     __ enter(); // required for proper stackwalking of RuntimeStub frame
2565 
2566     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
2567 
2568     __ vzeroupper();
2569     __ leave(); // required for proper stackwalking of RuntimeStub frame
2570     __ ret(0);
2571     return start;
2572   }
2573 
2574   // Arguments:
2575   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2576   //             ignored
2577   //   name    - stub name string
2578   //
2579   // Inputs:
2580   //   c_rarg0   - source array address
2581   //   c_rarg1   - destination array address
2582   //   c_rarg2   - element count, treated as ssize_t, can be zero
2583   //
2584   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2585   // let the hardware handle it.  The two or four words within dwords
2586   // or qwords that span cache line boundaries will still be loaded
2587   // and stored atomically.
2588   //
2589   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
2590                                        address *entry, const char *name) {
2591 #if COMPILER2_OR_JVMCI
2592     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2593        return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
2594                                                  nooverlap_target, aligned, false, false);
2595     }
2596 #endif
2597     __ align(CodeEntryAlignment);
2598     StubCodeMark mark(this, "StubRoutines", name);
2599     address start = __ pc();
2600 
2601     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2602     const Register from        = rdi;  // source array address
2603     const Register to          = rsi;  // destination array address
2604     const Register count       = rdx;  // elements count
2605     const Register word_count  = rcx;
2606     const Register qword_count = count;
2607 
2608     __ enter(); // required for proper stackwalking of RuntimeStub frame
2609     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2610 
2611     if (entry != NULL) {
2612       *entry = __ pc();
2613       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2614       BLOCK_COMMENT("Entry:");
2615     }
2616 
2617     array_overlap_test(nooverlap_target, Address::times_2);
2618     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2619                       // r9 and r10 may be used to save non-volatile registers
2620 
2621     {
2622       // UnsafeCopyMemory page error: continue after ucm
2623       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2624       // 'from', 'to' and 'count' are now valid
2625       __ movptr(word_count, count);
2626       __ shrptr(count, 2); // count => qword_count
2627 
2628       // Copy from high to low addresses.  Use 'to' as scratch.
2629 
2630       // Check for and copy trailing word
2631       __ testl(word_count, 1);
2632       __ jccb(Assembler::zero, L_copy_4_bytes);
2633       __ movw(rax, Address(from, word_count, Address::times_2, -2));
2634       __ movw(Address(to, word_count, Address::times_2, -2), rax);
2635 
2636       // Check for and copy trailing dword
2637     __ BIND(L_copy_4_bytes);
2638       __ testl(word_count, 2);
2639       __ jcc(Assembler::zero, L_copy_bytes);
2640       __ movl(rax, Address(from, qword_count, Address::times_8));
2641       __ movl(Address(to, qword_count, Address::times_8), rax);
2642       __ jmp(L_copy_bytes);
2643 
2644       // Copy trailing qwords
2645     __ BIND(L_copy_8_bytes);
2646       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2647       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2648       __ decrement(qword_count);
2649       __ jcc(Assembler::notZero, L_copy_8_bytes);
2650     }
2651     restore_arg_regs();
2652     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2653     __ xorptr(rax, rax); // return 0
2654     __ vzeroupper();
2655     __ leave(); // required for proper stackwalking of RuntimeStub frame
2656     __ ret(0);
2657 
2658     {
2659       // UnsafeCopyMemory page error: continue after ucm
2660       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2661       // Copy in multi-bytes chunks
2662       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2663     }
2664     restore_arg_regs();
2665     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2666     __ xorptr(rax, rax); // return 0
2667     __ vzeroupper();
2668     __ leave(); // required for proper stackwalking of RuntimeStub frame
2669     __ ret(0);
2670 
2671     return start;
2672   }
2673 
2674   // Arguments:
2675   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2676   //             ignored
2677   //   is_oop  - true => oop array, so generate store check code
2678   //   name    - stub name string
2679   //
2680   // Inputs:
2681   //   c_rarg0   - source array address
2682   //   c_rarg1   - destination array address
2683   //   c_rarg2   - element count, treated as ssize_t, can be zero
2684   //
2685   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2686   // the hardware handle it.  The two dwords within qwords that span
2687   // cache line boundaries will still be loaded and stored atomically.
2688   //
2689   // Side Effects:
2690   //   disjoint_int_copy_entry is set to the no-overlap entry point
2691   //   used by generate_conjoint_int_oop_copy().
2692   //
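       // Illustrative sketch (comment only, not generated code): ignoring the GC
       // barriers and the AVX3 fast path, the copy performed by this stub is
       // roughly equivalent to
       //
       //   void disjoint_int_copy(jint* from, jint* to, size_t count) {
       //     size_t qwords = count >> 1;
       //     for (size_t i = 0; i < qwords; i++)     // main loop copies qwords forward
       //       ((jlong*)to)[i] = ((jlong*)from)[i];
       //     if (count & 1)                          // trailing dword, if any
       //       to[count - 1] = from[count - 1];
       //   }
       //
       // For the oop variant the elements are 32-bit narrow oops (UseCompressedOops)
       // and the copy is bracketed by the BarrierSetAssembler prologue/epilogue.
       //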
2693   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2694                                          const char *name, bool dest_uninitialized = false) {
2695 #if COMPILER2_OR_JVMCI
2696     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2697        return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
2698                                                  aligned, is_oop, dest_uninitialized);
2699     }
2700 #endif
2701 
2702     __ align(CodeEntryAlignment);
2703     StubCodeMark mark(this, "StubRoutines", name);
2704     address start = __ pc();
2705 
2706     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2707     const Register from        = rdi;  // source array address
2708     const Register to          = rsi;  // destination array address
2709     const Register count       = rdx;  // elements count
2710     const Register dword_count = rcx;
2711     const Register qword_count = count;
2712     const Register end_from    = from; // source array end address
2713     const Register end_to      = to;   // destination array end address
2714     // End pointers are inclusive, and if count is not zero they point
2715     // to the last unit copied:  end_to[0] := end_from[0]
2716 
2717     __ enter(); // required for proper stackwalking of RuntimeStub frame
2718     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2719 
2720     if (entry != NULL) {
2721       *entry = __ pc();
2722       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2723       BLOCK_COMMENT("Entry:");
2724     }
2725 
2726     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2727                                    // r9 is used to save r15_thread
2728 
2729     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2730     if (dest_uninitialized) {
2731       decorators |= IS_DEST_UNINITIALIZED;
2732     }
2733     if (aligned) {
2734       decorators |= ARRAYCOPY_ALIGNED;
2735     }
2736 
2737     BasicType type = is_oop ? T_OBJECT : T_INT;
2738     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2739     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2740 
2741     {
2742       // UnsafeCopyMemory page error: continue after ucm
2743       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2744       // 'from', 'to' and 'count' are now valid
2745       __ movptr(dword_count, count);
2746       __ shrptr(count, 1); // count => qword_count
2747 
2748       // Copy from low to high addresses.  Use 'to' as scratch.
2749       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2750       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2751       __ negptr(qword_count);
2752       __ jmp(L_copy_bytes);
2753 
2754       // Copy trailing qwords
2755     __ BIND(L_copy_8_bytes);
2756       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2757       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2758       __ increment(qword_count);
2759       __ jcc(Assembler::notZero, L_copy_8_bytes);
2760 
2761       // Check for and copy trailing dword
2762     __ BIND(L_copy_4_bytes);
2763       __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2764       __ jccb(Assembler::zero, L_exit);
2765       __ movl(rax, Address(end_from, 8));
2766       __ movl(Address(end_to, 8), rax);
2767     }
2768   __ BIND(L_exit);
2769     address ucme_exit_pc = __ pc();
2770     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2771     restore_arg_regs_using_thread();
2772     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2773     __ vzeroupper();
2774     __ xorptr(rax, rax); // return 0
2775     __ leave(); // required for proper stackwalking of RuntimeStub frame
2776     __ ret(0);
2777 
2778     {
2779       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
2780       // Copy in multi-byte chunks
2781       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2782       __ jmp(L_copy_4_bytes);
2783     }
2784 
2785     return start;
2786   }
2787 
2788   // Arguments:
2789   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2790   //             ignored
2791   //   is_oop  - true => oop array, so generate store check code
2792   //   name    - stub name string
2793   //
2794   // Inputs:
2795   //   c_rarg0   - source array address
2796   //   c_rarg1   - destination array address
2797   //   c_rarg2   - element count, treated as ssize_t, can be zero
2798   //
2799   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2800   // the hardware handle it.  The two dwords within qwords that span
2801   // cache line boundaries will still be loaded and stored atomically.
2802   //
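       // Illustrative sketch (comment only, not generated code): because source and
       // destination may overlap, array_overlap_test() below branches to the disjoint
       // stub when there is no overlap; otherwise the copy runs from high to low
       // addresses, roughly
       //
       //   void conjoint_int_copy(jint* from, jint* to, size_t count) {
       //     for (size_t i = count; i > 0; i--)
       //       to[i - 1] = from[i - 1];   // backward copy is safe when 'to' overlaps above 'from'
       //   }
       //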
2803   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2804                                          address *entry, const char *name,
2805                                          bool dest_uninitialized = false) {
2806 #if COMPILER2_OR_JVMCI
2807     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2808        return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
2809                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
2810     }
2811 #endif
2812     __ align(CodeEntryAlignment);
2813     StubCodeMark mark(this, "StubRoutines", name);
2814     address start = __ pc();
2815 
2816     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2817     const Register from        = rdi;  // source array address
2818     const Register to          = rsi;  // destination array address
2819     const Register count       = rdx;  // elements count
2820     const Register dword_count = rcx;
2821     const Register qword_count = count;
2822 
2823     __ enter(); // required for proper stackwalking of RuntimeStub frame
2824     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2825 
2826     if (entry != NULL) {
2827       *entry = __ pc();
2828       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2829       BLOCK_COMMENT("Entry:");
2830     }
2831 
2832     array_overlap_test(nooverlap_target, Address::times_4);
2833     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2834                                    // r9 is used to save r15_thread
2835 
2836     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2837     if (dest_uninitialized) {
2838       decorators |= IS_DEST_UNINITIALIZED;
2839     }
2840     if (aligned) {
2841       decorators |= ARRAYCOPY_ALIGNED;
2842     }
2843 
2844     BasicType type = is_oop ? T_OBJECT : T_INT;
2845     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2846     // no registers are destroyed by this call
2847     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2848 
2849     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2850     {
2851       // UnsafeCopyMemory page error: continue after ucm
2852       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2853       // 'from', 'to' and 'count' are now valid
2854       __ movptr(dword_count, count);
2855       __ shrptr(count, 1); // count => qword_count
2856 
2857       // Copy from high to low addresses.  Use 'to' as scratch.
2858 
2859       // Check for and copy trailing dword
2860       __ testl(dword_count, 1);
2861       __ jcc(Assembler::zero, L_copy_bytes);
2862       __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2863       __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2864       __ jmp(L_copy_bytes);
2865 
2866       // Copy trailing qwords
2867     __ BIND(L_copy_8_bytes);
2868       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2869       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2870       __ decrement(qword_count);
2871       __ jcc(Assembler::notZero, L_copy_8_bytes);
2872     }
2873     if (is_oop) {
2874       __ jmp(L_exit);
2875     }
2876     restore_arg_regs_using_thread();
2877     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2878     __ xorptr(rax, rax); // return 0
2879     __ vzeroupper();
2880     __ leave(); // required for proper stackwalking of RuntimeStub frame
2881     __ ret(0);
2882 
2883     {
2884       // UnsafeCopyMemory page error: continue after ucm
2885       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2886       // Copy in multi-byte chunks
2887       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2888     }
2889 
2890   __ BIND(L_exit);
2891     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2892     restore_arg_regs_using_thread();
2893     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2894     __ xorptr(rax, rax); // return 0
2895     __ vzeroupper();
2896     __ leave(); // required for proper stackwalking of RuntimeStub frame
2897     __ ret(0);
2898 
2899     return start;
2900   }
2901 
2902   // Arguments:
2903   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2904   //             ignored
2905   //   is_oop  - true => oop array, so generate store check code
2906   //   name    - stub name string
2907   //
2908   // Inputs:
2909   //   c_rarg0   - source array address
2910   //   c_rarg1   - destination array address
2911   //   c_rarg2   - element count, treated as ssize_t, can be zero
2912   //
2913   // Side Effects:
2914   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2915   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2916   //
2917   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2918                                           const char *name, bool dest_uninitialized = false) {
2919 #if COMPILER2_OR_JVMCI
2920     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2921        return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
2922                                                  aligned, is_oop, dest_uninitialized);
2923     }
2924 #endif
2925     __ align(CodeEntryAlignment);
2926     StubCodeMark mark(this, "StubRoutines", name);
2927     address start = __ pc();
2928 
2929     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2930     const Register from        = rdi;  // source array address
2931     const Register to          = rsi;  // destination array address
2932     const Register qword_count = rdx;  // elements count
2933     const Register end_from    = from; // source array end address
2934     const Register end_to      = rcx;  // destination array end address
2935     const Register saved_count = r11;
2936     // End pointers are inclusive, and if count is not zero they point
2937     // to the last unit copied:  end_to[0] := end_from[0]
2938 
2939     __ enter(); // required for proper stackwalking of RuntimeStub frame
2940     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2941     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2942 
2943     if (entry != NULL) {
2944       *entry = __ pc();
2945       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2946       BLOCK_COMMENT("Entry:");
2947     }
2948 
2949     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2950                                    // r9 is used to save r15_thread
2951     // 'from', 'to' and 'qword_count' are now valid
2952 
2953     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2954     if (dest_uninitialized) {
2955       decorators |= IS_DEST_UNINITIALIZED;
2956     }
2957     if (aligned) {
2958       decorators |= ARRAYCOPY_ALIGNED;
2959     }
2960 
2961     BasicType type = is_oop ? T_OBJECT : T_LONG;
2962     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2963     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2964     {
2965       // UnsafeCopyMemory page error: continue after ucm
2966       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2967 
2968       // Copy from low to high addresses.  Use 'to' as scratch.
2969       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2970       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2971       __ negptr(qword_count);
2972       __ jmp(L_copy_bytes);
2973 
2974       // Copy trailing qwords
2975     __ BIND(L_copy_8_bytes);
2976       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2977       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2978       __ increment(qword_count);
2979       __ jcc(Assembler::notZero, L_copy_8_bytes);
2980     }
2981     if (is_oop) {
2982       __ jmp(L_exit);
2983     } else {
2984       restore_arg_regs_using_thread();
2985       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2986       __ xorptr(rax, rax); // return 0
2987       __ vzeroupper();
2988       __ leave(); // required for proper stackwalking of RuntimeStub frame
2989       __ ret(0);
2990     }
2991 
2992     {
2993       // UnsafeCopyMemory page error: continue after ucm
2994       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2995       // Copy in multi-byte chunks
2996       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2997     }
2998 
2999     __ BIND(L_exit);
3000     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
3001     restore_arg_regs_using_thread();
3002     if (is_oop) {
3003       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
3004     } else {
3005       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
3006     }
3007     __ vzeroupper();
3008     __ xorptr(rax, rax); // return 0
3009     __ leave(); // required for proper stackwalking of RuntimeStub frame
3010     __ ret(0);
3011 
3012     return start;
3013   }
3014 
3015   // Arguments:
3016   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
3017   //             ignored
3018   //   is_oop  - true => oop array, so generate store check code
3019   //   name    - stub name string
3020   //
3021   // Inputs:
3022   //   c_rarg0   - source array address
3023   //   c_rarg1   - destination array address
3024   //   c_rarg2   - element count, treated as ssize_t, can be zero
3025   //
3026   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
3027                                           address nooverlap_target, address *entry,
3028                                           const char *name, bool dest_uninitialized = false) {
3029 #if COMPILER2_OR_JVMCI
3030     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
3031        return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
3032                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
3033     }
3034 #endif
3035     __ align(CodeEntryAlignment);
3036     StubCodeMark mark(this, "StubRoutines", name);
3037     address start = __ pc();
3038 
3039     Label L_copy_bytes, L_copy_8_bytes, L_exit;
3040     const Register from        = rdi;  // source array address
3041     const Register to          = rsi;  // destination array address
3042     const Register qword_count = rdx;  // elements count
3043     const Register saved_count = rcx;
3044 
3045     __ enter(); // required for proper stackwalking of RuntimeStub frame
3046     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
3047 
3048     if (entry != NULL) {
3049       *entry = __ pc();
3050       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
3051       BLOCK_COMMENT("Entry:");
3052     }
3053 
3054     array_overlap_test(nooverlap_target, Address::times_8);
3055     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
3056                                    // r9 is used to save r15_thread
3057     // 'from', 'to' and 'qword_count' are now valid
3058 
3059     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
3060     if (dest_uninitialized) {
3061       decorators |= IS_DEST_UNINITIALIZED;
3062     }
3063     if (aligned) {
3064       decorators |= ARRAYCOPY_ALIGNED;
3065     }
3066 
3067     BasicType type = is_oop ? T_OBJECT : T_LONG;
3068     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3069     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
3070     {
3071       // UnsafeCopyMemory page error: continue after ucm
3072       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
3073 
3074       __ jmp(L_copy_bytes);
3075 
3076       // Copy trailing qwords
3077     __ BIND(L_copy_8_bytes);
3078       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
3079       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
3080       __ decrement(qword_count);
3081       __ jcc(Assembler::notZero, L_copy_8_bytes);
3082     }
3083     if (is_oop) {
3084       __ jmp(L_exit);
3085     } else {
3086       restore_arg_regs_using_thread();
3087       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
3088       __ xorptr(rax, rax); // return 0
3089       __ vzeroupper();
3090       __ leave(); // required for proper stackwalking of RuntimeStub frame
3091       __ ret(0);
3092     }
3093     {
3094       // UnsafeCopyMemory page error: continue after ucm
3095       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
3096 
3097       // Copy in multi-byte chunks
3098       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
3099     }
3100     __ BIND(L_exit);
3101     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
3102     restore_arg_regs_using_thread();
3103     if (is_oop) {
3104       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
3105     } else {
3106       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
3107     }
3108     __ vzeroupper();
3109     __ xorptr(rax, rax); // return 0
3110     __ leave(); // required for proper stackwalking of RuntimeStub frame
3111     __ ret(0);
3112 
3113     return start;
3114   }
3115 
3116 
3117   // Helper for generating a dynamic type check.
3118   // Smashes no registers.
3119   void generate_type_check(Register sub_klass,
3120                            Register super_check_offset,
3121                            Register super_klass,
3122                            Label& L_success) {
3123     assert_different_registers(sub_klass, super_check_offset, super_klass);
3124 
3125     BLOCK_COMMENT("type_check:");
3126 
3127     Label L_miss;
3128 
3129     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
3130                                      super_check_offset);
3131     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
3132 
3133     // Fall through on failure!
3134     __ BIND(L_miss);
3135   }
3136 
3137   //
3138   //  Generate checkcasting array copy stub
3139   //
3140   //  Input:
3141   //    c_rarg0   - source array address
3142   //    c_rarg1   - destination array address
3143   //    c_rarg2   - element count, treated as ssize_t, can be zero
3144   //    c_rarg3   - size_t ckoff (super_check_offset)
3145   // not Win64
3146   //    c_rarg4   - oop ckval (super_klass)
3147   // Win64
3148   //    rsp+40    - oop ckval (super_klass)
3149   //
3150   //  Output:
3151   //    rax ==  0  -  success
3152   //    rax == -1^K - failure, where K is partial transfer count
3153   //
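       // Note (comment only): on failure rax holds -1 XOR K, i.e. ~K, so a caller
       // can recover the number of elements actually copied with, for example,
       //
       //   intptr_t r = <value returned in rax>;
       //   size_t copied = (r == 0) ? length : (size_t)~r;   // ~r == K, the partial transfer count
       //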
3154   address generate_checkcast_copy(const char *name, address *entry,
3155                                   bool dest_uninitialized = false) {
3156 
3157     Label L_load_element, L_store_element, L_do_card_marks, L_done;
3158 
3159     // Input registers (after setup_arg_regs)
3160     const Register from        = rdi;   // source array address
3161     const Register to          = rsi;   // destination array address
3162     const Register length      = rdx;   // elements count
3163     const Register ckoff       = rcx;   // super_check_offset
3164     const Register ckval       = r8;    // super_klass
3165 
3166     // Registers used as temps (r13, r14 are save-on-entry)
3167     const Register end_from    = from;  // source array end address
3168     const Register end_to      = r13;   // destination array end address
3169     const Register count       = rdx;   // -(count_remaining)
3170     const Register r14_length  = r14;   // saved copy of length
3171     // End pointers are inclusive, and if length is not zero they point
3172     // to the last unit copied:  end_to[0] := end_from[0]
3173 
3174     const Register rax_oop    = rax;    // actual oop copied
3175     const Register r11_klass  = r11;    // oop._klass
3176 
3177     //---------------------------------------------------------------
3178     // Assembler stub will be used for this call to arraycopy
3179     // if the two arrays are subtypes of Object[] but the
3180     // destination array type is not equal to or a supertype
3181     // of the source type.  Each element must be separately
3182     // checked.
3183 
3184     __ align(CodeEntryAlignment);
3185     StubCodeMark mark(this, "StubRoutines", name);
3186     address start = __ pc();
3187 
3188     __ enter(); // required for proper stackwalking of RuntimeStub frame
3189 
3190 #ifdef ASSERT
3191     // caller guarantees that the arrays really are different
3192     // otherwise, we would have to make conjoint checks
3193     { Label L;
3194       array_overlap_test(L, TIMES_OOP);
3195       __ stop("checkcast_copy within a single array");
3196       __ bind(L);
3197     }
3198 #endif //ASSERT
3199 
3200     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
3201                        // ckoff => rcx, ckval => r8
3202                        // r9 and r10 may be used to save non-volatile registers
3203 #ifdef _WIN64
3204     // last argument (#4) is on stack on Win64
3205     __ movptr(ckval, Address(rsp, 6 * wordSize));
3206 #endif
3207 
3208     // Caller of this entry point must set up the argument registers.
3209     if (entry != NULL) {
3210       *entry = __ pc();
3211       BLOCK_COMMENT("Entry:");
3212     }
3213 
3214     // allocate spill slots for r13, r14
3215     enum {
3216       saved_r13_offset,
3217       saved_r14_offset,
3218       saved_r10_offset,
3219       saved_rbp_offset
3220     };
3221     __ subptr(rsp, saved_rbp_offset * wordSize);
3222     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
3223     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
3224     __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
3225 
3226 #ifdef ASSERT
3227       Label L2;
3228       __ get_thread(r14);
3229       __ cmpptr(r15_thread, r14);
3230       __ jcc(Assembler::equal, L2);
3231       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
3232       __ bind(L2);
3233 #endif // ASSERT
3234 
3235     // check that int operands are properly extended to size_t
3236     assert_clean_int(length, rax);
3237     assert_clean_int(ckoff, rax);
3238 
3239 #ifdef ASSERT
3240     BLOCK_COMMENT("assert consistent ckoff/ckval");
3241     // The ckoff and ckval must be mutually consistent,
3242     // even though caller generates both.
3243     { Label L;
3244       int sco_offset = in_bytes(Klass::super_check_offset_offset());
3245       __ cmpl(ckoff, Address(ckval, sco_offset));
3246       __ jcc(Assembler::equal, L);
3247       __ stop("super_check_offset inconsistent");
3248       __ bind(L);
3249     }
3250 #endif //ASSERT
3251 
3252     // Loop-invariant addresses.  They are exclusive end pointers.
3253     Address end_from_addr(from, length, TIMES_OOP, 0);
3254     Address   end_to_addr(to,   length, TIMES_OOP, 0);
3255     // Loop-variant addresses.  They assume post-incremented count < 0.
3256     Address from_element_addr(end_from, count, TIMES_OOP, 0);
3257     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
3258 
3259     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
3260     if (dest_uninitialized) {
3261       decorators |= IS_DEST_UNINITIALIZED;
3262     }
3263 
3264     BasicType type = T_OBJECT;
3265     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3266     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
3267 
3268     // Copy from low to high addresses, indexed from the end of each array.
3269     __ lea(end_from, end_from_addr);
3270     __ lea(end_to,   end_to_addr);
3271     __ movptr(r14_length, length);        // save a copy of the length
3272     assert(length == count, "");          // else fix next line:
3273     __ negptr(count);                     // negate and test the length
3274     __ jcc(Assembler::notZero, L_load_element);
3275 
3276     // Empty array:  Nothing to do.
3277     __ xorptr(rax, rax);                  // return 0 on (trivial) success
3278     __ jmp(L_done);
3279 
3280     // ======== begin loop ========
3281     // (Loop is rotated; its entry is L_load_element.)
3282     // Loop control:
3283     //   for (count = -count; count != 0; count++)
3284     // Base pointers src, dst are biased by 8*(count-1), to the last element.
3285     __ align(OptoLoopAlignment);
3286 
3287     __ BIND(L_store_element);
3288     __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW);  // store the oop
3289     __ increment(count);               // increment the count toward zero
3290     __ jcc(Assembler::zero, L_do_card_marks);
3291 
3292     // ======== loop entry is here ========
3293     __ BIND(L_load_element);
3294     __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
3295     __ testptr(rax_oop, rax_oop);
3296     __ jcc(Assembler::zero, L_store_element);
3297 
3298     __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
3299     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
3300     // ======== end loop ========
3301 
3302     // It was a real error; we must depend on the caller to finish the job.
3303     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
3304     // Emit GC store barriers for the oops we have copied (r14 + rdx),
3305     // and report their number to the caller.
3306     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
3307     Label L_post_barrier;
3308     __ addptr(r14_length, count);     // K = (original - remaining) oops
3309     __ movptr(rax, r14_length);       // save the value
3310     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
3311     __ jccb(Assembler::notZero, L_post_barrier);
3312     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
3313 
3314     // Come here on success only.
3315     __ BIND(L_do_card_marks);
3316     __ xorptr(rax, rax);              // return 0 on success
3317 
3318     __ BIND(L_post_barrier);
3319     bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
3320 
3321     // Common exit point (success or failure).
3322     __ BIND(L_done);
3323     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
3324     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
3325     __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
3326     restore_arg_regs();
3327     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
3328     __ leave(); // required for proper stackwalking of RuntimeStub frame
3329     __ ret(0);
3330 
3331     return start;
3332   }
3333 
3334   //
3335   //  Generate 'unsafe' array copy stub
3336   //  Though just as safe as the other stubs, it takes an unscaled
3337   //  size_t argument instead of an element count.
3338   //
3339   //  Input:
3340   //    c_rarg0   - source array address
3341   //    c_rarg1   - destination array address
3342   //    c_rarg2   - byte count, treated as ssize_t, can be zero
3343   //
3344   // Examines the alignment of the operands and dispatches
3345   // to a long, int, short, or byte copy loop.
3346   //
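       // Dispatch sketch (comment only): the stub ORs both addresses and the byte
       // count together and tests the low bits of the result, choosing the widest
       // element width that all three values permit:
       //
       //   uintptr_t bits = (uintptr_t)from | (uintptr_t)to | size;
       //   if      ((bits & 7) == 0) { count = size >> 3; goto long_copy;  }
       //   else if ((bits & 3) == 0) { count = size >> 2; goto int_copy;   }
       //   else if ((bits & 1) == 0) { count = size >> 1; goto short_copy; }
       //   else                      {                    goto byte_copy;  }
       //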
3347   address generate_unsafe_copy(const char *name,
3348                                address byte_copy_entry, address short_copy_entry,
3349                                address int_copy_entry, address long_copy_entry) {
3350 
3351     Label L_long_aligned, L_int_aligned, L_short_aligned;
3352 
3353     // Input registers (before setup_arg_regs)
3354     const Register from        = c_rarg0;  // source array address
3355     const Register to          = c_rarg1;  // destination array address
3356     const Register size        = c_rarg2;  // byte count (size_t)
3357 
3358     // Register used as a temp
3359     const Register bits        = rax;      // test copy of low bits
3360 
3361     __ align(CodeEntryAlignment);
3362     StubCodeMark mark(this, "StubRoutines", name);
3363     address start = __ pc();
3364 
3365     __ enter(); // required for proper stackwalking of RuntimeStub frame
3366 
3367     // bump this on entry, not on exit:
3368     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
3369 
3370     __ mov(bits, from);
3371     __ orptr(bits, to);
3372     __ orptr(bits, size);
3373 
3374     __ testb(bits, BytesPerLong-1);
3375     __ jccb(Assembler::zero, L_long_aligned);
3376 
3377     __ testb(bits, BytesPerInt-1);
3378     __ jccb(Assembler::zero, L_int_aligned);
3379 
3380     __ testb(bits, BytesPerShort-1);
3381     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
3382 
3383     __ BIND(L_short_aligned);
3384     __ shrptr(size, LogBytesPerShort); // size => short_count
3385     __ jump(RuntimeAddress(short_copy_entry));
3386 
3387     __ BIND(L_int_aligned);
3388     __ shrptr(size, LogBytesPerInt); // size => int_count
3389     __ jump(RuntimeAddress(int_copy_entry));
3390 
3391     __ BIND(L_long_aligned);
3392     __ shrptr(size, LogBytesPerLong); // size => qword_count
3393     __ jump(RuntimeAddress(long_copy_entry));
3394 
3395     return start;
3396   }
3397 
3398   // Perform range checks on the proposed arraycopy.
3399   // Kills temp, but nothing else.
3400   // Also, clean the sign bits of src_pos and dst_pos.
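       // Sketch of the checks (comment only); the unsigned 'above' comparison also
       // rejects a 32-bit sum that overflowed into the sign bit:
       //
       //   if ((uint32_t)(src_pos + length) > (uint32_t)src->length()) goto L_failed;
       //   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length()) goto L_failed;
       //   src_pos = (int64_t)(int32_t)src_pos;   // sign-extend: clears the high 32 bits
       //   dst_pos = (int64_t)(int32_t)dst_pos;   // (both are known to be non-negative)
       //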
3401   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
3402                               Register src_pos, // source position (c_rarg1)
3403                               Register dst,     // destination array oop (c_rarg2)
3404                               Register dst_pos, // destination position (c_rarg3)
3405                               Register length,
3406                               Register temp,
3407                               Label& L_failed) {
3408     BLOCK_COMMENT("arraycopy_range_checks:");
3409 
3410     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
3411     __ movl(temp, length);
3412     __ addl(temp, src_pos);             // src_pos + length
3413     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
3414     __ jcc(Assembler::above, L_failed);
3415 
3416     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
3417     __ movl(temp, length);
3418     __ addl(temp, dst_pos);             // dst_pos + length
3419     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
3420     __ jcc(Assembler::above, L_failed);
3421 
3422     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
3423     // Move with sign extension can be used since they are positive.
3424     __ movslq(src_pos, src_pos);
3425     __ movslq(dst_pos, dst_pos);
3426 
3427     BLOCK_COMMENT("arraycopy_range_checks done");
3428   }
3429 
3430   //
3431   //  Generate generic array copy stubs
3432   //
3433   //  Input:
3434   //    c_rarg0    -  src oop
3435   //    c_rarg1    -  src_pos (32-bits)
3436   //    c_rarg2    -  dst oop
3437   //    c_rarg3    -  dst_pos (32-bits)
3438   // not Win64
3439   //    c_rarg4    -  element count (32-bits)
3440   // Win64
3441   //    rsp+40     -  element count (32-bits)
3442   //
3443   //  Output:
3444   //    rax ==  0  -  success
3445   //    rax == -1^K - failure, where K is partial transfer count
3446   //
3447   address generate_generic_copy(const char *name,
3448                                 address byte_copy_entry, address short_copy_entry,
3449                                 address int_copy_entry, address oop_copy_entry,
3450                                 address long_copy_entry, address checkcast_copy_entry) {
3451 
3452     Label L_failed, L_failed_0, L_objArray;
3453     Label L_copy_shorts, L_copy_ints, L_copy_longs;
3454 
3455     // Input registers
3456     const Register src        = c_rarg0;  // source array oop
3457     const Register src_pos    = c_rarg1;  // source position
3458     const Register dst        = c_rarg2;  // destination array oop
3459     const Register dst_pos    = c_rarg3;  // destination position
3460 #ifndef _WIN64
3461     const Register length     = c_rarg4;
3462     const Register rklass_tmp = r9;  // load_klass
3463 #else
3464     const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
3465     const Register rklass_tmp = rdi;  // load_klass
3466 #endif
3467 
3468     { int modulus = CodeEntryAlignment;
3469       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
3470       int advance = target - (__ offset() % modulus);
3471       if (advance < 0)  advance += modulus;
3472       if (advance > 0)  __ nop(advance);
3473     }
3474     StubCodeMark mark(this, "StubRoutines", name);
3475 
3476     // Short-hop target to L_failed.  Makes for denser prologue code.
3477     __ BIND(L_failed_0);
3478     __ jmp(L_failed);
3479     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
3480 
3481     __ align(CodeEntryAlignment);
3482     address start = __ pc();
3483 
3484     __ enter(); // required for proper stackwalking of RuntimeStub frame
3485 
3486 #ifdef _WIN64
3487     __ push(rklass_tmp); // rdi is callee-save on Windows
3488 #endif
3489 
3490     // bump this on entry, not on exit:
3491     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
3492 
3493     //-----------------------------------------------------------------------
3494     // Assembler stub will be used for this call to arraycopy
3495     // if the following conditions are met:
3496     //
3497     // (1) src and dst must not be null.
3498     // (2) src_pos must not be negative.
3499     // (3) dst_pos must not be negative.
3500     // (4) length  must not be negative.
3501     // (5) src klass and dst klass should be the same and not NULL.
3502     // (6) src and dst should be arrays.
3503     // (7) src_pos + length must not exceed length of src.
3504     // (8) dst_pos + length must not exceed length of dst.
3505     //
3506 
3507     //  if (src == NULL) return -1;
3508     __ testptr(src, src);         // src oop
3509     size_t j1off = __ offset();
3510     __ jccb(Assembler::zero, L_failed_0);
3511 
3512     //  if (src_pos < 0) return -1;
3513     __ testl(src_pos, src_pos); // src_pos (32-bits)
3514     __ jccb(Assembler::negative, L_failed_0);
3515 
3516     //  if (dst == NULL) return -1;
3517     __ testptr(dst, dst);         // dst oop
3518     __ jccb(Assembler::zero, L_failed_0);
3519 
3520     //  if (dst_pos < 0) return -1;
3521     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3522     size_t j4off = __ offset();
3523     __ jccb(Assembler::negative, L_failed_0);
3524 
3525     // The first four tests are very dense code,
3526     // but not quite dense enough to put four
3527     // jumps in a 16-byte instruction fetch buffer.
3528     // That's good, because some branch predictors
3529     // do not like jumps so close together.
3530     // Make sure of this.
3531     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
3532 
3533     // registers used as temp
3534     const Register r11_length    = r11; // elements count to copy
3535     const Register r10_src_klass = r10; // array klass
3536 
3537     //  if (length < 0) return -1;
3538     __ movl(r11_length, length);        // length (elements count, 32-bits value)
3539     __ testl(r11_length, r11_length);
3540     __ jccb(Assembler::negative, L_failed_0);
3541 
3542     __ load_klass(r10_src_klass, src, rklass_tmp);
3543 #ifdef ASSERT
3544     //  assert(src->klass() != NULL);
3545     {
3546       BLOCK_COMMENT("assert klasses not null {");
3547       Label L1, L2;
3548       __ testptr(r10_src_klass, r10_src_klass);
3549       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
3550       __ bind(L1);
3551       __ stop("broken null klass");
3552       __ bind(L2);
3553       __ load_klass(rax, dst, rklass_tmp);
3554       __ cmpq(rax, 0);
3555       __ jcc(Assembler::equal, L1);     // this would be broken also
3556       BLOCK_COMMENT("} assert klasses not null done");
3557     }
3558 #endif
3559 
3560     // Load layout helper (32-bits)
3561     //
3562     //  |array_tag|     | header_size | element_type |     |log2_element_size|
3563     // 32        30    24            16              8     2                 0
3564     //
3565     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3566     //
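         // Decoding sketch (comment only): the code below extracts the fields of
         // the layout helper roughly as
         //
         //   int offset = (lh >> _lh_header_size_shift) & _lh_header_size_mask;  // array base offset
         //   int elsize =  lh & _lh_log2_element_size_mask;                      // log2(element size)
         //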
3567 
3568     const int lh_offset = in_bytes(Klass::layout_helper_offset());
3569 
3570     // Handle objArrays completely differently...
3571     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3572     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3573     __ jcc(Assembler::equal, L_objArray);
3574 
3575     //  if (src->klass() != dst->klass()) return -1;
3576     __ load_klass(rax, dst, rklass_tmp);
3577     __ cmpq(r10_src_klass, rax);
3578     __ jcc(Assembler::notEqual, L_failed);
3579 
3580     const Register rax_lh = rax;  // layout helper
3581     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3582 
3583     //  if (!src->is_Array()) return -1;
3584     __ cmpl(rax_lh, Klass::_lh_neutral_value);
3585     __ jcc(Assembler::greaterEqual, L_failed);
3586 
3587     // At this point, it is known to be a typeArray (array_tag 0x3).
3588 #ifdef ASSERT
3589     {
3590       BLOCK_COMMENT("assert primitive array {");
3591       Label L;
3592       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
3593       __ jcc(Assembler::greaterEqual, L);
3594       __ stop("must be a primitive array");
3595       __ bind(L);
3596       BLOCK_COMMENT("} assert primitive array done");
3597     }
3598 #endif
3599 
3600     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3601                            r10, L_failed);
3602 
3603     // TypeArrayKlass
3604     //
3605     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3606     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3607     //
3608 
3609     const Register r10_offset = r10;    // array offset
3610     const Register rax_elsize = rax_lh; // element size
3611 
3612     __ movl(r10_offset, rax_lh);
3613     __ shrl(r10_offset, Klass::_lh_header_size_shift);
3614     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
3615     __ addptr(src, r10_offset);           // src array offset
3616     __ addptr(dst, r10_offset);           // dst array offset
3617     BLOCK_COMMENT("choose copy loop based on element size");
3618     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3619 
3620 #ifdef _WIN64
3621     __ pop(rklass_tmp); // Restore callee-save rdi
3622 #endif
3623 
3624     // next registers should be set before the jump to corresponding stub
3625     const Register from     = c_rarg0;  // source array address
3626     const Register to       = c_rarg1;  // destination array address
3627     const Register count    = c_rarg2;  // elements count
3628 
3629     // 'from', 'to' and 'count' must be set in this order: they occupy the same
3630     // registers as 'src', 'src_pos' and 'dst', so each store clobbers an input no longer needed.
3631 
3632     __ cmpl(rax_elsize, 0);
3633     __ jccb(Assembler::notEqual, L_copy_shorts);
3634     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3635     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3636     __ movl2ptr(count, r11_length); // length
3637     __ jump(RuntimeAddress(byte_copy_entry));
3638 
3639   __ BIND(L_copy_shorts);
3640     __ cmpl(rax_elsize, LogBytesPerShort);
3641     __ jccb(Assembler::notEqual, L_copy_ints);
3642     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3643     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3644     __ movl2ptr(count, r11_length); // length
3645     __ jump(RuntimeAddress(short_copy_entry));
3646 
3647   __ BIND(L_copy_ints);
3648     __ cmpl(rax_elsize, LogBytesPerInt);
3649     __ jccb(Assembler::notEqual, L_copy_longs);
3650     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3651     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3652     __ movl2ptr(count, r11_length); // length
3653     __ jump(RuntimeAddress(int_copy_entry));
3654 
3655   __ BIND(L_copy_longs);
3656 #ifdef ASSERT
3657     {
3658       BLOCK_COMMENT("assert long copy {");
3659       Label L;
3660       __ cmpl(rax_elsize, LogBytesPerLong);
3661       __ jcc(Assembler::equal, L);
3662       __ stop("must be long copy, but elsize is wrong");
3663       __ bind(L);
3664       BLOCK_COMMENT("} assert long copy done");
3665     }
3666 #endif
3667     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3668     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3669     __ movl2ptr(count, r11_length); // length
3670     __ jump(RuntimeAddress(long_copy_entry));
3671 
3672     // ObjArrayKlass
3673   __ BIND(L_objArray);
3674     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3675 
3676     Label L_plain_copy, L_checkcast_copy;
3677     //  test array classes for subtyping
3678     __ load_klass(rax, dst, rklass_tmp);
3679     __ cmpq(r10_src_klass, rax); // usual case is exact equality
3680     __ jcc(Assembler::notEqual, L_checkcast_copy);
3681 
3682     // Identically typed arrays can be copied without element-wise checks.
3683     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3684                            r10, L_failed);
3685 
3686     __ lea(from, Address(src, src_pos, TIMES_OOP,
3687                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3688     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3689                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3690     __ movl2ptr(count, r11_length); // length
3691   __ BIND(L_plain_copy);
3692 #ifdef _WIN64
3693     __ pop(rklass_tmp); // Restore callee-save rdi
3694 #endif
3695     __ jump(RuntimeAddress(oop_copy_entry));
3696 
3697   __ BIND(L_checkcast_copy);
3698     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3699     {
3700       // Before looking at dst.length, make sure dst is also an objArray.
3701       __ cmpl(Address(rax, lh_offset), objArray_lh);
3702       __ jcc(Assembler::notEqual, L_failed);
3703 
3704       // It is safe to examine both src.length and dst.length.
3705       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3706                              rax, L_failed);
3707 
3708       const Register r11_dst_klass = r11;
3709       __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3710 
3711       // Marshal the base address arguments now, freeing registers.
3712       __ lea(from, Address(src, src_pos, TIMES_OOP,
3713                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3714       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3715                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3716       __ movl(count, length);           // length (reloaded)
3717       Register sco_temp = c_rarg3;      // this register is free now
3718       assert_different_registers(from, to, count, sco_temp,
3719                                  r11_dst_klass, r10_src_klass);
3720       assert_clean_int(count, sco_temp);
3721 
3722       // Generate the type check.
3723       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3724       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3725       assert_clean_int(sco_temp, rax);
3726       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3727 
3728       // Fetch destination element klass from the ObjArrayKlass header.
3729       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3730       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3731       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3732       assert_clean_int(sco_temp, rax);
3733 
3734 #ifdef _WIN64
3735       __ pop(rklass_tmp); // Restore callee-save rdi
3736 #endif
3737 
3738       // the checkcast_copy loop needs two extra arguments:
3739       assert(c_rarg3 == sco_temp, "#3 already in place");
3740       // Set up arguments for checkcast_copy_entry.
3741       setup_arg_regs(4);
3742       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3743       __ jump(RuntimeAddress(checkcast_copy_entry));
3744     }
3745 
3746   __ BIND(L_failed);
3747 #ifdef _WIN64
3748     __ pop(rklass_tmp); // Restore callee-save rdi
3749 #endif
3750     __ xorptr(rax, rax);
3751     __ notptr(rax); // return -1
3752     __ leave();   // required for proper stackwalking of RuntimeStub frame
3753     __ ret(0);
3754 
3755     return start;
3756   }
3757 
3758   address generate_data_cache_writeback() {
3759     const Register src        = c_rarg0;  // source address
3760 
3761     __ align(CodeEntryAlignment);
3762 
3763     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
3764 
3765     address start = __ pc();
3766     __ enter();
3767     __ cache_wb(Address(src, 0));
3768     __ leave();
3769     __ ret(0);
3770 
3771     return start;
3772   }
3773 
3774   address generate_data_cache_writeback_sync() {
3775     const Register is_pre    = c_rarg0;  // pre or post sync
3776 
3777     __ align(CodeEntryAlignment);
3778 
3779     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
3780 
3781     // pre wbsync is a no-op
3782     // post wbsync translates to an sfence
3783 
3784     Label skip;
3785     address start = __ pc();
3786     __ enter();
3787     __ cmpl(is_pre, 0);
3788     __ jcc(Assembler::notEqual, skip);
3789     __ cache_wbsync(false);
3790     __ bind(skip);
3791     __ leave();
3792     __ ret(0);
3793 
3794     return start;
3795   }
3796 
3797   void generate_arraycopy_stubs() {
3798     address entry;
3799     address entry_jbyte_arraycopy;
3800     address entry_jshort_arraycopy;
3801     address entry_jint_arraycopy;
3802     address entry_oop_arraycopy;
3803     address entry_jlong_arraycopy;
3804     address entry_checkcast_arraycopy;
3805 
3806     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
3807                                                                            "jbyte_disjoint_arraycopy");
3808     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3809                                                                            "jbyte_arraycopy");
3810 
3811     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3812                                                                             "jshort_disjoint_arraycopy");
3813     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3814                                                                             "jshort_arraycopy");
3815 
3816     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
3817                                                                               "jint_disjoint_arraycopy");
3818     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
3819                                                                               &entry_jint_arraycopy, "jint_arraycopy");
3820 
3821     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
3822                                                                                "jlong_disjoint_arraycopy");
3823     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
3824                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
3825 
3826 
3827     if (UseCompressedOops) {
3828       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
3829                                                                               "oop_disjoint_arraycopy");
3830       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
3831                                                                               &entry_oop_arraycopy, "oop_arraycopy");
3832       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
3833                                                                                      "oop_disjoint_arraycopy_uninit",
3834                                                                                      /*dest_uninitialized*/true);
3835       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
3836                                                                                      NULL, "oop_arraycopy_uninit",
3837                                                                                      /*dest_uninitialized*/true);
3838     } else {
3839       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
3840                                                                                "oop_disjoint_arraycopy");
3841       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
3842                                                                                &entry_oop_arraycopy, "oop_arraycopy");
3843       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
3844                                                                                       "oop_disjoint_arraycopy_uninit",
3845                                                                                       /*dest_uninitialized*/true);
3846       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
3847                                                                                       NULL, "oop_arraycopy_uninit",
3848                                                                                       /*dest_uninitialized*/true);
3849     }
3850 
3851     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3852     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3853                                                                         /*dest_uninitialized*/true);
3854 
3855     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3856                                                               entry_jbyte_arraycopy,
3857                                                               entry_jshort_arraycopy,
3858                                                               entry_jint_arraycopy,
3859                                                               entry_jlong_arraycopy);
3860     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3861                                                                entry_jbyte_arraycopy,
3862                                                                entry_jshort_arraycopy,
3863                                                                entry_jint_arraycopy,
3864                                                                entry_oop_arraycopy,
3865                                                                entry_jlong_arraycopy,
3866                                                                entry_checkcast_arraycopy);
3867 
3868     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3869     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3870     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3871     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3872     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3873     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3874 
3875     // We don't generate specialized code for HeapWord-aligned source
3876     // arrays, so just use the code we've already generated
3877     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3878     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3879 
3880     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3881     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3882 
3883     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3884     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3885 
3886     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3887     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3888 
3889     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3890     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3891 
3892     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3893     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3894 
3895     StubRoutines::_has_word_memcpy     = true;
3896     StubRoutines::_word_memcpy_up      = generate_disjoint_word_copy_up  (false, "word_memcpy_up");
3897     StubRoutines::_word_memcpy_up_nt   = generate_disjoint_word_copy_up  (true,  "word_memcpy_up_nt");
3898     StubRoutines::_word_memcpy_down    = generate_disjoint_word_copy_down(0,     "word_memcpy_down");
3899     StubRoutines::_word_memcpy_down_nt = generate_disjoint_word_copy_down(1,     "word_memcpy_down_nt");
3900   }
3901 
3902   // AES intrinsic stubs
3903   enum {AESBlockSize = 16};
3904 
3905   address generate_key_shuffle_mask() {
3906     __ align(16);
3907     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3908     address start = __ pc();
3909     __ emit_data64( 0x0405060700010203, relocInfo::none );
3910     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3911     return start;
3912   }
3913 
3914   address generate_counter_shuffle_mask() {
3915     __ align(16);
3916     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3917     address start = __ pc();
3918     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3919     __ emit_data64(0x0001020304050607, relocInfo::none);
3920     return start;
3921   }
3922 
3923   // Utility routine for loading a 128-bit key word in little-endian format;
3924   // the shuffle mask can optionally be supplied already loaded in an xmm register.
3925   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3926     __ movdqu(xmmdst, Address(key, offset));
3927     if (xmm_shuf_mask != NULL) {
3928       __ pshufb(xmmdst, xmm_shuf_mask);
3929     } else {
3930       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3931     }
3932   }
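  // Note (descriptive comment, not generated code): pshufb with the key shuffle
  // mask above byte-reverses each aligned 4-byte group, i.e. bytes
  // {b0 b1 b2 b3 | b4 b5 b6 b7 | ...} become {b3 b2 b1 b0 | b7 b6 b5 b4 | ...},
  // turning each int of the Java expanded key into the byte order the AES
  // instructions expect.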
3933 
3934   // Utility routine for incrementing the 128-bit counter (the IV in CTR mode)
3935   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3936     __ pextrq(reg, xmmdst, 0x0);
3937     __ addq(reg, inc_delta);
3938     __ pinsrq(xmmdst, reg, 0x0);
3939     __ jcc(Assembler::carryClear, next_block); // jump if no carry
3940     __ pextrq(reg, xmmdst, 0x01); // Carry
3941     __ addq(reg, 0x01);
3942     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3943     __ BIND(next_block);          // next instruction
3944   }
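  // In C-like terms (illustrative sketch only, not generated code), inc_counter()
  // performs a 128-bit increment of the counter held in xmmdst, viewed as two
  // 64-bit halves lo and hi:
  //
  //   lo += inc_delta;
  //   if (carry out of lo) {
  //     hi += 1;
  //   }
  //   // fall through to next_block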
3945 
3946   // Arguments:
3947   //
3948   // Inputs:
3949   //   c_rarg0   - source byte array address
3950   //   c_rarg1   - destination byte array address
3951   //   c_rarg2   - K (key) in little endian int array
3952   //
3953   address generate_aescrypt_encryptBlock() {
3954     assert(UseAES, "need AES instructions and misaligned SSE support");
3955     __ align(CodeEntryAlignment);
3956     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3957     Label L_doLast;
3958     address start = __ pc();
3959 
3960     const Register from        = c_rarg0;  // source array address
3961     const Register to          = c_rarg1;  // destination array address
3962     const Register key         = c_rarg2;  // key array address
3963     const Register keylen      = rax;
3964 
3965     const XMMRegister xmm_result = xmm0;
3966     const XMMRegister xmm_key_shuf_mask = xmm1;
3967     // On win64 xmm6-xmm15 must be preserved so don't use them.
3968     const XMMRegister xmm_temp1  = xmm2;
3969     const XMMRegister xmm_temp2  = xmm3;
3970     const XMMRegister xmm_temp3  = xmm4;
3971     const XMMRegister xmm_temp4  = xmm5;
3972 
3973     __ enter(); // required for proper stackwalking of RuntimeStub frame
3974 
3975     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3976     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3977 
3978     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3979     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3980 
3981     // For encryption, the Java expanded key ordering is just what we need;
3982     // we don't know if the key is aligned, hence not using the load-execute form.
3983 
3984     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3985     __ pxor(xmm_result, xmm_temp1);
3986 
3987     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3988     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3989     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3990     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3991 
3992     __ aesenc(xmm_result, xmm_temp1);
3993     __ aesenc(xmm_result, xmm_temp2);
3994     __ aesenc(xmm_result, xmm_temp3);
3995     __ aesenc(xmm_result, xmm_temp4);
3996 
3997     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3998     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3999     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
4000     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
4001 
4002     __ aesenc(xmm_result, xmm_temp1);
4003     __ aesenc(xmm_result, xmm_temp2);
4004     __ aesenc(xmm_result, xmm_temp3);
4005     __ aesenc(xmm_result, xmm_temp4);
4006 
4007     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
4008     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
4009 
4010     __ cmpl(keylen, 44);
4011     __ jccb(Assembler::equal, L_doLast);
4012 
4013     __ aesenc(xmm_result, xmm_temp1);
4014     __ aesenc(xmm_result, xmm_temp2);
4015 
4016     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
4017     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
4018 
4019     __ cmpl(keylen, 52);
4020     __ jccb(Assembler::equal, L_doLast);
4021 
4022     __ aesenc(xmm_result, xmm_temp1);
4023     __ aesenc(xmm_result, xmm_temp2);
4024 
4025     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
4026     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
4027 
4028     __ BIND(L_doLast);
4029     __ aesenc(xmm_result, xmm_temp1);
4030     __ aesenclast(xmm_result, xmm_temp2);
4031     __ movdqu(Address(to, 0), xmm_result);        // store the result
4032     __ xorptr(rax, rax); // return 0
4033     __ leave(); // required for proper stackwalking of RuntimeStub frame
4034     __ ret(0);
4035 
4036     return start;
4037   }
4038 
4039 
4040   // Arguments:
4041   //
4042   // Inputs:
4043   //   c_rarg0   - source byte array address
4044   //   c_rarg1   - destination byte array address
4045   //   c_rarg2   - K (key) in little endian int array
4046   //
4047   address generate_aescrypt_decryptBlock() {
4048     assert(UseAES, "need AES instructions and misaligned SSE support");
4049     __ align(CodeEntryAlignment);
4050     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
4051     Label L_doLast;
4052     address start = __ pc();
4053 
4054     const Register from        = c_rarg0;  // source array address
4055     const Register to          = c_rarg1;  // destination array address
4056     const Register key         = c_rarg2;  // key array address
4057     const Register keylen      = rax;
4058 
4059     const XMMRegister xmm_result = xmm0;
4060     const XMMRegister xmm_key_shuf_mask = xmm1;
4061     // On win64 xmm6-xmm15 must be preserved so don't use them.
4062     const XMMRegister xmm_temp1  = xmm2;
4063     const XMMRegister xmm_temp2  = xmm3;
4064     const XMMRegister xmm_temp3  = xmm4;
4065     const XMMRegister xmm_temp4  = xmm5;
4066 
4067     __ enter(); // required for proper stackwalking of RuntimeStub frame
4068 
4069     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
4070     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4071 
4072     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4073     __ movdqu(xmm_result, Address(from, 0));
4074 
4075     // For decryption the Java expanded key ordering is rotated one position from
4076     // what we want, so we start from 0x10 here and hit 0x00 last.
4077     // We don't know if the key is aligned, hence not using the load-execute form.
4078     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
4079     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
4080     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
4081     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
4082 
4083     __ pxor  (xmm_result, xmm_temp1);
4084     __ aesdec(xmm_result, xmm_temp2);
4085     __ aesdec(xmm_result, xmm_temp3);
4086     __ aesdec(xmm_result, xmm_temp4);
4087 
4088     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
4089     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
4090     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
4091     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
4092 
4093     __ aesdec(xmm_result, xmm_temp1);
4094     __ aesdec(xmm_result, xmm_temp2);
4095     __ aesdec(xmm_result, xmm_temp3);
4096     __ aesdec(xmm_result, xmm_temp4);
4097 
4098     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
4099     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
4100     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
4101 
4102     __ cmpl(keylen, 44);
4103     __ jccb(Assembler::equal, L_doLast);
4104 
4105     __ aesdec(xmm_result, xmm_temp1);
4106     __ aesdec(xmm_result, xmm_temp2);
4107 
4108     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
4109     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
4110 
4111     __ cmpl(keylen, 52);
4112     __ jccb(Assembler::equal, L_doLast);
4113 
4114     __ aesdec(xmm_result, xmm_temp1);
4115     __ aesdec(xmm_result, xmm_temp2);
4116 
4117     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
4118     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
4119 
4120     __ BIND(L_doLast);
4121     __ aesdec(xmm_result, xmm_temp1);
4122     __ aesdec(xmm_result, xmm_temp2);
4123 
4124     // for decryption the aesdeclast operation is always on key+0x00
4125     __ aesdeclast(xmm_result, xmm_temp3);
4126     __ movdqu(Address(to, 0), xmm_result);  // store the result
4127     __ xorptr(rax, rax); // return 0
4128     __ leave(); // required for proper stackwalking of RuntimeStub frame
4129     __ ret(0);
4130 
4131     return start;
4132   }
4133 
4134 
4135   // Arguments:
4136   //
4137   // Inputs:
4138   //   c_rarg0   - source byte array address
4139   //   c_rarg1   - destination byte array address
4140   //   c_rarg2   - K (key) in little endian int array
4141   //   c_rarg3   - r vector byte array address
4142   //   c_rarg4   - input length
4143   //
4144   // Output:
4145   //   rax       - input length
4146   //
4147   address generate_cipherBlockChaining_encryptAESCrypt() {
4148     assert(UseAES, "need AES instructions and misaligned SSE support");
4149     __ align(CodeEntryAlignment);
4150     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
4151     address start = __ pc();
4152 
4153     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
4154     const Register from        = c_rarg0;  // source array address
4155     const Register to          = c_rarg1;  // destination array address
4156     const Register key         = c_rarg2;  // key array address
4157     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
4158                                            // and left with the results of the last encryption block
4159 #ifndef _WIN64
4160     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
4161 #else
4162     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
4163     const Register len_reg     = r11;      // pick the volatile windows register
4164 #endif
4165     const Register pos         = rax;
4166 
4167     // xmm register assignments for the loops below
4168     const XMMRegister xmm_result = xmm0;
4169     const XMMRegister xmm_temp   = xmm1;
4170     // keys 0-10 preloaded into xmm2-xmm12
4171     const int XMM_REG_NUM_KEY_FIRST = 2;
4172     const int XMM_REG_NUM_KEY_LAST  = 15;
4173     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
4174     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
4175     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
4176     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
4177     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
4178 
4179     __ enter(); // required for proper stackwalking of RuntimeStub frame
4180 
4181 #ifdef _WIN64
4182     // on win64, fill len_reg from stack position
4183     __ movl(len_reg, len_mem);
4184 #else
4185     __ push(len_reg); // Save
4186 #endif
4187 
4188     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
4189     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4190     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
4191     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
4192       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
4193       offset += 0x10;
4194     }
4195     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
4196 
4197     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
4198     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4199     __ cmpl(rax, 44);
4200     __ jcc(Assembler::notEqual, L_key_192_256);
4201 
4202     // 128-bit code follows here
4203     __ movptr(pos, 0);
4204     __ align(OptoLoopAlignment);
4205 
4206     __ BIND(L_loopTop_128);
4207     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
4208     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
4209     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
4210     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
4211       __ aesenc(xmm_result, as_XMMRegister(rnum));
4212     }
4213     __ aesenclast(xmm_result, xmm_key10);
4214     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
4215     // no need to store r to memory until we exit
4216     __ addptr(pos, AESBlockSize);
4217     __ subptr(len_reg, AESBlockSize);
4218     __ jcc(Assembler::notEqual, L_loopTop_128);
4219 
4220     __ BIND(L_exit);
4221     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
4222 
4223 #ifdef _WIN64
4224     __ movl(rax, len_mem);
4225 #else
4226     __ pop(rax); // return length
4227 #endif
4228     __ leave(); // required for proper stackwalking of RuntimeStub frame
4229     __ ret(0);
4230 
4231     __ BIND(L_key_192_256);
4232     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
4233     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
4234     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
4235     __ cmpl(rax, 52);
4236     __ jcc(Assembler::notEqual, L_key_256);
4237 
4238     // 192-bit code follows here (could be changed to use more xmm registers)
4239     __ movptr(pos, 0);
4240     __ align(OptoLoopAlignment);
4241 
4242     __ BIND(L_loopTop_192);
4243     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
4244     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
4245     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
4246     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
4247       __ aesenc(xmm_result, as_XMMRegister(rnum));
4248     }
4249     __ aesenclast(xmm_result, xmm_key12);
4250     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
4251     // no need to store r to memory until we exit
4252     __ addptr(pos, AESBlockSize);
4253     __ subptr(len_reg, AESBlockSize);
4254     __ jcc(Assembler::notEqual, L_loopTop_192);
4255     __ jmp(L_exit);
4256 
4257     __ BIND(L_key_256);
4258     // 256-bit code follows here (could be changed to use more xmm registers)
4259     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
4260     __ movptr(pos, 0);
4261     __ align(OptoLoopAlignment);
4262 
4263     __ BIND(L_loopTop_256);
4264     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
4265     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
4266     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
4267     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
4268       __ aesenc(xmm_result, as_XMMRegister(rnum));
4269     }
4270     load_key(xmm_temp, key, 0xe0);
4271     __ aesenclast(xmm_result, xmm_temp);
4272     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
4273     // no need to store r to memory until we exit
4274     __ addptr(pos, AESBlockSize);
4275     __ subptr(len_reg, AESBlockSize);
4276     __ jcc(Assembler::notEqual, L_loopTop_256);
4277     __ jmp(L_exit);
4278 
4279     return start;
4280   }
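  // Descriptive note on the stub above: CBC encryption is inherently serial.
  // With r holding the previous ciphertext block (initially the IV loaded from
  // rvec), each loop iteration effectively computes
  //
  //   r = AES_encrypt(key, plaintext[i] ^ r);  ciphertext[i] = r;
  //
  // so only one block can be processed per iteration, unlike the decryption
  // stub further below which overlaps four blocks at a time.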
4281 
4282   // Safefetch stubs.
4283   void generate_safefetch(const char* name, int size, address* entry,
4284                           address* fault_pc, address* continuation_pc) {
4285     // safefetch signatures:
4286     //   int      SafeFetch32(int*      adr, int      errValue);
4287     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
4288     //
4289     // arguments:
4290     //   c_rarg0 = adr
4291     //   c_rarg1 = errValue
4292     //
4293     // result:
4294     //   rax      = *adr or errValue
4295 
4296     StubCodeMark mark(this, "StubRoutines", name);
4297 
4298     // Entry point, pc or function descriptor.
4299     *entry = __ pc();
4300 
4301     // Load *adr into c_rarg1, may fault.
4302     *fault_pc = __ pc();
4303     switch (size) {
4304       case 4:
4305         // int32_t
4306         __ movl(c_rarg1, Address(c_rarg0, 0));
4307         break;
4308       case 8:
4309         // int64_t
4310         __ movq(c_rarg1, Address(c_rarg0, 0));
4311         break;
4312       default:
4313         ShouldNotReachHere();
4314     }
4315 
4316     // return errValue or *adr
4317     *continuation_pc = __ pc();
4318     __ movq(rax, c_rarg1);
4319     __ ret(0);
4320   }
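  // Descriptive note: the returned entry/fault_pc/continuation_pc are recorded
  // in StubRoutines. If the load at fault_pc faults because adr is unmapped,
  // the VM's signal handler recognizes the faulting pc and resumes execution at
  // continuation_pc, so the stub returns errValue instead of crashing;
  // otherwise it returns the loaded value.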
4321 
4322   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
4323   // to hide instruction latency
4324   //
4325   // Arguments:
4326   //
4327   // Inputs:
4328   //   c_rarg0   - source byte array address
4329   //   c_rarg1   - destination byte array address
4330   //   c_rarg2   - K (key) in little endian int array
4331   //   c_rarg3   - r vector byte array address
4332   //   c_rarg4   - input length
4333   //
4334   // Output:
4335   //   rax       - input length
4336   //
4337   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
4338     assert(UseAES, "need AES instructions and misaligned SSE support");
4339     __ align(CodeEntryAlignment);
4340     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4341     address start = __ pc();
4342 
4343     const Register from        = c_rarg0;  // source array address
4344     const Register to          = c_rarg1;  // destination array address
4345     const Register key         = c_rarg2;  // key array address
4346     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
4347                                            // and left holding the last ciphertext block on exit
4348 #ifndef _WIN64
4349     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
4350 #else
4351     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
4352     const Register len_reg     = r11;      // pick the volatile windows register
4353 #endif
4354     const Register pos         = rax;
4355 
4356     const int PARALLEL_FACTOR = 4;
4357     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
4358 
4359     Label L_exit;
4360     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
4361     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
4362     Label L_singleBlock_loopTop[3]; // 128, 192, 256
4363     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
4364     Label L_multiBlock_loopTop[3]; // 128, 192, 256
4365 
4366     // keys 0-10 preloaded into xmm5-xmm15
4367     const int XMM_REG_NUM_KEY_FIRST = 5;
4368     const int XMM_REG_NUM_KEY_LAST  = 15;
4369     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
4370     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
4371 
4372     __ enter(); // required for proper stackwalking of RuntimeStub frame
4373 
4374 #ifdef _WIN64
4375     // on win64, fill len_reg from stack position
4376     __ movl(len_reg, len_mem);
4377 #else
4378     __ push(len_reg); // Save
4379 #endif
4380     __ push(rbx);
4381     // the java expanded key ordering is rotated one position from what we want
4382     // so we start from 0x10 here and hit 0x00 last
4383     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
4384     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4385     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
4386     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
4387       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
4388       offset += 0x10;
4389     }
4390     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
4391 
4392     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
4393 
4394     // registers holding the four results in the parallelized loop
4395     const XMMRegister xmm_result0 = xmm0;
4396     const XMMRegister xmm_result1 = xmm2;
4397     const XMMRegister xmm_result2 = xmm3;
4398     const XMMRegister xmm_result3 = xmm4;
4399 
4400     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
4401 
4402     __ xorptr(pos, pos);
4403 
4404     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
4405     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4406     __ cmpl(rbx, 52);
4407     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
4408     __ cmpl(rbx, 60);
4409     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
4410 
4411 #define DoFour(opc, src_reg)           \
4412   __ opc(xmm_result0, src_reg);         \
4413   __ opc(xmm_result1, src_reg);         \
4414   __ opc(xmm_result2, src_reg);         \
4415   __ opc(xmm_result3, src_reg);         \
4416 
4417     for (int k = 0; k < 3; ++k) {
4418       __ BIND(L_multiBlock_loopTopHead[k]);
4419       if (k != 0) {
4420         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
4421         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
4422       }
4423       if (k == 1) {
4424         __ subptr(rsp, 6 * wordSize);
4425         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
4426         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
4427         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
4428         load_key(xmm1, key, 0xc0);  // 0xc0;
4429         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
4430       } else if (k == 2) {
4431         __ subptr(rsp, 10 * wordSize);
4432         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
4433         load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
4434         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
4435         load_key(xmm1, key, 0xe0);  // 0xe0;
4436         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
4437         load_key(xmm15, key, 0xb0); // 0xb0;
4438         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
4439         load_key(xmm1, key, 0xc0);  // 0xc0;
4440         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
4441       }
4442       __ align(OptoLoopAlignment);
4443       __ BIND(L_multiBlock_loopTop[k]);
4444       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
4445       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
4446 
4447       if  (k != 0) {
4448         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
4449         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
4450       }
4451 
4452       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmm_result registers
4453       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4454       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4455       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4456 
4457       DoFour(pxor, xmm_key_first);
4458       if (k == 0) {
4459         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
4460           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4461         }
4462         DoFour(aesdeclast, xmm_key_last);
4463       } else if (k == 1) {
4464         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
4465           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4466         }
4467         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
4468         DoFour(aesdec, xmm1);  // key : 0xc0
4469         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
4470         DoFour(aesdeclast, xmm_key_last);
4471       } else if (k == 2) {
4472         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
4473           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4474         }
4475         DoFour(aesdec, xmm1);  // key : 0xc0
4476         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
4477         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
4478         DoFour(aesdec, xmm15);  // key : 0xd0
4479         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
4480         DoFour(aesdec, xmm1);  // key : 0xe0
4481         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
4482         DoFour(aesdeclast, xmm_key_last);
4483       }
4484 
4485       // for each result, xor with the r vector of previous cipher block
4486       __ pxor(xmm_result0, xmm_prev_block_cipher);
4487       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4488       __ pxor(xmm_result1, xmm_prev_block_cipher);
4489       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4490       __ pxor(xmm_result2, xmm_prev_block_cipher);
4491       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4492       __ pxor(xmm_result3, xmm_prev_block_cipher);
4493       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
4494       if (k != 0) {
4495         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
4496       }
4497 
4498       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
4499       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4500       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4501       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4502 
4503       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
4504       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
4505       __ jmp(L_multiBlock_loopTop[k]);
4506 
4507       // registers used in the non-parallelized loops
4508       // xmm register assignments for the loops below
4509       const XMMRegister xmm_result = xmm0;
4510       const XMMRegister xmm_prev_block_cipher_save = xmm2;
4511       const XMMRegister xmm_key11 = xmm3;
4512       const XMMRegister xmm_key12 = xmm4;
4513       const XMMRegister key_tmp = xmm4;
4514 
4515       __ BIND(L_singleBlock_loopTopHead[k]);
4516       if (k == 1) {
4517         __ addptr(rsp, 6 * wordSize);
4518       } else if (k == 2) {
4519         __ addptr(rsp, 10 * wordSize);
4520       }
4521       __ cmpptr(len_reg, 0); // any blocks left??
4522       __ jcc(Assembler::equal, L_exit);
4523       __ BIND(L_singleBlock_loopTopHead2[k]);
4524       if (k == 1) {
4525         load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
4526         load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
4527       }
4528       if (k == 2) {
4529         load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
4530       }
4531       __ align(OptoLoopAlignment);
4532       __ BIND(L_singleBlock_loopTop[k]);
4533       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
4534       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
4535       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
4536       for (int rnum = 1; rnum <= 9 ; rnum++) {
4537           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4538       }
4539       if (k == 1) {
4540         __ aesdec(xmm_result, xmm_key11);
4541         __ aesdec(xmm_result, xmm_key12);
4542       }
4543       if (k == 2) {
4544         __ aesdec(xmm_result, xmm_key11);
4545         load_key(key_tmp, key, 0xc0);
4546         __ aesdec(xmm_result, key_tmp);
4547         load_key(key_tmp, key, 0xd0);
4548         __ aesdec(xmm_result, key_tmp);
4549         load_key(key_tmp, key, 0xe0);
4550         __ aesdec(xmm_result, key_tmp);
4551       }
4552 
4553       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
4554       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
4555       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
4556       // no need to store r to memory until we exit
4557       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
4558       __ addptr(pos, AESBlockSize);
4559       __ subptr(len_reg, AESBlockSize);
4560       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
4561       if (k != 2) {
4562         __ jmp(L_exit);
4563       }
4564     } //for 128/192/256
4565 
4566     __ BIND(L_exit);
4567     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
4568     __ pop(rbx);
4569 #ifdef _WIN64
4570     __ movl(rax, len_mem);
4571 #else
4572     __ pop(rax); // return length
4573 #endif
4574     __ leave(); // required for proper stackwalking of RuntimeStub frame
4575     __ ret(0);
4576     return start;
4577   }
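  // Descriptive note on the stub above: unlike CBC encryption, CBC decryption
  // parallelizes because each output depends only on ciphertext that is already
  // available:
  //
  //   plaintext[i] = AES_decrypt(key, ciphertext[i]) ^ ciphertext[i-1]
  //
  // (with ciphertext[-1] being the IV), which is why the multi-block loop can
  // run four aesdec streams side by side and xor in the previous cipher blocks
  // afterwards.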
4578 
4579   address generate_electronicCodeBook_encryptAESCrypt() {
4580     __ align(CodeEntryAlignment);
4581     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_encryptAESCrypt");
4582     address start = __ pc();
4583     const Register from = c_rarg0;  // source array address
4584     const Register to = c_rarg1;  // destination array address
4585     const Register key = c_rarg2;  // key array address
4586     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4587     __ enter(); // required for proper stackwalking of RuntimeStub frame
4588     __ aesecb_encrypt(from, to, key, len);
4589     __ leave(); // required for proper stackwalking of RuntimeStub frame
4590     __ ret(0);
4591     return start;
4592  }
4593 
4594   address generate_electronicCodeBook_decryptAESCrypt() {
4595     __ align(CodeEntryAlignment);
4596     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_decryptAESCrypt");
4597     address start = __ pc();
4598     const Register from = c_rarg0;  // source array address
4599     const Register to = c_rarg1;  // destination array address
4600     const Register key = c_rarg2;  // key array address
4601     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4602     __ enter(); // required for proper stackwalking of RuntimeStub frame
4603     __ aesecb_decrypt(from, to, key, len);
4604     __ leave(); // required for proper stackwalking of RuntimeStub frame
4605     __ ret(0);
4606     return start;
4607   }
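  // Descriptive note: ECB mode has no chaining; every 16-byte block is
  // transformed independently (ciphertext[i] = AES_encrypt(key, plaintext[i])),
  // so both ECB stubs simply delegate to the aesecb_encrypt/aesecb_decrypt
  // MacroAssembler routines.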
4608 
4609   // ofs and limit are used for multi-block byte array.
4610   // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
4611   address generate_md5_implCompress(bool multi_block, const char *name) {
4612     __ align(CodeEntryAlignment);
4613     StubCodeMark mark(this, "StubRoutines", name);
4614     address start = __ pc();
4615 
4616     const Register buf_param = r15;
4617     const Address state_param(rsp, 0 * wordSize);
4618     const Address ofs_param  (rsp, 1 * wordSize    );
4619     const Address limit_param(rsp, 1 * wordSize + 4);
4620 
4621     __ enter();
4622     __ push(rbx);
4623     __ push(rdi);
4624     __ push(rsi);
4625     __ push(r15);
4626     __ subptr(rsp, 2 * wordSize);
4627 
4628     __ movptr(buf_param, c_rarg0);
4629     __ movptr(state_param, c_rarg1);
4630     if (multi_block) {
4631       __ movl(ofs_param, c_rarg2);
4632       __ movl(limit_param, c_rarg3);
4633     }
4634     __ fast_md5(buf_param, state_param, ofs_param, limit_param, multi_block);
4635 
4636     __ addptr(rsp, 2 * wordSize);
4637     __ pop(r15);
4638     __ pop(rsi);
4639     __ pop(rdi);
4640     __ pop(rbx);
4641     __ leave();
4642     __ ret(0);
4643     return start;
4644   }
4645 
4646   address generate_upper_word_mask() {
4647     __ align64();
4648     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
4649     address start = __ pc();
4650     __ emit_data64(0x0000000000000000, relocInfo::none);
4651     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
4652     return start;
4653   }
4654 
4655   address generate_shuffle_byte_flip_mask() {
4656     __ align64();
4657     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
4658     address start = __ pc();
4659     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4660     __ emit_data64(0x0001020304050607, relocInfo::none);
4661     return start;
4662   }
4663 
4664   // ofs and limit are used for multi-block byte array.
4665   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4666   address generate_sha1_implCompress(bool multi_block, const char *name) {
4667     __ align(CodeEntryAlignment);
4668     StubCodeMark mark(this, "StubRoutines", name);
4669     address start = __ pc();
4670 
4671     Register buf = c_rarg0;
4672     Register state = c_rarg1;
4673     Register ofs = c_rarg2;
4674     Register limit = c_rarg3;
4675 
4676     const XMMRegister abcd = xmm0;
4677     const XMMRegister e0 = xmm1;
4678     const XMMRegister e1 = xmm2;
4679     const XMMRegister msg0 = xmm3;
4680 
4681     const XMMRegister msg1 = xmm4;
4682     const XMMRegister msg2 = xmm5;
4683     const XMMRegister msg3 = xmm6;
4684     const XMMRegister shuf_mask = xmm7;
4685 
4686     __ enter();
4687 
4688     __ subptr(rsp, 4 * wordSize);
4689 
4690     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
4691       buf, state, ofs, limit, rsp, multi_block);
4692 
4693     __ addptr(rsp, 4 * wordSize);
4694 
4695     __ leave();
4696     __ ret(0);
4697     return start;
4698   }
4699 
4700   address generate_pshuffle_byte_flip_mask() {
4701     __ align64();
4702     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
4703     address start = __ pc();
4704     __ emit_data64(0x0405060700010203, relocInfo::none);
4705     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4706 
4707     if (VM_Version::supports_avx2()) {
4708       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
4709       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4710       // _SHUF_00BA
4711       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4712       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4713       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4714       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4715       // _SHUF_DC00
4716       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4717       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4718       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4719       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4720     }
4721 
4722     return start;
4723   }
4724 
4725   // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
4726   address generate_pshuffle_byte_flip_mask_sha512() {
4727     __ align32();
4728     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
4729     address start = __ pc();
4730     if (VM_Version::supports_avx2()) {
4731       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
4732       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4733       __ emit_data64(0x1011121314151617, relocInfo::none);
4734       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
4735       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
4736       __ emit_data64(0x0000000000000000, relocInfo::none);
4737       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4738       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4739     }
4740 
4741     return start;
4742   }
4743 
4744   // ofs and limit are used for multi-block byte array.
4745   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4746   address generate_sha256_implCompress(bool multi_block, const char *name) {
4747     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
4748     __ align(CodeEntryAlignment);
4749     StubCodeMark mark(this, "StubRoutines", name);
4750     address start = __ pc();
4751 
4752     Register buf = c_rarg0;
4753     Register state = c_rarg1;
4754     Register ofs = c_rarg2;
4755     Register limit = c_rarg3;
4756 
4757     const XMMRegister msg = xmm0;
4758     const XMMRegister state0 = xmm1;
4759     const XMMRegister state1 = xmm2;
4760     const XMMRegister msgtmp0 = xmm3;
4761 
4762     const XMMRegister msgtmp1 = xmm4;
4763     const XMMRegister msgtmp2 = xmm5;
4764     const XMMRegister msgtmp3 = xmm6;
4765     const XMMRegister msgtmp4 = xmm7;
4766 
4767     const XMMRegister shuf_mask = xmm8;
4768 
4769     __ enter();
4770 
4771     __ subptr(rsp, 4 * wordSize);
4772 
4773     if (VM_Version::supports_sha()) {
4774       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4775         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4776     } else if (VM_Version::supports_avx2()) {
4777       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4778         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4779     }
4780     __ addptr(rsp, 4 * wordSize);
4781     __ vzeroupper();
4782     __ leave();
4783     __ ret(0);
4784     return start;
4785   }
4786 
4787   address generate_sha512_implCompress(bool multi_block, const char *name) {
4788     assert(VM_Version::supports_avx2(), "");
4789     assert(VM_Version::supports_bmi2(), "");
4790     __ align(CodeEntryAlignment);
4791     StubCodeMark mark(this, "StubRoutines", name);
4792     address start = __ pc();
4793 
4794     Register buf = c_rarg0;
4795     Register state = c_rarg1;
4796     Register ofs = c_rarg2;
4797     Register limit = c_rarg3;
4798 
4799     const XMMRegister msg = xmm0;
4800     const XMMRegister state0 = xmm1;
4801     const XMMRegister state1 = xmm2;
4802     const XMMRegister msgtmp0 = xmm3;
4803     const XMMRegister msgtmp1 = xmm4;
4804     const XMMRegister msgtmp2 = xmm5;
4805     const XMMRegister msgtmp3 = xmm6;
4806     const XMMRegister msgtmp4 = xmm7;
4807 
4808     const XMMRegister shuf_mask = xmm8;
4809 
4810     __ enter();
4811 
4812     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4813     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4814 
4815     __ vzeroupper();
4816     __ leave();
4817     __ ret(0);
4818     return start;
4819   }
4820 
4821   address ghash_polynomial512_addr() {
4822     __ align(CodeEntryAlignment);
4823     StubCodeMark mark(this, "StubRoutines", "_ghash_poly512_addr");
4824     address start = __ pc();
4825     __ emit_data64(0x00000001C2000000, relocInfo::none); // POLY for reduction
4826     __ emit_data64(0xC200000000000000, relocInfo::none);
4827     __ emit_data64(0x00000001C2000000, relocInfo::none);
4828     __ emit_data64(0xC200000000000000, relocInfo::none);
4829     __ emit_data64(0x00000001C2000000, relocInfo::none);
4830     __ emit_data64(0xC200000000000000, relocInfo::none);
4831     __ emit_data64(0x00000001C2000000, relocInfo::none);
4832     __ emit_data64(0xC200000000000000, relocInfo::none);
4833     __ emit_data64(0x0000000000000001, relocInfo::none); // POLY
4834     __ emit_data64(0xC200000000000000, relocInfo::none);
4835     __ emit_data64(0x0000000000000001, relocInfo::none); // TWOONE
4836     __ emit_data64(0x0000000100000000, relocInfo::none);
4837     return start;
4838   }
4839 
4840   // Vector AES Galois Counter Mode implementation. Parameters:
4841   // Windows regs            |  Linux regs
4842   // in = c_rarg0 (rcx)      |  c_rarg0 (rdi)
4843   // len = c_rarg1 (rdx)     |  c_rarg1 (rsi)
4844   // ct = c_rarg2 (r8)       |  c_rarg2 (rdx)
4845   // out = c_rarg3 (r9)      |  c_rarg3 (rcx)
4846   // key = r10               |  c_rarg4 (r8)
4847   // state = r13             |  c_rarg5 (r9)
4848   // subkeyHtbl = r14        |  r11
4849   // counter = rsi           |  r12
4850   // return - number of processed bytes
4851   address generate_galoisCounterMode_AESCrypt() {
4852     __ align(CodeEntryAlignment);
4853     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
4854     address start = __ pc();
4855     const Register in = c_rarg0;
4856     const Register len = c_rarg1;
4857     const Register ct = c_rarg2;
4858     const Register out = c_rarg3;
4859     // (the counter, set up below, is written back with its incremented value at the end)
4860 #ifndef _WIN64
4861     const Register key = c_rarg4;
4862     const Register state = c_rarg5;
4863     const Address subkeyH_mem(rbp, 2 * wordSize);
4864     const Register subkeyHtbl = r11;
4865     const Address avx512_subkeyH_mem(rbp, 3 * wordSize);
4866     const Register avx512_subkeyHtbl = r13;
4867     const Address counter_mem(rbp, 4 * wordSize);
4868     const Register counter = r12;
4869 #else
4870     const Address key_mem(rbp, 6 * wordSize);
4871     const Register key = r10;
4872     const Address state_mem(rbp, 7 * wordSize);
4873     const Register state = r13;
4874     const Address subkeyH_mem(rbp, 8 * wordSize);
4875     const Register subkeyHtbl = r14;
4876     const Address avx512_subkeyH_mem(rbp, 9 * wordSize);
4877     const Register avx512_subkeyHtbl = r12;
4878     const Address counter_mem(rbp, 10 * wordSize);
4879     const Register counter = rsi;
4880 #endif
4881     __ enter();
4882    // Save state before entering routine
4883     __ push(r12);
4884     __ push(r13);
4885     __ push(r14);
4886     __ push(r15);
4887     __ push(rbx);
4888 #ifdef _WIN64
4889     // on win64, save rsi and load key and state from their stack slots
4890     __ push(rsi);
4891     __ movptr(key, key_mem);
4892     __ movptr(state, state_mem);
4893 #endif
4894     __ movptr(subkeyHtbl, subkeyH_mem);
4895     __ movptr(avx512_subkeyHtbl, avx512_subkeyH_mem);
4896     __ movptr(counter, counter_mem);
4897 
4898     __ aesgcm_encrypt(in, len, ct, out, key, state, subkeyHtbl, avx512_subkeyHtbl, counter);
4899 
4900     // Restore state before leaving routine
4901 #ifdef _WIN64
4902     __ pop(rsi);
4903 #endif
4904     __ pop(rbx);
4905     __ pop(r15);
4906     __ pop(r14);
4907     __ pop(r13);
4908     __ pop(r12);
4909 
4910     __ leave(); // required for proper stackwalking of RuntimeStub frame
4911     __ ret(0);
4912     return start;
4913   }
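  // Descriptive note: AES-GCM combines CTR-mode encryption with GHASH
  // authentication. aesgcm_encrypt() both encrypts 'in' into 'out' using the
  // incrementing counter and folds the resulting ciphertext into 'state' via
  // carry-less multiplications against the precomputed subkeyHtbl powers of H,
  // returning the number of processed bytes in rax.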
4914 
4915   // This mask is used for incrementing the counter value (linc0, linc4, etc.)
4916   address counter_mask_addr() {
4917     __ align64();
4918     StubCodeMark mark(this, "StubRoutines", "counter_mask_addr");
4919     address start = __ pc();
4920     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);//lbswapmask
4921     __ emit_data64(0x0001020304050607, relocInfo::none);
4922     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4923     __ emit_data64(0x0001020304050607, relocInfo::none);
4924     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4925     __ emit_data64(0x0001020304050607, relocInfo::none);
4926     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4927     __ emit_data64(0x0001020304050607, relocInfo::none);
4928     __ emit_data64(0x0000000000000000, relocInfo::none);//linc0 = counter_mask_addr+64
4929     __ emit_data64(0x0000000000000000, relocInfo::none);
4930     __ emit_data64(0x0000000000000001, relocInfo::none);//counter_mask_addr() + 80
4931     __ emit_data64(0x0000000000000000, relocInfo::none);
4932     __ emit_data64(0x0000000000000002, relocInfo::none);
4933     __ emit_data64(0x0000000000000000, relocInfo::none);
4934     __ emit_data64(0x0000000000000003, relocInfo::none);
4935     __ emit_data64(0x0000000000000000, relocInfo::none);
4936     __ emit_data64(0x0000000000000004, relocInfo::none);//linc4 = counter_mask_addr() + 128
4937     __ emit_data64(0x0000000000000000, relocInfo::none);
4938     __ emit_data64(0x0000000000000004, relocInfo::none);
4939     __ emit_data64(0x0000000000000000, relocInfo::none);
4940     __ emit_data64(0x0000000000000004, relocInfo::none);
4941     __ emit_data64(0x0000000000000000, relocInfo::none);
4942     __ emit_data64(0x0000000000000004, relocInfo::none);
4943     __ emit_data64(0x0000000000000000, relocInfo::none);
4944     __ emit_data64(0x0000000000000008, relocInfo::none);//linc8 = counter_mask_addr() + 192
4945     __ emit_data64(0x0000000000000000, relocInfo::none);
4946     __ emit_data64(0x0000000000000008, relocInfo::none);
4947     __ emit_data64(0x0000000000000000, relocInfo::none);
4948     __ emit_data64(0x0000000000000008, relocInfo::none);
4949     __ emit_data64(0x0000000000000000, relocInfo::none);
4950     __ emit_data64(0x0000000000000008, relocInfo::none);
4951     __ emit_data64(0x0000000000000000, relocInfo::none);
4952     __ emit_data64(0x0000000000000020, relocInfo::none);//linc32 = counter_mask_addr() + 256
4953     __ emit_data64(0x0000000000000000, relocInfo::none);
4954     __ emit_data64(0x0000000000000020, relocInfo::none);
4955     __ emit_data64(0x0000000000000000, relocInfo::none);
4956     __ emit_data64(0x0000000000000020, relocInfo::none);
4957     __ emit_data64(0x0000000000000000, relocInfo::none);
4958     __ emit_data64(0x0000000000000020, relocInfo::none);
4959     __ emit_data64(0x0000000000000000, relocInfo::none);
4960     __ emit_data64(0x0000000000000010, relocInfo::none);//linc16 = counter_mask_addr() + 320
4961     __ emit_data64(0x0000000000000000, relocInfo::none);
4962     __ emit_data64(0x0000000000000010, relocInfo::none);
4963     __ emit_data64(0x0000000000000000, relocInfo::none);
4964     __ emit_data64(0x0000000000000010, relocInfo::none);
4965     __ emit_data64(0x0000000000000000, relocInfo::none);
4966     __ emit_data64(0x0000000000000010, relocInfo::none);
4967     __ emit_data64(0x0000000000000000, relocInfo::none);
4968     return start;
4969   }
4970 
4971  // Vector AES Counter implementation
4972   address generate_counterMode_VectorAESCrypt()  {
4973     __ align(CodeEntryAlignment);
4974     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4975     address start = __ pc();
4976     const Register from = c_rarg0; // source array address
4977     const Register to = c_rarg1; // destination array address
4978     const Register key = c_rarg2; // key array address r8
4979     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4980     // and updated with the incremented counter in the end
4981 #ifndef _WIN64
4982     const Register len_reg = c_rarg4;
4983     const Register saved_encCounter_start = c_rarg5;
4984     const Register used_addr = r10;
4985     const Address  used_mem(rbp, 2 * wordSize);
4986     const Register used = r11;
4987 #else
4988     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4989     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
4990     const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4991     const Register len_reg = r10; // pick the first volatile windows register
4992     const Register saved_encCounter_start = r11;
4993     const Register used_addr = r13;
4994     const Register used = r14;
4995 #endif
4996     __ enter();
4997    // Save state before entering routine
4998     __ push(r12);
4999     __ push(r13);
5000     __ push(r14);
5001     __ push(r15);
5002 #ifdef _WIN64
5003     // on win64, fill len_reg from stack position
5004     __ movl(len_reg, len_mem);
5005     __ movptr(saved_encCounter_start, saved_encCounter_mem);
5006     __ movptr(used_addr, used_mem);
5007     __ movl(used, Address(used_addr, 0));
5008 #else
5009     __ push(len_reg); // Save
5010     __ movptr(used_addr, used_mem);
5011     __ movl(used, Address(used_addr, 0));
5012 #endif
5013     __ push(rbx);
5014     __ aesctr_encrypt(from, to, key, counter, len_reg, used, used_addr, saved_encCounter_start);
5015     // Restore state before leaving routine
5016     __ pop(rbx);
5017 #ifdef _WIN64
5018     __ movl(rax, len_mem); // return length
5019 #else
5020     __ pop(rax); // return length
5021 #endif
5022     __ pop(r15);
5023     __ pop(r14);
5024     __ pop(r13);
5025     __ pop(r12);
5026 
5027     __ leave(); // required for proper stackwalking of RuntimeStub frame
5028     __ ret(0);
5029     return start;
5030   }
5031 
5032   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
5033   // to hide instruction latency
5034   //
5035   // Arguments:
5036   //
5037   // Inputs:
5038   //   c_rarg0   - source byte array address
5039   //   c_rarg1   - destination byte array address
5040   //   c_rarg2   - K (key) in little endian int array
5041   //   c_rarg3   - counter vector byte array address
5042   //   Linux
5043   //     c_rarg4   -          input length
5044   //     c_rarg5   -          saved encryptedCounter start
5045   //     rbp + 6 * wordSize - saved used length
5046   //   Windows
5047   //     rbp + 6 * wordSize - input length
5048   //     rbp + 7 * wordSize - saved encryptedCounter start
5049   //     rbp + 8 * wordSize - saved used length
5050   //
5051   // Output:
5052   //   rax       - input length
5053   //
5054   address generate_counterMode_AESCrypt_Parallel() {
5055     assert(UseAES, "need AES instructions and misaligned SSE support");
5056     __ align(CodeEntryAlignment);
5057     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
5058     address start = __ pc();
5059     const Register from = c_rarg0; // source array address
5060     const Register to = c_rarg1; // destination array address
5061     const Register key = c_rarg2; // key array address
5062     const Register counter = c_rarg3; // counter byte array initialized from counter array address
5063                                       // and updated with the incremented counter in the end
5064 #ifndef _WIN64
5065     const Register len_reg = c_rarg4;
5066     const Register saved_encCounter_start = c_rarg5;
5067     const Register used_addr = r10;
5068     const Address  used_mem(rbp, 2 * wordSize);
5069     const Register used = r11;
5070 #else
5071     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
5072     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
5073     const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
5074     const Register len_reg = r10; // pick the first volatile windows register
5075     const Register saved_encCounter_start = r11;
5076     const Register used_addr = r13;
5077     const Register used = r14;
5078 #endif
5079     const Register pos = rax;
5080 
5081     const int PARALLEL_FACTOR = 6;
5082     const XMMRegister xmm_counter_shuf_mask = xmm0;
5083     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
5084     const XMMRegister xmm_curr_counter = xmm2;
5085 
5086     const XMMRegister xmm_key_tmp0 = xmm3;
5087     const XMMRegister xmm_key_tmp1 = xmm4;
5088 
5089     // registers holding the four results in the parallelized loop
5090     const XMMRegister xmm_result0 = xmm5;
5091     const XMMRegister xmm_result1 = xmm6;
5092     const XMMRegister xmm_result2 = xmm7;
5093     const XMMRegister xmm_result3 = xmm8;
5094     const XMMRegister xmm_result4 = xmm9;
5095     const XMMRegister xmm_result5 = xmm10;
5096 
5097     const XMMRegister xmm_from0 = xmm11;
5098     const XMMRegister xmm_from1 = xmm12;
5099     const XMMRegister xmm_from2 = xmm13;
5100     const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on WIN64
5101     const XMMRegister xmm_from4 = xmm3;  // reuse xmm3-xmm4, because xmm_key_tmp0/1 are no longer needed once the input text is loaded
5102     const XMMRegister xmm_from5 = xmm4;
5103 
5104     //for key_128, key_192, key_256
5105     const int rounds[3] = {10, 12, 14};
5106     Label L_exit_preLoop, L_preLoop_start;
5107     Label L_multiBlock_loopTop[3];
5108     Label L_singleBlockLoopTop[3];
5109     Label L__incCounter[3][6]; //for 6 blocks
5110     Label L__incCounter_single[3]; //for single block, key128, key192, key256
5111     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
5112     Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
5113 
5114     Label L_exit;
5115 
5116     __ enter(); // required for proper stackwalking of RuntimeStub frame
5117 
5118 #ifdef _WIN64
5119     // allocate spill slots for r13, r14
5120     enum {
5121         saved_r13_offset,
5122         saved_r14_offset
5123     };
5124     __ subptr(rsp, 2 * wordSize);
5125     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
5126     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
5127 
5128     // on win64, fill len_reg from stack position
5129     __ movl(len_reg, len_mem);
5130     __ movptr(saved_encCounter_start, saved_encCounter_mem);
5131     __ movptr(used_addr, used_mem);
5132     __ movl(used, Address(used_addr, 0));
5133 #else
5134     __ push(len_reg); // Save
5135     __ movptr(used_addr, used_mem);
5136     __ movl(used, Address(used_addr, 0));
5137 #endif
5138 
5139     __ push(rbx); // Save RBX
5140     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
5141     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
5142     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
5143     __ movptr(pos, 0);
5144 
5145     // Use the partially used encrypted counter from the last invocation
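    // (The previous call may have consumed only part of its last encrypted
    // counter block: saved_encCounter_start points at that keystream block and
    // 'used' counts how many of its bytes were already consumed, so the loop
    // below first xors input bytes against the leftover keystream.)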
5146     __ BIND(L_preLoop_start);
5147     __ cmpptr(used, 16);
5148     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
5149       __ cmpptr(len_reg, 0);
5150       __ jcc(Assembler::lessEqual, L_exit_preLoop);
5151       __ movb(rbx, Address(saved_encCounter_start, used));
5152       __ xorb(rbx, Address(from, pos));
5153       __ movb(Address(to, pos), rbx);
5154       __ addptr(pos, 1);
5155       __ addptr(used, 1);
5156       __ subptr(len_reg, 1);
5157 
5158     __ jmp(L_preLoop_start);
5159 
5160     __ BIND(L_exit_preLoop);
5161     __ movl(Address(used_addr, 0), used);
5162 
5163     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
5164     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
5165     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
5166     __ cmpl(rbx, 52);
5167     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
5168     __ cmpl(rbx, 60);
5169     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
5170 
5171 #define CTR_DoSix(opc, src_reg)                \
5172     __ opc(xmm_result0, src_reg);              \
5173     __ opc(xmm_result1, src_reg);              \
5174     __ opc(xmm_result2, src_reg);              \
5175     __ opc(xmm_result3, src_reg);              \
5176     __ opc(xmm_result4, src_reg);              \
5177     __ opc(xmm_result5, src_reg);
5178 
5179     // k == 0 :  generate code for key_128
5180     // k == 1 :  generate code for key_192
5181     // k == 2 :  generate code for key_256
5182     for (int k = 0; k < 3; ++k) {
5183       //multi blocks starts here
5184       __ align(OptoLoopAlignment);
5185       __ BIND(L_multiBlock_loopTop[k]);
5186       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
5187       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
5188       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
5189 
5190       //load, then increase counters
5191       CTR_DoSix(movdqa, xmm_curr_counter);
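      // Advance the counter by 1..6 for the six parallel blocks; each L__incCounter
      // label is used by inc_counter to handle a carry out of the low 64 bits.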
5192       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
5193       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
5194       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
5195       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
5196       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
5197       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
5198       CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle the counters back into byte order for PXOR
5199       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
5200 
5201       //load two ROUND_KEYs at a time
5202       for (int i = 1; i < rounds[k]; ) {
5203         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
5204         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
5205         CTR_DoSix(aesenc, xmm_key_tmp1);
5206         i++;
5207         if (i != rounds[k]) {
5208           CTR_DoSix(aesenc, xmm_key_tmp0);
5209         } else {
5210           CTR_DoSix(aesenclast, xmm_key_tmp0);
5211         }
5212         i++;
5213       }
5214 
5215       // load the next PARALLEL_FACTOR blocks of input into the xmm_from registers
5216       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
5217       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
5218       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
5219       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
5220       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
5221       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
5222 
5223       __ pxor(xmm_result0, xmm_from0);
5224       __ pxor(xmm_result1, xmm_from1);
5225       __ pxor(xmm_result2, xmm_from2);
5226       __ pxor(xmm_result3, xmm_from3);
5227       __ pxor(xmm_result4, xmm_from4);
5228       __ pxor(xmm_result5, xmm_from5);
5229 
5230       // store 6 results into the next 96 bytes of output
5231       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
5232       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
5233       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
5234       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
5235       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
5236       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
5237 
5238       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance position past the blocks just processed
5239       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
5240       __ jmp(L_multiBlock_loopTop[k]);
5241 
5242       // singleBlock starts here
5243       __ align(OptoLoopAlignment);
5244       __ BIND(L_singleBlockLoopTop[k]);
5245       __ cmpptr(len_reg, 0);
5246       __ jcc(Assembler::lessEqual, L_exit);
5247       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
5248       __ movdqa(xmm_result0, xmm_curr_counter);
5249       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
5250       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
5251       __ pxor(xmm_result0, xmm_key_tmp0);
5252       for (int i = 1; i < rounds[k]; i++) {
5253         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
5254         __ aesenc(xmm_result0, xmm_key_tmp0);
5255       }
5256       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
5257       __ aesenclast(xmm_result0, xmm_key_tmp0);
5258       __ cmpptr(len_reg, AESBlockSize);
5259       __ jcc(Assembler::less, L_processTail_insr[k]);
5260         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
5261         __ pxor(xmm_result0, xmm_from0);
5262         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
5263         __ addptr(pos, AESBlockSize);
5264         __ subptr(len_reg, AESBlockSize);
5265         __ jmp(L_singleBlockLoopTop[k]);
5266       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
5267         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
5268         __ testptr(len_reg, 8);
5269         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
5270           __ subptr(pos,8);
5271           __ pinsrq(xmm_from0, Address(from, pos), 0);
5272         __ BIND(L_processTail_4_insr[k]);
5273         __ testptr(len_reg, 4);
5274         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
5275           __ subptr(pos,4);
5276           __ pslldq(xmm_from0, 4);
5277           __ pinsrd(xmm_from0, Address(from, pos), 0);
5278         __ BIND(L_processTail_2_insr[k]);
5279         __ testptr(len_reg, 2);
5280         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
5281           __ subptr(pos, 2);
5282           __ pslldq(xmm_from0, 2);
5283           __ pinsrw(xmm_from0, Address(from, pos), 0);
5284         __ BIND(L_processTail_1_insr[k]);
5285         __ testptr(len_reg, 1);
5286         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
5287           __ subptr(pos, 1);
5288           __ pslldq(xmm_from0, 1);
5289           __ pinsrb(xmm_from0, Address(from, pos), 0);
5290         __ BIND(L_processTail_exit_insr[k]);
5291 
5292         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Perform pxor of the encrypted counter and plaintext Bytes.
5293         __ pxor(xmm_result0, xmm_from0);                             //    Also the encrypted counter is saved for next invocation.
5294 
5295         __ testptr(len_reg, 8);
5296         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
5297           __ pextrq(Address(to, pos), xmm_result0, 0);
5298           __ psrldq(xmm_result0, 8);
5299           __ addptr(pos, 8);
5300         __ BIND(L_processTail_4_extr[k]);
5301         __ testptr(len_reg, 4);
5302         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
5303           __ pextrd(Address(to, pos), xmm_result0, 0);
5304           __ psrldq(xmm_result0, 4);
5305           __ addptr(pos, 4);
5306         __ BIND(L_processTail_2_extr[k]);
5307         __ testptr(len_reg, 2);
5308         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
5309           __ pextrw(Address(to, pos), xmm_result0, 0);
5310           __ psrldq(xmm_result0, 2);
5311           __ addptr(pos, 2);
5312         __ BIND(L_processTail_1_extr[k]);
5313         __ testptr(len_reg, 1);
5314         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
5315           __ pextrb(Address(to, pos), xmm_result0, 0);
5316 
5317         __ BIND(L_processTail_exit_extr[k]);
5318         __ movl(Address(used_addr, 0), len_reg);
5319         __ jmp(L_exit);
5320 
5321     }
5322 
5323     __ BIND(L_exit);
5324     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
5325     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
5326     __ pop(rbx); // pop the saved RBX.
5327 #ifdef _WIN64
5328     __ movl(rax, len_mem);
5329     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
5330     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
5331     __ addptr(rsp, 2 * wordSize);
5332 #else
5333     __ pop(rax); // return 'len'
5334 #endif
5335     __ leave(); // required for proper stackwalking of RuntimeStub frame
5336     __ ret(0);
5337     return start;
5338   }
5339 
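// Helpers for the AVX-512 CBC decrypt stub below: apply one AES decryption round
// (or the final round) to all eight 512-bit ciphertext groups held in xmm1..xmm8,
// i.e. 32 AES blocks per iteration of the main loop.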
5340 void roundDec(XMMRegister xmm_reg) {
5341   __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
5342   __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
5343   __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
5344   __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
5345   __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
5346   __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
5347   __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
5348   __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
5349 }
5350 
5351 void roundDeclast(XMMRegister xmm_reg) {
5352   __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
5353   __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
5354   __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
5355   __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
5356   __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
5357   __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
5358   __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
5359   __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
5360 }
5361 
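  // Load a 128-bit round key from the expanded key array, byte-shuffle it into the
  // order the AES instructions expect, and broadcast it to all four 128-bit lanes
  // of the 512-bit destination register.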
5362   void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
5363     __ movdqu(xmmdst, Address(key, offset));
5364     if (xmm_shuf_mask != NULL) {
5365       __ pshufb(xmmdst, xmm_shuf_mask);
5366     } else {
5367       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
5368     }
5369     __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
5370 
5371   }
5372 
5373 address generate_cipherBlockChaining_decryptVectorAESCrypt() {
5374     assert(VM_Version::supports_avx512_vaes(), "need AVX512 VAES instructions");
5375     __ align(CodeEntryAlignment);
5376     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
5377     address start = __ pc();
5378 
5379     const Register from = c_rarg0;  // source array address
5380     const Register to = c_rarg1;  // destination array address
5381     const Register key = c_rarg2;  // key array address
5382     const Register rvec = c_rarg3;  // r byte array initialized from initvector array address
5383     // and left with the results of the last encryption block
5384 #ifndef _WIN64
5385     const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
5386 #else
5387     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
5388     const Register len_reg = r11;      // pick the volatile windows register
5389 #endif
5390 
5391     Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
5392           Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
5393 
5394     __ enter();
5395 
5396 #ifdef _WIN64
5397     // on win64, fill len_reg from stack position
5398     __ movl(len_reg, len_mem);
5399 #else
5400     __ push(len_reg); // Save
5401 #endif
5402     __ push(rbx);
5403     __ vzeroupper();
5404 
5405     // Temporary variable declaration for swapping key bytes
5406     const XMMRegister xmm_key_shuf_mask = xmm1;
5407     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
5408 
5409     // Calculate number of rounds from key size: 44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
5410     const Register rounds = rbx;
5411     __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
5412 
5413     const XMMRegister IV = xmm0;
5414     // Load IV and broadcast value to 512-bits
5415     __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
5416 
5417     // Temporary variables for storing round keys
5418     const XMMRegister RK0 = xmm30;
5419     const XMMRegister RK1 = xmm9;
5420     const XMMRegister RK2 = xmm18;
5421     const XMMRegister RK3 = xmm19;
5422     const XMMRegister RK4 = xmm20;
5423     const XMMRegister RK5 = xmm21;
5424     const XMMRegister RK6 = xmm22;
5425     const XMMRegister RK7 = xmm23;
5426     const XMMRegister RK8 = xmm24;
5427     const XMMRegister RK9 = xmm25;
5428     const XMMRegister RK10 = xmm26;
5429 
5430     // Load and shuffle the round keys.
5431     // The Java expanded key ordering is rotated one position from what we want,
5432     // so we start from 1*16 here and hit 0*16 last.
5433     ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
5434     ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
5435     ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
5436     ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
5437     ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
5438     ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
5439     ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
5440     ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
5441     ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
5442     ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
5443     ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
5444 
5445     // Variables for storing source cipher text
5446     const XMMRegister S0 = xmm10;
5447     const XMMRegister S1 = xmm11;
5448     const XMMRegister S2 = xmm12;
5449     const XMMRegister S3 = xmm13;
5450     const XMMRegister S4 = xmm14;
5451     const XMMRegister S5 = xmm15;
5452     const XMMRegister S6 = xmm16;
5453     const XMMRegister S7 = xmm17;
5454 
5455     // Variables for storing decrypted text
5456     const XMMRegister B0 = xmm1;
5457     const XMMRegister B1 = xmm2;
5458     const XMMRegister B2 = xmm3;
5459     const XMMRegister B3 = xmm4;
5460     const XMMRegister B4 = xmm5;
5461     const XMMRegister B5 = xmm6;
5462     const XMMRegister B6 = xmm7;
5463     const XMMRegister B7 = xmm8;
5464 
5465     __ cmpl(rounds, 44);
5466     __ jcc(Assembler::greater, KEY_192);
5467     __ jmp(Loop);
5468 
5469     __ BIND(KEY_192);
5470     const XMMRegister RK11 = xmm27;
5471     const XMMRegister RK12 = xmm28;
5472     ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
5473     ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
5474 
5475     __ cmpl(rounds, 52);
5476     __ jcc(Assembler::greater, KEY_256);
5477     __ jmp(Loop);
5478 
5479     __ BIND(KEY_256);
5480     const XMMRegister RK13 = xmm29;
5481     const XMMRegister RK14 = xmm31;
5482     ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
5483     ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
5484 
5485     __ BIND(Loop);
5486     __ cmpl(len_reg, 512);
5487     __ jcc(Assembler::below, Lcbc_dec_rem);
5488     __ BIND(Loop1);
5489     __ subl(len_reg, 512);
5490     __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
5491     __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
5492     __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
5493     __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
5494     __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
5495     __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
5496     __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
5497     __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
5498     __ leaq(from, Address(from, 8 * 64));
5499 
5500     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5501     __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
5502     __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
5503     __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
5504     __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
5505     __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
5506     __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
5507     __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
5508 
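    // Build the CBC chaining values: after these shifts IV and S0..S6 hold, lane by
    // lane, the ciphertext block that precedes the corresponding block decrypted in
    // B0..B7 (with the previous IV feeding the very first block).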
5509     __ evalignq(IV, S0, IV, 0x06);
5510     __ evalignq(S0, S1, S0, 0x06);
5511     __ evalignq(S1, S2, S1, 0x06);
5512     __ evalignq(S2, S3, S2, 0x06);
5513     __ evalignq(S3, S4, S3, 0x06);
5514     __ evalignq(S4, S5, S4, 0x06);
5515     __ evalignq(S5, S6, S5, 0x06);
5516     __ evalignq(S6, S7, S6, 0x06);
5517 
5518     roundDec(RK2);
5519     roundDec(RK3);
5520     roundDec(RK4);
5521     roundDec(RK5);
5522     roundDec(RK6);
5523     roundDec(RK7);
5524     roundDec(RK8);
5525     roundDec(RK9);
5526     roundDec(RK10);
5527 
5528     __ cmpl(rounds, 44);
5529     __ jcc(Assembler::belowEqual, L_128);
5530     roundDec(RK11);
5531     roundDec(RK12);
5532 
5533     __ cmpl(rounds, 52);
5534     __ jcc(Assembler::belowEqual, L_192);
5535     roundDec(RK13);
5536     roundDec(RK14);
5537 
5538     __ BIND(L_256);
5539     roundDeclast(RK0);
5540     __ jmp(Loop2);
5541 
5542     __ BIND(L_128);
5543     roundDeclast(RK0);
5544     __ jmp(Loop2);
5545 
5546     __ BIND(L_192);
5547     roundDeclast(RK0);
5548 
5549     __ BIND(Loop2);
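    // XOR each decrypted group with the preceding ciphertext blocks (CBC chaining),
    // then carry the last ciphertext group forward as the IV source for the next pass.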
5550     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5551     __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
5552     __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
5553     __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
5554     __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
5555     __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
5556     __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
5557     __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
5558     __ evmovdquq(IV, S7, Assembler::AVX_512bit);
5559 
5560     __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
5561     __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
5562     __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
5563     __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
5564     __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
5565     __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
5566     __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
5567     __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
5568     __ leaq(to, Address(to, 8 * 64));
5569     __ jmp(Loop);
5570 
5571     __ BIND(Lcbc_dec_rem);
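    // Remainder path: bring the most recent ciphertext block into the low 128 bits
    // of IV so it can serve as the chaining value for the remaining single blocks.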
5572     __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
5573 
5574     __ BIND(Lcbc_dec_rem_loop);
5575     __ subl(len_reg, 16);
5576     __ jcc(Assembler::carrySet, Lcbc_dec_ret);
5577 
5578     __ movdqu(S0, Address(from, 0));
5579     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5580     __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
5581     __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
5582     __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
5583     __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
5584     __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
5585     __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
5586     __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
5587     __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
5588     __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
5589     __ cmpl(rounds, 44);
5590     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5591 
5592     __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
5593     __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
5594     __ cmpl(rounds, 52);
5595     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5596 
5597     __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
5598     __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
5599 
5600     __ BIND(Lcbc_dec_rem_last);
5601     __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
5602 
5603     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5604     __ evmovdquq(IV, S0, Assembler::AVX_512bit);
5605     __ movdqu(Address(to, 0), B0);
5606     __ leaq(from, Address(from, 16));
5607     __ leaq(to, Address(to, 16));
5608     __ jmp(Lcbc_dec_rem_loop);
5609 
5610     __ BIND(Lcbc_dec_ret);
5611     __ movdqu(Address(rvec, 0), IV);
5612 
5613     // Zero out the round keys
5614     __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
5615     __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
5616     __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
5617     __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
5618     __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
5619     __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
5620     __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
5621     __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
5622     __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
5623     __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
5624     __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
5625     __ cmpl(rounds, 44);
5626     __ jcc(Assembler::belowEqual, Lcbc_exit);
5627     __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
5628     __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
5629     __ cmpl(rounds, 52);
5630     __ jcc(Assembler::belowEqual, Lcbc_exit);
5631     __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
5632     __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
5633 
5634     __ BIND(Lcbc_exit);
5635     __ pop(rbx);
5636 #ifdef _WIN64
5637     __ movl(rax, len_mem);
5638 #else
5639     __ pop(rax); // return length
5640 #endif
5641     __ leave(); // required for proper stackwalking of RuntimeStub frame
5642     __ ret(0);
5643     return start;
5644 }
5645 
5646 // Polynomial x^128+x^127+x^126+x^121+1
5647 address ghash_polynomial_addr() {
5648     __ align(CodeEntryAlignment);
5649     StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
5650     address start = __ pc();
5651     __ emit_data64(0x0000000000000001, relocInfo::none);
5652     __ emit_data64(0xc200000000000000, relocInfo::none);
5653     return start;
5654 }
5655 
5656 address ghash_shufflemask_addr() {
5657     __ align(CodeEntryAlignment);
5658     StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
5659     address start = __ pc();
5660     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5661     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5662     return start;
5663 }
5664 
5665 // Ghash single and multi block operations using AVX instructions
5666 address generate_avx_ghash_processBlocks() {
5667     __ align(CodeEntryAlignment);
5668 
5669     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5670     address start = __ pc();
5671 
5672     // arguments
5673     const Register state = c_rarg0;
5674     const Register htbl = c_rarg1;
5675     const Register data = c_rarg2;
5676     const Register blocks = c_rarg3;
5677     __ enter();
5678     // avx_ghash performs the single- and multi-block GHASH computation
5679     __ avx_ghash(state, htbl, data, blocks);
5680     __ leave(); // required for proper stackwalking of RuntimeStub frame
5681     __ ret(0);
5682     return start;
5683 }
5684 
5685   // byte swap x86 long
5686   address generate_ghash_long_swap_mask() {
5687     __ align(CodeEntryAlignment);
5688     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
5689     address start = __ pc();
5690     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
5691     __ emit_data64(0x0706050403020100, relocInfo::none );
5692   return start;
5693   }
5694 
5695   // byte swap x86 byte array
5696   address generate_ghash_byte_swap_mask() {
5697     __ align(CodeEntryAlignment);
5698     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
5699     address start = __ pc();
5700     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
5701     __ emit_data64(0x0001020304050607, relocInfo::none );
5702   return start;
5703   }
5704 
5705   /* Single and multi-block ghash operations */
5706   address generate_ghash_processBlocks() {
5707     __ align(CodeEntryAlignment);
5708     Label L_ghash_loop, L_exit;
5709     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5710     address start = __ pc();
5711 
5712     const Register state        = c_rarg0;
5713     const Register subkeyH      = c_rarg1;
5714     const Register data         = c_rarg2;
5715     const Register blocks       = c_rarg3;
5716 
5717     const XMMRegister xmm_temp0 = xmm0;
5718     const XMMRegister xmm_temp1 = xmm1;
5719     const XMMRegister xmm_temp2 = xmm2;
5720     const XMMRegister xmm_temp3 = xmm3;
5721     const XMMRegister xmm_temp4 = xmm4;
5722     const XMMRegister xmm_temp5 = xmm5;
5723     const XMMRegister xmm_temp6 = xmm6;
5724     const XMMRegister xmm_temp7 = xmm7;
5725     const XMMRegister xmm_temp8 = xmm8;
5726     const XMMRegister xmm_temp9 = xmm9;
5727     const XMMRegister xmm_temp10 = xmm10;
5728 
5729     __ enter();
5730 
5731     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
5732 
5733     __ movdqu(xmm_temp0, Address(state, 0));
5734     __ pshufb(xmm_temp0, xmm_temp10);
5735 
5736 
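    // Each iteration computes state = (state ^ data[i]) * H in GF(2^128), using four
    // pclmulqdq carry-less multiplications followed by reduction modulo the GHASH
    // polynomial x^128 + x^127 + x^126 + x^121 + 1.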
5737     __ BIND(L_ghash_loop);
5738     __ movdqu(xmm_temp2, Address(data, 0));
5739     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
5740 
5741     __ movdqu(xmm_temp1, Address(subkeyH, 0));
5742     __ pshufb(xmm_temp1, xmm_temp10);
5743 
5744     __ pxor(xmm_temp0, xmm_temp2);
5745 
5746     //
5747     // Multiply with the hash key
5748     //
5749     __ movdqu(xmm_temp3, xmm_temp0);
5750     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
5751     __ movdqu(xmm_temp4, xmm_temp0);
5752     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
5753 
5754     __ movdqu(xmm_temp5, xmm_temp0);
5755     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
5756     __ movdqu(xmm_temp6, xmm_temp0);
5757     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
5758 
5759     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
5760 
5761     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
5762     __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
5763     __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
5764     __ pxor(xmm_temp3, xmm_temp5);
5765     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
5766                                         // of the carry-less multiplication of
5767                                         // xmm0 by xmm1.
5768 
5769     // We shift the result of the multiplication by one bit position
5770     // to the left to compensate for the fact that the bits are reversed.
5771     __ movdqu(xmm_temp7, xmm_temp3);
5772     __ movdqu(xmm_temp8, xmm_temp6);
5773     __ pslld(xmm_temp3, 1);
5774     __ pslld(xmm_temp6, 1);
5775     __ psrld(xmm_temp7, 31);
5776     __ psrld(xmm_temp8, 31);
5777     __ movdqu(xmm_temp9, xmm_temp7);
5778     __ pslldq(xmm_temp8, 4);
5779     __ pslldq(xmm_temp7, 4);
5780     __ psrldq(xmm_temp9, 12);
5781     __ por(xmm_temp3, xmm_temp7);
5782     __ por(xmm_temp6, xmm_temp8);
5783     __ por(xmm_temp6, xmm_temp9);
5784 
5785     //
5786     // First phase of the reduction
5787     //
5788     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
5789     // independently.
5790     __ movdqu(xmm_temp7, xmm_temp3);
5791     __ movdqu(xmm_temp8, xmm_temp3);
5792     __ movdqu(xmm_temp9, xmm_temp3);
5793     __ pslld(xmm_temp7, 31);    // packed left shift by 31
5794     __ pslld(xmm_temp8, 30);    // packed left shift by 30
5795     __ pslld(xmm_temp9, 25);    // packed left shift by 25
5796     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
5797     __ pxor(xmm_temp7, xmm_temp9);
5798     __ movdqu(xmm_temp8, xmm_temp7);
5799     __ pslldq(xmm_temp7, 12);
5800     __ psrldq(xmm_temp8, 4);
5801     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
5802 
5803     //
5804     // Second phase of the reduction
5805     //
5806     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
5807     // shift operations.
5808     __ movdqu(xmm_temp2, xmm_temp3);
5809     __ movdqu(xmm_temp4, xmm_temp3);
5810     __ movdqu(xmm_temp5, xmm_temp3);
5811     __ psrld(xmm_temp2, 1);     // packed right shift by 1
5812     __ psrld(xmm_temp4, 2);     // packed right shift by 2
5813     __ psrld(xmm_temp5, 7);     // packed right shift by 7
5814     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
5815     __ pxor(xmm_temp2, xmm_temp5);
5816     __ pxor(xmm_temp2, xmm_temp8);
5817     __ pxor(xmm_temp3, xmm_temp2);
5818     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
5819 
5820     __ decrement(blocks);
5821     __ jcc(Assembler::zero, L_exit);
5822     __ movdqu(xmm_temp0, xmm_temp6);
5823     __ addptr(data, 16);
5824     __ jmp(L_ghash_loop);
5825 
5826     __ BIND(L_exit);
5827     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
5828     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
5829     __ leave();
5830     __ ret(0);
5831     return start;
5832   }
5833 
5834   address base64_shuffle_addr()
5835   {
5836     __ align64();
5837     StubCodeMark mark(this, "StubRoutines", "shuffle_base64");
5838     address start = __ pc();
5839     assert(((unsigned long long)start & 0x3f) == 0,
5840            "Alignment problem (0x%08llx)", (unsigned long long)start);
5841     __ emit_data64(0x0405030401020001, relocInfo::none);
5842     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5843     __ emit_data64(0x10110f100d0e0c0d, relocInfo::none);
5844     __ emit_data64(0x1617151613141213, relocInfo::none);
5845     __ emit_data64(0x1c1d1b1c191a1819, relocInfo::none);
5846     __ emit_data64(0x222321221f201e1f, relocInfo::none);
5847     __ emit_data64(0x2829272825262425, relocInfo::none);
5848     __ emit_data64(0x2e2f2d2e2b2c2a2b, relocInfo::none);
5849     return start;
5850   }
5851 
5852   address base64_avx2_shuffle_addr()
5853   {
5854     __ align32();
5855     StubCodeMark mark(this, "StubRoutines", "avx2_shuffle_base64");
5856     address start = __ pc();
5857     __ emit_data64(0x0809070805060405, relocInfo::none);
5858     __ emit_data64(0x0e0f0d0e0b0c0a0b, relocInfo::none);
5859     __ emit_data64(0x0405030401020001, relocInfo::none);
5860     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5861     return start;
5862   }
5863 
5864   address base64_avx2_input_mask_addr()
5865   {
5866     __ align32();
5867     StubCodeMark mark(this, "StubRoutines", "avx2_input_mask_base64");
5868     address start = __ pc();
5869     __ emit_data64(0x8000000000000000, relocInfo::none);
5870     __ emit_data64(0x8000000080000000, relocInfo::none);
5871     __ emit_data64(0x8000000080000000, relocInfo::none);
5872     __ emit_data64(0x8000000080000000, relocInfo::none);
5873     return start;
5874   }
5875 
5876   address base64_avx2_lut_addr()
5877   {
5878     __ align32();
5879     StubCodeMark mark(this, "StubRoutines", "avx2_lut_base64");
5880     address start = __ pc();
5881     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5882     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5883     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5884     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5885 
5886     // URL LUT
5887     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5888     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5889     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5890     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5891     return start;
5892   }
5893 
5894   address base64_encoding_table_addr()
5895   {
5896     __ align64();
5897     StubCodeMark mark(this, "StubRoutines", "encoding_table_base64");
5898     address start = __ pc();
5899     assert(((unsigned long long)start & 0x3f) == 0, "Alignment problem (0x%08llx)", (unsigned long long)start);
5900     __ emit_data64(0x4847464544434241, relocInfo::none);
5901     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5902     __ emit_data64(0x5857565554535251, relocInfo::none);
5903     __ emit_data64(0x6665646362615a59, relocInfo::none);
5904     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5905     __ emit_data64(0x767574737271706f, relocInfo::none);
5906     __ emit_data64(0x333231307a797877, relocInfo::none);
5907     __ emit_data64(0x2f2b393837363534, relocInfo::none);
5908 
5909     // URL table
5910     __ emit_data64(0x4847464544434241, relocInfo::none);
5911     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5912     __ emit_data64(0x5857565554535251, relocInfo::none);
5913     __ emit_data64(0x6665646362615a59, relocInfo::none);
5914     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5915     __ emit_data64(0x767574737271706f, relocInfo::none);
5916     __ emit_data64(0x333231307a797877, relocInfo::none);
5917     __ emit_data64(0x5f2d393837363534, relocInfo::none);
5918     return start;
5919   }
5920 
5921   // Code for generating Base64 encoding.
5922   // Intrinsic function prototype in Base64.java:
5923   // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp,
5924   // boolean isURL) {
5925   address generate_base64_encodeBlock()
5926   {
5927     __ align(CodeEntryAlignment);
5928     StubCodeMark mark(this, "StubRoutines", "implEncode");
5929     address start = __ pc();
5930     __ enter();
5931 
5932     // Save callee-saved registers before using them
5933     __ push(r12);
5934     __ push(r13);
5935     __ push(r14);
5936     __ push(r15);
5937 
5938     // arguments
5939     const Register source = c_rarg0;       // Source Array
5940     const Register start_offset = c_rarg1; // start offset
5941     const Register end_offset = c_rarg2;   // end offset
5942     const Register dest = c_rarg3;   // destination array
5943 
5944 #ifndef _WIN64
5945     const Register dp = c_rarg4;    // Position for writing to dest array
5946     const Register isURL = c_rarg5; // Base64 or URL character set
5947 #else
5948     const Address dp_mem(rbp, 6 * wordSize); // dp is on the stack on Win64
5949     const Address isURL_mem(rbp, 7 * wordSize);
5950     const Register isURL = r10; // pick the volatile windows register
5951     const Register dp = r12;
5952     __ movl(dp, dp_mem);
5953     __ movl(isURL, isURL_mem);
5954 #endif
5955 
5956     const Register length = r14;
5957     const Register encode_table = r13;
5958     Label L_process3, L_exit, L_processdata, L_vbmiLoop, L_not512, L_32byteLoop;
5959 
5960     // calculate length from offsets
5961     __ movl(length, end_offset);
5962     __ subl(length, start_offset);
5963     __ cmpl(length, 0);
5964     __ jcc(Assembler::lessEqual, L_exit);
5965 
5966     // Code for 512-bit VBMI encoding.  Encodes 48 input bytes into 64
5967     // output bytes. We read 64 input bytes and ignore the last 16, so be
5968     // sure not to read past the end of the input buffer.
5969     if (VM_Version::supports_avx512_vbmi()) {
5970       __ cmpl(length, 64); // Do not overrun input buffer.
5971       __ jcc(Assembler::below, L_not512);
5972 
5973       __ shll(isURL, 6); // index into decode table based on isURL
5974       __ lea(encode_table, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5975       __ addptr(encode_table, isURL);
5976       __ shrl(isURL, 6); // restore isURL
5977 
5978       __ mov64(rax, 0x3036242a1016040aull); // Shifts
5979       __ evmovdquq(xmm3, ExternalAddress(StubRoutines::x86::base64_shuffle_addr()), Assembler::AVX_512bit, r15);
5980       __ evmovdquq(xmm2, Address(encode_table, 0), Assembler::AVX_512bit);
5981       __ evpbroadcastq(xmm1, rax, Assembler::AVX_512bit);
5982 
5983       __ align32();
5984       __ BIND(L_vbmiLoop);
5985 
5986       __ vpermb(xmm0, xmm3, Address(source, start_offset), Assembler::AVX_512bit);
5987       __ subl(length, 48);
5988 
5989       // Put the input bytes into the proper lanes for writing, then
5990       // encode them.
5991       __ evpmultishiftqb(xmm0, xmm1, xmm0, Assembler::AVX_512bit);
5992       __ vpermb(xmm0, xmm0, xmm2, Assembler::AVX_512bit);
5993 
5994       // Write to destination
5995       __ evmovdquq(Address(dest, dp), xmm0, Assembler::AVX_512bit);
5996 
5997       __ addptr(dest, 64);
5998       __ addptr(source, 48);
5999       __ cmpl(length, 64);
6000       __ jcc(Assembler::aboveEqual, L_vbmiLoop);
6001 
6002       __ vzeroupper();
6003     }
6004 
6005     __ BIND(L_not512);
6006     if (VM_Version::supports_avx2()
6007         && VM_Version::supports_avx512vlbw()) {
6008       /*
6009       ** This AVX2 encoder is based off the paper at:
6010       **      https://dl.acm.org/doi/10.1145/3132709
6011       **
6012       ** We use AVX2 SIMD instructions to encode 24 bytes into 32
6013       ** output bytes.
6014       **
6015       */
6016       // Lengths under 32 bytes are done with scalar routine
6017       __ cmpl(length, 31);
6018       __ jcc(Assembler::belowEqual, L_process3);
6019 
6020       // Set up supporting constant table data
6021       __ vmovdqu(xmm9, ExternalAddress(StubRoutines::x86::base64_avx2_shuffle_addr()), rax);
6022       // 6-bit mask for 2nd and 4th (and multiples) 6-bit values
6023       __ movl(rax, 0x0fc0fc00);
6024       __ vmovdqu(xmm1, ExternalAddress(StubRoutines::x86::base64_avx2_input_mask_addr()), rax);
6025       __ evpbroadcastd(xmm8, rax, Assembler::AVX_256bit);
6026 
6027       // Multiplication constant for "shifting" right by 6 and 10
6028       // bits
6029       __ movl(rax, 0x04000040);
6030 
6031       __ subl(length, 24);
6032       __ evpbroadcastd(xmm7, rax, Assembler::AVX_256bit);
6033 
6034       // For the first load, we mask off reading of the first 4
6035       // bytes into the register. This is so we can get 4 3-byte
6036       // chunks into each lane of the register, avoiding having to
6037       // handle end conditions.  We then shuffle these bytes into a
6038       // specific order so that manipulation is easier.
6039       //
6040       // The initial read loads the XMM register like this:
6041       //
6042       // Lower 128-bit lane:
6043       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
6044       // | XX | XX | XX | XX | A0 | A1 | A2 | B0 | B1 | B2 | C0 | C1
6045       // | C2 | D0 | D1 | D2 |
6046       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
6047       //
6048       // Upper 128-bit lane:
6049       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
6050       // | E0 | E1 | E2 | F0 | F1 | F2 | G0 | G1 | G2 | H0 | H1 | H2
6051       // | XX | XX | XX | XX |
6052       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
6053       //
6054       // Where A0 is the first input byte, B0 is the fourth, etc.
6055       // The alphabetical significance denotes the 3 bytes to be
6056       // consumed and encoded into 4 bytes.
6057       //
6058       // We then shuffle the register so each 32-bit word contains
6059       // the sequence:
6060       //    A1 A0 A2 A1, B1, B0, B2, B1, etc.
6061       // Each of these byte sequences are then manipulated into 4
6062       // 6-bit values ready for encoding.
6063       //
6064       // If we focus on one set of 3-byte chunks, changing the
6065       // nomenclature such that A0 => a, A1 => b, and A2 => c, we
6066       // shuffle such that each 24-bit chunk contains:
6067       //
6068       // b7 b6 b5 b4 b3 b2 b1 b0 | a7 a6 a5 a4 a3 a2 a1 a0 | c7 c6
6069       // c5 c4 c3 c2 c1 c0 | b7 b6 b5 b4 b3 b2 b1 b0
6070       // Viewed as a little-endian 32-bit word (MSB first) and renaming to the four 6-bit output values a..d (input bits aaaaaabb bbbbcccc ccdddddd), this becomes:
6071       // b3 b2 b1 b0 c5 c4 c3 c2 | c1 c0 d5 d4 d3 d2 d1 d0 | a5 a4
6072       // a3 a2 a1 a0 b5 b4 | b3 b2 b1 b0 c5 c4 c3 c2
6073       //
6074       // We first AND off all but bits 4-9 and 16-21 (c5..c0 and
6075       // a5..a0) and shift them using a vector multiplication
6076       // operation (vpmulhuw) which effectively shifts c right by 6
6077       // bits and a right by 10 bits.  We similarly mask bits 10-15
6078       // (d5..d0) and 22-27 (b5..b0) and shift them left by 8 and 4
6079       // bits respectively.  This is done using vpmullw.  We end up
6080       // with 4 6-bit values, thus splitting the 3 input bytes,
6081       // ready for encoding:
6082       //    0 0 d5..d0 0 0 c5..c0 0 0 b5..b0 0 0 a5..a0
6083       //
6084       // For translation, we recognize that there are 5 distinct
6085       // ranges of legal Base64 characters as below:
6086       //
6087       //   +-------------+-------------+------------+
6088       //   | 6-bit value | ASCII range |   offset   |
6089       //   +-------------+-------------+------------+
6090       //   |    0..25    |    A..Z     |     65     |
6091       //   |   26..51    |    a..z     |     71     |
6092       //   |   52..61    |    0..9     |     -4     |
6093       //   |     62      |   + or -    | -19 or -17 |
6094       //   |     63      |   / or _    | -16 or 32  |
6095       //   +-------------+-------------+------------+
6096       //
6097       // We note that vpshufb does a parallel lookup in a
6098       // destination register using the lower 4 bits of bytes from a
6099       // source register.  If we use a saturated subtraction and
6100       // subtract 51 from each 6-bit value, bytes from [0,51]
6101       // saturate to 0, and [52,63] map to a range of [1,12].  We
6102       // distinguish the [0,25] and [26,51] ranges by assigning a
6103       // value of 13 for all 6-bit values less than 26.  We end up
6104       // with:
6105       //
6106       //   +-------------+-------------+------------+
6107       //   | 6-bit value |   Reduced   |   offset   |
6108       //   +-------------+-------------+------------+
6109       //   |    0..25    |     13      |     65     |
6110       //   |   26..51    |      0      |     71     |
6111       //   |   52..61    |    0..9     |     -4     |
6112       //   |     62      |     11      | -19 or -17 |
6113       //   |     63      |     12      | -16 or 32  |
6114       //   +-------------+-------------+------------+
6115       //
6116       // We then use a final vpshufb to add the appropriate offset,
6117       // translating the bytes.
6118       //
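      // Worked example: input bytes 'M','a','n' (0x4d 0x61 0x6e) split into the
      // 6-bit values 19, 22, 5, 46; applying the offsets above (65, 65, 65, 71)
      // yields the ASCII output "TWFu".
      //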
6119       // Load input bytes - only 28 bytes.  Mask the first load to
6120       // not load into the full register.
6121       __ vpmaskmovd(xmm1, xmm1, Address(source, start_offset, Address::times_1, -4), Assembler::AVX_256bit);
6122 
6123       // Move 3-byte chunks of input (12 bytes) into 16 bytes,
6124       // ordering by:
6125       //   1, 0, 2, 1; 4, 3, 5, 4; etc.  This groups 6-bit chunks
6126       //   for easy masking
6127       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
6128 
6129       __ addl(start_offset, 24);
6130 
6131       // Load masking register for first and third (and multiples)
6132       // 6-bit values.
6133       __ movl(rax, 0x003f03f0);
6134       __ evpbroadcastd(xmm6, rax, Assembler::AVX_256bit);
6135       // Multiplication constant for "shifting" left by 4 and 8 bits
6136       __ movl(rax, 0x01000010);
6137       __ evpbroadcastd(xmm5, rax, Assembler::AVX_256bit);
6138 
6139       // Isolate 6-bit chunks of interest
6140       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
6141 
6142       // Load constants for encoding
6143       __ movl(rax, 0x19191919);
6144       __ evpbroadcastd(xmm3, rax, Assembler::AVX_256bit);
6145       __ movl(rax, 0x33333333);
6146       __ evpbroadcastd(xmm4, rax, Assembler::AVX_256bit);
6147 
6148       // Shift output bytes 0 and 2 into proper lanes
6149       __ vpmulhuw(xmm2, xmm0, xmm7, Assembler::AVX_256bit);
6150 
6151       // Mask and shift output bytes 1 and 3 into proper lanes and
6152       // combine
6153       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
6154       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
6155       __ vpor(xmm0, xmm0, xmm2, Assembler::AVX_256bit);
6156 
6157       // Find out which are 0..25.  This indicates which input
6158       // values fall in the range of 'A'-'Z', which require an
6159       // additional offset (see comments above)
6160       __ vpcmpgtb(xmm2, xmm0, xmm3, Assembler::AVX_256bit);
6161       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
6162       __ vpsubb(xmm1, xmm1, xmm2, Assembler::AVX_256bit);
6163 
6164       // Load the proper lookup table
6165       __ lea(r11, ExternalAddress(StubRoutines::x86::base64_avx2_lut_addr()));
6166       __ movl(r15, isURL);
6167       __ shll(r15, 5);
6168       __ vmovdqu(xmm2, Address(r11, r15));
6169 
6170       // Shuffle the offsets based on the range calculation done
6171       // above. This allows us to add the correct offset to the
6172       // 6-bit value corresponding to the range documented above.
6173       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
6174       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
6175 
6176       // Store the encoded bytes
6177       __ vmovdqu(Address(dest, dp), xmm0);
6178       __ addl(dp, 32);
6179 
6180       __ cmpl(length, 31);
6181       __ jcc(Assembler::belowEqual, L_process3);
6182 
6183       __ align32();
6184       __ BIND(L_32byteLoop);
6185 
6186       // Get next 32 bytes
6187       __ vmovdqu(xmm1, Address(source, start_offset, Address::times_1, -4));
6188 
6189       __ subl(length, 24);
6190       __ addl(start_offset, 24);
6191 
6192       // This logic is identical to the above, with only constant
6193       // register loads removed.  Shuffle the input, mask off 6-bit
6194       // chunks, shift them into place, then add the offset to
6195       // encode.
6196       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
6197 
6198       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
6199       __ vpmulhuw(xmm10, xmm0, xmm7, Assembler::AVX_256bit);
6200       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
6201       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
6202       __ vpor(xmm0, xmm0, xmm10, Assembler::AVX_256bit);
6203       __ vpcmpgtb(xmm10, xmm0, xmm3, Assembler::AVX_256bit);
6204       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
6205       __ vpsubb(xmm1, xmm1, xmm10, Assembler::AVX_256bit);
6206       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
6207       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
6208 
6209       // Store the encoded bytes
6210       __ vmovdqu(Address(dest, dp), xmm0);
6211       __ addl(dp, 32);
6212 
6213       __ cmpl(length, 31);
6214       __ jcc(Assembler::above, L_32byteLoop);
6215 
6216       __ BIND(L_process3);
6217       __ vzeroupper();
6218     } else {
6219       __ BIND(L_process3);
6220     }
6221 
6222     __ cmpl(length, 3);
6223     __ jcc(Assembler::below, L_exit);
6224 
6225     // Load the encoding table based on isURL
6226     __ lea(r11, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
6227     __ movl(r15, isURL);
6228     __ shll(r15, 6);
6229     __ addptr(r11, r15);
6230 
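    // Scalar path: encode three input bytes into four output characters per
    // iteration, translating each 6-bit value through the table at r11.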
6231     __ BIND(L_processdata);
6232 
6233     // Load 3 bytes
6234     __ load_unsigned_byte(r15, Address(source, start_offset));
6235     __ load_unsigned_byte(r10, Address(source, start_offset, Address::times_1, 1));
6236     __ load_unsigned_byte(r13, Address(source, start_offset, Address::times_1, 2));
6237 
6238     // Build a 32-bit word with bytes 1, 2, 0, 1
6239     __ movl(rax, r10);
6240     __ shll(r10, 24);
6241     __ orl(rax, r10);
6242 
6243     __ subl(length, 3);
6244 
6245     __ shll(r15, 8);
6246     __ shll(r13, 16);
6247     __ orl(rax, r15);
6248 
6249     __ addl(start_offset, 3);
6250 
6251     __ orl(rax, r13);
6252     // At this point, rax contains | byte1 | byte2 | byte0 | byte1
6253     // r13 has byte2 << 16 - need low-order 6 bits to translate.
6254     // This translated byte is the fourth output byte.
6255     __ shrl(r13, 16);
6256     __ andl(r13, 0x3f);
6257 
6258     // The high-order 6 bits of r15 (byte0) are translated.
6259     // The translated byte is the first output byte.
6260     __ shrl(r15, 10);
6261 
6262     __ load_unsigned_byte(r13, Address(r11, r13));
6263     __ load_unsigned_byte(r15, Address(r11, r15));
6264 
6265     __ movb(Address(dest, dp, Address::times_1, 3), r13);
6266 
6267     // Extract high-order 4 bits of byte1 and low-order 2 bits of byte0.
6268     // This translated byte is the second output byte.
6269     __ shrl(rax, 4);
6270     __ movl(r10, rax);
6271     __ andl(rax, 0x3f);
6272 
6273     __ movb(Address(dest, dp, Address::times_1, 0), r15);
6274 
6275     __ load_unsigned_byte(rax, Address(r11, rax));
6276 
6277     // Extract low-order 2 bits of byte1 and high-order 4 bits of byte2.
6278     // This translated byte is the third output byte.
6279     __ shrl(r10, 18);
6280     __ andl(r10, 0x3f);
6281 
6282     __ load_unsigned_byte(r10, Address(r11, r10));
6283 
6284     __ movb(Address(dest, dp, Address::times_1, 1), rax);
6285     __ movb(Address(dest, dp, Address::times_1, 2), r10);
6286 
6287     __ addl(dp, 4);
6288     __ cmpl(length, 3);
6289     __ jcc(Assembler::aboveEqual, L_processdata);
6290 
6291     __ BIND(L_exit);
6292     __ pop(r15);
6293     __ pop(r14);
6294     __ pop(r13);
6295     __ pop(r12);
6296     __ leave();
6297     __ ret(0);
6298     return start;
6299   }
6300 
6301   // base64 AVX512vbmi tables
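  // These lookup tables are used by the AVX512-VBMI decoder (vpermb-based
  // translation of the ASCII input); entries of 0x80 flag characters that are not
  // part of the alphabet so invalid input can be detected.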
6302   address base64_vbmi_lookup_lo_addr() {
6303     __ align64();
6304     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64");
6305     address start = __ pc();
6306     assert(((unsigned long long)start & 0x3f) == 0,
6307            "Alignment problem (0x%08llx)", (unsigned long long)start);
6308     __ emit_data64(0x8080808080808080, relocInfo::none);
6309     __ emit_data64(0x8080808080808080, relocInfo::none);
6310     __ emit_data64(0x8080808080808080, relocInfo::none);
6311     __ emit_data64(0x8080808080808080, relocInfo::none);
6312     __ emit_data64(0x8080808080808080, relocInfo::none);
6313     __ emit_data64(0x3f8080803e808080, relocInfo::none);
6314     __ emit_data64(0x3b3a393837363534, relocInfo::none);
6315     __ emit_data64(0x8080808080803d3c, relocInfo::none);
6316     return start;
6317   }
6318 
6319   address base64_vbmi_lookup_hi_addr() {
6320     __ align64();
6321     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64");
6322     address start = __ pc();
6323     assert(((unsigned long long)start & 0x3f) == 0,
6324            "Alignment problem (0x%08llx)", (unsigned long long)start);
6325     __ emit_data64(0x0605040302010080, relocInfo::none);
6326     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6327     __ emit_data64(0x161514131211100f, relocInfo::none);
6328     __ emit_data64(0x8080808080191817, relocInfo::none);
6329     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
6330     __ emit_data64(0x2827262524232221, relocInfo::none);
6331     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6332     __ emit_data64(0x8080808080333231, relocInfo::none);
6333     return start;
6334   }
6335   address base64_vbmi_lookup_lo_url_addr() {
6336     __ align64();
6337     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64url");
6338     address start = __ pc();
6339     assert(((unsigned long long)start & 0x3f) == 0,
6340            "Alignment problem (0x%08llx)", (unsigned long long)start);
6341     __ emit_data64(0x8080808080808080, relocInfo::none);
6342     __ emit_data64(0x8080808080808080, relocInfo::none);
6343     __ emit_data64(0x8080808080808080, relocInfo::none);
6344     __ emit_data64(0x8080808080808080, relocInfo::none);
6345     __ emit_data64(0x8080808080808080, relocInfo::none);
6346     __ emit_data64(0x80803e8080808080, relocInfo::none);
6347     __ emit_data64(0x3b3a393837363534, relocInfo::none);
6348     __ emit_data64(0x8080808080803d3c, relocInfo::none);
6349     return start;
6350   }
6351 
6352   address base64_vbmi_lookup_hi_url_addr() {
6353     __ align64();
6354     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64url");
6355     address start = __ pc();
6356     assert(((unsigned long long)start & 0x3f) == 0,
6357            "Alignment problem (0x%08llx)", (unsigned long long)start);
6358     __ emit_data64(0x0605040302010080, relocInfo::none);
6359     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6360     __ emit_data64(0x161514131211100f, relocInfo::none);
6361     __ emit_data64(0x3f80808080191817, relocInfo::none);
6362     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
6363     __ emit_data64(0x2827262524232221, relocInfo::none);
6364     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6365     __ emit_data64(0x8080808080333231, relocInfo::none);
6366     return start;
6367   }
6368 
6369   address base64_vbmi_pack_vec_addr() {
6370     __ align64();
6371     StubCodeMark mark(this, "StubRoutines", "pack_vec_base64");
6372     address start = __ pc();
6373     assert(((unsigned long long)start & 0x3f) == 0,
6374            "Alignment problem (0x%08llx)", (unsigned long long)start);
6375     __ emit_data64(0x090a040506000102, relocInfo::none);
6376     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
6377     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
6378     __ emit_data64(0x292a242526202122, relocInfo::none);
6379     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
6380     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
6381     __ emit_data64(0x0000000000000000, relocInfo::none);
6382     __ emit_data64(0x0000000000000000, relocInfo::none);
6383     return start;
6384   }
6385 
6386   address base64_vbmi_join_0_1_addr() {
6387     __ align64();
6388     StubCodeMark mark(this, "StubRoutines", "join_0_1_base64");
6389     address start = __ pc();
6390     assert(((unsigned long long)start & 0x3f) == 0,
6391            "Alignment problem (0x%08llx)", (unsigned long long)start);
6392     __ emit_data64(0x090a040506000102, relocInfo::none);
6393     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
6394     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
6395     __ emit_data64(0x292a242526202122, relocInfo::none);
6396     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
6397     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
6398     __ emit_data64(0x494a444546404142, relocInfo::none);
6399     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
6400     return start;
6401   }
6402 
6403   address base64_vbmi_join_1_2_addr() {
6404     __ align64();
6405     StubCodeMark mark(this, "StubRoutines", "join_1_2_base64");
6406     address start = __ pc();
6407     assert(((unsigned long long)start & 0x3f) == 0,
6408            "Alignment problem (0x%08llx)", (unsigned long long)start);
6409     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
6410     __ emit_data64(0x292a242526202122, relocInfo::none);
6411     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
6412     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
6413     __ emit_data64(0x494a444546404142, relocInfo::none);
6414     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
6415     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
6416     __ emit_data64(0x696a646566606162, relocInfo::none);
6417     return start;
6418   }
6419 
6420   address base64_vbmi_join_2_3_addr() {
6421     __ align64();
6422     StubCodeMark mark(this, "StubRoutines", "join_2_3_base64");
6423     address start = __ pc();
6424     assert(((unsigned long long)start & 0x3f) == 0,
6425            "Alignment problem (0x%08llx)", (unsigned long long)start);
6426     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
6427     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
6428     __ emit_data64(0x494a444546404142, relocInfo::none);
6429     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
6430     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
6431     __ emit_data64(0x696a646566606162, relocInfo::none);
6432     __ emit_data64(0x767071726c6d6e68, relocInfo::none);
6433     __ emit_data64(0x7c7d7e78797a7475, relocInfo::none);
6434     return start;
6435   }
6436 
6437   address base64_decoding_table_addr() {
6438     StubCodeMark mark(this, "StubRoutines", "decoding_table_base64");
6439     address start = __ pc();
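    // 256-byte table mapping each ASCII code to its 6-bit value for the standard
    // alphabet ('+' = 62, '/' = 63); 0xff marks bytes outside the alphabet.  A second
    // 256-byte table for the URL-safe alphabet ('-' = 62, '_' = 63) follows.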
6440     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6441     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6442     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6443     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6444     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6445     __ emit_data64(0x3fffffff3effffff, relocInfo::none);
6446     __ emit_data64(0x3b3a393837363534, relocInfo::none);
6447     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
6448     __ emit_data64(0x06050403020100ff, relocInfo::none);
6449     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6450     __ emit_data64(0x161514131211100f, relocInfo::none);
6451     __ emit_data64(0xffffffffff191817, relocInfo::none);
6452     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
6453     __ emit_data64(0x2827262524232221, relocInfo::none);
6454     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6455     __ emit_data64(0xffffffffff333231, relocInfo::none);
6456     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6457     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6458     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6459     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6460     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6461     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6462     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6463     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6464     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6465     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6466     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6467     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6468     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6469     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6470     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6471     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6472 
6473     // URL table
6474     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6475     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6476     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6477     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6478     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6479     __ emit_data64(0xffff3effffffffff, relocInfo::none);
6480     __ emit_data64(0x3b3a393837363534, relocInfo::none);
6481     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
6482     __ emit_data64(0x06050403020100ff, relocInfo::none);
6483     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6484     __ emit_data64(0x161514131211100f, relocInfo::none);
6485     __ emit_data64(0x3fffffffff191817, relocInfo::none);
6486     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
6487     __ emit_data64(0x2827262524232221, relocInfo::none);
6488     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6489     __ emit_data64(0xffffffffff333231, relocInfo::none);
6490     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6491     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6492     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6493     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6494     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6495     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6496     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6497     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6498     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6499     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6500     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6501     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6502     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6503     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6504     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6505     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6506     return start;
6507   }
6508 
6509 
6510 // Code for generating Base64 decoding.
6511 //
6512 // Based on the article (and associated code) from https://arxiv.org/abs/1910.05109.
6513 //
6514 // Intrinsic function prototype in Base64.java:
6515 // private int decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
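// The stub returns (in rax) the number of output bytes written: the code below saves the
// original dst pointer on entry and subtracts it from the final dst pointer on exit.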
6516   address generate_base64_decodeBlock() {
6517     __ align(CodeEntryAlignment);
6518     StubCodeMark mark(this, "StubRoutines", "implDecode");
6519     address start = __ pc();
6520     __ enter();
6521 
6522     // Save callee-saved registers before using them
6523     __ push(r12);
6524     __ push(r13);
6525     __ push(r14);
6526     __ push(r15);
6527     __ push(rbx);
6528 
6529     // arguments
6530     const Register source = c_rarg0; // Source Array
6531     const Register start_offset = c_rarg1; // start offset
6532     const Register end_offset = c_rarg2; // end offset
6533     const Register dest = c_rarg3; // destination array
6534     const Register isMIME = rbx;
6535 
6536 #ifndef _WIN64
6537     const Register dp = c_rarg4;  // Position for writing to dest array
6538     const Register isURL = c_rarg5;   // Base64 or URL character set
6539     __ movl(isMIME, Address(rbp, 2 * wordSize));
6540 #else
6541     const Address  dp_mem(rbp, 6 * wordSize);  // dp is passed on the stack on Win64
6542     const Address isURL_mem(rbp, 7 * wordSize);
6543     const Register isURL = r10;      // pick the volatile windows register
6544     const Register dp = r12;
6545     __ movl(dp, dp_mem);
6546     __ movl(isURL, isURL_mem);
6547     __ movl(isMIME, Address(rbp, 8 * wordSize));
6548 #endif
6549 
6550     const XMMRegister lookup_lo = xmm5;
6551     const XMMRegister lookup_hi = xmm6;
6552     const XMMRegister errorvec = xmm7;
6553     const XMMRegister pack16_op = xmm9;
6554     const XMMRegister pack32_op = xmm8;
6555     const XMMRegister input0 = xmm3;
6556     const XMMRegister input1 = xmm20;
6557     const XMMRegister input2 = xmm21;
6558     const XMMRegister input3 = xmm19;
6559     const XMMRegister join01 = xmm12;
6560     const XMMRegister join12 = xmm11;
6561     const XMMRegister join23 = xmm10;
6562     const XMMRegister translated0 = xmm2;
6563     const XMMRegister translated1 = xmm1;
6564     const XMMRegister translated2 = xmm0;
6565     const XMMRegister translated3 = xmm4;
6566 
6567     const XMMRegister merged0 = xmm2;
6568     const XMMRegister merged1 = xmm1;
6569     const XMMRegister merged2 = xmm0;
6570     const XMMRegister merged3 = xmm4;
6571     const XMMRegister merge_ab_bc0 = xmm2;
6572     const XMMRegister merge_ab_bc1 = xmm1;
6573     const XMMRegister merge_ab_bc2 = xmm0;
6574     const XMMRegister merge_ab_bc3 = xmm4;
6575 
6576     const XMMRegister pack24bits = xmm4;
6577 
6578     const Register length = r14;
6579     const Register output_size = r13;
6580     const Register output_mask = r15;
6581     const KRegister input_mask = k1;
6582 
6583     const XMMRegister input_initial_valid_b64 = xmm0;
6584     const XMMRegister tmp = xmm10;
6585     const XMMRegister mask = xmm0;
6586     const XMMRegister invalid_b64 = xmm1;
6587 
6588     Label L_process256, L_process64, L_process64Loop, L_exit, L_processdata, L_loadURL;
6589     Label L_continue, L_finalBit, L_padding, L_donePadding, L_bruteForce;
6590     Label L_forceLoop, L_bottomLoop, L_checkMIME, L_exit_no_vzero;
6591 
6592     // calculate length from offsets
6593     __ movl(length, end_offset);
6594     __ subl(length, start_offset);
6595     __ push(dest);          // Save for return value calc
6596 
6597     // If AVX512 VBMI (and BW) are not supported, only the non-AVX (scalar) code below is generated
6598     if (VM_Version::supports_avx512_vbmi() &&
6599        VM_Version::supports_avx512bw()) {
6600       __ cmpl(length, 128);     // 128 bytes is the break-even point for AVX-512
6601       __ jcc(Assembler::lessEqual, L_bruteForce);
6602 
6603       __ cmpl(isMIME, 0);
6604       __ jcc(Assembler::notEqual, L_bruteForce);
6605 
6606       // Load lookup tables based on isURL
6607       __ cmpl(isURL, 0);
6608       __ jcc(Assembler::notZero, L_loadURL);
6609 
6610       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_addr()), Assembler::AVX_512bit, r13);
6611       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_addr()), Assembler::AVX_512bit, r13);
6612 
6613       __ BIND(L_continue);
6614 
6615       __ movl(r15, 0x01400140);
6616       __ evpbroadcastd(pack16_op, r15, Assembler::AVX_512bit);
6617 
6618       __ movl(r15, 0x00011000);
6619       __ evpbroadcastd(pack32_op, r15, Assembler::AVX_512bit);
6620 
6621       __ cmpl(length, 0xff);
6622       __ jcc(Assembler::lessEqual, L_process64);
6623 
6624       // load masks required for decoding data
6625       __ BIND(L_processdata);
6626       __ evmovdquq(join01, ExternalAddress(StubRoutines::x86::base64_vbmi_join_0_1_addr()), Assembler::AVX_512bit, r13);
6627       __ evmovdquq(join12, ExternalAddress(StubRoutines::x86::base64_vbmi_join_1_2_addr()), Assembler::AVX_512bit, r13);
6628       __ evmovdquq(join23, ExternalAddress(StubRoutines::x86::base64_vbmi_join_2_3_addr()), Assembler::AVX_512bit, r13);
6629 
6630       __ align32();
6631       __ BIND(L_process256);
6632       // Grab input data
6633       __ evmovdquq(input0, Address(source, start_offset, Address::times_1, 0x00), Assembler::AVX_512bit);
6634       __ evmovdquq(input1, Address(source, start_offset, Address::times_1, 0x40), Assembler::AVX_512bit);
6635       __ evmovdquq(input2, Address(source, start_offset, Address::times_1, 0x80), Assembler::AVX_512bit);
6636       __ evmovdquq(input3, Address(source, start_offset, Address::times_1, 0xc0), Assembler::AVX_512bit);
6637 
6638       // Copy the low part of the lookup table into the destination of the permutation
6639       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6640       __ evmovdquq(translated1, lookup_lo, Assembler::AVX_512bit);
6641       __ evmovdquq(translated2, lookup_lo, Assembler::AVX_512bit);
6642       __ evmovdquq(translated3, lookup_lo, Assembler::AVX_512bit);
6643 
6644       // Translate the base64 input into "decoded" bytes
6645       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6646       __ evpermt2b(translated1, input1, lookup_hi, Assembler::AVX_512bit);
6647       __ evpermt2b(translated2, input2, lookup_hi, Assembler::AVX_512bit);
6648       __ evpermt2b(translated3, input3, lookup_hi, Assembler::AVX_512bit);
6649 
6650       // OR all of the translations together to check for errors (high-order bit of byte set)
6651       __ vpternlogd(input0, 0xfe, input1, input2, Assembler::AVX_512bit);
6652 
6653       __ vpternlogd(input3, 0xfe, translated0, translated1, Assembler::AVX_512bit);
6654       __ vpternlogd(input0, 0xfe, translated2, translated3, Assembler::AVX_512bit);
6655       __ vpor(errorvec, input3, input0, Assembler::AVX_512bit);
6656 
6657       // Check if there was an error - if so, try 64-byte chunks
6658       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6659       __ kortestql(k3, k3);
6660       __ jcc(Assembler::notZero, L_process64);
6661 
6662       // The merging and shuffling happens here
6663       // We multiply each byte pair [00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa]
6664       // Multiply [00cccccc] by 2^6 added to [00dddddd] to get [0000cccc | ccdddddd]
6665       // The pack16_op is a vector of 0x01400140, so multiply D by 1 and C by 0x40
6666       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6667       __ vpmaddubsw(merge_ab_bc1, translated1, pack16_op, Assembler::AVX_512bit);
6668       __ vpmaddubsw(merge_ab_bc2, translated2, pack16_op, Assembler::AVX_512bit);
6669       __ vpmaddubsw(merge_ab_bc3, translated3, pack16_op, Assembler::AVX_512bit);
6670 
6671       // Now do the same with packed 16-bit values.
6672       // We start with [0000cccc | ccdddddd | 0000aaaa | aabbbbbb]
6673       // pack32_op is 0x00011000 (2^12, 1), so this multiplies [0000aaaa | aabbbbbb] by 2^12
6674       // and adds [0000cccc | ccdddddd] to yield [00000000 | aaaaaabb | bbbbcccc | ccdddddd]
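      // For illustration, a scalar equivalent of the two multiply-adds for one group of
      // four decoded sextets a, b, c, d (this is also what the brute-force loop below does):
      //   int triple = (a << 18) | (b << 12) | (c << 6) | d;   // 24 payload bits
      //   out[0] = triple >> 16;  out[1] = triple >> 8;  out[2] = triple;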
6675       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6676       __ vpmaddwd(merged1, merge_ab_bc1, pack32_op, Assembler::AVX_512bit);
6677       __ vpmaddwd(merged2, merge_ab_bc2, pack32_op, Assembler::AVX_512bit);
6678       __ vpmaddwd(merged3, merge_ab_bc3, pack32_op, Assembler::AVX_512bit);
6679 
6680       // The join vectors specify which byte from which vector goes into the outputs
6681       // One of every 4 bytes in the extended vector is zero, so we pack them into their
6682       // final positions in the register for storing (256 bytes in, 192 bytes out)
6683       __ evpermt2b(merged0, join01, merged1, Assembler::AVX_512bit);
6684       __ evpermt2b(merged1, join12, merged2, Assembler::AVX_512bit);
6685       __ evpermt2b(merged2, join23, merged3, Assembler::AVX_512bit);
6686 
6687       // Store result
6688       __ evmovdquq(Address(dest, dp, Address::times_1, 0x00), merged0, Assembler::AVX_512bit);
6689       __ evmovdquq(Address(dest, dp, Address::times_1, 0x40), merged1, Assembler::AVX_512bit);
6690       __ evmovdquq(Address(dest, dp, Address::times_1, 0x80), merged2, Assembler::AVX_512bit);
6691 
6692       __ addptr(source, 0x100);
6693       __ addptr(dest, 0xc0);
6694       __ subl(length, 0x100);
6695       __ cmpl(length, 64 * 4);
6696       __ jcc(Assembler::greaterEqual, L_process256);
6697 
6698       // At this point, we've decoded 64 * 4 * n bytes.
6699       // The remaining length will be <= 64 * 4 - 1.
6700       // UNLESS there was an error decoding the first 256-byte chunk.  In this
6701       // case, the length will be arbitrarily long.
6702       //
6703       // Note that this will be the path for MIME-encoded strings.
6704 
6705       __ BIND(L_process64);
6706 
6707       __ evmovdquq(pack24bits, ExternalAddress(StubRoutines::x86::base64_vbmi_pack_vec_addr()), Assembler::AVX_512bit, r13);
6708 
6709       __ cmpl(length, 63);
6710       __ jcc(Assembler::lessEqual, L_finalBit);
6711 
6712       __ align32();
6713       __ BIND(L_process64Loop);
6714 
6715       // Handle first 64-byte block
6716 
6717       __ evmovdquq(input0, Address(source, start_offset), Assembler::AVX_512bit);
6718       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6719       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6720 
6721       __ vpor(errorvec, translated0, input0, Assembler::AVX_512bit);
6722 
6723       // Check for error and bomb out before updating dest
6724       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6725       __ kortestql(k3, k3);
6726       __ jcc(Assembler::notZero, L_exit);
6727 
6728       // Pack output register, selecting correct byte ordering
6729       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6730       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6731       __ vpermb(merged0, pack24bits, merged0, Assembler::AVX_512bit);
6732 
6733       __ evmovdquq(Address(dest, dp), merged0, Assembler::AVX_512bit);
6734 
6735       __ subl(length, 64);
6736       __ addptr(source, 64);
6737       __ addptr(dest, 48);
6738 
6739       __ cmpl(length, 64);
6740       __ jcc(Assembler::greaterEqual, L_process64Loop);
6741 
6742       __ cmpl(length, 0);
6743       __ jcc(Assembler::lessEqual, L_exit);
6744 
6745       __ BIND(L_finalBit);
6746       // Now have 1 to 63 bytes left to decode
6747 
6748       // Letting Java handle the final fragment would make it call this routine
6749       // repeatedly for every 4 bytes of remaining input, so handle the rest of the
6750       // data here instead.
6751       __ movq(rax, -1);
6752       __ bzhiq(rax, rax, length);    // Input mask in rax
6753 
6754       __ movl(output_size, length);
6755       __ shrl(output_size, 2);   // Find (len / 4) * 3 (output length)
6756       __ lea(output_size, Address(output_size, output_size, Address::times_2, 0));
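      // (the lea computes output_size + 2 * output_size, i.e. (length / 4) * 3)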
6757       // output_size in r13
6758 
6759       // Strip pad characters, if any, and adjust length and mask
6760       __ cmpb(Address(source, length, Address::times_1, -1), '=');
6761       __ jcc(Assembler::equal, L_padding);
6762 
6763       __ BIND(L_donePadding);
6764 
6765       // input_mask covers the remaining input bytes; output_mask has the low output_size bits set (all 1s >> (64 - output_size)).
6766       __ kmovql(input_mask, rax);
6767       __ movq(output_mask, -1);
6768       __ bzhiq(output_mask, output_mask, output_size);
6769 
6770       // Load initial input with all valid base64 characters.  Will be used
6771       // in merging source bytes to avoid masking when determining if an error occurred.
6772       __ movl(rax, 0x61616161);
6773       __ evpbroadcastd(input_initial_valid_b64, rax, Assembler::AVX_512bit);
6774 
6775       // A register containing all invalid base64 decoded values
6776       __ movl(rax, 0x80808080);
6777       __ evpbroadcastd(invalid_b64, rax, Assembler::AVX_512bit);
6778 
6779       // input_mask is in k1
6780       // output_size is in r13
6781       // output_mask is in r15
6782       // zmm0 - 0x61616161 (input_initial_valid_b64)
6783       // zmm1 - 0x80808080 (invalid_b64)
6784       // zmm2, zmm3 - free
6785       // zmm4 - pack vector (pack24bits)
6786       // zmm5 - lookup_lo
6787       // zmm6 - lookup_hi
6788       // zmm7 - errorvec
6789       // zmm8 - 0x00011000 (pack32_op)
6790       // zmm9 - 0x01400140 (pack16_op)
6791       // zmm10 - tmp
6792 
6793       // Load only the bytes from source, merging into our "fully-valid" register
6794       __ evmovdqub(input_initial_valid_b64, input_mask, Address(source, start_offset, Address::times_1, 0x0), true, Assembler::AVX_512bit);
6795 
6796       // Decode all bytes within our merged input
6797       __ evmovdquq(tmp, lookup_lo, Assembler::AVX_512bit);
6798       __ evpermt2b(tmp, input_initial_valid_b64, lookup_hi, Assembler::AVX_512bit);
6799       __ vporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);
6800 
6801       // Check for error.  Compare (decoded | initial) to all invalid.
6802       // If any bytes have their high-order bit set, then we have an error.
6803       __ evptestmb(k2, mask, invalid_b64, Assembler::AVX_512bit);
6804       __ kortestql(k2, k2);
6805 
6806       // If we have an error, use the brute force loop to decode what we can (4-byte chunks).
6807       __ jcc(Assembler::notZero, L_bruteForce);
6808 
6809       // Shuffle output bytes
6810       __ vpmaddubsw(tmp, tmp, pack16_op, Assembler::AVX_512bit);
6811       __ vpmaddwd(tmp, tmp, pack32_op, Assembler::AVX_512bit);
6812 
6813       __ vpermb(tmp, pack24bits, tmp, Assembler::AVX_512bit);
6814       __ kmovql(k1, output_mask);
6815       __ evmovdqub(Address(dest, dp), k1, tmp, true, Assembler::AVX_512bit);
6816 
6817       __ addptr(dest, output_size);
6818 
6819       __ BIND(L_exit);
6820       __ vzeroupper();
6821       __ pop(rax);             // Get original dest value
6822       __ subptr(dest, rax);      // Number of bytes converted
6823       __ movptr(rax, dest);
6824       __ pop(rbx);
6825       __ pop(r15);
6826       __ pop(r14);
6827       __ pop(r13);
6828       __ pop(r12);
6829       __ leave();
6830       __ ret(0);
6831 
6832       __ BIND(L_loadURL);
6833       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_url_addr()), Assembler::AVX_512bit, r13);
6834       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_url_addr()), Assembler::AVX_512bit, r13);
6835       __ jmp(L_continue);
6836 
6837       __ BIND(L_padding);
6838       __ decrementq(output_size, 1);
6839       __ shrq(rax, 1);
6840 
6841       __ cmpb(Address(source, length, Address::times_1, -2), '=');
6842       __ jcc(Assembler::notEqual, L_donePadding);
6843 
6844       __ decrementq(output_size, 1);
6845       __ shrq(rax, 1);
6846       __ jmp(L_donePadding);
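      // Rough example: with a 4-byte final fragment, output_size starts at 3; one
      // trailing '=' (e.g. "QUI=") reduces it to 2 and drops one input byte from the
      // mask, and two trailing '=' (e.g. "QQ==") reduce it to 1 and drop two bytes.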
6847 
6848       __ align32();
6849       __ BIND(L_bruteForce);
6850     }   // End of if(avx512_vbmi)
6851 
6852     // Use non-AVX code to decode 4-byte chunks into 3 bytes of output
6853 
6854     // Register state (Linux):
6855     // r12-15 - saved on stack
6856     // rdi - src
6857     // rsi - sp
6858     // rdx - sl
6859     // rcx - dst
6860     // r8 - dp
6861     // r9 - isURL
6862 
6863     // Register state (Windows):
6864     // r12-15 - saved on stack
6865     // rcx - src
6866     // rdx - sp
6867     // r8 - sl
6868     // r9 - dst
6869     // r12 - dp
6870     // r10 - isURL
6871 
6872     // Registers (common):
6873     // length (r14) - bytes in src
6874 
6875     const Register decode_table = r11;
6876     const Register out_byte_count = rbx;
6877     const Register byte1 = r13;
6878     const Register byte2 = r15;
6879     const Register byte3 = WINDOWS_ONLY(r8) NOT_WINDOWS(rdx);
6880     const Register byte4 = WINDOWS_ONLY(r10) NOT_WINDOWS(r9);
6881 
6882     __ shrl(length, 2);    // Only whole 4-byte chunks are processed - length is now the chunk count
6883     __ cmpl(length, 0);
6884     __ jcc(Assembler::lessEqual, L_exit_no_vzero);
6885 
6886     __ shll(isURL, 8);    // index into decode table based on isURL
6887     __ lea(decode_table, ExternalAddress(StubRoutines::x86::base64_decoding_table_addr()));
6888     __ addptr(decode_table, isURL);
6889 
6890     __ jmp(L_bottomLoop);
6891 
6892     __ align32();
6893     __ BIND(L_forceLoop);
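    // Reassemble the four decoded sextets into a 24-bit value,
    // (byte1 << 18) | (byte2 << 12) | (byte3 << 6) | byte4, and store it as three
    // output bytes in big-endian order.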
6894     __ shll(byte1, 18);
6895     __ shll(byte2, 12);
6896     __ shll(byte3, 6);
6897     __ orl(byte1, byte2);
6898     __ orl(byte1, byte3);
6899     __ orl(byte1, byte4);
6900 
6901     __ addptr(source, 4);
6902 
6903     __ movb(Address(dest, dp, Address::times_1, 2), byte1);
6904     __ shrl(byte1, 8);
6905     __ movb(Address(dest, dp, Address::times_1, 1), byte1);
6906     __ shrl(byte1, 8);
6907     __ movb(Address(dest, dp, Address::times_1, 0), byte1);
6908 
6909     __ addptr(dest, 3);
6910     __ decrementl(length, 1);
6911     __ jcc(Assembler::zero, L_exit_no_vzero);
6912 
6913     __ BIND(L_bottomLoop);
6914     __ load_unsigned_byte(byte1, Address(source, start_offset, Address::times_1, 0x00));
6915     __ load_unsigned_byte(byte2, Address(source, start_offset, Address::times_1, 0x01));
6916     __ load_signed_byte(byte1, Address(decode_table, byte1));
6917     __ load_signed_byte(byte2, Address(decode_table, byte2));
6918     __ load_unsigned_byte(byte3, Address(source, start_offset, Address::times_1, 0x02));
6919     __ load_unsigned_byte(byte4, Address(source, start_offset, Address::times_1, 0x03));
6920     __ load_signed_byte(byte3, Address(decode_table, byte3));
6921     __ load_signed_byte(byte4, Address(decode_table, byte4));
6922 
6923     __ mov(rax, byte1);
6924     __ orl(rax, byte2);
6925     __ orl(rax, byte3);
6926     __ orl(rax, byte4);
6927     __ jcc(Assembler::positive, L_forceLoop);
6928 
6929     __ BIND(L_exit_no_vzero);
6930     __ pop(rax);             // Get original dest value
6931     __ subptr(dest, rax);      // Number of bytes converted
6932     __ movptr(rax, dest);
6933     __ pop(rbx);
6934     __ pop(r15);
6935     __ pop(r14);
6936     __ pop(r13);
6937     __ pop(r12);
6938     __ leave();
6939     __ ret(0);
6940 
6941     return start;
6942   }
6943 
6944 
6945   /**
6946    *  Arguments:
6947    *
6948    * Inputs:
6949    *   c_rarg0   - int crc
6950    *   c_rarg1   - byte* buf
6951    *   c_rarg2   - int length
6952    *
6953    * Output:
6954    *       rax   - int crc result
6955    */
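  // Note: this is the CRC-32 used by java.util.zip.CRC32 (reflected polynomial 0xEDB88320);
  // an AVX-512/VPCLMULQDQ kernel is used when available, otherwise the CLMUL-based kernel.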
6956   address generate_updateBytesCRC32() {
6957     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
6958 
6959     __ align(CodeEntryAlignment);
6960     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
6961 
6962     address start = __ pc();
6963     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6964     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6965     // rscratch1: r10
6966     const Register crc   = c_rarg0;  // crc
6967     const Register buf   = c_rarg1;  // source java byte array address
6968     const Register len   = c_rarg2;  // length
6969     const Register table = c_rarg3;  // crc_table address (reuse register)
6970     const Register tmp1   = r11;
6971     const Register tmp2   = r10;
6972     assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax);
6973 
6974     BLOCK_COMMENT("Entry:");
6975     __ enter(); // required for proper stackwalking of RuntimeStub frame
6976 
6977     if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6978         VM_Version::supports_avx512bw() &&
6979         VM_Version::supports_avx512vl()) {
6980       __ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2);
6981     } else {
6982       __ kernel_crc32(crc, buf, len, table, tmp1);
6983     }
6984 
6985     __ movl(rax, crc);
6986     __ vzeroupper();
6987     __ leave(); // required for proper stackwalking of RuntimeStub frame
6988     __ ret(0);
6989 
6990     return start;
6991   }
6992 
6993   /**
6994   *  Arguments:
6995   *
6996   * Inputs:
6997   *   c_rarg0   - int crc
6998   *   c_rarg1   - byte* buf
6999   *   c_rarg2   - long length
7000   *   c_rarg3   - table_start - optional (present only when doing a library_call,
7001   *              not used by x86 algorithm)
7002   *
7003   * Output:
7004   *       rax   - int crc result
7005   */
7006   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
7007       assert(UseCRC32CIntrinsics, "need SSE4_2");
7008       __ align(CodeEntryAlignment);
7009       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
7010       address start = __ pc();
7011       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
7012       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
7013       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
7014       const Register crc = c_rarg0;  // crc
7015       const Register buf = c_rarg1;  // source java byte array address
7016       const Register len = c_rarg2;  // length
7017       const Register a = rax;
7018       const Register j = r9;
7019       const Register k = r10;
7020       const Register l = r11;
7021 #ifdef _WIN64
7022       const Register y = rdi;
7023       const Register z = rsi;
7024 #else
7025       const Register y = rcx;
7026       const Register z = r8;
7027 #endif
7028       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
7029 
7030       BLOCK_COMMENT("Entry:");
7031       __ enter(); // required for proper stackwalking of RuntimeStub frame
7032 #ifdef _WIN64
7033       __ push(y);
7034       __ push(z);
7035 #endif
7036       __ crc32c_ipl_alg2_alt2(crc, buf, len,
7037                               a, j, k,
7038                               l, y, z,
7039                               c_farg0, c_farg1, c_farg2,
7040                               is_pclmulqdq_supported);
7041       __ movl(rax, crc);
7042 #ifdef _WIN64
7043       __ pop(z);
7044       __ pop(y);
7045 #endif
7046       __ vzeroupper();
7047       __ leave(); // required for proper stackwalking of RuntimeStub frame
7048       __ ret(0);
7049 
7050       return start;
7051   }
7052 
7053 
7054   /**
7055    *  Arguments:
7056    *
7057    *  Inputs:
7058    *   c_rarg0   - int   adler
7059    *   c_rarg1   - byte* buff
7060    *   c_rarg2   - int   len
7061    *
7062    * Output:
7063    *   rax   - int adler result
7064    */
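  // For reference (RFC 1950): Adler-32 keeps two sums modulo 65521, A = 1 + sum of the input
  // bytes and B = running sum of the successive A values; the result is (B << 16) | A.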
7065 
7066   address generate_updateBytesAdler32() {
7067       assert(UseAdler32Intrinsics, "need AVX2");
7068 
7069       __ align(CodeEntryAlignment);
7070       StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
7071 
7072       address start = __ pc();
7073 
7074       const Register data = r9;
7075       const Register size = r10;
7076 
7077       const XMMRegister yshuf0 = xmm6;
7078       const XMMRegister yshuf1 = xmm7;
7079       assert_different_registers(c_rarg0, c_rarg1, c_rarg2, data, size);
7080 
7081       BLOCK_COMMENT("Entry:");
7082       __ enter(); // required for proper stackwalking of RuntimeStub frame
7083 
7084       __ vmovdqu(yshuf0, ExternalAddress((address) StubRoutines::x86::_adler32_shuf0_table), r9);
7085       __ vmovdqu(yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_shuf1_table), r9);
7086       __ movptr(data, c_rarg1); //data
7087       __ movl(size, c_rarg2); //length
7088       __ updateBytesAdler32(c_rarg0, data, size, yshuf0, yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_ascale_table));
7089       __ leave();
7090       __ ret(0);
7091       return start;
7092   }
7093 
7094   /**
7095    *  Arguments:
7096    *
7097    *  Input:
7098    *    c_rarg0   - x address
7099    *    c_rarg1   - x length
7100    *    c_rarg2   - y address
7101    *    c_rarg3   - y length
7102    * not Win64
7103    *    c_rarg4   - z address
7104    *    c_rarg5   - z length
7105    * Win64
7106    *    rsp+40    - z address
7107    *    rsp+48    - z length
7108    */
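  // Used by the BigInteger multiplyToLen intrinsic: computes z = x * y, where x, y and z
  // are int[] magnitudes (most significant int first) and zlen == xlen + ylen.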
7109   address generate_multiplyToLen() {
7110     __ align(CodeEntryAlignment);
7111     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
7112 
7113     address start = __ pc();
7114     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
7115     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
7116     const Register x     = rdi;
7117     const Register xlen  = rax;
7118     const Register y     = rsi;
7119     const Register ylen  = rcx;
7120     const Register z     = r8;
7121     const Register zlen  = r11;
7122 
7123     // Next registers will be saved on stack in multiply_to_len().
7124     const Register tmp1  = r12;
7125     const Register tmp2  = r13;
7126     const Register tmp3  = r14;
7127     const Register tmp4  = r15;
7128     const Register tmp5  = rbx;
7129 
7130     BLOCK_COMMENT("Entry:");
7131     __ enter(); // required for proper stackwalking of RuntimeStub frame
7132 
7133 #ifndef _WIN64
7134     __ movptr(zlen, r9); // Save r9 in r11 - zlen
7135 #endif
7136     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
7137                        // ylen => rcx, z => r8, zlen => r11
7138                        // r9 and r10 may be used to save non-volatile registers
7139 #ifdef _WIN64
7140     // last 2 arguments (#4, #5) are on stack on Win64
7141     __ movptr(z, Address(rsp, 6 * wordSize));
7142     __ movptr(zlen, Address(rsp, 7 * wordSize));
7143 #endif
7144 
7145     __ movptr(xlen, rsi);
7146     __ movptr(y,    rdx);
7147     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
7148 
7149     restore_arg_regs();
7150 
7151     __ leave(); // required for proper stackwalking of RuntimeStub frame
7152     __ ret(0);
7153 
7154     return start;
7155   }
7156 
7157   /**
7158   *  Arguments:
7159   *
7160   *  Input:
7161   *    c_rarg0   - obja     address
7162   *    c_rarg1   - objb     address
7163   *    c_rarg2   - length   length
7164   *    c_rarg3   - scale    log2_array_index_scale
7165   *
7166   *  Output:
7167   *        rax   - int: >= 0 index of first mismatch, < 0 bitwise complement of tail
7168   */
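  // See jdk.internal.util.ArraysSupport.vectorizedMismatch: a result >= 0 is the index of
  // the first mismatching element, while a result < 0 is the bitwise complement of the
  // number of tail elements the caller still has to compare.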
7169   address generate_vectorizedMismatch() {
7170     __ align(CodeEntryAlignment);
7171     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
7172     address start = __ pc();
7173 
7174     BLOCK_COMMENT("Entry:");
7175     __ enter();
7176 
7177 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
7178     const Register scale = c_rarg0;  //rcx, will exchange with r9
7179     const Register objb = c_rarg1;   //rdx
7180     const Register length = c_rarg2; //r8
7181     const Register obja = c_rarg3;   //r9
7182     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
7183 
7184     const Register tmp1 = r10;
7185     const Register tmp2 = r11;
7186 #endif
7187 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
7188     const Register obja = c_rarg0;   //U:rdi
7189     const Register objb = c_rarg1;   //U:rsi
7190     const Register length = c_rarg2; //U:rdx
7191     const Register scale = c_rarg3;  //U:rcx
7192     const Register tmp1 = r8;
7193     const Register tmp2 = r9;
7194 #endif
7195     const Register result = rax; //return value
7196     const XMMRegister vec0 = xmm0;
7197     const XMMRegister vec1 = xmm1;
7198     const XMMRegister vec2 = xmm2;
7199 
7200     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
7201 
7202     __ vzeroupper();
7203     __ leave();
7204     __ ret(0);
7205 
7206     return start;
7207   }
7208 
7209   /**
7210    *  Arguments:
7211    *
7212    *  Input:
7213    *    c_rarg0   - x address
7214    *    c_rarg1   - x length
7215    *    c_rarg2   - z address
7216    *    c_rarg3   - z length
7217    *
7218    */
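  // Used by the BigInteger squareToLen intrinsic: computes z = x * x, with zlen == 2 * xlen.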
7219   address generate_squareToLen() {
7220 
7221     __ align(CodeEntryAlignment);
7222     StubCodeMark mark(this, "StubRoutines", "squareToLen");
7223 
7224     address start = __ pc();
7225     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
7226     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
7227     const Register x      = rdi;
7228     const Register len    = rsi;
7229     const Register z      = r8;
7230     const Register zlen   = rcx;
7231 
7232     const Register tmp1      = r12;
7233     const Register tmp2      = r13;
7234     const Register tmp3      = r14;
7235     const Register tmp4      = r15;
7236     const Register tmp5      = rbx;
7237 
7238     BLOCK_COMMENT("Entry:");
7239     __ enter(); // required for proper stackwalking of RuntimeStub frame
7240 
7241     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
7242                        // zlen => rcx
7243                        // r9 and r10 may be used to save non-volatile registers
7244     __ movptr(r8, rdx);
7245     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
7246 
7247     restore_arg_regs();
7248 
7249     __ leave(); // required for proper stackwalking of RuntimeStub frame
7250     __ ret(0);
7251 
7252     return start;
7253   }
7254 
7255   address generate_method_entry_barrier() {
7256     __ align(CodeEntryAlignment);
7257     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
7258 
7259     Label deoptimize_label;
7260 
7261     address start = __ pc();
7262 
7263     __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
7264 
7265     BLOCK_COMMENT("Entry:");
7266     __ enter(); // save rbp
7267 
7268     // save c_rarg0, because we want to use that value.
7269     // We could do without it but then we depend on the number of slots used by pusha
7270     __ push(c_rarg0);
7271 
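    // Stack layout at this point:
    //   [rsp + 0 * wordSize]  saved c_rarg0
    //   [rsp + 1 * wordSize]  saved rbp (from enter())
    //   [rsp + 2 * wordSize]  cookie (-1)
    //   [rsp + 3 * wordSize]  return address into the nmethod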
7272     __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
7273 
7274     __ pusha();
7275 
7276     // The method may have floats as arguments, and we must spill them before calling
7277     // the VM runtime.
7278     assert(Argument::n_float_register_parameters_j == 8, "Assumption");
7279     const int xmm_size = wordSize * 2;
7280     const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
7281     __ subptr(rsp, xmm_spill_size);
7282     __ movdqu(Address(rsp, xmm_size * 7), xmm7);
7283     __ movdqu(Address(rsp, xmm_size * 6), xmm6);
7284     __ movdqu(Address(rsp, xmm_size * 5), xmm5);
7285     __ movdqu(Address(rsp, xmm_size * 4), xmm4);
7286     __ movdqu(Address(rsp, xmm_size * 3), xmm3);
7287     __ movdqu(Address(rsp, xmm_size * 2), xmm2);
7288     __ movdqu(Address(rsp, xmm_size * 1), xmm1);
7289     __ movdqu(Address(rsp, xmm_size * 0), xmm0);
7290 
7291     __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);
7292 
7293     __ movdqu(xmm0, Address(rsp, xmm_size * 0));
7294     __ movdqu(xmm1, Address(rsp, xmm_size * 1));
7295     __ movdqu(xmm2, Address(rsp, xmm_size * 2));
7296     __ movdqu(xmm3, Address(rsp, xmm_size * 3));
7297     __ movdqu(xmm4, Address(rsp, xmm_size * 4));
7298     __ movdqu(xmm5, Address(rsp, xmm_size * 5));
7299     __ movdqu(xmm6, Address(rsp, xmm_size * 6));
7300     __ movdqu(xmm7, Address(rsp, xmm_size * 7));
7301     __ addptr(rsp, xmm_spill_size);
7302 
7303     __ cmpl(rax, 1); // 1 means deoptimize
7304     __ jcc(Assembler::equal, deoptimize_label);
7305 
7306     __ popa();
7307     __ pop(c_rarg0);
7308 
7309     __ leave();
7310 
7311     __ addptr(rsp, 1 * wordSize); // cookie
7312     __ ret(0);
7313 
7314 
7315     __ BIND(deoptimize_label);
7316 
7317     __ popa();
7318     __ pop(c_rarg0);
7319 
7320     __ leave();
7321 
7322     // This can be taken out, but is good for verification purposes: getting a SIGSEGV
7323     // here while still having a correct stack is valuable.
7324     __ testptr(rsp, Address(rsp, 0));
7325 
7326     __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
7327     __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point
7328 
7329     return start;
7330   }
7331 
7332   /**
7333    *  Arguments:
7334    *
7335    *  Input:
7336    *    c_rarg0   - out address
7337    *    c_rarg1   - in address
7338    *    c_rarg2   - offset
7339    *    c_rarg3   - len
7340    * not Win64
7341    *    c_rarg4   - k
7342    * Win64
7343    *    rsp+40    - k
7344    */
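  // Used by the BigInteger mulAdd intrinsic: roughly, multiply the int[] 'in' by the scalar k
  // and accumulate the product into 'out' starting at 'offset', returning the carry.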
7345   address generate_mulAdd() {
7346     __ align(CodeEntryAlignment);
7347     StubCodeMark mark(this, "StubRoutines", "mulAdd");
7348 
7349     address start = __ pc();
7350     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
7351     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
7352     const Register out     = rdi;
7353     const Register in      = rsi;
7354     const Register offset  = r11;
7355     const Register len     = rcx;
7356     const Register k       = r8;
7357 
7358     // Next registers will be saved on stack in mul_add().
7359     const Register tmp1  = r12;
7360     const Register tmp2  = r13;
7361     const Register tmp3  = r14;
7362     const Register tmp4  = r15;
7363     const Register tmp5  = rbx;
7364 
7365     BLOCK_COMMENT("Entry:");
7366     __ enter(); // required for proper stackwalking of RuntimeStub frame
7367 
7368     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
7369                        // len => rcx, k => r8
7370                        // r9 and r10 may be used to save non-volatile registers
7371 #ifdef _WIN64
7372     // last argument is on stack on Win64
7373     __ movl(k, Address(rsp, 6 * wordSize));
7374 #endif
7375     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
7376     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
7377 
7378     restore_arg_regs();
7379 
7380     __ leave(); // required for proper stackwalking of RuntimeStub frame
7381     __ ret(0);
7382 
7383     return start;
7384   }
7385 
7386   address generate_bigIntegerRightShift() {
7387     __ align(CodeEntryAlignment);
7388     StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
7389 
7390     address start = __ pc();
7391     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
7392     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
7393     const Register newArr = rdi;
7394     const Register oldArr = rsi;
7395     const Register newIdx = rdx;
7396     const Register shiftCount = rcx;  // shiftCount is deliberately in rcx since shift instructions use cl implicitly.
7397     const Register totalNumIter = r8;
7398 
7399     // On Windows, r9 and r10 are used as temps to save rdi and rsi, so they cannot be allocated as temps here.
7400     // On other platforms we prefer r9 and r10 since they do not have to be saved before use.
7401     const Register tmp1 = r11;                    // Caller save.
7402     const Register tmp2 = rax;                    // Caller save.
7403     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
7404     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
7405     const Register tmp5 = r14;                    // Callee save.
7406     const Register tmp6 = r15;
7407 
7408     const XMMRegister x0 = xmm0;
7409     const XMMRegister x1 = xmm1;
7410     const XMMRegister x2 = xmm2;
7411 
7412     BLOCK_COMMENT("Entry:");
7413     __ enter(); // required for proper stackwalking of RuntimeStub frame
7414 
7415 #ifdef _WINDOWS
7416     setup_arg_regs(4);
7417     // On Windows the last argument is passed on the stack, so move it into the appropriate register.
7418     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
7419     // Save callee save registers.
7420     __ push(tmp3);
7421     __ push(tmp4);
7422 #endif
7423     __ push(tmp5);
7424 
7425     // Rename temps used throughout the code.
7426     const Register idx = tmp1;
7427     const Register nIdx = tmp2;
7428 
7429     __ xorl(idx, idx);
7430 
7431     // Start the right shift from the end of the array.
7432     // For example, if #iterations = 4 and newIdx = 1
7433     // then dest[4] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
7434     // if #iterations = 4 and newIdx = 0
7435     // then dest[3] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
7436     __ movl(idx, totalNumIter);
7437     __ movl(nIdx, idx);
7438     __ addl(nIdx, newIdx);
7439 
7440     // If vectorization is enabled, check if the number of iterations is at least 64
7441     // If not, then go to ShiftTwo, which processes 2 iterations at a time
7442     if (VM_Version::supports_avx512_vbmi2()) {
7443       __ cmpptr(totalNumIter, (AVX3Threshold/64));
7444       __ jcc(Assembler::less, ShiftTwo);
7445 
7446       if (AVX3Threshold < 16 * 64) {
7447         __ cmpl(totalNumIter, 16);
7448         __ jcc(Assembler::less, ShiftTwo);
7449       }
7450       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
7451       __ subl(idx, 16);
7452       __ subl(nIdx, 16);
7453       __ BIND(Shift512Loop);
7454       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 4), Assembler::AVX_512bit);
7455       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7456       __ vpshrdvd(x2, x1, x0, Assembler::AVX_512bit);
7457       __ evmovdqul(Address(newArr, nIdx, Address::times_4), x2, Assembler::AVX_512bit);
7458       __ subl(nIdx, 16);
7459       __ subl(idx, 16);
7460       __ jcc(Assembler::greaterEqual, Shift512Loop);
7461       __ addl(idx, 16);
7462       __ addl(nIdx, 16);
7463     }
7464     __ BIND(ShiftTwo);
7465     __ cmpl(idx, 2);
7466     __ jcc(Assembler::less, ShiftOne);
7467     __ subl(idx, 2);
7468     __ subl(nIdx, 2);
7469     __ BIND(ShiftTwoLoop);
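    // shrdl(a, b) computes a = (a >>> shiftCount) | (b << (32 - shiftCount)), so each
    // destination limb combines the bits of two adjacent source limbs.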
7470     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 8));
7471     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7472     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7473     __ shrdl(tmp5, tmp4);
7474     __ shrdl(tmp4, tmp3);
7475     __ movl(Address(newArr, nIdx, Address::times_4, 4), tmp5);
7476     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7477     __ subl(nIdx, 2);
7478     __ subl(idx, 2);
7479     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7480     __ addl(idx, 2);
7481     __ addl(nIdx, 2);
7482 
7483     // Do the last iteration
7484     __ BIND(ShiftOne);
7485     __ cmpl(idx, 1);
7486     __ jcc(Assembler::less, Exit);
7487     __ subl(idx, 1);
7488     __ subl(nIdx, 1);
7489     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7490     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7491     __ shrdl(tmp4, tmp3);
7492     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7493     __ BIND(Exit);
7494     // Restore callee save registers.
7495     __ pop(tmp5);
7496 #ifdef _WINDOWS
7497     __ pop(tmp4);
7498     __ pop(tmp3);
7499     restore_arg_regs();
7500 #endif
7501     __ leave(); // required for proper stackwalking of RuntimeStub frame
7502     __ ret(0);
7503     return start;
7504   }
7505 
7506   /**
7507    *  Arguments:
7508    *
7509    *  Input:
7510    *    c_rarg0   - newArr address
7511    *    c_rarg1   - oldArr address
7512    *    c_rarg2   - newIdx
7513    *    c_rarg3   - shiftCount
7514    * not Win64
7515    *    c_rarg4   - numIter
7516    * Win64
7517    *    rsp+40    - numIter
7518    */
7519   address generate_bigIntegerLeftShift() {
7520     __ align(CodeEntryAlignment);
7521     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
7522     address start = __ pc();
7523     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
7524     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
7525     const Register newArr = rdi;
7526     const Register oldArr = rsi;
7527     const Register newIdx = rdx;
7528     const Register shiftCount = rcx;  // shiftCount is deliberately in rcx since shift instructions use cl implicitly.
7529     const Register totalNumIter = r8;
7530     // On Windows, r9 and r10 are used as temps to save rdi and rsi, so they cannot be allocated as temps here.
7531     // On other platforms we prefer r9 and r10 since they do not have to be saved before use.
7532     const Register tmp1 = r11;                    // Caller save.
7533     const Register tmp2 = rax;                    // Caller save.
7534     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
7535     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
7536     const Register tmp5 = r14;                    // Callee save.
7537 
7538     const XMMRegister x0 = xmm0;
7539     const XMMRegister x1 = xmm1;
7540     const XMMRegister x2 = xmm2;
7541     BLOCK_COMMENT("Entry:");
7542     __ enter(); // required for proper stackwalking of RuntimeStub frame
7543 
7544 #ifdef _WINDOWS
7545     setup_arg_regs(4);
7546     // On Windows the last argument is passed on the stack, so move it into the appropriate register.
7547     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
7548     // Save callee save registers.
7549     __ push(tmp3);
7550     __ push(tmp4);
7551 #endif
7552     __ push(tmp5);
7553 
7554     // Rename temps used throughout the code
7555     const Register idx = tmp1;
7556     const Register numIterTmp = tmp2;
7557 
7558     // Start idx from zero.
7559     __ xorl(idx, idx);
7560     // Compute the interior pointer for the new array so that the same index can be used for both the old and new arrays.
7561     __ lea(newArr, Address(newArr, newIdx, Address::times_4));
7562     __ movl(numIterTmp, totalNumIter);
7563 
7564     // If vectorization is enabled, check if the number of iterations is at least 64
7565     // If not, then go to ShiftTwo shifting two numbers at a time
7566     if (VM_Version::supports_avx512_vbmi2()) {
7567       __ cmpl(totalNumIter, (AVX3Threshold/64));
7568       __ jcc(Assembler::less, ShiftTwo);
7569 
7570       if (AVX3Threshold < 16 * 64) {
7571         __ cmpl(totalNumIter, 16);
7572         __ jcc(Assembler::less, ShiftTwo);
7573       }
7574       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
7575       __ subl(numIterTmp, 16);
7576       __ BIND(Shift512Loop);
7577       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7578       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 0x4), Assembler::AVX_512bit);
7579       __ vpshldvd(x1, x2, x0, Assembler::AVX_512bit);
7580       __ evmovdqul(Address(newArr, idx, Address::times_4), x1, Assembler::AVX_512bit);
7581       __ addl(idx, 16);
7582       __ subl(numIterTmp, 16);
7583       __ jcc(Assembler::greaterEqual, Shift512Loop);
7584       __ addl(numIterTmp, 16);
7585     }
7586     __ BIND(ShiftTwo);
7587     __ cmpl(totalNumIter, 1);
7588     __ jcc(Assembler::less, Exit);
7589     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7590     __ subl(numIterTmp, 2);
7591     __ jcc(Assembler::less, ShiftOne);
7592 
7593     __ BIND(ShiftTwoLoop);
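    // shldl(a, b) computes a = (a << shiftCount) | (b >>> (32 - shiftCount)), so each
    // destination limb takes its low-order bits from the next, less significant source limb.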
7594     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7595     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 0x8));
7596     __ shldl(tmp3, tmp4);
7597     __ shldl(tmp4, tmp5);
7598     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7599     __ movl(Address(newArr, idx, Address::times_4, 0x4), tmp4);
7600     __ movl(tmp3, tmp5);
7601     __ addl(idx, 2);
7602     __ subl(numIterTmp, 2);
7603     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7604 
7605     // Do the last iteration
7606     __ BIND(ShiftOne);
7607     __ addl(numIterTmp, 2);
7608     __ cmpl(numIterTmp, 1);
7609     __ jcc(Assembler::less, Exit);
7610     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7611     __ shldl(tmp3, tmp4);
7612     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7613 
7614     __ BIND(Exit);
7615     // Restore callee save registers.
7616     __ pop(tmp5);
7617 #ifdef _WINDOWS
7618     __ pop(tmp4);
7619     __ pop(tmp3);
7620     restore_arg_regs();
7621 #endif
7622     __ leave(); // required for proper stackwalking of RuntimeStub frame
7623     __ ret(0);
7624     return start;
7625   }
7626 
7627   address generate_libmExp() {
7628     StubCodeMark mark(this, "StubRoutines", "libmExp");
7629 
7630     address start = __ pc();
7631 
7632     const XMMRegister x0  = xmm0;
7633     const XMMRegister x1  = xmm1;
7634     const XMMRegister x2  = xmm2;
7635     const XMMRegister x3  = xmm3;
7636 
7637     const XMMRegister x4  = xmm4;
7638     const XMMRegister x5  = xmm5;
7639     const XMMRegister x6  = xmm6;
7640     const XMMRegister x7  = xmm7;
7641 
7642     const Register tmp   = r11;
7643 
7644     BLOCK_COMMENT("Entry:");
7645     __ enter(); // required for proper stackwalking of RuntimeStub frame
7646 
7647     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7648 
7649     __ leave(); // required for proper stackwalking of RuntimeStub frame
7650     __ ret(0);
7651 
7652     return start;
7653 
7654   }
7655 
7656   address generate_libmLog() {
7657     StubCodeMark mark(this, "StubRoutines", "libmLog");
7658 
7659     address start = __ pc();
7660 
7661     const XMMRegister x0 = xmm0;
7662     const XMMRegister x1 = xmm1;
7663     const XMMRegister x2 = xmm2;
7664     const XMMRegister x3 = xmm3;
7665 
7666     const XMMRegister x4 = xmm4;
7667     const XMMRegister x5 = xmm5;
7668     const XMMRegister x6 = xmm6;
7669     const XMMRegister x7 = xmm7;
7670 
7671     const Register tmp1 = r11;
7672     const Register tmp2 = r8;
7673 
7674     BLOCK_COMMENT("Entry:");
7675     __ enter(); // required for proper stackwalking of RuntimeStub frame
7676 
7677     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
7678 
7679     __ leave(); // required for proper stackwalking of RuntimeStub frame
7680     __ ret(0);
7681 
7682     return start;
7683 
7684   }
7685 
7686   address generate_libmLog10() {
7687     StubCodeMark mark(this, "StubRoutines", "libmLog10");
7688 
7689     address start = __ pc();
7690 
7691     const XMMRegister x0 = xmm0;
7692     const XMMRegister x1 = xmm1;
7693     const XMMRegister x2 = xmm2;
7694     const XMMRegister x3 = xmm3;
7695 
7696     const XMMRegister x4 = xmm4;
7697     const XMMRegister x5 = xmm5;
7698     const XMMRegister x6 = xmm6;
7699     const XMMRegister x7 = xmm7;
7700 
7701     const Register tmp = r11;
7702 
7703     BLOCK_COMMENT("Entry:");
7704     __ enter(); // required for proper stackwalking of RuntimeStub frame
7705 
7706     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7707 
7708     __ leave(); // required for proper stackwalking of RuntimeStub frame
7709     __ ret(0);
7710 
7711     return start;
7712 
7713   }
7714 
7715   address generate_libmPow() {
7716     StubCodeMark mark(this, "StubRoutines", "libmPow");
7717 
7718     address start = __ pc();
7719 
7720     const XMMRegister x0 = xmm0;
7721     const XMMRegister x1 = xmm1;
7722     const XMMRegister x2 = xmm2;
7723     const XMMRegister x3 = xmm3;
7724 
7725     const XMMRegister x4 = xmm4;
7726     const XMMRegister x5 = xmm5;
7727     const XMMRegister x6 = xmm6;
7728     const XMMRegister x7 = xmm7;
7729 
7730     const Register tmp1 = r8;
7731     const Register tmp2 = r9;
7732     const Register tmp3 = r10;
7733     const Register tmp4 = r11;
7734 
7735     BLOCK_COMMENT("Entry:");
7736     __ enter(); // required for proper stackwalking of RuntimeStub frame
7737 
7738     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7739 
7740     __ leave(); // required for proper stackwalking of RuntimeStub frame
7741     __ ret(0);
7742 
7743     return start;
7744 
7745   }
7746 
7747   address generate_libmSin() {
7748     StubCodeMark mark(this, "StubRoutines", "libmSin");
7749 
7750     address start = __ pc();
7751 
7752     const XMMRegister x0 = xmm0;
7753     const XMMRegister x1 = xmm1;
7754     const XMMRegister x2 = xmm2;
7755     const XMMRegister x3 = xmm3;
7756 
7757     const XMMRegister x4 = xmm4;
7758     const XMMRegister x5 = xmm5;
7759     const XMMRegister x6 = xmm6;
7760     const XMMRegister x7 = xmm7;
7761 
7762     const Register tmp1 = r8;
7763     const Register tmp2 = r9;
7764     const Register tmp3 = r10;
7765     const Register tmp4 = r11;
7766 
7767     BLOCK_COMMENT("Entry:");
7768     __ enter(); // required for proper stackwalking of RuntimeStub frame
7769 
7770 #ifdef _WIN64
7771     __ push(rsi);
7772     __ push(rdi);
7773 #endif
7774     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7775 
7776 #ifdef _WIN64
7777     __ pop(rdi);
7778     __ pop(rsi);
7779 #endif
7780 
7781     __ leave(); // required for proper stackwalking of RuntimeStub frame
7782     __ ret(0);
7783 
7784     return start;
7785 
7786   }
7787 
7788   address generate_libmCos() {
7789     StubCodeMark mark(this, "StubRoutines", "libmCos");
7790 
7791     address start = __ pc();
7792 
7793     const XMMRegister x0 = xmm0;
7794     const XMMRegister x1 = xmm1;
7795     const XMMRegister x2 = xmm2;
7796     const XMMRegister x3 = xmm3;
7797 
7798     const XMMRegister x4 = xmm4;
7799     const XMMRegister x5 = xmm5;
7800     const XMMRegister x6 = xmm6;
7801     const XMMRegister x7 = xmm7;
7802 
7803     const Register tmp1 = r8;
7804     const Register tmp2 = r9;
7805     const Register tmp3 = r10;
7806     const Register tmp4 = r11;
7807 
7808     BLOCK_COMMENT("Entry:");
7809     __ enter(); // required for proper stackwalking of RuntimeStub frame
7810 
7811 #ifdef _WIN64
7812     __ push(rsi);
7813     __ push(rdi);
7814 #endif
7815     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7816 
7817 #ifdef _WIN64
7818     __ pop(rdi);
7819     __ pop(rsi);
7820 #endif
7821 
7822     __ leave(); // required for proper stackwalking of RuntimeStub frame
7823     __ ret(0);
7824 
7825     return start;
7826 
7827   }
7828 
7829   address generate_libmTan() {
7830     StubCodeMark mark(this, "StubRoutines", "libmTan");
7831 
7832     address start = __ pc();
7833 
7834     const XMMRegister x0 = xmm0;
7835     const XMMRegister x1 = xmm1;
7836     const XMMRegister x2 = xmm2;
7837     const XMMRegister x3 = xmm3;
7838 
7839     const XMMRegister x4 = xmm4;
7840     const XMMRegister x5 = xmm5;
7841     const XMMRegister x6 = xmm6;
7842     const XMMRegister x7 = xmm7;
7843 
7844     const Register tmp1 = r8;
7845     const Register tmp2 = r9;
7846     const Register tmp3 = r10;
7847     const Register tmp4 = r11;
7848 
7849     BLOCK_COMMENT("Entry:");
7850     __ enter(); // required for proper stackwalking of RuntimeStub frame
7851 
7852 #ifdef _WIN64
7853     __ push(rsi);
7854     __ push(rdi);
7855 #endif
7856     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7857 
7858 #ifdef _WIN64
7859     __ pop(rdi);
7860     __ pop(rsi);
7861 #endif
7862 
7863     __ leave(); // required for proper stackwalking of RuntimeStub frame
7864     __ ret(0);
7865 
7866     return start;
7867 
7868   }
7869 
7870   RuntimeStub* generate_cont_doYield() {
7871     const char *name = "cont_doYield";
7872 
7873     enum layout {
7874       rbp_off,
7875       rbpH_off,
7876       return_off,
7877       return_off2,
7878       framesize // inclusive of return address
7879     };
7880     // assert(is_even(framesize/2), "sp not 16-byte aligned");
7881     
7882     int insts_size = 512;
7883     int locs_size  = 64;
7884     CodeBuffer code(name, insts_size, locs_size);
7885     OopMapSet* oop_maps  = new OopMapSet();
7886     MacroAssembler* masm = new MacroAssembler(&code);
7887     MacroAssembler* _masm = masm;
7888 
7889     address start = __ pc();
7890 
7891     __ enter();
7892 
7893     __ movptr(c_rarg1, rsp);
7894 
7895     int frame_complete = __ pc() - start;
7896     address the_pc = __ pc();
7897 
7898     __ post_call_nop(); // This must come immediately after the pc value that is pushed into the frame info; the nop is used for fast CodeBlob lookup
7899 
7900     if (ContPerfTest > 5) {
7901       __ movptr(c_rarg0, r15_thread);
7902       __ set_last_Java_frame(rsp, rbp, the_pc);
7903 
7904       __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::freeze), 2);
7905       
7906       __ reset_last_Java_frame(true);
7907     }
7908 
7909     Label pinned;
7910 
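         // When ContPerfTest skips the freeze call above, force rax to zero so that we
         // take the not-pinned path below.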
7911     if (ContPerfTest <= 5) { __ xorq(rax, rax); }
7912     __ testq(rax, rax);
7913     __ jcc(Assembler::notZero, pinned);
7914 
7915     __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
7916     continuation_enter_cleanup(masm);
7917     __ pop(rbp);
7918     __ ret(0);
7919 
7920     __ bind(pinned); // pinned -- return to caller
7921 
7922     __ leave();
7923     __ ret(0);
7924 
7925     OopMap* map = new OopMap(framesize, 1);
7926     // map->set_callee_saved(VMRegImpl::stack2reg(rbp_off), rbp->as_VMReg());
7927     oop_maps->add_gc_map(the_pc - start, map);
7928 
7929     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7930     RuntimeStub::new_runtime_stub(name,
7931                                   &code,
7932                                   frame_complete,
7933                                   (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7934                                   oop_maps, false);
7935     return stub;
7936   }
7937 
7938   address generate_cont_jump_from_safepoint() {
7939     StubCodeMark mark(this, "StubRoutines","Continuation jump from safepoint");
7940 
7941     address start = __ pc();
7942 
7943     __ get_thread(r15_thread);
7944     __ reset_last_Java_frame(true); // false would also be fine here
7945     __ reinit_heapbase();
7946
7947     __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
7948     continuation_enter_cleanup(_masm);
7949     __ pop(rbp);
7950     __ ret(0);
7951 
7952     return start;
7953   }
7954 
7955   address generate_cont_thaw(bool return_barrier, bool exception) {
7956     assert (return_barrier || !exception, "must be");
7957 
7958     address start = __ pc();
7959 
7960     // TODO: Handle Valhalla return types. May require generating different return barriers.
7961 
7962     if (!return_barrier) {
7963       __ pop(c_rarg3); // pop return address; otherwise we get a drift where the bottom-most frozen frame keeps growing
7964     } else {
7965       __ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
7966     }
7967     assert_asm(_masm, cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())), Assembler::equal, "incorrect rsp");
7968 
7969     if (return_barrier) {
7970       __ push(rax); __ push_d(xmm0); // preserve possible return value from a method returning to the return barrier
7971     }
7972 
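         // c_rarg1 encodes how we got here: 0 = plain thaw, 1 = return barrier,
         // 2 = return barrier with a pending exception.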
7973     __ movl(c_rarg1, (return_barrier ? 1 : 0) + (exception ? 1 : 0));
7974     if (ContPerfTest > 105) {
7975       __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), r15_thread, c_rarg1);
7976       __ movptr(rbx, rax); // rax contains the size of the frames to thaw, 0 if overflow or no more frames
7977     } else {
7978       __ xorq(rbx, rbx);
7979     }
7980     if (return_barrier) {
7981       __ pop_d(xmm0); __ pop(rax); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7982     }
7983     assert_asm(_masm, cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset())), Assembler::equal, "incorrect rsp");
7984   // #ifdef ASSERT
7985   //   __ lea(rcx, Address(rsp, wordSize));
7986   //   assert_asm(_masm, cmpptr(rcx, Address(r15_thread, JavaThread::cont_entry_offset())), Assembler::equal, "incorrect rsp");
7987   // #endif
7988 
7989     Label thaw_success;
7990     __ testq(rbx, rbx);           // rbx contains the size of the frames to thaw, 0 if overflow or no more frames
7991     __ jcc(Assembler::notZero, thaw_success);
7992     __ jump(ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
7993     __ bind(thaw_success);
7994 
7995     __ subq(rsp, rbx);             // make room for the thawed frames
7996     __ andptr(rsp, -16);           // align
7997
7998     if (return_barrier) {
7999       __ push(rax); __ push_d(xmm0); // save original return value -- again
8000     }
8001 
8002     __ movl(c_rarg1, (return_barrier ? 1 : 0) + (exception ? 1 : 0));
8003     if (ContPerfTest > 112) {
8004       __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::thaw), r15_thread, c_rarg1);
8005     }
8006     __ movptr(rbx, rax); // rax is the sp of the yielding frame
8007 
8008     if (return_barrier) {
8009       __ pop_d(xmm0); __ pop(rax); // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
8010     } else {
8011       __ movl(rax, 0); // return 0 (success) from doYield
8012     }
8013 
8014     __ movptr(rsp, rbx); // we're now on the yield frame (which is at an address above us because rsp has been pushed down)
8015     __ subptr(rsp, 2*wordSize); // now pointing to rbp spill
8016 
8017     if (exception) {
8018       __ movptr(c_rarg1, Address(rsp, wordSize)); // return address
8019       __ push(rax); // save return value containing the exception oop
8020       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), r15_thread, c_rarg1);
8021       __ movptr(rbx, rax); // the exception handler
8022       __ pop(rax); // restore return value containing the exception oop
8023       __ pop(rbp);
8024       __ pop(rdx); // rdx must contain the original pc in the case of exception; see OptoRuntime::generate_exception_blob
8025       __ jmp(rbx); // the exception handler
8026     }
8027 
8028     // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
8029     __ pop(rbp);
8030     __ ret(0);
8031 
8032     return start;
8033   }
8034 
8035   address generate_cont_thaw() {
8036     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
8037     address start = __ pc();
8038     generate_cont_thaw(false, false);
8039     return start;
8040   }
8041 
8042   address generate_cont_returnBarrier() {
8043     // TODO: will probably need multiple return barriers depending on return type
8044     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
8045     address start = __ pc();
8046 
8047     generate_cont_thaw(true, false);
8048 
8049     return start;
8050   }
8051 
8052   address generate_cont_returnBarrier_exception() {
8053     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
8054     address start = __ pc();
8055 
8056     generate_cont_thaw(true, true);
8057 
8058     return start;
8059   }
8060 
8061   address generate_cont_interpreter_forced_preempt_return() {
8062     StubCodeMark mark(this, "StubRoutines", "cont interpreter forced preempt return");
8063     address start = __ pc();
8064
8065     // This is necessary for forced yields, as the return address (in rbx) is captured in a call_VM, which skips the restoration of rbcp and locals
8066     // see InterpreterMacroAssembler::restore_bcp/restore_locals
8067     // TODO: use InterpreterMacroAssembler
8068     static const Register _locals_register = r14;
8069     static const Register _bcp_register    = r13;
8070
8071     __ pop(rbp);
8072
8073     __ movptr(_bcp_register,    Address(rbp, frame::interpreter_frame_bcp_offset    * wordSize));
8074     __ movptr(_locals_register, Address(rbp, frame::interpreter_frame_locals_offset * wordSize));
8075     // __ reinit_heapbase();
8076
8077     __ ret(0);
8078
8079     return start;
8080   }
8081 
8082 #if INCLUDE_JFR
8083 
8084   static void jfr_set_last_java_frame(MacroAssembler* _masm) {
8085     Register last_java_pc = c_rarg0;
8086     Register last_java_sp = c_rarg2;
8087     __ movptr(last_java_pc, Address(rsp, 0));
8088     __ lea(last_java_sp, Address(rsp, wordSize));
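         // Zero the upper bits of the YMM/ZMM registers before calling into the VM
         // (avoids AVX-SSE transition penalties).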
8089     __ vzeroupper();
8090     Address anchor_java_pc(r15_thread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
8091     __ movptr(anchor_java_pc, last_java_pc);
8092     __ movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
8093   }
8094 
8095   static void jfr_prologue(MacroAssembler* _masm) {
8096     jfr_set_last_java_frame(_masm);
8097     __ movptr(c_rarg0, r15_thread);
8098   }
8099 
8100   // The handle is dereferenced here using the correct load constructs.
8101   static void jfr_epilogue(MacroAssembler* _masm) {
8102     __ reset_last_Java_frame(false);
8103     Label null_jobject;
8104     __ testq(rax, rax);
8105     __ jcc(Assembler::zero, null_jobject);
8106     DecoratorSet decorators = ACCESS_READ | IN_NATIVE;
8107     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
8108     bs->load_at(_masm, decorators, T_OBJECT, rax, Address(rax, 0), c_rarg1, r15_thread);
8109     __ bind(null_jobject);
8110   }
8111 
8112   // For c2: c_rarg0 is junk, c_rarg1 is the thread id. Call to runtime to write a checkpoint.
8113   // Runtime will return a jobject handle to the event writer. The handle is dereferenced and the return value
8114   // is the event writer oop.
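       // Roughly, in pseudo-C++ (a sketch of the flow only, not the generated code;
       // argument meanings are taken from the comment above):
       //   jobject handle = JFR_WRITE_CHECKPOINT_FUNCTION(thread, thread_id);
       //   return handle == NULL ? NULL : *(oop*)handle; // dereferenced via the GC barrier in jfr_epilogue()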
8115   address generate_jfr_write_checkpoint() {
8116     StubCodeMark mark(this, "jfr_write_checkpoint", "JFR C2 support for Virtual Threads");
8117 
8118     address start = __ pc();
8119     jfr_prologue(_masm);
8120     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JFR_WRITE_CHECKPOINT_FUNCTION), 2);
8121     jfr_epilogue(_masm);
8122     __ ret(0);
8123 
8124     return start;
8125   }
8126 
8127   // For c1: call the corresponding runtime routine, it returns a jobject handle to the event writer.
8128   // The handle is dereferenced and the return value is the event writer oop.
8129   address generate_jfr_get_event_writer() {
8130     StubCodeMark mark(this, "jfr_get_event_writer", "JFR C1 support for Virtual Threads");
8131     address start = __ pc();
8132 
8133     jfr_prologue(_masm);
8134     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JFR_GET_EVENT_WRITER_FUNCTION), 1);
8135     jfr_epilogue(_masm);
8136     __ ret(0);
8137 
8138     return start;
8139   }
8140 
8141 #endif // INCLUDE_JFR
8142 
8143 #undef __
8144 #define __ masm->
8145 
8146   // Continuation point for throwing of implicit exceptions that are
8147   // not handled in the current activation. Fabricates an exception
8148   // oop and initiates normal exception dispatching in this
8149   // frame. Since we need to preserve callee-saved values (currently
8150   // only for C2, but done for C1 as well) we need a callee-saved oop
8151   // map and therefore have to make these stubs into RuntimeStubs
8152   // rather than BufferBlobs.  If the compiler needs all registers to
8153   // be preserved between the fault point and the exception handler
8154   // then it must assume responsibility for that in
8155   // AbstractCompiler::continuation_for_implicit_null_exception or
8156   // continuation_for_implicit_division_by_zero_exception. All other
8157   // implicit exceptions (e.g., NullPointerException or
8158   // AbstractMethodError on entry) are either at call sites or
8159   // otherwise assume that stack unwinding will be initiated, so
8160   // caller saved registers were assumed volatile in the compiler.
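       // For example, generate_initial() below uses this to build the stack-overflow stub:
       //   StubRoutines::_throw_StackOverflowError_entry =
       //     generate_throw_exception("StackOverflowError throw_exception",
       //                              CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));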
8161   address generate_throw_exception(const char* name,
8162                                    address runtime_entry,
8163                                    Register arg1 = noreg,
8164                                    Register arg2 = noreg) {
8165     // Information about frame layout at time of blocking runtime call.
8166     // Note that we only have to preserve callee-saved registers since
8167     // the compilers are responsible for supplying a continuation point
8168     // if they expect all registers to be preserved.
8169     enum layout {
8170       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
8171       rbp_off2,
8172       return_off,
8173       return_off2,
8174       framesize // inclusive of return address
8175     };
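         // Offsets are 32-bit VMReg slots; frame::arg_reg_save_area_bytes is the Win64
         // register-argument shadow space (zero on other platforms), so rbp and the
         // return address sit above that area.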
8176 
8177     int insts_size = 512;
8178     int locs_size  = 64;
8179 
8180     CodeBuffer code(name, insts_size, locs_size);
8181     OopMapSet* oop_maps  = new OopMapSet();
8182     MacroAssembler* masm = new MacroAssembler(&code);
8183 
8184     address start = __ pc();
8185 
8186     // This is an inlined and slightly modified version of call_VM
8187     // which has the ability to fetch the return PC out of
8188     // thread-local storage and also sets up last_Java_sp slightly
8189     // differently than the real call_VM
8190 
8191     __ enter(); // required for proper stackwalking of RuntimeStub frame
8192 
8193     assert(is_even(framesize/2), "sp not 16-byte aligned");
8194 
8195     // return address and rbp are already in place
8196     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
8197 
8198     int frame_complete = __ pc() - start;
8199 
8200     // Set up last_Java_sp and last_Java_fp
8201     address the_pc = __ pc();
8202     __ set_last_Java_frame(rsp, rbp, the_pc);
8203     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
8204 
8205     // Call runtime
8206     if (arg1 != noreg) {
8207       assert(arg2 != c_rarg1, "clobbered");
8208       __ movptr(c_rarg1, arg1);
8209     }
8210     if (arg2 != noreg) {
8211       __ movptr(c_rarg2, arg2);
8212     }
8213     __ movptr(c_rarg0, r15_thread);
8214     BLOCK_COMMENT("call runtime_entry");
8215     __ call(RuntimeAddress(runtime_entry));
8216 
8217     // Generate oop map
8218     OopMap* map = new OopMap(framesize, 0);
8219 
8220     oop_maps->add_gc_map(the_pc - start, map);
8221 
8222     __ reset_last_Java_frame(true);
8223 
8224     __ leave(); // required for proper stackwalking of RuntimeStub frame
8225 
8226     // check for pending exceptions
8227 #ifdef ASSERT
8228     Label L;
8229     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
8230             (int32_t) NULL_WORD);
8231     __ jcc(Assembler::notEqual, L);
8232     __ should_not_reach_here();
8233     __ bind(L);
8234 #endif // ASSERT
8235     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
8236 
8237 
8238     // codeBlob framesize is in words (not VMRegImpl::slot_size)
8239     RuntimeStub* stub =
8240       RuntimeStub::new_runtime_stub(name,
8241                                     &code,
8242                                     frame_complete,
8243                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
8244                                     oop_maps, false);
8245     return stub->entry_point();
8246   }
8247 
8248   void create_control_words() {
8249     // Round to nearest, all exceptions masked
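         // 0x1F80 sets the six exception mask bits (IM, DM, ZM, OM, UM, PM; bits 7-12)
         // and leaves RC = 00 (round to nearest) with FZ and DAZ clear.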
8250     StubRoutines::x86::_mxcsr_std = 0x1F80;
8251   }
8252 
8253   // Initialization
8254   void generate_initial() {
8255     // Generates all stubs and initializes the entry points
8256 
8257     // These platform-specific settings are needed by generate_call_stub()
8258     create_control_words();
8259 
8260     // entry points that exist in all platforms. Note: This is code
8261     // that could be shared among different platforms - however the
8262     // benefit seems to be smaller than the disadvantage of having a
8263     // much more complicated generator structure. See also comment in
8264     // stubRoutines.hpp.
8265 
8266     StubRoutines::_forward_exception_entry = generate_forward_exception();
8267 
8268     StubRoutines::_call_stub_entry =
8269       generate_call_stub(StubRoutines::_call_stub_return_address);
8270 
8271     // is referenced by megamorphic call
8272     StubRoutines::_catch_exception_entry = generate_catch_exception();
8273 
8274     // atomic calls
8275     StubRoutines::_fence_entry                = generate_orderaccess_fence();
8276 
8277     // platform dependent
8278     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
8279 
8280     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
8281 
8282     StubRoutines::x86::_f2i_fixup             = generate_f2i_fixup();
8283     StubRoutines::x86::_f2l_fixup             = generate_f2l_fixup();
8284     StubRoutines::x86::_d2i_fixup             = generate_d2i_fixup();
8285     StubRoutines::x86::_d2l_fixup             = generate_d2l_fixup();
8286 
8287     StubRoutines::x86::_float_sign_mask       = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
8288     StubRoutines::x86::_float_sign_flip       = generate_fp_mask("float_sign_flip",  0x8000000080000000);
8289     StubRoutines::x86::_double_sign_mask      = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
8290     StubRoutines::x86::_double_sign_flip      = generate_fp_mask("double_sign_flip", 0x8000000000000000);
8291 
8292     // Build this early so it's available for the interpreter.
8293     StubRoutines::_throw_StackOverflowError_entry =
8294       generate_throw_exception("StackOverflowError throw_exception",
8295                                CAST_FROM_FN_PTR(address,
8296                                                 SharedRuntime::
8297                                                 throw_StackOverflowError));
8298     StubRoutines::_throw_delayed_StackOverflowError_entry =
8299       generate_throw_exception("delayed StackOverflowError throw_exception",
8300                                CAST_FROM_FN_PTR(address,
8301                                                 SharedRuntime::
8302                                                 throw_delayed_StackOverflowError));
8303     if (UseCRC32Intrinsics) {
8304       // set table address before stub generation which uses it
8305       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
8306       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8307     }
8308 
8309     if (UseCRC32CIntrinsics) {
8310       bool supports_clmul = VM_Version::supports_clmul();
8311       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
8312       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
8313       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
8314     }
8315 
8316     if (UseAdler32Intrinsics) {
8317        StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8318     }
8319 
8320     if (UseLibmIntrinsic && InlineIntrinsics) {
8321       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
8322           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
8323           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
8324         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
8325         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
8326         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
8327         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
8328         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
8329         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
8330         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
8331         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
8332         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
8333         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
8334         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
8335         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
8336         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
8337         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
8338       }
8339       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
8340         StubRoutines::_dexp = generate_libmExp();
8341       }
8342       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
8343         StubRoutines::_dlog = generate_libmLog();
8344       }
8345       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
8346         StubRoutines::_dlog10 = generate_libmLog10();
8347       }
8348       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
8349         StubRoutines::_dpow = generate_libmPow();
8350       }
8351       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8352         StubRoutines::_dsin = generate_libmSin();
8353       }
8354       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8355         StubRoutines::_dcos = generate_libmCos();
8356       }
8357       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
8358         StubRoutines::_dtan = generate_libmTan();
8359       }
8360     }
8361 
8362     // Safefetch stubs.
8363     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
8364                                                        &StubRoutines::_safefetch32_fault_pc,
8365                                                        &StubRoutines::_safefetch32_continuation_pc);
8366     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
8367                                                        &StubRoutines::_safefetchN_fault_pc,
8368                                                        &StubRoutines::_safefetchN_continuation_pc);
8369   }
8370 
8371   void generate_phase1() {
8372     // Continuation stubs:
8373     StubRoutines::_cont_thaw          = generate_cont_thaw();
8374     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8375     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8376     StubRoutines::_cont_doYield_stub = generate_cont_doYield();
8377     StubRoutines::_cont_doYield    = StubRoutines::_cont_doYield_stub->entry_point();
8378     StubRoutines::_cont_jump_from_sp = generate_cont_jump_from_safepoint();
8379     StubRoutines::_cont_interpreter_forced_preempt_return = generate_cont_interpreter_forced_preempt_return();
8380 
8381     JFR_ONLY(StubRoutines::_jfr_write_checkpoint = generate_jfr_write_checkpoint();)
8382     JFR_ONLY(StubRoutines::_jfr_get_event_writer = generate_jfr_get_event_writer();)
8383   }
8384 
8385   void generate_all() {
8386     // Generates all stubs and initializes the entry points
8387 
8388     // These entry points require SharedInfo::stack0 to be set up in
8389     // non-core builds and need to be relocatable, so they each
8390     // fabricate a RuntimeStub internally.
8391     StubRoutines::_throw_AbstractMethodError_entry =
8392       generate_throw_exception("AbstractMethodError throw_exception",
8393                                CAST_FROM_FN_PTR(address,
8394                                                 SharedRuntime::
8395                                                 throw_AbstractMethodError));
8396 
8397     StubRoutines::_throw_IncompatibleClassChangeError_entry =
8398       generate_throw_exception("IncompatibleClassChangeError throw_exception",
8399                                CAST_FROM_FN_PTR(address,
8400                                                 SharedRuntime::
8401                                                 throw_IncompatibleClassChangeError));
8402 
8403     StubRoutines::_throw_NullPointerException_at_call_entry =
8404       generate_throw_exception("NullPointerException at call throw_exception",
8405                                CAST_FROM_FN_PTR(address,
8406                                                 SharedRuntime::
8407                                                 throw_NullPointerException_at_call));
8408 
8409     // entry points that are platform specific
8410     StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
8411     StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000);
8412     StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
8413     StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000);
8414     StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF);
8415     StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
8416     StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
8417     StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff000000ff);
8418     StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff0000ffff);
8419     StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
8420                                                                         0xFFFFFFFF, 0, 0, 0);
8421     StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
8422                                                                         0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
8423     StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x0302010003020100);
8424     StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
8425     StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x0100010001000100);
8426     StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask("vector_long_shuffle_mask", 0x0000000100000000);
8427     StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
8428     StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
8429 
8430     // support for verify_oop (must happen after universe_init)
8431     if (VerifyOops) {
8432       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
8433     }
8434 
8435     // data cache line writeback
8436     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8437     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8438 
8439     // arraycopy stubs used by compilers
8440     generate_arraycopy_stubs();
8441 
8442     // don't bother generating these AES intrinsic stubs unless global flag is set
8443     if (UseAESIntrinsics) {
8444       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
8445       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8446       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8447       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8448       if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
8449         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
8450         StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt();
8451         StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt();
8452         StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
8453         StubRoutines::x86::_ghash_poly512_addr = ghash_polynomial512_addr();
8454         StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
8455         StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8456       } else {
8457         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
8458       }
8459     }
8460 
8461     if (UseAESCTRIntrinsics) {
8462       if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512bw() && VM_Version::supports_avx512vl()) {
8463         if (StubRoutines::x86::_counter_mask_addr == NULL) {
8464           StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
8465         }
8466         StubRoutines::_counterMode_AESCrypt = generate_counterMode_VectorAESCrypt();
8467       } else {
8468         StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
8469         StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
8470       }
8471     }
8472 
8473     if (UseMD5Intrinsics) {
8474       StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
8475       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
8476     }
8477     if (UseSHA1Intrinsics) {
8478       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
8479       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
8480       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
8481       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
8482     }
8483     if (UseSHA256Intrinsics) {
8484       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
8485       char* dst = (char*)StubRoutines::x86::_k256_W;
8486       char* src = (char*)StubRoutines::x86::_k256;
8487       for (int ii = 0; ii < 16; ++ii) {
8488         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
8489         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
8490       }
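           // Each 16-byte group of K constants now appears twice per 32-byte row of
           // _k256_W, i.e. duplicated into both 128-bit lanes for 256-bit loads.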
8491       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
8492       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
8493       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
8494       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
8495     }
8496     if (UseSHA512Intrinsics) {
8497       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
8498       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
8499       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
8500       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
8501     }
8502 
8503     // Generate GHASH intrinsics code
8504     if (UseGHASHIntrinsics) {
8505       if (StubRoutines::x86::_ghash_long_swap_mask_addr == NULL) {
8506         StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
8507       }
8508       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
8509       if (VM_Version::supports_avx()) {
8510         StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
8511         StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
8512         StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
8513       } else {
8514         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8515       }
8516     }
8517 
8518 
8519     if (UseBASE64Intrinsics) {
8520       if (VM_Version::supports_avx2() &&
8521          VM_Version::supports_avx512bw() &&
8522          VM_Version::supports_avx512vl()) {
8523         StubRoutines::x86::_avx2_shuffle_base64 = base64_avx2_shuffle_addr();
8524         StubRoutines::x86::_avx2_input_mask_base64 = base64_avx2_input_mask_addr();
8525         StubRoutines::x86::_avx2_lut_base64 = base64_avx2_lut_addr();
8526       }
8527       StubRoutines::x86::_encoding_table_base64 = base64_encoding_table_addr();
8528       if (VM_Version::supports_avx512_vbmi()) {
8529         StubRoutines::x86::_shuffle_base64 = base64_shuffle_addr();
8530         StubRoutines::x86::_lookup_lo_base64 = base64_vbmi_lookup_lo_addr();
8531         StubRoutines::x86::_lookup_hi_base64 = base64_vbmi_lookup_hi_addr();
8532         StubRoutines::x86::_lookup_lo_base64url = base64_vbmi_lookup_lo_url_addr();
8533         StubRoutines::x86::_lookup_hi_base64url = base64_vbmi_lookup_hi_url_addr();
8534         StubRoutines::x86::_pack_vec_base64 = base64_vbmi_pack_vec_addr();
8535         StubRoutines::x86::_join_0_1_base64 = base64_vbmi_join_0_1_addr();
8536         StubRoutines::x86::_join_1_2_base64 = base64_vbmi_join_1_2_addr();
8537         StubRoutines::x86::_join_2_3_base64 = base64_vbmi_join_2_3_addr();
8538       }
8539       StubRoutines::x86::_decoding_table_base64 = base64_decoding_table_addr();
8540       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8541       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8542     }
8543 
8544     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8545     if (bs_nm != NULL) {
8546       StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
8547     }
8548 #ifdef COMPILER2
8549     if (UseMultiplyToLenIntrinsic) {
8550       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8551     }
8552     if (UseSquareToLenIntrinsic) {
8553       StubRoutines::_squareToLen = generate_squareToLen();
8554     }
8555     if (UseMulAddIntrinsic) {
8556       StubRoutines::_mulAdd = generate_mulAdd();
8557     }
8558     if (VM_Version::supports_avx512_vbmi2()) {
8559       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8560       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
8561     }
8562     if (UseMontgomeryMultiplyIntrinsic) {
8563       StubRoutines::_montgomeryMultiply
8564         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
8565     }
8566     if (UseMontgomerySquareIntrinsic) {
8567       StubRoutines::_montgomerySquare
8568         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
8569     }
8570 
8571     // Get svml stub routine addresses
8572     void *libsvml = NULL;
8573     char ebuf[1024];
8574     char dll_name[JVM_MAXPATHLEN];
8575     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "svml")) {
8576       libsvml = os::dll_load(dll_name, ebuf, sizeof ebuf);
8577     }
8578     if (libsvml != NULL) {
8579       // SVML method naming convention
8580       //   All the methods are named as __svml_op<T><N>_ha_<VV>
8581       //   Where:
8582       //      ha stands for high accuracy
8583       //      <T> is optional to indicate float/double
8584       //              Set to f for vector float operation
8585       //              Omitted for vector double operation
8586       //      <N> is the number of elements in the vector
8587       //              1, 2, 4, 8, 16
8588       //              e.g. 128 bit float vector has 4 float elements
8589       //      <VV> indicates the avx/sse level:
8590       //              z0 is AVX512, l9 is AVX2, e9 is AVX1 and ex is for SSE2
8591       //      e.g. __svml_expf16_ha_z0 is the method for computing 16 element vector float exp using AVX 512 insns
8592       //           __svml_exp8_ha_z0 is the method for computing 8 element vector double exp using AVX 512 insns
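           //      For instance, with avx_sse_str == "l9" (AVX2) the loops below look up names like
           //      __svml_expf8_ha_l9 (8 element float vector) and __svml_exp4_ha_l9 (4 element double vector)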
8593 
8594       log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "svml" JNI_LIB_SUFFIX, p2i(libsvml));
8595       if (UseAVX > 2) {
8596         for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
8597           int vop = VectorSupport::VECTOR_OP_SVML_START + op;
8598           if ((!VM_Version::supports_avx512dq()) &&
8599               (vop == VectorSupport::VECTOR_OP_LOG || vop == VectorSupport::VECTOR_OP_LOG10 || vop == VectorSupport::VECTOR_OP_POW)) {
8600             continue;
8601           }
8602           snprintf(ebuf, sizeof(ebuf), "__svml_%sf16_ha_z0", VectorSupport::svmlname[op]);
8603           StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libsvml, ebuf);
8604 
8605           snprintf(ebuf, sizeof(ebuf), "__svml_%s8_ha_z0", VectorSupport::svmlname[op]);
8606           StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libsvml, ebuf);
8607         }
8608       }
8609       const char* avx_sse_str = (UseAVX >= 2) ? "l9" : ((UseAVX == 1) ? "e9" : "ex");
8610       for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
8611         int vop = VectorSupport::VECTOR_OP_SVML_START + op;
8612         if (vop == VectorSupport::VECTOR_OP_POW) {
8613           continue;
8614         }
8615         snprintf(ebuf, sizeof(ebuf), "__svml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8616         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsvml, ebuf);
8617 
8618         snprintf(ebuf, sizeof(ebuf), "__svml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8619         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsvml, ebuf);
8620 
8621         snprintf(ebuf, sizeof(ebuf), "__svml_%sf8_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8622         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libsvml, ebuf);
8623 
8624         snprintf(ebuf, sizeof(ebuf), "__svml_%s1_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8625         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsvml, ebuf);
8626 
8627         snprintf(ebuf, sizeof(ebuf), "__svml_%s2_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8628         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsvml, ebuf);
8629 
8630         snprintf(ebuf, sizeof(ebuf), "__svml_%s4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8631         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libsvml, ebuf);
8632       }
8633     }
8634 #endif // COMPILER2
8635 
8636     if (UseVectorizedMismatchIntrinsic) {
8637       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
8638     }
8639   }
8640 
8641  public:
8642   StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
8643     if (phase == 0) {
8644       generate_initial();
8645     } else if (phase == 1) {
8646       generate_phase1(); // stubs that must be available for the interpreter
8647     } else {
8648       generate_all();
8649     }
8650   }
8651 }; // end class declaration
8652 
8653 #define UCM_TABLE_MAX_ENTRIES 16
8654 void StubGenerator_generate(CodeBuffer* code, int phase) {
8655   if (UnsafeCopyMemory::_table == NULL) {
8656     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
8657   }
8658   StubGenerator g(code, phase);
8659 }
8660 
8661 #undef __
8662 #define __ masm->
8663 
8664 // on exit, rsp points to the ContinuationEntry
8665 // kills rax
8666 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
8667   assert (ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
8668   assert (in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
8669   assert (in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
8670 
8671   stack_slots += (int)ContinuationEntry::size()/wordSize;
8672   __ subptr(rsp, (int32_t)ContinuationEntry::size()); // place Continuation metadata
8673 
8674   OopMap* map = new OopMap(((int)ContinuationEntry::size() + wordSize)/ VMRegImpl::stack_slot_size, 0 /* arg_slots*/);
8675   ContinuationEntry::setup_oopmap(map);
8676 
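       // Link this entry into the thread's chain of continuation entries: the parent field
       // records the previous cont_entry and the thread's cont_entry now points at this one.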
8677   __ movptr(rax, Address(r15_thread, JavaThread::cont_entry_offset()));
8678   __ movptr(Address(rsp, ContinuationEntry::parent_offset()), rax);
8679   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rsp);
8680 
8681   return map;
8682 }
8683 
8684 // on entry, c_rarg1 points to the continuation
8685 //          rsp points to ContinuationEntry
8686 // kills rax
8687 void fill_continuation_entry(MacroAssembler* masm) {
8688   DEBUG_ONLY(__ movl(Address(rsp, ContinuationEntry::cookie_offset()), 0x1234);)
8689 
8690   __ movptr(Address(rsp, ContinuationEntry::cont_offset()), c_rarg1);
8691   __ movptr(Address(rsp, ContinuationEntry::chunk_offset()), (int32_t)0);
8692   __ movl(Address(rsp, ContinuationEntry::argsize_offset()), (int32_t)0);
8693 
8694   __ movptr(rax, Address(r15_thread, JavaThread::cont_fastpath_offset()));
8695   __ movptr(Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()), rax);
8696   __ movl(rax, Address(r15_thread, JavaThread::held_monitor_count_offset()));
8697   __ movl(Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()), rax);
8698
8699   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), 0);
8700   __ reset_held_monitor_count(r15_thread);
8701 }
8702 
8703 // on entry, rsp points to the ContinuationEntry
8704 // on exit, rsp points to the spilled rbp in the entry frame
8705 // kills rbx, rcx
8706 void continuation_enter_cleanup(MacroAssembler* masm) {
8707 #ifndef PRODUCT
8708   Label OK;
8709   __ cmpptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
8710   __ jcc(Assembler::equal, OK);
8711   __ stop("incorrect rsp1");
8712   __ bind(OK);
8713 #endif
8714
8715   __ movptr(rbx, Address(rsp, ContinuationEntry::parent_cont_fastpath_offset()));
8716   __ movptr(Address(r15_thread, JavaThread::cont_fastpath_offset()), rbx);
8717   __ movl(rbx, Address(rsp, ContinuationEntry::parent_held_monitor_count_offset()));
8718   __ movl(Address(r15_thread, JavaThread::held_monitor_count_offset()), rbx);
8719 
8720   __ movptr(rcx, Address(rsp, ContinuationEntry::parent_offset()));
8721   __ movptr(Address(r15_thread, JavaThread::cont_entry_offset()), rcx);
8722   __ addptr(rsp, (int32_t)ContinuationEntry::size());
8723 }
8724 
8725 #undef __