1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "ci/ciUtilities.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/barrierSetNMethod.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "interpreter/interpreter.hpp"
  36 #include "memory/universe.hpp"
  37 #include "nativeInst_x86.hpp"
  38 #include "oops/instanceOop.hpp"
  39 #include "oops/method.hpp"
  40 #include "oops/objArrayKlass.hpp"
  41 #include "oops/oop.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "register_x86.hpp"
  44 #include "runtime/arguments.hpp"
  45 #include "runtime/frame.inline.hpp"
  46 #include "runtime/handles.inline.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubCodeGenerator.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "runtime/thread.inline.hpp"
  51 #include "utilities/macros.hpp"
  52 #include "vmreg_x86.inline.hpp"
  53 #ifdef COMPILER2
  54 #include "opto/runtime.hpp"
  55 #endif
  56 #if INCLUDE_JVMCI
  57 #include "jvmci/jvmci_globals.hpp"
  58 #endif
  59 #if INCLUDE_ZGC
  60 #include "gc/z/zThreadLocalData.hpp"
  61 #endif
  62 
  63 
  64 // Declaration and definition of StubGenerator (no .hpp file).
  65 // For a more detailed description of the stub routine structure
  66 // see the comment in stubRoutines.hpp
  67 
  68 #define __ _masm->
  69 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  70 #define a__ ((Assembler*)_masm)->
  71 
  72 #ifdef PRODUCT
  73 #define BLOCK_COMMENT(str) /* nothing */
  74 #else
  75 #define BLOCK_COMMENT(str) __ block_comment(str)
  76 #endif
  77 
  78 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  79 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
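// (0xFFC0 keeps the MXCSR control bits - DAZ, the six exception mask bits,
// rounding control and FZ in bits 6-15 - and clears the six sticky exception
// status flags in bits 0-5, so an exception flagged earlier does not make the
// comparisons against the expected MXCSR value below fail.)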
  80 
  81 // Stub Code definitions
  82 
  83 class StubGenerator: public StubCodeGenerator {
  84  private:
  85 
  86 #ifdef PRODUCT
  87 #define inc_counter_np(counter) ((void)0)
  88 #else
  89   void inc_counter_np_(int& counter) {
  90     // This can destroy rscratch1 if counter is far from the code cache
  91     __ incrementl(ExternalAddress((address)&counter));
  92   }
  93 #define inc_counter_np(counter) \
  94   BLOCK_COMMENT("inc_counter " #counter); \
  95   inc_counter_np_(counter);
  96 #endif
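  // Illustrative use from a stub body (assuming one of the SharedRuntime
  // counters referenced later in this file):
  //   inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);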
  97 
  98   // Call stubs are used to call Java from C
  99   //
 100   // Linux Arguments:
 101   //    c_rarg0:   call wrapper address                   address
 102   //    c_rarg1:   result                                 address
 103   //    c_rarg2:   result type                            BasicType
 104   //    c_rarg3:   method                                 Method*
 105   //    c_rarg4:   (interpreter) entry point              address
 106   //    c_rarg5:   parameters                             intptr_t*
 107   //    16(rbp): parameter size (in words)              int
 108   //    24(rbp): thread                                 Thread*
 109   //
 110   //     [ return_from_Java     ] <--- rsp
 111   //     [ argument word n      ]
 112   //      ...
 113   // -12 [ argument word 1      ]
 114   // -11 [ saved r15            ] <--- rsp_after_call
 115   // -10 [ saved r14            ]
 116   //  -9 [ saved r13            ]
 117   //  -8 [ saved r12            ]
 118   //  -7 [ saved rbx            ]
 119   //  -6 [ call wrapper         ]
 120   //  -5 [ result               ]
 121   //  -4 [ result type          ]
 122   //  -3 [ method               ]
 123   //  -2 [ entry point          ]
 124   //  -1 [ parameters           ]
 125   //   0 [ saved rbp            ] <--- rbp
 126   //   1 [ return address       ]
 127   //   2 [ parameter size       ]
 128   //   3 [ thread               ]
 129   //
 130   // Windows Arguments:
 131   //    c_rarg0:   call wrapper address                   address
 132   //    c_rarg1:   result                                 address
 133   //    c_rarg2:   result type                            BasicType
 134   //    c_rarg3:   method                                 Method*
 135   //    48(rbp): (interpreter) entry point              address
 136   //    56(rbp): parameters                             intptr_t*
 137   //    64(rbp): parameter size (in words)              int
 138   //    72(rbp): thread                                 Thread*
 139   //
 140   //     [ return_from_Java     ] <--- rsp
 141   //     [ argument word n      ]
 142   //      ...
 143   // -60 [ argument word 1      ]
  // -59 [ saved xmm31          ] <--- rsp_after_call
 145   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 146   // -27 [ saved xmm15          ]
 147   //     [ saved xmm7-xmm14     ]
 148   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 149   //  -7 [ saved r15            ]
 150   //  -6 [ saved r14            ]
 151   //  -5 [ saved r13            ]
 152   //  -4 [ saved r12            ]
 153   //  -3 [ saved rdi            ]
 154   //  -2 [ saved rsi            ]
 155   //  -1 [ saved rbx            ]
 156   //   0 [ saved rbp            ] <--- rbp
 157   //   1 [ return address       ]
 158   //   2 [ call wrapper         ]
 159   //   3 [ result               ]
 160   //   4 [ result type          ]
 161   //   5 [ method               ]
 162   //   6 [ entry point          ]
 163   //   7 [ parameters           ]
 164   //   8 [ parameter size       ]
 165   //   9 [ thread               ]
 166   //
  //    Windows reserves the caller's stack space for arguments 1-4.
 168   //    We spill c_rarg0-c_rarg3 to this space.
 169 
 170   // Call stub stack layout word offsets from rbp
 171   enum call_stub_layout {
 172 #ifdef _WIN64
 173     xmm_save_first     = 6,  // save from xmm6
 174     xmm_save_last      = 31, // to xmm31
 175     xmm_save_base      = -9,
    rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59
 177     r15_off            = -7,
 178     r14_off            = -6,
 179     r13_off            = -5,
 180     r12_off            = -4,
 181     rdi_off            = -3,
 182     rsi_off            = -2,
 183     rbx_off            = -1,
 184     rbp_off            =  0,
 185     retaddr_off        =  1,
 186     call_wrapper_off   =  2,
 187     result_off         =  3,
 188     result_type_off    =  4,
 189     method_off         =  5,
 190     entry_point_off    =  6,
 191     parameters_off     =  7,
 192     parameter_size_off =  8,
 193     thread_off         =  9
 194 #else
 195     rsp_after_call_off = -12,
 196     mxcsr_off          = rsp_after_call_off,
 197     r15_off            = -11,
 198     r14_off            = -10,
 199     r13_off            = -9,
 200     r12_off            = -8,
 201     rbx_off            = -7,
 202     call_wrapper_off   = -6,
 203     result_off         = -5,
 204     result_type_off    = -4,
 205     method_off         = -3,
 206     entry_point_off    = -2,
 207     parameters_off     = -1,
 208     rbp_off            =  0,
 209     retaddr_off        =  1,
 210     parameter_size_off =  2,
 211     thread_off         =  3
 212 #endif
 213   };
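  // For reference: with wordSize == 8 the Linux layout reserves 12 words
  // (96 bytes) below rbp and the Windows layout 59 words (472 bytes), which
  // is exactly what "subptr(rsp, -rsp_after_call_off * wordSize)" allocates
  // in generate_call_stub() below.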
 214 
 215 #ifdef _WIN64
 216   Address xmm_save(int reg) {
 217     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 218     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 219   }
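  // For example (wordSize == 8): xmm_save(6) is Address(rbp, -72), word
  // offset -9, and xmm_save(31) is Address(rbp, -472), word offset -59,
  // matching the Windows layout pictured above; each register takes two words.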
 220 #endif
 221 
 222   address generate_call_stub(address& return_address) {
 223     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 224            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 225            "adjust this code");
 226     StubCodeMark mark(this, "StubRoutines", "call_stub");
 227     address start = __ pc();
 228 
 229     // same as in generate_catch_exception()!
 230     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 231 
 232     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 233     const Address result        (rbp, result_off         * wordSize);
 234     const Address result_type   (rbp, result_type_off    * wordSize);
 235     const Address method        (rbp, method_off         * wordSize);
 236     const Address entry_point   (rbp, entry_point_off    * wordSize);
 237     const Address parameters    (rbp, parameters_off     * wordSize);
 238     const Address parameter_size(rbp, parameter_size_off * wordSize);
 239 
 240     // same as in generate_catch_exception()!
 241     const Address thread        (rbp, thread_off         * wordSize);
 242 
 243     const Address r15_save(rbp, r15_off * wordSize);
 244     const Address r14_save(rbp, r14_off * wordSize);
 245     const Address r13_save(rbp, r13_off * wordSize);
 246     const Address r12_save(rbp, r12_off * wordSize);
 247     const Address rbx_save(rbp, rbx_off * wordSize);
 248 
 249     // stub code
 250     __ enter();
 251     __ subptr(rsp, -rsp_after_call_off * wordSize);
 252 
 253     // save register parameters
 254 #ifndef _WIN64
 255     __ movptr(parameters,   c_rarg5); // parameters
 256     __ movptr(entry_point,  c_rarg4); // entry_point
 257 #endif
 258 
 259     __ movptr(method,       c_rarg3); // method
 260     __ movl(result_type,  c_rarg2);   // result type
 261     __ movptr(result,       c_rarg1); // result
 262     __ movptr(call_wrapper, c_rarg0); // call wrapper
 263 
 264     // save regs belonging to calling function
 265     __ movptr(rbx_save, rbx);
 266     __ movptr(r12_save, r12);
 267     __ movptr(r13_save, r13);
 268     __ movptr(r14_save, r14);
 269     __ movptr(r15_save, r15);
 270 
 271 #ifdef _WIN64
 272     int last_reg = 15;
 273     if (UseAVX > 2) {
 274       last_reg = 31;
 275     }
 276     if (VM_Version::supports_evex()) {
 277       for (int i = xmm_save_first; i <= last_reg; i++) {
 278         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 279       }
 280     } else {
 281       for (int i = xmm_save_first; i <= last_reg; i++) {
 282         __ movdqu(xmm_save(i), as_XMMRegister(i));
 283       }
 284     }
 285 
 286     const Address rdi_save(rbp, rdi_off * wordSize);
 287     const Address rsi_save(rbp, rsi_off * wordSize);
 288 
 289     __ movptr(rsi_save, rsi);
 290     __ movptr(rdi_save, rdi);
 291 #else
 292     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 293     {
 294       Label skip_ldmx;
 295       __ stmxcsr(mxcsr_save);
 296       __ movl(rax, mxcsr_save);
 297       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 298       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 299       __ cmp32(rax, mxcsr_std);
 300       __ jcc(Assembler::equal, skip_ldmx);
 301       __ ldmxcsr(mxcsr_std);
 302       __ bind(skip_ldmx);
 303     }
 304 #endif
 305 
 306     // Load up thread register
 307     __ movptr(r15_thread, thread);
 308     __ reinit_heapbase();
 309 
 310 #ifdef ASSERT
 311     // make sure we have no pending exceptions
 312     {
 313       Label L;
 314       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 315       __ jcc(Assembler::equal, L);
 316       __ stop("StubRoutines::call_stub: entered with pending exception");
 317       __ bind(L);
 318     }
 319 #endif
 320 
 321     // pass parameters if any
 322     BLOCK_COMMENT("pass parameters if any");
 323     Label parameters_done;
 324     __ movl(c_rarg3, parameter_size);
 325     __ testl(c_rarg3, c_rarg3);
 326     __ jcc(Assembler::zero, parameters_done);
 327 
 328     Label loop;
 329     __ movptr(c_rarg2, parameters);       // parameter pointer
 330     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 331     __ BIND(loop);
 332     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 333     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 334     __ decrementl(c_rarg1);             // decrement counter
 335     __ push(rax);                       // pass parameter
 336     __ jcc(Assembler::notZero, loop);
 337 
 338     // call Java function
 339     __ BIND(parameters_done);
 340     __ movptr(rbx, method);             // get Method*
 341     __ movptr(c_rarg1, entry_point);    // get entry_point
 342     __ mov(r13, rsp);                   // set sender sp
 343     BLOCK_COMMENT("call Java function");
 344     __ call(c_rarg1);
 345 
 346     BLOCK_COMMENT("call_stub_return_address:");
 347     return_address = __ pc();
 348 
 349     // store result depending on type (everything that is not
 350     // T_OBJECT, T_PRIMITIVE_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 351     __ movptr(r13, result);
 352     Label is_long, is_float, is_double, check_prim, exit;
 353     __ movl(rbx, result_type);
 354     __ cmpl(rbx, T_OBJECT);
 355     __ jcc(Assembler::equal, check_prim);
 356     __ cmpl(rbx, T_PRIMITIVE_OBJECT);
 357     __ jcc(Assembler::equal, check_prim);
 358     __ cmpl(rbx, T_LONG);
 359     __ jcc(Assembler::equal, is_long);
 360     __ cmpl(rbx, T_FLOAT);
 361     __ jcc(Assembler::equal, is_float);
 362     __ cmpl(rbx, T_DOUBLE);
 363     __ jcc(Assembler::equal, is_double);
 364 
 365     // handle T_INT case
 366     __ movl(Address(r13, 0), rax);
 367 
 368     __ BIND(exit);
 369 
 370     // pop parameters
 371     __ lea(rsp, rsp_after_call);
 372 
 373 #ifdef ASSERT
 374     // verify that threads correspond
 375     {
      Label L1, L2, L3;
 377       __ cmpptr(r15_thread, thread);
 378       __ jcc(Assembler::equal, L1);
 379       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 380       __ bind(L1);
 381       __ get_thread(rbx);
 382       __ cmpptr(r15_thread, thread);
 383       __ jcc(Assembler::equal, L2);
 384       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 385       __ bind(L2);
 386       __ cmpptr(r15_thread, rbx);
 387       __ jcc(Assembler::equal, L3);
 388       __ stop("StubRoutines::call_stub: threads must correspond");
 389       __ bind(L3);
 390     }
 391 #endif
 392 
 393     // restore regs belonging to calling function
 394 #ifdef _WIN64
 395     // emit the restores for xmm regs
 396     if (VM_Version::supports_evex()) {
 397       for (int i = xmm_save_first; i <= last_reg; i++) {
 398         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 399       }
 400     } else {
 401       for (int i = xmm_save_first; i <= last_reg; i++) {
 402         __ movdqu(as_XMMRegister(i), xmm_save(i));
 403       }
 404     }
 405 #endif
 406     __ movptr(r15, r15_save);
 407     __ movptr(r14, r14_save);
 408     __ movptr(r13, r13_save);
 409     __ movptr(r12, r12_save);
 410     __ movptr(rbx, rbx_save);
 411 
 412 #ifdef _WIN64
 413     __ movptr(rdi, rdi_save);
 414     __ movptr(rsi, rsi_save);
 415 #else
 416     __ ldmxcsr(mxcsr_save);
 417 #endif
 418 
 419     // restore rsp
 420     __ addptr(rsp, -rsp_after_call_off * wordSize);
 421 
 422     // return
 423     __ vzeroupper();
 424     __ pop(rbp);
 425     __ ret(0);
 426 
 427     // handle return types different from T_INT
 428     __ BIND(check_prim);
 429     if (InlineTypeReturnedAsFields) {
 430       // Check for scalarized return value
 431       __ testptr(rax, 1);
 432       __ jcc(Assembler::zero, is_long);
 433       // Load pack handler address
 434       __ andptr(rax, -2);
 435       __ movptr(rax, Address(rax, InstanceKlass::adr_inlineklass_fixed_block_offset()));
 436       __ movptr(rbx, Address(rax, InlineKlass::pack_handler_jobject_offset()));
 437       // Call pack handler to initialize the buffer
 438       __ call(rbx);
 439       __ jmp(exit);
 440     }
 441     __ BIND(is_long);
 442     __ movq(Address(r13, 0), rax);
 443     __ jmp(exit);
 444 
 445     __ BIND(is_float);
 446     __ movflt(Address(r13, 0), xmm0);
 447     __ jmp(exit);
 448 
 449     __ BIND(is_double);
 450     __ movdbl(Address(r13, 0), xmm0);
 451     __ jmp(exit);
 452 
 453     return start;
 454   }
 455 
 456   // Return point for a Java call if there's an exception thrown in
 457   // Java code.  The exception is caught and transformed into a
 458   // pending exception stored in JavaThread that can be tested from
 459   // within the VM.
 460   //
 461   // Note: Usually the parameters are removed by the callee. In case
 462   // of an exception crossing an activation frame boundary, that is
 463   // not the case if the callee is compiled code => need to setup the
 464   // rsp.
 465   //
 466   // rax: exception oop
 467 
 468   address generate_catch_exception() {
 469     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 470     address start = __ pc();
 471 
 472     // same as in generate_call_stub():
 473     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 474     const Address thread        (rbp, thread_off         * wordSize);
 475 
 476 #ifdef ASSERT
 477     // verify that threads correspond
 478     {
 479       Label L1, L2, L3;
 480       __ cmpptr(r15_thread, thread);
 481       __ jcc(Assembler::equal, L1);
 482       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 483       __ bind(L1);
 484       __ get_thread(rbx);
 485       __ cmpptr(r15_thread, thread);
 486       __ jcc(Assembler::equal, L2);
 487       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 488       __ bind(L2);
 489       __ cmpptr(r15_thread, rbx);
 490       __ jcc(Assembler::equal, L3);
 491       __ stop("StubRoutines::catch_exception: threads must correspond");
 492       __ bind(L3);
 493     }
 494 #endif
 495 
 496     // set pending exception
 497     __ verify_oop(rax);
 498 
 499     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 500     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 501     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 502     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 503 
 504     // complete return to VM
 505     assert(StubRoutines::_call_stub_return_address != NULL,
 506            "_call_stub_return_address must have been generated before");
 507     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 508 
 509     return start;
 510   }
 511 
 512   // Continuation point for runtime calls returning with a pending
 513   // exception.  The pending exception check happened in the runtime
 514   // or native call stub.  The pending exception in Thread is
 515   // converted into a Java-level exception.
 516   //
 517   // Contract with Java-level exception handlers:
 518   // rax: exception
 519   // rdx: throwing pc
 520   //
 521   // NOTE: At entry of this stub, exception-pc must be on stack !!
 522 
 523   address generate_forward_exception() {
 524     StubCodeMark mark(this, "StubRoutines", "forward exception");
 525     address start = __ pc();
 526 
 527     // Upon entry, the sp points to the return address returning into
 528     // Java (interpreted or compiled) code; i.e., the return address
 529     // becomes the throwing pc.
 530     //
 531     // Arguments pushed before the runtime call are still on the stack
 532     // but the exception handler will reset the stack pointer ->
 533     // ignore them.  A potential result in registers can be ignored as
 534     // well.
 535 
 536 #ifdef ASSERT
 537     // make sure this code is only executed if there is a pending exception
 538     {
 539       Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 541       __ jcc(Assembler::notEqual, L);
 542       __ stop("StubRoutines::forward exception: no pending exception (1)");
 543       __ bind(L);
 544     }
 545 #endif
 546 
 547     // compute exception handler into rbx
 548     __ movptr(c_rarg0, Address(rsp, 0));
 549     BLOCK_COMMENT("call exception_handler_for_return_address");
 550     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 551                          SharedRuntime::exception_handler_for_return_address),
 552                     r15_thread, c_rarg0);
 553     __ mov(rbx, rax);
 554 
 555     // setup rax & rdx, remove return address & clear pending exception
 556     __ pop(rdx);
 557     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 558     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 559 
 560 #ifdef ASSERT
 561     // make sure exception is set
 562     {
 563       Label L;
 564       __ testptr(rax, rax);
 565       __ jcc(Assembler::notEqual, L);
 566       __ stop("StubRoutines::forward exception: no pending exception (2)");
 567       __ bind(L);
 568     }
 569 #endif
 570 
 571     // continue at exception handler (return address removed)
 572     // rax: exception
 573     // rbx: exception handler
 574     // rdx: throwing pc
 575     __ verify_oop(rax);
 576     __ jmp(rbx);
 577 
 578     return start;
 579   }
 580 
 581   // Support for intptr_t OrderAccess::fence()
 582   //
 583   // Arguments :
 584   //
 585   // Result:
 586   address generate_orderaccess_fence() {
 587     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 588     address start = __ pc();
 589     __ membar(Assembler::StoreLoad);
 590     __ ret(0);
 591 
 592     return start;
 593   }
 594 
 595 
 596   // Support for intptr_t get_previous_sp()
 597   //
 598   // This routine is used to find the previous stack pointer for the
 599   // caller.
 600   address generate_get_previous_sp() {
 601     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
 602     address start = __ pc();
 603 
 604     __ movptr(rax, rsp);
 605     __ addptr(rax, 8); // return address is at the top of the stack.
 606     __ ret(0);
 607 
 608     return start;
 609   }
 610 
 611   //----------------------------------------------------------------------------------------------------
 612   // Support for void verify_mxcsr()
 613   //
 614   // This routine is used with -Xcheck:jni to verify that native
 615   // JNI code does not return to Java code without restoring the
 616   // MXCSR register to our expected state.
 617 
 618   address generate_verify_mxcsr() {
 619     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 620     address start = __ pc();
 621 
 622     const Address mxcsr_save(rsp, 0);
 623 
 624     if (CheckJNICalls) {
 625       Label ok_ret;
 626       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 627       __ push(rax);
 628       __ subptr(rsp, wordSize);      // allocate a temp location
 629       __ stmxcsr(mxcsr_save);
 630       __ movl(rax, mxcsr_save);
 631       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 632       __ cmp32(rax, mxcsr_std);
 633       __ jcc(Assembler::equal, ok_ret);
 634 
 635       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 636 
 637       __ ldmxcsr(mxcsr_std);
 638 
 639       __ bind(ok_ret);
 640       __ addptr(rsp, wordSize);
 641       __ pop(rax);
 642     }
 643 
 644     __ ret(0);
 645 
 646     return start;
 647   }
 648 
 649   address generate_f2i_fixup() {
 650     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
 651     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 652 
 653     address start = __ pc();
 654 
 655     Label L;
 656 
 657     __ push(rax);
 658     __ push(c_rarg3);
 659     __ push(c_rarg2);
 660     __ push(c_rarg1);
 661 
 662     __ movl(rax, 0x7f800000);
 663     __ xorl(c_rarg3, c_rarg3);
 664     __ movl(c_rarg2, inout);
 665     __ movl(c_rarg1, c_rarg2);
 666     __ andl(c_rarg1, 0x7fffffff);
 667     __ cmpl(rax, c_rarg1); // NaN? -> 0
 668     __ jcc(Assembler::negative, L);
 669     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
 670     __ movl(c_rarg3, 0x80000000);
 671     __ movl(rax, 0x7fffffff);
 672     __ cmovl(Assembler::positive, c_rarg3, rax);
 673 
 674     __ bind(L);
 675     __ movptr(inout, c_rarg3);
 676 
 677     __ pop(c_rarg1);
 678     __ pop(c_rarg2);
 679     __ pop(c_rarg3);
 680     __ pop(rax);
 681 
 682     __ ret(0);
 683 
 684     return start;
 685   }
 686 
 687   address generate_f2l_fixup() {
 688     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
 689     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 690     address start = __ pc();
 691 
 692     Label L;
 693 
 694     __ push(rax);
 695     __ push(c_rarg3);
 696     __ push(c_rarg2);
 697     __ push(c_rarg1);
 698 
 699     __ movl(rax, 0x7f800000);
 700     __ xorl(c_rarg3, c_rarg3);
 701     __ movl(c_rarg2, inout);
 702     __ movl(c_rarg1, c_rarg2);
 703     __ andl(c_rarg1, 0x7fffffff);
 704     __ cmpl(rax, c_rarg1); // NaN? -> 0
 705     __ jcc(Assembler::negative, L);
 706     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
 707     __ mov64(c_rarg3, 0x8000000000000000);
 708     __ mov64(rax, 0x7fffffffffffffff);
 709     __ cmov(Assembler::positive, c_rarg3, rax);
 710 
 711     __ bind(L);
 712     __ movptr(inout, c_rarg3);
 713 
 714     __ pop(c_rarg1);
 715     __ pop(c_rarg2);
 716     __ pop(c_rarg3);
 717     __ pop(rax);
 718 
 719     __ ret(0);
 720 
 721     return start;
 722   }
 723 
 724   address generate_d2i_fixup() {
 725     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
 726     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 727 
 728     address start = __ pc();
 729 
 730     Label L;
 731 
 732     __ push(rax);
 733     __ push(c_rarg3);
 734     __ push(c_rarg2);
 735     __ push(c_rarg1);
 736     __ push(c_rarg0);
 737 
 738     __ movl(rax, 0x7ff00000);
 739     __ movq(c_rarg2, inout);
 740     __ movl(c_rarg3, c_rarg2);
 741     __ mov(c_rarg1, c_rarg2);
 742     __ mov(c_rarg0, c_rarg2);
 743     __ negl(c_rarg3);
 744     __ shrptr(c_rarg1, 0x20);
 745     __ orl(c_rarg3, c_rarg2);
 746     __ andl(c_rarg1, 0x7fffffff);
 747     __ xorl(c_rarg2, c_rarg2);
 748     __ shrl(c_rarg3, 0x1f);
 749     __ orl(c_rarg1, c_rarg3);
 750     __ cmpl(rax, c_rarg1);
 751     __ jcc(Assembler::negative, L); // NaN -> 0
 752     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
 753     __ movl(c_rarg2, 0x80000000);
 754     __ movl(rax, 0x7fffffff);
 755     __ cmov(Assembler::positive, c_rarg2, rax);
 756 
 757     __ bind(L);
 758     __ movptr(inout, c_rarg2);
 759 
 760     __ pop(c_rarg0);
 761     __ pop(c_rarg1);
 762     __ pop(c_rarg2);
 763     __ pop(c_rarg3);
 764     __ pop(rax);
 765 
 766     __ ret(0);
 767 
 768     return start;
 769   }
 770 
 771   address generate_d2l_fixup() {
 772     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
 773     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 774 
 775     address start = __ pc();
 776 
 777     Label L;
 778 
 779     __ push(rax);
 780     __ push(c_rarg3);
 781     __ push(c_rarg2);
 782     __ push(c_rarg1);
 783     __ push(c_rarg0);
 784 
 785     __ movl(rax, 0x7ff00000);
 786     __ movq(c_rarg2, inout);
 787     __ movl(c_rarg3, c_rarg2);
 788     __ mov(c_rarg1, c_rarg2);
 789     __ mov(c_rarg0, c_rarg2);
 790     __ negl(c_rarg3);
 791     __ shrptr(c_rarg1, 0x20);
 792     __ orl(c_rarg3, c_rarg2);
 793     __ andl(c_rarg1, 0x7fffffff);
 794     __ xorl(c_rarg2, c_rarg2);
 795     __ shrl(c_rarg3, 0x1f);
 796     __ orl(c_rarg1, c_rarg3);
 797     __ cmpl(rax, c_rarg1);
 798     __ jcc(Assembler::negative, L); // NaN -> 0
 799     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
 800     __ mov64(c_rarg2, 0x8000000000000000);
 801     __ mov64(rax, 0x7fffffffffffffff);
 802     __ cmovq(Assembler::positive, c_rarg2, rax);
 803 
 804     __ bind(L);
 805     __ movq(inout, c_rarg2);
 806 
 807     __ pop(c_rarg0);
 808     __ pop(c_rarg1);
 809     __ pop(c_rarg2);
 810     __ pop(c_rarg3);
 811     __ pop(rax);
 812 
 813     __ ret(0);
 814 
 815     return start;
 816   }
 817 
 818   address generate_popcount_avx_lut(const char *stub_name) {
 819     __ align64();
 820     StubCodeMark mark(this, "StubRoutines", stub_name);
 821     address start = __ pc();
 822     __ emit_data64(0x0302020102010100, relocInfo::none);
 823     __ emit_data64(0x0403030203020201, relocInfo::none);
 824     __ emit_data64(0x0302020102010100, relocInfo::none);
 825     __ emit_data64(0x0403030203020201, relocInfo::none);
 826     __ emit_data64(0x0302020102010100, relocInfo::none);
 827     __ emit_data64(0x0403030203020201, relocInfo::none);
 828     __ emit_data64(0x0302020102010100, relocInfo::none);
 829     __ emit_data64(0x0403030203020201, relocInfo::none);
 830     return start;
 831   }
 832 
 833   address generate_iota_indices(const char *stub_name) {
 834     __ align(CodeEntryAlignment);
 835     StubCodeMark mark(this, "StubRoutines", stub_name);
 836     address start = __ pc();
 837     __ emit_data64(0x0706050403020100, relocInfo::none);
 838     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 839     __ emit_data64(0x1716151413121110, relocInfo::none);
 840     __ emit_data64(0x1F1E1D1C1B1A1918, relocInfo::none);
 841     __ emit_data64(0x2726252423222120, relocInfo::none);
 842     __ emit_data64(0x2F2E2D2C2B2A2928, relocInfo::none);
 843     __ emit_data64(0x3736353433323130, relocInfo::none);
 844     __ emit_data64(0x3F3E3D3C3B3A3938, relocInfo::none);
 845     return start;
 846   }
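  // (The 64 bytes emitted above are simply the values 0x00 through 0x3F in
  //  ascending order, i.e. an identity byte-index table that can serve as the
  //  base operand for vector shuffle/permute operations.)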
 847 
 848   address generate_vector_byte_shuffle_mask(const char *stub_name) {
 849     __ align(CodeEntryAlignment);
 850     StubCodeMark mark(this, "StubRoutines", stub_name);
 851     address start = __ pc();
 852     __ emit_data64(0x7070707070707070, relocInfo::none);
 853     __ emit_data64(0x7070707070707070, relocInfo::none);
 854     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 855     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 856     return start;
 857   }
 858 
 859   address generate_fp_mask(const char *stub_name, int64_t mask) {
 860     __ align(CodeEntryAlignment);
 861     StubCodeMark mark(this, "StubRoutines", stub_name);
 862     address start = __ pc();
 863 
 864     __ emit_data64( mask, relocInfo::none );
 865     __ emit_data64( mask, relocInfo::none );
 866 
 867     return start;
 868   }
 869 
 870   address generate_vector_mask(const char *stub_name, int64_t mask) {
 871     __ align(CodeEntryAlignment);
 872     StubCodeMark mark(this, "StubRoutines", stub_name);
 873     address start = __ pc();
 874 
 875     __ emit_data64(mask, relocInfo::none);
 876     __ emit_data64(mask, relocInfo::none);
 877     __ emit_data64(mask, relocInfo::none);
 878     __ emit_data64(mask, relocInfo::none);
 879     __ emit_data64(mask, relocInfo::none);
 880     __ emit_data64(mask, relocInfo::none);
 881     __ emit_data64(mask, relocInfo::none);
 882     __ emit_data64(mask, relocInfo::none);
 883 
 884     return start;
 885   }
 886 
 887   address generate_vector_byte_perm_mask(const char *stub_name) {
 888     __ align(CodeEntryAlignment);
 889     StubCodeMark mark(this, "StubRoutines", stub_name);
 890     address start = __ pc();
 891 
 892     __ emit_data64(0x0000000000000001, relocInfo::none);
 893     __ emit_data64(0x0000000000000003, relocInfo::none);
 894     __ emit_data64(0x0000000000000005, relocInfo::none);
 895     __ emit_data64(0x0000000000000007, relocInfo::none);
 896     __ emit_data64(0x0000000000000000, relocInfo::none);
 897     __ emit_data64(0x0000000000000002, relocInfo::none);
 898     __ emit_data64(0x0000000000000004, relocInfo::none);
 899     __ emit_data64(0x0000000000000006, relocInfo::none);
 900 
 901     return start;
 902   }
 903 
 904   address generate_vector_fp_mask(const char *stub_name, int64_t mask) {
 905     __ align(CodeEntryAlignment);
 906     StubCodeMark mark(this, "StubRoutines", stub_name);
 907     address start = __ pc();
 908 
 909     __ emit_data64(mask, relocInfo::none);
 910     __ emit_data64(mask, relocInfo::none);
 911     __ emit_data64(mask, relocInfo::none);
 912     __ emit_data64(mask, relocInfo::none);
 913     __ emit_data64(mask, relocInfo::none);
 914     __ emit_data64(mask, relocInfo::none);
 915     __ emit_data64(mask, relocInfo::none);
 916     __ emit_data64(mask, relocInfo::none);
 917 
 918     return start;
 919   }
 920 
 921   address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
 922                                      int32_t val0, int32_t val1, int32_t val2, int32_t val3,
 923                                      int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
 924                                      int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
 925                                      int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
 926     __ align(CodeEntryAlignment);
 927     StubCodeMark mark(this, "StubRoutines", stub_name);
 928     address start = __ pc();
 929 
 930     assert(len != Assembler::AVX_NoVec, "vector len must be specified");
 931     __ emit_data(val0, relocInfo::none, 0);
 932     __ emit_data(val1, relocInfo::none, 0);
 933     __ emit_data(val2, relocInfo::none, 0);
 934     __ emit_data(val3, relocInfo::none, 0);
 935     if (len >= Assembler::AVX_256bit) {
 936       __ emit_data(val4, relocInfo::none, 0);
 937       __ emit_data(val5, relocInfo::none, 0);
 938       __ emit_data(val6, relocInfo::none, 0);
 939       __ emit_data(val7, relocInfo::none, 0);
 940       if (len >= Assembler::AVX_512bit) {
 941         __ emit_data(val8, relocInfo::none, 0);
 942         __ emit_data(val9, relocInfo::none, 0);
 943         __ emit_data(val10, relocInfo::none, 0);
 944         __ emit_data(val11, relocInfo::none, 0);
 945         __ emit_data(val12, relocInfo::none, 0);
 946         __ emit_data(val13, relocInfo::none, 0);
 947         __ emit_data(val14, relocInfo::none, 0);
 948         __ emit_data(val15, relocInfo::none, 0);
 949       }
 950     }
 951 
 952     return start;
 953   }
 954 
 955   // Non-destructive plausibility checks for oops
 956   //
 957   // Arguments:
 958   //    all args on stack!
 959   //
 960   // Stack after saving c_rarg3:
 961   //    [tos + 0]: saved c_rarg3
 962   //    [tos + 1]: saved c_rarg2
 963   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
 964   //    [tos + 3]: saved flags
 965   //    [tos + 4]: return address
 966   //  * [tos + 5]: error message (char*)
 967   //  * [tos + 6]: object to verify (oop)
 968   //  * [tos + 7]: saved rax - saved by caller and bashed
 969   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
 970   //  * = popped on exit
 971   address generate_verify_oop() {
 972     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 973     address start = __ pc();
 974 
 975     Label exit, error;
 976 
 977     __ pushf();
 978     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 979 
 980     __ push(r12);
 981 
 982     // save c_rarg2 and c_rarg3
 983     __ push(c_rarg2);
 984     __ push(c_rarg3);
 985 
 986     enum {
 987            // After previous pushes.
 988            oop_to_verify = 6 * wordSize,
 989            saved_rax     = 7 * wordSize,
 990            saved_r10     = 8 * wordSize,
 991 
 992            // Before the call to MacroAssembler::debug(), see below.
 993            return_addr   = 16 * wordSize,
 994            error_msg     = 17 * wordSize
 995     };
 996 
 997     // get object
 998     __ movptr(rax, Address(rsp, oop_to_verify));
 999 
1000     // make sure object is 'reasonable'
1001     __ testptr(rax, rax);
1002     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
1003 
1004 #if INCLUDE_ZGC
1005     if (UseZGC) {
1006       // Check if metadata bits indicate a bad oop
1007       __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
1008       __ jcc(Assembler::notZero, error);
1009     }
1010 #endif
1011 
1012     // Check if the oop is in the right area of memory
1013     __ movptr(c_rarg2, rax);
1014     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
1015     __ andptr(c_rarg2, c_rarg3);
1016     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
1017     __ cmpptr(c_rarg2, c_rarg3);
1018     __ jcc(Assembler::notZero, error);
1019 
1020     // make sure klass is 'reasonable', which is not zero.
1021     __ load_klass(rax, rax, rscratch1);  // get klass
1022     __ testptr(rax, rax);
1023     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1024 
1025     // return if everything seems ok
1026     __ bind(exit);
1027     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1028     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1029     __ pop(c_rarg3);                             // restore c_rarg3
1030     __ pop(c_rarg2);                             // restore c_rarg2
1031     __ pop(r12);                                 // restore r12
1032     __ popf();                                   // restore flags
1033     __ ret(4 * wordSize);                        // pop caller saved stuff
1034 
1035     // handle errors
1036     __ bind(error);
1037     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1038     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1039     __ pop(c_rarg3);                             // get saved c_rarg3 back
1040     __ pop(c_rarg2);                             // get saved c_rarg2 back
1041     __ pop(r12);                                 // get saved r12 back
1042     __ popf();                                   // get saved flags off stack --
1043                                                  // will be ignored
1044 
    __ pusha();                                  // push registers
                                                 // (rip is already pushed)
1048     // debug(char* msg, int64_t pc, int64_t regs[])
    // We've popped the registers we'd saved (c_rarg3, c_rarg2, r12 and flags), and
1050     // pushed all the registers, so now the stack looks like:
1051     //     [tos +  0] 16 saved registers
1052     //     [tos + 16] return address
1053     //   * [tos + 17] error message (char*)
1054     //   * [tos + 18] object to verify (oop)
1055     //   * [tos + 19] saved rax - saved by caller and bashed
1056     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1057     //   * = popped on exit
1058 
1059     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1060     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1061     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1062     __ mov(r12, rsp);                               // remember rsp
1063     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1064     __ andptr(rsp, -16);                            // align stack as required by ABI
1065     BLOCK_COMMENT("call MacroAssembler::debug");
1066     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1067     __ hlt();
1068     return start;
1069   }
1070 
1071   //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts.
1074   //
1075   //  Input:
1076   //    Rint  -  32-bits value
1077   //    Rtmp  -  scratch
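  //
  //  The check relies on movslq: the sign-extended low 32 bits equal the
  //  original 64-bit register value only when the upper 32 bits are zero and
  //  bit 31 is clear, i.e. when the register really holds a clean,
  //  non-negative int.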
1078   //
1079   void assert_clean_int(Register Rint, Register Rtmp) {
1080 #ifdef ASSERT
1081     Label L;
1082     assert_different_registers(Rtmp, Rint);
1083     __ movslq(Rtmp, Rint);
1084     __ cmpq(Rtmp, Rint);
1085     __ jcc(Assembler::equal, L);
1086     __ stop("high 32-bits of int value are not 0");
1087     __ bind(L);
1088 #endif
1089   }
1090 
1091   //  Generate overlap test for array copy stubs
1092   //
1093   //  Input:
1094   //     c_rarg0 - from
1095   //     c_rarg1 - to
1096   //     c_rarg2 - element count
1097   //
1098   //  Output:
1099   //     rax   - &from[element count - 1]
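  //
  //  The jump to the no-overlap target is taken when 'to' is at or below
  //  'from', or when 'to' is at or beyond the end of the source data; only in
  //  the remaining case (the destination starts inside the source) does
  //  control fall through so the caller can perform a backward (conjoint) copy.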
1100   //
1101   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1102     assert(no_overlap_target != NULL, "must be generated");
1103     array_overlap_test(no_overlap_target, NULL, sf);
1104   }
1105   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1106     array_overlap_test(NULL, &L_no_overlap, sf);
1107   }
1108   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1109     const Register from     = c_rarg0;
1110     const Register to       = c_rarg1;
1111     const Register count    = c_rarg2;
1112     const Register end_from = rax;
1113 
1114     __ cmpptr(to, from);
1115     __ lea(end_from, Address(from, count, sf, 0));
1116     if (NOLp == NULL) {
1117       ExternalAddress no_overlap(no_overlap_target);
1118       __ jump_cc(Assembler::belowEqual, no_overlap);
1119       __ cmpptr(to, end_from);
1120       __ jump_cc(Assembler::aboveEqual, no_overlap);
1121     } else {
1122       __ jcc(Assembler::belowEqual, (*NOLp));
1123       __ cmpptr(to, end_from);
1124       __ jcc(Assembler::aboveEqual, (*NOLp));
1125     }
1126   }
1127 
1128   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1129   //
1130   // Outputs:
1131   //    rdi - rcx
1132   //    rsi - rdx
1133   //    rdx - r8
1134   //    rcx - r9
1135   //
1136   // Registers r9 and r10 are used to save rdi and rsi on Windows, which latter
1137   // are non-volatile.  r9 and r10 should not be used by the caller.
1138   //
1139   DEBUG_ONLY(bool regs_in_thread;)
1140 
1141   void setup_arg_regs(int nargs = 3) {
1142     const Register saved_rdi = r9;
1143     const Register saved_rsi = r10;
1144     assert(nargs == 3 || nargs == 4, "else fix");
1145 #ifdef _WIN64
1146     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1147            "unexpected argument registers");
1148     if (nargs >= 4)
1149       __ mov(rax, r9);  // r9 is also saved_rdi
1150     __ movptr(saved_rdi, rdi);
1151     __ movptr(saved_rsi, rsi);
1152     __ mov(rdi, rcx); // c_rarg0
1153     __ mov(rsi, rdx); // c_rarg1
1154     __ mov(rdx, r8);  // c_rarg2
1155     if (nargs >= 4)
1156       __ mov(rcx, rax); // c_rarg3 (via rax)
1157 #else
1158     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1159            "unexpected argument registers");
1160 #endif
1161     DEBUG_ONLY(regs_in_thread = false;)
1162   }
1163 
1164   void restore_arg_regs() {
1165     assert(!regs_in_thread, "wrong call to restore_arg_regs");
1166     const Register saved_rdi = r9;
1167     const Register saved_rsi = r10;
1168 #ifdef _WIN64
1169     __ movptr(rdi, saved_rdi);
1170     __ movptr(rsi, saved_rsi);
1171 #endif
1172   }
1173 
1174   // This is used in places where r10 is a scratch register, and can
1175   // be adapted if r9 is needed also.
1176   void setup_arg_regs_using_thread() {
1177     const Register saved_r15 = r9;
1178 #ifdef _WIN64
1179     __ mov(saved_r15, r15);  // r15 is callee saved and needs to be restored
1180     __ get_thread(r15_thread);
1181     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1182            "unexpected argument registers");
1183     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
1184     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);
1185 
1186     __ mov(rdi, rcx); // c_rarg0
1187     __ mov(rsi, rdx); // c_rarg1
1188     __ mov(rdx, r8);  // c_rarg2
1189 #else
1190     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1191            "unexpected argument registers");
1192 #endif
1193     DEBUG_ONLY(regs_in_thread = true;)
1194   }
1195 
1196   void restore_arg_regs_using_thread() {
1197     assert(regs_in_thread, "wrong call to restore_arg_regs");
1198     const Register saved_r15 = r9;
1199 #ifdef _WIN64
1200     __ get_thread(r15_thread);
1201     __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
1202     __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
1203     __ mov(r15, saved_r15);  // r15 is callee saved and needs to be restored
1204 #endif
1205   }
1206 
1207   // Copy big chunks forward
1208   //
1209   // Inputs:
  //   end_from     - source array's end address
  //   end_to       - destination array's end address
  //   qword_count  - 64-bit element count, negative
1213   //   to           - scratch
1214   //   L_copy_bytes - entry label
1215   //   L_copy_8_bytes  - exit  label
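  //
  //   Note: qword_count is negative and is stepped toward zero, so the scaled
  //   operands Address(end_from, qword_count, times_8, disp) advance from the
  //   start of the data toward end_from/end_to, and the loop exit is a simple
  //   condition-code check after the add.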
1216   //
1217   void copy_bytes_forward(Register end_from, Register end_to,
1218                              Register qword_count, Register to,
1219                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1220     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1221     Label L_loop;
1222     __ align(OptoLoopAlignment);
1223     if (UseUnalignedLoadStores) {
1224       Label L_end;
1225       __ BIND(L_loop);
1226       if (UseAVX >= 2) {
1227         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1228         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1229         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1230         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1231       } else {
1232         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1233         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1234         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1235         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1236         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1237         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1238         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1239         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1240       }
1241 
1242       __ BIND(L_copy_bytes);
1243       __ addptr(qword_count, 8);
1244       __ jcc(Assembler::lessEqual, L_loop);
1245       __ subptr(qword_count, 4);  // sub(8) and add(4)
1246       __ jccb(Assembler::greater, L_end);
1247       // Copy trailing 32 bytes
1248       if (UseAVX >= 2) {
1249         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1250         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1251       } else {
1252         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1253         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1254         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1255         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1256       }
1257       __ addptr(qword_count, 4);
1258       __ BIND(L_end);
1259     } else {
1260       // Copy 32-bytes per iteration
1261       __ BIND(L_loop);
1262       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1263       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1264       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1265       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1266       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1267       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1268       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1269       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1270 
1271       __ BIND(L_copy_bytes);
1272       __ addptr(qword_count, 4);
1273       __ jcc(Assembler::lessEqual, L_loop);
1274     }
1275     __ subptr(qword_count, 4);
1276     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1277   }
1278 
1279   // Copy big chunks backward
1280   //
1281   // Inputs:
  //   from         - source array's address
  //   dest         - destination array's address
  //   qword_count  - 64-bit element count
1285   //   to           - scratch
1286   //   L_copy_bytes - entry label
1287   //   L_copy_8_bytes  - exit  label
1288   //
1289   void copy_bytes_backward(Register from, Register dest,
1290                               Register qword_count, Register to,
1291                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1292     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1293     Label L_loop;
1294     __ align(OptoLoopAlignment);
1295     if (UseUnalignedLoadStores) {
1296       Label L_end;
1297       __ BIND(L_loop);
1298       if (UseAVX >= 2) {
1299         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1300         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1301         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1302         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1303       } else {
1304         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1305         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1306         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1307         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1308         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1309         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1310         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1311         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1312       }
1313 
1314       __ BIND(L_copy_bytes);
1315       __ subptr(qword_count, 8);
1316       __ jcc(Assembler::greaterEqual, L_loop);
1317 
1318       __ addptr(qword_count, 4);  // add(8) and sub(4)
1319       __ jccb(Assembler::less, L_end);
1320       // Copy trailing 32 bytes
1321       if (UseAVX >= 2) {
1322         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1323         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1324       } else {
1325         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1326         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1327         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1328         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1329       }
1330       __ subptr(qword_count, 4);
1331       __ BIND(L_end);
1332     } else {
1333       // Copy 32-bytes per iteration
1334       __ BIND(L_loop);
1335       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1336       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1337       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1338       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1339       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1340       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1341       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1342       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1343 
1344       __ BIND(L_copy_bytes);
1345       __ subptr(qword_count, 4);
1346       __ jcc(Assembler::greaterEqual, L_loop);
1347     }
1348     __ addptr(qword_count, 4);
1349     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1350   }
1351 
1352 #ifndef PRODUCT
1353     int& get_profile_ctr(int shift) {
      if (0 == shift)
        return SharedRuntime::_jbyte_array_copy_ctr;
      else if (1 == shift)
        return SharedRuntime::_jshort_array_copy_ctr;
      else if (2 == shift)
1359         return SharedRuntime::_jint_array_copy_ctr;
1360       else
1361         return SharedRuntime::_jlong_array_copy_ctr;
1362     }
1363 #endif
1364 
1365   void setup_argument_regs(BasicType type) {
1366     if (type == T_BYTE || type == T_SHORT) {
1367       setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1368                         // r9 and r10 may be used to save non-volatile registers
1369     } else {
1370       setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1371                                      // r9 is used to save r15_thread
1372     }
1373   }
1374 
1375   void restore_argument_regs(BasicType type) {
1376     if (type == T_BYTE || type == T_SHORT) {
1377       restore_arg_regs();
1378     } else {
1379       restore_arg_regs_using_thread();
1380     }
1381   }
1382 
1383 #if COMPILER2_OR_JVMCI
  // Note: The following rules apply to the AVX3 optimized arraycopy stubs:
  // - If the target supports AVX3 features (BW+VL+F) then the implementation uses 32 byte vectors (YMMs)
  //   for both the special cases (various small block sizes) and the aligned copy loop. This is the
  //   default configuration.
  // - If the copy length is above AVX3Threshold, then the implementation uses 64 byte vectors (ZMMs)
  //   for the main copy loop (and subsequent tail) since the bulk of the cycles will be consumed in it.
  // - If the user forces MaxVectorSize=32 then above 4096 bytes REP MOVS has been seen to give better
  //   performance for disjoint copies. For conjoint/backward copies the vector based copy performs
  //   better.
  // - If the user sets AVX3Threshold=0, then the special cases for small block sizes operate over
  //   64 byte vector registers (ZMMs).
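  //
  // For example, for int elements (shift == 2) the aligned main loop copies
  // 48 elements (192 bytes) per iteration (see the loop_size table below),
  // and with MaxVectorSize == 64 and a non-zero AVX3Threshold the switch to
  // the 64 byte (ZMM) path happens at 1024 elements, i.e. 4096 bytes (the
  // threshold table below).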
1395 
1396   // Inputs:
1397   //   c_rarg0   - source array address
1398   //   c_rarg1   - destination array address
1399   //   c_rarg2   - element count, treated as ssize_t, can be zero
1400   //
1401   //
1402   // Side Effects:
1403   //   disjoint_copy_avx3_masked is set to the no-overlap entry point
1404   //   used by generate_conjoint_[byte/int/short/long]_copy().
1405   //
1406 
1407   address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1408                                              bool aligned, bool is_oop, bool dest_uninitialized) {
1409     __ align(CodeEntryAlignment);
1410     StubCodeMark mark(this, "StubRoutines", name);
1411     address start = __ pc();
1412     int avx3threshold = VM_Version::avx3_threshold();
1413     bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
1414     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1415     Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1416     const Register from        = rdi;  // source array address
1417     const Register to          = rsi;  // destination array address
1418     const Register count       = rdx;  // elements count
1419     const Register temp1       = r8;
1420     const Register temp2       = r11;
1421     const Register temp3       = rax;
1422     const Register temp4       = rcx;
1423     // End pointers are inclusive, and if count is not zero they point
1424     // to the last unit copied:  end_to[0] := end_from[0]
1425 
1426     __ enter(); // required for proper stackwalking of RuntimeStub frame
1427     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1428 
1429     if (entry != NULL) {
1430       *entry = __ pc();
1431        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1432       BLOCK_COMMENT("Entry:");
1433     }
1434 
1435     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1436     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1437 
1438     setup_argument_regs(type);
1439 
1440     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1441     if (dest_uninitialized) {
1442       decorators |= IS_DEST_UNINITIALIZED;
1443     }
1444     if (aligned) {
1445       decorators |= ARRAYCOPY_ALIGNED;
1446     }
1447     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1448     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1449 
1450     {
1451       // Type(shift)           byte(0), short(1), int(2),   long(3)
1452       int loop_size[]        = { 192,     96,       48,      24};
1453       int threshold[]        = { 4096,    2048,     1024,    512};
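           // Each main loop iteration copies 192 bytes, so loop_size is (192 >> shift)
           // elements; threshold is likewise a fixed 4096 byte cut-over expressed in elements.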
1454 
1455       // UnsafeCopyMemory page error: continue after ucm
1456       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1457       // 'from', 'to' and 'count' are now valid
1458 
1459       // temp1 holds remaining count and temp4 holds running count used to compute
1460       // next address offset for start of to/from addresses (temp4 * scale).
1461       __ mov64(temp4, 0);
1462       __ movq(temp1, count);
1463 
1464       // Zero length check.
1465       __ BIND(L_tail);
1466       __ cmpq(temp1, 0);
1467       __ jcc(Assembler::lessEqual, L_exit);
1468 
1469       // Special cases using 32 byte [masked] vector copy operations.
1470       __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1471                                       temp4, temp3, use64byteVector, L_entry, L_exit);
1472 
1473       // PRE-MAIN-POST loop for aligned copy.
1474       __ BIND(L_entry);
1475 
1476       if (avx3threshold != 0) {
1477         __ cmpq(count, threshold[shift]);
1478         if (MaxVectorSize == 64) {
1479           // Copy using 64 byte vectors.
1480           __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1481         } else {
1482           assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
1483           // REP MOVS offers a faster copy path.
1484           __ jcc(Assembler::greaterEqual, L_repmovs);
1485         }
1486       }
1487 
1488       if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
1489         // Partial copy to make dst address 32 byte aligned.
1490         __ movq(temp2, to);
1491         __ andq(temp2, 31);
1492         __ jcc(Assembler::equal, L_main_pre_loop);
1493 
1494         __ negptr(temp2);
1495         __ addq(temp2, 32);
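             // temp2 now holds the byte distance from 'to' up to the next 32 byte
             // boundary; the shift below converts it into an element count.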
1496         if (shift) {
1497           __ shrq(temp2, shift);
1498         }
1499         __ movq(temp3, temp2);
1500         __ copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
1501         __ movq(temp4, temp2);
1502         __ movq(temp1, count);
1503         __ subq(temp1, temp2);
1504 
1505         __ cmpq(temp1, loop_size[shift]);
1506         __ jcc(Assembler::less, L_tail);
1507 
1508         __ BIND(L_main_pre_loop);
1509         __ subq(temp1, loop_size[shift]);
1510 
1511         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1512         __ align32();
1513         __ BIND(L_main_loop);
1514            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
1515            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
1516            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128);
1517            __ addptr(temp4, loop_size[shift]);
1518            __ subq(temp1, loop_size[shift]);
1519            __ jcc(Assembler::greater, L_main_loop);
1520 
1521         __ addq(temp1, loop_size[shift]);
1522 
1523         // Tail loop.
1524         __ jmp(L_tail);
1525 
1526         __ BIND(L_repmovs);
1527           __ movq(temp2, temp1);
1528           // Swap 'to' (RSI) and 'from' (RDI) so the source is in RSI and the destination in RDI, as REP MOVS expects.
1529           __ movq(temp3, to);
1530           __ movq(to,  from);
1531           __ movq(from, temp3);
1532           // Save to/from for restoration post rep_mov.
1533           __ movq(temp1, to);
1534           __ movq(temp3, from);
1535           if (shift < 3) {
1536             __ shrq(temp2, 3-shift);     // quad word count
1537           }
1538           __ movq(temp4, temp2);         // move quad word count into temp4 (RCX).
1539           __ rep_mov();
1540           __ shlq(temp2, 3);             // convert quad words into byte count.
1541           if (shift) {
1542             __ shrq(temp2, shift);       // type specific count.
1543           }
1544           // Restore original addresses in to/from.
1545           __ movq(to, temp3);
1546           __ movq(from, temp1);
1547           __ movq(temp4, temp2);
1548           __ movq(temp1, count);
1549           __ subq(temp1, temp2);         // trailing part (less than a quad word in size).
1550           __ jmp(L_tail);
1551       }
1552 
1553       if (MaxVectorSize > 32) {
1554         __ BIND(L_pre_main_post_64);
1555         // Partial copy to make dst address 64 byte aligned.
1556         __ movq(temp2, to);
1557         __ andq(temp2, 63);
1558         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1559 
1560         __ negptr(temp2);
1561         __ addq(temp2, 64);
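             // temp2 now holds the byte distance from 'to' up to the next 64 byte
             // boundary; the shift below converts it into an element count.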
1562         if (shift) {
1563           __ shrq(temp2, shift);
1564         }
1565         __ movq(temp3, temp2);
1566         __ copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
1567         __ movq(temp4, temp2);
1568         __ movq(temp1, count);
1569         __ subq(temp1, temp2);
1570 
1571         __ cmpq(temp1, loop_size[shift]);
1572         __ jcc(Assembler::less, L_tail64);
1573 
1574         __ BIND(L_main_pre_loop_64bytes);
1575         __ subq(temp1, loop_size[shift]);
1576 
1577         // Main loop with aligned copy block size of 192 bytes at
1578         // 64 byte copy granularity.
1579         __ align32();
1580         __ BIND(L_main_loop_64bytes);
1581            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
1582            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
1583            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
1584            __ addptr(temp4, loop_size[shift]);
1585            __ subq(temp1, loop_size[shift]);
1586            __ jcc(Assembler::greater, L_main_loop_64bytes);
1587 
1588         __ addq(temp1, loop_size[shift]);
1589         // Zero length check.
1590         __ jcc(Assembler::lessEqual, L_exit);
1591 
1592         __ BIND(L_tail64);
1593 
1594         // Tail handling using 64 byte [masked] vector copy operations.
1595         use64byteVector = true;
1596         __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1597                                         temp4, temp3, use64byteVector, L_entry, L_exit);
1598       }
1599       __ BIND(L_exit);
1600     }
1601 
1602     address ucme_exit_pc = __ pc();
1603     // When called from generic_arraycopy, r11 contains specific values
1604     // used during the arraycopy epilogue; re-initialize r11 here.
1605     if (is_oop) {
1606       __ movq(r11, shift == 3 ? count : to);
1607     }
1608     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1609     restore_argument_regs(type);
1610     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1611     __ xorptr(rax, rax); // return 0
1612     __ vzeroupper();
1613     __ leave(); // required for proper stackwalking of RuntimeStub frame
1614     __ ret(0);
1615     return start;
1616   }
1617 
1618   // Inputs:
1619   //   c_rarg0   - source array address
1620   //   c_rarg1   - destination array address
1621   //   c_rarg2   - element count, treated as ssize_t, can be zero
1622   //
1623   //
1624   address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1625                                              address nooverlap_target, bool aligned, bool is_oop,
1626                                              bool dest_uninitialized) {
1627     __ align(CodeEntryAlignment);
1628     StubCodeMark mark(this, "StubRoutines", name);
1629     address start = __ pc();
1630 
1631     int avx3threshold = VM_Version::avx3_threshold();
1632     bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
1633 
1634     Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1635     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1636     const Register from        = rdi;  // source array address
1637     const Register to          = rsi;  // destination array address
1638     const Register count       = rdx;  // elements count
1639     const Register temp1       = r8;
1640     const Register temp2       = rcx;
1641     const Register temp3       = r11;
1642     const Register temp4       = rax;
1643     // End pointers are inclusive, and if count is not zero they point
1644     // to the last unit copied:  end_to[0] := end_from[0]
1645 
1646     __ enter(); // required for proper stackwalking of RuntimeStub frame
1647     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1648 
1649     if (entry != NULL) {
1650       *entry = __ pc();
1651        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1652       BLOCK_COMMENT("Entry:");
1653     }
1654 
1655     array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
1656 
1657     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1658     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1659 
1660     setup_argument_regs(type);
1661 
1662     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1663     if (dest_uninitialized) {
1664       decorators |= IS_DEST_UNINITIALIZED;
1665     }
1666     if (aligned) {
1667       decorators |= ARRAYCOPY_ALIGNED;
1668     }
1669     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1670     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1671     {
1672       // Type(shift)       byte(0), short(1), int(2),   long(3)
1673       int loop_size[]   = { 192,     96,       48,      24};
1674       int threshold[]   = { 4096,    2048,     1024,    512};
1675 
1676       // UnsafeCopyMemory page error: continue after ucm
1677       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1678       // 'from', 'to' and 'count' are now valid
1679 
1680       // temp1 holds remaining count.
1681       __ movq(temp1, count);
1682 
1683       // Zero length check.
1684       __ BIND(L_tail);
1685       __ cmpq(temp1, 0);
1686       __ jcc(Assembler::lessEqual, L_exit);
1687 
1688       __ mov64(temp2, 0);
1689       __ movq(temp3, temp1);
1690       // Special cases using 32 byte [masked] vector copy operations.
1691       __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1692                                                temp4, use64byteVector, L_entry, L_exit);
1693 
1694       // PRE-MAIN-POST loop for aligned copy.
1695       __ BIND(L_entry);
1696 
1697       if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
1698         __ cmpq(temp1, threshold[shift]);
1699         __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1700       }
1701 
1702       if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
1703         // Partial copy to make dst address 32 byte aligned.
1704         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
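             // temp2 = one-past-the-end destination address; masking with 31 below
             // yields the tail's misalignment relative to a 32 byte boundary.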
1705         __ andq(temp2, 31);
1706         __ jcc(Assembler::equal, L_main_pre_loop);
1707 
1708         if (shift) {
1709           __ shrq(temp2, shift);
1710         }
1711         __ subq(temp1, temp2);
1712         __ copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
1713 
1714         __ cmpq(temp1, loop_size[shift]);
1715         __ jcc(Assembler::less, L_tail);
1716 
1717         __ BIND(L_main_pre_loop);
1718 
1719         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1720         __ align32();
1721         __ BIND(L_main_loop);
1722            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
1723            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
1724            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192);
1725            __ subptr(temp1, loop_size[shift]);
1726            __ cmpq(temp1, loop_size[shift]);
1727            __ jcc(Assembler::greater, L_main_loop);
1728 
1729         // Tail loop.
1730         __ jmp(L_tail);
1731       }
1732 
1733       if (MaxVectorSize > 32) {
1734         __ BIND(L_pre_main_post_64);
1735         // Partial copy to make dst address 64 byte aligned.
1736         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
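             // temp2 = one-past-the-end destination address; masking with 63 below
             // yields the tail's misalignment relative to a 64 byte boundary.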
1737         __ andq(temp2, 63);
1738         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1739 
1740         if (shift) {
1741           __ shrq(temp2, shift);
1742         }
1743         __ subq(temp1, temp2);
1744         __ copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
1745 
1746         __ cmpq(temp1, loop_size[shift]);
1747         __ jcc(Assembler::less, L_tail64);
1748 
1749         __ BIND(L_main_pre_loop_64bytes);
1750 
1751         // Main loop with aligned copy block size of 192 bytes at
1752         // 64 byte copy granularity.
1753         __ align32();
1754         __ BIND(L_main_loop_64bytes);
1755            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
1756            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
1757            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
1758            __ subq(temp1, loop_size[shift]);
1759            __ cmpq(temp1, loop_size[shift]);
1760            __ jcc(Assembler::greater, L_main_loop_64bytes);
1761 
1762         // Zero length check.
1763         __ cmpq(temp1, 0);
1764         __ jcc(Assembler::lessEqual, L_exit);
1765 
1766         __ BIND(L_tail64);
1767 
1768         // Tail handling using 64 byte [masked] vector copy operations.
1769         use64byteVector = true;
1770         __ mov64(temp2, 0);
1771         __ movq(temp3, temp1);
1772         __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1773                                                  temp4, use64byteVector, L_entry, L_exit);
1774       }
1775       __ BIND(L_exit);
1776     }
1777     address ucme_exit_pc = __ pc();
1778     // When called from generic_arraycopy, r11 contains specific values
1779     // used during the arraycopy epilogue; re-initialize r11 here.
1780     if (is_oop) {
1781       __ movq(r11, count);
1782     }
1783     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1784     restore_argument_regs(type);
1785     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1786     __ xorptr(rax, rax); // return 0
1787     __ vzeroupper();
1788     __ leave(); // required for proper stackwalking of RuntimeStub frame
1789     __ ret(0);
1790     return start;
1791   }
1792 #endif // COMPILER2_OR_JVMCI
1793 
1794 
1795   // Arguments:
1796   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1797   //             ignored
1798   //   name    - stub name string
1799   //
1800   // Inputs:
1801   //   c_rarg0   - source array address
1802   //   c_rarg1   - destination array address
1803   //   c_rarg2   - element count, treated as ssize_t, can be zero
1804   //
1805   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1806   // we let the hardware handle it.  The one to eight bytes within words,
1807   // dwords or qwords that span cache line boundaries will still be loaded
1808   // and stored atomically.
1809   //
1810   // Side Effects:
1811   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1812   //   used by generate_conjoint_byte_copy().
1813   //
1814   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1815 #if COMPILER2_OR_JVMCI
1816     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1817        return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
1818                                                  aligned, false, false);
1819     }
1820 #endif
1821     __ align(CodeEntryAlignment);
1822     StubCodeMark mark(this, "StubRoutines", name);
1823     address start = __ pc();
1824 
1825     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1826     Label L_copy_byte, L_exit;
1827     const Register from        = rdi;  // source array address
1828     const Register to          = rsi;  // destination array address
1829     const Register count       = rdx;  // elements count
1830     const Register byte_count  = rcx;
1831     const Register qword_count = count;
1832     const Register end_from    = from; // source array end address
1833     const Register end_to      = to;   // destination array end address
1834     // End pointers are inclusive, and if count is not zero they point
1835     // to the last unit copied:  end_to[0] := end_from[0]
1836 
1837     __ enter(); // required for proper stackwalking of RuntimeStub frame
1838     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1839 
1840     if (entry != NULL) {
1841       *entry = __ pc();
1842        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1843       BLOCK_COMMENT("Entry:");
1844     }
1845 
1846     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1847                       // r9 and r10 may be used to save non-volatile registers
1848 
1849     {
1850       // UnsafeCopyMemory page error: continue after ucm
1851       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1852       // 'from', 'to' and 'count' are now valid
1853       __ movptr(byte_count, count);
1854       __ shrptr(count, 3); // count => qword_count
1855 
1856       // Copy from low to high addresses.  Use 'to' as scratch.
1857       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1858       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1859       __ negptr(qword_count); // make the count negative
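           // With qword_count negative, the loops below index as
           // Address(end_from/end_to, qword_count, times_8, ...) and walk the arrays
           // forward, stopping when the incremented count reaches zero.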
1860       __ jmp(L_copy_bytes);
1861 
1862       // Copy trailing qwords
1863     __ BIND(L_copy_8_bytes);
1864       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1865       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1866       __ increment(qword_count);
1867       __ jcc(Assembler::notZero, L_copy_8_bytes);
1868 
1869       // Check for and copy trailing dword
1870     __ BIND(L_copy_4_bytes);
1871       __ testl(byte_count, 4);
1872       __ jccb(Assembler::zero, L_copy_2_bytes);
1873       __ movl(rax, Address(end_from, 8));
1874       __ movl(Address(end_to, 8), rax);
1875 
1876       __ addptr(end_from, 4);
1877       __ addptr(end_to, 4);
1878 
1879       // Check for and copy trailing word
1880     __ BIND(L_copy_2_bytes);
1881       __ testl(byte_count, 2);
1882       __ jccb(Assembler::zero, L_copy_byte);
1883       __ movw(rax, Address(end_from, 8));
1884       __ movw(Address(end_to, 8), rax);
1885 
1886       __ addptr(end_from, 2);
1887       __ addptr(end_to, 2);
1888 
1889       // Check for and copy trailing byte
1890     __ BIND(L_copy_byte);
1891       __ testl(byte_count, 1);
1892       __ jccb(Assembler::zero, L_exit);
1893       __ movb(rax, Address(end_from, 8));
1894       __ movb(Address(end_to, 8), rax);
1895     }
1896   __ BIND(L_exit);
1897     address ucme_exit_pc = __ pc();
1898     restore_arg_regs();
1899     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1900     __ xorptr(rax, rax); // return 0
1901     __ vzeroupper();
1902     __ leave(); // required for proper stackwalking of RuntimeStub frame
1903     __ ret(0);
1904 
1905     {
1906       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
1907       // Copy in multi-byte chunks
1908       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1909       __ jmp(L_copy_4_bytes);
1910     }
1911     return start;
1912   }
1913 
1914   // Arguments:
1915   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1916   //             ignored
1917   //   name    - stub name string
1918   //
1919   // Inputs:
1920   //   c_rarg0   - source array address
1921   //   c_rarg1   - destination array address
1922   //   c_rarg2   - element count, treated as ssize_t, can be zero
1923   //
1924   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1925   // we let the hardware handle it.  The one to eight bytes within words,
1926   // dwords or qwords that span cache line boundaries will still be loaded
1927   // and stored atomically.
1928   //
1929   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1930                                       address* entry, const char *name) {
1931 #if COMPILER2_OR_JVMCI
1932     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1933        return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
1934                                                  nooverlap_target, aligned, false, false);
1935     }
1936 #endif
1937     __ align(CodeEntryAlignment);
1938     StubCodeMark mark(this, "StubRoutines", name);
1939     address start = __ pc();
1940 
1941     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1942     const Register from        = rdi;  // source array address
1943     const Register to          = rsi;  // destination array address
1944     const Register count       = rdx;  // elements count
1945     const Register byte_count  = rcx;
1946     const Register qword_count = count;
1947 
1948     __ enter(); // required for proper stackwalking of RuntimeStub frame
1949     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1950 
1951     if (entry != NULL) {
1952       *entry = __ pc();
1953       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1954       BLOCK_COMMENT("Entry:");
1955     }
1956 
1957     array_overlap_test(nooverlap_target, Address::times_1);
1958     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1959                       // r9 and r10 may be used to save non-volatile registers
1960 
1961     {
1962       // UnsafeCopyMemory page error: continue after ucm
1963       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1964       // 'from', 'to' and 'count' are now valid
1965       __ movptr(byte_count, count);
1966       __ shrptr(count, 3);   // count => qword_count
1967 
1968       // Copy from high to low addresses.
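           // The sub-qword tail at the high end is copied first; the remaining
           // qwords are then copied downward in copy_bytes_backward.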
1969 
1970       // Check for and copy trailing byte
1971       __ testl(byte_count, 1);
1972       __ jcc(Assembler::zero, L_copy_2_bytes);
1973       __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1974       __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1975       __ decrement(byte_count); // Adjust for possible trailing word
1976 
1977       // Check for and copy trailing word
1978     __ BIND(L_copy_2_bytes);
1979       __ testl(byte_count, 2);
1980       __ jcc(Assembler::zero, L_copy_4_bytes);
1981       __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1982       __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1983 
1984       // Check for and copy trailing dword
1985     __ BIND(L_copy_4_bytes);
1986       __ testl(byte_count, 4);
1987       __ jcc(Assembler::zero, L_copy_bytes);
1988       __ movl(rax, Address(from, qword_count, Address::times_8));
1989       __ movl(Address(to, qword_count, Address::times_8), rax);
1990       __ jmp(L_copy_bytes);
1991 
1992       // Copy trailing qwords
1993     __ BIND(L_copy_8_bytes);
1994       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1995       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1996       __ decrement(qword_count);
1997       __ jcc(Assembler::notZero, L_copy_8_bytes);
1998     }
1999     restore_arg_regs();
2000     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
2001     __ xorptr(rax, rax); // return 0
2002     __ vzeroupper();
2003     __ leave(); // required for proper stackwalking of RuntimeStub frame
2004     __ ret(0);
2005 
2006     {
2007       // UnsafeCopyMemory page error: continue after ucm
2008       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2009       // Copy in multi-byte chunks
2010       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2011     }
2012     restore_arg_regs();
2013     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
2014     __ xorptr(rax, rax); // return 0
2015     __ vzeroupper();
2016     __ leave(); // required for proper stackwalking of RuntimeStub frame
2017     __ ret(0);
2018 
2019     return start;
2020   }
2021 
2022   // Arguments:
2023   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2024   //             ignored
2025   //   name    - stub name string
2026   //
2027   // Inputs:
2028   //   c_rarg0   - source array address
2029   //   c_rarg1   - destination array address
2030   //   c_rarg2   - element count, treated as ssize_t, can be zero
2031   //
2032   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2033   // let the hardware handle it.  The two or four words within dwords
2034   // or qwords that span cache line boundaries will still be loaded
2035   // and stored atomically.
2036   //
2037   // Side Effects:
2038   //   disjoint_short_copy_entry is set to the no-overlap entry point
2039   //   used by generate_conjoint_short_copy().
2040   //
2041   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
2042 #if COMPILER2_OR_JVMCI
2043     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2044        return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
2045                                                  aligned, false, false);
2046     }
2047 #endif
2048 
2049     __ align(CodeEntryAlignment);
2050     StubCodeMark mark(this, "StubRoutines", name);
2051     address start = __ pc();
2052 
2053     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
2054     const Register from        = rdi;  // source array address
2055     const Register to          = rsi;  // destination array address
2056     const Register count       = rdx;  // elements count
2057     const Register word_count  = rcx;
2058     const Register qword_count = count;
2059     const Register end_from    = from; // source array end address
2060     const Register end_to      = to;   // destination array end address
2061     // End pointers are inclusive, and if count is not zero they point
2062     // to the last unit copied:  end_to[0] := end_from[0]
2063 
2064     __ enter(); // required for proper stackwalking of RuntimeStub frame
2065     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2066 
2067     if (entry != NULL) {
2068       *entry = __ pc();
2069       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2070       BLOCK_COMMENT("Entry:");
2071     }
2072 
2073     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2074                       // r9 and r10 may be used to save non-volatile registers
2075 
2076     {
2077       // UnsafeCopyMemory page error: continue after ucm
2078       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2079       // 'from', 'to' and 'count' are now valid
2080       __ movptr(word_count, count);
2081       __ shrptr(count, 2); // count => qword_count
2082 
2083       // Copy from low to high addresses.  Use 'to' as scratch.
2084       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2085       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2086       __ negptr(qword_count);
2087       __ jmp(L_copy_bytes);
2088 
2089       // Copy trailing qwords
2090     __ BIND(L_copy_8_bytes);
2091       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2092       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2093       __ increment(qword_count);
2094       __ jcc(Assembler::notZero, L_copy_8_bytes);
2095 
2096       // Original 'dest' is trashed, so we can't use it as a
2097       // base register for a possible trailing word copy
2098 
2099       // Check for and copy trailing dword
2100     __ BIND(L_copy_4_bytes);
2101       __ testl(word_count, 2);
2102       __ jccb(Assembler::zero, L_copy_2_bytes);
2103       __ movl(rax, Address(end_from, 8));
2104       __ movl(Address(end_to, 8), rax);
2105 
2106       __ addptr(end_from, 4);
2107       __ addptr(end_to, 4);
2108 
2109       // Check for and copy trailing word
2110     __ BIND(L_copy_2_bytes);
2111       __ testl(word_count, 1);
2112       __ jccb(Assembler::zero, L_exit);
2113       __ movw(rax, Address(end_from, 8));
2114       __ movw(Address(end_to, 8), rax);
2115     }
2116   __ BIND(L_exit);
2117     address ucme_exit_pc = __ pc();
2118     restore_arg_regs();
2119     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2120     __ xorptr(rax, rax); // return 0
2121     __ vzeroupper();
2122     __ leave(); // required for proper stackwalking of RuntimeStub frame
2123     __ ret(0);
2124 
2125     {
2126       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
2127       // Copy in multi-byte chunks
2128       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2129       __ jmp(L_copy_4_bytes);
2130     }
2131 
2132     return start;
2133   }
2134 
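       // Arguments:
       //   t       - element type
       //   aligned - alignment guarantee for the destination
       //   name    - stub name string
       //
       // Inputs:
       //   c_rarg0   - destination array address
       //   c_rarg1   - fill value
       //   c_rarg2   - element count
       //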
2135   address generate_fill(BasicType t, bool aligned, const char *name) {
2136     __ align(CodeEntryAlignment);
2137     StubCodeMark mark(this, "StubRoutines", name);
2138     address start = __ pc();
2139 
2140     BLOCK_COMMENT("Entry:");
2141 
2142     const Register to       = c_rarg0;  // destination array address
2143     const Register value    = c_rarg1;  // value
2144     const Register count    = c_rarg2;  // elements count
2145     __ mov(r11, count);
2146 
2147     __ enter(); // required for proper stackwalking of RuntimeStub frame
2148 
2149     __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
2150 
2151     __ vzeroupper();
2152     __ leave(); // required for proper stackwalking of RuntimeStub frame
2153     __ ret(0);
2154     return start;
2155   }
2156 
2157   // Arguments:
2158   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2159   //             ignored
2160   //   name    - stub name string
2161   //
2162   // Inputs:
2163   //   c_rarg0   - source array address
2164   //   c_rarg1   - destination array address
2165   //   c_rarg2   - element count, treated as ssize_t, can be zero
2166   //
2167   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2168   // let the hardware handle it.  The two or four words within dwords
2169   // or qwords that span cache line boundaries will still be loaded
2170   // and stored atomically.
2171   //
2172   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
2173                                        address *entry, const char *name) {
2174 #if COMPILER2_OR_JVMCI
2175     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2176        return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
2177                                                  nooverlap_target, aligned, false, false);
2178     }
2179 #endif
2180     __ align(CodeEntryAlignment);
2181     StubCodeMark mark(this, "StubRoutines", name);
2182     address start = __ pc();
2183 
2184     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2185     const Register from        = rdi;  // source array address
2186     const Register to          = rsi;  // destination array address
2187     const Register count       = rdx;  // elements count
2188     const Register word_count  = rcx;
2189     const Register qword_count = count;
2190 
2191     __ enter(); // required for proper stackwalking of RuntimeStub frame
2192     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2193 
2194     if (entry != NULL) {
2195       *entry = __ pc();
2196       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2197       BLOCK_COMMENT("Entry:");
2198     }
2199 
2200     array_overlap_test(nooverlap_target, Address::times_2);
2201     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2202                       // r9 and r10 may be used to save non-volatile registers
2203 
2204     {
2205       // UnsafeCopyMemory page error: continue after ucm
2206       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2207       // 'from', 'to' and 'count' are now valid
2208       __ movptr(word_count, count);
2209       __ shrptr(count, 2); // count => qword_count
2210 
2211       // Copy from high to low addresses.  Use 'to' as scratch.
2212 
2213       // Check for and copy trailing word
2214       __ testl(word_count, 1);
2215       __ jccb(Assembler::zero, L_copy_4_bytes);
2216       __ movw(rax, Address(from, word_count, Address::times_2, -2));
2217       __ movw(Address(to, word_count, Address::times_2, -2), rax);
2218 
2219       // Check for and copy trailing dword
2220     __ BIND(L_copy_4_bytes);
2221       __ testl(word_count, 2);
2222       __ jcc(Assembler::zero, L_copy_bytes);
2223       __ movl(rax, Address(from, qword_count, Address::times_8));
2224       __ movl(Address(to, qword_count, Address::times_8), rax);
2225       __ jmp(L_copy_bytes);
2226 
2227       // Copy trailing qwords
2228     __ BIND(L_copy_8_bytes);
2229       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2230       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2231       __ decrement(qword_count);
2232       __ jcc(Assembler::notZero, L_copy_8_bytes);
2233     }
2234     restore_arg_regs();
2235     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2236     __ xorptr(rax, rax); // return 0
2237     __ vzeroupper();
2238     __ leave(); // required for proper stackwalking of RuntimeStub frame
2239     __ ret(0);
2240 
2241     {
2242       // UnsafeCopyMemory page error: continue after ucm
2243       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2244       // Copy in multi-byte chunks
2245       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2246     }
2247     restore_arg_regs();
2248     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2249     __ xorptr(rax, rax); // return 0
2250     __ vzeroupper();
2251     __ leave(); // required for proper stackwalking of RuntimeStub frame
2252     __ ret(0);
2253 
2254     return start;
2255   }
2256 
2257   // Arguments:
2258   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2259   //             ignored
2260   //   is_oop  - true => oop array, so generate store check code
2261   //   name    - stub name string
2262   //
2263   // Inputs:
2264   //   c_rarg0   - source array address
2265   //   c_rarg1   - destination array address
2266   //   c_rarg2   - element count, treated as ssize_t, can be zero
2267   //
2268   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2269   // the hardware handle it.  The two dwords within qwords that span
2270   // cache line boundaries will still be loaded and stored atomically.
2271   //
2272   // Side Effects:
2273   //   disjoint_int_copy_entry is set to the no-overlap entry point
2274   //   used by generate_conjoint_int_oop_copy().
2275   //
2276   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2277                                          const char *name, bool dest_uninitialized = false) {
2278 #if COMPILER2_OR_JVMCI
2279     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2280        return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
2281                                                  aligned, is_oop, dest_uninitialized);
2282     }
2283 #endif
2284 
2285     __ align(CodeEntryAlignment);
2286     StubCodeMark mark(this, "StubRoutines", name);
2287     address start = __ pc();
2288 
2289     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2290     const Register from        = rdi;  // source array address
2291     const Register to          = rsi;  // destination array address
2292     const Register count       = rdx;  // elements count
2293     const Register dword_count = rcx;
2294     const Register qword_count = count;
2295     const Register end_from    = from; // source array end address
2296     const Register end_to      = to;   // destination array end address
2297     // End pointers are inclusive, and if count is not zero they point
2298     // to the last unit copied:  end_to[0] := end_from[0]
2299 
2300     __ enter(); // required for proper stackwalking of RuntimeStub frame
2301     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2302 
2303     if (entry != NULL) {
2304       *entry = __ pc();
2305       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2306       BLOCK_COMMENT("Entry:");
2307     }
2308 
2309     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2310                                    // r9 is used to save r15_thread
2311 
2312     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2313     if (dest_uninitialized) {
2314       decorators |= IS_DEST_UNINITIALIZED;
2315     }
2316     if (aligned) {
2317       decorators |= ARRAYCOPY_ALIGNED;
2318     }
2319 
2320     BasicType type = is_oop ? T_OBJECT : T_INT;
2321     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2322     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2323 
2324     {
2325       // UnsafeCopyMemory page error: continue after ucm
2326       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2327       // 'from', 'to' and 'count' are now valid
2328       __ movptr(dword_count, count);
2329       __ shrptr(count, 1); // count => qword_count
2330 
2331       // Copy from low to high addresses.  Use 'to' as scratch.
2332       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2333       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2334       __ negptr(qword_count);
2335       __ jmp(L_copy_bytes);
2336 
2337       // Copy trailing qwords
2338     __ BIND(L_copy_8_bytes);
2339       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2340       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2341       __ increment(qword_count);
2342       __ jcc(Assembler::notZero, L_copy_8_bytes);
2343 
2344       // Check for and copy trailing dword
2345     __ BIND(L_copy_4_bytes);
2346       __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2347       __ jccb(Assembler::zero, L_exit);
2348       __ movl(rax, Address(end_from, 8));
2349       __ movl(Address(end_to, 8), rax);
2350     }
2351   __ BIND(L_exit);
2352     address ucme_exit_pc = __ pc();
2353     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2354     restore_arg_regs_using_thread();
2355     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2356     __ vzeroupper();
2357     __ xorptr(rax, rax); // return 0
2358     __ leave(); // required for proper stackwalking of RuntimeStub frame
2359     __ ret(0);
2360 
2361     {
2362       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
2363       // Copy in multi-byte chunks
2364       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2365       __ jmp(L_copy_4_bytes);
2366     }
2367 
2368     return start;
2369   }
2370 
2371   // Arguments:
2372   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2373   //             ignored
2374   //   is_oop  - true => oop array, so generate store check code
2375   //   name    - stub name string
2376   //
2377   // Inputs:
2378   //   c_rarg0   - source array address
2379   //   c_rarg1   - destination array address
2380   //   c_rarg2   - element count, treated as ssize_t, can be zero
2381   //
2382   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2383   // the hardware handle it.  The two dwords within qwords that span
2384   // cache line boundaries will still be loaded and stored atomically.
2385   //
2386   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2387                                          address *entry, const char *name,
2388                                          bool dest_uninitialized = false) {
2389 #if COMPILER2_OR_JVMCI
2390     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2391        return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
2392                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
2393     }
2394 #endif
2395     __ align(CodeEntryAlignment);
2396     StubCodeMark mark(this, "StubRoutines", name);
2397     address start = __ pc();
2398 
2399     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2400     const Register from        = rdi;  // source array address
2401     const Register to          = rsi;  // destination array address
2402     const Register count       = rdx;  // elements count
2403     const Register dword_count = rcx;
2404     const Register qword_count = count;
2405 
2406     __ enter(); // required for proper stackwalking of RuntimeStub frame
2407     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2408 
2409     if (entry != NULL) {
2410       *entry = __ pc();
2411        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2412       BLOCK_COMMENT("Entry:");
2413     }
2414 
2415     array_overlap_test(nooverlap_target, Address::times_4);
2416     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2417                                    // r9 is used to save r15_thread
2418 
2419     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2420     if (dest_uninitialized) {
2421       decorators |= IS_DEST_UNINITIALIZED;
2422     }
2423     if (aligned) {
2424       decorators |= ARRAYCOPY_ALIGNED;
2425     }
2426 
2427     BasicType type = is_oop ? T_OBJECT : T_INT;
2428     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2429     // no registers are destroyed by this call
2430     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2431 
2432     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2433     {
2434       // UnsafeCopyMemory page error: continue after ucm
2435       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2436       // 'from', 'to' and 'count' are now valid
2437       __ movptr(dword_count, count);
2438       __ shrptr(count, 1); // count => qword_count
2439 
2440       // Copy from high to low addresses.  Use 'to' as scratch.
2441 
2442       // Check for and copy trailing dword
2443       __ testl(dword_count, 1);
2444       __ jcc(Assembler::zero, L_copy_bytes);
2445       __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2446       __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2447       __ jmp(L_copy_bytes);
2448 
2449       // Copy trailing qwords
2450     __ BIND(L_copy_8_bytes);
2451       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2452       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2453       __ decrement(qword_count);
2454       __ jcc(Assembler::notZero, L_copy_8_bytes);
2455     }
2456     if (is_oop) {
2457       __ jmp(L_exit);
2458     }
2459     restore_arg_regs_using_thread();
2460     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2461     __ xorptr(rax, rax); // return 0
2462     __ vzeroupper();
2463     __ leave(); // required for proper stackwalking of RuntimeStub frame
2464     __ ret(0);
2465 
2466     {
2467       // UnsafeCopyMemory page error: continue after ucm
2468       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2469       // Copy in multi-byte chunks
2470       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2471     }
2472 
2473   __ BIND(L_exit);
2474     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2475     restore_arg_regs_using_thread();
2476     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2477     __ xorptr(rax, rax); // return 0
2478     __ vzeroupper();
2479     __ leave(); // required for proper stackwalking of RuntimeStub frame
2480     __ ret(0);
2481 
2482     return start;
2483   }
2484 
2485   // Arguments:
2486   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2487   //             ignored
2488   //   is_oop  - true => oop array, so generate store check code
2489   //   name    - stub name string
2490   //
2491   // Inputs:
2492   //   c_rarg0   - source array address
2493   //   c_rarg1   - destination array address
2494   //   c_rarg2   - element count, treated as ssize_t, can be zero
2495   //
2496   // Side Effects:
2497   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2498   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2499   //
2500   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2501                                           const char *name, bool dest_uninitialized = false) {
2502 #if COMPILER2_OR_JVMCI
2503     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2504        return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
2505                                                  aligned, is_oop, dest_uninitialized);
2506     }
2507 #endif
2508     __ align(CodeEntryAlignment);
2509     StubCodeMark mark(this, "StubRoutines", name);
2510     address start = __ pc();
2511 
2512     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2513     const Register from        = rdi;  // source array address
2514     const Register to          = rsi;  // destination array address
2515     const Register qword_count = rdx;  // elements count
2516     const Register end_from    = from; // source array end address
2517     const Register end_to      = rcx;  // destination array end address
2518     const Register saved_count = r11;
2519     // End pointers are inclusive, and if count is not zero they point
2520     // to the last unit copied:  end_to[0] := end_from[0]
2521 
2522     __ enter(); // required for proper stackwalking of RuntimeStub frame
2523     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2524     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2525 
2526     if (entry != NULL) {
2527       *entry = __ pc();
2528       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2529       BLOCK_COMMENT("Entry:");
2530     }
2531 
2532     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2533                                      // r9 is used to save r15_thread
2534     // 'from', 'to' and 'qword_count' are now valid
2535 
2536     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2537     if (dest_uninitialized) {
2538       decorators |= IS_DEST_UNINITIALIZED;
2539     }
2540     if (aligned) {
2541       decorators |= ARRAYCOPY_ALIGNED;
2542     }
2543 
2544     BasicType type = is_oop ? T_OBJECT : T_LONG;
2545     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2546     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2547     {
2548       // UnsafeCopyMemory page error: continue after ucm
2549       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2550 
2551       // Copy from low to high addresses.  Use 'to' as scratch.
2552       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2553       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2554       __ negptr(qword_count);
2555       __ jmp(L_copy_bytes);
2556 
2557       // Copy trailing qwords
2558     __ BIND(L_copy_8_bytes);
2559       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2560       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2561       __ increment(qword_count);
2562       __ jcc(Assembler::notZero, L_copy_8_bytes);
2563     }
2564     if (is_oop) {
2565       __ jmp(L_exit);
2566     } else {
2567       restore_arg_regs_using_thread();
2568       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2569       __ xorptr(rax, rax); // return 0
2570       __ vzeroupper();
2571       __ leave(); // required for proper stackwalking of RuntimeStub frame
2572       __ ret(0);
2573     }
2574 
2575     {
2576       // UnsafeCopyMemory page error: continue after ucm
2577       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2578       // Copy in multi-byte chunks
2579       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2580     }
2581 
2582     __ BIND(L_exit);
2583     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2584     restore_arg_regs_using_thread();
2585     if (is_oop) {
2586       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2587     } else {
2588       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2589     }
2590     __ vzeroupper();
2591     __ xorptr(rax, rax); // return 0
2592     __ leave(); // required for proper stackwalking of RuntimeStub frame
2593     __ ret(0);
2594 
2595     return start;
2596   }
2597 
2598   // Arguments:
2599   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2600   //             ignored
2601   //   is_oop  - true => oop array, so generate store check code
2602   //   name    - stub name string
2603   //
2604   // Inputs:
2605   //   c_rarg0   - source array address
2606   //   c_rarg1   - destination array address
2607   //   c_rarg2   - element count, treated as ssize_t, can be zero
2608   //
2609   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2610                                           address nooverlap_target, address *entry,
2611                                           const char *name, bool dest_uninitialized = false) {
2612 #if COMPILER2_OR_JVMCI
2613     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2614        return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
2615                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
2616     }
2617 #endif
2618     __ align(CodeEntryAlignment);
2619     StubCodeMark mark(this, "StubRoutines", name);
2620     address start = __ pc();
2621 
2622     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2623     const Register from        = rdi;  // source array address
2624     const Register to          = rsi;  // destination array address
2625     const Register qword_count = rdx;  // elements count
2626     const Register saved_count = rcx;
2627 
2628     __ enter(); // required for proper stackwalking of RuntimeStub frame
2629     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2630 
2631     if (entry != NULL) {
2632       *entry = __ pc();
2633       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2634       BLOCK_COMMENT("Entry:");
2635     }
2636 
2637     array_overlap_test(nooverlap_target, Address::times_8);
2638     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2639                                    // r9 is used to save r15_thread
2640     // 'from', 'to' and 'qword_count' are now valid
2641 
2642     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2643     if (dest_uninitialized) {
2644       decorators |= IS_DEST_UNINITIALIZED;
2645     }
2646     if (aligned) {
2647       decorators |= ARRAYCOPY_ALIGNED;
2648     }
2649 
2650     BasicType type = is_oop ? T_OBJECT : T_LONG;
2651     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2652     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2653     {
2654       // UnsafeCopyMemory page error: continue after ucm
2655       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2656 
2657       __ jmp(L_copy_bytes);
2658 
2659       // Copy trailing qwords
2660     __ BIND(L_copy_8_bytes);
2661       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2662       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2663       __ decrement(qword_count);
2664       __ jcc(Assembler::notZero, L_copy_8_bytes);
2665     }
2666     if (is_oop) {
2667       __ jmp(L_exit);
2668     } else {
2669       restore_arg_regs_using_thread();
2670       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2671       __ xorptr(rax, rax); // return 0
2672       __ vzeroupper();
2673       __ leave(); // required for proper stackwalking of RuntimeStub frame
2674       __ ret(0);
2675     }
2676     {
2677       // UnsafeCopyMemory page error: continue after ucm
2678       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2679 
2680       // Copy in multi-byte chunks
2681       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2682     }
2683     __ BIND(L_exit);
2684     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2685     restore_arg_regs_using_thread();
2686     if (is_oop) {
2687       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2688     } else {
2689       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2690     }
2691     __ vzeroupper();
2692     __ xorptr(rax, rax); // return 0
2693     __ leave(); // required for proper stackwalking of RuntimeStub frame
2694     __ ret(0);
2695 
2696     return start;
2697   }
2698 
2699 
2700   // Helper for generating a dynamic type check.
2701   // Smashes no registers.
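       // Jumps to L_success if sub_klass is a subtype of super_klass;
       // otherwise control falls through (past L_miss).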
2702   void generate_type_check(Register sub_klass,
2703                            Register super_check_offset,
2704                            Register super_klass,
2705                            Label& L_success) {
2706     assert_different_registers(sub_klass, super_check_offset, super_klass);
2707 
2708     BLOCK_COMMENT("type_check:");
2709 
2710     Label L_miss;
2711 
2712     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2713                                      super_check_offset);
2714     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2715 
2716     // Fall through on failure!
2717     __ BIND(L_miss);
2718   }
2719 
2720   //
2721   //  Generate checkcasting array copy stub
2722   //
2723   //  Input:
2724   //    c_rarg0   - source array address
2725   //    c_rarg1   - destination array address
2726   //    c_rarg2   - element count, treated as ssize_t, can be zero
2727   //    c_rarg3   - size_t ckoff (super_check_offset)
2728   // not Win64
2729   //    c_rarg4   - oop ckval (super_klass)
2730   // Win64
2731   //    rsp+40    - oop ckval (super_klass)
2732   //
2733   //  Output:
2734   //    rax ==  0  -  success
2735   //    rax == -1^K - failure, where K is partial transfer count
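       //                  (i.e. ~K, the bitwise complement of the number of
       //                   elements already copied)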
2736   //
2737   address generate_checkcast_copy(const char *name, address *entry,
2738                                   bool dest_uninitialized = false) {
2739 
2740     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2741 
2742     // Input registers (after setup_arg_regs)
2743     const Register from        = rdi;   // source array address
2744     const Register to          = rsi;   // destination array address
2745     const Register length      = rdx;   // elements count
2746     const Register ckoff       = rcx;   // super_check_offset
2747     const Register ckval       = r8;    // super_klass
2748 
2749     // Registers used as temps (r13, r14 are save-on-entry)
2750     const Register end_from    = from;  // source array end address
2751     const Register end_to      = r13;   // destination array end address
2752     const Register count       = rdx;   // -(count_remaining)
2753     const Register r14_length  = r14;   // saved copy of length
2754     // End pointers are inclusive, and if length is not zero they point
2755     // to the last unit copied:  end_to[0] := end_from[0]
2756 
2757     const Register rax_oop    = rax;    // actual oop copied
2758     const Register r11_klass  = r11;    // oop._klass
2759 
2760     //---------------------------------------------------------------
2761     // Assembler stub will be used for this call to arraycopy
2762     // if the two arrays are subtypes of Object[] but the
2763     // destination array type is not equal to or a supertype
2764     // of the source type.  Each element must be separately
2765     // checked.
2766 
2767     __ align(CodeEntryAlignment);
2768     StubCodeMark mark(this, "StubRoutines", name);
2769     address start = __ pc();
2770 
2771     __ enter(); // required for proper stackwalking of RuntimeStub frame
2772 
2773 #ifdef ASSERT
2774     // caller guarantees that the arrays really are different
2775     // otherwise, we would have to make conjoint checks
2776     { Label L;
2777       array_overlap_test(L, TIMES_OOP);
2778       __ stop("checkcast_copy within a single array");
2779       __ bind(L);
2780     }
2781 #endif //ASSERT
2782 
2783     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2784                        // ckoff => rcx, ckval => r8
2785                        // r9 and r10 may be used to save non-volatile registers
2786 #ifdef _WIN64
2787     // last argument (#4) is on stack on Win64
2788     __ movptr(ckval, Address(rsp, 6 * wordSize));
2789 #endif
2790 
2791     // Caller of this entry point must set up the argument registers.
2792     if (entry != NULL) {
2793       *entry = __ pc();
2794       BLOCK_COMMENT("Entry:");
2795     }
2796 
2797     // allocate spill slots for r13, r14
2798     enum {
2799       saved_r13_offset,
2800       saved_r14_offset,
2801       saved_r10_offset,
2802       saved_rbp_offset
2803     };
2804     __ subptr(rsp, saved_rbp_offset * wordSize);
2805     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2806     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2807     __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2808 
2809 #ifdef ASSERT
2810       Label L2;
2811       __ get_thread(r14);
2812       __ cmpptr(r15_thread, r14);
2813       __ jcc(Assembler::equal, L2);
2814       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2815       __ bind(L2);
2816 #endif // ASSERT
2817 
2818     // check that int operands are properly extended to size_t
2819     assert_clean_int(length, rax);
2820     assert_clean_int(ckoff, rax);
2821 
2822 #ifdef ASSERT
2823     BLOCK_COMMENT("assert consistent ckoff/ckval");
2824     // The ckoff and ckval must be mutually consistent,
2825     // even though caller generates both.
2826     { Label L;
2827       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2828       __ cmpl(ckoff, Address(ckval, sco_offset));
2829       __ jcc(Assembler::equal, L);
2830       __ stop("super_check_offset inconsistent");
2831       __ bind(L);
2832     }
2833 #endif //ASSERT
2834 
2835     // Loop-invariant addresses.  They are exclusive end pointers.
2836     Address end_from_addr(from, length, TIMES_OOP, 0);
2837     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2838     // Loop-variant addresses.  They assume post-incremented count < 0.
2839     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2840     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2841 
2842     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2843     if (dest_uninitialized) {
2844       decorators |= IS_DEST_UNINITIALIZED;
2845     }
2846 
2847     BasicType type = T_OBJECT;
2848     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2849     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2850 
2851     // Copy from low to high addresses, indexed from the end of each array.
2852     __ lea(end_from, end_from_addr);
2853     __ lea(end_to,   end_to_addr);
2854     __ movptr(r14_length, length);        // save a copy of the length
2855     assert(length == count, "");          // else fix next line:
2856     __ negptr(count);                     // negate and test the length
2857     __ jcc(Assembler::notZero, L_load_element);
2858 
2859     // Empty array:  Nothing to do.
2860     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2861     __ jmp(L_done);
2862 
2863     // ======== begin loop ========
2864     // (Loop is rotated; its entry is L_load_element.)
2865     // Loop control:
2866     //   for (count = -count; count != 0; count++)
2867     // Base pointers src, dst are biased by 8*(count-1), to the last element.
2868     __ align(OptoLoopAlignment);
2869 
2870     __ BIND(L_store_element);
2871     __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
2872     __ increment(count);               // increment the count toward zero
2873     __ jcc(Assembler::zero, L_do_card_marks);
2874 
2875     // ======== loop entry is here ========
2876     __ BIND(L_load_element);
2877     __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2878     __ testptr(rax_oop, rax_oop);
2879     __ jcc(Assembler::zero, L_store_element);
2880 
2881     __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
2882     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2883     // ======== end loop ========
2884 
2885     // It was a real error; we must depend on the caller to finish the job.
2886     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2887     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2888     // and report their number to the caller.
2889     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2890     Label L_post_barrier;
2891     __ addptr(r14_length, count);     // K = (original - remaining) oops
2892     __ movptr(rax, r14_length);       // save the value
2893     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2894     __ jccb(Assembler::notZero, L_post_barrier);
2895     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2896 
2897     // Come here on success only.
2898     __ BIND(L_do_card_marks);
2899     __ xorptr(rax, rax);              // return 0 on success
2900 
2901     __ BIND(L_post_barrier);
2902     bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2903 
2904     // Common exit point (success or failure).
2905     __ BIND(L_done);
2906     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2907     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2908     __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2909     restore_arg_regs();
2910     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2911     __ leave(); // required for proper stackwalking of RuntimeStub frame
2912     __ ret(0);
2913 
2914     return start;
2915   }
2916 
2917   //
2918   //  Generate 'unsafe' array copy stub
2919   //  Though just as safe as the other stubs, it takes an unscaled
2920   //  size_t argument instead of an element count.
2921   //
2922   //  Input:
2923   //    c_rarg0   - source array address
2924   //    c_rarg1   - destination array address
2925   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2926   //
2927   // Examines the alignment of the operands and dispatches
2928   // to a long, int, short, or byte copy loop.
2929   //
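  //  A hedged sketch of the dispatch below: the low bits of 'from', 'to' and
  //  'size' are OR-ed together and the widest copy whose alignment they all
  //  satisfy is chosen:
  //    bits = from | to | size;
  //    if ((bits & 7) == 0)  jump long_copy  (count = size >> 3);
  //    if ((bits & 3) == 0)  jump int_copy   (count = size >> 2);
  //    if ((bits & 1) == 0)  jump short_copy (count = size >> 1);
  //    else                  jump byte_copy  (count = size);
  //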
2930   address generate_unsafe_copy(const char *name,
2931                                address byte_copy_entry, address short_copy_entry,
2932                                address int_copy_entry, address long_copy_entry) {
2933 
2934     Label L_long_aligned, L_int_aligned, L_short_aligned;
2935 
2936     // Input registers (before setup_arg_regs)
2937     const Register from        = c_rarg0;  // source array address
2938     const Register to          = c_rarg1;  // destination array address
2939     const Register size        = c_rarg2;  // byte count (size_t)
2940 
2941     // Register used as a temp
2942     const Register bits        = rax;      // test copy of low bits
2943 
2944     __ align(CodeEntryAlignment);
2945     StubCodeMark mark(this, "StubRoutines", name);
2946     address start = __ pc();
2947 
2948     __ enter(); // required for proper stackwalking of RuntimeStub frame
2949 
2950     // bump this on entry, not on exit:
2951     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2952 
2953     __ mov(bits, from);
2954     __ orptr(bits, to);
2955     __ orptr(bits, size);
2956 
2957     __ testb(bits, BytesPerLong-1);
2958     __ jccb(Assembler::zero, L_long_aligned);
2959 
2960     __ testb(bits, BytesPerInt-1);
2961     __ jccb(Assembler::zero, L_int_aligned);
2962 
2963     __ testb(bits, BytesPerShort-1);
2964     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2965 
2966     __ BIND(L_short_aligned);
2967     __ shrptr(size, LogBytesPerShort); // size => short_count
2968     __ jump(RuntimeAddress(short_copy_entry));
2969 
2970     __ BIND(L_int_aligned);
2971     __ shrptr(size, LogBytesPerInt); // size => int_count
2972     __ jump(RuntimeAddress(int_copy_entry));
2973 
2974     __ BIND(L_long_aligned);
2975     __ shrptr(size, LogBytesPerLong); // size => qword_count
2976     __ jump(RuntimeAddress(long_copy_entry));
2977 
2978     return start;
2979   }
2980 
2981   // Perform range checks on the proposed arraycopy.
2982   // Kills temp, but nothing else.
2983   // Also, clean the sign bits of src_pos and dst_pos.
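  // A hedged Java-level sketch of the two range checks emitted below
  // (unsigned compares, so a negative sum also fails):
  //   if ((uint)(src_pos + length) > (uint)src.length)  goto L_failed;
  //   if ((uint)(dst_pos + length) > (uint)dst.length)  goto L_failed;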
2984   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2985                               Register src_pos, // source position (c_rarg1)
2986                               Register dst,     // destination array oop (c_rarg2)
2987                               Register dst_pos, // destination position (c_rarg3)
2988                               Register length,
2989                               Register temp,
2990                               Label& L_failed) {
2991     BLOCK_COMMENT("arraycopy_range_checks:");
2992 
2993     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2994     __ movl(temp, length);
2995     __ addl(temp, src_pos);             // src_pos + length
2996     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2997     __ jcc(Assembler::above, L_failed);
2998 
2999     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
3000     __ movl(temp, length);
3001     __ addl(temp, dst_pos);             // dst_pos + length
3002     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
3003     __ jcc(Assembler::above, L_failed);
3004 
3005     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
3006     // Move with sign extension can be used since they are positive.
3007     __ movslq(src_pos, src_pos);
3008     __ movslq(dst_pos, dst_pos);
3009 
3010     BLOCK_COMMENT("arraycopy_range_checks done");
3011   }
3012 
3013   //
3014   //  Generate generic array copy stubs
3015   //
3016   //  Input:
3017   //    c_rarg0    -  src oop
3018   //    c_rarg1    -  src_pos (32-bits)
3019   //    c_rarg2    -  dst oop
3020   //    c_rarg3    -  dst_pos (32-bits)
3021   // not Win64
3022   //    c_rarg4    -  element count (32-bits)
3023   // Win64
3024   //    rsp+40     -  element count (32-bits)
3025   //
3026   //  Output:
3027   //    rax ==  0  -  success
3028   //    rax == -1^K - failure, where K is partial transfer count
3029   //
3030   address generate_generic_copy(const char *name,
3031                                 address byte_copy_entry, address short_copy_entry,
3032                                 address int_copy_entry, address oop_copy_entry,
3033                                 address long_copy_entry, address checkcast_copy_entry) {
3034 
3035     Label L_failed, L_failed_0, L_objArray;
3036     Label L_copy_shorts, L_copy_ints, L_copy_longs;
3037 
3038     // Input registers
3039     const Register src        = c_rarg0;  // source array oop
3040     const Register src_pos    = c_rarg1;  // source position
3041     const Register dst        = c_rarg2;  // destination array oop
3042     const Register dst_pos    = c_rarg3;  // destination position
3043 #ifndef _WIN64
3044     const Register length     = c_rarg4;
3045     const Register rklass_tmp = r9;  // load_klass
3046 #else
3047     const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
3048     const Register rklass_tmp = rdi;  // load_klass
3049 #endif
3050 
3051     { int modulus = CodeEntryAlignment;
3052       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
3053       int advance = target - (__ offset() % modulus);
3054       if (advance < 0)  advance += modulus;
3055       if (advance > 0)  __ nop(advance);
3056     }
3057     StubCodeMark mark(this, "StubRoutines", name);
3058 
3059     // Short-hop target to L_failed.  Makes for denser prologue code.
3060     __ BIND(L_failed_0);
3061     __ jmp(L_failed);
3062     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
3063 
3064     __ align(CodeEntryAlignment);
3065     address start = __ pc();
3066 
3067     __ enter(); // required for proper stackwalking of RuntimeStub frame
3068 
3069 #ifdef _WIN64
3070     __ push(rklass_tmp); // rdi is callee-save on Windows
3071 #endif
3072 
3073     // bump this on entry, not on exit:
3074     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
3075 
3076     //-----------------------------------------------------------------------
3077     // Assembler stub will be used for this call to arraycopy
3078     // if the following conditions are met:
3079     //
3080     // (1) src and dst must not be null.
3081     // (2) src_pos must not be negative.
3082     // (3) dst_pos must not be negative.
3083     // (4) length  must not be negative.
3084     // (5) src klass and dst klass should be the same and not NULL.
3085     // (6) src and dst should be arrays.
3086     // (7) src_pos + length must not exceed length of src.
3087     // (8) dst_pos + length must not exceed length of dst.
3088     //
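    // A hedged sketch of the guard sequence that follows:
    //   if (src == NULL || dst == NULL)                  return -1;
    //   if (src_pos < 0 || dst_pos < 0 || length < 0)    return -1;
    //   ... then dispatch on src->klass()->layout_helper().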
3089 
3090     //  if (src == NULL) return -1;
3091     __ testptr(src, src);         // src oop
3092     size_t j1off = __ offset();
3093     __ jccb(Assembler::zero, L_failed_0);
3094 
3095     //  if (src_pos < 0) return -1;
3096     __ testl(src_pos, src_pos); // src_pos (32-bits)
3097     __ jccb(Assembler::negative, L_failed_0);
3098 
3099     //  if (dst == NULL) return -1;
3100     __ testptr(dst, dst);         // dst oop
3101     __ jccb(Assembler::zero, L_failed_0);
3102 
3103     //  if (dst_pos < 0) return -1;
3104     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3105     size_t j4off = __ offset();
3106     __ jccb(Assembler::negative, L_failed_0);
3107 
3108     // The first four tests are very dense code,
3109     // but not quite dense enough to put four
3110     // jumps in a 16-byte instruction fetch buffer.
3111     // That's good, because some branch predictors
3112     // do not like jumps so close together.
3113     // Make sure of this.
3114     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
3115 
3116     // registers used as temp
3117     const Register r11_length    = r11; // elements count to copy
3118     const Register r10_src_klass = r10; // array klass
3119 
3120     //  if (length < 0) return -1;
3121     __ movl(r11_length, length);        // length (elements count, 32-bits value)
3122     __ testl(r11_length, r11_length);
3123     __ jccb(Assembler::negative, L_failed_0);
3124 
3125     __ load_klass(r10_src_klass, src, rklass_tmp);
3126 #ifdef ASSERT
3127     //  assert(src->klass() != NULL);
3128     {
3129       BLOCK_COMMENT("assert klasses not null {");
3130       Label L1, L2;
3131       __ testptr(r10_src_klass, r10_src_klass);
3132       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
3133       __ bind(L1);
3134       __ stop("broken null klass");
3135       __ bind(L2);
3136       __ load_klass(rax, dst, rklass_tmp);
3137       __ cmpq(rax, 0);
3138       __ jcc(Assembler::equal, L1);     // this would be broken also
3139       BLOCK_COMMENT("} assert klasses not null done");
3140     }
3141 #endif
3142 
3143     // Load layout helper (32-bits)
3144     //
3145     //  |array_tag|     | header_size | element_type |     |log2_element_size|
3146     // 32        30    24            16              8     2                 0
3147     //
3148     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3149     //
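    //   A hedged decoding sketch matching the masks/shifts used further down:
    //     tag         = lh >> Klass::_lh_array_tag_shift;
    //     header_size = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //     log2_elsize =  lh & Klass::_lh_log2_element_size_mask;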
3150 
3151     const int lh_offset = in_bytes(Klass::layout_helper_offset());
3152 
3153     // Handle objArrays completely differently...
3154     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3155     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3156     __ jcc(Assembler::equal, L_objArray);
3157 
3158     //  if (src->klass() != dst->klass()) return -1;
3159     __ load_klass(rax, dst, rklass_tmp);
3160     __ cmpq(r10_src_klass, rax);
3161     __ jcc(Assembler::notEqual, L_failed);
3162 
3163     const Register rax_lh = rax;  // layout helper
3164     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3165 
3166     // Check for flat inline type array -> return -1
3167     __ testl(rax_lh, Klass::_lh_array_tag_flat_value_bit_inplace);
3168     __ jcc(Assembler::notZero, L_failed);
3169 
3170     // Check for null-free (non-flat) inline type array -> handle as object array
3171     __ testl(rax_lh, Klass::_lh_null_free_array_bit_inplace);
3172     __ jcc(Assembler::notZero, L_objArray);
3173 
3174     //  if (!src->is_Array()) return -1;
3175     __ cmpl(rax_lh, Klass::_lh_neutral_value);
3176     __ jcc(Assembler::greaterEqual, L_failed);
3177 
3178     // At this point, it is known to be a typeArray (array_tag 0x3).
3179 #ifdef ASSERT
3180     {
3181       BLOCK_COMMENT("assert primitive array {");
3182       Label L;
3183       __ movl(rklass_tmp, rax_lh);
3184       __ sarl(rklass_tmp, Klass::_lh_array_tag_shift);
3185       __ cmpl(rklass_tmp, Klass::_lh_array_tag_type_value);
3186       __ jcc(Assembler::equal, L);
3187       __ stop("must be a primitive array");
3188       __ bind(L);
3189       BLOCK_COMMENT("} assert primitive array done");
3190     }
3191 #endif
3192 
3193     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3194                            r10, L_failed);
3195 
3196     // TypeArrayKlass
3197     //
3198     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3199     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3200     //
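    // For example (hedged illustration), for a jint array log2elemsize == 2, so
    // src_addr = src + header_size_in_bytes + (src_pos << 2).
    //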
3201 
3202     const Register r10_offset = r10;    // array offset
3203     const Register rax_elsize = rax_lh; // element size
3204 
3205     __ movl(r10_offset, rax_lh);
3206     __ shrl(r10_offset, Klass::_lh_header_size_shift);
3207     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
3208     __ addptr(src, r10_offset);           // src array offset
3209     __ addptr(dst, r10_offset);           // dst array offset
3210     BLOCK_COMMENT("choose copy loop based on element size");
3211     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3212 
3213 #ifdef _WIN64
3214     __ pop(rklass_tmp); // Restore callee-save rdi
3215 #endif
3216 
3217     // The following registers must be set before the jump to the corresponding copy stub
3218     const Register from     = c_rarg0;  // source array address
3219     const Register to       = c_rarg1;  // destination array address
3220     const Register count    = c_rarg2;  // elements count
3221 
3222     // 'from', 'to' and 'count' must be set in this order, since their registers
3223     // alias 'src', 'src_pos' and 'dst' respectively.
3224 
3225     __ cmpl(rax_elsize, 0);
3226     __ jccb(Assembler::notEqual, L_copy_shorts);
3227     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3228     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3229     __ movl2ptr(count, r11_length); // length
3230     __ jump(RuntimeAddress(byte_copy_entry));
3231 
3232   __ BIND(L_copy_shorts);
3233     __ cmpl(rax_elsize, LogBytesPerShort);
3234     __ jccb(Assembler::notEqual, L_copy_ints);
3235     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3236     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3237     __ movl2ptr(count, r11_length); // length
3238     __ jump(RuntimeAddress(short_copy_entry));
3239 
3240   __ BIND(L_copy_ints);
3241     __ cmpl(rax_elsize, LogBytesPerInt);
3242     __ jccb(Assembler::notEqual, L_copy_longs);
3243     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3244     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3245     __ movl2ptr(count, r11_length); // length
3246     __ jump(RuntimeAddress(int_copy_entry));
3247 
3248   __ BIND(L_copy_longs);
3249 #ifdef ASSERT
3250     {
3251       BLOCK_COMMENT("assert long copy {");
3252       Label L;
3253       __ cmpl(rax_elsize, LogBytesPerLong);
3254       __ jcc(Assembler::equal, L);
3255       __ stop("must be long copy, but elsize is wrong");
3256       __ bind(L);
3257       BLOCK_COMMENT("} assert long copy done");
3258     }
3259 #endif
3260     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3261     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3262     __ movl2ptr(count, r11_length); // length
3263     __ jump(RuntimeAddress(long_copy_entry));
3264 
3265     // ObjArrayKlass
3266   __ BIND(L_objArray);
3267     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3268 
3269     Label L_plain_copy, L_checkcast_copy;
3270     //  test array classes for subtyping
3271     __ load_klass(rax, dst, rklass_tmp);
3272     __ cmpq(r10_src_klass, rax); // usual case is exact equality
3273     __ jcc(Assembler::notEqual, L_checkcast_copy);
3274 
3275     // Identically typed arrays can be copied without element-wise checks.
3276     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3277                            r10, L_failed);
3278 
3279     __ lea(from, Address(src, src_pos, TIMES_OOP,
3280                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3281     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3282                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3283     __ movl2ptr(count, r11_length); // length
3284   __ BIND(L_plain_copy);
3285 #ifdef _WIN64
3286     __ pop(rklass_tmp); // Restore callee-save rdi
3287 #endif
3288     __ jump(RuntimeAddress(oop_copy_entry));
3289 
3290   __ BIND(L_checkcast_copy);
3291     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3292     {
3293       // Before looking at dst.length, make sure dst is also an objArray.
3294       // This check also fails for flat/null-free arrays which are not supported.
3295       __ cmpl(Address(rax, lh_offset), objArray_lh);
3296       __ jcc(Assembler::notEqual, L_failed);
3297 
3298 #ifdef ASSERT
3299       {
3300         BLOCK_COMMENT("assert not null-free array {");
3301         Label L;
3302         __ movl(rklass_tmp, Address(rax, lh_offset));
3303         __ testl(rklass_tmp, Klass::_lh_null_free_array_bit_inplace);
3304         __ jcc(Assembler::zero, L);
3305         __ stop("unexpected null-free array");
3306         __ bind(L);
3307         BLOCK_COMMENT("} assert not null-free array");
3308       }
3309 #endif
3310 
3311       // It is safe to examine both src.length and dst.length.
3312       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3313                              rax, L_failed);
3314 
3315       const Register r11_dst_klass = r11;
3316       __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3317 
3318       // Marshal the base address arguments now, freeing registers.
3319       __ lea(from, Address(src, src_pos, TIMES_OOP,
3320                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3321       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3322                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3323       __ movl(count, length);           // length (reloaded)
3324       Register sco_temp = c_rarg3;      // this register is free now
3325       assert_different_registers(from, to, count, sco_temp,
3326                                  r11_dst_klass, r10_src_klass);
3327       assert_clean_int(count, sco_temp);
3328 
3329       // Generate the type check.
3330       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3331       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3332       assert_clean_int(sco_temp, rax);
3333       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3334 
3335       // Fetch destination element klass from the ObjArrayKlass header.
3336       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3337       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3338       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3339       assert_clean_int(sco_temp, rax);
3340 
3341 #ifdef _WIN64
3342       __ pop(rklass_tmp); // Restore callee-save rdi
3343 #endif
3344 
3345       // the checkcast_copy loop needs two extra arguments:
3346       assert(c_rarg3 == sco_temp, "#3 already in place");
3347       // Set up arguments for checkcast_copy_entry.
3348       setup_arg_regs(4);
3349       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3350       __ jump(RuntimeAddress(checkcast_copy_entry));
3351     }
3352 
3353   __ BIND(L_failed);
3354 #ifdef _WIN64
3355     __ pop(rklass_tmp); // Restore callee-save rdi
3356 #endif
3357     __ xorptr(rax, rax);
3358     __ notptr(rax); // return -1
3359     __ leave();   // required for proper stackwalking of RuntimeStub frame
3360     __ ret(0);
3361 
3362     return start;
3363   }
3364 
3365   address generate_data_cache_writeback() {
3366     const Register src        = c_rarg0;  // source address
3367 
3368     __ align(CodeEntryAlignment);
3369 
3370     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
3371 
3372     address start = __ pc();
3373     __ enter();
3374     __ cache_wb(Address(src, 0));
3375     __ leave();
3376     __ ret(0);
3377 
3378     return start;
3379   }
3380 
3381   address generate_data_cache_writeback_sync() {
3382     const Register is_pre    = c_rarg0;  // pre or post sync
3383 
3384     __ align(CodeEntryAlignment);
3385 
3386     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
3387 
3388     // pre wbsync is a no-op
3389     // post wbsync translates to an sfence
3390 
3391     Label skip;
3392     address start = __ pc();
3393     __ enter();
3394     __ cmpl(is_pre, 0);
3395     __ jcc(Assembler::notEqual, skip);
3396     __ cache_wbsync(false);
3397     __ bind(skip);
3398     __ leave();
3399     __ ret(0);
3400 
3401     return start;
3402   }
3403 
3404   void generate_arraycopy_stubs() {
3405     address entry;
3406     address entry_jbyte_arraycopy;
3407     address entry_jshort_arraycopy;
3408     address entry_jint_arraycopy;
3409     address entry_oop_arraycopy;
3410     address entry_jlong_arraycopy;
3411     address entry_checkcast_arraycopy;
3412 
3413     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
3414                                                                            "jbyte_disjoint_arraycopy");
3415     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3416                                                                            "jbyte_arraycopy");
3417 
3418     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3419                                                                             "jshort_disjoint_arraycopy");
3420     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3421                                                                             "jshort_arraycopy");
3422 
3423     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
3424                                                                               "jint_disjoint_arraycopy");
3425     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
3426                                                                               &entry_jint_arraycopy, "jint_arraycopy");
3427 
3428     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
3429                                                                                "jlong_disjoint_arraycopy");
3430     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
3431                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
3432 
3433 
3434     if (UseCompressedOops) {
3435       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
3436                                                                               "oop_disjoint_arraycopy");
3437       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
3438                                                                               &entry_oop_arraycopy, "oop_arraycopy");
3439       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
3440                                                                                      "oop_disjoint_arraycopy_uninit",
3441                                                                                      /*dest_uninitialized*/true);
3442       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
3443                                                                                      NULL, "oop_arraycopy_uninit",
3444                                                                                      /*dest_uninitialized*/true);
3445     } else {
3446       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
3447                                                                                "oop_disjoint_arraycopy");
3448       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
3449                                                                                &entry_oop_arraycopy, "oop_arraycopy");
3450       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
3451                                                                                       "oop_disjoint_arraycopy_uninit",
3452                                                                                       /*dest_uninitialized*/true);
3453       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
3454                                                                                       NULL, "oop_arraycopy_uninit",
3455                                                                                       /*dest_uninitialized*/true);
3456     }
3457 
3458     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3459     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3460                                                                         /*dest_uninitialized*/true);
3461 
3462     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3463                                                               entry_jbyte_arraycopy,
3464                                                               entry_jshort_arraycopy,
3465                                                               entry_jint_arraycopy,
3466                                                               entry_jlong_arraycopy);
3467     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3468                                                                entry_jbyte_arraycopy,
3469                                                                entry_jshort_arraycopy,
3470                                                                entry_jint_arraycopy,
3471                                                                entry_oop_arraycopy,
3472                                                                entry_jlong_arraycopy,
3473                                                                entry_checkcast_arraycopy);
3474 
3475     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3476     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3477     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3478     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3479     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3480     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3481 
3482     // We don't generate specialized code for HeapWord-aligned source
3483     // arrays, so just use the code we've already generated
3484     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3485     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3486 
3487     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3488     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3489 
3490     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3491     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3492 
3493     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3494     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3495 
3496     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3497     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3498 
3499     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3500     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3501   }
3502 
3503   // AES intrinsic stubs
3504   enum {AESBlockSize = 16};
3505 
3506   address generate_key_shuffle_mask() {
3507     __ align(16);
3508     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3509     address start = __ pc();
3510     __ emit_data64( 0x0405060700010203, relocInfo::none );
3511     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3512     return start;
3513   }
3514 
3515   address generate_counter_shuffle_mask() {
3516     __ align(16);
3517     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3518     address start = __ pc();
3519     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3520     __ emit_data64(0x0001020304050607, relocInfo::none);
3521     return start;
3522   }
3523 
3524   // Utility routine for loading a 128-bit key word in little-endian format.
3525   // The caller can optionally indicate that the shuffle mask is already in an xmm register.
3526   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3527     __ movdqu(xmmdst, Address(key, offset));
3528     if (xmm_shuf_mask != NULL) {
3529       __ pshufb(xmmdst, xmm_shuf_mask);
3530     } else {
3531       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3532     }
3533   }
3534 
3535   // Utility routine for incrementing the 128-bit counter (the iv in CTR mode)
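  // The counter is handled as a 128-bit little-endian integer split across the two
  // qwords of xmmdst; a hedged sketch of the update below:
  //   lo += inc_delta;  if (carry) hi += 1;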
3536   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3537     __ pextrq(reg, xmmdst, 0x0);
3538     __ addq(reg, inc_delta);
3539     __ pinsrq(xmmdst, reg, 0x0);
3540     __ jcc(Assembler::carryClear, next_block); // jump if no carry
3541     __ pextrq(reg, xmmdst, 0x01); // Carry
3542     __ addq(reg, 0x01);
3543     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3544     __ BIND(next_block);          // next instruction
3545   }
3546 
3547   // Arguments:
3548   //
3549   // Inputs:
3550   //   c_rarg0   - source byte array address
3551   //   c_rarg1   - destination byte array address
3552   //   c_rarg2   - K (key) in little endian int array
3553   //
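  // The expanded key length in ints is 4 * (rounds + 1), i.e. 44/52/60 ints for
  // AES-128/192/256 (10/12/14 rounds); the code below branches on that length to
  // decide how many aesenc rounds to emit (hedged summary of the control flow).
  //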
3554   address generate_aescrypt_encryptBlock() {
3555     assert(UseAES, "need AES instructions and misaligned SSE support");
3556     __ align(CodeEntryAlignment);
3557     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3558     Label L_doLast;
3559     address start = __ pc();
3560 
3561     const Register from        = c_rarg0;  // source array address
3562     const Register to          = c_rarg1;  // destination array address
3563     const Register key         = c_rarg2;  // key array address
3564     const Register keylen      = rax;
3565 
3566     const XMMRegister xmm_result = xmm0;
3567     const XMMRegister xmm_key_shuf_mask = xmm1;
3568     // On win64 xmm6-xmm15 must be preserved so don't use them.
3569     const XMMRegister xmm_temp1  = xmm2;
3570     const XMMRegister xmm_temp2  = xmm3;
3571     const XMMRegister xmm_temp3  = xmm4;
3572     const XMMRegister xmm_temp4  = xmm5;
3573 
3574     __ enter(); // required for proper stackwalking of RuntimeStub frame
3575 
3576     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3577     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3578 
3579     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3580     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3581 
3582     // For encryption, the java expanded key ordering is just what we need
3583     // we don't know if the key is aligned, hence not using load-execute form
3584 
3585     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3586     __ pxor(xmm_result, xmm_temp1);
3587 
3588     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3589     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3590     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3591     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3592 
3593     __ aesenc(xmm_result, xmm_temp1);
3594     __ aesenc(xmm_result, xmm_temp2);
3595     __ aesenc(xmm_result, xmm_temp3);
3596     __ aesenc(xmm_result, xmm_temp4);
3597 
3598     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3599     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3600     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3601     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3602 
3603     __ aesenc(xmm_result, xmm_temp1);
3604     __ aesenc(xmm_result, xmm_temp2);
3605     __ aesenc(xmm_result, xmm_temp3);
3606     __ aesenc(xmm_result, xmm_temp4);
3607 
3608     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3609     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3610 
3611     __ cmpl(keylen, 44);
3612     __ jccb(Assembler::equal, L_doLast);
3613 
3614     __ aesenc(xmm_result, xmm_temp1);
3615     __ aesenc(xmm_result, xmm_temp2);
3616 
3617     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3618     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3619 
3620     __ cmpl(keylen, 52);
3621     __ jccb(Assembler::equal, L_doLast);
3622 
3623     __ aesenc(xmm_result, xmm_temp1);
3624     __ aesenc(xmm_result, xmm_temp2);
3625 
3626     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3627     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3628 
3629     __ BIND(L_doLast);
3630     __ aesenc(xmm_result, xmm_temp1);
3631     __ aesenclast(xmm_result, xmm_temp2);
3632     __ movdqu(Address(to, 0), xmm_result);        // store the result
3633     __ xorptr(rax, rax); // return 0
3634     __ leave(); // required for proper stackwalking of RuntimeStub frame
3635     __ ret(0);
3636 
3637     return start;
3638   }
3639 
3640 
3641   // Arguments:
3642   //
3643   // Inputs:
3644   //   c_rarg0   - source byte array address
3645   //   c_rarg1   - destination byte array address
3646   //   c_rarg2   - K (key) in little endian int array
3647   //
3648   address generate_aescrypt_decryptBlock() {
3649     assert(UseAES, "need AES instructions and misaligned SSE support");
3650     __ align(CodeEntryAlignment);
3651     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3652     Label L_doLast;
3653     address start = __ pc();
3654 
3655     const Register from        = c_rarg0;  // source array address
3656     const Register to          = c_rarg1;  // destination array address
3657     const Register key         = c_rarg2;  // key array address
3658     const Register keylen      = rax;
3659 
3660     const XMMRegister xmm_result = xmm0;
3661     const XMMRegister xmm_key_shuf_mask = xmm1;
3662     // On win64 xmm6-xmm15 must be preserved so don't use them.
3663     const XMMRegister xmm_temp1  = xmm2;
3664     const XMMRegister xmm_temp2  = xmm3;
3665     const XMMRegister xmm_temp3  = xmm4;
3666     const XMMRegister xmm_temp4  = xmm5;
3667 
3668     __ enter(); // required for proper stackwalking of RuntimeStub frame
3669 
3670     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3671     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3672 
3673     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3674     __ movdqu(xmm_result, Address(from, 0));
3675 
3676     // for decryption, the java expanded key ordering is rotated one position from what we want
3677     // so we start from 0x10 here and hit 0x00 last
3678     // we don't know if the key is aligned, hence not using load-execute form
3679     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3680     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3681     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3682     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3683 
3684     __ pxor  (xmm_result, xmm_temp1);
3685     __ aesdec(xmm_result, xmm_temp2);
3686     __ aesdec(xmm_result, xmm_temp3);
3687     __ aesdec(xmm_result, xmm_temp4);
3688 
3689     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3690     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3691     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3692     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3693 
3694     __ aesdec(xmm_result, xmm_temp1);
3695     __ aesdec(xmm_result, xmm_temp2);
3696     __ aesdec(xmm_result, xmm_temp3);
3697     __ aesdec(xmm_result, xmm_temp4);
3698 
3699     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3700     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3701     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3702 
3703     __ cmpl(keylen, 44);
3704     __ jccb(Assembler::equal, L_doLast);
3705 
3706     __ aesdec(xmm_result, xmm_temp1);
3707     __ aesdec(xmm_result, xmm_temp2);
3708 
3709     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3710     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3711 
3712     __ cmpl(keylen, 52);
3713     __ jccb(Assembler::equal, L_doLast);
3714 
3715     __ aesdec(xmm_result, xmm_temp1);
3716     __ aesdec(xmm_result, xmm_temp2);
3717 
3718     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3719     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3720 
3721     __ BIND(L_doLast);
3722     __ aesdec(xmm_result, xmm_temp1);
3723     __ aesdec(xmm_result, xmm_temp2);
3724 
3725     // for decryption the aesdeclast operation is always on key+0x00
3726     __ aesdeclast(xmm_result, xmm_temp3);
3727     __ movdqu(Address(to, 0), xmm_result);  // store the result
3728     __ xorptr(rax, rax); // return 0
3729     __ leave(); // required for proper stackwalking of RuntimeStub frame
3730     __ ret(0);
3731 
3732     return start;
3733   }
3734 
3735 
3736   // Arguments:
3737   //
3738   // Inputs:
3739   //   c_rarg0   - source byte array address
3740   //   c_rarg1   - destination byte array address
3741   //   c_rarg2   - K (key) in little endian int array
3742   //   c_rarg3   - r vector byte array address
3743   //   c_rarg4   - input length
3744   //
3745   // Output:
3746   //   rax       - input length
3747   //
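  // CBC encryption computes, per 16-byte block (hedged summary of the loops below):
  //   C[0] = E_K(P[0] ^ IV),   C[i] = E_K(P[i] ^ C[i-1])
  // so blocks are inherently sequential; the running C[i-1] stays in xmm_result.
  //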
3748   address generate_cipherBlockChaining_encryptAESCrypt() {
3749     assert(UseAES, "need AES instructions and misaligned SSE support");
3750     __ align(CodeEntryAlignment);
3751     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3752     address start = __ pc();
3753 
3754     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3755     const Register from        = c_rarg0;  // source array address
3756     const Register to          = c_rarg1;  // destination array address
3757     const Register key         = c_rarg2;  // key array address
3758     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3759                                            // and left with the results of the last encryption block
3760 #ifndef _WIN64
3761     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3762 #else
3763     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3764     const Register len_reg     = r11;      // pick the volatile windows register
3765 #endif
3766     const Register pos         = rax;
3767 
3768     // xmm register assignments for the loops below
3769     const XMMRegister xmm_result = xmm0;
3770     const XMMRegister xmm_temp   = xmm1;
3771     // keys 0-10 preloaded into xmm2-xmm12
3772     const int XMM_REG_NUM_KEY_FIRST = 2;
3773     const int XMM_REG_NUM_KEY_LAST  = 15;
3774     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3775     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3776     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3777     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3778     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3779 
3780     __ enter(); // required for proper stackwalking of RuntimeStub frame
3781 
3782 #ifdef _WIN64
3783     // on win64, fill len_reg from stack position
3784     __ movl(len_reg, len_mem);
3785 #else
3786     __ push(len_reg); // Save
3787 #endif
3788 
3789     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3790     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3791     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3792     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3793       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3794       offset += 0x10;
3795     }
3796     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3797 
3798     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3799     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3800     __ cmpl(rax, 44);
3801     __ jcc(Assembler::notEqual, L_key_192_256);
3802 
3803     // 128 bit code follows here
3804     __ movptr(pos, 0);
3805     __ align(OptoLoopAlignment);
3806 
3807     __ BIND(L_loopTop_128);
3808     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3809     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3810     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3811     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3812       __ aesenc(xmm_result, as_XMMRegister(rnum));
3813     }
3814     __ aesenclast(xmm_result, xmm_key10);
3815     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3816     // no need to store r to memory until we exit
3817     __ addptr(pos, AESBlockSize);
3818     __ subptr(len_reg, AESBlockSize);
3819     __ jcc(Assembler::notEqual, L_loopTop_128);
3820 
3821     __ BIND(L_exit);
3822     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3823 
3824 #ifdef _WIN64
3825     __ movl(rax, len_mem);
3826 #else
3827     __ pop(rax); // return length
3828 #endif
3829     __ leave(); // required for proper stackwalking of RuntimeStub frame
3830     __ ret(0);
3831 
3832     __ BIND(L_key_192_256);
3833     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3834     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3835     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3836     __ cmpl(rax, 52);
3837     __ jcc(Assembler::notEqual, L_key_256);
3838 
3839     // 192-bit code follows here (could be changed to use more xmm registers)
3840     __ movptr(pos, 0);
3841     __ align(OptoLoopAlignment);
3842 
3843     __ BIND(L_loopTop_192);
3844     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3845     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3846     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3847     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3848       __ aesenc(xmm_result, as_XMMRegister(rnum));
3849     }
3850     __ aesenclast(xmm_result, xmm_key12);
3851     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3852     // no need to store r to memory until we exit
3853     __ addptr(pos, AESBlockSize);
3854     __ subptr(len_reg, AESBlockSize);
3855     __ jcc(Assembler::notEqual, L_loopTop_192);
3856     __ jmp(L_exit);
3857 
3858     __ BIND(L_key_256);
3859     // 256-bit code follows here (could be changed to use more xmm registers)
3860     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3861     __ movptr(pos, 0);
3862     __ align(OptoLoopAlignment);
3863 
3864     __ BIND(L_loopTop_256);
3865     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3866     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3867     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3868     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3869       __ aesenc(xmm_result, as_XMMRegister(rnum));
3870     }
3871     load_key(xmm_temp, key, 0xe0);
3872     __ aesenclast(xmm_result, xmm_temp);
3873     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3874     // no need to store r to memory until we exit
3875     __ addptr(pos, AESBlockSize);
3876     __ subptr(len_reg, AESBlockSize);
3877     __ jcc(Assembler::notEqual, L_loopTop_256);
3878     __ jmp(L_exit);
3879 
3880     return start;
3881   }
3882 
3883   // Safefetch stubs.
3884   void generate_safefetch(const char* name, int size, address* entry,
3885                           address* fault_pc, address* continuation_pc) {
3886     // safefetch signatures:
3887     //   int      SafeFetch32(int*      adr, int      errValue);
3888     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3889     //
3890     // arguments:
3891     //   c_rarg0 = adr
3892     //   c_rarg1 = errValue
3893     //
3894     // result:
3895     //   rax      = *adr or errValue
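    // A hedged usage sketch: int v = SafeFetch32(p, -1) yields *p, or -1 if the
    // load faults.  The faulting load targets c_rarg1, so when the signal handler
    // resumes at continuation_pc, errValue is still in c_rarg1 and is returned in rax.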
3896 
3897     StubCodeMark mark(this, "StubRoutines", name);
3898 
3899     // Entry point, pc or function descriptor.
3900     *entry = __ pc();
3901 
3902     // Load *adr into c_rarg1, may fault.
3903     *fault_pc = __ pc();
3904     switch (size) {
3905       case 4:
3906         // int32_t
3907         __ movl(c_rarg1, Address(c_rarg0, 0));
3908         break;
3909       case 8:
3910         // int64_t
3911         __ movq(c_rarg1, Address(c_rarg0, 0));
3912         break;
3913       default:
3914         ShouldNotReachHere();
3915     }
3916 
3917     // return errValue or *adr
3918     *continuation_pc = __ pc();
3919     __ movq(rax, c_rarg1);
3920     __ ret(0);
3921   }
3922 
3923   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3924   // to hide instruction latency
3925   //
3926   // Arguments:
3927   //
3928   // Inputs:
3929   //   c_rarg0   - source byte array address
3930   //   c_rarg1   - destination byte array address
3931   //   c_rarg2   - K (key) in little endian int array
3932   //   c_rarg3   - r vector byte array address
3933   //   c_rarg4   - input length
3934   //
3935   // Output:
3936   //   rax       - input length
3937   //
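  // CBC decryption is P[i] = D_K(C[i]) ^ C[i-1]; since every C[i] is already in the
  // input, four blocks can be decrypted in flight per iteration (hedged summary),
  // each then xor-ed with its predecessor ciphertext before being stored.
  //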
3938   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3939     assert(UseAES, "need AES instructions and misaligned SSE support");
3940     __ align(CodeEntryAlignment);
3941     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3942     address start = __ pc();
3943 
3944     const Register from        = c_rarg0;  // source array address
3945     const Register to          = c_rarg1;  // destination array address
3946     const Register key         = c_rarg2;  // key array address
3947     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3948                                            // and left with the results of the last encryption block
3949 #ifndef _WIN64
3950     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3951 #else
3952     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3953     const Register len_reg     = r11;      // pick the volatile windows register
3954 #endif
3955     const Register pos         = rax;
3956 
3957     const int PARALLEL_FACTOR = 4;
3958     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3959 
3960     Label L_exit;
3961     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3962     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3963     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3964     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3965     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3966 
3967     // keys 0-10 preloaded into xmm5-xmm15
3968     const int XMM_REG_NUM_KEY_FIRST = 5;
3969     const int XMM_REG_NUM_KEY_LAST  = 15;
3970     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3971     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3972 
3973     __ enter(); // required for proper stackwalking of RuntimeStub frame
3974 
3975 #ifdef _WIN64
3976     // on win64, fill len_reg from stack position
3977     __ movl(len_reg, len_mem);
3978 #else
3979     __ push(len_reg); // Save
3980 #endif
3981     __ push(rbx);
3982     // the java expanded key ordering is rotated one position from what we want
3983     // so we start from 0x10 here and hit 0x00 last
3984     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3985     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load xmm regs 5 through 14 with round keys 0x10 - 0xa0; key 0x00 goes into xmm15 below
3987     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3988       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3989       offset += 0x10;
3990     }
3991     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3992 
3993     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3994 
3995     // registers holding the four results in the parallelized loop
3996     const XMMRegister xmm_result0 = xmm0;
3997     const XMMRegister xmm_result1 = xmm2;
3998     const XMMRegister xmm_result2 = xmm3;
3999     const XMMRegister xmm_result3 = xmm4;
4000 
4001     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
4002 
4003     __ xorptr(pos, pos);
4004 
    // now split into different paths depending on the key length (len in ints of the AESCrypt.KLE array:
    // 44 = 128-bit, 52 = 192-bit, 60 = 256-bit; the 128-bit case falls through below)
4006     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4007     __ cmpl(rbx, 52);
4008     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
4009     __ cmpl(rbx, 60);
4010     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
4011 
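// DoFour applies a two-operand instruction (pxor/aesdec/aesdeclast below) to each of
// the four in-flight result registers, i.e. one AES round across four blocks at a time.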
4012 #define DoFour(opc, src_reg)           \
4013   __ opc(xmm_result0, src_reg);         \
4014   __ opc(xmm_result1, src_reg);         \
4015   __ opc(xmm_result2, src_reg);         \
4016   __ opc(xmm_result3, src_reg);         \
4017 
4018     for (int k = 0; k < 3; ++k) {
4019       __ BIND(L_multiBlock_loopTopHead[k]);
4020       if (k != 0) {
4021         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
4022         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
4023       }
4024       if (k == 1) {
4025         __ subptr(rsp, 6 * wordSize);
4026         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
4027         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
4028         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
4029         load_key(xmm1, key, 0xc0);  // 0xc0;
4030         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
4031       } else if (k == 2) {
4032         __ subptr(rsp, 10 * wordSize);
4033         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
        load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
4035         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
4036         load_key(xmm1, key, 0xe0);  // 0xe0;
4037         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
4038         load_key(xmm15, key, 0xb0); // 0xb0;
4039         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
4040         load_key(xmm1, key, 0xc0);  // 0xc0;
4041         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
4042       }
4043       __ align(OptoLoopAlignment);
4044       __ BIND(L_multiBlock_loopTop[k]);
4045       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
4046       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
4047 
4048       if  (k != 0) {
4049         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
4050         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
4051       }
4052 
4053       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
4054       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4055       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4056       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4057 
4058       DoFour(pxor, xmm_key_first);
4059       if (k == 0) {
4060         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
4061           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4062         }
4063         DoFour(aesdeclast, xmm_key_last);
4064       } else if (k == 1) {
4065         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
4066           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4067         }
4068         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
4069         DoFour(aesdec, xmm1);  // key : 0xc0
4070         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
4071         DoFour(aesdeclast, xmm_key_last);
4072       } else if (k == 2) {
4073         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
4074           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4075         }
4076         DoFour(aesdec, xmm1);  // key : 0xc0
4077         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
4078         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
4079         DoFour(aesdec, xmm15);  // key : 0xd0
4080         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
4081         DoFour(aesdec, xmm1);  // key : 0xe0
4082         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
4083         DoFour(aesdeclast, xmm_key_last);
4084       }
4085 
4086       // for each result, xor with the r vector of previous cipher block
4087       __ pxor(xmm_result0, xmm_prev_block_cipher);
4088       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4089       __ pxor(xmm_result1, xmm_prev_block_cipher);
4090       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4091       __ pxor(xmm_result2, xmm_prev_block_cipher);
4092       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4093       __ pxor(xmm_result3, xmm_prev_block_cipher);
4094       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
4095       if (k != 0) {
4096         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
4097       }
4098 
4099       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
4100       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4101       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4102       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4103 
4104       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
4105       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
4106       __ jmp(L_multiBlock_loopTop[k]);
4107 
      // xmm register assignments for the single-block (non-parallelized) loops below
4110       const XMMRegister xmm_result = xmm0;
4111       const XMMRegister xmm_prev_block_cipher_save = xmm2;
4112       const XMMRegister xmm_key11 = xmm3;
4113       const XMMRegister xmm_key12 = xmm4;
4114       const XMMRegister key_tmp = xmm4;
4115 
4116       __ BIND(L_singleBlock_loopTopHead[k]);
4117       if (k == 1) {
4118         __ addptr(rsp, 6 * wordSize);
4119       } else if (k == 2) {
4120         __ addptr(rsp, 10 * wordSize);
4121       }
4122       __ cmpptr(len_reg, 0); // any blocks left??
4123       __ jcc(Assembler::equal, L_exit);
4124       __ BIND(L_singleBlock_loopTopHead2[k]);
4125       if (k == 1) {
        load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
        load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
4128       }
4129       if (k == 2) {
        load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
4131       }
4132       __ align(OptoLoopAlignment);
4133       __ BIND(L_singleBlock_loopTop[k]);
4134       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
4135       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
4136       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
4137       for (int rnum = 1; rnum <= 9 ; rnum++) {
4138           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4139       }
4140       if (k == 1) {
4141         __ aesdec(xmm_result, xmm_key11);
4142         __ aesdec(xmm_result, xmm_key12);
4143       }
4144       if (k == 2) {
4145         __ aesdec(xmm_result, xmm_key11);
4146         load_key(key_tmp, key, 0xc0);
4147         __ aesdec(xmm_result, key_tmp);
4148         load_key(key_tmp, key, 0xd0);
4149         __ aesdec(xmm_result, key_tmp);
4150         load_key(key_tmp, key, 0xe0);
4151         __ aesdec(xmm_result, key_tmp);
4152       }
4153 
4154       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
4155       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
4156       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
4157       // no need to store r to memory until we exit
4158       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
4159       __ addptr(pos, AESBlockSize);
4160       __ subptr(len_reg, AESBlockSize);
4161       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
4162       if (k != 2) {
4163         __ jmp(L_exit);
4164       }
4165     } //for 128/192/256
4166 
4167     __ BIND(L_exit);
4168     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
4169     __ pop(rbx);
4170 #ifdef _WIN64
4171     __ movl(rax, len_mem);
4172 #else
4173     __ pop(rax); // return length
4174 #endif
4175     __ leave(); // required for proper stackwalking of RuntimeStub frame
4176     __ ret(0);
4177     return start;
4178 }
4179 
4180   address generate_electronicCodeBook_encryptAESCrypt() {
4181     __ align(CodeEntryAlignment);
4182     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_encryptAESCrypt");
4183     address start = __ pc();
4184     const Register from = c_rarg0;  // source array address
4185     const Register to = c_rarg1;  // destination array address
4186     const Register key = c_rarg2;  // key array address
4187     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4188     __ enter(); // required for proper stackwalking of RuntimeStub frame
4189     __ aesecb_encrypt(from, to, key, len);
4190     __ vzeroupper();
4191     __ leave(); // required for proper stackwalking of RuntimeStub frame
4192     __ ret(0);
4193     return start;
4194  }
4195 
4196   address generate_electronicCodeBook_decryptAESCrypt() {
4197     __ align(CodeEntryAlignment);
4198     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_decryptAESCrypt");
4199     address start = __ pc();
4200     const Register from = c_rarg0;  // source array address
4201     const Register to = c_rarg1;  // destination array address
4202     const Register key = c_rarg2;  // key array address
4203     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4204     __ enter(); // required for proper stackwalking of RuntimeStub frame
4205     __ aesecb_decrypt(from, to, key, len);
4206     __ vzeroupper();
4207     __ leave(); // required for proper stackwalking of RuntimeStub frame
4208     __ ret(0);
4209     return start;
4210   }
4211 
  // ofs and limit are used for multi-block byte arrays.
4213   // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
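  //
  // When multi_block is true, this assumes the usual implCompressMultiBlock contract
  // (a sketch, not the exact Java code): consecutive 64-byte blocks of b are compressed
  // while ofs <= limit, and the final ofs is returned, roughly
  //
  //   while (ofs <= limit) { implCompress(b, ofs); ofs += 64; }
  //   return ofs;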
4214   address generate_md5_implCompress(bool multi_block, const char *name) {
4215     __ align(CodeEntryAlignment);
4216     StubCodeMark mark(this, "StubRoutines", name);
4217     address start = __ pc();
4218 
4219     const Register buf_param = r15;
4220     const Address state_param(rsp, 0 * wordSize);
4221     const Address ofs_param  (rsp, 1 * wordSize    );
4222     const Address limit_param(rsp, 1 * wordSize + 4);
4223 
4224     __ enter();
4225     __ push(rbx);
4226     __ push(rdi);
4227     __ push(rsi);
4228     __ push(r15);
4229     __ subptr(rsp, 2 * wordSize);
4230 
4231     __ movptr(buf_param, c_rarg0);
4232     __ movptr(state_param, c_rarg1);
4233     if (multi_block) {
4234       __ movl(ofs_param, c_rarg2);
4235       __ movl(limit_param, c_rarg3);
4236     }
4237     __ fast_md5(buf_param, state_param, ofs_param, limit_param, multi_block);
4238 
4239     __ addptr(rsp, 2 * wordSize);
4240     __ pop(r15);
4241     __ pop(rsi);
4242     __ pop(rdi);
4243     __ pop(rbx);
4244     __ leave();
4245     __ ret(0);
4246     return start;
4247   }
4248 
4249   address generate_upper_word_mask() {
4250     __ align64();
4251     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
4252     address start = __ pc();
4253     __ emit_data64(0x0000000000000000, relocInfo::none);
4254     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
4255     return start;
4256   }
4257 
4258   address generate_shuffle_byte_flip_mask() {
4259     __ align64();
4260     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
4261     address start = __ pc();
4262     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4263     __ emit_data64(0x0001020304050607, relocInfo::none);
4264     return start;
4265   }
4266 
  // ofs and limit are used for multi-block byte arrays.
4268   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4269   address generate_sha1_implCompress(bool multi_block, const char *name) {
4270     __ align(CodeEntryAlignment);
4271     StubCodeMark mark(this, "StubRoutines", name);
4272     address start = __ pc();
4273 
4274     Register buf = c_rarg0;
4275     Register state = c_rarg1;
4276     Register ofs = c_rarg2;
4277     Register limit = c_rarg3;
4278 
4279     const XMMRegister abcd = xmm0;
4280     const XMMRegister e0 = xmm1;
4281     const XMMRegister e1 = xmm2;
4282     const XMMRegister msg0 = xmm3;
4283 
4284     const XMMRegister msg1 = xmm4;
4285     const XMMRegister msg2 = xmm5;
4286     const XMMRegister msg3 = xmm6;
4287     const XMMRegister shuf_mask = xmm7;
4288 
4289     __ enter();
4290 
4291     __ subptr(rsp, 4 * wordSize);
4292 
4293     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
4294       buf, state, ofs, limit, rsp, multi_block);
4295 
4296     __ addptr(rsp, 4 * wordSize);
4297 
4298     __ leave();
4299     __ ret(0);
4300     return start;
4301   }
4302 
4303   address generate_pshuffle_byte_flip_mask() {
4304     __ align64();
4305     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
4306     address start = __ pc();
4307     __ emit_data64(0x0405060700010203, relocInfo::none);
4308     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4309 
4310     if (VM_Version::supports_avx2()) {
4311       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
4312       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4313       // _SHUF_00BA
4314       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4315       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4316       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4317       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4318       // _SHUF_DC00
4319       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4320       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4321       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4322       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4323     }
4324 
4325     return start;
4326   }
4327 
  // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
4329   address generate_pshuffle_byte_flip_mask_sha512() {
4330     __ align32();
4331     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
4332     address start = __ pc();
4333     if (VM_Version::supports_avx2()) {
4334       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
4335       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4336       __ emit_data64(0x1011121314151617, relocInfo::none);
4337       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
4338       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
4339       __ emit_data64(0x0000000000000000, relocInfo::none);
4340       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4341       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4342     }
4343 
4344     return start;
4345   }
4346 
  // ofs and limit are used for multi-block byte arrays (same contract as the MD5 stub above).
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4349   address generate_sha256_implCompress(bool multi_block, const char *name) {
4350     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
4351     __ align(CodeEntryAlignment);
4352     StubCodeMark mark(this, "StubRoutines", name);
4353     address start = __ pc();
4354 
4355     Register buf = c_rarg0;
4356     Register state = c_rarg1;
4357     Register ofs = c_rarg2;
4358     Register limit = c_rarg3;
4359 
4360     const XMMRegister msg = xmm0;
4361     const XMMRegister state0 = xmm1;
4362     const XMMRegister state1 = xmm2;
4363     const XMMRegister msgtmp0 = xmm3;
4364 
4365     const XMMRegister msgtmp1 = xmm4;
4366     const XMMRegister msgtmp2 = xmm5;
4367     const XMMRegister msgtmp3 = xmm6;
4368     const XMMRegister msgtmp4 = xmm7;
4369 
4370     const XMMRegister shuf_mask = xmm8;
4371 
4372     __ enter();
4373 
4374     __ subptr(rsp, 4 * wordSize);
4375 
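    // Prefer the SHA extensions when available, otherwise fall back to the AVX2 path
    // (the assert above guarantees at least one of the two is supported).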
4376     if (VM_Version::supports_sha()) {
4377       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4378         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4379     } else if (VM_Version::supports_avx2()) {
4380       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4381         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4382     }
4383     __ addptr(rsp, 4 * wordSize);
4384     __ vzeroupper();
4385     __ leave();
4386     __ ret(0);
4387     return start;
4388   }
4389 
4390   address generate_sha512_implCompress(bool multi_block, const char *name) {
4391     assert(VM_Version::supports_avx2(), "");
4392     assert(VM_Version::supports_bmi2(), "");
4393     __ align(CodeEntryAlignment);
4394     StubCodeMark mark(this, "StubRoutines", name);
4395     address start = __ pc();
4396 
4397     Register buf = c_rarg0;
4398     Register state = c_rarg1;
4399     Register ofs = c_rarg2;
4400     Register limit = c_rarg3;
4401 
4402     const XMMRegister msg = xmm0;
4403     const XMMRegister state0 = xmm1;
4404     const XMMRegister state1 = xmm2;
4405     const XMMRegister msgtmp0 = xmm3;
4406     const XMMRegister msgtmp1 = xmm4;
4407     const XMMRegister msgtmp2 = xmm5;
4408     const XMMRegister msgtmp3 = xmm6;
4409     const XMMRegister msgtmp4 = xmm7;
4410 
4411     const XMMRegister shuf_mask = xmm8;
4412 
4413     __ enter();
4414 
4415     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4416     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4417 
4418     __ vzeroupper();
4419     __ leave();
4420     __ ret(0);
4421     return start;
4422   }
4423 
4424   address ghash_polynomial512_addr() {
4425     __ align(CodeEntryAlignment);
4426     StubCodeMark mark(this, "StubRoutines", "_ghash_poly512_addr");
4427     address start = __ pc();
4428     __ emit_data64(0x00000001C2000000, relocInfo::none); // POLY for reduction
4429     __ emit_data64(0xC200000000000000, relocInfo::none);
4430     __ emit_data64(0x00000001C2000000, relocInfo::none);
4431     __ emit_data64(0xC200000000000000, relocInfo::none);
4432     __ emit_data64(0x00000001C2000000, relocInfo::none);
4433     __ emit_data64(0xC200000000000000, relocInfo::none);
4434     __ emit_data64(0x00000001C2000000, relocInfo::none);
4435     __ emit_data64(0xC200000000000000, relocInfo::none);
4436     __ emit_data64(0x0000000000000001, relocInfo::none); // POLY
4437     __ emit_data64(0xC200000000000000, relocInfo::none);
4438     __ emit_data64(0x0000000000000001, relocInfo::none); // TWOONE
4439     __ emit_data64(0x0000000100000000, relocInfo::none);
4440     return start;
4441 }
4442 
4443   // Vector AES Galois Counter Mode implementation. Parameters:
4444   // Windows regs            |  Linux regs
4445   // in = c_rarg0 (rcx)      |  c_rarg0 (rsi)
4446   // len = c_rarg1 (rdx)     |  c_rarg1 (rdi)
4447   // ct = c_rarg2 (r8)       |  c_rarg2 (rdx)
4448   // out = c_rarg3 (r9)      |  c_rarg3 (rcx)
4449   // key = r10               |  c_rarg4 (r8)
4450   // state = r13             |  c_rarg5 (r9)
4451   // subkeyHtbl = r14        |  r11
4452   // counter = rsi           |  r12
4453   // return - number of processed bytes
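  //
  // Roughly (a sketch of the mode, not the generated code), for each 16-byte block:
  //   counter = inc32(counter);  out[i] = in[i] ^ AES_encrypt(counter, key);
  // while the GHASH state is folded over ct using the precomputed powers of H in subkeyHtbl.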
4454   address generate_galoisCounterMode_AESCrypt() {
4455     __ align(CodeEntryAlignment);
4456     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
4457     address start = __ pc();
4458     const Register in = c_rarg0;
4459     const Register len = c_rarg1;
4460     const Register ct = c_rarg2;
4461     const Register out = c_rarg3;
    // the counter byte array (loaded from the stack below) is updated with the incremented counter at the end
4463 #ifndef _WIN64
4464     const Register key = c_rarg4;
4465     const Register state = c_rarg5;
4466     const Address subkeyH_mem(rbp, 2 * wordSize);
4467     const Register subkeyHtbl = r11;
4468     const Register avx512_subkeyHtbl = r13;
4469     const Address counter_mem(rbp, 3 * wordSize);
4470     const Register counter = r12;
4471 #else
4472     const Address key_mem(rbp, 6 * wordSize);
4473     const Register key = r10;
4474     const Address state_mem(rbp, 7 * wordSize);
4475     const Register state = r13;
4476     const Address subkeyH_mem(rbp, 8 * wordSize);
4477     const Register subkeyHtbl = r14;
4478     const Register avx512_subkeyHtbl = r12;
4479     const Address counter_mem(rbp, 9 * wordSize);
4480     const Register counter = rsi;
4481 #endif
4482     __ enter();
    // Save state before entering routine
4484     __ push(r12);
4485     __ push(r13);
4486     __ push(r14);
4487     __ push(r15);
4488     __ push(rbx);
4489 #ifdef _WIN64
    // on win64, save rsi and load key and state from their stack positions
4491     __ push(rsi);
4492     __ movptr(key, key_mem);
4493     __ movptr(state, state_mem);
4494 #endif
4495     __ movptr(subkeyHtbl, subkeyH_mem);
4496     __ movptr(counter, counter_mem);
    // Save rbp and rsp
    __ push(rbp);
    __ movq(rbp, rsp);
    // Align stack
    __ andq(rsp, -64);
    __ subptr(rsp, 96 * longSize); // Create space on the stack for htbl entries
    __ movptr(avx512_subkeyHtbl, rsp);
4504 
4505     __ aesgcm_encrypt(in, len, ct, out, key, state, subkeyHtbl, avx512_subkeyHtbl, counter);
4506     __ vzeroupper();
4507 
4508     __ movq(rsp, rbp);
4509     __ pop(rbp);
4510 
4511     // Restore state before leaving routine
4512 #ifdef _WIN64
4513     __ pop(rsi);
4514 #endif
4515     __ pop(rbx);
4516     __ pop(r15);
4517     __ pop(r14);
4518     __ pop(r13);
4519     __ pop(r12);
4520 
4521     __ leave(); // required for proper stackwalking of RuntimeStub frame
4522     __ ret(0);
    return start;
4524   }
4525 
  // This mask is used for incrementing the counter value (linc0, linc4, etc.)
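  // Layout of the table emitted below (byte offsets from the returned address):
  //   0: lane byte-swap mask, 64: per-lane increments {0,1,2,3} (linc0 at +64),
  //   128: linc4, 192: linc8, 256: linc32, 320: linc16.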
4527   address counter_mask_addr() {
4528     __ align64();
4529     StubCodeMark mark(this, "StubRoutines", "counter_mask_addr");
4530     address start = __ pc();
4531     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);//lbswapmask
4532     __ emit_data64(0x0001020304050607, relocInfo::none);
4533     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4534     __ emit_data64(0x0001020304050607, relocInfo::none);
4535     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4536     __ emit_data64(0x0001020304050607, relocInfo::none);
4537     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4538     __ emit_data64(0x0001020304050607, relocInfo::none);
4539     __ emit_data64(0x0000000000000000, relocInfo::none);//linc0 = counter_mask_addr+64
4540     __ emit_data64(0x0000000000000000, relocInfo::none);
4541     __ emit_data64(0x0000000000000001, relocInfo::none);//counter_mask_addr() + 80
4542     __ emit_data64(0x0000000000000000, relocInfo::none);
4543     __ emit_data64(0x0000000000000002, relocInfo::none);
4544     __ emit_data64(0x0000000000000000, relocInfo::none);
4545     __ emit_data64(0x0000000000000003, relocInfo::none);
4546     __ emit_data64(0x0000000000000000, relocInfo::none);
4547     __ emit_data64(0x0000000000000004, relocInfo::none);//linc4 = counter_mask_addr() + 128
4548     __ emit_data64(0x0000000000000000, relocInfo::none);
4549     __ emit_data64(0x0000000000000004, relocInfo::none);
4550     __ emit_data64(0x0000000000000000, relocInfo::none);
4551     __ emit_data64(0x0000000000000004, relocInfo::none);
4552     __ emit_data64(0x0000000000000000, relocInfo::none);
4553     __ emit_data64(0x0000000000000004, relocInfo::none);
4554     __ emit_data64(0x0000000000000000, relocInfo::none);
4555     __ emit_data64(0x0000000000000008, relocInfo::none);//linc8 = counter_mask_addr() + 192
4556     __ emit_data64(0x0000000000000000, relocInfo::none);
4557     __ emit_data64(0x0000000000000008, relocInfo::none);
4558     __ emit_data64(0x0000000000000000, relocInfo::none);
4559     __ emit_data64(0x0000000000000008, relocInfo::none);
4560     __ emit_data64(0x0000000000000000, relocInfo::none);
4561     __ emit_data64(0x0000000000000008, relocInfo::none);
4562     __ emit_data64(0x0000000000000000, relocInfo::none);
4563     __ emit_data64(0x0000000000000020, relocInfo::none);//linc32 = counter_mask_addr() + 256
4564     __ emit_data64(0x0000000000000000, relocInfo::none);
4565     __ emit_data64(0x0000000000000020, relocInfo::none);
4566     __ emit_data64(0x0000000000000000, relocInfo::none);
4567     __ emit_data64(0x0000000000000020, relocInfo::none);
4568     __ emit_data64(0x0000000000000000, relocInfo::none);
4569     __ emit_data64(0x0000000000000020, relocInfo::none);
4570     __ emit_data64(0x0000000000000000, relocInfo::none);
4571     __ emit_data64(0x0000000000000010, relocInfo::none);//linc16 = counter_mask_addr() + 320
4572     __ emit_data64(0x0000000000000000, relocInfo::none);
4573     __ emit_data64(0x0000000000000010, relocInfo::none);
4574     __ emit_data64(0x0000000000000000, relocInfo::none);
4575     __ emit_data64(0x0000000000000010, relocInfo::none);
4576     __ emit_data64(0x0000000000000000, relocInfo::none);
4577     __ emit_data64(0x0000000000000010, relocInfo::none);
4578     __ emit_data64(0x0000000000000000, relocInfo::none);
4579     return start;
4580   }
4581 
  // Vector AES Counter (CTR) mode implementation
4583   address generate_counterMode_VectorAESCrypt()  {
4584     __ align(CodeEntryAlignment);
4585     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4586     address start = __ pc();
4587     const Register from = c_rarg0; // source array address
4588     const Register to = c_rarg1; // destination array address
4589     const Register key = c_rarg2; // key array address r8
4590     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4591     // and updated with the incremented counter in the end
4592 #ifndef _WIN64
4593     const Register len_reg = c_rarg4;
4594     const Register saved_encCounter_start = c_rarg5;
4595     const Register used_addr = r10;
4596     const Address  used_mem(rbp, 2 * wordSize);
4597     const Register used = r11;
4598 #else
4599     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4600     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
4601     const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4602     const Register len_reg = r10; // pick the first volatile windows register
4603     const Register saved_encCounter_start = r11;
4604     const Register used_addr = r13;
4605     const Register used = r14;
4606 #endif
4607     __ enter();
    // Save state before entering routine
4609     __ push(r12);
4610     __ push(r13);
4611     __ push(r14);
4612     __ push(r15);
4613 #ifdef _WIN64
4614     // on win64, fill len_reg from stack position
4615     __ movl(len_reg, len_mem);
4616     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4617     __ movptr(used_addr, used_mem);
4618     __ movl(used, Address(used_addr, 0));
4619 #else
4620     __ push(len_reg); // Save
4621     __ movptr(used_addr, used_mem);
4622     __ movl(used, Address(used_addr, 0));
4623 #endif
4624     __ push(rbx);
4625     __ aesctr_encrypt(from, to, key, counter, len_reg, used, used_addr, saved_encCounter_start);
4626     __ vzeroupper();
4627     // Restore state before leaving routine
4628     __ pop(rbx);
4629 #ifdef _WIN64
4630     __ movl(rax, len_mem); // return length
4631 #else
4632     __ pop(rax); // return length
4633 #endif
4634     __ pop(r15);
4635     __ pop(r14);
4636     __ pop(r13);
4637     __ pop(r12);
4638 
4639     __ leave(); // required for proper stackwalking of RuntimeStub frame
4640     __ ret(0);
4641     return start;
4642   }
4643 
4644   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
4645   // to hide instruction latency
4646   //
4647   // Arguments:
4648   //
4649   // Inputs:
4650   //   c_rarg0   - source byte array address
4651   //   c_rarg1   - destination byte array address
4652   //   c_rarg2   - K (key) in little endian int array
4653   //   c_rarg3   - counter vector byte array address
4654   //   Linux
4655   //     c_rarg4   -          input length
4656   //     c_rarg5   -          saved encryptedCounter start
4657   //     rbp + 6 * wordSize - saved used length
4658   //   Windows
4659   //     rbp + 6 * wordSize - input length
4660   //     rbp + 7 * wordSize - saved encryptedCounter start
4661   //     rbp + 8 * wordSize - saved used length
4662   //
4663   // Output:
4664   //   rax       - input length
4665   //
4666   address generate_counterMode_AESCrypt_Parallel() {
4667     assert(UseAES, "need AES instructions and misaligned SSE support");
4668     __ align(CodeEntryAlignment);
4669     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4670     address start = __ pc();
4671     const Register from = c_rarg0; // source array address
4672     const Register to = c_rarg1; // destination array address
4673     const Register key = c_rarg2; // key array address
4674     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4675                                       // and updated with the incremented counter in the end
4676 #ifndef _WIN64
4677     const Register len_reg = c_rarg4;
4678     const Register saved_encCounter_start = c_rarg5;
4679     const Register used_addr = r10;
4680     const Address  used_mem(rbp, 2 * wordSize);
4681     const Register used = r11;
4682 #else
4683     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
    const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4686     const Register len_reg = r10; // pick the first volatile windows register
4687     const Register saved_encCounter_start = r11;
4688     const Register used_addr = r13;
4689     const Register used = r14;
4690 #endif
4691     const Register pos = rax;
4692 
4693     const int PARALLEL_FACTOR = 6;
4694     const XMMRegister xmm_counter_shuf_mask = xmm0;
4695     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
4696     const XMMRegister xmm_curr_counter = xmm2;
4697 
4698     const XMMRegister xmm_key_tmp0 = xmm3;
4699     const XMMRegister xmm_key_tmp1 = xmm4;
4700 
4701     // registers holding the four results in the parallelized loop
4702     const XMMRegister xmm_result0 = xmm5;
4703     const XMMRegister xmm_result1 = xmm6;
4704     const XMMRegister xmm_result2 = xmm7;
4705     const XMMRegister xmm_result3 = xmm8;
4706     const XMMRegister xmm_result4 = xmm9;
4707     const XMMRegister xmm_result5 = xmm10;
4708 
4709     const XMMRegister xmm_from0 = xmm11;
4710     const XMMRegister xmm_from1 = xmm12;
4711     const XMMRegister xmm_from2 = xmm13;
    const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on WIN64.
    const XMMRegister xmm_from4 = xmm3;  // reuse xmm3-xmm4, since xmm_key_tmp0/1 are no longer needed once the input text is loaded
4714     const XMMRegister xmm_from5 = xmm4;
4715 
    // for key_128, key_192, key_256
4717     const int rounds[3] = {10, 12, 14};
4718     Label L_exit_preLoop, L_preLoop_start;
4719     Label L_multiBlock_loopTop[3];
4720     Label L_singleBlockLoopTop[3];
4721     Label L__incCounter[3][6]; //for 6 blocks
4722     Label L__incCounter_single[3]; //for single block, key128, key192, key256
4723     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
4724     Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
4725 
4726     Label L_exit;
4727 
4728     __ enter(); // required for proper stackwalking of RuntimeStub frame
4729 
4730 #ifdef _WIN64
4731     // allocate spill slots for r13, r14
4732     enum {
4733         saved_r13_offset,
4734         saved_r14_offset
4735     };
4736     __ subptr(rsp, 2 * wordSize);
4737     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
4738     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
4739 
4740     // on win64, fill len_reg from stack position
4741     __ movl(len_reg, len_mem);
4742     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4743     __ movptr(used_addr, used_mem);
4744     __ movl(used, Address(used_addr, 0));
4745 #else
4746     __ push(len_reg); // Save
4747     __ movptr(used_addr, used_mem);
4748     __ movl(used, Address(used_addr, 0));
4749 #endif
4750 
4751     __ push(rbx); // Save RBX
4752     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
4753     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
4754     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
4755     __ movptr(pos, 0);
4756 
    // Use the partially used encrypted counter from the last invocation
4758     __ BIND(L_preLoop_start);
4759     __ cmpptr(used, 16);
4760     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
4761       __ cmpptr(len_reg, 0);
4762       __ jcc(Assembler::lessEqual, L_exit_preLoop);
4763       __ movb(rbx, Address(saved_encCounter_start, used));
4764       __ xorb(rbx, Address(from, pos));
4765       __ movb(Address(to, pos), rbx);
4766       __ addptr(pos, 1);
4767       __ addptr(used, 1);
4768       __ subptr(len_reg, 1);
4769 
4770     __ jmp(L_preLoop_start);
4771 
4772     __ BIND(L_exit_preLoop);
4773     __ movl(Address(used_addr, 0), used);
4774 
4775     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
4776     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
4777     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4778     __ cmpl(rbx, 52);
4779     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
4780     __ cmpl(rbx, 60);
4781     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
4782 
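// CTR_DoSix applies a two-operand instruction to all six in-flight result registers,
// i.e. one AES round across six counter blocks at a time.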
4783 #define CTR_DoSix(opc, src_reg)                \
4784     __ opc(xmm_result0, src_reg);              \
4785     __ opc(xmm_result1, src_reg);              \
4786     __ opc(xmm_result2, src_reg);              \
4787     __ opc(xmm_result3, src_reg);              \
4788     __ opc(xmm_result4, src_reg);              \
4789     __ opc(xmm_result5, src_reg);
4790 
4791     // k == 0 :  generate code for key_128
4792     // k == 1 :  generate code for key_192
4793     // k == 2 :  generate code for key_256
4794     for (int k = 0; k < 3; ++k) {
4795       //multi blocks starts here
4796       __ align(OptoLoopAlignment);
4797       __ BIND(L_multiBlock_loopTop[k]);
4798       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
4799       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
4800       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4801 
      // load, then increment the counters
4803       CTR_DoSix(movdqa, xmm_curr_counter);
4804       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
4805       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
4806       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
4807       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
4808       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
4809       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
      CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle the counters back for PXOR
4811       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
4812 
4813       //load two ROUND_KEYs at a time
4814       for (int i = 1; i < rounds[k]; ) {
4815         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4816         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4817         CTR_DoSix(aesenc, xmm_key_tmp1);
4818         i++;
4819         if (i != rounds[k]) {
4820           CTR_DoSix(aesenc, xmm_key_tmp0);
4821         } else {
4822           CTR_DoSix(aesenclast, xmm_key_tmp0);
4823         }
4824         i++;
4825       }
4826 
4827       // get next PARALLEL_FACTOR blocks into xmm_result registers
4828       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4829       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4830       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4831       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4832       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4833       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4834 
4835       __ pxor(xmm_result0, xmm_from0);
4836       __ pxor(xmm_result1, xmm_from1);
4837       __ pxor(xmm_result2, xmm_from2);
4838       __ pxor(xmm_result3, xmm_from3);
4839       __ pxor(xmm_result4, xmm_from4);
4840       __ pxor(xmm_result5, xmm_from5);
4841 
      // store 6 results into the next 96 bytes of output
4843       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4844       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4845       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4846       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4847       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4848       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4849 
      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance position past the 6 processed blocks
4851       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4852       __ jmp(L_multiBlock_loopTop[k]);
4853 
4854       // singleBlock starts here
4855       __ align(OptoLoopAlignment);
4856       __ BIND(L_singleBlockLoopTop[k]);
4857       __ cmpptr(len_reg, 0);
4858       __ jcc(Assembler::lessEqual, L_exit);
4859       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4860       __ movdqa(xmm_result0, xmm_curr_counter);
4861       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4862       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4863       __ pxor(xmm_result0, xmm_key_tmp0);
4864       for (int i = 1; i < rounds[k]; i++) {
4865         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4866         __ aesenc(xmm_result0, xmm_key_tmp0);
4867       }
4868       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4869       __ aesenclast(xmm_result0, xmm_key_tmp0);
4870       __ cmpptr(len_reg, AESBlockSize);
4871       __ jcc(Assembler::less, L_processTail_insr[k]);
4872         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4873         __ pxor(xmm_result0, xmm_from0);
4874         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4875         __ addptr(pos, AESBlockSize);
4876         __ subptr(len_reg, AESBlockSize);
4877         __ jmp(L_singleBlockLoopTop[k]);
4878       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4879         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4880         __ testptr(len_reg, 8);
4881         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4882           __ subptr(pos,8);
4883           __ pinsrq(xmm_from0, Address(from, pos), 0);
4884         __ BIND(L_processTail_4_insr[k]);
4885         __ testptr(len_reg, 4);
4886         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4887           __ subptr(pos,4);
4888           __ pslldq(xmm_from0, 4);
4889           __ pinsrd(xmm_from0, Address(from, pos), 0);
4890         __ BIND(L_processTail_2_insr[k]);
4891         __ testptr(len_reg, 2);
4892         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4893           __ subptr(pos, 2);
4894           __ pslldq(xmm_from0, 2);
4895           __ pinsrw(xmm_from0, Address(from, pos), 0);
4896         __ BIND(L_processTail_1_insr[k]);
4897         __ testptr(len_reg, 1);
4898         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4899           __ subptr(pos, 1);
4900           __ pslldq(xmm_from0, 1);
4901           __ pinsrb(xmm_from0, Address(from, pos), 0);
4902         __ BIND(L_processTail_exit_insr[k]);
4903 
4904         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Perform pxor of the encrypted counter and plaintext Bytes.
4905         __ pxor(xmm_result0, xmm_from0);                             //    Also the encrypted counter is saved for next invocation.
4906 
4907         __ testptr(len_reg, 8);
4908         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4909           __ pextrq(Address(to, pos), xmm_result0, 0);
4910           __ psrldq(xmm_result0, 8);
4911           __ addptr(pos, 8);
4912         __ BIND(L_processTail_4_extr[k]);
4913         __ testptr(len_reg, 4);
4914         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4915           __ pextrd(Address(to, pos), xmm_result0, 0);
4916           __ psrldq(xmm_result0, 4);
4917           __ addptr(pos, 4);
4918         __ BIND(L_processTail_2_extr[k]);
4919         __ testptr(len_reg, 2);
4920         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4921           __ pextrw(Address(to, pos), xmm_result0, 0);
4922           __ psrldq(xmm_result0, 2);
4923           __ addptr(pos, 2);
4924         __ BIND(L_processTail_1_extr[k]);
4925         __ testptr(len_reg, 1);
4926         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4927           __ pextrb(Address(to, pos), xmm_result0, 0);
4928 
4929         __ BIND(L_processTail_exit_extr[k]);
4930         __ movl(Address(used_addr, 0), len_reg);
4931         __ jmp(L_exit);
4932 
4933     }
4934 
4935     __ BIND(L_exit);
    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // counter is shuffled back
    __ movdqu(Address(counter, 0), xmm_curr_counter); // save counter back
4938     __ pop(rbx); // pop the saved RBX.
4939 #ifdef _WIN64
4940     __ movl(rax, len_mem);
4941     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4942     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4943     __ addptr(rsp, 2 * wordSize);
4944 #else
4945     __ pop(rax); // return 'len'
4946 #endif
4947     __ leave(); // required for proper stackwalking of RuntimeStub frame
4948     __ ret(0);
4949     return start;
4950   }
4951 
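// Helpers for the AVX-512 CBC decrypt stub below: apply one AES decryption round
// (or the final round) to the eight 512-bit registers xmm1..xmm8, i.e. 32 blocks per call.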
4952 void roundDec(XMMRegister xmm_reg) {
4953   __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4954   __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4955   __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4956   __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4957   __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4958   __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4959   __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4960   __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4961 }
4962 
4963 void roundDeclast(XMMRegister xmm_reg) {
4964   __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4965   __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4966   __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4967   __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4968   __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4969   __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4970   __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4971   __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4972 }
4973 
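  // Load one 128-bit round key, shuffle its bytes with the key shuffle mask, and
  // broadcast it to all four 128-bit lanes of the 512-bit destination register.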
4974   void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
4975     __ movdqu(xmmdst, Address(key, offset));
4976     if (xmm_shuf_mask != NULL) {
4977       __ pshufb(xmmdst, xmm_shuf_mask);
4978     } else {
4979       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4980     }
    __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
  }
4984 
4985 address generate_cipherBlockChaining_decryptVectorAESCrypt() {
4986     assert(VM_Version::supports_avx512_vaes(), "need AES instructions and misaligned SSE support");
4987     __ align(CodeEntryAlignment);
4988     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4989     address start = __ pc();
4990 
4991     const Register from = c_rarg0;  // source array address
4992     const Register to = c_rarg1;  // destination array address
4993     const Register key = c_rarg2;  // key array address
4994     const Register rvec = c_rarg3;  // r byte array initialized from initvector array address
4995     // and left with the results of the last encryption block
4996 #ifndef _WIN64
4997     const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
4998 #else
4999     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg = r11;      // pick a volatile Windows register
5001 #endif
5002 
5003     Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
5004           Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
5005 
5006     __ enter();
5007 
5008 #ifdef _WIN64
    // on win64, fill len_reg from stack position
5010     __ movl(len_reg, len_mem);
5011 #else
5012     __ push(len_reg); // Save
5013 #endif
5014     __ push(rbx);
5015     __ vzeroupper();
5016 
5017     // Temporary variable declaration for swapping key bytes
5018     const XMMRegister xmm_key_shuf_mask = xmm1;
5019     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
5020 
    // Load the key length in ints, which determines the number of rounds: 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds
5022     const Register rounds = rbx;
5023     __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
5024 
5025     const XMMRegister IV = xmm0;
5026     // Load IV and broadcast value to 512-bits
5027     __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
5028 
5029     // Temporary variables for storing round keys
5030     const XMMRegister RK0 = xmm30;
5031     const XMMRegister RK1 = xmm9;
5032     const XMMRegister RK2 = xmm18;
5033     const XMMRegister RK3 = xmm19;
5034     const XMMRegister RK4 = xmm20;
5035     const XMMRegister RK5 = xmm21;
5036     const XMMRegister RK6 = xmm22;
5037     const XMMRegister RK7 = xmm23;
5038     const XMMRegister RK8 = xmm24;
5039     const XMMRegister RK9 = xmm25;
5040     const XMMRegister RK10 = xmm26;
5041 
    // Load and shuffle key
5043     // the java expanded key ordering is rotated one position from what we want
5044     // so we start from 1*16 here and hit 0*16 last
5045     ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
5046     ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
5047     ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
5048     ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
5049     ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
5050     ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
5051     ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
5052     ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
5053     ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
5054     ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
5055     ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
5056 
5057     // Variables for storing source cipher text
5058     const XMMRegister S0 = xmm10;
5059     const XMMRegister S1 = xmm11;
5060     const XMMRegister S2 = xmm12;
5061     const XMMRegister S3 = xmm13;
5062     const XMMRegister S4 = xmm14;
5063     const XMMRegister S5 = xmm15;
5064     const XMMRegister S6 = xmm16;
5065     const XMMRegister S7 = xmm17;
5066 
5067     // Variables for storing decrypted text
5068     const XMMRegister B0 = xmm1;
5069     const XMMRegister B1 = xmm2;
5070     const XMMRegister B2 = xmm3;
5071     const XMMRegister B3 = xmm4;
5072     const XMMRegister B4 = xmm5;
5073     const XMMRegister B5 = xmm6;
5074     const XMMRegister B6 = xmm7;
5075     const XMMRegister B7 = xmm8;
5076 
5077     __ cmpl(rounds, 44);
5078     __ jcc(Assembler::greater, KEY_192);
5079     __ jmp(Loop);
5080 
5081     __ BIND(KEY_192);
5082     const XMMRegister RK11 = xmm27;
5083     const XMMRegister RK12 = xmm28;
5084     ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
5085     ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
5086 
5087     __ cmpl(rounds, 52);
5088     __ jcc(Assembler::greater, KEY_256);
5089     __ jmp(Loop);
5090 
5091     __ BIND(KEY_256);
5092     const XMMRegister RK13 = xmm29;
5093     const XMMRegister RK14 = xmm31;
5094     ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
5095     ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
5096 
5097     __ BIND(Loop);
5098     __ cmpl(len_reg, 512);
5099     __ jcc(Assembler::below, Lcbc_dec_rem);
5100     __ BIND(Loop1);
5101     __ subl(len_reg, 512);
5102     __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
5103     __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
5104     __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
5105     __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
5106     __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
5107     __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
5108     __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
5109     __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
5110     __ leaq(from, Address(from, 8 * 64));
5111 
5112     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5113     __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
5114     __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
5115     __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
5116     __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
5117     __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
5118     __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
5119     __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
5120 
5121     __ evalignq(IV, S0, IV, 0x06);
5122     __ evalignq(S0, S1, S0, 0x06);
5123     __ evalignq(S1, S2, S1, 0x06);
5124     __ evalignq(S2, S3, S2, 0x06);
5125     __ evalignq(S3, S4, S3, 0x06);
5126     __ evalignq(S4, S5, S4, 0x06);
5127     __ evalignq(S5, S6, S5, 0x06);
5128     __ evalignq(S6, S7, S6, 0x06);
5129 
5130     roundDec(RK2);
5131     roundDec(RK3);
5132     roundDec(RK4);
5133     roundDec(RK5);
5134     roundDec(RK6);
5135     roundDec(RK7);
5136     roundDec(RK8);
5137     roundDec(RK9);
5138     roundDec(RK10);
5139 
5140     __ cmpl(rounds, 44);
5141     __ jcc(Assembler::belowEqual, L_128);
5142     roundDec(RK11);
5143     roundDec(RK12);
5144 
5145     __ cmpl(rounds, 52);
5146     __ jcc(Assembler::belowEqual, L_192);
5147     roundDec(RK13);
5148     roundDec(RK14);
5149 
5150     __ BIND(L_256);
5151     roundDeclast(RK0);
5152     __ jmp(Loop2);
5153 
5154     __ BIND(L_128);
5155     roundDeclast(RK0);
5156     __ jmp(Loop2);
5157 
5158     __ BIND(L_192);
5159     roundDeclast(RK0);
5160 
5161     __ BIND(Loop2);
5162     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5163     __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
5164     __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
5165     __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
5166     __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
5167     __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
5168     __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
5169     __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
5170     __ evmovdquq(IV, S7, Assembler::AVX_512bit);
5171 
5172     __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
5173     __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
5174     __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
5175     __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
5176     __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
5177     __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
5178     __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
5179     __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
5180     __ leaq(to, Address(to, 8 * 64));
5181     __ jmp(Loop);
5182 
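    // Remainder path (fewer than 512 bytes left): move the chaining value (lane 3 of IV)
    // into the low 128-bit lane, then decrypt one 16-byte block per iteration.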
5183     __ BIND(Lcbc_dec_rem);
5184     __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
5185 
5186     __ BIND(Lcbc_dec_rem_loop);
5187     __ subl(len_reg, 16);
5188     __ jcc(Assembler::carrySet, Lcbc_dec_ret);
5189 
5190     __ movdqu(S0, Address(from, 0));
5191     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5192     __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
5193     __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
5194     __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
5195     __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
5196     __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
5197     __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
5198     __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
5199     __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
5200     __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
5201     __ cmpl(rounds, 44);
5202     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5203 
5204     __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
5205     __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
5206     __ cmpl(rounds, 52);
5207     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5208 
5209     __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
5210     __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
5211 
5212     __ BIND(Lcbc_dec_rem_last);
5213     __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
5214 
5215     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5216     __ evmovdquq(IV, S0, Assembler::AVX_512bit);
5217     __ movdqu(Address(to, 0), B0);
5218     __ leaq(from, Address(from, 16));
5219     __ leaq(to, Address(to, 16));
5220     __ jmp(Lcbc_dec_rem_loop);
5221 
5222     __ BIND(Lcbc_dec_ret);
5223     __ movdqu(Address(rvec, 0), IV);
5224 
5225     // Zero out the round keys
5226     __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
5227     __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
5228     __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
5229     __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
5230     __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
5231     __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
5232     __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
5233     __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
5234     __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
5235     __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
5236     __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
5237     __ cmpl(rounds, 44);
5238     __ jcc(Assembler::belowEqual, Lcbc_exit);
5239     __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
5240     __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
5241     __ cmpl(rounds, 52);
5242     __ jcc(Assembler::belowEqual, Lcbc_exit);
5243     __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
5244     __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
5245 
5246     __ BIND(Lcbc_exit);
5247     __ vzeroupper();
5248     __ pop(rbx);
5249 #ifdef _WIN64
5250     __ movl(rax, len_mem);
5251 #else
5252     __ pop(rax); // return length
5253 #endif
5254     __ leave(); // required for proper stackwalking of RuntimeStub frame
5255     __ ret(0);
5256     return start;
5257   }
5258 
5259   // Polynomial x^128+x^127+x^126+x^121+1
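       // (The two quadwords emitted below form the 128-bit constant
       // 0xc2000000000000000000000000000001: bits 127, 126, 121 and 0 are set,
       // matching the terms of the polynomial above with x^128 left implicit.)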
5260   address ghash_polynomial_addr() {
5261     __ align(CodeEntryAlignment);
5262     StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
5263     address start = __ pc();
5264     __ emit_data64(0x0000000000000001, relocInfo::none);
5265     __ emit_data64(0xc200000000000000, relocInfo::none);
5266     return start;
5267   }
5268 
5269   address ghash_shufflemask_addr() {
5270     __ align(CodeEntryAlignment);
5271     StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
5272     address start = __ pc();
5273     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5274     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5275     return start;
5276   }
5277 
5278   // GHASH single and multi-block operations using AVX instructions
5279   address generate_avx_ghash_processBlocks() {
5280     __ align(CodeEntryAlignment);
5281 
5282     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5283     address start = __ pc();
5284 
5285     // arguments
5286     const Register state = c_rarg0;
5287     const Register htbl = c_rarg1;
5288     const Register data = c_rarg2;
5289     const Register blocks = c_rarg3;
5290     __ enter();
5291     // Save state before entering routine
5292     __ avx_ghash(state, htbl, data, blocks);
5293     __ leave(); // required for proper stackwalking of RuntimeStub frame
5294     __ ret(0);
5295     return start;
5296   }
5297 
5298   // byte swap x86 long
5299   address generate_ghash_long_swap_mask() {
5300     __ align(CodeEntryAlignment);
5301     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
5302     address start = __ pc();
5303     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
5304     __ emit_data64(0x0706050403020100, relocInfo::none );
5305     return start;
5306   }
5307 
5308   // byte swap x86 byte array
5309   address generate_ghash_byte_swap_mask() {
5310     __ align(CodeEntryAlignment);
5311     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
5312     address start = __ pc();
5313     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
5314     __ emit_data64(0x0001020304050607, relocInfo::none );
5315     return start;
5316   }
5317 
5318   /* Single and multi-block ghash operations */
5319   address generate_ghash_processBlocks() {
5320     __ align(CodeEntryAlignment);
5321     Label L_ghash_loop, L_exit;
5322     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5323     address start = __ pc();
5324 
5325     const Register state        = c_rarg0;
5326     const Register subkeyH      = c_rarg1;
5327     const Register data         = c_rarg2;
5328     const Register blocks       = c_rarg3;
5329 
5330     const XMMRegister xmm_temp0 = xmm0;
5331     const XMMRegister xmm_temp1 = xmm1;
5332     const XMMRegister xmm_temp2 = xmm2;
5333     const XMMRegister xmm_temp3 = xmm3;
5334     const XMMRegister xmm_temp4 = xmm4;
5335     const XMMRegister xmm_temp5 = xmm5;
5336     const XMMRegister xmm_temp6 = xmm6;
5337     const XMMRegister xmm_temp7 = xmm7;
5338     const XMMRegister xmm_temp8 = xmm8;
5339     const XMMRegister xmm_temp9 = xmm9;
5340     const XMMRegister xmm_temp10 = xmm10;
5341 
5342     __ enter();
5343 
5344     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
5345 
5346     __ movdqu(xmm_temp0, Address(state, 0));
5347     __ pshufb(xmm_temp0, xmm_temp10);
5348 
5349 
5350     __ BIND(L_ghash_loop);
5351     __ movdqu(xmm_temp2, Address(data, 0));
5352     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
5353 
5354     __ movdqu(xmm_temp1, Address(subkeyH, 0));
5355     __ pshufb(xmm_temp1, xmm_temp10);
5356 
5357     __ pxor(xmm_temp0, xmm_temp2);
5358 
5359     //
5360     // Multiply with the hash key
5361     //
5362     __ movdqu(xmm_temp3, xmm_temp0);
5363     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
5364     __ movdqu(xmm_temp4, xmm_temp0);
5365     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
5366 
5367     __ movdqu(xmm_temp5, xmm_temp0);
5368     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
5369     __ movdqu(xmm_temp6, xmm_temp0);
5370     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
5371 
5372     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
5373 
5374     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
5375     __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
5376     __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
5377     __ pxor(xmm_temp3, xmm_temp5);
5378     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
5379                                         // of the carry-less multiplication of
5380                                         // xmm0 by xmm1.
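         // (With a = a1:a0 = xmm0 and b = b1:b0 = xmm1 split into 64-bit halves,
         // the carry-less product is a0*b0 ^ ((a0*b1 ^ a1*b0) << 64) ^ (a1*b1 << 128);
         // the middle term was split and folded into xmm3 and xmm6 above.)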
5381 
5382     // We shift the result of the multiplication by one bit position
5383     // to the left to compensate for the fact that the bits are reversed.
5384     __ movdqu(xmm_temp7, xmm_temp3);
5385     __ movdqu(xmm_temp8, xmm_temp6);
5386     __ pslld(xmm_temp3, 1);
5387     __ pslld(xmm_temp6, 1);
5388     __ psrld(xmm_temp7, 31);
5389     __ psrld(xmm_temp8, 31);
5390     __ movdqu(xmm_temp9, xmm_temp7);
5391     __ pslldq(xmm_temp8, 4);
5392     __ pslldq(xmm_temp7, 4);
5393     __ psrldq(xmm_temp9, 12);
5394     __ por(xmm_temp3, xmm_temp7);
5395     __ por(xmm_temp6, xmm_temp8);
5396     __ por(xmm_temp6, xmm_temp9);
5397 
5398     //
5399     // First phase of the reduction
5400     //
5401     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
5402     // independently.
5403     __ movdqu(xmm_temp7, xmm_temp3);
5404     __ movdqu(xmm_temp8, xmm_temp3);
5405     __ movdqu(xmm_temp9, xmm_temp3);
5406     __ pslld(xmm_temp7, 31);    // packed left shift by 31
5407     __ pslld(xmm_temp8, 30);    // packed left shift by 30
5408     __ pslld(xmm_temp9, 25);    // packed left shift by 25
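         // (The shift counts follow from the GHASH polynomial
         // x^128 + x^127 + x^126 + x^121 + 1: the exponent gaps 1, 2 and 7 give
         // the >> 1, >> 2, >> 7 of the second phase below, and 32-1, 32-2, 32-7
         // give the << 31, << 30, << 25 used here, applied per 32-bit lane.)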
5409     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
5410     __ pxor(xmm_temp7, xmm_temp9);
5411     __ movdqu(xmm_temp8, xmm_temp7);
5412     __ pslldq(xmm_temp7, 12);
5413     __ psrldq(xmm_temp8, 4);
5414     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
5415 
5416     //
5417     // Second phase of the reduction
5418     //
5419     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
5420     // shift operations.
5421     __ movdqu(xmm_temp2, xmm_temp3);
5422     __ movdqu(xmm_temp4, xmm_temp3);
5423     __ movdqu(xmm_temp5, xmm_temp3);
5424     __ psrld(xmm_temp2, 1);     // packed right shift by 1
5425     __ psrld(xmm_temp4, 2);     // packed right shift by 2
5426     __ psrld(xmm_temp5, 7);     // packed right shift by 7
5427     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
5428     __ pxor(xmm_temp2, xmm_temp5);
5429     __ pxor(xmm_temp2, xmm_temp8);
5430     __ pxor(xmm_temp3, xmm_temp2);
5431     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
5432 
5433     __ decrement(blocks);
5434     __ jcc(Assembler::zero, L_exit);
5435     __ movdqu(xmm_temp0, xmm_temp6);
5436     __ addptr(data, 16);
5437     __ jmp(L_ghash_loop);
5438 
5439     __ BIND(L_exit);
5440     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
5441     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
5442     __ leave();
5443     __ ret(0);
5444     return start;
5445   }
5446 
5447   address base64_shuffle_addr()
5448   {
5449     __ align64();
5450     StubCodeMark mark(this, "StubRoutines", "shuffle_base64");
5451     address start = __ pc();
5452     assert(((unsigned long long)start & 0x3f) == 0,
5453            "Alignment problem (0x%08llx)", (unsigned long long)start);
5454     __ emit_data64(0x0405030401020001, relocInfo::none);
5455     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5456     __ emit_data64(0x10110f100d0e0c0d, relocInfo::none);
5457     __ emit_data64(0x1617151613141213, relocInfo::none);
5458     __ emit_data64(0x1c1d1b1c191a1819, relocInfo::none);
5459     __ emit_data64(0x222321221f201e1f, relocInfo::none);
5460     __ emit_data64(0x2829272825262425, relocInfo::none);
5461     __ emit_data64(0x2e2f2d2e2b2c2a2b, relocInfo::none);
5462     return start;
5463   }
5464 
5465   address base64_avx2_shuffle_addr()
5466   {
5467     __ align32();
5468     StubCodeMark mark(this, "StubRoutines", "avx2_shuffle_base64");
5469     address start = __ pc();
5470     __ emit_data64(0x0809070805060405, relocInfo::none);
5471     __ emit_data64(0x0e0f0d0e0b0c0a0b, relocInfo::none);
5472     __ emit_data64(0x0405030401020001, relocInfo::none);
5473     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5474     return start;
5475   }
5476 
5477   address base64_avx2_input_mask_addr()
5478   {
5479     __ align32();
5480     StubCodeMark mark(this, "StubRoutines", "avx2_input_mask_base64");
5481     address start = __ pc();
5482     __ emit_data64(0x8000000000000000, relocInfo::none);
5483     __ emit_data64(0x8000000080000000, relocInfo::none);
5484     __ emit_data64(0x8000000080000000, relocInfo::none);
5485     __ emit_data64(0x8000000080000000, relocInfo::none);
5486     return start;
5487   }
5488 
5489   address base64_avx2_lut_addr()
5490   {
5491     __ align32();
5492     StubCodeMark mark(this, "StubRoutines", "avx2_lut_base64");
5493     address start = __ pc();
5494     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5495     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5496     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5497     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5498 
5499     // URL LUT
5500     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5501     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5502     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5503     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5504     return start;
5505   }
5506 
5507   address base64_encoding_table_addr()
5508   {
5509     __ align64();
5510     StubCodeMark mark(this, "StubRoutines", "encoding_table_base64");
5511     address start = __ pc();
5512     assert(((unsigned long long)start & 0x3f) == 0, "Alignment problem (0x%08llx)", (unsigned long long)start);
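         // Two 64-byte alphabets back to back: the standard Base64 alphabet
         // (ending in '+' and '/') followed by the base64url alphabet (ending
         // in '-' and '_').  Callers select one by adding isURL << 6.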
5513     __ emit_data64(0x4847464544434241, relocInfo::none);
5514     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5515     __ emit_data64(0x5857565554535251, relocInfo::none);
5516     __ emit_data64(0x6665646362615a59, relocInfo::none);
5517     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5518     __ emit_data64(0x767574737271706f, relocInfo::none);
5519     __ emit_data64(0x333231307a797877, relocInfo::none);
5520     __ emit_data64(0x2f2b393837363534, relocInfo::none);
5521 
5522     // URL table
5523     __ emit_data64(0x4847464544434241, relocInfo::none);
5524     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5525     __ emit_data64(0x5857565554535251, relocInfo::none);
5526     __ emit_data64(0x6665646362615a59, relocInfo::none);
5527     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5528     __ emit_data64(0x767574737271706f, relocInfo::none);
5529     __ emit_data64(0x333231307a797877, relocInfo::none);
5530     __ emit_data64(0x5f2d393837363534, relocInfo::none);
5531     return start;
5532   }
5533 
5534   // Code for generating Base64 encoding.
5535   // Intrinsic function prototype in Base64.java:
5536   // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp,
5537   // boolean isURL) {
5538   address generate_base64_encodeBlock()
5539   {
5540     __ align(CodeEntryAlignment);
5541     StubCodeMark mark(this, "StubRoutines", "implEncode");
5542     address start = __ pc();
5543     __ enter();
5544 
5545     // Save callee-saved registers before using them
5546     __ push(r12);
5547     __ push(r13);
5548     __ push(r14);
5549     __ push(r15);
5550 
5551     // arguments
5552     const Register source = c_rarg0;       // Source Array
5553     const Register start_offset = c_rarg1; // start offset
5554     const Register end_offset = c_rarg2;   // end offset
5555     const Register dest = c_rarg3;   // destination array
5556 
5557 #ifndef _WIN64
5558     const Register dp = c_rarg4;    // Position for writing to dest array
5559     const Register isURL = c_rarg5; // Base64 or URL character set
5560 #else
5561     const Address dp_mem(rbp, 6 * wordSize); // dp is passed on the stack on Win64
5562     const Address isURL_mem(rbp, 7 * wordSize);
5563     const Register isURL = r10; // pick the volatile windows register
5564     const Register dp = r12;
5565     __ movl(dp, dp_mem);
5566     __ movl(isURL, isURL_mem);
5567 #endif
5568 
5569     const Register length = r14;
5570     const Register encode_table = r13;
5571     Label L_process3, L_exit, L_processdata, L_vbmiLoop, L_not512, L_32byteLoop;
5572 
5573     // calculate length from offsets
5574     __ movl(length, end_offset);
5575     __ subl(length, start_offset);
5576     __ cmpl(length, 0);
5577     __ jcc(Assembler::lessEqual, L_exit);
5578 
5579     // Code for 512-bit VBMI encoding.  Encodes 48 input bytes into 64
5580     // output bytes. We read 64 input bytes and ignore the last 16, so be
5581     // sure not to read past the end of the input buffer.
5582     if (VM_Version::supports_avx512_vbmi()) {
5583       __ cmpl(length, 64); // Do not overrun input buffer.
5584       __ jcc(Assembler::below, L_not512);
5585 
5586       __ shll(isURL, 6); // index into the encoding table based on isURL
5587       __ lea(encode_table, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5588       __ addptr(encode_table, isURL);
5589       __ shrl(isURL, 6); // restore isURL
5590 
5591       __ mov64(rax, 0x3036242a1016040aull); // Shifts
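           // (rax holds the per-byte bit offsets 10, 4, 22, 16, 42, 36, 54, 48
           // consumed by evpmultishiftqb below; each one selects the 8-bit window
           // whose low six bits form one Base64 index of a 6-byte input group.)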
5592       __ evmovdquq(xmm3, ExternalAddress(StubRoutines::x86::base64_shuffle_addr()), Assembler::AVX_512bit, r15);
5593       __ evmovdquq(xmm2, Address(encode_table, 0), Assembler::AVX_512bit);
5594       __ evpbroadcastq(xmm1, rax, Assembler::AVX_512bit);
5595 
5596       __ align32();
5597       __ BIND(L_vbmiLoop);
5598 
5599       __ vpermb(xmm0, xmm3, Address(source, start_offset), Assembler::AVX_512bit);
5600       __ subl(length, 48);
5601 
5602       // Put the input bytes into the proper lanes for writing, then
5603       // encode them.
5604       __ evpmultishiftqb(xmm0, xmm1, xmm0, Assembler::AVX_512bit);
5605       __ vpermb(xmm0, xmm0, xmm2, Assembler::AVX_512bit);
5606 
5607       // Write to destination
5608       __ evmovdquq(Address(dest, dp), xmm0, Assembler::AVX_512bit);
5609 
5610       __ addptr(dest, 64);
5611       __ addptr(source, 48);
5612       __ cmpl(length, 64);
5613       __ jcc(Assembler::aboveEqual, L_vbmiLoop);
5614 
5615       __ vzeroupper();
5616     }
5617 
5618     __ BIND(L_not512);
5619     if (VM_Version::supports_avx2()
5620         && VM_Version::supports_avx512vlbw()) {
5621       /*
5622       ** This AVX2 encoder is based on the paper at:
5623       **      https://dl.acm.org/doi/10.1145/3132709
5624       **
5625       ** We use AVX2 SIMD instructions to encode 24 bytes into 32
5626       ** output bytes.
5627       **
5628       */
5629       // Lengths under 32 bytes are done with scalar routine
5630       __ cmpl(length, 31);
5631       __ jcc(Assembler::belowEqual, L_process3);
5632 
5633       // Set up supporting constant table data
5634       __ vmovdqu(xmm9, ExternalAddress(StubRoutines::x86::base64_avx2_shuffle_addr()), rax);
5635       // 6-bit mask for 2nd and 4th (and multiples) 6-bit values
5636       __ movl(rax, 0x0fc0fc00);
5637       __ vmovdqu(xmm1, ExternalAddress(StubRoutines::x86::base64_avx2_input_mask_addr()), rax);
5638       __ evpbroadcastd(xmm8, rax, Assembler::AVX_256bit);
5639 
5640       // Multiplication constant for "shifting" right by 6 and 10
5641       // bits
5642       __ movl(rax, 0x04000040);
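           // (0x0400 = 2^10 and 0x0040 = 2^6; vpmulhuw keeps the high 16 bits of
           // each product, so these multipliers act as logical right shifts by 6
           // and 10 respectively.)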
5643 
5644       __ subl(length, 24);
5645       __ evpbroadcastd(xmm7, rax, Assembler::AVX_256bit);
5646 
5647       // For the first load, we mask off reading of the first 4
5648       // bytes into the register. This is so we can get 4 3-byte
5649       // chunks into each lane of the register, avoiding having to
5650       // handle end conditions.  We then shuffle these bytes into a
5651       // specific order so that manipulation is easier.
5652       //
5653       // The initial read loads the XMM register like this:
5654       //
5655       // Lower 128-bit lane:
5656       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5657       // | XX | XX | XX | XX | A0 | A1 | A2 | B0 | B1 | B2 | C0 | C1
5658       // | C2 | D0 | D1 | D2 |
5659       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5660       //
5661       // Upper 128-bit lane:
5662       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5663       // | E0 | E1 | E2 | F0 | F1 | F2 | G0 | G1 | G2 | H0 | H1 | H2
5664       // | XX | XX | XX | XX |
5665       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5666       //
5667       // Where A0 is the first input byte, B0 is the fourth, etc.
5668       // The alphabetical significance denotes the 3 bytes to be
5669       // consumed and encoded into 4 bytes.
5670       //
5671       // We then shuffle the register so each 32-bit word contains
5672       // the sequence:
5673       //    A1 A0 A2 A1, B1, B0, B2, B1, etc.
5674       // Each of these byte sequences are then manipulated into 4
5675       // 6-bit values ready for encoding.
5676       //
5677       // If we focus on one set of 3-byte chunks, changing the
5678       // nomenclature such that A0 => a, A1 => b, and A2 => c, we
5679       // shuffle such that each 24-bit chunk contains:
5680       //
5681       // b7 b6 b5 b4 b3 b2 b1 b0 | a7 a6 a5 a4 a3 a2 a1 a0 | c7 c6
5682       // c5 c4 c3 c2 c1 c0 | b7 b6 b5 b4 b3 b2 b1 b0
5683       // Relabeling the bits in terms of the four 6-bit output values
           // a..d (a = top 6 bits of the first source byte, d = low 6 bits of
           // the third) and listing the word most-significant byte first:
5684       // b3 b2 b1 b0 c5 c4 c3 c2 | c1 c0 d5 d4 d3 d2 d1 d0 | a5 a4
5685       // a3 a2 a1 a0 b5 b4 | b3 b2 b1 b0 c5 c4 c3 c2
5686       //
5687       // We first mask off all but bits 4-9 and 16-21 (c5..c0 and
5688       // a5..a0) and shift them using a vector multiplication
5689       // operation (vpmulhuw) which effectively shifts c right by 6
5690       // bits and a right by 10 bits.  We similarly mask bits 10-15
5691       // (d5..d0) and 22-27 (b5..b0) and shift them left by 8 and 4
5692       // bits respectively.  This is done using vpmullw.  We end up
5693       // with 4 6-bit values, thus splitting the 3 input bytes,
5694       // ready for encoding:
5695       //    0 0 d5..d0 0 0 c5..c0 0 0 b5..b0 0 0 a5..a0
5696       //
5697       // For translation, we recognize that there are 5 distinct
5698       // ranges of legal Base64 characters as below:
5699       //
5700       //   +-------------+-------------+------------+
5701       //   | 6-bit value | ASCII range |   offset   |
5702       //   +-------------+-------------+------------+
5703       //   |    0..25    |    A..Z     |     65     |
5704       //   |   26..51    |    a..z     |     71     |
5705       //   |   52..61    |    0..9     |     -4     |
5706       //   |     62      |   + or -    | -19 or -17 |
5707       //   |     63      |   / or _    | -16 or 32  |
5708       //   +-------------+-------------+------------+
5709       //
5710       // We note that vpshufb does a parallel lookup in a
5711       // destination register using the lower 4 bits of bytes from a
5712       // source register.  If we use a saturated subtraction and
5713       // subtract 51 from each 6-bit value, bytes from [0,51]
5714       // saturate to 0, and [52,63] map to a range of [1,12].  We
5715       // then add one to every value greater than 25 (by subtracting
5716       // the comparison mask computed below), which separates the
5717       // [0,25] and [26,51] ranges.  We end up with:
5718       //
5719       //   +-------------+-------------+------------+
5720       //   | 6-bit value |   Reduced   |   offset   |
5721       //   +-------------+-------------+------------+
5722       //   |    0..25    |      0      |     65     |
5723       //   |   26..51    |      1      |     71     |
5724       //   |   52..61    |    2..11    |     -4     |
5725       //   |     62      |     12      | -19 or -17 |
5726       //   |     63      |     13      | -16 or 32  |
5727       //   +-------------+-------------+------------+
5728       //
5729       // We then use a final vpshufb to add the appropriate offset,
5730       // translating the bytes.
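           // For example: the 6-bit value 0 reduces to 0 and LUT[0] = 65, giving
           // 'A'; 30 reduces to 1 and LUT[1] = 71, giving 30 + 71 = 'e'; 63
           // reduces to 13 and LUT[13] = -16 (or 32 for the URL table), giving
           // '/' (or '_').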
5731       //
5732       // Load input bytes - only 28 bytes.  Mask the first load to
5733       // not load into the full register.
5734       __ vpmaskmovd(xmm1, xmm1, Address(source, start_offset, Address::times_1, -4), Assembler::AVX_256bit);
5735 
5736       // Move 3-byte chunks of input (12 bytes) into 16 bytes,
5737       // ordering by:
5738       //   1, 0, 2, 1; 4, 3, 5, 4; etc.  This groups 6-bit chunks
5739       //   for easy masking
5740       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5741 
5742       __ addl(start_offset, 24);
5743 
5744       // Load masking register for first and third (and multiples)
5745       // 6-bit values.
5746       __ movl(rax, 0x003f03f0);
5747       __ evpbroadcastd(xmm6, rax, Assembler::AVX_256bit);
5748       // Multiplication constant for "shifting" left by 4 and 8 bits
5749       __ movl(rax, 0x01000010);
5750       __ evpbroadcastd(xmm5, rax, Assembler::AVX_256bit);
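           // (0x0010 = 2^4 and 0x0100 = 2^8; vpmullw keeps the low 16 bits, so
           // these multipliers act as left shifts by 4 and 8 respectively.)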
5751 
5752       // Isolate 6-bit chunks of interest
5753       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5754 
5755       // Load constants for encoding
5756       __ movl(rax, 0x19191919);
5757       __ evpbroadcastd(xmm3, rax, Assembler::AVX_256bit);
5758       __ movl(rax, 0x33333333);
5759       __ evpbroadcastd(xmm4, rax, Assembler::AVX_256bit);
5760 
5761       // Shift output bytes 0 and 2 into proper lanes
5762       __ vpmulhuw(xmm2, xmm0, xmm7, Assembler::AVX_256bit);
5763 
5764       // Mask and shift output bytes 1 and 3 into proper lanes and
5765       // combine
5766       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5767       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5768       __ vpor(xmm0, xmm0, xmm2, Assembler::AVX_256bit);
5769 
5770       // Find out which are 0..25.  This indicates which input
5771       // values fall in the range of 'A'-'Z', which require an
5772       // additional offset (see comments above)
5773       __ vpcmpgtb(xmm2, xmm0, xmm3, Assembler::AVX_256bit);
5774       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5775       __ vpsubb(xmm1, xmm1, xmm2, Assembler::AVX_256bit);
5776 
5777       // Load the proper lookup table
5778       __ lea(r11, ExternalAddress(StubRoutines::x86::base64_avx2_lut_addr()));
5779       __ movl(r15, isURL);
5780       __ shll(r15, 5);
5781       __ vmovdqu(xmm2, Address(r11, r15));
5782 
5783       // Shuffle the offsets based on the range calculation done
5784       // above. This allows us to add the correct offset to the
5785       // 6-bit value corresponding to the range documented above.
5786       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5787       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5788 
5789       // Store the encoded bytes
5790       __ vmovdqu(Address(dest, dp), xmm0);
5791       __ addl(dp, 32);
5792 
5793       __ cmpl(length, 31);
5794       __ jcc(Assembler::belowEqual, L_process3);
5795 
5796       __ align32();
5797       __ BIND(L_32byteLoop);
5798 
5799       // Get next 32 bytes
5800       __ vmovdqu(xmm1, Address(source, start_offset, Address::times_1, -4));
5801 
5802       __ subl(length, 24);
5803       __ addl(start_offset, 24);
5804 
5805       // This logic is identical to the above, with only constant
5806       // register loads removed.  Shuffle the input, mask off 6-bit
5807       // chunks, shift them into place, then add the offset to
5808       // encode.
5809       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5810 
5811       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5812       __ vpmulhuw(xmm10, xmm0, xmm7, Assembler::AVX_256bit);
5813       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5814       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5815       __ vpor(xmm0, xmm0, xmm10, Assembler::AVX_256bit);
5816       __ vpcmpgtb(xmm10, xmm0, xmm3, Assembler::AVX_256bit);
5817       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5818       __ vpsubb(xmm1, xmm1, xmm10, Assembler::AVX_256bit);
5819       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5820       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5821 
5822       // Store the encoded bytes
5823       __ vmovdqu(Address(dest, dp), xmm0);
5824       __ addl(dp, 32);
5825 
5826       __ cmpl(length, 31);
5827       __ jcc(Assembler::above, L_32byteLoop);
5828 
5829       __ BIND(L_process3);
5830       __ vzeroupper();
5831     } else {
5832       __ BIND(L_process3);
5833     }
5834 
5835     __ cmpl(length, 3);
5836     __ jcc(Assembler::below, L_exit);
5837 
5838     // Load the encoding table based on isURL
5839     __ lea(r11, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5840     __ movl(r15, isURL);
5841     __ shll(r15, 6);
5842     __ addptr(r11, r15);
5843 
5844     __ BIND(L_processdata);
5845 
5846     // Load 3 bytes
5847     __ load_unsigned_byte(r15, Address(source, start_offset));
5848     __ load_unsigned_byte(r10, Address(source, start_offset, Address::times_1, 1));
5849     __ load_unsigned_byte(r13, Address(source, start_offset, Address::times_1, 2));
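         // Standard Base64 split of bytes x, y, z (byte0..byte2) into four 6-bit
         // indices: x >> 2, ((x & 0x3) << 4) | (y >> 4), ((y & 0xf) << 2) | (z >> 6),
         // and z & 0x3f; the shifts and masks below compute exactly these.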
5850 
5851     // Build a 32-bit word with bytes 1, 2, 0, 1
5852     __ movl(rax, r10);
5853     __ shll(r10, 24);
5854     __ orl(rax, r10);
5855 
5856     __ subl(length, 3);
5857 
5858     __ shll(r15, 8);
5859     __ shll(r13, 16);
5860     __ orl(rax, r15);
5861 
5862     __ addl(start_offset, 3);
5863 
5864     __ orl(rax, r13);
5865     // At this point, rax contains | byte1 | byte2 | byte0 | byte1
5866     // r13 has byte2 << 16 - need low-order 6 bits to translate.
5867     // This translated byte is the fourth output byte.
5868     __ shrl(r13, 16);
5869     __ andl(r13, 0x3f);
5870 
5871     // The high-order 6 bits of r15 (byte0) are translated.
5872     // The translated byte is the first output byte.
5873     __ shrl(r15, 10);
5874 
5875     __ load_unsigned_byte(r13, Address(r11, r13));
5876     __ load_unsigned_byte(r15, Address(r11, r15));
5877 
5878     __ movb(Address(dest, dp, Address::times_1, 3), r13);
5879 
5880     // Extract high-order 4 bits of byte1 and low-order 2 bits of byte0.
5881     // This translated byte is the second output byte.
5882     __ shrl(rax, 4);
5883     __ movl(r10, rax);
5884     __ andl(rax, 0x3f);
5885 
5886     __ movb(Address(dest, dp, Address::times_1, 0), r15);
5887 
5888     __ load_unsigned_byte(rax, Address(r11, rax));
5889 
5890     // Extract low-order 4 bits of byte1 and high-order 2 bits of byte2.
5891     // This translated byte is the third output byte.
5892     __ shrl(r10, 18);
5893     __ andl(r10, 0x3f);
5894 
5895     __ load_unsigned_byte(r10, Address(r11, r10));
5896 
5897     __ movb(Address(dest, dp, Address::times_1, 1), rax);
5898     __ movb(Address(dest, dp, Address::times_1, 2), r10);
5899 
5900     __ addl(dp, 4);
5901     __ cmpl(length, 3);
5902     __ jcc(Assembler::aboveEqual, L_processdata);
5903 
5904     __ BIND(L_exit);
5905     __ pop(r15);
5906     __ pop(r14);
5907     __ pop(r13);
5908     __ pop(r12);
5909     __ leave();
5910     __ ret(0);
5911     return start;
5912   }
5913 
5914   // base64 AVX512vbmi tables
5915   address base64_vbmi_lookup_lo_addr() {
5916     __ align64();
5917     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64");
5918     address start = __ pc();
5919     assert(((unsigned long long)start & 0x3f) == 0,
5920            "Alignment problem (0x%08llx)", (unsigned long long)start);
5921     __ emit_data64(0x8080808080808080, relocInfo::none);
5922     __ emit_data64(0x8080808080808080, relocInfo::none);
5923     __ emit_data64(0x8080808080808080, relocInfo::none);
5924     __ emit_data64(0x8080808080808080, relocInfo::none);
5925     __ emit_data64(0x8080808080808080, relocInfo::none);
5926     __ emit_data64(0x3f8080803e808080, relocInfo::none);
5927     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5928     __ emit_data64(0x8080808080803d3c, relocInfo::none);
5929     return start;
5930   }
5931 
5932   address base64_vbmi_lookup_hi_addr() {
5933     __ align64();
5934     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64");
5935     address start = __ pc();
5936     assert(((unsigned long long)start & 0x3f) == 0,
5937            "Alignment problem (0x%08llx)", (unsigned long long)start);
5938     __ emit_data64(0x0605040302010080, relocInfo::none);
5939     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5940     __ emit_data64(0x161514131211100f, relocInfo::none);
5941     __ emit_data64(0x8080808080191817, relocInfo::none);
5942     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5943     __ emit_data64(0x2827262524232221, relocInfo::none);
5944     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5945     __ emit_data64(0x8080808080333231, relocInfo::none);
5946     return start;
5947   }
5948   address base64_vbmi_lookup_lo_url_addr() {
5949     __ align64();
5950     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64url");
5951     address start = __ pc();
5952     assert(((unsigned long long)start & 0x3f) == 0,
5953            "Alignment problem (0x%08llx)", (unsigned long long)start);
5954     __ emit_data64(0x8080808080808080, relocInfo::none);
5955     __ emit_data64(0x8080808080808080, relocInfo::none);
5956     __ emit_data64(0x8080808080808080, relocInfo::none);
5957     __ emit_data64(0x8080808080808080, relocInfo::none);
5958     __ emit_data64(0x8080808080808080, relocInfo::none);
5959     __ emit_data64(0x80803e8080808080, relocInfo::none);
5960     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5961     __ emit_data64(0x8080808080803d3c, relocInfo::none);
5962     return start;
5963   }
5964 
5965   address base64_vbmi_lookup_hi_url_addr() {
5966     __ align64();
5967     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64url");
5968     address start = __ pc();
5969     assert(((unsigned long long)start & 0x3f) == 0,
5970            "Alignment problem (0x%08llx)", (unsigned long long)start);
5971     __ emit_data64(0x0605040302010080, relocInfo::none);
5972     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5973     __ emit_data64(0x161514131211100f, relocInfo::none);
5974     __ emit_data64(0x3f80808080191817, relocInfo::none);
5975     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5976     __ emit_data64(0x2827262524232221, relocInfo::none);
5977     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5978     __ emit_data64(0x8080808080333231, relocInfo::none);
5979     return start;
5980   }
5981 
5982   address base64_vbmi_pack_vec_addr() {
5983     __ align64();
5984     StubCodeMark mark(this, "StubRoutines", "pack_vec_base64");
5985     address start = __ pc();
5986     assert(((unsigned long long)start & 0x3f) == 0,
5987            "Alignment problem (0x%08llx)", (unsigned long long)start);
5988     __ emit_data64(0x090a040506000102, relocInfo::none);
5989     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
5990     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5991     __ emit_data64(0x292a242526202122, relocInfo::none);
5992     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5993     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5994     __ emit_data64(0x0000000000000000, relocInfo::none);
5995     __ emit_data64(0x0000000000000000, relocInfo::none);
5996     return start;
5997   }
5998 
5999   address base64_vbmi_join_0_1_addr() {
6000     __ align64();
6001     StubCodeMark mark(this, "StubRoutines", "join_0_1_base64");
6002     address start = __ pc();
6003     assert(((unsigned long long)start & 0x3f) == 0,
6004            "Alignment problem (0x%08llx)", (unsigned long long)start);
6005     __ emit_data64(0x090a040506000102, relocInfo::none);
6006     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
6007     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
6008     __ emit_data64(0x292a242526202122, relocInfo::none);
6009     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
6010     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
6011     __ emit_data64(0x494a444546404142, relocInfo::none);
6012     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
6013     return start;
6014   }
6015 
6016   address base64_vbmi_join_1_2_addr() {
6017     __ align64();
6018     StubCodeMark mark(this, "StubRoutines", "join_1_2_base64");
6019     address start = __ pc();
6020     assert(((unsigned long long)start & 0x3f) == 0,
6021            "Alignment problem (0x%08llx)", (unsigned long long)start);
6022     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
6023     __ emit_data64(0x292a242526202122, relocInfo::none);
6024     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
6025     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
6026     __ emit_data64(0x494a444546404142, relocInfo::none);
6027     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
6028     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
6029     __ emit_data64(0x696a646566606162, relocInfo::none);
6030     return start;
6031   }
6032 
6033   address base64_vbmi_join_2_3_addr() {
6034     __ align64();
6035     StubCodeMark mark(this, "StubRoutines", "join_2_3_base64");
6036     address start = __ pc();
6037     assert(((unsigned long long)start & 0x3f) == 0,
6038            "Alignment problem (0x%08llx)", (unsigned long long)start);
6039     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
6040     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
6041     __ emit_data64(0x494a444546404142, relocInfo::none);
6042     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
6043     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
6044     __ emit_data64(0x696a646566606162, relocInfo::none);
6045     __ emit_data64(0x767071726c6d6e68, relocInfo::none);
6046     __ emit_data64(0x7c7d7e78797a7475, relocInfo::none);
6047     return start;
6048   }
6049 
6050   address base64_decoding_table_addr() {
6051     StubCodeMark mark(this, "StubRoutines", "decoding_table_base64");
6052     address start = __ pc();
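         // Two 256-byte tables, standard Base64 followed by base64url.  0xff marks
         // characters outside the alphabet; it is loaded sign-extended, so invalid
         // input shows up as a negative value in the scalar loop below.  Callers
         // select a table by adding isURL << 8.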
6053     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6054     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6055     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6056     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6057     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6058     __ emit_data64(0x3fffffff3effffff, relocInfo::none);
6059     __ emit_data64(0x3b3a393837363534, relocInfo::none);
6060     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
6061     __ emit_data64(0x06050403020100ff, relocInfo::none);
6062     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6063     __ emit_data64(0x161514131211100f, relocInfo::none);
6064     __ emit_data64(0xffffffffff191817, relocInfo::none);
6065     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
6066     __ emit_data64(0x2827262524232221, relocInfo::none);
6067     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6068     __ emit_data64(0xffffffffff333231, relocInfo::none);
6069     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6070     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6071     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6072     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6073     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6074     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6075     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6076     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6077     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6078     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6079     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6080     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6081     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6082     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6083     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6084     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6085 
6086     // URL table
6087     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6088     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6089     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6090     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6091     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6092     __ emit_data64(0xffff3effffffffff, relocInfo::none);
6093     __ emit_data64(0x3b3a393837363534, relocInfo::none);
6094     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
6095     __ emit_data64(0x06050403020100ff, relocInfo::none);
6096     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6097     __ emit_data64(0x161514131211100f, relocInfo::none);
6098     __ emit_data64(0x3fffffffff191817, relocInfo::none);
6099     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
6100     __ emit_data64(0x2827262524232221, relocInfo::none);
6101     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6102     __ emit_data64(0xffffffffff333231, relocInfo::none);
6103     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6104     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6105     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6106     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6107     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6108     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6109     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6110     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6111     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6112     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6113     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6114     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6115     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6116     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6117     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6118     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6119     return start;
6120   }
6121 
6122 
6123   // Code for generating Base64 decoding.
6124   //
6125   // Based on the article (and associated code) from https://arxiv.org/abs/1910.05109.
6126   //
6127   // Intrinsic function prototype in Base64.java:
6128   // private void decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME) {
6129   address generate_base64_decodeBlock() {
6130     __ align(CodeEntryAlignment);
6131     StubCodeMark mark(this, "StubRoutines", "implDecode");
6132     address start = __ pc();
6133     __ enter();
6134 
6135     // Save callee-saved registers before using them
6136     __ push(r12);
6137     __ push(r13);
6138     __ push(r14);
6139     __ push(r15);
6140     __ push(rbx);
6141 
6142     // arguments
6143     const Register source = c_rarg0; // Source Array
6144     const Register start_offset = c_rarg1; // start offset
6145     const Register end_offset = c_rarg2; // end offset
6146     const Register dest = c_rarg3; // destination array
6147     const Register isMIME = rbx;
6148 
6149 #ifndef _WIN64
6150     const Register dp = c_rarg4;  // Position for writing to dest array
6151     const Register isURL = c_rarg5; // Base64 or URL character set
6152     __ movl(isMIME, Address(rbp, 2 * wordSize));
6153 #else
6154     const Address  dp_mem(rbp, 6 * wordSize);  // dp is passed on the stack on Win64
6155     const Address isURL_mem(rbp, 7 * wordSize);
6156     const Register isURL = r10;      // pick the volatile windows register
6157     const Register dp = r12;
6158     __ movl(dp, dp_mem);
6159     __ movl(isURL, isURL_mem);
6160     __ movl(isMIME, Address(rbp, 8 * wordSize));
6161 #endif
6162 
6163     const XMMRegister lookup_lo = xmm5;
6164     const XMMRegister lookup_hi = xmm6;
6165     const XMMRegister errorvec = xmm7;
6166     const XMMRegister pack16_op = xmm9;
6167     const XMMRegister pack32_op = xmm8;
6168     const XMMRegister input0 = xmm3;
6169     const XMMRegister input1 = xmm20;
6170     const XMMRegister input2 = xmm21;
6171     const XMMRegister input3 = xmm19;
6172     const XMMRegister join01 = xmm12;
6173     const XMMRegister join12 = xmm11;
6174     const XMMRegister join23 = xmm10;
6175     const XMMRegister translated0 = xmm2;
6176     const XMMRegister translated1 = xmm1;
6177     const XMMRegister translated2 = xmm0;
6178     const XMMRegister translated3 = xmm4;
6179 
6180     const XMMRegister merged0 = xmm2;
6181     const XMMRegister merged1 = xmm1;
6182     const XMMRegister merged2 = xmm0;
6183     const XMMRegister merged3 = xmm4;
6184     const XMMRegister merge_ab_bc0 = xmm2;
6185     const XMMRegister merge_ab_bc1 = xmm1;
6186     const XMMRegister merge_ab_bc2 = xmm0;
6187     const XMMRegister merge_ab_bc3 = xmm4;
6188 
6189     const XMMRegister pack24bits = xmm4;
6190 
6191     const Register length = r14;
6192     const Register output_size = r13;
6193     const Register output_mask = r15;
6194     const KRegister input_mask = k1;
6195 
6196     const XMMRegister input_initial_valid_b64 = xmm0;
6197     const XMMRegister tmp = xmm10;
6198     const XMMRegister mask = xmm0;
6199     const XMMRegister invalid_b64 = xmm1;
6200 
6201     Label L_process256, L_process64, L_process64Loop, L_exit, L_processdata, L_loadURL;
6202     Label L_continue, L_finalBit, L_padding, L_donePadding, L_bruteForce;
6203     Label L_forceLoop, L_bottomLoop, L_checkMIME, L_exit_no_vzero;
6204 
6205     // calculate length from offsets
6206     __ movl(length, end_offset);
6207     __ subl(length, start_offset);
6208     __ push(dest);          // Save for return value calc
6209 
6210     // If AVX512 VBMI is not supported, just generate the non-AVX code below
6211     if (VM_Version::supports_avx512_vbmi() &&
6212        VM_Version::supports_avx512bw()) {
6213       __ cmpl(length, 128);     // 128 bytes is break-even for AVX-512
6214       __ jcc(Assembler::lessEqual, L_bruteForce);
6215 
6216       __ cmpl(isMIME, 0);
6217       __ jcc(Assembler::notEqual, L_bruteForce);
6218 
6219       // Load lookup tables based on isURL
6220       __ cmpl(isURL, 0);
6221       __ jcc(Assembler::notZero, L_loadURL);
6222 
6223       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_addr()), Assembler::AVX_512bit, r13);
6224       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_addr()), Assembler::AVX_512bit, r13);
6225 
6226       __ BIND(L_continue);
6227 
6228       __ movl(r15, 0x01400140);
6229       __ evpbroadcastd(pack16_op, r15, Assembler::AVX_512bit);
6230 
6231       __ movl(r15, 0x00011000);
6232       __ evpbroadcastd(pack32_op, r15, Assembler::AVX_512bit);
6233 
6234       __ cmpl(length, 0xff);
6235       __ jcc(Assembler::lessEqual, L_process64);
6236 
6237       // load masks required for decoding data
6238       __ BIND(L_processdata);
6239       __ evmovdquq(join01, ExternalAddress(StubRoutines::x86::base64_vbmi_join_0_1_addr()), Assembler::AVX_512bit,r13);
6240       __ evmovdquq(join12, ExternalAddress(StubRoutines::x86::base64_vbmi_join_1_2_addr()), Assembler::AVX_512bit, r13);
6241       __ evmovdquq(join23, ExternalAddress(StubRoutines::x86::base64_vbmi_join_2_3_addr()), Assembler::AVX_512bit, r13);
6242 
6243       __ align32();
6244       __ BIND(L_process256);
6245       // Grab input data
6246       __ evmovdquq(input0, Address(source, start_offset, Address::times_1, 0x00), Assembler::AVX_512bit);
6247       __ evmovdquq(input1, Address(source, start_offset, Address::times_1, 0x40), Assembler::AVX_512bit);
6248       __ evmovdquq(input2, Address(source, start_offset, Address::times_1, 0x80), Assembler::AVX_512bit);
6249       __ evmovdquq(input3, Address(source, start_offset, Address::times_1, 0xc0), Assembler::AVX_512bit);
6250 
6251       // Copy the low part of the lookup table into the destination of the permutation
6252       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6253       __ evmovdquq(translated1, lookup_lo, Assembler::AVX_512bit);
6254       __ evmovdquq(translated2, lookup_lo, Assembler::AVX_512bit);
6255       __ evmovdquq(translated3, lookup_lo, Assembler::AVX_512bit);
6256 
6257       // Translate the base64 input into "decoded" bytes
6258       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6259       __ evpermt2b(translated1, input1, lookup_hi, Assembler::AVX_512bit);
6260       __ evpermt2b(translated2, input2, lookup_hi, Assembler::AVX_512bit);
6261       __ evpermt2b(translated3, input3, lookup_hi, Assembler::AVX_512bit);
6262 
6263       // OR all of the translations together to check for errors (high-order bit of byte set)
6264       __ vpternlogd(input0, 0xfe, input1, input2, Assembler::AVX_512bit);
6265 
6266       __ vpternlogd(input3, 0xfe, translated0, translated1, Assembler::AVX_512bit);
6267       __ vpternlogd(input0, 0xfe, translated2, translated3, Assembler::AVX_512bit);
6268       __ vpor(errorvec, input3, input0, Assembler::AVX_512bit);
6269 
6270       // Check if there was an error - if so, try 64-byte chunks
6271       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6272       __ kortestql(k3, k3);
6273       __ jcc(Assembler::notZero, L_process64);
6274 
6275       // The merging and shuffling happen here.
6276       // Each dword holds four decoded 6-bit values [00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa]
6277       // Multiply [00cccccc] by 2^6 and add [00dddddd] to get [0000cccc | ccdddddd]
6278       // The pack16_op is a vector of 0x01400140, so multiply D by 1 and C by 0x40
6279       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6280       __ vpmaddubsw(merge_ab_bc1, translated1, pack16_op, Assembler::AVX_512bit);
6281       __ vpmaddubsw(merge_ab_bc2, translated2, pack16_op, Assembler::AVX_512bit);
6282       __ vpmaddubsw(merge_ab_bc3, translated3, pack16_op, Assembler::AVX_512bit);
6283 
6284       // Now do the same with packed 16-bit values.
6285       // We start with [0000cccc | ccdddddd | 0000aaaa | aabbbbbb]
6286       // pack32_op is 0x00011000 (2^12, 1), so this multiplies [0000aaaa | aabbbbbb] by 2^12
6287       // and adds [0000cccc | ccdddddd] to yield [00000000 | aaaaaabb | bbbbcccc | ccdddddd]
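           // Net effect for four decoded values a, b, c, d (in source order):
           // (((a << 6) | b) << 12) | ((c << 6) | d)
           //     = (a << 18) | (b << 12) | (c << 6) | d, the original 24 bits.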
6288       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6289       __ vpmaddwd(merged1, merge_ab_bc1, pack32_op, Assembler::AVX_512bit);
6290       __ vpmaddwd(merged2, merge_ab_bc2, pack32_op, Assembler::AVX_512bit);
6291       __ vpmaddwd(merged3, merge_ab_bc3, pack32_op, Assembler::AVX_512bit);
6292 
6293       // The join vectors specify which byte from which vector goes into the outputs
6294       // One of every 4 bytes in the extended vector is zero, so we pack them into their
6295       // final positions in the register for storing (256 bytes in, 192 bytes out)
6296       __ evpermt2b(merged0, join01, merged1, Assembler::AVX_512bit);
6297       __ evpermt2b(merged1, join12, merged2, Assembler::AVX_512bit);
6298       __ evpermt2b(merged2, join23, merged3, Assembler::AVX_512bit);
6299 
6300       // Store result
6301       __ evmovdquq(Address(dest, dp, Address::times_1, 0x00), merged0, Assembler::AVX_512bit);
6302       __ evmovdquq(Address(dest, dp, Address::times_1, 0x40), merged1, Assembler::AVX_512bit);
6303       __ evmovdquq(Address(dest, dp, Address::times_1, 0x80), merged2, Assembler::AVX_512bit);
6304 
6305       __ addptr(source, 0x100);
6306       __ addptr(dest, 0xc0);
6307       __ subl(length, 0x100);
6308       __ cmpl(length, 64 * 4);
6309       __ jcc(Assembler::greaterEqual, L_process256);
6310 
6311       // At this point, we've decoded 64 * 4 * n bytes.
6312       // The remaining length will be <= 64 * 4 - 1.
6313       // UNLESS there was an error decoding the first 256-byte chunk.  In this
6314       // case, the length will be arbitrarily long.
6315       //
6316       // Note that this will be the path for MIME-encoded strings.
6317 
6318       __ BIND(L_process64);
6319 
6320       __ evmovdquq(pack24bits, ExternalAddress(StubRoutines::x86::base64_vbmi_pack_vec_addr()), Assembler::AVX_512bit, r13);
6321 
6322       __ cmpl(length, 63);
6323       __ jcc(Assembler::lessEqual, L_finalBit);
6324 
6325       __ mov64(rax, 0x0000ffffffffffff);
6326       __ kmovql(k2, rax);
6327 
6328       __ align32();
6329       __ BIND(L_process64Loop);
6330 
6331       // Handle first 64-byte block
6332 
6333       __ evmovdquq(input0, Address(source, start_offset), Assembler::AVX_512bit);
6334       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6335       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6336 
6337       __ vpor(errorvec, translated0, input0, Assembler::AVX_512bit);
6338 
6339       // Check for error and bomb out before updating dest
6340       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6341       __ kortestql(k3, k3);
6342       __ jcc(Assembler::notZero, L_exit);
6343 
6344       // Pack output register, selecting correct byte ordering
6345       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6346       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6347       __ vpermb(merged0, pack24bits, merged0, Assembler::AVX_512bit);
6348 
6349       __ evmovdqub(Address(dest, dp), k2, merged0, true, Assembler::AVX_512bit);
6350 
6351       __ subl(length, 64);
6352       __ addptr(source, 64);
6353       __ addptr(dest, 48);
6354 
6355       __ cmpl(length, 64);
6356       __ jcc(Assembler::greaterEqual, L_process64Loop);
6357 
6358       __ cmpl(length, 0);
6359       __ jcc(Assembler::lessEqual, L_exit);
6360 
6361       __ BIND(L_finalBit);
6362       // Now have 1 to 63 bytes left to decode
6363 
6364       // We could let Java take care of the final fragment, but it
6365       // would call back into this stub for every remaining 4 bytes of
6366       // input data, so handle the rest here.
6367       __ movq(rax, -1);
6368       __ bzhiq(rax, rax, length);    // Input mask in rax
6369 
6370       __ movl(output_size, length);
6371       __ shrl(output_size, 2);   // Find (len / 4) * 3 (output length)
6372       __ lea(output_size, Address(output_size, output_size, Address::times_2, 0));
6373       // output_size in r13
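           // (e.g. 20 remaining characters -> output_size = (20 >> 2) * 3 = 15
           // output bytes; the lea computes n + n * 2.)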
6374 
6375       // Strip pad characters, if any, and adjust length and mask
6376       __ cmpb(Address(source, length, Address::times_1, -1), '=');
6377       __ jcc(Assembler::equal, L_padding);
6378 
6379       __ BIND(L_donePadding);
6380 
6381       // input_mask (in rax) selects the valid source bytes;
           // output_mask selects the low output_size bytes of the store.
6382       __ kmovql(input_mask, rax);
6383       __ movq(output_mask, -1);
6384       __ bzhiq(output_mask, output_mask, output_size);
6385 
6386       // Load initial input with all valid base64 characters.  Will be used
6387       // in merging source bytes to avoid masking when determining if an error occurred.
6388       __ movl(rax, 0x61616161);
6389       __ evpbroadcastd(input_initial_valid_b64, rax, Assembler::AVX_512bit);
6390 
6391       // A register containing all invalid base64 decoded values
6392       __ movl(rax, 0x80808080);
6393       __ evpbroadcastd(invalid_b64, rax, Assembler::AVX_512bit);
6394 
6395       // input_mask is in k1
6396       // output_size is in r13
6397       // output_mask is in r15
6398       // zmm0 - 0x61616161 (input fill value, merged with source below)
6399       // zmm1 - 0x80808080 (invalid marker)
6400       // zmm2 - free
6401       // zmm3 - free
6402       // zmm4 - pack vector
6403       // zmm5 - lookup_lo
6404       // zmm6 - lookup_hi
6405       // zmm7 - errorvec (unused on this path)
6406       // zmm8 - 0x00011000
6407       // zmm9 - 0x01400140
6408 
6409       // Load only the bytes from source, merging into our "fully-valid" register
6410       __ evmovdqub(input_initial_valid_b64, input_mask, Address(source, start_offset, Address::times_1, 0x0), true, Assembler::AVX_512bit);
6411 
6412       // Decode all bytes within our merged input
6413       __ evmovdquq(tmp, lookup_lo, Assembler::AVX_512bit);
6414       __ evpermt2b(tmp, input_initial_valid_b64, lookup_hi, Assembler::AVX_512bit);
6415       __ vporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);
6416 
6417       // Check for error.  Compare (decoded | initial) to all invalid.
6418       // If any bytes have their high-order bit set, then we have an error.
6419       __ evptestmb(k2, mask, invalid_b64, Assembler::AVX_512bit);
6420       __ kortestql(k2, k2);
6421 
6422       // If we have an error, use the brute force loop to decode what we can (4-byte chunks).
6423       __ jcc(Assembler::notZero, L_bruteForce);
6424 
6425       // Shuffle output bytes
6426       __ vpmaddubsw(tmp, tmp, pack16_op, Assembler::AVX_512bit);
6427       __ vpmaddwd(tmp, tmp, pack32_op, Assembler::AVX_512bit);
6428 
6429       __ vpermb(tmp, pack24bits, tmp, Assembler::AVX_512bit);
6430       __ kmovql(k1, output_mask);
6431       __ evmovdqub(Address(dest, dp), k1, tmp, true, Assembler::AVX_512bit);
6432 
6433       __ addptr(dest, output_size);
6434 
6435       __ BIND(L_exit);
6436       __ vzeroupper();
6437       __ pop(rax);             // Get original dest value
6438       __ subptr(dest, rax);      // Number of bytes converted
6439       __ movptr(rax, dest);
6440       __ pop(rbx);
6441       __ pop(r15);
6442       __ pop(r14);
6443       __ pop(r13);
6444       __ pop(r12);
6445       __ leave();
6446       __ ret(0);
6447 
6448       __ BIND(L_loadURL);
6449       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_url_addr()), Assembler::AVX_512bit, r13);
6450       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_url_addr()), Assembler::AVX_512bit, r13);
6451       __ jmp(L_continue);
6452 
6453       __ BIND(L_padding);
6454       __ decrementq(output_size, 1);
6455       __ shrq(rax, 1);
6456 
6457       __ cmpb(Address(source, length, Address::times_1, -2), '=');
6458       __ jcc(Assembler::notEqual, L_donePadding);
6459 
6460       __ decrementq(output_size, 1);
6461       __ shrq(rax, 1);
6462       __ jmp(L_donePadding);
6463 
6464       __ align32();
6465       __ BIND(L_bruteForce);
6466     }   // End of if(avx512_vbmi)
6467 
6468     // Use non-AVX code to decode 4-byte chunks into 3 bytes of output
6469 
6470     // Register state (Linux):
6471     // r12-15 - saved on stack
6472     // rdi - src
6473     // rsi - sp
6474     // rdx - sl
6475     // rcx - dst
6476     // r8 - dp
6477     // r9 - isURL
6478 
6479     // Register state (Windows):
6480     // r12-15 - saved on stack
6481     // rcx - src
6482     // rdx - sp
6483     // r8 - sl
6484     // r9 - dst
6485     // r12 - dp
6486     // r10 - isURL
6487 
6488     // Registers (common):
6489     // length (r14) - bytes in src
6490 
6491     const Register decode_table = r11;
6492     const Register out_byte_count = rbx;
6493     const Register byte1 = r13;
6494     const Register byte2 = r15;
6495     const Register byte3 = WINDOWS_ONLY(r8) NOT_WINDOWS(rdx);
6496     const Register byte4 = WINDOWS_ONLY(r10) NOT_WINDOWS(r9);
6497 
6498     __ shrl(length, 2);    // Multiple of 4 bytes only - length is # 4-byte chunks
6499     __ cmpl(length, 0);
6500     __ jcc(Assembler::lessEqual, L_exit_no_vzero);
6501 
6502     __ shll(isURL, 8);    // index into decode table based on isURL
6503     __ lea(decode_table, ExternalAddress(StubRoutines::x86::base64_decoding_table_addr()));
6504     __ addptr(decode_table, isURL);
6505 
6506     __ jmp(L_bottomLoop);
6507 
6508     __ align32();
6509     __ BIND(L_forceLoop);
6510     __ shll(byte1, 18);
6511     __ shll(byte2, 12);
6512     __ shll(byte3, 6);
6513     __ orl(byte1, byte2);
6514     __ orl(byte1, byte3);
6515     __ orl(byte1, byte4);
6516 
6517     __ addptr(source, 4);
6518 
6519     __ movb(Address(dest, dp, Address::times_1, 2), byte1);
6520     __ shrl(byte1, 8);
6521     __ movb(Address(dest, dp, Address::times_1, 1), byte1);
6522     __ shrl(byte1, 8);
6523     __ movb(Address(dest, dp, Address::times_1, 0), byte1);
6524 
6525     __ addptr(dest, 3);
6526     __ decrementl(length, 1);
6527     __ jcc(Assembler::zero, L_exit_no_vzero);
6528 
6529     __ BIND(L_bottomLoop);
6530     __ load_unsigned_byte(byte1, Address(source, start_offset, Address::times_1, 0x00));
6531     __ load_unsigned_byte(byte2, Address(source, start_offset, Address::times_1, 0x01));
6532     __ load_signed_byte(byte1, Address(decode_table, byte1));
6533     __ load_signed_byte(byte2, Address(decode_table, byte2));
6534     __ load_unsigned_byte(byte3, Address(source, start_offset, Address::times_1, 0x02));
6535     __ load_unsigned_byte(byte4, Address(source, start_offset, Address::times_1, 0x03));
6536     __ load_signed_byte(byte3, Address(decode_table, byte3));
6537     __ load_signed_byte(byte4, Address(decode_table, byte4));
6538 
6539     __ mov(rax, byte1);
6540     __ orl(rax, byte2);
6541     __ orl(rax, byte3);
6542     __ orl(rax, byte4);
6543     __ jcc(Assembler::positive, L_forceLoop);
6544 
6545     __ BIND(L_exit_no_vzero);
6546     __ pop(rax);             // Get original dest value
6547     __ subptr(dest, rax);      // Number of bytes converted
6548     __ movptr(rax, dest);
6549     __ pop(rbx);
6550     __ pop(r15);
6551     __ pop(r14);
6552     __ pop(r13);
6553     __ pop(r12);
6554     __ leave();
6555     __ ret(0);
6556 
6557     return start;
6558   }
6559 
6560 
6561   /**
6562    *  Arguments:
6563    *
6564    * Inputs:
6565    *   c_rarg0   - int crc
6566    *   c_rarg1   - byte* buf
6567    *   c_rarg2   - int length
6568    *
6569    * Output:
6570    *       rax   - int crc result
6571    */
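  // For reference only (this is not the code generated below): the CRC-32 computed here is
  // the standard zlib/IEEE one with reflected polynomial 0xEDB88320.  A byte-at-a-time,
  // table-driven sketch, assuming a precomputed 256-entry table, would be:
  //   crc = ~crc;
  //   for (int i = 0; i < length; i++)
  //     crc = (crc >> 8) ^ table[(crc ^ buf[i]) & 0xFF];
  //   return ~crc;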
6572   address generate_updateBytesCRC32() {
6573     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
6574 
6575     __ align(CodeEntryAlignment);
6576     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
6577 
6578     address start = __ pc();
6579     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6580     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6581     // rscratch1: r10
6582     const Register crc   = c_rarg0;  // crc
6583     const Register buf   = c_rarg1;  // source java byte array address
6584     const Register len   = c_rarg2;  // length
6585     const Register table = c_rarg3;  // crc_table address (reuse register)
6586     const Register tmp1   = r11;
6587     const Register tmp2   = r10;
6588     assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax);
6589 
6590     BLOCK_COMMENT("Entry:");
6591     __ enter(); // required for proper stackwalking of RuntimeStub frame
6592 
6593     if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6594         VM_Version::supports_avx512bw() &&
6595         VM_Version::supports_avx512vl()) {
6596         // The constants used in the CRC32 algorithm require the one's complement of the initial crc value.
6597         // However, the constant table for CRC32-C assumes the original crc value.  Account for this
6598         // difference before calling and after returning.
6599       __ lea(table, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
6600       __ notl(crc);
6601       __ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2);
6602       __ notl(crc);
6603     } else {
6604       __ kernel_crc32(crc, buf, len, table, tmp1);
6605     }
6606 
6607     __ movl(rax, crc);
6608     __ vzeroupper();
6609     __ leave(); // required for proper stackwalking of RuntimeStub frame
6610     __ ret(0);
6611 
6612     return start;
6613   }
6614 
6615   /**
6616   *  Arguments:
6617   *
6618   * Inputs:
6619   *   c_rarg0   - int crc
6620   *   c_rarg1   - byte* buf
6621   *   c_rarg2   - long length
6622   *   c_rarg3   - table_start - optional (present only when doing a library_call,
6623   *              not used by x86 algorithm)
6624   *
6625   * Output:
6626   *       rax   - int crc result
6627   */
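  // Note: CRC32-C differs from updateBytesCRC32 above only in the polynomial - it uses the
  // Castagnoli polynomial 0x1EDC6F41 (reflected form 0x82F63B78); the byte-at-a-time
  // reference formulation is otherwise the same as sketched there.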
6628   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
6629       assert(UseCRC32CIntrinsics, "need SSE4_2");
6630       __ align(CodeEntryAlignment);
6631       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
6632       address start = __ pc();
6633       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
6634       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
6635       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
6636       const Register crc = c_rarg0;  // crc
6637       const Register buf = c_rarg1;  // source java byte array address
6638       const Register len = c_rarg2;  // length
6639       const Register a = rax;
6640       const Register j = r9;
6641       const Register k = r10;
6642       const Register l = r11;
6643 #ifdef _WIN64
6644       const Register y = rdi;
6645       const Register z = rsi;
6646 #else
6647       const Register y = rcx;
6648       const Register z = r8;
6649 #endif
6650       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
6651 
6652       BLOCK_COMMENT("Entry:");
6653       __ enter(); // required for proper stackwalking of RuntimeStub frame
6654       if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6655           VM_Version::supports_avx512bw() &&
6656           VM_Version::supports_avx512vl()) {
6657         __ lea(j, ExternalAddress(StubRoutines::x86::crc32c_table_avx512_addr()));
6658         __ kernel_crc32_avx512(crc, buf, len, j, l, k);
6659       } else {
6660 #ifdef _WIN64
6661         __ push(y);
6662         __ push(z);
6663 #endif
6664         __ crc32c_ipl_alg2_alt2(crc, buf, len,
6665                                 a, j, k,
6666                                 l, y, z,
6667                                 c_farg0, c_farg1, c_farg2,
6668                                 is_pclmulqdq_supported);
6669 #ifdef _WIN64
6670         __ pop(z);
6671         __ pop(y);
6672 #endif
6673       }
6674       __ movl(rax, crc);
6675       __ vzeroupper();
6676       __ leave(); // required for proper stackwalking of RuntimeStub frame
6677       __ ret(0);
6678 
6679       return start;
6680   }
6681 
6682 
6683   /***
6684    *  Arguments:
6685    *
6686    *  Inputs:
6687    *   c_rarg0   - int   adler
6688    *   c_rarg1   - byte* buff
6689    *   c_rarg2   - int   len
6690    *
6691    * Output:
6692    *   rax   - int adler result
6693    */
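  // For reference: Adler-32 keeps two running sums modulo 65521 - s1 = 1 + sum of all bytes
  // (carried in the low 16 bits of the adler argument) and s2 = sum of every intermediate s1
  // (carried in the high 16 bits) - and the result returned in rax is (s2 << 16) | s1.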
6694 
6695   address generate_updateBytesAdler32() {
6696       assert(UseAdler32Intrinsics, "need AVX2");
6697 
6698       __ align(CodeEntryAlignment);
6699       StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
6700 
6701       address start = __ pc();
6702 
6703       const Register data = r9;
6704       const Register size = r10;
6705 
6706       const XMMRegister yshuf0 = xmm6;
6707       const XMMRegister yshuf1 = xmm7;
6708       assert_different_registers(c_rarg0, c_rarg1, c_rarg2, data, size);
6709 
6710       BLOCK_COMMENT("Entry:");
6711       __ enter(); // required for proper stackwalking of RuntimeStub frame
6712 
6713       __ vmovdqu(yshuf0, ExternalAddress((address) StubRoutines::x86::_adler32_shuf0_table), r9);
6714       __ vmovdqu(yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_shuf1_table), r9);
6715       __ movptr(data, c_rarg1); //data
6716       __ movl(size, c_rarg2); //length
6717       __ updateBytesAdler32(c_rarg0, data, size, yshuf0, yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_ascale_table));
6718       __ leave();
6719       __ ret(0);
6720       return start;
6721   }
6722 
6723   /**
6724    *  Arguments:
6725    *
6726    *  Input:
6727    *    c_rarg0   - x address
6728    *    c_rarg1   - x length
6729    *    c_rarg2   - y address
6730    *    c_rarg3   - y length
6731    * not Win64
6732    *    c_rarg4   - z address
6733    *    c_rarg5   - z length
6734    * Win64
6735    *    rsp+40    - z address
6736    *    rsp+48    - z length
6737    */
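  // Conceptually this computes the full product of the two magnitudes,
  // z[0..zlen-1] = x[0..xlen-1] * y[0..ylen-1] with zlen == xlen + ylen
  // (the BigInteger.multiplyToLen contract); the actual work is done in
  // MacroAssembler::multiply_to_len().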
6738   address generate_multiplyToLen() {
6739     __ align(CodeEntryAlignment);
6740     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
6741 
6742     address start = __ pc();
6743     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6744     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6745     const Register x     = rdi;
6746     const Register xlen  = rax;
6747     const Register y     = rsi;
6748     const Register ylen  = rcx;
6749     const Register z     = r8;
6750     const Register zlen  = r11;
6751 
6752     // Next registers will be saved on stack in multiply_to_len().
6753     const Register tmp1  = r12;
6754     const Register tmp2  = r13;
6755     const Register tmp3  = r14;
6756     const Register tmp4  = r15;
6757     const Register tmp5  = rbx;
6758 
6759     BLOCK_COMMENT("Entry:");
6760     __ enter(); // required for proper stackwalking of RuntimeStub frame
6761 
6762 #ifndef _WIN64
6763     __ movptr(zlen, r9); // Save r9 in r11 - zlen
6764 #endif
6765     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
6766                        // ylen => rcx, z => r8, zlen => r11
6767                        // r9 and r10 may be used to save non-volatile registers
6768 #ifdef _WIN64
6769     // last 2 arguments (#4, #5) are on stack on Win64
6770     __ movptr(z, Address(rsp, 6 * wordSize));
6771     __ movptr(zlen, Address(rsp, 7 * wordSize));
6772 #endif
6773 
6774     __ movptr(xlen, rsi);
6775     __ movptr(y,    rdx);
6776     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
6777 
6778     restore_arg_regs();
6779 
6780     __ leave(); // required for proper stackwalking of RuntimeStub frame
6781     __ ret(0);
6782 
6783     return start;
6784   }
6785 
6786   /**
6787   *  Arguments:
6788   *
6789   *  Input:
6790   *    c_rarg0   - obja     address
6791   *    c_rarg1   - objb     address
6792   *    c_rarg2   - length   length
6793   *    c_rarg3   - scale    log2_array_indexscale
6794   *
6795   *  Output:
6796   *        rax   - int; >= 0: index of first mismatch, < 0: bitwise complement of tail
6797   */
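  // The return convention mirrors jdk.internal.util.ArraysSupport.vectorizedMismatch:
  // a non-negative result is the index of the first mismatching element, while a negative
  // result is the bitwise complement of the number of tail elements left for the caller to check.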
6798   address generate_vectorizedMismatch() {
6799     __ align(CodeEntryAlignment);
6800     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
6801     address start = __ pc();
6802 
6803     BLOCK_COMMENT("Entry:");
6804     __ enter();
6805 
6806 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6807     const Register scale = c_rarg0;  //rcx, will exchange with r9
6808     const Register objb = c_rarg1;   //rdx
6809     const Register length = c_rarg2; //r8
6810     const Register obja = c_rarg3;   //r9
6811     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
6812 
6813     const Register tmp1 = r10;
6814     const Register tmp2 = r11;
6815 #endif
6816 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6817     const Register obja = c_rarg0;   //U:rdi
6818     const Register objb = c_rarg1;   //U:rsi
6819     const Register length = c_rarg2; //U:rdx
6820     const Register scale = c_rarg3;  //U:rcx
6821     const Register tmp1 = r8;
6822     const Register tmp2 = r9;
6823 #endif
6824     const Register result = rax; //return value
6825     const XMMRegister vec0 = xmm0;
6826     const XMMRegister vec1 = xmm1;
6827     const XMMRegister vec2 = xmm2;
6828 
6829     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
6830 
6831     __ vzeroupper();
6832     __ leave();
6833     __ ret(0);
6834 
6835     return start;
6836   }
6837 
6838   /**
6839    *  Arguments:
6840    *
6841    *  Input:
6842    *    c_rarg0   - x address
6843    *    c_rarg1   - x length
6844    *    c_rarg2   - z address
6845    *    c_rarg3   - z length
6846    *
6847    */
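  // Conceptually z = x * x; the caller is expected to pass zlen == 2 * (x length),
  // per the BigInteger.squareToLen contract.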
6848   address generate_squareToLen() {
6849 
6850     __ align(CodeEntryAlignment);
6851     StubCodeMark mark(this, "StubRoutines", "squareToLen");
6852 
6853     address start = __ pc();
6854     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6855     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
6856     const Register x      = rdi;
6857     const Register len    = rsi;
6858     const Register z      = r8;
6859     const Register zlen   = rcx;
6860 
6861     const Register tmp1      = r12;
6862     const Register tmp2      = r13;
6863     const Register tmp3      = r14;
6864     const Register tmp4      = r15;
6865     const Register tmp5      = rbx;
6866 
6867     BLOCK_COMMENT("Entry:");
6868     __ enter(); // required for proper stackwalking of RuntimeStub frame
6869 
6870     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
6871                        // zlen => rcx
6872                        // r9 and r10 may be used to save non-volatile registers
6873     __ movptr(r8, rdx);
6874     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
6875 
6876     restore_arg_regs();
6877 
6878     __ leave(); // required for proper stackwalking of RuntimeStub frame
6879     __ ret(0);
6880 
6881     return start;
6882   }
6883 
6884   address generate_method_entry_barrier() {
6885     __ align(CodeEntryAlignment);
6886     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
6887 
6888     Label deoptimize_label;
6889 
6890     address start = __ pc();
6891 
6892     __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
6893 
6894     BLOCK_COMMENT("Entry:");
6895     __ enter(); // save rbp
6896 
6897     // save c_rarg0, because we want to use that value.
6898     // We could do without it but then we depend on the number of slots used by pusha
6899     __ push(c_rarg0);
6900 
6901     __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
6902 
6903     __ pusha();
6904 
6905     // The method may have floats as arguments, and we must spill them before calling
6906     // the VM runtime.
6907     assert(Argument::n_float_register_parameters_j == 8, "Assumption");
6908     const int xmm_size = wordSize * 2;
6909     const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
6910     __ subptr(rsp, xmm_spill_size);
6911     __ movdqu(Address(rsp, xmm_size * 7), xmm7);
6912     __ movdqu(Address(rsp, xmm_size * 6), xmm6);
6913     __ movdqu(Address(rsp, xmm_size * 5), xmm5);
6914     __ movdqu(Address(rsp, xmm_size * 4), xmm4);
6915     __ movdqu(Address(rsp, xmm_size * 3), xmm3);
6916     __ movdqu(Address(rsp, xmm_size * 2), xmm2);
6917     __ movdqu(Address(rsp, xmm_size * 1), xmm1);
6918     __ movdqu(Address(rsp, xmm_size * 0), xmm0);
6919 
6920     __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);
6921 
6922     __ movdqu(xmm0, Address(rsp, xmm_size * 0));
6923     __ movdqu(xmm1, Address(rsp, xmm_size * 1));
6924     __ movdqu(xmm2, Address(rsp, xmm_size * 2));
6925     __ movdqu(xmm3, Address(rsp, xmm_size * 3));
6926     __ movdqu(xmm4, Address(rsp, xmm_size * 4));
6927     __ movdqu(xmm5, Address(rsp, xmm_size * 5));
6928     __ movdqu(xmm6, Address(rsp, xmm_size * 6));
6929     __ movdqu(xmm7, Address(rsp, xmm_size * 7));
6930     __ addptr(rsp, xmm_spill_size);
6931 
6932     __ cmpl(rax, 1); // 1 means deoptimize
6933     __ jcc(Assembler::equal, deoptimize_label);
6934 
6935     __ popa();
6936     __ pop(c_rarg0);
6937 
6938     __ leave();
6939 
6940     __ addptr(rsp, 1 * wordSize); // cookie
6941     __ ret(0);
6942 
6943 
6944     __ BIND(deoptimize_label);
6945 
6946     __ popa();
6947     __ pop(c_rarg0);
6948 
6949     __ leave();
6950 
6951     // This can be taken out, but is good for verification purposes. Getting a SIGSEGV
6952     // here while still having a correct stack is valuable.
6953     __ testptr(rsp, Address(rsp, 0));
6954 
6955     __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
6956     __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point
6957 
6958     return start;
6959   }
6960 
6961    /**
6962    *  Arguments:
6963    *
6964    *  Input:
6965    *    c_rarg0   - out address
6966    *    c_rarg1   - in address
6967    *    c_rarg2   - offset
6968    *    c_rarg3   - len
6969    * not Win64
6970    *    c_rarg4   - k
6971    * Win64
6972    *    rsp+40    - k
6973    */
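  // Conceptually this is BigInteger.implMulAdd: multiply the int-array magnitude 'in' by the
  // unsigned 32-bit value k, add the product into 'out' at the given offset, and return the
  // final carry (in rax).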
6974   address generate_mulAdd() {
6975     __ align(CodeEntryAlignment);
6976     StubCodeMark mark(this, "StubRoutines", "mulAdd");
6977 
6978     address start = __ pc();
6979     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6980     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6981     const Register out     = rdi;
6982     const Register in      = rsi;
6983     const Register offset  = r11;
6984     const Register len     = rcx;
6985     const Register k       = r8;
6986 
6987     // Next registers will be saved on stack in mul_add().
6988     const Register tmp1  = r12;
6989     const Register tmp2  = r13;
6990     const Register tmp3  = r14;
6991     const Register tmp4  = r15;
6992     const Register tmp5  = rbx;
6993 
6994     BLOCK_COMMENT("Entry:");
6995     __ enter(); // required for proper stackwalking of RuntimeStub frame
6996 
6997     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
6998                        // len => rcx, k => r8
6999                        // r9 and r10 may be used to save non-volatile registers
7000 #ifdef _WIN64
7001     // last argument is on stack on Win64
7002     __ movl(k, Address(rsp, 6 * wordSize));
7003 #endif
7004     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
7005     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
7006 
7007     restore_arg_regs();
7008 
7009     __ leave(); // required for proper stackwalking of RuntimeStub frame
7010     __ ret(0);
7011 
7012     return start;
7013   }
7014 
7015   address generate_bigIntegerRightShift() {
7016     __ align(CodeEntryAlignment);
7017     StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
7018 
7019     address start = __ pc();
7020     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
7021     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
7022     const Register newArr = rdi;
7023     const Register oldArr = rsi;
7024     const Register newIdx = rdx;
7025     const Register shiftCount = rcx;  // It was intentional to have shiftCount in rcx since it is used implicitly for shift.
7026     const Register totalNumIter = r8;
7027 
7028     // For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
7029     // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
7030     const Register tmp1 = r11;                    // Caller save.
7031     const Register tmp2 = rax;                    // Caller save.
7032     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
7033     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
7034     const Register tmp5 = r14;                    // Callee save.
7035     const Register tmp6 = r15;
7036 
7037     const XMMRegister x0 = xmm0;
7038     const XMMRegister x1 = xmm1;
7039     const XMMRegister x2 = xmm2;
7040 
7041     BLOCK_COMMENT("Entry:");
7042     __ enter(); // required for proper stackwalking of RuntimeStub frame
7043 
7044 #ifdef _WINDOWS
7045     setup_arg_regs(4);
7046     // For windows, since last argument is on stack, we need to move it to the appropriate register.
7047     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
7048     // Save callee save registers.
7049     __ push(tmp3);
7050     __ push(tmp4);
7051 #endif
7052     __ push(tmp5);
7053 
7054     // Rename temps used throughout the code.
7055     const Register idx = tmp1;
7056     const Register nIdx = tmp2;
7057 
7058     __ xorl(idx, idx);
7059 
7060     // Start right shift from end of the array.
7061     // For example, if #iteration = 4 and newIdx = 1
7062     // then dest[4] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
7063     // if #iteration = 4 and newIdx = 0
7064     // then dest[3] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
7065     __ movl(idx, totalNumIter);
7066     __ movl(nIdx, idx);
7067     __ addl(nIdx, newIdx);
7068 
7069     // If vectorization is enabled, check if the number of iterations is at least 64
7070     // If not, then go to ShiftTwo, processing 2 iterations at a time
7071     if (VM_Version::supports_avx512_vbmi2()) {
7072       __ cmpptr(totalNumIter, (AVX3Threshold/64));
7073       __ jcc(Assembler::less, ShiftTwo);
7074 
7075       if (AVX3Threshold < 16 * 64) {
7076         __ cmpl(totalNumIter, 16);
7077         __ jcc(Assembler::less, ShiftTwo);
7078       }
7079       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
7080       __ subl(idx, 16);
7081       __ subl(nIdx, 16);
7082       __ BIND(Shift512Loop);
7083       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 4), Assembler::AVX_512bit);
7084       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7085       __ vpshrdvd(x2, x1, x0, Assembler::AVX_512bit);
7086       __ evmovdqul(Address(newArr, nIdx, Address::times_4), x2, Assembler::AVX_512bit);
7087       __ subl(nIdx, 16);
7088       __ subl(idx, 16);
7089       __ jcc(Assembler::greaterEqual, Shift512Loop);
7090       __ addl(idx, 16);
7091       __ addl(nIdx, 16);
7092     }
7093     __ BIND(ShiftTwo);
7094     __ cmpl(idx, 2);
7095     __ jcc(Assembler::less, ShiftOne);
7096     __ subl(idx, 2);
7097     __ subl(nIdx, 2);
7098     __ BIND(ShiftTwoLoop);
7099     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 8));
7100     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7101     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7102     __ shrdl(tmp5, tmp4);
7103     __ shrdl(tmp4, tmp3);
7104     __ movl(Address(newArr, nIdx, Address::times_4, 4), tmp5);
7105     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7106     __ subl(nIdx, 2);
7107     __ subl(idx, 2);
7108     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7109     __ addl(idx, 2);
7110     __ addl(nIdx, 2);
7111 
7112     // Do the last iteration
7113     __ BIND(ShiftOne);
7114     __ cmpl(idx, 1);
7115     __ jcc(Assembler::less, Exit);
7116     __ subl(idx, 1);
7117     __ subl(nIdx, 1);
7118     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7119     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7120     __ shrdl(tmp4, tmp3);
7121     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7122     __ BIND(Exit);
7123     __ vzeroupper();
7124     // Restore callee save registers.
7125     __ pop(tmp5);
7126 #ifdef _WINDOWS
7127     __ pop(tmp4);
7128     __ pop(tmp3);
7129     restore_arg_regs();
7130 #endif
7131     __ leave(); // required for proper stackwalking of RuntimeStub frame
7132     __ ret(0);
7133     return start;
7134   }
7135 
7136    /**
7137    *  Arguments:
7138    *
7139    *  Input:
7140    *    c_rarg0   - newArr address
7141    *    c_rarg1   - oldArr address
7142    *    c_rarg2   - newIdx
7143    *    c_rarg3   - shiftCount
7144    * not Win64
7145    *    c_rarg4   - numIter
7146    * Win64
7147    *    rsp+40   - numIter
7148    */
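  // Per 32-bit word, the scalar loop below computes the shld form of the left shift:
  //   newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >> (32 - shiftCount))
  // and the AVX512-VBMI2 path does the same 16 words at a time with vpshldvd.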
7149   address generate_bigIntegerLeftShift() {
7150     __ align(CodeEntryAlignment);
7151     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
7152     address start = __ pc();
7153     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
7154     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
7155     const Register newArr = rdi;
7156     const Register oldArr = rsi;
7157     const Register newIdx = rdx;
7158     const Register shiftCount = rcx;  // It was intentional to have shiftCount in rcx since it is used implicitly for shift.
7159     const Register totalNumIter = r8;
7160     // For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
7161     // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
7162     const Register tmp1 = r11;                    // Caller save.
7163     const Register tmp2 = rax;                    // Caller save.
7164     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
7165     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
7166     const Register tmp5 = r14;                    // Callee save.
7167 
7168     const XMMRegister x0 = xmm0;
7169     const XMMRegister x1 = xmm1;
7170     const XMMRegister x2 = xmm2;
7171     BLOCK_COMMENT("Entry:");
7172     __ enter(); // required for proper stackwalking of RuntimeStub frame
7173 
7174 #ifdef _WINDOWS
7175     setup_arg_regs(4);
7176     // For windows, since last argument is on stack, we need to move it to the appropriate register.
7177     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
7178     // Save callee save registers.
7179     __ push(tmp3);
7180     __ push(tmp4);
7181 #endif
7182     __ push(tmp5);
7183 
7184     // Rename temps used throughout the code
7185     const Register idx = tmp1;
7186     const Register numIterTmp = tmp2;
7187 
7188     // Start idx from zero.
7189     __ xorl(idx, idx);
7190     // Compute interior pointer for new array. We do this so that we can use the same index for both the old and new arrays.
7191     __ lea(newArr, Address(newArr, newIdx, Address::times_4));
7192     __ movl(numIterTmp, totalNumIter);
7193 
7194     // If vectorization is enabled, check if the number of iterations is at least 64
7195     // If not, then go to ShiftTwo, shifting two numbers at a time
7196     if (VM_Version::supports_avx512_vbmi2()) {
7197       __ cmpl(totalNumIter, (AVX3Threshold/64));
7198       __ jcc(Assembler::less, ShiftTwo);
7199 
7200       if (AVX3Threshold < 16 * 64) {
7201         __ cmpl(totalNumIter, 16);
7202         __ jcc(Assembler::less, ShiftTwo);
7203       }
7204       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
7205       __ subl(numIterTmp, 16);
7206       __ BIND(Shift512Loop);
7207       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7208       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 0x4), Assembler::AVX_512bit);
7209       __ vpshldvd(x1, x2, x0, Assembler::AVX_512bit);
7210       __ evmovdqul(Address(newArr, idx, Address::times_4), x1, Assembler::AVX_512bit);
7211       __ addl(idx, 16);
7212       __ subl(numIterTmp, 16);
7213       __ jcc(Assembler::greaterEqual, Shift512Loop);
7214       __ addl(numIterTmp, 16);
7215     }
7216     __ BIND(ShiftTwo);
7217     __ cmpl(totalNumIter, 1);
7218     __ jcc(Assembler::less, Exit);
7219     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7220     __ subl(numIterTmp, 2);
7221     __ jcc(Assembler::less, ShiftOne);
7222 
7223     __ BIND(ShiftTwoLoop);
7224     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7225     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 0x8));
7226     __ shldl(tmp3, tmp4);
7227     __ shldl(tmp4, tmp5);
7228     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7229     __ movl(Address(newArr, idx, Address::times_4, 0x4), tmp4);
7230     __ movl(tmp3, tmp5);
7231     __ addl(idx, 2);
7232     __ subl(numIterTmp, 2);
7233     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7234 
7235     // Do the last iteration
7236     __ BIND(ShiftOne);
7237     __ addl(numIterTmp, 2);
7238     __ cmpl(numIterTmp, 1);
7239     __ jcc(Assembler::less, Exit);
7240     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7241     __ shldl(tmp3, tmp4);
7242     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7243 
7244     __ BIND(Exit);
7245     __ vzeroupper();
7246     // Restore callee save registers.
7247     __ pop(tmp5);
7248 #ifdef _WINDOWS
7249     __ pop(tmp4);
7250     __ pop(tmp3);
7251     restore_arg_regs();
7252 #endif
7253     __ leave(); // required for proper stackwalking of RuntimeStub frame
7254     __ ret(0);
7255     return start;
7256   }
7257 
7258   address generate_libmExp() {
7259     StubCodeMark mark(this, "StubRoutines", "libmExp");
7260 
7261     address start = __ pc();
7262 
7263     const XMMRegister x0  = xmm0;
7264     const XMMRegister x1  = xmm1;
7265     const XMMRegister x2  = xmm2;
7266     const XMMRegister x3  = xmm3;
7267 
7268     const XMMRegister x4  = xmm4;
7269     const XMMRegister x5  = xmm5;
7270     const XMMRegister x6  = xmm6;
7271     const XMMRegister x7  = xmm7;
7272 
7273     const Register tmp   = r11;
7274 
7275     BLOCK_COMMENT("Entry:");
7276     __ enter(); // required for proper stackwalking of RuntimeStub frame
7277 
7278     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7279 
7280     __ leave(); // required for proper stackwalking of RuntimeStub frame
7281     __ ret(0);
7282 
7283     return start;
7284 
7285   }
7286 
7287   address generate_libmLog() {
7288     StubCodeMark mark(this, "StubRoutines", "libmLog");
7289 
7290     address start = __ pc();
7291 
7292     const XMMRegister x0 = xmm0;
7293     const XMMRegister x1 = xmm1;
7294     const XMMRegister x2 = xmm2;
7295     const XMMRegister x3 = xmm3;
7296 
7297     const XMMRegister x4 = xmm4;
7298     const XMMRegister x5 = xmm5;
7299     const XMMRegister x6 = xmm6;
7300     const XMMRegister x7 = xmm7;
7301 
7302     const Register tmp1 = r11;
7303     const Register tmp2 = r8;
7304 
7305     BLOCK_COMMENT("Entry:");
7306     __ enter(); // required for proper stackwalking of RuntimeStub frame
7307 
7308     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
7309 
7310     __ leave(); // required for proper stackwalking of RuntimeStub frame
7311     __ ret(0);
7312 
7313     return start;
7314 
7315   }
7316 
7317   address generate_libmLog10() {
7318     StubCodeMark mark(this, "StubRoutines", "libmLog10");
7319 
7320     address start = __ pc();
7321 
7322     const XMMRegister x0 = xmm0;
7323     const XMMRegister x1 = xmm1;
7324     const XMMRegister x2 = xmm2;
7325     const XMMRegister x3 = xmm3;
7326 
7327     const XMMRegister x4 = xmm4;
7328     const XMMRegister x5 = xmm5;
7329     const XMMRegister x6 = xmm6;
7330     const XMMRegister x7 = xmm7;
7331 
7332     const Register tmp = r11;
7333 
7334     BLOCK_COMMENT("Entry:");
7335     __ enter(); // required for proper stackwalking of RuntimeStub frame
7336 
7337     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7338 
7339     __ leave(); // required for proper stackwalking of RuntimeStub frame
7340     __ ret(0);
7341 
7342     return start;
7343 
7344   }
7345 
7346   address generate_libmPow() {
7347     StubCodeMark mark(this, "StubRoutines", "libmPow");
7348 
7349     address start = __ pc();
7350 
7351     const XMMRegister x0 = xmm0;
7352     const XMMRegister x1 = xmm1;
7353     const XMMRegister x2 = xmm2;
7354     const XMMRegister x3 = xmm3;
7355 
7356     const XMMRegister x4 = xmm4;
7357     const XMMRegister x5 = xmm5;
7358     const XMMRegister x6 = xmm6;
7359     const XMMRegister x7 = xmm7;
7360 
7361     const Register tmp1 = r8;
7362     const Register tmp2 = r9;
7363     const Register tmp3 = r10;
7364     const Register tmp4 = r11;
7365 
7366     BLOCK_COMMENT("Entry:");
7367     __ enter(); // required for proper stackwalking of RuntimeStub frame
7368 
7369     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7370 
7371     __ leave(); // required for proper stackwalking of RuntimeStub frame
7372     __ ret(0);
7373 
7374     return start;
7375 
7376   }
7377 
7378   address generate_libmSin() {
7379     StubCodeMark mark(this, "StubRoutines", "libmSin");
7380 
7381     address start = __ pc();
7382 
7383     const XMMRegister x0 = xmm0;
7384     const XMMRegister x1 = xmm1;
7385     const XMMRegister x2 = xmm2;
7386     const XMMRegister x3 = xmm3;
7387 
7388     const XMMRegister x4 = xmm4;
7389     const XMMRegister x5 = xmm5;
7390     const XMMRegister x6 = xmm6;
7391     const XMMRegister x7 = xmm7;
7392 
7393     const Register tmp1 = r8;
7394     const Register tmp2 = r9;
7395     const Register tmp3 = r10;
7396     const Register tmp4 = r11;
7397 
7398     BLOCK_COMMENT("Entry:");
7399     __ enter(); // required for proper stackwalking of RuntimeStub frame
7400 
7401 #ifdef _WIN64
7402     __ push(rsi);
7403     __ push(rdi);
7404 #endif
7405     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7406 
7407 #ifdef _WIN64
7408     __ pop(rdi);
7409     __ pop(rsi);
7410 #endif
7411 
7412     __ leave(); // required for proper stackwalking of RuntimeStub frame
7413     __ ret(0);
7414 
7415     return start;
7416 
7417   }
7418 
7419   address generate_libmCos() {
7420     StubCodeMark mark(this, "StubRoutines", "libmCos");
7421 
7422     address start = __ pc();
7423 
7424     const XMMRegister x0 = xmm0;
7425     const XMMRegister x1 = xmm1;
7426     const XMMRegister x2 = xmm2;
7427     const XMMRegister x3 = xmm3;
7428 
7429     const XMMRegister x4 = xmm4;
7430     const XMMRegister x5 = xmm5;
7431     const XMMRegister x6 = xmm6;
7432     const XMMRegister x7 = xmm7;
7433 
7434     const Register tmp1 = r8;
7435     const Register tmp2 = r9;
7436     const Register tmp3 = r10;
7437     const Register tmp4 = r11;
7438 
7439     BLOCK_COMMENT("Entry:");
7440     __ enter(); // required for proper stackwalking of RuntimeStub frame
7441 
7442 #ifdef _WIN64
7443     __ push(rsi);
7444     __ push(rdi);
7445 #endif
7446     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7447 
7448 #ifdef _WIN64
7449     __ pop(rdi);
7450     __ pop(rsi);
7451 #endif
7452 
7453     __ leave(); // required for proper stackwalking of RuntimeStub frame
7454     __ ret(0);
7455 
7456     return start;
7457 
7458   }
7459 
7460   address generate_libmTan() {
7461     StubCodeMark mark(this, "StubRoutines", "libmTan");
7462 
7463     address start = __ pc();
7464 
7465     const XMMRegister x0 = xmm0;
7466     const XMMRegister x1 = xmm1;
7467     const XMMRegister x2 = xmm2;
7468     const XMMRegister x3 = xmm3;
7469 
7470     const XMMRegister x4 = xmm4;
7471     const XMMRegister x5 = xmm5;
7472     const XMMRegister x6 = xmm6;
7473     const XMMRegister x7 = xmm7;
7474 
7475     const Register tmp1 = r8;
7476     const Register tmp2 = r9;
7477     const Register tmp3 = r10;
7478     const Register tmp4 = r11;
7479 
7480     BLOCK_COMMENT("Entry:");
7481     __ enter(); // required for proper stackwalking of RuntimeStub frame
7482 
7483 #ifdef _WIN64
7484     __ push(rsi);
7485     __ push(rdi);
7486 #endif
7487     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7488 
7489 #ifdef _WIN64
7490     __ pop(rdi);
7491     __ pop(rsi);
7492 #endif
7493 
7494     __ leave(); // required for proper stackwalking of RuntimeStub frame
7495     __ ret(0);
7496 
7497     return start;
7498 
7499   }
7500 
7501 #undef __
7502 #define __ masm->
7503 
7504   // Continuation point for throwing of implicit exceptions that are
7505   // not handled in the current activation. Fabricates an exception
7506   // oop and initiates normal exception dispatching in this
7507   // frame. Since we need to preserve callee-saved values (currently
7508   // only for C2, but done for C1 as well) we need a callee-saved oop
7509   // map and therefore have to make these stubs into RuntimeStubs
7510   // rather than BufferBlobs.  If the compiler needs all registers to
7511   // be preserved between the fault point and the exception handler
7512   // then it must assume responsibility for that in
7513   // AbstractCompiler::continuation_for_implicit_null_exception or
7514   // continuation_for_implicit_division_by_zero_exception. All other
7515   // implicit exceptions (e.g., NullPointerException or
7516   // AbstractMethodError on entry) are either at call sites or
7517   // otherwise assume that stack unwinding will be initiated, so
7518   // caller saved registers were assumed volatile in the compiler.
7519   address generate_throw_exception(const char* name,
7520                                    address runtime_entry,
7521                                    Register arg1 = noreg,
7522                                    Register arg2 = noreg) {
7523     // Information about frame layout at time of blocking runtime call.
7524     // Note that we only have to preserve callee-saved registers since
7525     // the compilers are responsible for supplying a continuation point
7526     // if they expect all registers to be preserved.
7527     enum layout {
7528       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
7529       rbp_off2,
7530       return_off,
7531       return_off2,
7532       framesize // inclusive of return address
7533     };
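    // Rough frame picture (32-bit slots, low addresses first): the Win64 argument-register
    // shadow area (empty on Linux), then the saved rbp (two slots) and the return address
    // (two slots); 'framesize' covers all of it, keeping the frame 16-byte aligned.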
7534 
7535     int insts_size = 512;
7536     int locs_size  = 64;
7537 
7538     CodeBuffer code(name, insts_size, locs_size);
7539     OopMapSet* oop_maps  = new OopMapSet();
7540     MacroAssembler* masm = new MacroAssembler(&code);
7541 
7542     address start = __ pc();
7543 
7544     // This is an inlined and slightly modified version of call_VM
7545     // which has the ability to fetch the return PC out of
7546     // thread-local storage and also sets up last_Java_sp slightly
7547     // differently than the real call_VM
7548 
7549     __ enter(); // required for proper stackwalking of RuntimeStub frame
7550 
7551     assert(is_even(framesize/2), "sp not 16-byte aligned");
7552 
7553     // return address and rbp are already in place
7554     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
7555 
7556     int frame_complete = __ pc() - start;
7557 
7558     // Set up last_Java_sp and last_Java_fp
7559     address the_pc = __ pc();
7560     __ set_last_Java_frame(rsp, rbp, the_pc);
7561     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
7562 
7563     // Call runtime
7564     if (arg1 != noreg) {
7565       assert(arg2 != c_rarg1, "clobbered");
7566       __ movptr(c_rarg1, arg1);
7567     }
7568     if (arg2 != noreg) {
7569       __ movptr(c_rarg2, arg2);
7570     }
7571     __ movptr(c_rarg0, r15_thread);
7572     BLOCK_COMMENT("call runtime_entry");
7573     __ call(RuntimeAddress(runtime_entry));
7574 
7575     // Generate oop map
7576     OopMap* map = new OopMap(framesize, 0);
7577 
7578     oop_maps->add_gc_map(the_pc - start, map);
7579 
7580     __ reset_last_Java_frame(true);
7581 
7582     __ leave(); // required for proper stackwalking of RuntimeStub frame
7583 
7584     // check for pending exceptions
7585 #ifdef ASSERT
7586     Label L;
7587     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
7588             (int32_t) NULL_WORD);
7589     __ jcc(Assembler::notEqual, L);
7590     __ should_not_reach_here();
7591     __ bind(L);
7592 #endif // ASSERT
7593     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7594 
7595 
7596     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7597     RuntimeStub* stub =
7598       RuntimeStub::new_runtime_stub(name,
7599                                     &code,
7600                                     frame_complete,
7601                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7602                                     oop_maps, false);
7603     return stub->entry_point();
7604   }
7605 
7606   void create_control_words() {
7607     // Round to nearest, 64-bit mode, exceptions masked
7608     StubRoutines::x86::_mxcsr_std = 0x1F80;
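    // 0x1F80 leaves the rounding-control bits (13-14) at 00 (round to nearest) and sets the
    // six exception mask bits IM/DM/ZM/OM/UM/PM (bits 7-12); FTZ and DAZ stay clear.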
7609   }
7610 
7611   // Call here from the interpreter or compiled code to either load
7612   // multiple returned values from the inline type instance being
7613   // returned to registers or to store returned values to a newly
7614   // allocated inline type instance.
7615   // Register is a class, but it can be assigned a numerical value;
7616   // "0" is assigned for xmm0, so we need to ignore -Wnonnull.
7617   PRAGMA_DIAG_PUSH
7618   PRAGMA_NONNULL_IGNORED
7619   address generate_return_value_stub(address destination, const char* name, bool has_res) {
7620     // We need to save all registers the calling convention may use so
7621     // the runtime calls read or update those registers. This needs to
7622     // be in sync with SharedRuntime::java_return_convention().
7623     enum layout {
7624       pad_off = frame::arg_reg_save_area_bytes/BytesPerInt, pad_off_2,
7625       rax_off, rax_off_2,
7626       j_rarg5_off, j_rarg5_2,
7627       j_rarg4_off, j_rarg4_2,
7628       j_rarg3_off, j_rarg3_2,
7629       j_rarg2_off, j_rarg2_2,
7630       j_rarg1_off, j_rarg1_2,
7631       j_rarg0_off, j_rarg0_2,
7632       j_farg0_off, j_farg0_2,
7633       j_farg1_off, j_farg1_2,
7634       j_farg2_off, j_farg2_2,
7635       j_farg3_off, j_farg3_2,
7636       j_farg4_off, j_farg4_2,
7637       j_farg5_off, j_farg5_2,
7638       j_farg6_off, j_farg6_2,
7639       j_farg7_off, j_farg7_2,
7640       rbp_off, rbp_off_2,
7641       return_off, return_off_2,
7642 
7643       framesize
7644     };
7645 
7646     CodeBuffer buffer(name, 1000, 512);
7647     MacroAssembler* masm = new MacroAssembler(&buffer);
7648 
7649     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
7650     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
7651     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
7652     int frame_size_in_words = frame_size_in_bytes / wordSize;
7653 
7654     OopMapSet *oop_maps = new OopMapSet();
7655     OopMap* map = new OopMap(frame_size_in_slots, 0);
7656 
7657     map->set_callee_saved(VMRegImpl::stack2reg(rax_off), rax->as_VMReg());
7658     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
7659     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
7660     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
7661     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
7662     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
7663     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
7664     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
7665     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
7666     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
7667     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
7668     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
7669     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
7670     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
7671     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
7672 
7673     int start = __ offset();
7674 
7675     __ subptr(rsp, frame_size_in_bytes - 8 /* return address*/);
7676 
7677     __ movptr(Address(rsp, rbp_off * BytesPerInt), rbp);
7678     __ movdbl(Address(rsp, j_farg7_off * BytesPerInt), j_farg7);
7679     __ movdbl(Address(rsp, j_farg6_off * BytesPerInt), j_farg6);
7680     __ movdbl(Address(rsp, j_farg5_off * BytesPerInt), j_farg5);
7681     __ movdbl(Address(rsp, j_farg4_off * BytesPerInt), j_farg4);
7682     __ movdbl(Address(rsp, j_farg3_off * BytesPerInt), j_farg3);
7683     __ movdbl(Address(rsp, j_farg2_off * BytesPerInt), j_farg2);
7684     __ movdbl(Address(rsp, j_farg1_off * BytesPerInt), j_farg1);
7685     __ movdbl(Address(rsp, j_farg0_off * BytesPerInt), j_farg0);
7686 
7687     __ movptr(Address(rsp, j_rarg0_off * BytesPerInt), j_rarg0);
7688     __ movptr(Address(rsp, j_rarg1_off * BytesPerInt), j_rarg1);
7689     __ movptr(Address(rsp, j_rarg2_off * BytesPerInt), j_rarg2);
7690     __ movptr(Address(rsp, j_rarg3_off * BytesPerInt), j_rarg3);
7691     __ movptr(Address(rsp, j_rarg4_off * BytesPerInt), j_rarg4);
7692     __ movptr(Address(rsp, j_rarg5_off * BytesPerInt), j_rarg5);
7693     __ movptr(Address(rsp, rax_off * BytesPerInt), rax);
7694 
7695     int frame_complete = __ offset();
7696 
7697     __ set_last_Java_frame(noreg, noreg, NULL);
7698 
7699     __ mov(c_rarg0, r15_thread);
7700     __ mov(c_rarg1, rax);
7701 
7702     __ call(RuntimeAddress(destination));
7703 
7704     // Set an oopmap for the call site.
7705 
7706     oop_maps->add_gc_map( __ offset() - start, map);
7707 
7708     // clear last_Java_sp
7709     __ reset_last_Java_frame(false);
7710 
7711     __ movptr(rbp, Address(rsp, rbp_off * BytesPerInt));
7712     __ movdbl(j_farg7, Address(rsp, j_farg7_off * BytesPerInt));
7713     __ movdbl(j_farg6, Address(rsp, j_farg6_off * BytesPerInt));
7714     __ movdbl(j_farg5, Address(rsp, j_farg5_off * BytesPerInt));
7715     __ movdbl(j_farg4, Address(rsp, j_farg4_off * BytesPerInt));
7716     __ movdbl(j_farg3, Address(rsp, j_farg3_off * BytesPerInt));
7717     __ movdbl(j_farg2, Address(rsp, j_farg2_off * BytesPerInt));
7718     __ movdbl(j_farg1, Address(rsp, j_farg1_off * BytesPerInt));
7719     __ movdbl(j_farg0, Address(rsp, j_farg0_off * BytesPerInt));
7720 
7721     __ movptr(j_rarg0, Address(rsp, j_rarg0_off * BytesPerInt));
7722     __ movptr(j_rarg1, Address(rsp, j_rarg1_off * BytesPerInt));
7723     __ movptr(j_rarg2, Address(rsp, j_rarg2_off * BytesPerInt));
7724     __ movptr(j_rarg3, Address(rsp, j_rarg3_off * BytesPerInt));
7725     __ movptr(j_rarg4, Address(rsp, j_rarg4_off * BytesPerInt));
7726     __ movptr(j_rarg5, Address(rsp, j_rarg5_off * BytesPerInt));
7727     __ movptr(rax, Address(rsp, rax_off * BytesPerInt));
7728 
7729     __ addptr(rsp, frame_size_in_bytes-8);
7730 
7731     // check for pending exceptions
7732     Label pending;
7733     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
7734     __ jcc(Assembler::notEqual, pending);
7735 
7736     if (has_res) {
7737       __ get_vm_result(rax, r15_thread);
7738     }
7739 
7740     __ ret(0);
7741 
7742     __ bind(pending);
7743 
7744     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
7745     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7746 
7747     // -------------
7748     // make sure all code is generated
7749     masm->flush();
7750 
7751     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, false);
7752     return stub->entry_point();
7753   }
7754 
7755   // Initialization
7756   void generate_initial() {
7757     // Generates all stubs and initializes the entry points
7758 
7759     // These platform-specific settings are needed by generate_call_stub()
7760     create_control_words();
7761 
7762     // Entry points that exist on all platforms.  Note: This is code
7763     // that could be shared among different platforms - however the
7764     // benefit seems to be smaller than the disadvantage of having a
7765     // much more complicated generator structure. See also comment in
7766     // stubRoutines.hpp.
7767 
7768     StubRoutines::_forward_exception_entry = generate_forward_exception();
7769 
7770     // Generate these first because they are called from other stubs
7771     if (InlineTypeReturnedAsFields) {
7772       StubRoutines::_load_inline_type_fields_in_regs =
7773         generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
7774       StubRoutines::_store_inline_type_fields_to_buf =
7775         generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
7776     }
7777     StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
7778 
7779     // is referenced by megamorphic call
7780     StubRoutines::_catch_exception_entry = generate_catch_exception();
7781 
7782     // atomic calls
7783     StubRoutines::_fence_entry                = generate_orderaccess_fence();
7784 
7785     // platform dependent
7786     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
7787 
7788     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
7789 
7790     StubRoutines::x86::_f2i_fixup             = generate_f2i_fixup();
7791     StubRoutines::x86::_f2l_fixup             = generate_f2l_fixup();
7792     StubRoutines::x86::_d2i_fixup             = generate_d2i_fixup();
7793     StubRoutines::x86::_d2l_fixup             = generate_d2l_fixup();
7794 
7795     StubRoutines::x86::_float_sign_mask       = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
7796     StubRoutines::x86::_float_sign_flip       = generate_fp_mask("float_sign_flip",  0x8000000080000000);
7797     StubRoutines::x86::_double_sign_mask      = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7798     StubRoutines::x86::_double_sign_flip      = generate_fp_mask("double_sign_flip", 0x8000000000000000);
7799 
7800     // Build this early so it's available for the interpreter.
7801     StubRoutines::_throw_StackOverflowError_entry =
7802       generate_throw_exception("StackOverflowError throw_exception",
7803                                CAST_FROM_FN_PTR(address,
7804                                                 SharedRuntime::
7805                                                 throw_StackOverflowError));
7806     StubRoutines::_throw_delayed_StackOverflowError_entry =
7807       generate_throw_exception("delayed StackOverflowError throw_exception",
7808                                CAST_FROM_FN_PTR(address,
7809                                                 SharedRuntime::
7810                                                 throw_delayed_StackOverflowError));
7811     if (UseCRC32Intrinsics) {
7812       // set table address before stub generation which use it
7813       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
7814       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7815     }
7816 
7817     if (UseCRC32CIntrinsics) {
7818       bool supports_clmul = VM_Version::supports_clmul();
7819       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
7820       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
7821       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
7822     }
7823 
7824     if (UseAdler32Intrinsics) {
7825        StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7826     }
7827 
7828     if (UseLibmIntrinsic && InlineIntrinsics) {
7829       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
7830           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
7831           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7832         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
7833         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
7834         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
7835         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
7836         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
7837         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
7838         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
7839         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
7840         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
7841         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
7842         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
7843         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
7844         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
7845         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
7846       }
7847       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
7848         StubRoutines::_dexp = generate_libmExp();
7849       }
7850       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7851         StubRoutines::_dlog = generate_libmLog();
7852       }
7853       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
7854         StubRoutines::_dlog10 = generate_libmLog10();
7855       }
7856       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
7857         StubRoutines::_dpow = generate_libmPow();
7858       }
7859       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7860         StubRoutines::_dsin = generate_libmSin();
7861       }
7862       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7863         StubRoutines::_dcos = generate_libmCos();
7864       }
7865       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7866         StubRoutines::_dtan = generate_libmTan();
7867       }
7868     }
7869 
7870     // Safefetch stubs.
7871     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7872                                                        &StubRoutines::_safefetch32_fault_pc,
7873                                                        &StubRoutines::_safefetch32_continuation_pc);
7874     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7875                                                        &StubRoutines::_safefetchN_fault_pc,
7876                                                        &StubRoutines::_safefetchN_continuation_pc);
7877   }
7878 
7879   void generate_all() {
7880     // Generates all stubs and initializes the entry points
7881 
7882     // These entry points require SharedInfo::stack0 to be set up in
7883     // non-core builds and need to be relocatable, so they each
7884     // fabricate a RuntimeStub internally.
7885     StubRoutines::_throw_AbstractMethodError_entry =
7886       generate_throw_exception("AbstractMethodError throw_exception",
7887                                CAST_FROM_FN_PTR(address,
7888                                                 SharedRuntime::
7889                                                 throw_AbstractMethodError));
7890 
7891     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7892       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7893                                CAST_FROM_FN_PTR(address,
7894                                                 SharedRuntime::
7895                                                 throw_IncompatibleClassChangeError));
7896 
7897     StubRoutines::_throw_NullPointerException_at_call_entry =
7898       generate_throw_exception("NullPointerException at call throw_exception",
7899                                CAST_FROM_FN_PTR(address,
7900                                                 SharedRuntime::
7901                                                 throw_NullPointerException_at_call));
7902 
7903     // entry points that are platform specific
7904     StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
7905     StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000);
7906     StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7907     StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000);
7908     StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF);
7909     StubRoutines::x86::_vector_int_mask_cmp_bits = generate_vector_mask("vector_int_mask_cmp_bits", 0x0000000100000001);
7910     StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
7911     StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
7912     StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff000000ff);
7913     StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff0000ffff);
7914     StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
7915                                                                         0xFFFFFFFF, 0, 0, 0);
7916     StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
7917                                                                         0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
7918     StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x0302010003020100);
7919     StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
7920     StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x0100010001000100);
7921     StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask("vector_long_shuffle_mask", 0x0000000100000000);
7922     StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
7923     StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
7924 
7925     if (UsePopCountInstruction && VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
7926       // LUT implementation influenced by the counting-1s algorithm from section 5-1 of Hacker's Delight.
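           // The LUT maps each 4-bit nibble to its bit count; vpshufb performs
           // the per-nibble lookups in parallel and the partial counts are then
           // summed into the final population count.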
7927       StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
7928     }
7929 
7930     // support for verify_oop (must happen after universe_init)
7931     if (VerifyOops) {
7932       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
7933     }
7934 
7935     // data cache line writeback
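         // These back Unsafe::writebackMemory (e.g. MappedByteBuffer::force on
         // persistent-memory-backed buffers), using CLWB/CLFLUSHOPT/CLFLUSH
         // depending on what the CPU supports.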
7936     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7937     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7938 
7939     // arraycopy stubs used by compilers
7940     generate_arraycopy_stubs();
7941 
7942     // don't bother generating these AES intrinsic stubs unless global flag is set
7943     if (UseAESIntrinsics) {
7944       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
7945       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7946       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7947       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7948       if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
7949         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
7950         StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt();
7951         StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt();
7952         StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
7953         StubRoutines::x86::_ghash_poly512_addr = ghash_polynomial512_addr();
7954         StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
7955         StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7956       } else {
7957         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
7958       }
7959     }
7960 
7961     if (UseAESCTRIntrinsics) {
7962       if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512bw() && VM_Version::supports_avx512vl()) {
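             // The counter mask constant may already have been published by the
             // AES-CBC/ECB block above; only generate it if it is still missing.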
7963         if (StubRoutines::x86::_counter_mask_addr == NULL) {
7964           StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
7965         }
7966         StubRoutines::_counterMode_AESCrypt = generate_counterMode_VectorAESCrypt();
7967       } else {
7968         StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
7969         StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
7970       }
7971     }
7972 
7973     if (UseMD5Intrinsics) {
7974       StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
7975       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
7976     }
7977     if (UseSHA1Intrinsics) {
7978       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
7979       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
7980       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
7981       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
7982     }
7983     if (UseSHA256Intrinsics) {
7984       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
7985       char* dst = (char*)StubRoutines::x86::_k256_W;
7986       char* src = (char*)StubRoutines::x86::_k256;
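           // _k256 holds the 64 32-bit SHA-256 round constants as 16 rows of
           // 16 bytes.  Build _k256_W by duplicating each row into both halves
           // of a 32-byte block, so a single 256-bit load places the same four
           // constants in each 128-bit lane of an AVX2 register.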
7987       for (int ii = 0; ii < 16; ++ii) {
7988         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
7989         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
7990       }
7991       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
7992       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
7993       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
7994       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
7995     }
7996     if (UseSHA512Intrinsics) {
7997       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
7998       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
7999       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
8000       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
8001     }
8002 
8003     // Generate GHASH intrinsics code
8004     if (UseGHASHIntrinsics) {
8005       if (StubRoutines::x86::_ghash_long_swap_mask_addr == NULL) {
8006         StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
8007       }
8008       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
8009       if (VM_Version::supports_avx()) {
8010         StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
8011         StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
8012         StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
8013       } else {
8014         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8015       }
8016     }
8017 
8018 
8019     if (UseBASE64Intrinsics) {
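           // Three groups of constants are published: the AVX2 shuffle /
           // input-mask / lookup tables, the AVX-512 VBMI lookup and join
           // tables, and the base encoding/decoding tables that are generated
           // unconditionally.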
8020       if (VM_Version::supports_avx2() &&
8021          VM_Version::supports_avx512bw() &&
8022          VM_Version::supports_avx512vl()) {
8023         StubRoutines::x86::_avx2_shuffle_base64 = base64_avx2_shuffle_addr();
8024         StubRoutines::x86::_avx2_input_mask_base64 = base64_avx2_input_mask_addr();
8025         StubRoutines::x86::_avx2_lut_base64 = base64_avx2_lut_addr();
8026       }
8027       StubRoutines::x86::_encoding_table_base64 = base64_encoding_table_addr();
8028       if (VM_Version::supports_avx512_vbmi()) {
8029         StubRoutines::x86::_shuffle_base64 = base64_shuffle_addr();
8030         StubRoutines::x86::_lookup_lo_base64 = base64_vbmi_lookup_lo_addr();
8031         StubRoutines::x86::_lookup_hi_base64 = base64_vbmi_lookup_hi_addr();
8032         StubRoutines::x86::_lookup_lo_base64url = base64_vbmi_lookup_lo_url_addr();
8033         StubRoutines::x86::_lookup_hi_base64url = base64_vbmi_lookup_hi_url_addr();
8034         StubRoutines::x86::_pack_vec_base64 = base64_vbmi_pack_vec_addr();
8035         StubRoutines::x86::_join_0_1_base64 = base64_vbmi_join_0_1_addr();
8036         StubRoutines::x86::_join_1_2_base64 = base64_vbmi_join_1_2_addr();
8037         StubRoutines::x86::_join_2_3_base64 = base64_vbmi_join_2_3_addr();
8038       }
8039       StubRoutines::x86::_decoding_table_base64 = base64_decoding_table_addr();
8040       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8041       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8042     }
8043 
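         // nmethod entry barriers let the GC force compiled methods through a
         // slow path when they have been armed (e.g. for concurrent nmethod
         // processing); the stub is only needed if the active barrier set
         // provides such barriers.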
8044     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8045     if (bs_nm != NULL) {
8046       StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
8047     }
8048 #ifdef COMPILER2
8049     if (UseMultiplyToLenIntrinsic) {
8050       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8051     }
8052     if (UseSquareToLenIntrinsic) {
8053       StubRoutines::_squareToLen = generate_squareToLen();
8054     }
8055     if (UseMulAddIntrinsic) {
8056       StubRoutines::_mulAdd = generate_mulAdd();
8057     }
8058     if (VM_Version::supports_avx512_vbmi2()) {
8059       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8060       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
8061     }
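         // The Montgomery multiply/square intrinsics dispatch to C++ helpers in
         // SharedRuntime rather than to generated assembly, so only the entry
         // addresses are recorded here.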
8062     if (UseMontgomeryMultiplyIntrinsic) {
8063       StubRoutines::_montgomeryMultiply
8064         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
8065     }
8066     if (UseMontgomerySquareIntrinsic) {
8067       StubRoutines::_montgomerySquare
8068         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
8069     }
8070 
8071     // Get SVML stub routine addresses
8072     void *libjsvml = NULL;
8073     char ebuf[1024];
8074     char dll_name[JVM_MAXPATHLEN];
8075     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "jsvml")) {
8076       libjsvml = os::dll_load(dll_name, ebuf, sizeof ebuf);
8077     }
8078     if (libjsvml != NULL) {
8079       // SVML method naming convention
8080       //   All the methods are named as __jsvml_op<T><N>_ha_<VV>
8081       //   Where:
8082       //      ha stands for high accuracy
8083       //      <T> is optional to indicate float/double
8084       //              Set to f for vector float operation
8085       //              Omitted for vector double operation
8086       //      <N> is the number of elements in the vector
8087       //              1, 2, 4, 8, 16
8088       //              e.g. 128 bit float vector has 4 float elements
8089       //      <VV> indicates the avx/sse level:
8090       //              z0 is AVX512, l9 is AVX2, e9 is AVX1 and ex is for SSE2
8091       //      e.g. __jsvml_expf16_ha_z0 is the method for computing 16 element vector float exp using AVX 512 insns
8092       //           __jsvml_exp8_ha_z0 is the method for computing 8 element vector double exp using AVX 512 insns
8093 
8094       log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "jsvml" JNI_LIB_SUFFIX, p2i(libjsvml));
8095       if (UseAVX > 2) {
8096         for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
8097           int vop = VectorSupport::VECTOR_OP_SVML_START + op;
8098           if ((!VM_Version::supports_avx512dq()) &&
8099               (vop == VectorSupport::VECTOR_OP_LOG || vop == VectorSupport::VECTOR_OP_LOG10 || vop == VectorSupport::VECTOR_OP_POW)) {
8100             continue;
8101           }
8102           snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf16_ha_z0", VectorSupport::svmlname[op]);
8103           StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libjsvml, ebuf);
8104 
8105           snprintf(ebuf, sizeof(ebuf), "__jsvml_%s8_ha_z0", VectorSupport::svmlname[op]);
8106           StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libjsvml, ebuf);
8107         }
8108       }
8109       const char* avx_sse_str = (UseAVX >= 2) ? "l9" : ((UseAVX == 1) ? "e9" : "ex");
8110       for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
8111         int vop = VectorSupport::VECTOR_OP_SVML_START + op;
8112         if (vop == VectorSupport::VECTOR_OP_POW) {
8113           continue;
8114         }
8115         snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8116         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libjsvml, ebuf);
8117 
8118         snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8119         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libjsvml, ebuf);
8120 
8121         snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf8_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8122         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libjsvml, ebuf);
8123 
8124         snprintf(ebuf, sizeof(ebuf), "__jsvml_%s1_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8125         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libjsvml, ebuf);
8126 
8127         snprintf(ebuf, sizeof(ebuf), "__jsvml_%s2_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8128         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libjsvml, ebuf);
8129 
8130         snprintf(ebuf, sizeof(ebuf), "__jsvml_%s4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8131         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libjsvml, ebuf);
8132       }
8133     }
8134 #endif // COMPILER2
8135 
8136     if (UseVectorizedMismatchIntrinsic) {
8137       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
8138     }
8139   }
8140 
8141  public:
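       // Stubs are generated in two batches: generate_initial() runs very early
       // in VM startup for stubs the interpreter and runtime need immediately,
       // and generate_all() runs later (after universe initialization) for the
       // remaining stubs.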
8142   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
8143     if (all) {
8144       generate_all();
8145     } else {
8146       generate_initial();
8147     }
8148   }
8149 }; // end class declaration
8150 
8151 #define UCM_TABLE_MAX_ENTRIES 16
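     // The UnsafeCopyMemory table records the PC ranges of copy stubs that read
     // or write user-supplied memory, so that a fault inside such a stub (e.g.
     // from a truncated memory-mapped file) can be reported as a Java exception
     // instead of crashing the VM.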
8152 void StubGenerator_generate(CodeBuffer* code, bool all) {
8153   if (UnsafeCopyMemory::_table == NULL) {
8154     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
8155   }
8156   StubGenerator g(code, all);
8157 }