1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "ci/ciUtilities.hpp"
  29 #include "compiler/oopMap.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/barrierSetNMethod.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "interpreter/interpreter.hpp"
  35 #include "memory/universe.hpp"
  36 #include "nativeInst_x86.hpp"
  37 #include "oops/instanceOop.hpp"
  38 #include "oops/method.hpp"
  39 #include "oops/objArrayKlass.hpp"
  40 #include "oops/oop.inline.hpp"
  41 #include "prims/methodHandles.hpp"
  42 #include "runtime/arguments.hpp"
  43 #include "runtime/frame.inline.hpp"
  44 #include "runtime/handles.inline.hpp"
  45 #include "runtime/sharedRuntime.hpp"
  46 #include "runtime/stubCodeGenerator.hpp"
  47 #include "runtime/stubRoutines.hpp"
  48 #include "runtime/thread.inline.hpp"
  49 #ifdef COMPILER2
  50 #include "opto/runtime.hpp"
  51 #endif
  52 #if INCLUDE_JVMCI
  53 #include "jvmci/jvmci_globals.hpp"
  54 #endif
  55 #if INCLUDE_ZGC
  56 #include "gc/z/zThreadLocalData.hpp"
  57 #endif
  58 
  59 // Declaration and definition of StubGenerator (no .hpp file).
  60 // For a more detailed description of the stub routine structure
  61 // see the comment in stubRoutines.hpp
  62 
  63 #define __ _masm->
  64 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  65 #define a__ ((Assembler*)_masm)->
  66 
  67 #ifdef PRODUCT
  68 #define BLOCK_COMMENT(str) /* nothing */
  69 #else
  70 #define BLOCK_COMMENT(str) __ block_comment(str)
  71 #endif
  72 
  73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
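     // MXCSR bits 0-5 are the sticky exception status flags; masking them out
     // leaves only the control bits (DAZ, exception masks, rounding control, FTZ)
     // for comparison against the expected standard value.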
  74 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
  75 
  76 // Stub Code definitions
  77 
  78 class StubGenerator: public StubCodeGenerator {
  79  private:
  80 
  81 #ifdef PRODUCT
  82 #define inc_counter_np(counter) ((void)0)
  83 #else
  84   void inc_counter_np_(int& counter) {
  85     // This can destroy rscratch1 if counter is far from the code cache
  86     __ incrementl(ExternalAddress((address)&counter));
  87   }
  88 #define inc_counter_np(counter) \
  89   BLOCK_COMMENT("inc_counter " #counter); \
  90   inc_counter_np_(counter);
  91 #endif
  92 
  93   // Call stubs are used to call Java from C
  94   //
  95   // Linux Arguments:
  96   //    c_rarg0:   call wrapper address                   address
  97   //    c_rarg1:   result                                 address
  98   //    c_rarg2:   result type                            BasicType
  99   //    c_rarg3:   method                                 Method*
 100   //    c_rarg4:   (interpreter) entry point              address
 101   //    c_rarg5:   parameters                             intptr_t*
 102   //    16(rbp): parameter size (in words)              int
 103   //    24(rbp): thread                                 Thread*
 104   //
 105   //     [ return_from_Java     ] <--- rsp
 106   //     [ argument word n      ]
 107   //      ...
 108   // -12 [ argument word 1      ]
 109   // -11 [ saved r15            ] <--- rsp_after_call
 110   // -10 [ saved r14            ]
 111   //  -9 [ saved r13            ]
 112   //  -8 [ saved r12            ]
 113   //  -7 [ saved rbx            ]
 114   //  -6 [ call wrapper         ]
 115   //  -5 [ result               ]
 116   //  -4 [ result type          ]
 117   //  -3 [ method               ]
 118   //  -2 [ entry point          ]
 119   //  -1 [ parameters           ]
 120   //   0 [ saved rbp            ] <--- rbp
 121   //   1 [ return address       ]
 122   //   2 [ parameter size       ]
 123   //   3 [ thread               ]
 124   //
 125   // Windows Arguments:
 126   //    c_rarg0:   call wrapper address                   address
 127   //    c_rarg1:   result                                 address
 128   //    c_rarg2:   result type                            BasicType
 129   //    c_rarg3:   method                                 Method*
 130   //    48(rbp): (interpreter) entry point              address
 131   //    56(rbp): parameters                             intptr_t*
 132   //    64(rbp): parameter size (in words)              int
 133   //    72(rbp): thread                                 Thread*
 134   //
 135   //     [ return_from_Java     ] <--- rsp
 136   //     [ argument word n      ]
 137   //      ...
 138   // -60 [ argument word 1      ]
  139   // -59 [ saved xmm31          ] <--- rsp_after_call
 140   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 141   // -27 [ saved xmm15          ]
 142   //     [ saved xmm7-xmm14     ]
 143   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 144   //  -7 [ saved r15            ]
 145   //  -6 [ saved r14            ]
 146   //  -5 [ saved r13            ]
 147   //  -4 [ saved r12            ]
 148   //  -3 [ saved rdi            ]
 149   //  -2 [ saved rsi            ]
 150   //  -1 [ saved rbx            ]
 151   //   0 [ saved rbp            ] <--- rbp
 152   //   1 [ return address       ]
 153   //   2 [ call wrapper         ]
 154   //   3 [ result               ]
 155   //   4 [ result type          ]
 156   //   5 [ method               ]
 157   //   6 [ entry point          ]
 158   //   7 [ parameters           ]
 159   //   8 [ parameter size       ]
 160   //   9 [ thread               ]
 161   //
  162   //    Windows reserves the caller's stack space for arguments 1-4.
 163   //    We spill c_rarg0-c_rarg3 to this space.
 164 
 165   // Call stub stack layout word offsets from rbp
 166   enum call_stub_layout {
 167 #ifdef _WIN64
 168     xmm_save_first     = 6,  // save from xmm6
 169     xmm_save_last      = 31, // to xmm31
 170     xmm_save_base      = -9,
 171     rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
 172     r15_off            = -7,
 173     r14_off            = -6,
 174     r13_off            = -5,
 175     r12_off            = -4,
 176     rdi_off            = -3,
 177     rsi_off            = -2,
 178     rbx_off            = -1,
 179     rbp_off            =  0,
 180     retaddr_off        =  1,
 181     call_wrapper_off   =  2,
 182     result_off         =  3,
 183     result_type_off    =  4,
 184     method_off         =  5,
 185     entry_point_off    =  6,
 186     parameters_off     =  7,
 187     parameter_size_off =  8,
 188     thread_off         =  9
 189 #else
 190     rsp_after_call_off = -12,
 191     mxcsr_off          = rsp_after_call_off,
 192     r15_off            = -11,
 193     r14_off            = -10,
 194     r13_off            = -9,
 195     r12_off            = -8,
 196     rbx_off            = -7,
 197     call_wrapper_off   = -6,
 198     result_off         = -5,
 199     result_type_off    = -4,
 200     method_off         = -3,
 201     entry_point_off    = -2,
 202     parameters_off     = -1,
 203     rbp_off            =  0,
 204     retaddr_off        =  1,
 205     parameter_size_off =  2,
 206     thread_off         =  3
 207 #endif
 208   };
 209 
 210 #ifdef _WIN64
 211   Address xmm_save(int reg) {
 212     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 213     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 214   }
 215 #endif
 216 
 217   address generate_call_stub(address& return_address) {
 218     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 219            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 220            "adjust this code");
 221     StubCodeMark mark(this, "StubRoutines", "call_stub");
 222     address start = __ pc();
 223 
 224     // same as in generate_catch_exception()!
 225     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 226 
 227     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 228     const Address result        (rbp, result_off         * wordSize);
 229     const Address result_type   (rbp, result_type_off    * wordSize);
 230     const Address method        (rbp, method_off         * wordSize);
 231     const Address entry_point   (rbp, entry_point_off    * wordSize);
 232     const Address parameters    (rbp, parameters_off     * wordSize);
 233     const Address parameter_size(rbp, parameter_size_off * wordSize);
 234 
 235     // same as in generate_catch_exception()!
 236     const Address thread        (rbp, thread_off         * wordSize);
 237 
 238     const Address r15_save(rbp, r15_off * wordSize);
 239     const Address r14_save(rbp, r14_off * wordSize);
 240     const Address r13_save(rbp, r13_off * wordSize);
 241     const Address r12_save(rbp, r12_off * wordSize);
 242     const Address rbx_save(rbp, rbx_off * wordSize);
 243 
 244     // stub code
 245     __ enter();
 246     __ subptr(rsp, -rsp_after_call_off * wordSize);
 247 
 248     // save register parameters
 249 #ifndef _WIN64
 250     __ movptr(parameters,   c_rarg5); // parameters
 251     __ movptr(entry_point,  c_rarg4); // entry_point
 252 #endif
 253 
 254     __ movptr(method,       c_rarg3); // method
 255     __ movl(result_type,  c_rarg2);   // result type
 256     __ movptr(result,       c_rarg1); // result
 257     __ movptr(call_wrapper, c_rarg0); // call wrapper
 258 
 259     // save regs belonging to calling function
 260     __ movptr(rbx_save, rbx);
 261     __ movptr(r12_save, r12);
 262     __ movptr(r13_save, r13);
 263     __ movptr(r14_save, r14);
 264     __ movptr(r15_save, r15);
 265 
 266 #ifdef _WIN64
 267     int last_reg = 15;
 268     if (UseAVX > 2) {
 269       last_reg = 31;
 270     }
 271     if (VM_Version::supports_evex()) {
 272       for (int i = xmm_save_first; i <= last_reg; i++) {
 273         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 274       }
 275     } else {
 276       for (int i = xmm_save_first; i <= last_reg; i++) {
 277         __ movdqu(xmm_save(i), as_XMMRegister(i));
 278       }
 279     }
 280 
 281     const Address rdi_save(rbp, rdi_off * wordSize);
 282     const Address rsi_save(rbp, rsi_off * wordSize);
 283 
 284     __ movptr(rsi_save, rsi);
 285     __ movptr(rdi_save, rdi);
 286 #else
 287     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
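          // Check that the caller's MXCSR control bits (rounding mode and exception
          // masks) match the VM's expected standard value; if they differ, load the
          // standard MXCSR so Java floating-point code runs with the settings it assumes.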
 288     {
 289       Label skip_ldmx;
 290       __ stmxcsr(mxcsr_save);
 291       __ movl(rax, mxcsr_save);
 292       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 293       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 294       __ cmp32(rax, mxcsr_std);
 295       __ jcc(Assembler::equal, skip_ldmx);
 296       __ ldmxcsr(mxcsr_std);
 297       __ bind(skip_ldmx);
 298     }
 299 #endif
 300 
 301     // Load up thread register
 302     __ movptr(r15_thread, thread);
 303     __ reinit_heapbase();
 304 
 305 #ifdef ASSERT
 306     // make sure we have no pending exceptions
 307     {
 308       Label L;
 309       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 310       __ jcc(Assembler::equal, L);
 311       __ stop("StubRoutines::call_stub: entered with pending exception");
 312       __ bind(L);
 313     }
 314 #endif
 315 
 316     // pass parameters if any
 317     BLOCK_COMMENT("pass parameters if any");
 318     Label parameters_done;
 319     __ movl(c_rarg3, parameter_size);
 320     __ testl(c_rarg3, c_rarg3);
 321     __ jcc(Assembler::zero, parameters_done);
 322 
 323     Label loop;
 324     __ movptr(c_rarg2, parameters);       // parameter pointer
 325     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 326     __ BIND(loop);
 327     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 328     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 329     __ decrementl(c_rarg1);             // decrement counter
 330     __ push(rax);                       // pass parameter
 331     __ jcc(Assembler::notZero, loop);
 332 
 333     // call Java function
 334     __ BIND(parameters_done);
 335     __ movptr(rbx, method);             // get Method*
 336     __ movptr(c_rarg1, entry_point);    // get entry_point
 337     __ mov(r13, rsp);                   // set sender sp
 338     BLOCK_COMMENT("call Java function");
 339     __ call(c_rarg1);
 340 
 341     BLOCK_COMMENT("call_stub_return_address:");
 342     return_address = __ pc();
 343 
 344     // store result depending on type (everything that is not
 345     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 346     __ movptr(c_rarg0, result);
 347     Label is_long, is_float, is_double, exit;
 348     __ movl(c_rarg1, result_type);
 349     __ cmpl(c_rarg1, T_OBJECT);
 350     __ jcc(Assembler::equal, is_long);
 351     __ cmpl(c_rarg1, T_LONG);
 352     __ jcc(Assembler::equal, is_long);
 353     __ cmpl(c_rarg1, T_FLOAT);
 354     __ jcc(Assembler::equal, is_float);
 355     __ cmpl(c_rarg1, T_DOUBLE);
 356     __ jcc(Assembler::equal, is_double);
 357 
 358     // handle T_INT case
 359     __ movl(Address(c_rarg0, 0), rax);
 360 
 361     __ BIND(exit);
 362 
 363     // pop parameters
 364     __ lea(rsp, rsp_after_call);
 365 
 366 #ifdef ASSERT
 367     // verify that threads correspond
 368     {
  369       Label L1, L2, L3;
 370       __ cmpptr(r15_thread, thread);
 371       __ jcc(Assembler::equal, L1);
 372       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 373       __ bind(L1);
 374       __ get_thread(rbx);
 375       __ cmpptr(r15_thread, thread);
 376       __ jcc(Assembler::equal, L2);
 377       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 378       __ bind(L2);
 379       __ cmpptr(r15_thread, rbx);
 380       __ jcc(Assembler::equal, L3);
 381       __ stop("StubRoutines::call_stub: threads must correspond");
 382       __ bind(L3);
 383     }
 384 #endif
 385 
 386     // restore regs belonging to calling function
 387 #ifdef _WIN64
 388     // emit the restores for xmm regs
 389     if (VM_Version::supports_evex()) {
 390       for (int i = xmm_save_first; i <= last_reg; i++) {
 391         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 392       }
 393     } else {
 394       for (int i = xmm_save_first; i <= last_reg; i++) {
 395         __ movdqu(as_XMMRegister(i), xmm_save(i));
 396       }
 397     }
 398 #endif
 399     __ movptr(r15, r15_save);
 400     __ movptr(r14, r14_save);
 401     __ movptr(r13, r13_save);
 402     __ movptr(r12, r12_save);
 403     __ movptr(rbx, rbx_save);
 404 
 405 #ifdef _WIN64
 406     __ movptr(rdi, rdi_save);
 407     __ movptr(rsi, rsi_save);
 408 #else
 409     __ ldmxcsr(mxcsr_save);
 410 #endif
 411 
 412     // restore rsp
 413     __ addptr(rsp, -rsp_after_call_off * wordSize);
 414 
 415     // return
 416     __ vzeroupper();
 417     __ pop(rbp);
 418     __ ret(0);
 419 
 420     // handle return types different from T_INT
 421     __ BIND(is_long);
 422     __ movq(Address(c_rarg0, 0), rax);
 423     __ jmp(exit);
 424 
 425     __ BIND(is_float);
 426     __ movflt(Address(c_rarg0, 0), xmm0);
 427     __ jmp(exit);
 428 
 429     __ BIND(is_double);
 430     __ movdbl(Address(c_rarg0, 0), xmm0);
 431     __ jmp(exit);
 432 
 433     return start;
 434   }
 435 
 436   // Return point for a Java call if there's an exception thrown in
 437   // Java code.  The exception is caught and transformed into a
 438   // pending exception stored in JavaThread that can be tested from
 439   // within the VM.
 440   //
 441   // Note: Usually the parameters are removed by the callee. In case
 442   // of an exception crossing an activation frame boundary, that is
 443   // not the case if the callee is compiled code => need to setup the
 444   // rsp.
 445   //
 446   // rax: exception oop
 447 
 448   address generate_catch_exception() {
 449     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 450     address start = __ pc();
 451 
 452     // same as in generate_call_stub():
 453     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 454     const Address thread        (rbp, thread_off         * wordSize);
 455 
 456 #ifdef ASSERT
 457     // verify that threads correspond
 458     {
 459       Label L1, L2, L3;
 460       __ cmpptr(r15_thread, thread);
 461       __ jcc(Assembler::equal, L1);
 462       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 463       __ bind(L1);
 464       __ get_thread(rbx);
 465       __ cmpptr(r15_thread, thread);
 466       __ jcc(Assembler::equal, L2);
 467       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 468       __ bind(L2);
 469       __ cmpptr(r15_thread, rbx);
 470       __ jcc(Assembler::equal, L3);
 471       __ stop("StubRoutines::catch_exception: threads must correspond");
 472       __ bind(L3);
 473     }
 474 #endif
 475 
 476     // set pending exception
 477     __ verify_oop(rax);
 478 
 479     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 480     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 481     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 482     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 483 
 484     // complete return to VM
 485     assert(StubRoutines::_call_stub_return_address != NULL,
 486            "_call_stub_return_address must have been generated before");
 487     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 488 
 489     return start;
 490   }
 491 
 492   // Continuation point for runtime calls returning with a pending
 493   // exception.  The pending exception check happened in the runtime
 494   // or native call stub.  The pending exception in Thread is
 495   // converted into a Java-level exception.
 496   //
 497   // Contract with Java-level exception handlers:
 498   // rax: exception
 499   // rdx: throwing pc
 500   //
 501   // NOTE: At entry of this stub, exception-pc must be on stack !!
 502 
 503   address generate_forward_exception() {
 504     StubCodeMark mark(this, "StubRoutines", "forward exception");
 505     address start = __ pc();
 506 
 507     // Upon entry, the sp points to the return address returning into
 508     // Java (interpreted or compiled) code; i.e., the return address
 509     // becomes the throwing pc.
 510     //
 511     // Arguments pushed before the runtime call are still on the stack
 512     // but the exception handler will reset the stack pointer ->
 513     // ignore them.  A potential result in registers can be ignored as
 514     // well.
 515 
 516 #ifdef ASSERT
 517     // make sure this code is only executed if there is a pending exception
 518     {
 519       Label L;
  520       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 521       __ jcc(Assembler::notEqual, L);
 522       __ stop("StubRoutines::forward exception: no pending exception (1)");
 523       __ bind(L);
 524     }
 525 #endif
 526 
 527     // compute exception handler into rbx
 528     __ movptr(c_rarg0, Address(rsp, 0));
 529     BLOCK_COMMENT("call exception_handler_for_return_address");
 530     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 531                          SharedRuntime::exception_handler_for_return_address),
 532                     r15_thread, c_rarg0);
 533     __ mov(rbx, rax);
 534 
 535     // setup rax & rdx, remove return address & clear pending exception
 536     __ pop(rdx);
 537     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 538     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 539 
 540 #ifdef ASSERT
 541     // make sure exception is set
 542     {
 543       Label L;
 544       __ testptr(rax, rax);
 545       __ jcc(Assembler::notEqual, L);
 546       __ stop("StubRoutines::forward exception: no pending exception (2)");
 547       __ bind(L);
 548     }
 549 #endif
 550 
 551     // continue at exception handler (return address removed)
 552     // rax: exception
 553     // rbx: exception handler
 554     // rdx: throwing pc
 555     __ verify_oop(rax);
 556     __ jmp(rbx);
 557 
 558     return start;
 559   }
 560 
 561   // Support for intptr_t OrderAccess::fence()
 562   //
 563   // Arguments :
 564   //
 565   // Result:
 566   address generate_orderaccess_fence() {
 567     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 568     address start = __ pc();
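          // On x86, StoreLoad is the only ordering that needs an explicit instruction;
          // membar(StoreLoad) emits a full fence (typically a locked add to a stack location).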
 569     __ membar(Assembler::StoreLoad);
 570     __ ret(0);
 571 
 572     return start;
 573   }
 574 
 575 
 576   // Support for intptr_t get_previous_sp()
 577   //
 578   // This routine is used to find the previous stack pointer for the
 579   // caller.
 580   address generate_get_previous_sp() {
 581     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
 582     address start = __ pc();
 583 
 584     __ movptr(rax, rsp);
 585     __ addptr(rax, 8); // return address is at the top of the stack.
 586     __ ret(0);
 587 
 588     return start;
 589   }
 590 
 591   //----------------------------------------------------------------------------------------------------
 592   // Support for void verify_mxcsr()
 593   //
 594   // This routine is used with -Xcheck:jni to verify that native
 595   // JNI code does not return to Java code without restoring the
 596   // MXCSR register to our expected state.
 597 
 598   address generate_verify_mxcsr() {
 599     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 600     address start = __ pc();
 601 
 602     const Address mxcsr_save(rsp, 0);
 603 
 604     if (CheckJNICalls) {
 605       Label ok_ret;
 606       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 607       __ push(rax);
 608       __ subptr(rsp, wordSize);      // allocate a temp location
 609       __ stmxcsr(mxcsr_save);
 610       __ movl(rax, mxcsr_save);
 611       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 612       __ cmp32(rax, mxcsr_std);
 613       __ jcc(Assembler::equal, ok_ret);
 614 
 615       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 616 
 617       __ ldmxcsr(mxcsr_std);
 618 
 619       __ bind(ok_ret);
 620       __ addptr(rsp, wordSize);
 621       __ pop(rax);
 622     }
 623 
 624     __ ret(0);
 625 
 626     return start;
 627   }
 628 
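       // The *_fixup stubs below compute the Java-mandated result of a float/double
       // to int/long conversion for the cases the hardware gets wrong: cvttss2si and
       // cvttsd2si return the "integer indefinite" value (min_jint/min_jlong) for NaN
       // and out-of-range inputs, whereas Java requires NaN -> 0 and saturation to
       // min/max on overflow.  Each stub reads the original FP bits from the 'inout'
       // stack slot and overwrites that slot with the corrected integer result.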
 629   address generate_f2i_fixup() {
 630     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
 631     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 632 
 633     address start = __ pc();
 634 
 635     Label L;
 636 
 637     __ push(rax);
 638     __ push(c_rarg3);
 639     __ push(c_rarg2);
 640     __ push(c_rarg1);
 641 
 642     __ movl(rax, 0x7f800000);
 643     __ xorl(c_rarg3, c_rarg3);
 644     __ movl(c_rarg2, inout);
 645     __ movl(c_rarg1, c_rarg2);
 646     __ andl(c_rarg1, 0x7fffffff);
 647     __ cmpl(rax, c_rarg1); // NaN? -> 0
 648     __ jcc(Assembler::negative, L);
 649     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
 650     __ movl(c_rarg3, 0x80000000);
 651     __ movl(rax, 0x7fffffff);
 652     __ cmovl(Assembler::positive, c_rarg3, rax);
 653 
 654     __ bind(L);
 655     __ movptr(inout, c_rarg3);
 656 
 657     __ pop(c_rarg1);
 658     __ pop(c_rarg2);
 659     __ pop(c_rarg3);
 660     __ pop(rax);
 661 
 662     __ ret(0);
 663 
 664     return start;
 665   }
 666 
 667   address generate_f2l_fixup() {
 668     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
 669     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 670     address start = __ pc();
 671 
 672     Label L;
 673 
 674     __ push(rax);
 675     __ push(c_rarg3);
 676     __ push(c_rarg2);
 677     __ push(c_rarg1);
 678 
 679     __ movl(rax, 0x7f800000);
 680     __ xorl(c_rarg3, c_rarg3);
 681     __ movl(c_rarg2, inout);
 682     __ movl(c_rarg1, c_rarg2);
 683     __ andl(c_rarg1, 0x7fffffff);
 684     __ cmpl(rax, c_rarg1); // NaN? -> 0
 685     __ jcc(Assembler::negative, L);
 686     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
 687     __ mov64(c_rarg3, 0x8000000000000000);
 688     __ mov64(rax, 0x7fffffffffffffff);
 689     __ cmov(Assembler::positive, c_rarg3, rax);
 690 
 691     __ bind(L);
 692     __ movptr(inout, c_rarg3);
 693 
 694     __ pop(c_rarg1);
 695     __ pop(c_rarg2);
 696     __ pop(c_rarg3);
 697     __ pop(rax);
 698 
 699     __ ret(0);
 700 
 701     return start;
 702   }
 703 
 704   address generate_d2i_fixup() {
 705     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
 706     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 707 
 708     address start = __ pc();
 709 
 710     Label L;
 711 
 712     __ push(rax);
 713     __ push(c_rarg3);
 714     __ push(c_rarg2);
 715     __ push(c_rarg1);
 716     __ push(c_rarg0);
 717 
 718     __ movl(rax, 0x7ff00000);
 719     __ movq(c_rarg2, inout);
 720     __ movl(c_rarg3, c_rarg2);
 721     __ mov(c_rarg1, c_rarg2);
 722     __ mov(c_rarg0, c_rarg2);
 723     __ negl(c_rarg3);
 724     __ shrptr(c_rarg1, 0x20);
 725     __ orl(c_rarg3, c_rarg2);
 726     __ andl(c_rarg1, 0x7fffffff);
 727     __ xorl(c_rarg2, c_rarg2);
 728     __ shrl(c_rarg3, 0x1f);
 729     __ orl(c_rarg1, c_rarg3);
 730     __ cmpl(rax, c_rarg1);
 731     __ jcc(Assembler::negative, L); // NaN -> 0
 732     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
 733     __ movl(c_rarg2, 0x80000000);
 734     __ movl(rax, 0x7fffffff);
 735     __ cmov(Assembler::positive, c_rarg2, rax);
 736 
 737     __ bind(L);
 738     __ movptr(inout, c_rarg2);
 739 
 740     __ pop(c_rarg0);
 741     __ pop(c_rarg1);
 742     __ pop(c_rarg2);
 743     __ pop(c_rarg3);
 744     __ pop(rax);
 745 
 746     __ ret(0);
 747 
 748     return start;
 749   }
 750 
 751   address generate_d2l_fixup() {
 752     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
 753     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 754 
 755     address start = __ pc();
 756 
 757     Label L;
 758 
 759     __ push(rax);
 760     __ push(c_rarg3);
 761     __ push(c_rarg2);
 762     __ push(c_rarg1);
 763     __ push(c_rarg0);
 764 
 765     __ movl(rax, 0x7ff00000);
 766     __ movq(c_rarg2, inout);
 767     __ movl(c_rarg3, c_rarg2);
 768     __ mov(c_rarg1, c_rarg2);
 769     __ mov(c_rarg0, c_rarg2);
 770     __ negl(c_rarg3);
 771     __ shrptr(c_rarg1, 0x20);
 772     __ orl(c_rarg3, c_rarg2);
 773     __ andl(c_rarg1, 0x7fffffff);
 774     __ xorl(c_rarg2, c_rarg2);
 775     __ shrl(c_rarg3, 0x1f);
 776     __ orl(c_rarg1, c_rarg3);
 777     __ cmpl(rax, c_rarg1);
 778     __ jcc(Assembler::negative, L); // NaN -> 0
 779     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
 780     __ mov64(c_rarg2, 0x8000000000000000);
 781     __ mov64(rax, 0x7fffffffffffffff);
 782     __ cmovq(Assembler::positive, c_rarg2, rax);
 783 
 784     __ bind(L);
 785     __ movq(inout, c_rarg2);
 786 
 787     __ pop(c_rarg0);
 788     __ pop(c_rarg1);
 789     __ pop(c_rarg2);
 790     __ pop(c_rarg3);
 791     __ pop(rax);
 792 
 793     __ ret(0);
 794 
 795     return start;
 796   }
 797 















 798   address generate_popcount_avx_lut(const char *stub_name) {
 799     __ align64();
 800     StubCodeMark mark(this, "StubRoutines", stub_name);
 801     address start = __ pc();
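          // Lookup table holding the population count of every 4-bit nibble value
          // (0..15), replicated to fill a 64-byte vector; used by the pshufb-based
          // vector popcount lowering.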
 802     __ emit_data64(0x0302020102010100, relocInfo::none);
 803     __ emit_data64(0x0403030203020201, relocInfo::none);
 804     __ emit_data64(0x0302020102010100, relocInfo::none);
 805     __ emit_data64(0x0403030203020201, relocInfo::none);
 806     __ emit_data64(0x0302020102010100, relocInfo::none);
 807     __ emit_data64(0x0403030203020201, relocInfo::none);
 808     __ emit_data64(0x0302020102010100, relocInfo::none);
 809     __ emit_data64(0x0403030203020201, relocInfo::none);
 810     return start;
 811   }
 812 
 813   address generate_iota_indices(const char *stub_name) {
 814     __ align(CodeEntryAlignment);
 815     StubCodeMark mark(this, "StubRoutines", stub_name);
 816     address start = __ pc();
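          // Ascending byte indices 0x00, 0x01, ..., 0x3F: an identity/'iota' constant
          // used where an index vector is needed (e.g. vector rearrange operations).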
 817     __ emit_data64(0x0706050403020100, relocInfo::none);
 818     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 819     __ emit_data64(0x1716151413121110, relocInfo::none);
 820     __ emit_data64(0x1F1E1D1C1B1A1918, relocInfo::none);
 821     __ emit_data64(0x2726252423222120, relocInfo::none);
 822     __ emit_data64(0x2F2E2D2C2B2A2928, relocInfo::none);
 823     __ emit_data64(0x3736353433323130, relocInfo::none);
 824     __ emit_data64(0x3F3E3D3C3B3A3938, relocInfo::none);
 825     return start;
 826   }
  827
 828   address generate_vector_byte_shuffle_mask(const char *stub_name) {
 829     __ align(CodeEntryAlignment);
 830     StubCodeMark mark(this, "StubRoutines", stub_name);
 831     address start = __ pc();
 832     __ emit_data64(0x7070707070707070, relocInfo::none);
 833     __ emit_data64(0x7070707070707070, relocInfo::none);
 834     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 835     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 836     return start;
 837   }
 838 
 839   address generate_fp_mask(const char *stub_name, int64_t mask) {
 840     __ align(CodeEntryAlignment);
 841     StubCodeMark mark(this, "StubRoutines", stub_name);
 842     address start = __ pc();
 843 
 844     __ emit_data64( mask, relocInfo::none );
 845     __ emit_data64( mask, relocInfo::none );
 846 
 847     return start;
 848   }
 849 
 850   address generate_vector_mask(const char *stub_name, int64_t mask) {
 851     __ align(CodeEntryAlignment);
 852     StubCodeMark mark(this, "StubRoutines", stub_name);
 853     address start = __ pc();
 854 
 855     __ emit_data64(mask, relocInfo::none);
 856     __ emit_data64(mask, relocInfo::none);
 857     __ emit_data64(mask, relocInfo::none);
 858     __ emit_data64(mask, relocInfo::none);
 859     __ emit_data64(mask, relocInfo::none);
 860     __ emit_data64(mask, relocInfo::none);
 861     __ emit_data64(mask, relocInfo::none);
 862     __ emit_data64(mask, relocInfo::none);
 863 
 864     return start;
 865   }
 866 
 867   address generate_vector_byte_perm_mask(const char *stub_name) {
 868     __ align(CodeEntryAlignment);
 869     StubCodeMark mark(this, "StubRoutines", stub_name);
 870     address start = __ pc();
 871 
 872     __ emit_data64(0x0000000000000001, relocInfo::none);
 873     __ emit_data64(0x0000000000000003, relocInfo::none);
 874     __ emit_data64(0x0000000000000005, relocInfo::none);
 875     __ emit_data64(0x0000000000000007, relocInfo::none);
 876     __ emit_data64(0x0000000000000000, relocInfo::none);
 877     __ emit_data64(0x0000000000000002, relocInfo::none);
 878     __ emit_data64(0x0000000000000004, relocInfo::none);
 879     __ emit_data64(0x0000000000000006, relocInfo::none);
 880 
 881     return start;
 882   }
 883 
 884   address generate_vector_fp_mask(const char *stub_name, int64_t mask) {
 885     __ align(CodeEntryAlignment);
 886     StubCodeMark mark(this, "StubRoutines", stub_name);
 887     address start = __ pc();
 888 
 889     __ emit_data64(mask, relocInfo::none);
 890     __ emit_data64(mask, relocInfo::none);
 891     __ emit_data64(mask, relocInfo::none);
 892     __ emit_data64(mask, relocInfo::none);
 893     __ emit_data64(mask, relocInfo::none);
 894     __ emit_data64(mask, relocInfo::none);
 895     __ emit_data64(mask, relocInfo::none);
 896     __ emit_data64(mask, relocInfo::none);
 897 
 898     return start;
 899   }
 900 
 901   address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
 902                                      int32_t val0, int32_t val1, int32_t val2, int32_t val3,
 903                                      int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
 904                                      int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
 905                                      int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
 906     __ align(CodeEntryAlignment);
 907     StubCodeMark mark(this, "StubRoutines", stub_name);
 908     address start = __ pc();
 909 
 910     assert(len != Assembler::AVX_NoVec, "vector len must be specified");
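          // Emit 4, 8 or 16 32-bit constants depending on the requested vector
          // length (128/256/512 bit); unspecified trailing values default to 0.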
 911     __ emit_data(val0, relocInfo::none, 0);
 912     __ emit_data(val1, relocInfo::none, 0);
 913     __ emit_data(val2, relocInfo::none, 0);
 914     __ emit_data(val3, relocInfo::none, 0);
 915     if (len >= Assembler::AVX_256bit) {
 916       __ emit_data(val4, relocInfo::none, 0);
 917       __ emit_data(val5, relocInfo::none, 0);
 918       __ emit_data(val6, relocInfo::none, 0);
 919       __ emit_data(val7, relocInfo::none, 0);
 920       if (len >= Assembler::AVX_512bit) {
 921         __ emit_data(val8, relocInfo::none, 0);
 922         __ emit_data(val9, relocInfo::none, 0);
 923         __ emit_data(val10, relocInfo::none, 0);
 924         __ emit_data(val11, relocInfo::none, 0);
 925         __ emit_data(val12, relocInfo::none, 0);
 926         __ emit_data(val13, relocInfo::none, 0);
 927         __ emit_data(val14, relocInfo::none, 0);
 928         __ emit_data(val15, relocInfo::none, 0);
 929       }
 930     }
 931 
 932     return start;
 933   }
 934 
 935   // Non-destructive plausibility checks for oops
 936   //
 937   // Arguments:
 938   //    all args on stack!
 939   //
 940   // Stack after saving c_rarg3:
 941   //    [tos + 0]: saved c_rarg3
 942   //    [tos + 1]: saved c_rarg2
 943   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
 944   //    [tos + 3]: saved flags
 945   //    [tos + 4]: return address
 946   //  * [tos + 5]: error message (char*)
 947   //  * [tos + 6]: object to verify (oop)
 948   //  * [tos + 7]: saved rax - saved by caller and bashed
 949   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
 950   //  * = popped on exit
 951   address generate_verify_oop() {
 952     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 953     address start = __ pc();
 954 
 955     Label exit, error;
 956 
 957     __ pushf();
 958     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 959 
 960     __ push(r12);
 961 
 962     // save c_rarg2 and c_rarg3
 963     __ push(c_rarg2);
 964     __ push(c_rarg3);
 965 
 966     enum {
 967            // After previous pushes.
 968            oop_to_verify = 6 * wordSize,
 969            saved_rax     = 7 * wordSize,
 970            saved_r10     = 8 * wordSize,
 971 
 972            // Before the call to MacroAssembler::debug(), see below.
 973            return_addr   = 16 * wordSize,
 974            error_msg     = 17 * wordSize
 975     };
 976 
 977     // get object
 978     __ movptr(rax, Address(rsp, oop_to_verify));
 979 
 980     // make sure object is 'reasonable'
 981     __ testptr(rax, rax);
 982     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
 983 
 984 #if INCLUDE_ZGC
 985     if (UseZGC) {
 986       // Check if metadata bits indicate a bad oop
 987       __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
 988       __ jcc(Assembler::notZero, error);
 989     }
 990 #endif
 991 
 992     // Check if the oop is in the right area of memory
 993     __ movptr(c_rarg2, rax);
 994     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 995     __ andptr(c_rarg2, c_rarg3);
 996     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 997     __ cmpptr(c_rarg2, c_rarg3);
 998     __ jcc(Assembler::notZero, error);
 999 
 1000     // make sure klass is 'reasonable', i.e. not NULL.
1001     __ load_klass(rax, rax, rscratch1);  // get klass
1002     __ testptr(rax, rax);
1003     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1004 
1005     // return if everything seems ok
1006     __ bind(exit);
1007     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1008     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1009     __ pop(c_rarg3);                             // restore c_rarg3
1010     __ pop(c_rarg2);                             // restore c_rarg2
1011     __ pop(r12);                                 // restore r12
1012     __ popf();                                   // restore flags
1013     __ ret(4 * wordSize);                        // pop caller saved stuff
1014 
1015     // handle errors
1016     __ bind(error);
1017     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1018     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1019     __ pop(c_rarg3);                             // get saved c_rarg3 back
1020     __ pop(c_rarg2);                             // get saved c_rarg2 back
1021     __ pop(r12);                                 // get saved r12 back
1022     __ popf();                                   // get saved flags off stack --
1023                                                  // will be ignored
1024 
1025     __ pusha();                                  // push registers
 1026                                                  // (rip is already
 1027                                                  // pushed)
1028     // debug(char* msg, int64_t pc, int64_t regs[])
1029     // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
1030     // pushed all the registers, so now the stack looks like:
1031     //     [tos +  0] 16 saved registers
1032     //     [tos + 16] return address
1033     //   * [tos + 17] error message (char*)
1034     //   * [tos + 18] object to verify (oop)
1035     //   * [tos + 19] saved rax - saved by caller and bashed
1036     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1037     //   * = popped on exit
1038 
1039     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1040     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1041     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1042     __ mov(r12, rsp);                               // remember rsp
1043     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1044     __ andptr(rsp, -16);                            // align stack as required by ABI
1045     BLOCK_COMMENT("call MacroAssembler::debug");
1046     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1047     __ hlt();
1048     return start;
1049   }
1050 
1051   //
1052   // Verify that a register contains clean 32-bits positive value
1053   // (high 32-bits are 0) so it could be used in 64-bits shifts.
1054   //
1055   //  Input:
1056   //    Rint  -  32-bits value
1057   //    Rtmp  -  scratch
1058   //
1059   void assert_clean_int(Register Rint, Register Rtmp) {
1060 #ifdef ASSERT
1061     Label L;
1062     assert_different_registers(Rtmp, Rint);
1063     __ movslq(Rtmp, Rint);
1064     __ cmpq(Rtmp, Rint);
1065     __ jcc(Assembler::equal, L);
1066     __ stop("high 32-bits of int value are not 0");
1067     __ bind(L);
1068 #endif
1069   }
1070 
1071   //  Generate overlap test for array copy stubs
1072   //
1073   //  Input:
1074   //     c_rarg0 - from
1075   //     c_rarg1 - to
1076   //     c_rarg2 - element count
1077   //
1078   //  Output:
1079   //     rax   - &from[element count - 1]
1080   //
1081   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1082     assert(no_overlap_target != NULL, "must be generated");
1083     array_overlap_test(no_overlap_target, NULL, sf);
1084   }
1085   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1086     array_overlap_test(NULL, &L_no_overlap, sf);
1087   }
1088   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1089     const Register from     = c_rarg0;
1090     const Register to       = c_rarg1;
1091     const Register count    = c_rarg2;
1092     const Register end_from = rax;
1093 
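           // A forward (disjoint) copy is safe when 'to' <= 'from' or when 'to' lies
           // at or beyond 'from' + count elements; otherwise fall through to the
           // conjoint (backward) copy code.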
1094     __ cmpptr(to, from);
1095     __ lea(end_from, Address(from, count, sf, 0));
1096     if (NOLp == NULL) {
1097       ExternalAddress no_overlap(no_overlap_target);
1098       __ jump_cc(Assembler::belowEqual, no_overlap);
1099       __ cmpptr(to, end_from);
1100       __ jump_cc(Assembler::aboveEqual, no_overlap);
1101     } else {
1102       __ jcc(Assembler::belowEqual, (*NOLp));
1103       __ cmpptr(to, end_from);
1104       __ jcc(Assembler::aboveEqual, (*NOLp));
1105     }
1106   }
1107 
1108   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1109   //
1110   // Outputs:
1111   //    rdi - rcx
1112   //    rsi - rdx
1113   //    rdx - r8
1114   //    rcx - r9
1115   //
 1116   // Registers r9 and r10 are used to save rdi and rsi on Windows, since the
 1117   // latter are non-volatile there.  r9 and r10 should not be used by the caller.
1118   //
1119   DEBUG_ONLY(bool regs_in_thread;)
1120 
1121   void setup_arg_regs(int nargs = 3) {
1122     const Register saved_rdi = r9;
1123     const Register saved_rsi = r10;
1124     assert(nargs == 3 || nargs == 4, "else fix");
1125 #ifdef _WIN64
1126     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1127            "unexpected argument registers");
1128     if (nargs >= 4)
1129       __ mov(rax, r9);  // r9 is also saved_rdi
1130     __ movptr(saved_rdi, rdi);
1131     __ movptr(saved_rsi, rsi);
1132     __ mov(rdi, rcx); // c_rarg0
1133     __ mov(rsi, rdx); // c_rarg1
1134     __ mov(rdx, r8);  // c_rarg2
1135     if (nargs >= 4)
1136       __ mov(rcx, rax); // c_rarg3 (via rax)
1137 #else
1138     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1139            "unexpected argument registers");
1140 #endif
1141     DEBUG_ONLY(regs_in_thread = false;)
1142   }
1143 
1144   void restore_arg_regs() {
1145     assert(!regs_in_thread, "wrong call to restore_arg_regs");
1146     const Register saved_rdi = r9;
1147     const Register saved_rsi = r10;
1148 #ifdef _WIN64
1149     __ movptr(rdi, saved_rdi);
1150     __ movptr(rsi, saved_rsi);
1151 #endif
1152   }
1153 
1154   // This is used in places where r10 is a scratch register, and can
1155   // be adapted if r9 is needed also.
1156   void setup_arg_regs_using_thread() {
1157     const Register saved_r15 = r9;
1158 #ifdef _WIN64
1159     __ mov(saved_r15, r15);  // r15 is callee saved and needs to be restored
1160     __ get_thread(r15_thread);
1161     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1162            "unexpected argument registers");
1163     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
1164     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);
1165 
1166     __ mov(rdi, rcx); // c_rarg0
1167     __ mov(rsi, rdx); // c_rarg1
1168     __ mov(rdx, r8);  // c_rarg2
1169 #else
1170     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1171            "unexpected argument registers");
1172 #endif
1173     DEBUG_ONLY(regs_in_thread = true;)
1174   }
1175 
1176   void restore_arg_regs_using_thread() {
1177     assert(regs_in_thread, "wrong call to restore_arg_regs");
1178     const Register saved_r15 = r9;
1179 #ifdef _WIN64
1180     __ get_thread(r15_thread);
1181     __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
1182     __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
1183     __ mov(r15, saved_r15);  // r15 is callee saved and needs to be restored
1184 #endif
1185   }
1186 
1187   // Copy big chunks forward
1188   //
1189   // Inputs:
 1190   //   end_from     - source array's end address
 1191   //   end_to       - destination array's end address
 1192   //   qword_count  - 64-bit element count, negative
 1193   //   to           - scratch
 1194   //   L_copy_bytes - entry label
 1195   //   L_copy_8_bytes  - exit label
1196   //
1197   void copy_bytes_forward(Register end_from, Register end_to,
1198                              Register qword_count, Register to,
1199                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1200     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1201     Label L_loop;
1202     __ align(OptoLoopAlignment);
1203     if (UseUnalignedLoadStores) {
1204       Label L_end;
1205       __ BIND(L_loop);
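             // Each iteration copies 64 bytes (8 qwords); qword_count is negative
             // and is advanced toward zero.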
1206       if (UseAVX >= 2) {
1207         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1208         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1209         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1210         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1211       } else {
1212         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1213         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1214         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1215         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1216         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1217         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1218         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1219         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1220       }
1221 
1222       __ BIND(L_copy_bytes);
1223       __ addptr(qword_count, 8);
1224       __ jcc(Assembler::lessEqual, L_loop);
1225       __ subptr(qword_count, 4);  // sub(8) and add(4)
1226       __ jccb(Assembler::greater, L_end);
1227       // Copy trailing 32 bytes
1228       if (UseAVX >= 2) {
1229         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1230         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1231       } else {
1232         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1233         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1234         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1235         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1236       }
1237       __ addptr(qword_count, 4);
1238       __ BIND(L_end);
1239     } else {
 1240       // Copy 32 bytes per iteration
1241       __ BIND(L_loop);
1242       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1243       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1244       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1245       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1246       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1247       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1248       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1249       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1250 
1251       __ BIND(L_copy_bytes);
1252       __ addptr(qword_count, 4);
1253       __ jcc(Assembler::lessEqual, L_loop);
1254     }
1255     __ subptr(qword_count, 4);
1256     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1257   }
1258 
1259   // Copy big chunks backward
1260   //
1261   // Inputs:
 1262   //   from         - source array address
 1263   //   dest         - destination array address
 1264   //   qword_count  - 64-bit element count
 1265   //   to           - scratch
 1266   //   L_copy_bytes - entry label
 1267   //   L_copy_8_bytes  - exit label
1268   //
1269   void copy_bytes_backward(Register from, Register dest,
1270                               Register qword_count, Register to,
1271                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1272     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1273     Label L_loop;
1274     __ align(OptoLoopAlignment);
1275     if (UseUnalignedLoadStores) {
1276       Label L_end;
1277       __ BIND(L_loop);
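             // Each iteration copies 64 bytes (8 qwords), working downward from the
             // end of the arrays; qword_count counts down toward zero.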
1278       if (UseAVX >= 2) {
1279         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1280         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1281         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1282         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1283       } else {
1284         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1285         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1286         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1287         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1288         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1289         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1290         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1291         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1292       }
1293 
1294       __ BIND(L_copy_bytes);
1295       __ subptr(qword_count, 8);
1296       __ jcc(Assembler::greaterEqual, L_loop);
1297 
1298       __ addptr(qword_count, 4);  // add(8) and sub(4)
1299       __ jccb(Assembler::less, L_end);
1300       // Copy trailing 32 bytes
1301       if (UseAVX >= 2) {
1302         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1303         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1304       } else {
1305         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1306         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1307         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1308         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1309       }
1310       __ subptr(qword_count, 4);
1311       __ BIND(L_end);
1312     } else {
 1313       // Copy 32 bytes per iteration
1314       __ BIND(L_loop);
1315       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1316       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1317       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1318       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1319       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1320       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1321       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1322       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1323 
1324       __ BIND(L_copy_bytes);
1325       __ subptr(qword_count, 4);
1326       __ jcc(Assembler::greaterEqual, L_loop);
1327     }
1328     __ addptr(qword_count, 4);
1329     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1330   }
1331 
1332 #ifndef PRODUCT
 1333     int& get_profile_ctr(int shift) {
 1334       if (shift == 0)
 1335         return SharedRuntime::_jbyte_array_copy_ctr;
 1336       else if (shift == 1)
 1337         return SharedRuntime::_jshort_array_copy_ctr;
 1338       else if (shift == 2)
 1339         return SharedRuntime::_jint_array_copy_ctr;
 1340       else
 1341         return SharedRuntime::_jlong_array_copy_ctr;
 1342     }
1343 #endif
1344 
1345   void setup_argument_regs(BasicType type) {
1346     if (type == T_BYTE || type == T_SHORT) {
1347       setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1348                         // r9 and r10 may be used to save non-volatile registers
1349     } else {
1350       setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1351                                      // r9 is used to save r15_thread
1352     }
1353   }
1354 
1355   void restore_argument_regs(BasicType type) {
1356     if (type == T_BYTE || type == T_SHORT) {
1357       restore_arg_regs();
1358     } else {
1359       restore_arg_regs_using_thread();
1360     }
1361   }
1362 
1363 #if COMPILER2_OR_JVMCI
 1364   // Note: The following rules apply to the AVX3-optimized arraycopy stubs:
 1365   // - If the target supports the AVX3 features (BW+VL+F), the implementation uses 32 byte
 1366   //   vectors (YMMs) for both the special cases (various small block sizes) and the aligned
 1367   //   copy loop. This is the default configuration.
 1368   // - If the copy length is above AVX3Threshold, the implementation uses 64 byte vectors (ZMMs)
 1369   //   for the main copy loop (and the subsequent tail), since the bulk of the cycles is spent there.
 1370   // - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS has been seen to give
 1371   //   better performance for disjoint copies. For conjoint/backward copies the vector-based
 1372   //   copy performs better.
 1373   // - If the user sets AVX3Threshold=0, the special cases for small block sizes also operate on
 1374   //   64 byte vector registers (ZMMs).
1375 
1376   // Inputs:
1377   //   c_rarg0   - source array address
1378   //   c_rarg1   - destination array address
1379   //   c_rarg2   - element count, treated as ssize_t, can be zero
1380   //
1381   //
1382   // Side Effects:
1383   //   disjoint_copy_avx3_masked is set to the no-overlap entry point
1384   //   used by generate_conjoint_[byte/int/short/long]_copy().
1385   //
1386 
1387   address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1388                                              bool aligned, bool is_oop, bool dest_uninitialized) {
1389     __ align(CodeEntryAlignment);
1390     StubCodeMark mark(this, "StubRoutines", name);
1391     address start = __ pc();
1392     int avx3threshold = VM_Version::avx3_threshold();
1393     bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
1394     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1395     Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1396     const Register from        = rdi;  // source array address
1397     const Register to          = rsi;  // destination array address
1398     const Register count       = rdx;  // elements count
1399     const Register temp1       = r8;
1400     const Register temp2       = r11;
1401     const Register temp3       = rax;
1402     const Register temp4       = rcx;
1403     // End pointers are inclusive, and if count is not zero they point
1404     // to the last unit copied:  end_to[0] := end_from[0]
1405 
1406     __ enter(); // required for proper stackwalking of RuntimeStub frame
1407     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1408 
1409     if (entry != NULL) {
1410       *entry = __ pc();
1411        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1412       BLOCK_COMMENT("Entry:");
1413     }
1414 
1415     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1416     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1417 
1418     setup_argument_regs(type);
1419 
1420     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1421     if (dest_uninitialized) {
1422       decorators |= IS_DEST_UNINITIALIZED;
1423     }
1424     if (aligned) {
1425       decorators |= ARRAYCOPY_ALIGNED;
1426     }
1427     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1428     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1429 
1430     {
1431       // Type(shift)           byte(0), short(1), int(2),   long(3)
1432       int loop_size[]        = { 192,     96,       48,      24};
1433       int threshold[]        = { 4096,    2048,     1024,    512};
1434 
1435       // UnsafeCopyMemory page error: continue after ucm
1436       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1437       // 'from', 'to' and 'count' are now valid
1438 
1439       // temp1 holds remaining count and temp4 holds running count used to compute
1440       // next address offset for start of to/from addresses (temp4 * scale).
1441       __ mov64(temp4, 0);
1442       __ movq(temp1, count);
1443 
1444       // Zero length check.
1445       __ BIND(L_tail);
1446       __ cmpq(temp1, 0);
1447       __ jcc(Assembler::lessEqual, L_exit);
1448 
1449       // Special cases using 32 byte [masked] vector copy operations.
1450       __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1451                                       temp4, temp3, use64byteVector, L_entry, L_exit);
1452 
1453       // PRE-MAIN-POST loop for aligned copy.
1454       __ BIND(L_entry);
1455 
1456       if (avx3threshold != 0) {
1457         __ cmpq(count, threshold[shift]);
1458         if (MaxVectorSize == 64) {
1459           // Copy using 64 byte vectors.
1460           __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1461         } else {
1462           assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
1463           // REP MOVS offers a faster copy path.
1464           __ jcc(Assembler::greaterEqual, L_repmovs);
1465         }
1466       }
1467 
1468       if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
1469         // Partial copy to make dst address 32 byte aligned.
1470         __ movq(temp2, to);
1471         __ andq(temp2, 31);
1472         __ jcc(Assembler::equal, L_main_pre_loop);
1473 
1474         __ negptr(temp2);
1475         __ addq(temp2, 32);
1476         if (shift) {
1477           __ shrq(temp2, shift);
1478         }
1479         __ movq(temp3, temp2);
1480         __ copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
1481         __ movq(temp4, temp2);
1482         __ movq(temp1, count);
1483         __ subq(temp1, temp2);
1484 
1485         __ cmpq(temp1, loop_size[shift]);
1486         __ jcc(Assembler::less, L_tail);
1487 
1488         __ BIND(L_main_pre_loop);
1489         __ subq(temp1, loop_size[shift]);
1490 
1491         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1492         __ align32();
1493         __ BIND(L_main_loop);
1494            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
1495            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
1496            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128);
1497            __ addptr(temp4, loop_size[shift]);
1498            __ subq(temp1, loop_size[shift]);
1499            __ jcc(Assembler::greater, L_main_loop);
1500 
1501         __ addq(temp1, loop_size[shift]);
1502 
1503         // Tail loop.
1504         __ jmp(L_tail);
1505 
1506         __ BIND(L_repmovs);
1507           __ movq(temp2, temp1);
1508           // Swap to (RSI) and from (RDI) addresses to comply with REP MOVS semantics.
1509           __ movq(temp3, to);
1510           __ movq(to,  from);
1511           __ movq(from, temp3);
1512           // Save to/from for restoration post rep_mov.
1513           __ movq(temp1, to);
1514           __ movq(temp3, from);
1515           if (shift < 3) {
1516             __ shrq(temp2, 3-shift);     // quad word count
1517           }
1518           __ movq(temp4, temp2);         // move quad word count into temp4 (RCX).
1519           __ rep_mov();
1520           __ shlq(temp2, 3);             // convert quad words into byte count.
1521           if (shift) {
1522             __ shrq(temp2, shift);       // type specific count.
1523           }
1524           // Restore original addresses in to/from.
1525           __ movq(to, temp3);
1526           __ movq(from, temp1);
1527           __ movq(temp4, temp2);
1528           __ movq(temp1, count);
1529           __ subq(temp1, temp2);         // trailing part (less than a quad word).
1530           __ jmp(L_tail);
1531       }
1532 
1533       if (MaxVectorSize > 32) {
1534         __ BIND(L_pre_main_post_64);
1535         // Partial copy to make dst address 64 byte aligned.
1536         __ movq(temp2, to);
1537         __ andq(temp2, 63);
1538         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1539 
1540         __ negptr(temp2);
1541         __ addq(temp2, 64);
1542         if (shift) {
1543           __ shrq(temp2, shift);
1544         }
1545         __ movq(temp3, temp2);
1546         __ copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
1547         __ movq(temp4, temp2);
1548         __ movq(temp1, count);
1549         __ subq(temp1, temp2);
1550 
1551         __ cmpq(temp1, loop_size[shift]);
1552         __ jcc(Assembler::less, L_tail64);
1553 
1554         __ BIND(L_main_pre_loop_64bytes);
1555         __ subq(temp1, loop_size[shift]);
1556 
1557         // Main loop with aligned copy block size of 192 bytes at
1558         // 64 byte copy granularity.
1559         __ align32();
1560         __ BIND(L_main_loop_64bytes);
1561            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
1562            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
1563            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
1564            __ addptr(temp4, loop_size[shift]);
1565            __ subq(temp1, loop_size[shift]);
1566            __ jcc(Assembler::greater, L_main_loop_64bytes);
1567 
1568         __ addq(temp1, loop_size[shift]);
1569         // Zero length check.
1570         __ jcc(Assembler::lessEqual, L_exit);
1571 
1572         __ BIND(L_tail64);
1573 
1574         // Tail handling using 64 byte [masked] vector copy operations.
1575         use64byteVector = true;
1576         __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1577                                         temp4, temp3, use64byteVector, L_entry, L_exit);
1578       }
1579       __ BIND(L_exit);
1580     }
1581 
1582     address ucme_exit_pc = __ pc();
1583     // When called from generic_arraycopy, r11 contains specific values used
1584     // during the arraycopy epilogue, so re-initialize r11 here.
1585     if (is_oop) {
1586       __ movq(r11, shift == 3 ? count : to);
1587     }
1588     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1589     restore_argument_regs(type);
1590     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1591     __ xorptr(rax, rax); // return 0
1592     __ vzeroupper();
1593     __ leave(); // required for proper stackwalking of RuntimeStub frame
1594     __ ret(0);
1595     return start;
1596   }
1597 
1598   // Inputs:
1599   //   c_rarg0   - source array address
1600   //   c_rarg1   - destination array address
1601   //   c_rarg2   - element count, treated as ssize_t, can be zero
1602   //
1603   //
1604   address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1605                                              address nooverlap_target, bool aligned, bool is_oop,
1606                                              bool dest_uninitialized) {
1607     __ align(CodeEntryAlignment);
1608     StubCodeMark mark(this, "StubRoutines", name);
1609     address start = __ pc();
1610 
1611     int avx3threshold = VM_Version::avx3_threshold();
1612     bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
1613 
1614     Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1615     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1616     const Register from        = rdi;  // source array address
1617     const Register to          = rsi;  // destination array address
1618     const Register count       = rdx;  // elements count
1619     const Register temp1       = r8;
1620     const Register temp2       = rcx;
1621     const Register temp3       = r11;
1622     const Register temp4       = rax;
1623     // End pointers are inclusive, and if count is not zero they point
1624     // to the last unit copied:  end_to[0] := end_from[0]
1625 
1626     __ enter(); // required for proper stackwalking of RuntimeStub frame
1627     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1628 
1629     if (entry != NULL) {
1630       *entry = __ pc();
1631        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1632       BLOCK_COMMENT("Entry:");
1633     }
1634 
1635     array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
1636 
1637     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1638     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1639 
1640     setup_argument_regs(type);
1641 
1642     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1643     if (dest_uninitialized) {
1644       decorators |= IS_DEST_UNINITIALIZED;
1645     }
1646     if (aligned) {
1647       decorators |= ARRAYCOPY_ALIGNED;
1648     }
1649     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1650     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1651     {
1652       // Type(shift)       byte(0), short(1), int(2),   long(3)
1653       int loop_size[]   = { 192,     96,       48,      24};
1654       int threshold[]   = { 4096,    2048,     1024,    512};
1655 
1656       // UnsafeCopyMemory page error: continue after ucm
1657       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1658       // 'from', 'to' and 'count' are now valid
1659 
1660       // temp1 holds remaining count.
1661       __ movq(temp1, count);
1662 
1663       // Zero length check.
1664       __ BIND(L_tail);
1665       __ cmpq(temp1, 0);
1666       __ jcc(Assembler::lessEqual, L_exit);
1667 
1668       __ mov64(temp2, 0);
1669       __ movq(temp3, temp1);
1670       // Special cases using 32 byte [masked] vector copy operations.
1671       __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1672                                                temp4, use64byteVector, L_entry, L_exit);
1673 
1674       // PRE-MAIN-POST loop for aligned copy.
1675       __ BIND(L_entry);
1676 
1677       if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
1678         __ cmpq(temp1, threshold[shift]);
1679         __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1680       }
1681 
1682       if ((MaxVectorSize < 64)  || (avx3threshold != 0)) {
1683         // Partial copy to make dst address 32 byte aligned.
1684         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1685         __ andq(temp2, 31);
1686         __ jcc(Assembler::equal, L_main_pre_loop);
1687 
1688         if (shift) {
1689           __ shrq(temp2, shift);
1690         }
1691         __ subq(temp1, temp2);
1692         __ copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
1693 
1694         __ cmpq(temp1, loop_size[shift]);
1695         __ jcc(Assembler::less, L_tail);
1696 
1697         __ BIND(L_main_pre_loop);
1698 
1699         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1700         __ align32();
1701         __ BIND(L_main_loop);
1702            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
1703            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
1704            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192);
1705            __ subptr(temp1, loop_size[shift]);
1706            __ cmpq(temp1, loop_size[shift]);
1707            __ jcc(Assembler::greater, L_main_loop);
1708 
1709         // Tail loop.
1710         __ jmp(L_tail);
1711       }
1712 
1713       if (MaxVectorSize > 32) {
1714         __ BIND(L_pre_main_post_64);
1715         // Partial copy to make dst address 64 byte aligned.
1716         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1717         __ andq(temp2, 63);
1718         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1719 
1720         if (shift) {
1721           __ shrq(temp2, shift);
1722         }
1723         __ subq(temp1, temp2);
1724         __ copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
1725 
1726         __ cmpq(temp1, loop_size[shift]);
1727         __ jcc(Assembler::less, L_tail64);
1728 
1729         __ BIND(L_main_pre_loop_64bytes);
1730 
1731         // Main loop with aligned copy block size of 192 bytes at
1732         // 64 byte copy granularity.
1733         __ align32();
1734         __ BIND(L_main_loop_64bytes);
1735            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
1736            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
1737            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
1738            __ subq(temp1, loop_size[shift]);
1739            __ cmpq(temp1, loop_size[shift]);
1740            __ jcc(Assembler::greater, L_main_loop_64bytes);
1741 
1742         // Zero length check.
1743         __ cmpq(temp1, 0);
1744         __ jcc(Assembler::lessEqual, L_exit);
1745 
1746         __ BIND(L_tail64);
1747 
1748         // Tail handling using 64 byte [masked] vector copy operations.
1749         use64byteVector = true;
1750         __ mov64(temp2, 0);
1751         __ movq(temp3, temp1);
1752         __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1753                                                  temp4, use64byteVector, L_entry, L_exit);
1754       }
1755       __ BIND(L_exit);
1756     }
1757     address ucme_exit_pc = __ pc();
1758     // When called from generic_arraycopy, r11 contains specific values used
1759     // during the arraycopy epilogue, so re-initialize r11 here.
1760     if (is_oop) {
1761       __ movq(r11, count);
1762     }
1763     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1764     restore_argument_regs(type);
1765     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1766     __ xorptr(rax, rax); // return 0
1767     __ vzeroupper();
1768     __ leave(); // required for proper stackwalking of RuntimeStub frame
1769     __ ret(0);
1770     return start;
1771   }
1772 #endif // COMPILER2_OR_JVMCI
1773 
1774 
1775   // Arguments:
1776   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1777   //             ignored
1778   //   name    - stub name string
1779   //
1780   // Inputs:
1781   //   c_rarg0   - source array address
1782   //   c_rarg1   - destination array address
1783   //   c_rarg2   - element count, treated as ssize_t, can be zero
1784   //
1785   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1786   // we let the hardware handle it.  The one to eight bytes within words,
1787   // dwords or qwords that span cache line boundaries will still be loaded
1788   // and stored atomically.
1789   //
1790   // Side Effects:
1791   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1792   //   used by generate_conjoint_byte_copy().
1793   //
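  // A rough scalar sketch of the copy scheme below (illustration only; the
  // generated code actually walks a negative qword index off an inclusive
  // end pointer):
  //
  //   copy (byte_count >> 3) qwords, 8 bytes at a time, low to high;
  //   if (byte_count & 4) copy one trailing dword;
  //   if (byte_count & 2) copy one trailing word;
  //   if (byte_count & 1) copy one trailing byte;
  //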
1794   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1795 #if COMPILER2_OR_JVMCI
1796     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1797        return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
1798                                                  aligned, false, false);
1799     }
1800 #endif
1801     __ align(CodeEntryAlignment);
1802     StubCodeMark mark(this, "StubRoutines", name);
1803     address start = __ pc();
1804 
1805     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1806     Label L_copy_byte, L_exit;
1807     const Register from        = rdi;  // source array address
1808     const Register to          = rsi;  // destination array address
1809     const Register count       = rdx;  // elements count
1810     const Register byte_count  = rcx;
1811     const Register qword_count = count;
1812     const Register end_from    = from; // source array end address
1813     const Register end_to      = to;   // destination array end address
1814     // End pointers are inclusive, and if count is not zero they point
1815     // to the last unit copied:  end_to[0] := end_from[0]
1816 
1817     __ enter(); // required for proper stackwalking of RuntimeStub frame
1818     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1819 
1820     if (entry != NULL) {
1821       *entry = __ pc();
1822        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1823       BLOCK_COMMENT("Entry:");
1824     }
1825 
1826     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1827                       // r9 and r10 may be used to save non-volatile registers
1828 
1829     {
1830       // UnsafeCopyMemory page error: continue after ucm
1831       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1832       // 'from', 'to' and 'count' are now valid
1833       __ movptr(byte_count, count);
1834       __ shrptr(count, 3); // count => qword_count
1835 
1836       // Copy from low to high addresses.  Use 'to' as scratch.
1837       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1838       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1839       __ negptr(qword_count); // make the count negative
1840       __ jmp(L_copy_bytes);
1841 
1842       // Copy trailing qwords
1843     __ BIND(L_copy_8_bytes);
1844       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1845       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1846       __ increment(qword_count);
1847       __ jcc(Assembler::notZero, L_copy_8_bytes);
1848 
1849       // Check for and copy trailing dword
1850     __ BIND(L_copy_4_bytes);
1851       __ testl(byte_count, 4);
1852       __ jccb(Assembler::zero, L_copy_2_bytes);
1853       __ movl(rax, Address(end_from, 8));
1854       __ movl(Address(end_to, 8), rax);
1855 
1856       __ addptr(end_from, 4);
1857       __ addptr(end_to, 4);
1858 
1859       // Check for and copy trailing word
1860     __ BIND(L_copy_2_bytes);
1861       __ testl(byte_count, 2);
1862       __ jccb(Assembler::zero, L_copy_byte);
1863       __ movw(rax, Address(end_from, 8));
1864       __ movw(Address(end_to, 8), rax);
1865 
1866       __ addptr(end_from, 2);
1867       __ addptr(end_to, 2);
1868 
1869       // Check for and copy trailing byte
1870     __ BIND(L_copy_byte);
1871       __ testl(byte_count, 1);
1872       __ jccb(Assembler::zero, L_exit);
1873       __ movb(rax, Address(end_from, 8));
1874       __ movb(Address(end_to, 8), rax);
1875     }
1876   __ BIND(L_exit);
1877     address ucme_exit_pc = __ pc();
1878     restore_arg_regs();
1879     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1880     __ xorptr(rax, rax); // return 0
1881     __ vzeroupper();
1882     __ leave(); // required for proper stackwalking of RuntimeStub frame
1883     __ ret(0);
1884 
1885     {
1886       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
1887       // Copy in multi-byte chunks
1888       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1889       __ jmp(L_copy_4_bytes);
1890     }
1891     return start;
1892   }
1893 
1894   // Arguments:
1895   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1896   //             ignored
1897   //   name    - stub name string
1898   //
1899   // Inputs:
1900   //   c_rarg0   - source array address
1901   //   c_rarg1   - destination array address
1902   //   c_rarg2   - element count, treated as ssize_t, can be zero
1903   //
1904   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1905   // we let the hardware handle it.  The one to eight bytes within words,
1906   // dwords or qwords that span cache line boundaries will still be loaded
1907   // and stored atomically.
1908   //
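  // A rough scalar sketch of the backward copy scheme below (illustration only):
  //
  //   if (byte_count & 1) copy the trailing byte (highest address) first;
  //   if (byte_count & 2) copy the next trailing word;
  //   if (byte_count & 4) copy the next trailing dword;
  //   copy the remaining (byte_count >> 3) qwords, 8 bytes at a time, high to low;
  //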
1909   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1910                                       address* entry, const char *name) {
1911 #if COMPILER2_OR_JVMCI
1912     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1913        return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
1914                                                  nooverlap_target, aligned, false, false);
1915     }
1916 #endif
1917     __ align(CodeEntryAlignment);
1918     StubCodeMark mark(this, "StubRoutines", name);
1919     address start = __ pc();
1920 
1921     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1922     const Register from        = rdi;  // source array address
1923     const Register to          = rsi;  // destination array address
1924     const Register count       = rdx;  // elements count
1925     const Register byte_count  = rcx;
1926     const Register qword_count = count;
1927 
1928     __ enter(); // required for proper stackwalking of RuntimeStub frame
1929     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1930 
1931     if (entry != NULL) {
1932       *entry = __ pc();
1933       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1934       BLOCK_COMMENT("Entry:");
1935     }
1936 
1937     array_overlap_test(nooverlap_target, Address::times_1);
1938     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1939                       // r9 and r10 may be used to save non-volatile registers
1940 
1941     {
1942       // UnsafeCopyMemory page error: continue after ucm
1943       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1944       // 'from', 'to' and 'count' are now valid
1945       __ movptr(byte_count, count);
1946       __ shrptr(count, 3);   // count => qword_count
1947 
1948       // Copy from high to low addresses.
1949 
1950       // Check for and copy trailing byte
1951       __ testl(byte_count, 1);
1952       __ jcc(Assembler::zero, L_copy_2_bytes);
1953       __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1954       __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1955       __ decrement(byte_count); // Adjust for possible trailing word
1956 
1957       // Check for and copy trailing word
1958     __ BIND(L_copy_2_bytes);
1959       __ testl(byte_count, 2);
1960       __ jcc(Assembler::zero, L_copy_4_bytes);
1961       __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1962       __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1963 
1964       // Check for and copy trailing dword
1965     __ BIND(L_copy_4_bytes);
1966       __ testl(byte_count, 4);
1967       __ jcc(Assembler::zero, L_copy_bytes);
1968       __ movl(rax, Address(from, qword_count, Address::times_8));
1969       __ movl(Address(to, qword_count, Address::times_8), rax);
1970       __ jmp(L_copy_bytes);
1971 
1972       // Copy trailing qwords
1973     __ BIND(L_copy_8_bytes);
1974       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1975       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1976       __ decrement(qword_count);
1977       __ jcc(Assembler::notZero, L_copy_8_bytes);
1978     }
1979     restore_arg_regs();
1980     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1981     __ xorptr(rax, rax); // return 0
1982     __ vzeroupper();
1983     __ leave(); // required for proper stackwalking of RuntimeStub frame
1984     __ ret(0);
1985 
1986     {
1987       // UnsafeCopyMemory page error: continue after ucm
1988       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1989       // Copy in multi-byte chunks
1990       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1991     }
1992     restore_arg_regs();
1993     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1994     __ xorptr(rax, rax); // return 0
1995     __ vzeroupper();
1996     __ leave(); // required for proper stackwalking of RuntimeStub frame
1997     __ ret(0);
1998 
1999     return start;
2000   }
2001 
2002   // Arguments:
2003   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2004   //             ignored
2005   //   name    - stub name string
2006   //
2007   // Inputs:
2008   //   c_rarg0   - source array address
2009   //   c_rarg1   - destination array address
2010   //   c_rarg2   - element count, treated as ssize_t, can be zero
2011   //
2012   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2013   // let the hardware handle it.  The two or four words within dwords
2014   // or qwords that span cache line boundaries will still be loaded
2015   // and stored atomically.
2016   //
2017   // Side Effects:
2018   //   disjoint_short_copy_entry is set to the no-overlap entry point
2019   //   used by generate_conjoint_short_copy().
2020   //
2021   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
2022 #if COMPILER2_OR_JVMCI
2023     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2024        return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
2025                                                  aligned, false, false);
2026     }
2027 #endif
2028 
2029     __ align(CodeEntryAlignment);
2030     StubCodeMark mark(this, "StubRoutines", name);
2031     address start = __ pc();
2032 
2033     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
2034     const Register from        = rdi;  // source array address
2035     const Register to          = rsi;  // destination array address
2036     const Register count       = rdx;  // elements count
2037     const Register word_count  = rcx;
2038     const Register qword_count = count;
2039     const Register end_from    = from; // source array end address
2040     const Register end_to      = to;   // destination array end address
2041     // End pointers are inclusive, and if count is not zero they point
2042     // to the last unit copied:  end_to[0] := end_from[0]
2043 
2044     __ enter(); // required for proper stackwalking of RuntimeStub frame
2045     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2046 
2047     if (entry != NULL) {
2048       *entry = __ pc();
2049       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2050       BLOCK_COMMENT("Entry:");
2051     }
2052 
2053     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2054                       // r9 and r10 may be used to save non-volatile registers
2055 
2056     {
2057       // UnsafeCopyMemory page error: continue after ucm
2058       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2059       // 'from', 'to' and 'count' are now valid
2060       __ movptr(word_count, count);
2061       __ shrptr(count, 2); // count => qword_count
2062 
2063       // Copy from low to high addresses.  Use 'to' as scratch.
2064       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2065       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2066       __ negptr(qword_count);
2067       __ jmp(L_copy_bytes);
2068 
2069       // Copy trailing qwords
2070     __ BIND(L_copy_8_bytes);
2071       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2072       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2073       __ increment(qword_count);
2074       __ jcc(Assembler::notZero, L_copy_8_bytes);
2075 
2076       // Original 'dest' is trashed, so we can't use it as a
2077       // base register for a possible trailing word copy
2078 
2079       // Check for and copy trailing dword
2080     __ BIND(L_copy_4_bytes);
2081       __ testl(word_count, 2);
2082       __ jccb(Assembler::zero, L_copy_2_bytes);
2083       __ movl(rax, Address(end_from, 8));
2084       __ movl(Address(end_to, 8), rax);
2085 
2086       __ addptr(end_from, 4);
2087       __ addptr(end_to, 4);
2088 
2089       // Check for and copy trailing word
2090     __ BIND(L_copy_2_bytes);
2091       __ testl(word_count, 1);
2092       __ jccb(Assembler::zero, L_exit);
2093       __ movw(rax, Address(end_from, 8));
2094       __ movw(Address(end_to, 8), rax);
2095     }
2096   __ BIND(L_exit);
2097     address ucme_exit_pc = __ pc();
2098     restore_arg_regs();
2099     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2100     __ xorptr(rax, rax); // return 0
2101     __ vzeroupper();
2102     __ leave(); // required for proper stackwalking of RuntimeStub frame
2103     __ ret(0);
2104 
2105     {
2106       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
2107       // Copy in multi-byte chunks
2108       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2109       __ jmp(L_copy_4_bytes);
2110     }
2111 
2112     return start;
2113   }
2114 
2115   address generate_fill(BasicType t, bool aligned, const char *name) {
2116     __ align(CodeEntryAlignment);
2117     StubCodeMark mark(this, "StubRoutines", name);
2118     address start = __ pc();
2119 
2120     BLOCK_COMMENT("Entry:");
2121 
2122     const Register to       = c_rarg0;  // destination array address
2123     const Register value    = c_rarg1;  // value
2124     const Register count    = c_rarg2;  // elements count
2125     __ mov(r11, count);
2126 
2127     __ enter(); // required for proper stackwalking of RuntimeStub frame
2128 
2129     __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
2130 
2131     __ vzeroupper();
2132     __ leave(); // required for proper stackwalking of RuntimeStub frame
2133     __ ret(0);
2134     return start;
2135   }
2136 
2137   // Arguments:
2138   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2139   //             ignored
2140   //   name    - stub name string
2141   //
2142   // Inputs:
2143   //   c_rarg0   - source array address
2144   //   c_rarg1   - destination array address
2145   //   c_rarg2   - element count, treated as ssize_t, can be zero
2146   //
2147   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2148   // let the hardware handle it.  The two or four words within dwords
2149   // or qwords that span cache line boundaries will still be loaded
2150   // and stored atomically.
2151   //
2152   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
2153                                        address *entry, const char *name) {
2154 #if COMPILER2_OR_JVMCI
2155     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2156        return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
2157                                                  nooverlap_target, aligned, false, false);
2158     }
2159 #endif
2160     __ align(CodeEntryAlignment);
2161     StubCodeMark mark(this, "StubRoutines", name);
2162     address start = __ pc();
2163 
2164     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2165     const Register from        = rdi;  // source array address
2166     const Register to          = rsi;  // destination array address
2167     const Register count       = rdx;  // elements count
2168     const Register word_count  = rcx;
2169     const Register qword_count = count;
2170 
2171     __ enter(); // required for proper stackwalking of RuntimeStub frame
2172     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2173 
2174     if (entry != NULL) {
2175       *entry = __ pc();
2176       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2177       BLOCK_COMMENT("Entry:");
2178     }
2179 
2180     array_overlap_test(nooverlap_target, Address::times_2);
2181     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2182                       // r9 and r10 may be used to save non-volatile registers
2183 
2184     {
2185       // UnsafeCopyMemory page error: continue after ucm
2186       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2187       // 'from', 'to' and 'count' are now valid
2188       __ movptr(word_count, count);
2189       __ shrptr(count, 2); // count => qword_count
2190 
2191       // Copy from high to low addresses.  Use 'to' as scratch.
2192 
2193       // Check for and copy trailing word
2194       __ testl(word_count, 1);
2195       __ jccb(Assembler::zero, L_copy_4_bytes);
2196       __ movw(rax, Address(from, word_count, Address::times_2, -2));
2197       __ movw(Address(to, word_count, Address::times_2, -2), rax);
2198 
2199      // Check for and copy trailing dword
2200     __ BIND(L_copy_4_bytes);
2201       __ testl(word_count, 2);
2202       __ jcc(Assembler::zero, L_copy_bytes);
2203       __ movl(rax, Address(from, qword_count, Address::times_8));
2204       __ movl(Address(to, qword_count, Address::times_8), rax);
2205       __ jmp(L_copy_bytes);
2206 
2207       // Copy trailing qwords
2208     __ BIND(L_copy_8_bytes);
2209       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2210       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2211       __ decrement(qword_count);
2212       __ jcc(Assembler::notZero, L_copy_8_bytes);
2213     }
2214     restore_arg_regs();
2215     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2216     __ xorptr(rax, rax); // return 0
2217     __ vzeroupper();
2218     __ leave(); // required for proper stackwalking of RuntimeStub frame
2219     __ ret(0);
2220 
2221     {
2222       // UnsafeCopyMemory page error: continue after ucm
2223       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2224       // Copy in multi-byte chunks
2225       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2226     }
2227     restore_arg_regs();
2228     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2229     __ xorptr(rax, rax); // return 0
2230     __ vzeroupper();
2231     __ leave(); // required for proper stackwalking of RuntimeStub frame
2232     __ ret(0);
2233 
2234     return start;
2235   }
2236 
2237   // Arguments:
2238   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2239   //             ignored
2240   //   is_oop  - true => oop array, so generate store check code
2241   //   name    - stub name string
2242   //
2243   // Inputs:
2244   //   c_rarg0   - source array address
2245   //   c_rarg1   - destination array address
2246   //   c_rarg2   - element count, treated as ssize_t, can be zero
2247   //
2248   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2249   // the hardware handle it.  The two dwords within qwords that span
2250   // cache line boundaries will still be loaded and stored atomically.
2251   //
2252   // Side Effects:
2253   //   disjoint_int_copy_entry is set to the no-overlap entry point
2254   //   used by generate_conjoint_int_oop_copy().
2255   //
2256   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2257                                          const char *name, bool dest_uninitialized = false) {
2258 #if COMPILER2_OR_JVMCI
2259     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2260        return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
2261                                                  aligned, is_oop, dest_uninitialized);
2262     }
2263 #endif
2264 
2265     __ align(CodeEntryAlignment);
2266     StubCodeMark mark(this, "StubRoutines", name);
2267     address start = __ pc();
2268 
2269     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2270     const Register from        = rdi;  // source array address
2271     const Register to          = rsi;  // destination array address
2272     const Register count       = rdx;  // elements count
2273     const Register dword_count = rcx;
2274     const Register qword_count = count;
2275     const Register end_from    = from; // source array end address
2276     const Register end_to      = to;   // destination array end address
2277     // End pointers are inclusive, and if count is not zero they point
2278     // to the last unit copied:  end_to[0] := end_from[0]
2279 
2280     __ enter(); // required for proper stackwalking of RuntimeStub frame
2281     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2282 
2283     if (entry != NULL) {
2284       *entry = __ pc();
2285       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2286       BLOCK_COMMENT("Entry:");
2287     }
2288 
2289     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2290                                    // r9 is used to save r15_thread
2291 
2292     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2293     if (dest_uninitialized) {
2294       decorators |= IS_DEST_UNINITIALIZED;
2295     }
2296     if (aligned) {
2297       decorators |= ARRAYCOPY_ALIGNED;
2298     }
2299 
2300     BasicType type = is_oop ? T_OBJECT : T_INT;
2301     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2302     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2303 
2304     {
2305       // UnsafeCopyMemory page error: continue after ucm
2306       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2307       // 'from', 'to' and 'count' are now valid
2308       __ movptr(dword_count, count);
2309       __ shrptr(count, 1); // count => qword_count
2310 
2311       // Copy from low to high addresses.  Use 'to' as scratch.
2312       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2313       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2314       __ negptr(qword_count);
2315       __ jmp(L_copy_bytes);
2316 
2317       // Copy trailing qwords
2318     __ BIND(L_copy_8_bytes);
2319       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2320       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2321       __ increment(qword_count);
2322       __ jcc(Assembler::notZero, L_copy_8_bytes);
2323 
2324       // Check for and copy trailing dword
2325     __ BIND(L_copy_4_bytes);
2326       __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2327       __ jccb(Assembler::zero, L_exit);
2328       __ movl(rax, Address(end_from, 8));
2329       __ movl(Address(end_to, 8), rax);
2330     }
2331   __ BIND(L_exit);
2332     address ucme_exit_pc = __ pc();
2333     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2334     restore_arg_regs_using_thread();
2335     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2336     __ vzeroupper();
2337     __ xorptr(rax, rax); // return 0
2338     __ leave(); // required for proper stackwalking of RuntimeStub frame
2339     __ ret(0);
2340 
2341     {
2342       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
2343       // Copy in multi-byte chunks
2344       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2345       __ jmp(L_copy_4_bytes);
2346     }
2347 
2348     return start;
2349   }
2350 
2351   // Arguments:
2352   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2353   //             ignored
2354   //   is_oop  - true => oop array, so generate store check code
2355   //   name    - stub name string
2356   //
2357   // Inputs:
2358   //   c_rarg0   - source array address
2359   //   c_rarg1   - destination array address
2360   //   c_rarg2   - element count, treated as ssize_t, can be zero
2361   //
2362   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2363   // the hardware handle it.  The two dwords within qwords that span
2364   // cache line boundaries will still be loaded and stored atomically.
2365   //
2366   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2367                                          address *entry, const char *name,
2368                                          bool dest_uninitialized = false) {
2369 #if COMPILER2_OR_JVMCI
2370     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2371        return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
2372                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
2373     }
2374 #endif
2375     __ align(CodeEntryAlignment);
2376     StubCodeMark mark(this, "StubRoutines", name);
2377     address start = __ pc();
2378 
2379     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2380     const Register from        = rdi;  // source array address
2381     const Register to          = rsi;  // destination array address
2382     const Register count       = rdx;  // elements count
2383     const Register dword_count = rcx;
2384     const Register qword_count = count;
2385 
2386     __ enter(); // required for proper stackwalking of RuntimeStub frame
2387     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2388 
2389     if (entry != NULL) {
2390       *entry = __ pc();
2391        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2392       BLOCK_COMMENT("Entry:");
2393     }
2394 
2395     array_overlap_test(nooverlap_target, Address::times_4);
2396     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2397                                    // r9 is used to save r15_thread
2398 
2399     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2400     if (dest_uninitialized) {
2401       decorators |= IS_DEST_UNINITIALIZED;
2402     }
2403     if (aligned) {
2404       decorators |= ARRAYCOPY_ALIGNED;
2405     }
2406 
2407     BasicType type = is_oop ? T_OBJECT : T_INT;
2408     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2409     // no registers are destroyed by this call
2410     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2411 
2412     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2413     {
2414       // UnsafeCopyMemory page error: continue after ucm
2415       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2416       // 'from', 'to' and 'count' are now valid
2417       __ movptr(dword_count, count);
2418       __ shrptr(count, 1); // count => qword_count
2419 
2420       // Copy from high to low addresses.  Use 'to' as scratch.
2421 
2422       // Check for and copy trailing dword
2423       __ testl(dword_count, 1);
2424       __ jcc(Assembler::zero, L_copy_bytes);
2425       __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2426       __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2427       __ jmp(L_copy_bytes);
2428 
2429       // Copy trailing qwords
2430     __ BIND(L_copy_8_bytes);
2431       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2432       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2433       __ decrement(qword_count);
2434       __ jcc(Assembler::notZero, L_copy_8_bytes);
2435     }
2436     if (is_oop) {
2437       __ jmp(L_exit);
2438     }
2439     restore_arg_regs_using_thread();
2440     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2441     __ xorptr(rax, rax); // return 0
2442     __ vzeroupper();
2443     __ leave(); // required for proper stackwalking of RuntimeStub frame
2444     __ ret(0);
2445 
2446     {
2447       // UnsafeCopyMemory page error: continue after ucm
2448       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2449       // Copy in multi-byte chunks
2450       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2451     }
2452 
2453   __ BIND(L_exit);
2454     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2455     restore_arg_regs_using_thread();
2456     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2457     __ xorptr(rax, rax); // return 0
2458     __ vzeroupper();
2459     __ leave(); // required for proper stackwalking of RuntimeStub frame
2460     __ ret(0);
2461 
2462     return start;
2463   }
2464 
2465   // Arguments:
2466   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2467   //             ignored
2468   //   is_oop  - true => oop array, so generate store check code
2469   //   name    - stub name string
2470   //
2471   // Inputs:
2472   //   c_rarg0   - source array address
2473   //   c_rarg1   - destination array address
2474   //   c_rarg2   - element count, treated as ssize_t, can be zero
2475   //
2476   // Side Effects:
2477   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2478   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2479   //
2480   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2481                                           const char *name, bool dest_uninitialized = false) {
2482 #if COMPILER2_OR_JVMCI
2483     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2484        return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
2485                                                  aligned, is_oop, dest_uninitialized);
2486     }
2487 #endif
2488     __ align(CodeEntryAlignment);
2489     StubCodeMark mark(this, "StubRoutines", name);
2490     address start = __ pc();
2491 
2492     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2493     const Register from        = rdi;  // source array address
2494     const Register to          = rsi;  // destination array address
2495     const Register qword_count = rdx;  // elements count
2496     const Register end_from    = from; // source array end address
2497     const Register end_to      = rcx;  // destination array end address
2498     const Register saved_count = r11;
2499     // End pointers are inclusive, and if count is not zero they point
2500     // to the last unit copied:  end_to[0] := end_from[0]
2501 
2502     __ enter(); // required for proper stackwalking of RuntimeStub frame
2503     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2504     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2505 
2506     if (entry != NULL) {
2507       *entry = __ pc();
2508       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2509       BLOCK_COMMENT("Entry:");
2510     }
2511 
2512     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2513                                      // r9 is used to save r15_thread
2514     // 'from', 'to' and 'qword_count' are now valid
2515 
2516     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2517     if (dest_uninitialized) {
2518       decorators |= IS_DEST_UNINITIALIZED;
2519     }
2520     if (aligned) {
2521       decorators |= ARRAYCOPY_ALIGNED;
2522     }
2523 
2524     BasicType type = is_oop ? T_OBJECT : T_LONG;
2525     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2526     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2527     {
2528       // UnsafeCopyMemory page error: continue after ucm
2529       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2530 
2531       // Copy from low to high addresses.  Use 'to' as scratch.
2532       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2533       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2534       __ negptr(qword_count);
2535       __ jmp(L_copy_bytes);
2536 
2537       // Copy trailing qwords
2538     __ BIND(L_copy_8_bytes);
2539       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2540       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2541       __ increment(qword_count);
2542       __ jcc(Assembler::notZero, L_copy_8_bytes);
2543     }
2544     if (is_oop) {
2545       __ jmp(L_exit);
2546     } else {
2547       restore_arg_regs_using_thread();
2548       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2549       __ xorptr(rax, rax); // return 0
2550       __ vzeroupper();
2551       __ leave(); // required for proper stackwalking of RuntimeStub frame
2552       __ ret(0);
2553     }
2554 
2555     {
2556       // UnsafeCopyMemory page error: continue after ucm
2557       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2558       // Copy in multi-byte chunks
2559       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2560     }
2561 
2562     __ BIND(L_exit);
2563     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2564     restore_arg_regs_using_thread();
2565     if (is_oop) {
2566       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2567     } else {
2568       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2569     }
2570     __ vzeroupper();
2571     __ xorptr(rax, rax); // return 0
2572     __ leave(); // required for proper stackwalking of RuntimeStub frame
2573     __ ret(0);
2574 
2575     return start;
2576   }
2577 
2578   // Arguments:
2579   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2580   //             ignored
2581   //   is_oop  - true => oop array, so generate store check code
2582   //   name    - stub name string
2583   //
2584   // Inputs:
2585   //   c_rarg0   - source array address
2586   //   c_rarg1   - destination array address
2587   //   c_rarg2   - element count, treated as ssize_t, can be zero
2588   //
2589   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2590                                           address nooverlap_target, address *entry,
2591                                           const char *name, bool dest_uninitialized = false) {
2592 #if COMPILER2_OR_JVMCI
2593     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2594        return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
2595                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
2596     }
2597 #endif
2598     __ align(CodeEntryAlignment);
2599     StubCodeMark mark(this, "StubRoutines", name);
2600     address start = __ pc();
2601 
2602     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2603     const Register from        = rdi;  // source array address
2604     const Register to          = rsi;  // destination array address
2605     const Register qword_count = rdx;  // elements count
2606     const Register saved_count = rcx;
2607 
2608     __ enter(); // required for proper stackwalking of RuntimeStub frame
2609     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2610 
2611     if (entry != NULL) {
2612       *entry = __ pc();
2613       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2614       BLOCK_COMMENT("Entry:");
2615     }
2616 
2617     array_overlap_test(nooverlap_target, Address::times_8);
2618     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2619                                    // r9 is used to save r15_thread
2620     // 'from', 'to' and 'qword_count' are now valid
2621 
2622     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2623     if (dest_uninitialized) {
2624       decorators |= IS_DEST_UNINITIALIZED;
2625     }
2626     if (aligned) {
2627       decorators |= ARRAYCOPY_ALIGNED;
2628     }
2629 
2630     BasicType type = is_oop ? T_OBJECT : T_LONG;
2631     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2632     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2633     {
2634       // UnsafeCopyMemory page error: continue after ucm
2635       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2636 
2637       __ jmp(L_copy_bytes);
2638 
2639       // Copy trailing qwords
2640     __ BIND(L_copy_8_bytes);
2641       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2642       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2643       __ decrement(qword_count);
2644       __ jcc(Assembler::notZero, L_copy_8_bytes);
2645     }
2646     if (is_oop) {
2647       __ jmp(L_exit);
2648     } else {
2649       restore_arg_regs_using_thread();
2650       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2651       __ xorptr(rax, rax); // return 0
2652       __ vzeroupper();
2653       __ leave(); // required for proper stackwalking of RuntimeStub frame
2654       __ ret(0);
2655     }
2656     {
2657       // UnsafeCopyMemory page error: continue after ucm
2658       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2659 
2660       // Copy in multi-byte chunks
2661       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2662     }
2663     __ BIND(L_exit);
2664     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2665     restore_arg_regs_using_thread();
2666     if (is_oop) {
2667       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2668     } else {
2669       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2670     }
2671     __ vzeroupper();
2672     __ xorptr(rax, rax); // return 0
2673     __ leave(); // required for proper stackwalking of RuntimeStub frame
2674     __ ret(0);
2675 
2676     return start;
2677   }
2678 
2679 
2680   // Helper for generating a dynamic type check.
2681   // Smashes no registers.
2682   void generate_type_check(Register sub_klass,
2683                            Register super_check_offset,
2684                            Register super_klass,
2685                            Label& L_success) {
2686     assert_different_registers(sub_klass, super_check_offset, super_klass);
2687 
2688     BLOCK_COMMENT("type_check:");
2689 
2690     Label L_miss;
2691 
2692     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2693                                      super_check_offset);
2694     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2695 
2696     // Fall through on failure!
2697     __ BIND(L_miss);
2698   }
2699 
2700   //
2701   //  Generate checkcasting array copy stub
2702   //
2703   //  Input:
2704   //    c_rarg0   - source array address
2705   //    c_rarg1   - destination array address
2706   //    c_rarg2   - element count, treated as ssize_t, can be zero
2707   //    c_rarg3   - size_t ckoff (super_check_offset)
2708   // not Win64
2709   //    c_rarg4   - oop ckval (super_klass)
2710   // Win64
2711   //    rsp+40    - oop ckval (super_klass)
2712   //
2713   //  Output:
2714   //    rax ==  0  -  success
2715   //    rax == -1^K - failure, where K is partial transfer count
2716   //
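  //  For example, if the copy fails after K = 3 elements have been
  //  transferred, rax == -1^3 == ~3 == -4, and the caller can recover the
  //  partial transfer count as K = ~rax.
  //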
2717   address generate_checkcast_copy(const char *name, address *entry,
2718                                   bool dest_uninitialized = false) {
2719 
2720     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2721 
2722     // Input registers (after setup_arg_regs)
2723     const Register from        = rdi;   // source array address
2724     const Register to          = rsi;   // destination array address
2725     const Register length      = rdx;   // elements count
2726     const Register ckoff       = rcx;   // super_check_offset
2727     const Register ckval       = r8;    // super_klass
2728 
2729     // Registers used as temps (r13, r14 are save-on-entry)
2730     const Register end_from    = from;  // source array end address
2731     const Register end_to      = r13;   // destination array end address
2732     const Register count       = rdx;   // -(count_remaining)
2733     const Register r14_length  = r14;   // saved copy of length
2734     // End pointers are inclusive, and if length is not zero they point
2735     // to the last unit copied:  end_to[0] := end_from[0]
2736 
2737     const Register rax_oop    = rax;    // actual oop copied
2738     const Register r11_klass  = r11;    // oop._klass
2739 
2740     //---------------------------------------------------------------
2741     // Assembler stub will be used for this call to arraycopy
2742     // if the two arrays are subtypes of Object[] but the
2743     // destination array type is not equal to or a supertype
2744     // of the source type.  Each element must be separately
2745     // checked.
2746 
2747     __ align(CodeEntryAlignment);
2748     StubCodeMark mark(this, "StubRoutines", name);
2749     address start = __ pc();
2750 
2751     __ enter(); // required for proper stackwalking of RuntimeStub frame
2752 
2753 #ifdef ASSERT
2754     // caller guarantees that the arrays really are different
2755     // otherwise, we would have to make conjoint checks
2756     { Label L;
2757       array_overlap_test(L, TIMES_OOP);
2758       __ stop("checkcast_copy within a single array");
2759       __ bind(L);
2760     }
2761 #endif //ASSERT
2762 
2763     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2764                        // ckoff => rcx, ckval => r8
2765                        // r9 and r10 may be used to save non-volatile registers
2766 #ifdef _WIN64
2767     // last argument (#4) is on stack on Win64
2768     __ movptr(ckval, Address(rsp, 6 * wordSize));
2769 #endif
2770 
2771     // Caller of this entry point must set up the argument registers.
2772     if (entry != NULL) {
2773       *entry = __ pc();
2774       BLOCK_COMMENT("Entry:");
2775     }
2776 
2777     // allocate spill slots for r13, r14
2778     enum {
2779       saved_r13_offset,
2780       saved_r14_offset,
2781       saved_r10_offset,
2782       saved_rbp_offset
2783     };
2784     __ subptr(rsp, saved_rbp_offset * wordSize);
2785     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2786     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2787     __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2788 
2789 #ifdef ASSERT
2790       Label L2;
2791       __ get_thread(r14);
2792       __ cmpptr(r15_thread, r14);
2793       __ jcc(Assembler::equal, L2);
2794       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2795       __ bind(L2);
2796 #endif // ASSERT
2797 
2798     // check that int operands are properly extended to size_t
2799     assert_clean_int(length, rax);
2800     assert_clean_int(ckoff, rax);
2801 
2802 #ifdef ASSERT
2803     BLOCK_COMMENT("assert consistent ckoff/ckval");
2804     // The ckoff and ckval must be mutually consistent,
2805     // even though caller generates both.
2806     { Label L;
2807       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2808       __ cmpl(ckoff, Address(ckval, sco_offset));
2809       __ jcc(Assembler::equal, L);
2810       __ stop("super_check_offset inconsistent");
2811       __ bind(L);
2812     }
2813 #endif //ASSERT
2814 
2815     // Loop-invariant addresses.  They are exclusive end pointers.
2816     Address end_from_addr(from, length, TIMES_OOP, 0);
2817     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2818     // Loop-variant addresses.  They assume post-incremented count < 0.
2819     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2820     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2821 
2822     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2823     if (dest_uninitialized) {
2824       decorators |= IS_DEST_UNINITIALIZED;
2825     }
2826 
2827     BasicType type = T_OBJECT;
2828     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2829     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2830 
2831     // Copy from low to high addresses, indexed from the end of each array.
2832     __ lea(end_from, end_from_addr);
2833     __ lea(end_to,   end_to_addr);
2834     __ movptr(r14_length, length);        // save a copy of the length
2835     assert(length == count, "");          // else fix next line:
2836     __ negptr(count);                     // negate and test the length
2837     __ jcc(Assembler::notZero, L_load_element);
2838 
2839     // Empty array:  Nothing to do.
2840     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2841     __ jmp(L_done);
2842 
2843     // ======== begin loop ========
2844     // (Loop is rotated; its entry is L_load_element.)
2845     // Loop control:
2846     //   for (count = -count; count != 0; count++)
2847     // Base pointers src, dst are biased by 8*(count-1), to the last element.
2848     __ align(OptoLoopAlignment);
2849 
2850     __ BIND(L_store_element);
2851     __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
2852     __ increment(count);               // increment the count toward zero
2853     __ jcc(Assembler::zero, L_do_card_marks);
2854 
2855     // ======== loop entry is here ========
2856     __ BIND(L_load_element);
2857     __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2858     __ testptr(rax_oop, rax_oop);
2859     __ jcc(Assembler::zero, L_store_element);
2860 
2861     __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
2862     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2863     // ======== end loop ========
2864 
2865     // It was a real error; we must depend on the caller to finish the job.
2866     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2867     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2868     // and report their number to the caller.
2869     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2870     Label L_post_barrier;
2871     __ addptr(r14_length, count);     // K = (original - remaining) oops
2872     __ movptr(rax, r14_length);       // save the value
2873     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2874     __ jccb(Assembler::notZero, L_post_barrier);
2875     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2876 
2877     // Come here on success only.
2878     __ BIND(L_do_card_marks);
2879     __ xorptr(rax, rax);              // return 0 on success
2880 
2881     __ BIND(L_post_barrier);
2882     bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2883 
2884     // Common exit point (success or failure).
2885     __ BIND(L_done);
2886     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2887     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2888     __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2889     restore_arg_regs();
2890     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2891     __ leave(); // required for proper stackwalking of RuntimeStub frame
2892     __ ret(0);
2893 
2894     return start;
2895   }
2896 
2897   //
2898   //  Generate 'unsafe' array copy stub
2899   //  Though just as safe as the other stubs, it takes an unscaled
2900   //  size_t argument instead of an element count.
2901   //
2902   //  Input:
2903   //    c_rarg0   - source array address
2904   //    c_rarg1   - destination array address
2905   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2906   //
2907   // Examines the alignment of the operands and dispatches
2908   // to a long, int, short, or byte copy loop.
2909   //
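  //  Roughly (illustrative only), the dispatch below amounts to:
  //
  //    bits = from | to | size;
  //    if      ((bits & 7) == 0) long_copy (from, to, size >> 3);
  //    else if ((bits & 3) == 0) int_copy  (from, to, size >> 2);
  //    else if ((bits & 1) == 0) short_copy(from, to, size >> 1);
  //    else                      byte_copy (from, to, size);
  //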
2910   address generate_unsafe_copy(const char *name,
2911                                address byte_copy_entry, address short_copy_entry,
2912                                address int_copy_entry, address long_copy_entry) {
2913 
2914     Label L_long_aligned, L_int_aligned, L_short_aligned;
2915 
2916     // Input registers (before setup_arg_regs)
2917     const Register from        = c_rarg0;  // source array address
2918     const Register to          = c_rarg1;  // destination array address
2919     const Register size        = c_rarg2;  // byte count (size_t)
2920 
2921     // Register used as a temp
2922     const Register bits        = rax;      // test copy of low bits
2923 
2924     __ align(CodeEntryAlignment);
2925     StubCodeMark mark(this, "StubRoutines", name);
2926     address start = __ pc();
2927 
2928     __ enter(); // required for proper stackwalking of RuntimeStub frame
2929 
2930     // bump this on entry, not on exit:
2931     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2932 
2933     __ mov(bits, from);
2934     __ orptr(bits, to);
2935     __ orptr(bits, size);
2936 
2937     __ testb(bits, BytesPerLong-1);
2938     __ jccb(Assembler::zero, L_long_aligned);
2939 
2940     __ testb(bits, BytesPerInt-1);
2941     __ jccb(Assembler::zero, L_int_aligned);
2942 
2943     __ testb(bits, BytesPerShort-1);
2944     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2945 
2946     __ BIND(L_short_aligned);
2947     __ shrptr(size, LogBytesPerShort); // size => short_count
2948     __ jump(RuntimeAddress(short_copy_entry));
2949 
2950     __ BIND(L_int_aligned);
2951     __ shrptr(size, LogBytesPerInt); // size => int_count
2952     __ jump(RuntimeAddress(int_copy_entry));
2953 
2954     __ BIND(L_long_aligned);
2955     __ shrptr(size, LogBytesPerLong); // size => qword_count
2956     __ jump(RuntimeAddress(long_copy_entry));
2957 
2958     return start;
2959   }
2960 
2961   // Perform range checks on the proposed arraycopy.
2962   // Kills temp, but nothing else.
2963   // Also, clean the sign bits of src_pos and dst_pos.
2964   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2965                               Register src_pos, // source position (c_rarg1)
2966                               Register dst,     // destination array oop (c_rarg2)
2967                               Register dst_pos, // destination position (c_rarg3)
2968                               Register length,
2969                               Register temp,
2970                               Label& L_failed) {
2971     BLOCK_COMMENT("arraycopy_range_checks:");
2972 
2973     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2974     __ movl(temp, length);
2975     __ addl(temp, src_pos);             // src_pos + length
2976     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2977     __ jcc(Assembler::above, L_failed);
2978 
2979     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2980     __ movl(temp, length);
2981     __ addl(temp, dst_pos);             // dst_pos + length
2982     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2983     __ jcc(Assembler::above, L_failed);
2984 
2985     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2986     // Move with sign extension can be used since they are positive.
2987     __ movslq(src_pos, src_pos);
2988     __ movslq(dst_pos, dst_pos);
2989 
2990     BLOCK_COMMENT("arraycopy_range_checks done");
2991   }
2992 
2993   //
2994   //  Generate generic array copy stubs
2995   //
2996   //  Input:
2997   //    c_rarg0    -  src oop
2998   //    c_rarg1    -  src_pos (32-bits)
2999   //    c_rarg2    -  dst oop
3000   //    c_rarg3    -  dst_pos (32-bits)
3001   // not Win64
3002   //    c_rarg4    -  element count (32-bits)
3003   // Win64
3004   //    rsp+40     -  element count (32-bits)
3005   //
3006   //  Output:
3007   //    rax ==  0  -  success
3008   //    rax == -1^K - failure, where K is partial transfer count
3009   //
3010   address generate_generic_copy(const char *name,
3011                                 address byte_copy_entry, address short_copy_entry,
3012                                 address int_copy_entry, address oop_copy_entry,
3013                                 address long_copy_entry, address checkcast_copy_entry) {
3014 
3015     Label L_failed, L_failed_0, L_objArray;
3016     Label L_copy_shorts, L_copy_ints, L_copy_longs;
3017 
3018     // Input registers
3019     const Register src        = c_rarg0;  // source array oop
3020     const Register src_pos    = c_rarg1;  // source position
3021     const Register dst        = c_rarg2;  // destination array oop
3022     const Register dst_pos    = c_rarg3;  // destination position
3023 #ifndef _WIN64
3024     const Register length     = c_rarg4;
3025     const Register rklass_tmp = r9;  // load_klass
3026 #else
3027     const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
3028     const Register rklass_tmp = rdi;  // load_klass
3029 #endif
3030 
3031     { int modulus = CodeEntryAlignment;
3032       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
3033       int advance = target - (__ offset() % modulus);
3034       if (advance < 0)  advance += modulus;
3035       if (advance > 0)  __ nop(advance);
3036     }
3037     StubCodeMark mark(this, "StubRoutines", name);
3038 
3039     // Short-hop target to L_failed.  Makes for denser prologue code.
3040     __ BIND(L_failed_0);
3041     __ jmp(L_failed);
3042     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
3043 
3044     __ align(CodeEntryAlignment);
3045     address start = __ pc();
3046 
3047     __ enter(); // required for proper stackwalking of RuntimeStub frame
3048 
3049 #ifdef _WIN64
3050     __ push(rklass_tmp); // rdi is callee-save on Windows
3051 #endif
3052 
3053     // bump this on entry, not on exit:
3054     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
3055 
3056     //-----------------------------------------------------------------------
3057     // Assembler stub will be used for this call to arraycopy
3058     // if the following conditions are met:
3059     //
3060     // (1) src and dst must not be null.
3061     // (2) src_pos must not be negative.
3062     // (3) dst_pos must not be negative.
3063     // (4) length  must not be negative.
3064     // (5) src klass and dst klass should be the same and not NULL.
3065     // (6) src and dst should be arrays.
3066     // (7) src_pos + length must not exceed length of src.
3067     // (8) dst_pos + length must not exceed length of dst.
3068     //
3069 
3070     //  if (src == NULL) return -1;
3071     __ testptr(src, src);         // src oop
3072     size_t j1off = __ offset();
3073     __ jccb(Assembler::zero, L_failed_0);
3074 
3075     //  if (src_pos < 0) return -1;
3076     __ testl(src_pos, src_pos); // src_pos (32-bits)
3077     __ jccb(Assembler::negative, L_failed_0);
3078 
3079     //  if (dst == NULL) return -1;
3080     __ testptr(dst, dst);         // dst oop
3081     __ jccb(Assembler::zero, L_failed_0);
3082 
3083     //  if (dst_pos < 0) return -1;
3084     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3085     size_t j4off = __ offset();
3086     __ jccb(Assembler::negative, L_failed_0);
3087 
3088     // The first four tests are very dense code,
3089     // but not quite dense enough to put four
3090     // jumps in a 16-byte instruction fetch buffer.
3091     // That's good, because some branch predictors
3092     // do not like jumps so close together.
3093     // Make sure of this.
3094     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
3095 
3096     // registers used as temp
3097     const Register r11_length    = r11; // elements count to copy
3098     const Register r10_src_klass = r10; // array klass
3099 
3100     //  if (length < 0) return -1;
3101     __ movl(r11_length, length);        // length (elements count, 32-bits value)
3102     __ testl(r11_length, r11_length);
3103     __ jccb(Assembler::negative, L_failed_0);
3104 
3105     __ load_klass(r10_src_klass, src, rklass_tmp);
3106 #ifdef ASSERT
3107     //  assert(src->klass() != NULL);
3108     {
3109       BLOCK_COMMENT("assert klasses not null {");
3110       Label L1, L2;
3111       __ testptr(r10_src_klass, r10_src_klass);
3112       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
3113       __ bind(L1);
3114       __ stop("broken null klass");
3115       __ bind(L2);
3116       __ load_klass(rax, dst, rklass_tmp);
3117       __ cmpq(rax, 0);
3118       __ jcc(Assembler::equal, L1);     // this would be broken also
3119       BLOCK_COMMENT("} assert klasses not null done");
3120     }
3121 #endif
3122 
3123     // Load layout helper (32-bits)
3124     //
3125     //  |array_tag|     | header_size | element_type |     |log2_element_size|
3126     // 32        30    24            16              8     2                 0
3127     //
3128     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3129     //
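    // Illustratively, the fields used below are decoded as
    // (cf. the Klass::_lh_* shift/mask constants):
    //   header_size       = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
    //   log2_element_size =  lh & _lh_log2_element_size_mask;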
3130 
3131     const int lh_offset = in_bytes(Klass::layout_helper_offset());
3132 
3133     // Handle objArrays completely differently...
3134     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3135     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3136     __ jcc(Assembler::equal, L_objArray);
3137 
3138     //  if (src->klass() != dst->klass()) return -1;
3139     __ load_klass(rax, dst, rklass_tmp);
3140     __ cmpq(r10_src_klass, rax);
3141     __ jcc(Assembler::notEqual, L_failed);
3142 
3143     const Register rax_lh = rax;  // layout helper
3144     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3145 
3146     //  if (!src->is_Array()) return -1;
3147     __ cmpl(rax_lh, Klass::_lh_neutral_value);
3148     __ jcc(Assembler::greaterEqual, L_failed);
3149 
3150     // At this point, it is known to be a typeArray (array_tag 0x3).
3151 #ifdef ASSERT
3152     {
3153       BLOCK_COMMENT("assert primitive array {");
3154       Label L;
3155       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
3156       __ jcc(Assembler::greaterEqual, L);
3157       __ stop("must be a primitive array");
3158       __ bind(L);
3159       BLOCK_COMMENT("} assert primitive array done");
3160     }
3161 #endif
3162 
3163     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3164                            r10, L_failed);
3165 
3166     // TypeArrayKlass
3167     //
3168     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3169     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3170     //
3171 
3172     const Register r10_offset = r10;    // array offset
3173     const Register rax_elsize = rax_lh; // element size
3174 
3175     __ movl(r10_offset, rax_lh);
3176     __ shrl(r10_offset, Klass::_lh_header_size_shift);
3177     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
3178     __ addptr(src, r10_offset);           // src array offset
3179     __ addptr(dst, r10_offset);           // dst array offset
3180     BLOCK_COMMENT("choose copy loop based on element size");
3181     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3182 
3183 #ifdef _WIN64
3184     __ pop(rklass_tmp); // Restore callee-save rdi
3185 #endif
3186 
3187     // next registers should be set before the jump to corresponding stub
3188     const Register from     = c_rarg0;  // source array address
3189     const Register to       = c_rarg1;  // destination array address
3190     const Register count    = c_rarg2;  // elements count
3191 
3192     // 'from', 'to' and 'count' should be set in this order, since they
3193     // occupy the same registers as 'src', 'src_pos' and 'dst'.
3194 
3195     __ cmpl(rax_elsize, 0);
3196     __ jccb(Assembler::notEqual, L_copy_shorts);
3197     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3198     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3199     __ movl2ptr(count, r11_length); // length
3200     __ jump(RuntimeAddress(byte_copy_entry));
3201 
3202   __ BIND(L_copy_shorts);
3203     __ cmpl(rax_elsize, LogBytesPerShort);
3204     __ jccb(Assembler::notEqual, L_copy_ints);
3205     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3206     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3207     __ movl2ptr(count, r11_length); // length
3208     __ jump(RuntimeAddress(short_copy_entry));
3209 
3210   __ BIND(L_copy_ints);
3211     __ cmpl(rax_elsize, LogBytesPerInt);
3212     __ jccb(Assembler::notEqual, L_copy_longs);
3213     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3214     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3215     __ movl2ptr(count, r11_length); // length
3216     __ jump(RuntimeAddress(int_copy_entry));
3217 
3218   __ BIND(L_copy_longs);
3219 #ifdef ASSERT
3220     {
3221       BLOCK_COMMENT("assert long copy {");
3222       Label L;
3223       __ cmpl(rax_elsize, LogBytesPerLong);
3224       __ jcc(Assembler::equal, L);
3225       __ stop("must be long copy, but elsize is wrong");
3226       __ bind(L);
3227       BLOCK_COMMENT("} assert long copy done");
3228     }
3229 #endif
3230     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3231     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3232     __ movl2ptr(count, r11_length); // length
3233     __ jump(RuntimeAddress(long_copy_entry));
3234 
3235     // ObjArrayKlass
3236   __ BIND(L_objArray);
3237     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3238 
3239     Label L_plain_copy, L_checkcast_copy;
3240     //  test array classes for subtyping
3241     __ load_klass(rax, dst, rklass_tmp);
3242     __ cmpq(r10_src_klass, rax); // usual case is exact equality
3243     __ jcc(Assembler::notEqual, L_checkcast_copy);
3244 
3245     // Identically typed arrays can be copied without element-wise checks.
3246     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3247                            r10, L_failed);
3248 
3249     __ lea(from, Address(src, src_pos, TIMES_OOP,
3250                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3251     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3252                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3253     __ movl2ptr(count, r11_length); // length
3254   __ BIND(L_plain_copy);
3255 #ifdef _WIN64
3256     __ pop(rklass_tmp); // Restore callee-save rdi
3257 #endif
3258     __ jump(RuntimeAddress(oop_copy_entry));
3259 
3260   __ BIND(L_checkcast_copy);
3261     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3262     {
3263       // Before looking at dst.length, make sure dst is also an objArray.
3264       __ cmpl(Address(rax, lh_offset), objArray_lh);
3265       __ jcc(Assembler::notEqual, L_failed);
3266 
3267       // It is safe to examine both src.length and dst.length.
3268       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3269                              rax, L_failed);
3270 
3271       const Register r11_dst_klass = r11;
3272       __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3273 
3274       // Marshal the base address arguments now, freeing registers.
3275       __ lea(from, Address(src, src_pos, TIMES_OOP,
3276                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3277       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3278                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3279       __ movl(count, length);           // length (reloaded)
3280       Register sco_temp = c_rarg3;      // this register is free now
3281       assert_different_registers(from, to, count, sco_temp,
3282                                  r11_dst_klass, r10_src_klass);
3283       assert_clean_int(count, sco_temp);
3284 
3285       // Generate the type check.
3286       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3287       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3288       assert_clean_int(sco_temp, rax);
3289       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3290 
3291       // Fetch destination element klass from the ObjArrayKlass header.
3292       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3293       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3294       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3295       assert_clean_int(sco_temp, rax);
3296 
3297 #ifdef _WIN64
3298       __ pop(rklass_tmp); // Restore callee-save rdi
3299 #endif
3300 
3301       // the checkcast_copy loop needs two extra arguments:
3302       assert(c_rarg3 == sco_temp, "#3 already in place");
3303       // Set up arguments for checkcast_copy_entry.
3304       setup_arg_regs(4);
3305       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3306       __ jump(RuntimeAddress(checkcast_copy_entry));
3307     }
3308 
3309   __ BIND(L_failed);
3310 #ifdef _WIN64
3311     __ pop(rklass_tmp); // Restore callee-save rdi
3312 #endif
3313     __ xorptr(rax, rax);
3314     __ notptr(rax); // return -1
3315     __ leave();   // required for proper stackwalking of RuntimeStub frame
3316     __ ret(0);
3317 
3318     return start;
3319   }
3320 
3321   address generate_data_cache_writeback() {
3322     const Register src        = c_rarg0;  // source address
3323 
3324     __ align(CodeEntryAlignment);
3325 
3326     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
3327 
3328     address start = __ pc();
3329     __ enter();
3330     __ cache_wb(Address(src, 0));
3331     __ leave();
3332     __ ret(0);
3333 
3334     return start;
3335   }
3336 
3337   address generate_data_cache_writeback_sync() {
3338     const Register is_pre    = c_rarg0;  // pre or post sync
3339 
3340     __ align(CodeEntryAlignment);
3341 
3342     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
3343 
3344     // pre wbsync is a no-op
3345     // post wbsync translates to an sfence
3346 
3347     Label skip;
3348     address start = __ pc();
3349     __ enter();
3350     __ cmpl(is_pre, 0);
3351     __ jcc(Assembler::notEqual, skip);
3352     __ cache_wbsync(false);
3353     __ bind(skip);
3354     __ leave();
3355     __ ret(0);
3356 
3357     return start;
3358   }
3359 
3360   void generate_arraycopy_stubs() {
3361     address entry;
3362     address entry_jbyte_arraycopy;
3363     address entry_jshort_arraycopy;
3364     address entry_jint_arraycopy;
3365     address entry_oop_arraycopy;
3366     address entry_jlong_arraycopy;
3367     address entry_checkcast_arraycopy;
3368 
3369     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
3370                                                                            "jbyte_disjoint_arraycopy");
3371     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3372                                                                            "jbyte_arraycopy");
3373 
3374     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3375                                                                             "jshort_disjoint_arraycopy");
3376     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3377                                                                             "jshort_arraycopy");
3378 
3379     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
3380                                                                               "jint_disjoint_arraycopy");
3381     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
3382                                                                               &entry_jint_arraycopy, "jint_arraycopy");
3383 
3384     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
3385                                                                                "jlong_disjoint_arraycopy");
3386     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
3387                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
3388 
3389 
3390     if (UseCompressedOops) {
3391       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
3392                                                                               "oop_disjoint_arraycopy");
3393       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
3394                                                                               &entry_oop_arraycopy, "oop_arraycopy");
3395       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
3396                                                                                      "oop_disjoint_arraycopy_uninit",
3397                                                                                      /*dest_uninitialized*/true);
3398       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
3399                                                                                      NULL, "oop_arraycopy_uninit",
3400                                                                                      /*dest_uninitialized*/true);
3401     } else {
3402       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
3403                                                                                "oop_disjoint_arraycopy");
3404       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
3405                                                                                &entry_oop_arraycopy, "oop_arraycopy");
3406       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
3407                                                                                       "oop_disjoint_arraycopy_uninit",
3408                                                                                       /*dest_uninitialized*/true);
3409       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
3410                                                                                       NULL, "oop_arraycopy_uninit",
3411                                                                                       /*dest_uninitialized*/true);
3412     }
3413 
3414     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3415     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3416                                                                         /*dest_uninitialized*/true);
3417 
3418     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3419                                                               entry_jbyte_arraycopy,
3420                                                               entry_jshort_arraycopy,
3421                                                               entry_jint_arraycopy,
3422                                                               entry_jlong_arraycopy);
3423     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3424                                                                entry_jbyte_arraycopy,
3425                                                                entry_jshort_arraycopy,
3426                                                                entry_jint_arraycopy,
3427                                                                entry_oop_arraycopy,
3428                                                                entry_jlong_arraycopy,
3429                                                                entry_checkcast_arraycopy);
3430 
3431     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3432     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3433     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3434     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3435     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3436     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3437 
3438     // We don't generate specialized code for HeapWord-aligned source
3439     // arrays, so just use the code we've already generated
3440     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3441     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3442 
3443     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3444     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3445 
3446     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3447     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3448 
3449     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3450     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3451 
3452     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3453     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3454 
3455     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3456     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3457   }
3458 
3459   // AES intrinsic stubs
3460   enum {AESBlockSize = 16};
3461 
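  // The two mask tables below are pshufb control masks. Illustratively, with the
  // byte order laid down in memory by emit_data64:
  //   key_shuffle_mask     = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}       // byte-swaps each 32-bit word
  //   counter_shuffle_mask = {15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}          // reverses the whole 16-byte block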
3462   address generate_key_shuffle_mask() {
3463     __ align(16);
3464     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3465     address start = __ pc();
3466     __ emit_data64( 0x0405060700010203, relocInfo::none );
3467     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3468     return start;
3469   }
3470 
3471   address generate_counter_shuffle_mask() {
3472     __ align(16);
3473     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3474     address start = __ pc();
3475     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3476     __ emit_data64(0x0001020304050607, relocInfo::none);
3477     return start;
3478   }
3479 
3480   // Utility routine for loading a 128-bit key word in little-endian format.
3481   // The shuffle mask can optionally be supplied already loaded in an XMM register.
3482   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3483     __ movdqu(xmmdst, Address(key, offset));
3484     if (xmm_shuf_mask != NULL) {
3485       __ pshufb(xmmdst, xmm_shuf_mask);
3486     } else {
3487       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3488     }
3489   }
3490 
3491   // Utility routine for increasing the 128-bit counter (the IV in CTR mode)
3492   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3493     __ pextrq(reg, xmmdst, 0x0);
3494     __ addq(reg, inc_delta);
3495     __ pinsrq(xmmdst, reg, 0x0);
3496     __ jcc(Assembler::carryClear, next_block); // jump if no carry
3497     __ pextrq(reg, xmmdst, 0x01); // Carry
3498     __ addq(reg, 0x01);
3499     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3500     __ BIND(next_block);          // next instruction
3501   }
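  // Roughly equivalent to (the counter held as two 64-bit halves in xmmdst):
  //   lo += inc_delta;  if (no carry out of lo) goto next_block;  else hi += 1;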
3502 
3503   // Arguments:
3504   //
3505   // Inputs:
3506   //   c_rarg0   - source byte array address
3507   //   c_rarg1   - destination byte array address
3508   //   c_rarg2   - K (key) in little endian int array
3509   //
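  //  A 128/192/256-bit key expands to 11/13/15 round keys (44/52/60 ints in K),
  //  so the stub performs 10/12/14 rounds; illustratively:
  //    state = plaintext ^ rk[0];
  //    for (i = 1; i < rounds; i++)  state = aesenc(state, rk[i]);
  //    ciphertext = aesenclast(state, rk[rounds]);
  //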
3510   address generate_aescrypt_encryptBlock() {
3511     assert(UseAES, "need AES instructions and misaligned SSE support");
3512     __ align(CodeEntryAlignment);
3513     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3514     Label L_doLast;
3515     address start = __ pc();
3516 
3517     const Register from        = c_rarg0;  // source array address
3518     const Register to          = c_rarg1;  // destination array address
3519     const Register key         = c_rarg2;  // key array address
3520     const Register keylen      = rax;
3521 
3522     const XMMRegister xmm_result = xmm0;
3523     const XMMRegister xmm_key_shuf_mask = xmm1;
3524     // On win64 xmm6-xmm15 must be preserved so don't use them.
3525     const XMMRegister xmm_temp1  = xmm2;
3526     const XMMRegister xmm_temp2  = xmm3;
3527     const XMMRegister xmm_temp3  = xmm4;
3528     const XMMRegister xmm_temp4  = xmm5;
3529 
3530     __ enter(); // required for proper stackwalking of RuntimeStub frame
3531 
3532     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3533     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3534 
3535     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3536     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3537 
3538     // For encryption, the java expanded key ordering is just what we need
3539     // we don't know if the key is aligned, hence not using load-execute form
3540 
3541     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3542     __ pxor(xmm_result, xmm_temp1);
3543 
3544     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3545     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3546     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3547     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3548 
3549     __ aesenc(xmm_result, xmm_temp1);
3550     __ aesenc(xmm_result, xmm_temp2);
3551     __ aesenc(xmm_result, xmm_temp3);
3552     __ aesenc(xmm_result, xmm_temp4);
3553 
3554     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3555     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3556     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3557     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3558 
3559     __ aesenc(xmm_result, xmm_temp1);
3560     __ aesenc(xmm_result, xmm_temp2);
3561     __ aesenc(xmm_result, xmm_temp3);
3562     __ aesenc(xmm_result, xmm_temp4);
3563 
3564     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3565     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3566 
3567     __ cmpl(keylen, 44);
3568     __ jccb(Assembler::equal, L_doLast);
3569 
3570     __ aesenc(xmm_result, xmm_temp1);
3571     __ aesenc(xmm_result, xmm_temp2);
3572 
3573     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3574     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3575 
3576     __ cmpl(keylen, 52);
3577     __ jccb(Assembler::equal, L_doLast);
3578 
3579     __ aesenc(xmm_result, xmm_temp1);
3580     __ aesenc(xmm_result, xmm_temp2);
3581 
3582     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3583     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3584 
3585     __ BIND(L_doLast);
3586     __ aesenc(xmm_result, xmm_temp1);
3587     __ aesenclast(xmm_result, xmm_temp2);
3588     __ movdqu(Address(to, 0), xmm_result);        // store the result
3589     __ xorptr(rax, rax); // return 0
3590     __ leave(); // required for proper stackwalking of RuntimeStub frame
3591     __ ret(0);
3592 
3593     return start;
3594   }
3595 
3596 
3597   // Arguments:
3598   //
3599   // Inputs:
3600   //   c_rarg0   - source byte array address
3601   //   c_rarg1   - destination byte array address
3602   //   c_rarg2   - K (key) in little endian int array
3603   //
3604   address generate_aescrypt_decryptBlock() {
3605     assert(UseAES, "need AES instructions and misaligned SSE support");
3606     __ align(CodeEntryAlignment);
3607     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3608     Label L_doLast;
3609     address start = __ pc();
3610 
3611     const Register from        = c_rarg0;  // source array address
3612     const Register to          = c_rarg1;  // destination array address
3613     const Register key         = c_rarg2;  // key array address
3614     const Register keylen      = rax;
3615 
3616     const XMMRegister xmm_result = xmm0;
3617     const XMMRegister xmm_key_shuf_mask = xmm1;
3618     // On win64 xmm6-xmm15 must be preserved so don't use them.
3619     const XMMRegister xmm_temp1  = xmm2;
3620     const XMMRegister xmm_temp2  = xmm3;
3621     const XMMRegister xmm_temp3  = xmm4;
3622     const XMMRegister xmm_temp4  = xmm5;
3623 
3624     __ enter(); // required for proper stackwalking of RuntimeStub frame
3625 
3626     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3627     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3628 
3629     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3630     __ movdqu(xmm_result, Address(from, 0));
3631 
3632     // for decryption java expanded key ordering is rotated one position from what we want
3633     // so we start from 0x10 here and hit 0x00 last
3634     // we don't know if the key is aligned, hence not using load-execute form
3635     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3636     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3637     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3638     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3639 
3640     __ pxor  (xmm_result, xmm_temp1);
3641     __ aesdec(xmm_result, xmm_temp2);
3642     __ aesdec(xmm_result, xmm_temp3);
3643     __ aesdec(xmm_result, xmm_temp4);
3644 
3645     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3646     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3647     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3648     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3649 
3650     __ aesdec(xmm_result, xmm_temp1);
3651     __ aesdec(xmm_result, xmm_temp2);
3652     __ aesdec(xmm_result, xmm_temp3);
3653     __ aesdec(xmm_result, xmm_temp4);
3654 
3655     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3656     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3657     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3658 
3659     __ cmpl(keylen, 44);
3660     __ jccb(Assembler::equal, L_doLast);
3661 
3662     __ aesdec(xmm_result, xmm_temp1);
3663     __ aesdec(xmm_result, xmm_temp2);
3664 
3665     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3666     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3667 
3668     __ cmpl(keylen, 52);
3669     __ jccb(Assembler::equal, L_doLast);
3670 
3671     __ aesdec(xmm_result, xmm_temp1);
3672     __ aesdec(xmm_result, xmm_temp2);
3673 
3674     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3675     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3676 
3677     __ BIND(L_doLast);
3678     __ aesdec(xmm_result, xmm_temp1);
3679     __ aesdec(xmm_result, xmm_temp2);
3680 
3681     // for decryption the aesdeclast operation is always on key+0x00
3682     __ aesdeclast(xmm_result, xmm_temp3);
3683     __ movdqu(Address(to, 0), xmm_result);  // store the result
3684     __ xorptr(rax, rax); // return 0
3685     __ leave(); // required for proper stackwalking of RuntimeStub frame
3686     __ ret(0);
3687 
3688     return start;
3689   }
3690 
3691 
3692   // Arguments:
3693   //
3694   // Inputs:
3695   //   c_rarg0   - source byte array address
3696   //   c_rarg1   - destination byte array address
3697   //   c_rarg2   - K (key) in little endian int array
3698   //   c_rarg3   - r vector byte array address
3699   //   c_rarg4   - input length
3700   //
3701   // Output:
3702   //   rax       - input length
3703   //
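  //  CBC encryption, illustratively (the length must be a multiple of 16):
  //    r = rvec;
  //    for (each 16-byte block P of the input) { r = AES_encrypt(P ^ r, key);  store r to the output; }
  //    rvec = r;   return len;
  //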
3704   address generate_cipherBlockChaining_encryptAESCrypt() {
3705     assert(UseAES, "need AES instructions and misaligned SSE support");
3706     __ align(CodeEntryAlignment);
3707     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3708     address start = __ pc();
3709 
3710     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3711     const Register from        = c_rarg0;  // source array address
3712     const Register to          = c_rarg1;  // destination array address
3713     const Register key         = c_rarg2;  // key array address
3714     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3715                                            // and left with the results of the last encryption block
3716 #ifndef _WIN64
3717     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3718 #else
3719     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3720     const Register len_reg     = r11;      // pick the volatile windows register
3721 #endif
3722     const Register pos         = rax;
3723 
3724     // xmm register assignments for the loops below
3725     const XMMRegister xmm_result = xmm0;
3726     const XMMRegister xmm_temp   = xmm1;
3727     // keys 0-10 preloaded into xmm2-xmm12
3728     const int XMM_REG_NUM_KEY_FIRST = 2;
3729     const int XMM_REG_NUM_KEY_LAST  = 15;
3730     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3731     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3732     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3733     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3734     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3735 
3736     __ enter(); // required for proper stackwalking of RuntimeStub frame
3737 
3738 #ifdef _WIN64
3739     // on win64, fill len_reg from stack position
3740     __ movl(len_reg, len_mem);
3741 #else
3742     __ push(len_reg); // Save
3743 #endif
3744 
3745     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3746     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3747     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3748     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3749       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3750       offset += 0x10;
3751     }
3752     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3753 
3754     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3755     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3756     __ cmpl(rax, 44);
3757     __ jcc(Assembler::notEqual, L_key_192_256);
3758 
3759     // 128 bit code follows here
3760     __ movptr(pos, 0);
3761     __ align(OptoLoopAlignment);
3762 
3763     __ BIND(L_loopTop_128);
3764     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3765     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3766     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3767     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3768       __ aesenc(xmm_result, as_XMMRegister(rnum));
3769     }
3770     __ aesenclast(xmm_result, xmm_key10);
3771     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3772     // no need to store r to memory until we exit
3773     __ addptr(pos, AESBlockSize);
3774     __ subptr(len_reg, AESBlockSize);
3775     __ jcc(Assembler::notEqual, L_loopTop_128);
3776 
3777     __ BIND(L_exit);
3778     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3779 
3780 #ifdef _WIN64
3781     __ movl(rax, len_mem);
3782 #else
3783     __ pop(rax); // return length
3784 #endif
3785     __ leave(); // required for proper stackwalking of RuntimeStub frame
3786     __ ret(0);
3787 
3788     __ BIND(L_key_192_256);
3789     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3790     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3791     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3792     __ cmpl(rax, 52);
3793     __ jcc(Assembler::notEqual, L_key_256);
3794 
3795     // 192-bit code follows here (could be changed to use more xmm registers)
3796     __ movptr(pos, 0);
3797     __ align(OptoLoopAlignment);
3798 
3799     __ BIND(L_loopTop_192);
3800     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3801     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3802     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3803     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3804       __ aesenc(xmm_result, as_XMMRegister(rnum));
3805     }
3806     __ aesenclast(xmm_result, xmm_key12);
3807     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3808     // no need to store r to memory until we exit
3809     __ addptr(pos, AESBlockSize);
3810     __ subptr(len_reg, AESBlockSize);
3811     __ jcc(Assembler::notEqual, L_loopTop_192);
3812     __ jmp(L_exit);
3813 
3814     __ BIND(L_key_256);
3815     // 256-bit code follows here (could be changed to use more xmm registers)
3816     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3817     __ movptr(pos, 0);
3818     __ align(OptoLoopAlignment);
3819 
3820     __ BIND(L_loopTop_256);
3821     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3822     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3823     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3824     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3825       __ aesenc(xmm_result, as_XMMRegister(rnum));
3826     }
3827     load_key(xmm_temp, key, 0xe0);
3828     __ aesenclast(xmm_result, xmm_temp);
3829     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3830     // no need to store r to memory until we exit
3831     __ addptr(pos, AESBlockSize);
3832     __ subptr(len_reg, AESBlockSize);
3833     __ jcc(Assembler::notEqual, L_loopTop_256);
3834     __ jmp(L_exit);
3835 
3836     return start;
3837   }
3838 
3839   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3840   // to hide instruction latency
3841   //
3842   // Arguments:
3843   //
3844   // Inputs:
3845   //   c_rarg0   - source byte array address
3846   //   c_rarg1   - destination byte array address
3847   //   c_rarg2   - K (key) in little endian int array
3848   //   c_rarg3   - r vector byte array address
3849   //   c_rarg4   - input length
3850   //
3851   // Output:
3852   //   rax       - input length
3853   //
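  //  CBC decryption, illustratively:
  //    prev = rvec;
  //    for (each 16-byte block C of the input) { store AES_decrypt(C, key) ^ prev;  prev = C; }
  //    rvec = prev;   return len;
  //  The stub below decrypts PARALLEL_FACTOR (4) blocks per iteration to hide
  //  aesdec latency and falls back to a single-block loop for the tail.
  //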
3854   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3855     assert(UseAES, "need AES instructions and misaligned SSE support");
3856     __ align(CodeEntryAlignment);
3857     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3858     address start = __ pc();
3859 
3860     const Register from        = c_rarg0;  // source array address
3861     const Register to          = c_rarg1;  // destination array address
3862     const Register key         = c_rarg2;  // key array address
3863     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3864                                            // and left with the results of the last encryption block
3865 #ifndef _WIN64
3866     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3867 #else
3868     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3869     const Register len_reg     = r11;      // pick the volatile windows register
3870 #endif
3871     const Register pos         = rax;
3872 
3873     const int PARALLEL_FACTOR = 4;
3874     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3875 
3876     Label L_exit;
3877     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3878     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3879     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3880     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3881     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3882 
3883     // keys 0-10 preloaded into xmm5-xmm15
3884     const int XMM_REG_NUM_KEY_FIRST = 5;
3885     const int XMM_REG_NUM_KEY_LAST  = 15;
3886     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3887     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3888 
3889     __ enter(); // required for proper stackwalking of RuntimeStub frame
3890 
3891 #ifdef _WIN64
3892     // on win64, fill len_reg from stack position
3893     __ movl(len_reg, len_mem);
3894 #else
3895     __ push(len_reg); // Save
3896 #endif
3897     __ push(rbx);
3898     // the java expanded key ordering is rotated one position from what we want
3899     // so we start from 0x10 here and hit 0x00 last
3900     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3901     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3902     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3903     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3904       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3905       offset += 0x10;
3906     }
3907     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3908 
3909     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3910 
3911     // registers holding the four results in the parallelized loop
3912     const XMMRegister xmm_result0 = xmm0;
3913     const XMMRegister xmm_result1 = xmm2;
3914     const XMMRegister xmm_result2 = xmm3;
3915     const XMMRegister xmm_result3 = xmm4;
3916 
3917     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3918 
3919     __ xorptr(pos, pos);
3920 
3921     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3922     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3923     __ cmpl(rbx, 52);
3924     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3925     __ cmpl(rbx, 60);
3926     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3927 
3928 #define DoFour(opc, src_reg)           \
3929   __ opc(xmm_result0, src_reg);         \
3930   __ opc(xmm_result1, src_reg);         \
3931   __ opc(xmm_result2, src_reg);         \
3932   __ opc(xmm_result3, src_reg);         \
3933 
3934     for (int k = 0; k < 3; ++k) {
3935       __ BIND(L_multiBlock_loopTopHead[k]);
3936       if (k != 0) {
3937         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3938         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3939       }
3940       if (k == 1) {
3941         __ subptr(rsp, 6 * wordSize);
3942         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3943         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3944         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3945         load_key(xmm1, key, 0xc0);  // 0xc0;
3946         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3947       } else if (k == 2) {
3948         __ subptr(rsp, 10 * wordSize);
3949         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3950         load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3951         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3952         load_key(xmm1, key, 0xe0);  // 0xe0;
3953         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3954         load_key(xmm15, key, 0xb0); // 0xb0;
3955         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3956         load_key(xmm1, key, 0xc0);  // 0xc0;
3957         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3958       }
3959       __ align(OptoLoopAlignment);
3960       __ BIND(L_multiBlock_loopTop[k]);
3961       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3962       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3963 
3964       if  (k != 0) {
3965         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3966         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
3967       }
3968 
3969       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmm_result registers
3970       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3971       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3972       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3973 
3974       DoFour(pxor, xmm_key_first);
3975       if (k == 0) {
3976         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
3977           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3978         }
3979         DoFour(aesdeclast, xmm_key_last);
3980       } else if (k == 1) {
3981         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
3982           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3983         }
3984         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3985         DoFour(aesdec, xmm1);  // key : 0xc0
3986         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3987         DoFour(aesdeclast, xmm_key_last);
3988       } else if (k == 2) {
3989         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
3990           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3991         }
3992         DoFour(aesdec, xmm1);  // key : 0xc0
3993         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
3994         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
3995         DoFour(aesdec, xmm15);  // key : 0xd0
3996         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3997         DoFour(aesdec, xmm1);  // key : 0xe0
3998         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3999         DoFour(aesdeclast, xmm_key_last);
4000       }
4001 
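           // CBC decryption: P[i] = InvCipher(C[i]) xor C[i-1], with C[-1] = rvec (the IV).
           // Each decrypted block is therefore xored with the ciphertext block that precedes it,
           // and the last ciphertext block of this group becomes the chaining value for the next.
           // On the 192/256-bit paths xmm1 doubles as a key temporary, so the chaining value is
           // also spilled back to rvec below and reloaded from there inside the loop.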
4002       // for each result, xor with the r vector of previous cipher block
4003       __ pxor(xmm_result0, xmm_prev_block_cipher);
4004       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4005       __ pxor(xmm_result1, xmm_prev_block_cipher);
4006       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4007       __ pxor(xmm_result2, xmm_prev_block_cipher);
4008       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4009       __ pxor(xmm_result3, xmm_prev_block_cipher);
4010       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
4011       if (k != 0) {
4012         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
4013       }
4014 
4015       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
4016       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4017       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4018       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4019 
4020       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
4021       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
4022       __ jmp(L_multiBlock_loopTop[k]);
4023 
4024       // registers used in the non-parallelized loops
4025       // xmm register assignments for the loops below
4026       const XMMRegister xmm_result = xmm0;
4027       const XMMRegister xmm_prev_block_cipher_save = xmm2;
4028       const XMMRegister xmm_key11 = xmm3;
4029       const XMMRegister xmm_key12 = xmm4;
4030       const XMMRegister key_tmp = xmm4;
4031 
4032       __ BIND(L_singleBlock_loopTopHead[k]);
4033       if (k == 1) {
4034         __ addptr(rsp, 6 * wordSize);
4035       } else if (k == 2) {
4036         __ addptr(rsp, 10 * wordSize);
4037       }
4038       __ cmpptr(len_reg, 0); // any blocks left??
4039       __ jcc(Assembler::equal, L_exit);
4040       __ BIND(L_singleBlock_loopTopHead2[k]);
4041       if (k == 1) {
4042         load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
4043         load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
4044       }
4045       if (k == 2) {
4046         load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
4047       }
4048       __ align(OptoLoopAlignment);
4049       __ BIND(L_singleBlock_loopTop[k]);
4050       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
4051       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
4052       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
4053       for (int rnum = 1; rnum <= 9 ; rnum++) {
4054           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4055       }
4056       if (k == 1) {
4057         __ aesdec(xmm_result, xmm_key11);
4058         __ aesdec(xmm_result, xmm_key12);
4059       }
4060       if (k == 2) {
4061         __ aesdec(xmm_result, xmm_key11);
4062         load_key(key_tmp, key, 0xc0);
4063         __ aesdec(xmm_result, key_tmp);
4064         load_key(key_tmp, key, 0xd0);
4065         __ aesdec(xmm_result, key_tmp);
4066         load_key(key_tmp, key, 0xe0);
4067         __ aesdec(xmm_result, key_tmp);
4068       }
4069 
4070       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
4071       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
4072       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
4073       // no need to store r to memory until we exit
4074       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
4075       __ addptr(pos, AESBlockSize);
4076       __ subptr(len_reg, AESBlockSize);
4077       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
4078       if (k != 2) {
4079         __ jmp(L_exit);
4080       }
4081     } //for 128/192/256
4082 
4083     __ BIND(L_exit);
4084     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
4085     __ pop(rbx);
4086 #ifdef _WIN64
4087     __ movl(rax, len_mem);
4088 #else
4089     __ pop(rax); // return length
4090 #endif
4091     __ leave(); // required for proper stackwalking of RuntimeStub frame
4092     __ ret(0);
4093     return start;
4094   }
4095 
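       // In ECB mode each 16-byte block is encrypted (or decrypted) independently with the
       // same expanded key, so unlike the CBC and CTR stubs there is no chaining state or
       // counter to carry between blocks; the two stubs below simply delegate to the
       // MacroAssembler implementations.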
4096   address generate_electronicCodeBook_encryptAESCrypt() {
4097     __ align(CodeEntryAlignment);
4098     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_encryptAESCrypt");
4099     address start = __ pc();
4100     const Register from = c_rarg0;  // source array address
4101     const Register to = c_rarg1;  // destination array address
4102     const Register key = c_rarg2;  // key array address
4103     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4104     __ enter(); // required for proper stackwalking of RuntimeStub frame
4105     __ aesecb_encrypt(from, to, key, len);
4106     __ vzeroupper();
4107     __ leave(); // required for proper stackwalking of RuntimeStub frame
4108     __ ret(0);
4109     return start;
4110   }
4111 
4112   address generate_electronicCodeBook_decryptAESCrypt() {
4113     __ align(CodeEntryAlignment);
4114     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_decryptAESCrypt");
4115     address start = __ pc();
4116     const Register from = c_rarg0;  // source array address
4117     const Register to = c_rarg1;  // destination array address
4118     const Register key = c_rarg2;  // key array address
4119     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4120     __ enter(); // required for proper stackwalking of RuntimeStub frame
4121     __ aesecb_decrypt(from, to, key, len);
4122     __ vzeroupper();
4123     __ leave(); // required for proper stackwalking of RuntimeStub frame
4124     __ ret(0);
4125     return start;
4126   }
4127 
4128   // ofs and limit are used for multi-block byte arrays.
4129   // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
4130   address generate_md5_implCompress(bool multi_block, const char *name) {
4131     __ align(CodeEntryAlignment);
4132     StubCodeMark mark(this, "StubRoutines", name);
4133     address start = __ pc();
4134 
4135     const Register buf_param = r15;
4136     const Address state_param(rsp, 0 * wordSize);
4137     const Address ofs_param  (rsp, 1 * wordSize    );
4138     const Address limit_param(rsp, 1 * wordSize + 4);
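         // Stack layout of the two spill slots allocated below (wordSize == 8): the state
         // pointer is kept at [rsp + 0], while ofs and limit are 32-bit values packed into
         // the second slot at [rsp + 8] and [rsp + 12].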
4139 
4140     __ enter();
4141     __ push(rbx);
4142     __ push(rdi);
4143     __ push(rsi);
4144     __ push(r15);
4145     __ subptr(rsp, 2 * wordSize);
4146 
4147     __ movptr(buf_param, c_rarg0);
4148     __ movptr(state_param, c_rarg1);
4149     if (multi_block) {
4150       __ movl(ofs_param, c_rarg2);
4151       __ movl(limit_param, c_rarg3);
4152     }
4153     __ fast_md5(buf_param, state_param, ofs_param, limit_param, multi_block);
4154 
4155     __ addptr(rsp, 2 * wordSize);
4156     __ pop(r15);
4157     __ pop(rsi);
4158     __ pop(rdi);
4159     __ pop(rbx);
4160     __ leave();
4161     __ ret(0);
4162     return start;
4163   }
4164 
4165   address generate_upper_word_mask() {
4166     __ align64();
4167     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
4168     address start = __ pc();
4169     __ emit_data64(0x0000000000000000, relocInfo::none);
4170     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
4171     return start;
4172   }
4173 
4174   address generate_shuffle_byte_flip_mask() {
4175     __ align64();
4176     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
4177     address start = __ pc();
4178     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4179     __ emit_data64(0x0001020304050607, relocInfo::none);
4180     return start;
4181   }
4182 
4183   // ofs and limit are used for multi-block byte arrays.
4184   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4185   address generate_sha1_implCompress(bool multi_block, const char *name) {
4186     __ align(CodeEntryAlignment);
4187     StubCodeMark mark(this, "StubRoutines", name);
4188     address start = __ pc();
4189 
4190     Register buf = c_rarg0;
4191     Register state = c_rarg1;
4192     Register ofs = c_rarg2;
4193     Register limit = c_rarg3;
4194 
4195     const XMMRegister abcd = xmm0;
4196     const XMMRegister e0 = xmm1;
4197     const XMMRegister e1 = xmm2;
4198     const XMMRegister msg0 = xmm3;
4199 
4200     const XMMRegister msg1 = xmm4;
4201     const XMMRegister msg2 = xmm5;
4202     const XMMRegister msg3 = xmm6;
4203     const XMMRegister shuf_mask = xmm7;
4204 
4205     __ enter();
4206 
4207     __ subptr(rsp, 4 * wordSize);
4208 
4209     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
4210       buf, state, ofs, limit, rsp, multi_block);
4211 
4212     __ addptr(rsp, 4 * wordSize);
4213 
4214     __ leave();
4215     __ ret(0);
4216     return start;
4217   }
4218 
4219   address generate_pshuffle_byte_flip_mask() {
4220     __ align64();
4221     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
4222     address start = __ pc();
4223     __ emit_data64(0x0405060700010203, relocInfo::none);
4224     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4225 
4226     if (VM_Version::supports_avx2()) {
4227       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
4228       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4229       // _SHUF_00BA
4230       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4231       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4232       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4233       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4234       // _SHUF_DC00
4235       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4236       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4237       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4238       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4239     }
4240 
4241     return start;
4242   }
4243 
4244   // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
4245   address generate_pshuffle_byte_flip_mask_sha512() {
4246     __ align32();
4247     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
4248     address start = __ pc();
4249     if (VM_Version::supports_avx2()) {
4250       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
4251       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4252       __ emit_data64(0x1011121314151617, relocInfo::none);
4253       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
4254       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
4255       __ emit_data64(0x0000000000000000, relocInfo::none);
4256       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4257       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4258     }
4259 
4260     return start;
4261   }
4262 
4263   // ofs and limit are used for multi-block byte arrays.
4264   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4265   address generate_sha256_implCompress(bool multi_block, const char *name) {
4266     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
4267     __ align(CodeEntryAlignment);
4268     StubCodeMark mark(this, "StubRoutines", name);
4269     address start = __ pc();
4270 
4271     Register buf = c_rarg0;
4272     Register state = c_rarg1;
4273     Register ofs = c_rarg2;
4274     Register limit = c_rarg3;
4275 
4276     const XMMRegister msg = xmm0;
4277     const XMMRegister state0 = xmm1;
4278     const XMMRegister state1 = xmm2;
4279     const XMMRegister msgtmp0 = xmm3;
4280 
4281     const XMMRegister msgtmp1 = xmm4;
4282     const XMMRegister msgtmp2 = xmm5;
4283     const XMMRegister msgtmp3 = xmm6;
4284     const XMMRegister msgtmp4 = xmm7;
4285 
4286     const XMMRegister shuf_mask = xmm8;
4287 
4288     __ enter();
4289 
4290     __ subptr(rsp, 4 * wordSize);
4291 
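         // Prefer the dedicated SHA extension instructions when they are available and fall
         // back to the AVX2 implementation otherwise; the assert at the top of this stub
         // guarantees that at least one of the two paths applies.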
4292     if (VM_Version::supports_sha()) {
4293       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4294         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4295     } else if (VM_Version::supports_avx2()) {
4296       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4297         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4298     }
4299     __ addptr(rsp, 4 * wordSize);
4300     __ vzeroupper();
4301     __ leave();
4302     __ ret(0);
4303     return start;
4304   }
4305 
4306   address generate_sha512_implCompress(bool multi_block, const char *name) {
4307     assert(VM_Version::supports_avx2(), "");
4308     assert(VM_Version::supports_bmi2(), "");
4309     __ align(CodeEntryAlignment);
4310     StubCodeMark mark(this, "StubRoutines", name);
4311     address start = __ pc();
4312 
4313     Register buf = c_rarg0;
4314     Register state = c_rarg1;
4315     Register ofs = c_rarg2;
4316     Register limit = c_rarg3;
4317 
4318     const XMMRegister msg = xmm0;
4319     const XMMRegister state0 = xmm1;
4320     const XMMRegister state1 = xmm2;
4321     const XMMRegister msgtmp0 = xmm3;
4322     const XMMRegister msgtmp1 = xmm4;
4323     const XMMRegister msgtmp2 = xmm5;
4324     const XMMRegister msgtmp3 = xmm6;
4325     const XMMRegister msgtmp4 = xmm7;
4326 
4327     const XMMRegister shuf_mask = xmm8;
4328 
4329     __ enter();
4330 
4331     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4332     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4333 
4334     __ vzeroupper();
4335     __ leave();
4336     __ ret(0);
4337     return start;
4338   }
4339 
4340   address ghash_polynomial512_addr() {
4341     __ align(CodeEntryAlignment);
4342     StubCodeMark mark(this, "StubRoutines", "_ghash_poly512_addr");
4343     address start = __ pc();
4344     __ emit_data64(0x00000001C2000000, relocInfo::none); // POLY for reduction
4345     __ emit_data64(0xC200000000000000, relocInfo::none);
4346     __ emit_data64(0x00000001C2000000, relocInfo::none);
4347     __ emit_data64(0xC200000000000000, relocInfo::none);
4348     __ emit_data64(0x00000001C2000000, relocInfo::none);
4349     __ emit_data64(0xC200000000000000, relocInfo::none);
4350     __ emit_data64(0x00000001C2000000, relocInfo::none);
4351     __ emit_data64(0xC200000000000000, relocInfo::none);
4352     __ emit_data64(0x0000000000000001, relocInfo::none); // POLY
4353     __ emit_data64(0xC200000000000000, relocInfo::none);
4354     __ emit_data64(0x0000000000000001, relocInfo::none); // TWOONE
4355     __ emit_data64(0x0000000100000000, relocInfo::none);
4356     return start;
4357   }
4358 
4359   // Vector AES Galois Counter Mode implementation. Parameters:
4360   // Windows regs            |  Linux regs
4361   // in = c_rarg0 (rcx)      |  c_rarg0 (rsi)
4362   // len = c_rarg1 (rdx)     |  c_rarg1 (rdi)
4363   // ct = c_rarg2 (r8)       |  c_rarg2 (rdx)
4364   // out = c_rarg3 (r9)      |  c_rarg3 (rcx)
4365   // key = r10               |  c_rarg4 (r8)
4366   // state = r13             |  c_rarg5 (r9)
4367   // subkeyHtbl = r14        |  r11
4368   // counter = rsi           |  r12
4369   // return - number of processed bytes
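       // GCM combines CTR-mode encryption with GHASH authentication; the bulk of the work
       // (running the AES-CTR keystream over the input and folding the ciphertext blocks,
       // passed in via ct, into the GHASH state) is done by MacroAssembler::aesgcm_encrypt below.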
4370   address generate_galoisCounterMode_AESCrypt() {
4371     __ align(CodeEntryAlignment);
4372     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
4373     address start = __ pc();
4374     const Register in = c_rarg0;
4375     const Register len = c_rarg1;
4376     const Register ct = c_rarg2;
4377     const Register out = c_rarg3;
4378     // and updated with the incremented counter in the end
4379 #ifndef _WIN64
4380     const Register key = c_rarg4;
4381     const Register state = c_rarg5;
4382     const Address subkeyH_mem(rbp, 2 * wordSize);
4383     const Register subkeyHtbl = r11;
4384     const Register avx512_subkeyHtbl = r13;
4385     const Address counter_mem(rbp, 3 * wordSize);
4386     const Register counter = r12;
4387 #else
4388     const Address key_mem(rbp, 6 * wordSize);
4389     const Register key = r10;
4390     const Address state_mem(rbp, 7 * wordSize);
4391     const Register state = r13;
4392     const Address subkeyH_mem(rbp, 8 * wordSize);
4393     const Register subkeyHtbl = r14;
4394     const Register avx512_subkeyHtbl = r12;
4395     const Address counter_mem(rbp, 9 * wordSize);
4396     const Register counter = rsi;
4397 #endif
4398     __ enter();
4399    // Save state before entering routine
4400     __ push(r12);
4401     __ push(r13);
4402     __ push(r14);
4403     __ push(r15);
4404     __ push(rbx);
4405 #ifdef _WIN64
4406     // on win64, save rsi and fill key/state from their stack positions
4407     __ push(rsi);
4408     __ movptr(key, key_mem);
4409     __ movptr(state, state_mem);
4410 #endif
4411     __ movptr(subkeyHtbl, subkeyH_mem);
4412     __ movptr(counter, counter_mem);
4413     // Save rbp and rsp
4414     __ push(rbp);
4415     __ movq(rbp, rsp);
4416     // Align the stack to a 64-byte boundary
4417     __ andq(rsp, -64);
4418     __ subptr(rsp, 96 * longSize); // Create space on the stack for htbl entries
4419     __ movptr(avx512_subkeyHtbl, rsp);
4420 
4421     __ aesgcm_encrypt(in, len, ct, out, key, state, subkeyHtbl, avx512_subkeyHtbl, counter);
4422     __ vzeroupper();
4423 
4424     __ movq(rsp, rbp);
4425     __ pop(rbp);
4426 
4427     // Restore state before leaving routine
4428 #ifdef _WIN64
4429     __ pop(rsi);
4430 #endif
4431     __ pop(rbx);
4432     __ pop(r15);
4433     __ pop(r14);
4434     __ pop(r13);
4435     __ pop(r12);
4436 
4437     __ leave(); // required for proper stackwalking of RuntimeStub frame
4438     __ ret(0);
4439     return start;
4440   }
4441 
4442   // This table holds a lane byte-swap mask followed by the counter increment values (linc0, linc4, etc.)
4443   address counter_mask_addr() {
4444     __ align64();
4445     StubCodeMark mark(this, "StubRoutines", "counter_mask_addr");
4446     address start = __ pc();
4447     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);//lbswapmask
4448     __ emit_data64(0x0001020304050607, relocInfo::none);
4449     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4450     __ emit_data64(0x0001020304050607, relocInfo::none);
4451     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4452     __ emit_data64(0x0001020304050607, relocInfo::none);
4453     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4454     __ emit_data64(0x0001020304050607, relocInfo::none);
4455     __ emit_data64(0x0000000000000000, relocInfo::none);//linc0 = counter_mask_addr+64
4456     __ emit_data64(0x0000000000000000, relocInfo::none);
4457     __ emit_data64(0x0000000000000001, relocInfo::none);//counter_mask_addr() + 80
4458     __ emit_data64(0x0000000000000000, relocInfo::none);
4459     __ emit_data64(0x0000000000000002, relocInfo::none);
4460     __ emit_data64(0x0000000000000000, relocInfo::none);
4461     __ emit_data64(0x0000000000000003, relocInfo::none);
4462     __ emit_data64(0x0000000000000000, relocInfo::none);
4463     __ emit_data64(0x0000000000000004, relocInfo::none);//linc4 = counter_mask_addr() + 128
4464     __ emit_data64(0x0000000000000000, relocInfo::none);
4465     __ emit_data64(0x0000000000000004, relocInfo::none);
4466     __ emit_data64(0x0000000000000000, relocInfo::none);
4467     __ emit_data64(0x0000000000000004, relocInfo::none);
4468     __ emit_data64(0x0000000000000000, relocInfo::none);
4469     __ emit_data64(0x0000000000000004, relocInfo::none);
4470     __ emit_data64(0x0000000000000000, relocInfo::none);
4471     __ emit_data64(0x0000000000000008, relocInfo::none);//linc8 = counter_mask_addr() + 192
4472     __ emit_data64(0x0000000000000000, relocInfo::none);
4473     __ emit_data64(0x0000000000000008, relocInfo::none);
4474     __ emit_data64(0x0000000000000000, relocInfo::none);
4475     __ emit_data64(0x0000000000000008, relocInfo::none);
4476     __ emit_data64(0x0000000000000000, relocInfo::none);
4477     __ emit_data64(0x0000000000000008, relocInfo::none);
4478     __ emit_data64(0x0000000000000000, relocInfo::none);
4479     __ emit_data64(0x0000000000000020, relocInfo::none);//linc32 = counter_mask_addr() + 256
4480     __ emit_data64(0x0000000000000000, relocInfo::none);
4481     __ emit_data64(0x0000000000000020, relocInfo::none);
4482     __ emit_data64(0x0000000000000000, relocInfo::none);
4483     __ emit_data64(0x0000000000000020, relocInfo::none);
4484     __ emit_data64(0x0000000000000000, relocInfo::none);
4485     __ emit_data64(0x0000000000000020, relocInfo::none);
4486     __ emit_data64(0x0000000000000000, relocInfo::none);
4487     __ emit_data64(0x0000000000000010, relocInfo::none);//linc16 = counter_mask_addr() + 320
4488     __ emit_data64(0x0000000000000000, relocInfo::none);
4489     __ emit_data64(0x0000000000000010, relocInfo::none);
4490     __ emit_data64(0x0000000000000000, relocInfo::none);
4491     __ emit_data64(0x0000000000000010, relocInfo::none);
4492     __ emit_data64(0x0000000000000000, relocInfo::none);
4493     __ emit_data64(0x0000000000000010, relocInfo::none);
4494     __ emit_data64(0x0000000000000000, relocInfo::none);
4495     return start;
4496   }
4497 
4498  // Vector AES Counter implementation
4499   address generate_counterMode_VectorAESCrypt()  {
4500     __ align(CodeEntryAlignment);
4501     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4502     address start = __ pc();
4503     const Register from = c_rarg0; // source array address
4504     const Register to = c_rarg1; // destination array address
4505     const Register key = c_rarg2; // key array address r8
4506     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4507     // and updated with the incremented counter in the end
4508 #ifndef _WIN64
4509     const Register len_reg = c_rarg4;
4510     const Register saved_encCounter_start = c_rarg5;
4511     const Register used_addr = r10;
4512     const Address  used_mem(rbp, 2 * wordSize);
4513     const Register used = r11;
4514 #else
4515     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4516     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
4517     const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4518     const Register len_reg = r10; // pick the first volatile windows register
4519     const Register saved_encCounter_start = r11;
4520     const Register used_addr = r13;
4521     const Register used = r14;
4522 #endif
4523     __ enter();
4524    // Save state before entering routine
4525     __ push(r12);
4526     __ push(r13);
4527     __ push(r14);
4528     __ push(r15);
4529 #ifdef _WIN64
4530     // on win64, fill len_reg from stack position
4531     __ movl(len_reg, len_mem);
4532     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4533     __ movptr(used_addr, used_mem);
4534     __ movl(used, Address(used_addr, 0));
4535 #else
4536     __ push(len_reg); // Save
4537     __ movptr(used_addr, used_mem);
4538     __ movl(used, Address(used_addr, 0));
4539 #endif
4540     __ push(rbx);
4541     __ aesctr_encrypt(from, to, key, counter, len_reg, used, used_addr, saved_encCounter_start);
4542     __ vzeroupper();
4543     // Restore state before leaving routine
4544     __ pop(rbx);
4545 #ifdef _WIN64
4546     __ movl(rax, len_mem); // return length
4547 #else
4548     __ pop(rax); // return length
4549 #endif
4550     __ pop(r15);
4551     __ pop(r14);
4552     __ pop(r13);
4553     __ pop(r12);
4554 
4555     __ leave(); // required for proper stackwalking of RuntimeStub frame
4556     __ ret(0);
4557     return start;
4558   }
4559 
4560   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
4561   // to hide instruction latency
4562   //
4563   // Arguments:
4564   //
4565   // Inputs:
4566   //   c_rarg0   - source byte array address
4567   //   c_rarg1   - destination byte array address
4568   //   c_rarg2   - K (key) in little endian int array
4569   //   c_rarg3   - counter vector byte array address
4570   //   Linux
4571   //     c_rarg4   -          input length
4572   //     c_rarg5   -          saved encryptedCounter start
4573   //     rbp + 6 * wordSize - saved used length
4574   //   Windows
4575   //     rbp + 6 * wordSize - input length
4576   //     rbp + 7 * wordSize - saved encryptedCounter start
4577   //     rbp + 8 * wordSize - saved used length
4578   //
4579   // Output:
4580   //   rax       - input length
4581   //
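       //
       // In CTR mode the keystream is produced by encrypting successive counter values:
       //   out[i] = in[i] xor AES_K(counter + i)
       // so encryption and decryption are the same operation. A partially consumed keystream
       // block is kept in the saved encryptedCounter buffer together with the used length,
       // so a following call can continue mid-block.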
4582   address generate_counterMode_AESCrypt_Parallel() {
4583     assert(UseAES, "need AES instructions and misaligned SSE support");
4584     __ align(CodeEntryAlignment);
4585     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4586     address start = __ pc();
4587     const Register from = c_rarg0; // source array address
4588     const Register to = c_rarg1; // destination array address
4589     const Register key = c_rarg2; // key array address
4590     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4591                                       // and updated with the incremented counter in the end
4592 #ifndef _WIN64
4593     const Register len_reg = c_rarg4;
4594     const Register saved_encCounter_start = c_rarg5;
4595     const Register used_addr = r10;
4596     const Address  used_mem(rbp, 2 * wordSize);
4597     const Register used = r11;
4598 #else
4599     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4600     const Address saved_encCounter_mem(rbp, 7 * wordSize); // length is on stack on Win64
4601     const Address used_mem(rbp, 8 * wordSize); // length is on stack on Win64
4602     const Register len_reg = r10; // pick the first volatile windows register
4603     const Register saved_encCounter_start = r11;
4604     const Register used_addr = r13;
4605     const Register used = r14;
4606 #endif
4607     const Register pos = rax;
4608 
4609     const int PARALLEL_FACTOR = 6;
4610     const XMMRegister xmm_counter_shuf_mask = xmm0;
4611     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
4612     const XMMRegister xmm_curr_counter = xmm2;
4613 
4614     const XMMRegister xmm_key_tmp0 = xmm3;
4615     const XMMRegister xmm_key_tmp1 = xmm4;
4616 
4617     // registers holding the four results in the parallelized loop
4618     const XMMRegister xmm_result0 = xmm5;
4619     const XMMRegister xmm_result1 = xmm6;
4620     const XMMRegister xmm_result2 = xmm7;
4621     const XMMRegister xmm_result3 = xmm8;
4622     const XMMRegister xmm_result4 = xmm9;
4623     const XMMRegister xmm_result5 = xmm10;
4624 
4625     const XMMRegister xmm_from0 = xmm11;
4626     const XMMRegister xmm_from1 = xmm12;
4627     const XMMRegister xmm_from2 = xmm13;
4628     const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on WIN64.
4629     const XMMRegister xmm_from4 = xmm3;  // reuse xmm3 and xmm4, since xmm_key_tmp0/1 are no longer needed once the input text is loaded
4630     const XMMRegister xmm_from5 = xmm4;
4631 
4632     //for key_128, key_192, key_256
4633     const int rounds[3] = {10, 12, 14};
4634     Label L_exit_preLoop, L_preLoop_start;
4635     Label L_multiBlock_loopTop[3];
4636     Label L_singleBlockLoopTop[3];
4637     Label L__incCounter[3][6]; //for 6 blocks
4638     Label L__incCounter_single[3]; //for single block, key128, key192, key256
4639     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
4640     Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
4641 
4642     Label L_exit;
4643 
4644     __ enter(); // required for proper stackwalking of RuntimeStub frame
4645 
4646 #ifdef _WIN64
4647     // allocate spill slots for r13, r14
4648     enum {
4649         saved_r13_offset,
4650         saved_r14_offset
4651     };
4652     __ subptr(rsp, 2 * wordSize);
4653     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
4654     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
4655 
4656     // on win64, fill len_reg from stack position
4657     __ movl(len_reg, len_mem);
4658     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4659     __ movptr(used_addr, used_mem);
4660     __ movl(used, Address(used_addr, 0));
4661 #else
4662     __ push(len_reg); // Save
4663     __ movptr(used_addr, used_mem);
4664     __ movl(used, Address(used_addr, 0));
4665 #endif
4666 
4667     __ push(rbx); // Save RBX
4668     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
4669     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
4670     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
4671     __ movptr(pos, 0);
4672 
4673     // Use the partially used encrypted counter from the last invocation
4674     __ BIND(L_preLoop_start);
4675     __ cmpptr(used, 16);
4676     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
4677       __ cmpptr(len_reg, 0);
4678       __ jcc(Assembler::lessEqual, L_exit_preLoop);
4679       __ movb(rbx, Address(saved_encCounter_start, used));
4680       __ xorb(rbx, Address(from, pos));
4681       __ movb(Address(to, pos), rbx);
4682       __ addptr(pos, 1);
4683       __ addptr(used, 1);
4684       __ subptr(len_reg, 1);
4685 
4686     __ jmp(L_preLoop_start);
4687 
4688     __ BIND(L_exit_preLoop);
4689     __ movl(Address(used_addr, 0), used);
4690 
4691     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
4692     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
4693     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4694     __ cmpl(rbx, 52);
4695     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
4696     __ cmpl(rbx, 60);
4697     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
4698 
4699 #define CTR_DoSix(opc, src_reg)                \
4700     __ opc(xmm_result0, src_reg);              \
4701     __ opc(xmm_result1, src_reg);              \
4702     __ opc(xmm_result2, src_reg);              \
4703     __ opc(xmm_result3, src_reg);              \
4704     __ opc(xmm_result4, src_reg);              \
4705     __ opc(xmm_result5, src_reg);
4706 
4707     // k == 0 :  generate code for key_128
4708     // k == 1 :  generate code for key_192
4709     // k == 2 :  generate code for key_256
4710     for (int k = 0; k < 3; ++k) {
4711       // the multi-block loop starts here
4712       __ align(OptoLoopAlignment);
4713       __ BIND(L_multiBlock_loopTop[k]);
4714       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
4715       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
4716       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4717 
4718       // load, then increment the counters
4719       CTR_DoSix(movdqa, xmm_curr_counter);
4720       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
4721       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
4722       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
4723       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
4724       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
4725       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
4726       CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle the counters back for PXOR
4727       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
4728 
4729       //load two ROUND_KEYs at a time
4730       for (int i = 1; i < rounds[k]; ) {
4731         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4732         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4733         CTR_DoSix(aesenc, xmm_key_tmp1);
4734         i++;
4735         if (i != rounds[k]) {
4736           CTR_DoSix(aesenc, xmm_key_tmp0);
4737         } else {
4738           CTR_DoSix(aesenclast, xmm_key_tmp0);
4739         }
4740         i++;
4741       }
4742 
4743       // get next PARALLEL_FACTOR blocks into the xmm_from registers
4744       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4745       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4746       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4747       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4748       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4749       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4750 
4751       __ pxor(xmm_result0, xmm_from0);
4752       __ pxor(xmm_result1, xmm_from1);
4753       __ pxor(xmm_result2, xmm_from2);
4754       __ pxor(xmm_result3, xmm_from3);
4755       __ pxor(xmm_result4, xmm_from4);
4756       __ pxor(xmm_result5, xmm_from5);
4757 
4758       // store 6 results into the next 96 bytes of output
4759       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4760       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4761       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4762       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4763       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4764       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4765 
4766       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance the position past the processed blocks
4767       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4768       __ jmp(L_multiBlock_loopTop[k]);
4769 
4770       // singleBlock starts here
4771       __ align(OptoLoopAlignment);
4772       __ BIND(L_singleBlockLoopTop[k]);
4773       __ cmpptr(len_reg, 0);
4774       __ jcc(Assembler::lessEqual, L_exit);
4775       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4776       __ movdqa(xmm_result0, xmm_curr_counter);
4777       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4778       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4779       __ pxor(xmm_result0, xmm_key_tmp0);
4780       for (int i = 1; i < rounds[k]; i++) {
4781         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4782         __ aesenc(xmm_result0, xmm_key_tmp0);
4783       }
4784       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4785       __ aesenclast(xmm_result0, xmm_key_tmp0);
4786       __ cmpptr(len_reg, AESBlockSize);
4787       __ jcc(Assembler::less, L_processTail_insr[k]);
4788         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4789         __ pxor(xmm_result0, xmm_from0);
4790         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4791         __ addptr(pos, AESBlockSize);
4792         __ subptr(len_reg, AESBlockSize);
4793         __ jmp(L_singleBlockLoopTop[k]);
4794       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4795         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4796         __ testptr(len_reg, 8);
4797         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4798           __ subptr(pos,8);
4799           __ pinsrq(xmm_from0, Address(from, pos), 0);
4800         __ BIND(L_processTail_4_insr[k]);
4801         __ testptr(len_reg, 4);
4802         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4803           __ subptr(pos,4);
4804           __ pslldq(xmm_from0, 4);
4805           __ pinsrd(xmm_from0, Address(from, pos), 0);
4806         __ BIND(L_processTail_2_insr[k]);
4807         __ testptr(len_reg, 2);
4808         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4809           __ subptr(pos, 2);
4810           __ pslldq(xmm_from0, 2);
4811           __ pinsrw(xmm_from0, Address(from, pos), 0);
4812         __ BIND(L_processTail_1_insr[k]);
4813         __ testptr(len_reg, 1);
4814         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4815           __ subptr(pos, 1);
4816           __ pslldq(xmm_from0, 1);
4817           __ pinsrb(xmm_from0, Address(from, pos), 0);
4818         __ BIND(L_processTail_exit_insr[k]);
4819 
4820         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Perform pxor of the encrypted counter and plaintext Bytes.
4821         __ pxor(xmm_result0, xmm_from0);                             //    Also the encrypted counter is saved for next invocation.
4822 
4823         __ testptr(len_reg, 8);
4824         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4825           __ pextrq(Address(to, pos), xmm_result0, 0);
4826           __ psrldq(xmm_result0, 8);
4827           __ addptr(pos, 8);
4828         __ BIND(L_processTail_4_extr[k]);
4829         __ testptr(len_reg, 4);
4830         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4831           __ pextrd(Address(to, pos), xmm_result0, 0);
4832           __ psrldq(xmm_result0, 4);
4833           __ addptr(pos, 4);
4834         __ BIND(L_processTail_2_extr[k]);
4835         __ testptr(len_reg, 2);
4836         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4837           __ pextrw(Address(to, pos), xmm_result0, 0);
4838           __ psrldq(xmm_result0, 2);
4839           __ addptr(pos, 2);
4840         __ BIND(L_processTail_1_extr[k]);
4841         __ testptr(len_reg, 1);
4842         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4843           __ pextrb(Address(to, pos), xmm_result0, 0);
4844 
4845         __ BIND(L_processTail_exit_extr[k]);
4846         __ movl(Address(used_addr, 0), len_reg);
4847         __ jmp(L_exit);
4848 
4849     }
4850 
4851     __ BIND(L_exit);
4852     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4853     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4854     __ pop(rbx); // pop the saved RBX.
4855 #ifdef _WIN64
4856     __ movl(rax, len_mem);
4857     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4858     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4859     __ addptr(rsp, 2 * wordSize);
4860 #else
4861     __ pop(rax); // return 'len'
4862 #endif
4863     __ leave(); // required for proper stackwalking of RuntimeStub frame
4864     __ ret(0);
4865     return start;
4866   }
4867 
4868 void roundDec(XMMRegister xmm_reg) {
4869   __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4870   __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4871   __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4872   __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4873   __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4874   __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4875   __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4876   __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4877 }
4878 
4879 void roundDeclast(XMMRegister xmm_reg) {
4880   __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4881   __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4882   __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4883   __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4884   __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4885   __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4886   __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4887   __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4888 }
4889 
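       // Load one 128-bit round key from the Java expanded key, byte-swap it into the order
       // expected by the AES instructions using the key shuffle mask, and broadcast it to all
       // four 128-bit lanes of a 512-bit register.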
4890   void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
4891     __ movdqu(xmmdst, Address(key, offset));
4892     if (xmm_shuf_mask != NULL) {
4893       __ pshufb(xmmdst, xmm_shuf_mask);
4894     } else {
4895       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4896     }
4897     __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
4898 
4899   }
4900 
4901 address generate_cipherBlockChaining_decryptVectorAESCrypt() {
4902     assert(VM_Version::supports_avx512_vaes(), "need AVX512 VAES instructions");
4903     __ align(CodeEntryAlignment);
4904     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4905     address start = __ pc();
4906 
4907     const Register from = c_rarg0;  // source array address
4908     const Register to = c_rarg1;  // destination array address
4909     const Register key = c_rarg2;  // key array address
4910     const Register rvec = c_rarg3;  // r byte array initialized from initvector array address
4911     // and left with the results of the last encryption block
4912 #ifndef _WIN64
4913     const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
4914 #else
4915     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
4916     const Register len_reg = r11;      // pick the volatile windows register
4917 #endif
4918 
4919     Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
4920           Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
4921 
4922     __ enter();
4923 
4924 #ifdef _WIN64
4925   // on win64, fill len_reg from stack position
4926     __ movl(len_reg, len_mem);
4927 #else
4928     __ push(len_reg); // Save
4929 #endif
4930     __ push(rbx);
4931     __ vzeroupper();
4932 
4933     // Temporary variable declaration for swapping key bytes
4934     const XMMRegister xmm_key_shuf_mask = xmm1;
4935     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4936 
4937     // Load the expanded key length in ints, which determines the number of rounds: 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds
4938     const Register rounds = rbx;
4939     __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4940 
4941     const XMMRegister IV = xmm0;
4942     // Load IV and broadcast value to 512-bits
4943     __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
4944 
4945     // Temporary variables for storing round keys
4946     const XMMRegister RK0 = xmm30;
4947     const XMMRegister RK1 = xmm9;
4948     const XMMRegister RK2 = xmm18;
4949     const XMMRegister RK3 = xmm19;
4950     const XMMRegister RK4 = xmm20;
4951     const XMMRegister RK5 = xmm21;
4952     const XMMRegister RK6 = xmm22;
4953     const XMMRegister RK7 = xmm23;
4954     const XMMRegister RK8 = xmm24;
4955     const XMMRegister RK9 = xmm25;
4956     const XMMRegister RK10 = xmm26;
4957 
4958      // Load and shuffle key
4959     // the java expanded key ordering is rotated one position from what we want
4960     // so we start from 1*16 here and hit 0*16 last
4961     ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
4962     ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
4963     ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
4964     ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
4965     ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
4966     ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
4967     ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
4968     ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
4969     ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
4970     ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
4971     ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
4972 
4973     // Variables for storing source cipher text
4974     const XMMRegister S0 = xmm10;
4975     const XMMRegister S1 = xmm11;
4976     const XMMRegister S2 = xmm12;
4977     const XMMRegister S3 = xmm13;
4978     const XMMRegister S4 = xmm14;
4979     const XMMRegister S5 = xmm15;
4980     const XMMRegister S6 = xmm16;
4981     const XMMRegister S7 = xmm17;
4982 
4983     // Variables for storing decrypted text
4984     const XMMRegister B0 = xmm1;
4985     const XMMRegister B1 = xmm2;
4986     const XMMRegister B2 = xmm3;
4987     const XMMRegister B3 = xmm4;
4988     const XMMRegister B4 = xmm5;
4989     const XMMRegister B5 = xmm6;
4990     const XMMRegister B6 = xmm7;
4991     const XMMRegister B7 = xmm8;
4992 
4993     __ cmpl(rounds, 44);
4994     __ jcc(Assembler::greater, KEY_192);
4995     __ jmp(Loop);
4996 
4997     __ BIND(KEY_192);
4998     const XMMRegister RK11 = xmm27;
4999     const XMMRegister RK12 = xmm28;
5000     ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
5001     ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
5002 
5003     __ cmpl(rounds, 52);
5004     __ jcc(Assembler::greater, KEY_256);
5005     __ jmp(Loop);
5006 
5007     __ BIND(KEY_256);
5008     const XMMRegister RK13 = xmm29;
5009     const XMMRegister RK14 = xmm31;
5010     ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
5011     ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
5012 
5013     __ BIND(Loop);
5014     __ cmpl(len_reg, 512);
5015     __ jcc(Assembler::below, Lcbc_dec_rem);
5016     __ BIND(Loop1);
5017     __ subl(len_reg, 512);
5018     __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
5019     __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
5020     __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
5021     __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
5022     __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
5023     __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
5024     __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
5025     __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
5026     __ leaq(from, Address(from, 8 * 64));
5027 
5028     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5029     __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
5030     __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
5031     __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
5032     __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
5033     __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
5034     __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
5035     __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
5036 
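         // Build the vectors of "previous ciphertext" blocks needed for the CBC xor: after
         // these shifts IV holds [last block of the previous group (or the IV), S0 blocks 0-2],
         // S0 holds [S0 block 3, S1 blocks 0-2], and so on. They are xored into B0..B7 at Loop2.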
5037     __ evalignq(IV, S0, IV, 0x06);
5038     __ evalignq(S0, S1, S0, 0x06);
5039     __ evalignq(S1, S2, S1, 0x06);
5040     __ evalignq(S2, S3, S2, 0x06);
5041     __ evalignq(S3, S4, S3, 0x06);
5042     __ evalignq(S4, S5, S4, 0x06);
5043     __ evalignq(S5, S6, S5, 0x06);
5044     __ evalignq(S6, S7, S6, 0x06);
5045 
5046     roundDec(RK2);
5047     roundDec(RK3);
5048     roundDec(RK4);
5049     roundDec(RK5);
5050     roundDec(RK6);
5051     roundDec(RK7);
5052     roundDec(RK8);
5053     roundDec(RK9);
5054     roundDec(RK10);
5055 
5056     __ cmpl(rounds, 44);
5057     __ jcc(Assembler::belowEqual, L_128);
5058     roundDec(RK11);
5059     roundDec(RK12);
5060 
5061     __ cmpl(rounds, 52);
5062     __ jcc(Assembler::belowEqual, L_192);
5063     roundDec(RK13);
5064     roundDec(RK14);
5065 
5066     __ BIND(L_256);
5067     roundDeclast(RK0);
5068     __ jmp(Loop2);
5069 
5070     __ BIND(L_128);
5071     roundDeclast(RK0);
5072     __ jmp(Loop2);
5073 
5074     __ BIND(L_192);
5075     roundDeclast(RK0);
5076 
5077     __ BIND(Loop2);
5078     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5079     __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
5080     __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
5081     __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
5082     __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
5083     __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
5084     __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
5085     __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
5086     __ evmovdquq(IV, S7, Assembler::AVX_512bit);
5087 
5088     __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
5089     __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
5090     __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
5091     __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
5092     __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
5093     __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
5094     __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
5095     __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
5096     __ leaq(to, Address(to, 8 * 64));
5097     __ jmp(Loop);
5098 
5099     __ BIND(Lcbc_dec_rem);
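         // Fewer than 32 blocks remain; move the most recent ciphertext block (lane 3 of IV)
         // into lane 0 so the 16-bytes-at-a-time loop below can use it as the chaining value.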
5100     __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
5101 
5102     __ BIND(Lcbc_dec_rem_loop);
5103     __ subl(len_reg, 16);
5104     __ jcc(Assembler::carrySet, Lcbc_dec_ret);
5105 
5106     __ movdqu(S0, Address(from, 0));
5107     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5108     __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
5109     __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
5110     __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
5111     __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
5112     __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
5113     __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
5114     __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
5115     __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
5116     __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
5117     __ cmpl(rounds, 44);
5118     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5119 
5120     __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
5121     __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
5122     __ cmpl(rounds, 52);
5123     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5124 
5125     __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
5126     __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
5127 
5128     __ BIND(Lcbc_dec_rem_last);
5129     __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
5130 
5131     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5132     __ evmovdquq(IV, S0, Assembler::AVX_512bit);
5133     __ movdqu(Address(to, 0), B0);
5134     __ leaq(from, Address(from, 16));
5135     __ leaq(to, Address(to, 16));
5136     __ jmp(Lcbc_dec_rem_loop);
5137 
5138     __ BIND(Lcbc_dec_ret);
5139     __ movdqu(Address(rvec, 0), IV);
5140 
5141     // Zero out the round keys
5142     __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
5143     __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
5144     __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
5145     __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
5146     __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
5147     __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
5148     __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
5149     __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
5150     __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
5151     __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
5152     __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
5153     __ cmpl(rounds, 44);
5154     __ jcc(Assembler::belowEqual, Lcbc_exit);
5155     __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
5156     __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
5157     __ cmpl(rounds, 52);
5158     __ jcc(Assembler::belowEqual, Lcbc_exit);
5159     __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
5160     __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
5161 
5162     __ BIND(Lcbc_exit);
5163     __ vzeroupper();
5164     __ pop(rbx);
5165 #ifdef _WIN64
5166     __ movl(rax, len_mem);
5167 #else
5168     __ pop(rax); // return length
5169 #endif
5170     __ leave(); // required for proper stackwalking of RuntimeStub frame
5171     __ ret(0);
5172     return start;
5173 }
5174 
5175 // Polynomial x^128+x^127+x^126+x^121+1
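     // GHASH multiplies in GF(2^128) modulo this polynomial; the qword pair emitted below
     // is the form of the polynomial used by the carry-less-multiplication based reduction.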
5176 address ghash_polynomial_addr() {
5177     __ align(CodeEntryAlignment);
5178     StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
5179     address start = __ pc();
5180     __ emit_data64(0x0000000000000001, relocInfo::none);
5181     __ emit_data64(0xc200000000000000, relocInfo::none);
5182     return start;
5183 }
5184 
5185 address ghash_shufflemask_addr() {
5186     __ align(CodeEntryAlignment);
5187     StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
5188     address start = __ pc();
5189     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5190     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5191     return start;
5192 }
5193 
5194 // Ghash single and multi block operations using AVX instructions
5195 address generate_avx_ghash_processBlocks() {
5196     __ align(CodeEntryAlignment);
5197 
5198     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5199     address start = __ pc();
5200 
5201     // arguments
5202     const Register state = c_rarg0;
5203     const Register htbl = c_rarg1;
5204     const Register data = c_rarg2;
5205     const Register blocks = c_rarg3;
5206     __ enter();
5207     // the multi-block GHASH computation is implemented in MacroAssembler::avx_ghash
5208     __ avx_ghash(state, htbl, data, blocks);
5209     __ leave(); // required for proper stackwalking of RuntimeStub frame
5210     __ ret(0);
5211     return start;
5212 }
5213 
5214   // byte swap x86 long
5215   address generate_ghash_long_swap_mask() {
5216     __ align(CodeEntryAlignment);
5217     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
5218     address start = __ pc();
5219     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
5220     __ emit_data64(0x0706050403020100, relocInfo::none );
5221     return start;
5222   }
5223 
5224   // byte swap x86 byte array
5225   address generate_ghash_byte_swap_mask() {
5226     __ align(CodeEntryAlignment);
5227     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
5228     address start = __ pc();
5229     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
5230     __ emit_data64(0x0001020304050607, relocInfo::none );
5231     return start;
5232   }
5233 
5234   /* Single and multi-block ghash operations */
5235   address generate_ghash_processBlocks() {
5236     __ align(CodeEntryAlignment);
5237     Label L_ghash_loop, L_exit;
5238     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5239     address start = __ pc();
5240 
5241     const Register state        = c_rarg0;
5242     const Register subkeyH      = c_rarg1;
5243     const Register data         = c_rarg2;
5244     const Register blocks       = c_rarg3;
5245 
5246     const XMMRegister xmm_temp0 = xmm0;
5247     const XMMRegister xmm_temp1 = xmm1;
5248     const XMMRegister xmm_temp2 = xmm2;
5249     const XMMRegister xmm_temp3 = xmm3;
5250     const XMMRegister xmm_temp4 = xmm4;
5251     const XMMRegister xmm_temp5 = xmm5;
5252     const XMMRegister xmm_temp6 = xmm6;
5253     const XMMRegister xmm_temp7 = xmm7;
5254     const XMMRegister xmm_temp8 = xmm8;
5255     const XMMRegister xmm_temp9 = xmm9;
5256     const XMMRegister xmm_temp10 = xmm10;
5257 
5258     __ enter();
5259 
5260     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
5261 
5262     __ movdqu(xmm_temp0, Address(state, 0));
5263     __ pshufb(xmm_temp0, xmm_temp10);
5264 
5265 
5266     __ BIND(L_ghash_loop);
5267     __ movdqu(xmm_temp2, Address(data, 0));
5268     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
5269 
5270     __ movdqu(xmm_temp1, Address(subkeyH, 0));
5271     __ pshufb(xmm_temp1, xmm_temp10);
5272 
5273     __ pxor(xmm_temp0, xmm_temp2);
5274 
5275     //
5276     // Multiply with the hash key
5277     //
5278     __ movdqu(xmm_temp3, xmm_temp0);
5279     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
5280     __ movdqu(xmm_temp4, xmm_temp0);
5281     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
5282 
5283     __ movdqu(xmm_temp5, xmm_temp0);
5284     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
5285     __ movdqu(xmm_temp6, xmm_temp0);
5286     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
5287 
5288     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
5289 
5290     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
5291     __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
5292     __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
5293     __ pxor(xmm_temp3, xmm_temp5);
5294     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
5295                                         // of the carry-less multiplication of
5296                                         // xmm0 by xmm1.
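    // For reference, the four pclmulqdq results above form the schoolbook
    // decomposition of the 128x128-bit carry-less product:
    //   (a1:a0) * (b1:b0) = (a1*b1)<<128 ^ ((a0*b1) ^ (a1*b0))<<64 ^ (a0*b0)
    // with the middle term split by the psrldq/pslldq pair: its low half is
    // folded into xmm3 and its high half into xmm6.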
5297 
5298     // We shift the result of the multiplication by one bit position
5299     // to the left to compensate for the fact that the bits are reversed.
5300     __ movdqu(xmm_temp7, xmm_temp3);
5301     __ movdqu(xmm_temp8, xmm_temp6);
5302     __ pslld(xmm_temp3, 1);
5303     __ pslld(xmm_temp6, 1);
5304     __ psrld(xmm_temp7, 31);
5305     __ psrld(xmm_temp8, 31);
5306     __ movdqu(xmm_temp9, xmm_temp7);
5307     __ pslldq(xmm_temp8, 4);
5308     __ pslldq(xmm_temp7, 4);
5309     __ psrldq(xmm_temp9, 12);
5310     __ por(xmm_temp3, xmm_temp7);
5311     __ por(xmm_temp6, xmm_temp8);
5312     __ por(xmm_temp6, xmm_temp9);
5313 
5314     //
5315     // First phase of the reduction
5316     //
5317     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
5318     // independently.
5319     __ movdqu(xmm_temp7, xmm_temp3);
5320     __ movdqu(xmm_temp8, xmm_temp3);
5321     __ movdqu(xmm_temp9, xmm_temp3);
5322     __ pslld(xmm_temp7, 31);    // packed left shift << 31
5323     __ pslld(xmm_temp8, 30);    // packed left shift << 30
5324     __ pslld(xmm_temp9, 25);    // packed left shift << 25
5325     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
5326     __ pxor(xmm_temp7, xmm_temp9);
5327     __ movdqu(xmm_temp8, xmm_temp7);
5328     __ pslldq(xmm_temp7, 12);
5329     __ psrldq(xmm_temp8, 4);
5330     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
5331 
5332     //
5333     // Second phase of the reduction
5334     //
5335     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
5336     // shift operations.
5337     __ movdqu(xmm_temp2, xmm_temp3);
5338     __ movdqu(xmm_temp4, xmm_temp3);
5339     __ movdqu(xmm_temp5, xmm_temp3);
5340     __ psrld(xmm_temp2, 1);     // packed right shift >> 1
5341     __ psrld(xmm_temp4, 2);     // packed right shift >> 2
5342     __ psrld(xmm_temp5, 7);     // packed right shift >> 7
5343     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
5344     __ pxor(xmm_temp2, xmm_temp5);
5345     __ pxor(xmm_temp2, xmm_temp8);
5346     __ pxor(xmm_temp3, xmm_temp2);
5347     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
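    // The two reduction phases above fold the 256-bit carry-less product back
    // into 128 bits modulo the reflected polynomial, using only shifts and
    // XORs of shifted copies instead of an explicit polynomial division.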
5348 
5349     __ decrement(blocks);
5350     __ jcc(Assembler::zero, L_exit);
5351     __ movdqu(xmm_temp0, xmm_temp6);
5352     __ addptr(data, 16);
5353     __ jmp(L_ghash_loop);
5354 
5355     __ BIND(L_exit);
5356     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
5357     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
5358     __ leave();
5359     __ ret(0);
5360     return start;
5361   }
5362 
5363   address base64_shuffle_addr()
5364   {
5365     __ align64();
5366     StubCodeMark mark(this, "StubRoutines", "shuffle_base64");
5367     address start = __ pc();
5368     assert(((unsigned long long)start & 0x3f) == 0,
5369            "Alignment problem (0x%08llx)", (unsigned long long)start);
5370     __ emit_data64(0x0405030401020001, relocInfo::none);
5371     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5372     __ emit_data64(0x10110f100d0e0c0d, relocInfo::none);
5373     __ emit_data64(0x1617151613141213, relocInfo::none);
5374     __ emit_data64(0x1c1d1b1c191a1819, relocInfo::none);
5375     __ emit_data64(0x222321221f201e1f, relocInfo::none);
5376     __ emit_data64(0x2829272825262425, relocInfo::none);
5377     __ emit_data64(0x2e2f2d2e2b2c2a2b, relocInfo::none);
5378     return start;
5379   }
5380 
5381   address base64_avx2_shuffle_addr()
5382   {
5383     __ align32();
5384     StubCodeMark mark(this, "StubRoutines", "avx2_shuffle_base64");
5385     address start = __ pc();
5386     __ emit_data64(0x0809070805060405, relocInfo::none);
5387     __ emit_data64(0x0e0f0d0e0b0c0a0b, relocInfo::none);
5388     __ emit_data64(0x0405030401020001, relocInfo::none);
5389     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5390     return start;
5391   }
5392 
5393   address base64_avx2_input_mask_addr()
5394   {
5395     __ align32();
5396     StubCodeMark mark(this, "StubRoutines", "avx2_input_mask_base64");
5397     address start = __ pc();
5398     __ emit_data64(0x8000000000000000, relocInfo::none);
5399     __ emit_data64(0x8000000080000000, relocInfo::none);
5400     __ emit_data64(0x8000000080000000, relocInfo::none);
5401     __ emit_data64(0x8000000080000000, relocInfo::none);
5402     return start;
5403   }
5404 
5405   address base64_avx2_lut_addr()
5406   {
5407     __ align32();
5408     StubCodeMark mark(this, "StubRoutines", "avx2_lut_base64");
5409     address start = __ pc();
5410     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5411     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5412     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5413     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5414 
5415     // URL LUT
5416     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5417     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5418     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5419     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5420     return start;
5421   }
5422 
5423   address base64_encoding_table_addr()
5424   {
5425     __ align64();
5426     StubCodeMark mark(this, "StubRoutines", "encoding_table_base64");
5427     address start = __ pc();
5428     assert(((unsigned long long)start & 0x3f) == 0, "Alignment problem (0x%08llx)", (unsigned long long)start);
5429     __ emit_data64(0x4847464544434241, relocInfo::none);
5430     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5431     __ emit_data64(0x5857565554535251, relocInfo::none);
5432     __ emit_data64(0x6665646362615a59, relocInfo::none);
5433     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5434     __ emit_data64(0x767574737271706f, relocInfo::none);
5435     __ emit_data64(0x333231307a797877, relocInfo::none);
5436     __ emit_data64(0x2f2b393837363534, relocInfo::none);
5437 
5438     // URL table
5439     __ emit_data64(0x4847464544434241, relocInfo::none);
5440     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5441     __ emit_data64(0x5857565554535251, relocInfo::none);
5442     __ emit_data64(0x6665646362615a59, relocInfo::none);
5443     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5444     __ emit_data64(0x767574737271706f, relocInfo::none);
5445     __ emit_data64(0x333231307a797877, relocInfo::none);
5446     __ emit_data64(0x5f2d393837363534, relocInfo::none);
5447     return start;
5448   }
5449 
5450   // Code for generating Base64 encoding.
5451   // Intrinsic function prototype in Base64.java:
5452   // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp,
5453   // boolean isURL) {
5454   address generate_base64_encodeBlock()
5455   {
5456     __ align(CodeEntryAlignment);
5457     StubCodeMark mark(this, "StubRoutines", "implEncode");
5458     address start = __ pc();
5459     __ enter();
5460 
5461     // Save callee-saved registers before using them
5462     __ push(r12);
5463     __ push(r13);
5464     __ push(r14);
5465     __ push(r15);
5466 
5467     // arguments
5468     const Register source = c_rarg0;       // Source Array
5469     const Register start_offset = c_rarg1; // start offset
5470     const Register end_offset = c_rarg2;   // end offset
5471     const Register dest = c_rarg3;   // destination array
5472 
5473 #ifndef _WIN64
5474     const Register dp = c_rarg4;    // Position for writing to dest array
5475     const Register isURL = c_rarg5; // Base64 or URL character set
5476 #else
5477     const Address dp_mem(rbp, 6 * wordSize); // dp is passed on the stack on Win64
5478     const Address isURL_mem(rbp, 7 * wordSize);
5479     const Register isURL = r10; // pick the volatile windows register
5480     const Register dp = r12;
5481     __ movl(dp, dp_mem);
5482     __ movl(isURL, isURL_mem);
5483 #endif
5484 
5485     const Register length = r14;
5486     const Register encode_table = r13;
5487     Label L_process3, L_exit, L_processdata, L_vbmiLoop, L_not512, L_32byteLoop;
5488 
5489     // calculate length from offsets
5490     __ movl(length, end_offset);
5491     __ subl(length, start_offset);
5492     __ cmpl(length, 0);
5493     __ jcc(Assembler::lessEqual, L_exit);
5494 
5495     // Code for 512-bit VBMI encoding.  Encodes 48 input bytes into 64
5496     // output bytes. We read 64 input bytes and ignore the last 16, so be
5497     // sure not to read past the end of the input buffer.
5498     if (VM_Version::supports_avx512_vbmi()) {
5499       __ cmpl(length, 64); // Do not overrun input buffer.
5500       __ jcc(Assembler::below, L_not512);
5501 
5502       __ shll(isURL, 6); // index into decode table based on isURL
5503       __ lea(encode_table, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5504       __ addptr(encode_table, isURL);
5505       __ shrl(isURL, 6); // restore isURL
5506 
5507       __ mov64(rax, 0x3036242a1016040aull); // Shifts
5508       __ evmovdquq(xmm3, ExternalAddress(StubRoutines::x86::base64_shuffle_addr()), Assembler::AVX_512bit, r15);
5509       __ evmovdquq(xmm2, Address(encode_table, 0), Assembler::AVX_512bit);
5510       __ evpbroadcastq(xmm1, rax, Assembler::AVX_512bit);
5511 
5512       __ align32();
5513       __ BIND(L_vbmiLoop);
5514 
5515       __ vpermb(xmm0, xmm3, Address(source, start_offset), Assembler::AVX_512bit);
5516       __ subl(length, 48);
5517 
5518       // Put the input bytes into the proper lanes for writing, then
5519       // encode them.
5520       __ evpmultishiftqb(xmm0, xmm1, xmm0, Assembler::AVX_512bit);
5521       __ vpermb(xmm0, xmm0, xmm2, Assembler::AVX_512bit);
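      // The multishift control 0x3036242a1016040a lists the bit offsets
      // 10, 4, 22, 16, 42, 36, 54, 48 within each 64-bit lane; each offset
      // selects one unaligned byte whose low 6 bits form a base64 index, and
      // the vpermb above maps those indices through the 64-byte encode table.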
5522 
5523       // Write to destination
5524       __ evmovdquq(Address(dest, dp), xmm0, Assembler::AVX_512bit);
5525 
5526       __ addptr(dest, 64);
5527       __ addptr(source, 48);
5528       __ cmpl(length, 64);
5529       __ jcc(Assembler::aboveEqual, L_vbmiLoop);
5530 
5531       __ vzeroupper();
5532     }
5533 
5534     __ BIND(L_not512);
5535     if (VM_Version::supports_avx2()
5536         && VM_Version::supports_avx512vlbw()) {
5537       /*
5538       ** This AVX2 encoder is based on the paper at:
5539       **      https://dl.acm.org/doi/10.1145/3132709
5540       **
5541       ** We use AVX2 SIMD instructions to encode 24 bytes into 32
5542       ** output bytes.
5543       **
5544       */
5545       // Lengths under 32 bytes are done with scalar routine
5546       __ cmpl(length, 31);
5547       __ jcc(Assembler::belowEqual, L_process3);
5548 
5549       // Set up supporting constant table data
5550       __ vmovdqu(xmm9, ExternalAddress(StubRoutines::x86::base64_avx2_shuffle_addr()), rax);
5551       // 6-bit mask for 2nd and 4th (and multiples) 6-bit values
5552       __ movl(rax, 0x0fc0fc00);
5553       __ vmovdqu(xmm1, ExternalAddress(StubRoutines::x86::base64_avx2_input_mask_addr()), rax);
5554       __ evpbroadcastd(xmm8, rax, Assembler::AVX_256bit);
5555 
5556       // Multiplication constant for "shifting" right by 6 and 10
5557       // bits
5558       __ movl(rax, 0x04000040);
5559 
5560       __ subl(length, 24);
5561       __ evpbroadcastd(xmm7, rax, Assembler::AVX_256bit);
5562 
5563       // For the first load, we mask off reading of the first 4
5564       // bytes into the register. This is so we can get 4 3-byte
5565       // chunks into each lane of the register, avoiding having to
5566       // handle end conditions.  We then shuffle these bytes into a
5567       // specific order so that manipulation is easier.
5568       //
5569       // The initial read loads the XMM register like this:
5570       //
5571       // Lower 128-bit lane:
5572       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5573       // | XX | XX | XX | XX | A0 | A1 | A2 | B0 | B1 | B2 | C0 | C1
5574       // | C2 | D0 | D1 | D2 |
5575       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5576       //
5577       // Upper 128-bit lane:
5578       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5579       // | E0 | E1 | E2 | F0 | F1 | F2 | G0 | G1 | G2 | H0 | H1 | H2
5580       // | XX | XX | XX | XX |
5581       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5582       //
5583       // Where A0 is the first input byte, B0 is the fourth, etc.
5584       // The alphabetical significance denotes the 3 bytes to be
5585       // consumed and encoded into 4 bytes.
5586       //
5587       // We then shuffle the register so each 32-bit word contains
5588       // the sequence:
5589       //    A1 A0 A2 A1, B1, B0, B2, B1, etc.
5590       // Each of these byte sequences are then manipulated into 4
5591       // 6-bit values ready for encoding.
5592       //
5593       // If we focus on one set of 3-byte chunks, changing the
5594       // nomenclature such that A0 => a, A1 => b, and A2 => c, we
5595       // shuffle such that each 24-bit chunk contains:
5596       //
5597       // b7 b6 b5 b4 b3 b2 b1 b0 | a7 a6 a5 a4 a3 a2 a1 a0 | c7 c6
5598       // c5 c4 c3 c2 c1 c0 | b7 b6 b5 b4 b3 b2 b1 b0
5599       // The same 32 bits, relabeled in terms of the four 6-bit output fields a, b, c, d:
5600       // b3 b2 b1 b0 c5 c4 c3 c2 | c1 c0 d5 d4 d3 d2 d1 d0 | a5 a4
5601       // a3 a2 a1 a0 b5 b4 | b3 b2 b1 b0 c5 c4 c3 c2
5602       //
5603       // We first AND off all but bits 4-9 and 16-21 (c5..c0 and
5604       // a5..a0) and shift them using a vector multiplication
5605       // operation (vpmulhuw) which effectively shifts c right by 6
5606       // bits and a right by 10 bits.  We similarly mask bits 10-15
5607       // (d5..d0) and 22-27 (b5..b0) and shift them left by 8 and 4
5608       // bits respectively.  This is done using vpmullw.  We end up
5609       // with 4 6-bit values, thus splitting the 3 input bytes,
5610       // ready for encoding:
5611       //    0 0 d5..d0 0 0 c5..c0 0 0 b5..b0 0 0 a5..a0
5612       //
5613       // For translation, we recognize that there are 5 distinct
5614       // ranges of legal Base64 characters as below:
5615       //
5616       //   +-------------+-------------+------------+
5617       //   | 6-bit value | ASCII range |   offset   |
5618       //   +-------------+-------------+------------+
5619       //   |    0..25    |    A..Z     |     65     |
5620       //   |   26..51    |    a..z     |     71     |
5621       //   |   52..61    |    0..9     |     -4     |
5622       //   |     62      |   + or -    | -19 or -17 |
5623       //   |     63      |   / or _    | -16 or 32  |
5624       //   +-------------+-------------+------------+
5625       //
5626       // We note that vpshufb does a parallel lookup in a
5627       // destination register using the lower 4 bits of bytes from a
5628       // source register.  If we use a saturated subtraction and
5629       // subtract 51 from each 6-bit value, bytes from [0,51]
5630       // saturate to 0, and [52,63] map to a range of [1,12].  We
5631       // distinguish the [0,25] and [26,51] ranges by adding one to the
5632       // reduced value for every 6-bit value greater than 25 (the
5633       // vpcmpgtb/vpsubb pair below).  We end up with:
5634       //
5635       //   +-------------+-------------+------------+
5636       //   | 6-bit value |   Reduced   |   offset   |
5637       //   +-------------+-------------+------------+
5638       //   |    0..25    |      0      |     65     |
5639       //   |   26..51    |      1      |     71     |
5640       //   |   52..61    |    2..11    |     -4     |
5641       //   |     62      |     12      | -19 or -17 |
5642       //   |     63      |     13      | -16 or 32  |
5643       //   +-------------+-------------+------------+
5644       //
5645       // We then use a final vpshufb to add the appropriate offset,
5646       // translating the bytes.
5647       //
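      // Worked example (standard alphabet): the 6-bit value 30 saturates to 0
      // after subtracting 51, is greater than 25 so it is bumped to 1, which
      // selects offset 71, and 30 + 71 = 101 = 'e'.  The value 54 becomes
      // 3 + 1 = 4, which selects offset -4, and 54 - 4 = 50 = '2'.
      //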
5648       // Load input bytes - only 28 bytes.  Mask the first load to
5649       // not load into the full register.
5650       __ vpmaskmovd(xmm1, xmm1, Address(source, start_offset, Address::times_1, -4), Assembler::AVX_256bit);
5651 
5652       // Move 3-byte chunks of input (12 bytes) into 16 bytes,
5653       // ordering by:
5654       //   1, 0, 2, 1; 4, 3, 5, 4; etc.  This groups 6-bit chunks
5655       //   for easy masking
5656       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5657 
5658       __ addl(start_offset, 24);
5659 
5660       // Load masking register for first and third (and multiples)
5661       // 6-bit values.
5662       __ movl(rax, 0x003f03f0);
5663       __ evpbroadcastd(xmm6, rax, Assembler::AVX_256bit);
5664       // Multiplication constant for "shifting" left by 4 and 8 bits
5665       __ movl(rax, 0x01000010);
5666       __ evpbroadcastd(xmm5, rax, Assembler::AVX_256bit);
5667 
5668       // Isolate 6-bit chunks of interest
5669       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5670 
5671       // Load constants for encoding
5672       __ movl(rax, 0x19191919);
5673       __ evpbroadcastd(xmm3, rax, Assembler::AVX_256bit);
5674       __ movl(rax, 0x33333333);
5675       __ evpbroadcastd(xmm4, rax, Assembler::AVX_256bit);
5676 
5677       // Shift output bytes 0 and 2 into proper lanes
5678       __ vpmulhuw(xmm2, xmm0, xmm7, Assembler::AVX_256bit);
5679 
5680       // Mask and shift output bytes 1 and 3 into proper lanes and
5681       // combine
5682       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5683       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5684       __ vpor(xmm0, xmm0, xmm2, Assembler::AVX_256bit);
5685 
5686       // Find out which values are greater than 25.  Values in 'A'-'Z'
5687       // (0..25) keep a reduced value of 0; larger values are bumped by
5688       // one so they select the proper offset (see comments above)
5689       __ vpcmpgtb(xmm2, xmm0, xmm3, Assembler::AVX_256bit);
5690       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5691       __ vpsubb(xmm1, xmm1, xmm2, Assembler::AVX_256bit);
5692 
5693       // Load the proper lookup table
5694       __ lea(r11, ExternalAddress(StubRoutines::x86::base64_avx2_lut_addr()));
5695       __ movl(r15, isURL);
5696       __ shll(r15, 5);
5697       __ vmovdqu(xmm2, Address(r11, r15));
5698 
5699       // Shuffle the offsets based on the range calculation done
5700       // above. This allows us to add the correct offset to the
5701       // 6-bit value corresponding to the range documented above.
5702       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5703       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5704 
5705       // Store the encoded bytes
5706       __ vmovdqu(Address(dest, dp), xmm0);
5707       __ addl(dp, 32);
5708 
5709       __ cmpl(length, 31);
5710       __ jcc(Assembler::belowEqual, L_process3);
5711 
5712       __ align32();
5713       __ BIND(L_32byteLoop);
5714 
5715       // Get next 32 bytes
5716       __ vmovdqu(xmm1, Address(source, start_offset, Address::times_1, -4));
5717 
5718       __ subl(length, 24);
5719       __ addl(start_offset, 24);
5720 
5721       // This logic is identical to the above, with only constant
5722       // register loads removed.  Shuffle the input, mask off 6-bit
5723       // chunks, shift them into place, then add the offset to
5724       // encode.
5725       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5726 
5727       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5728       __ vpmulhuw(xmm10, xmm0, xmm7, Assembler::AVX_256bit);
5729       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5730       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5731       __ vpor(xmm0, xmm0, xmm10, Assembler::AVX_256bit);
5732       __ vpcmpgtb(xmm10, xmm0, xmm3, Assembler::AVX_256bit);
5733       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5734       __ vpsubb(xmm1, xmm1, xmm10, Assembler::AVX_256bit);
5735       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5736       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5737 
5738       // Store the encoded bytes
5739       __ vmovdqu(Address(dest, dp), xmm0);
5740       __ addl(dp, 32);
5741 
5742       __ cmpl(length, 31);
5743       __ jcc(Assembler::above, L_32byteLoop);
5744 
5745       __ BIND(L_process3);
5746       __ vzeroupper();
5747     } else {
5748       __ BIND(L_process3);
5749     }
5750 
5751     __ cmpl(length, 3);
5752     __ jcc(Assembler::below, L_exit);
5753 
5754     // Load the encoding table based on isURL
5755     __ lea(r11, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5756     __ movl(r15, isURL);
5757     __ shll(r15, 6);
5758     __ addptr(r11, r15);
5759 
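    // Scalar tail: a rough C sketch of one iteration of the loop below
    // (names are illustrative only, not part of the stub):
    //   uint32_t triple = (src[0] << 16) | (src[1] << 8) | src[2];
    //   dst[0] = table[(triple >> 18) & 0x3f];
    //   dst[1] = table[(triple >> 12) & 0x3f];
    //   dst[2] = table[(triple >>  6) & 0x3f];
    //   dst[3] = table[ triple        & 0x3f];
    // The code below reaches the same four table indices by packing the bytes
    // into rax in a shuffled order and extracting 6-bit fields from it.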
5760     __ BIND(L_processdata);
5761 
5762     // Load 3 bytes
5763     __ load_unsigned_byte(r15, Address(source, start_offset));
5764     __ load_unsigned_byte(r10, Address(source, start_offset, Address::times_1, 1));
5765     __ load_unsigned_byte(r13, Address(source, start_offset, Address::times_1, 2));
5766 
5767     // Build a 32-bit word with bytes 1, 2, 0, 1
5768     __ movl(rax, r10);
5769     __ shll(r10, 24);
5770     __ orl(rax, r10);
5771 
5772     __ subl(length, 3);
5773 
5774     __ shll(r15, 8);
5775     __ shll(r13, 16);
5776     __ orl(rax, r15);
5777 
5778     __ addl(start_offset, 3);
5779 
5780     __ orl(rax, r13);
5781     // At this point, rax contains | byte1 | byte2 | byte0 | byte1
5782     // r13 has byte2 << 16 - need low-order 6 bits to translate.
5783     // This translated byte is the fourth output byte.
5784     __ shrl(r13, 16);
5785     __ andl(r13, 0x3f);
5786 
5787     // The high-order 6 bits of r15 (byte0) are translated.
5788     // The translated byte is the first output byte.
5789     __ shrl(r15, 10);
5790 
5791     __ load_unsigned_byte(r13, Address(r11, r13));
5792     __ load_unsigned_byte(r15, Address(r11, r15));
5793 
5794     __ movb(Address(dest, dp, Address::times_1, 3), r13);
5795 
5796     // Extract high-order 4 bits of byte1 and low-order 2 bits of byte0.
5797     // This translated byte is the second output byte.
5798     __ shrl(rax, 4);
5799     __ movl(r10, rax);
5800     __ andl(rax, 0x3f);
5801 
5802     __ movb(Address(dest, dp, Address::times_1, 0), r15);
5803 
5804     __ load_unsigned_byte(rax, Address(r11, rax));
5805 
5806     // Extract the low-order 4 bits of byte1 and the high-order 2 bits of byte2.
5807     // This translated byte is the third output byte.
5808     __ shrl(r10, 18);
5809     __ andl(r10, 0x3f);
5810 
5811     __ load_unsigned_byte(r10, Address(r11, r10));
5812 
5813     __ movb(Address(dest, dp, Address::times_1, 1), rax);
5814     __ movb(Address(dest, dp, Address::times_1, 2), r10);
5815 
5816     __ addl(dp, 4);
5817     __ cmpl(length, 3);
5818     __ jcc(Assembler::aboveEqual, L_processdata);
5819 
5820     __ BIND(L_exit);
5821     __ pop(r15);
5822     __ pop(r14);
5823     __ pop(r13);
5824     __ pop(r12);
5825     __ leave();
5826     __ ret(0);
5827     return start;
5828   }
5829 
5830   // base64 AVX512vbmi tables
5831   address base64_vbmi_lookup_lo_addr() {
5832     __ align64();
5833     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64");
5834     address start = __ pc();
5835     assert(((unsigned long long)start & 0x3f) == 0,
5836            "Alignment problem (0x%08llx)", (unsigned long long)start);
5837     __ emit_data64(0x8080808080808080, relocInfo::none);
5838     __ emit_data64(0x8080808080808080, relocInfo::none);
5839     __ emit_data64(0x8080808080808080, relocInfo::none);
5840     __ emit_data64(0x8080808080808080, relocInfo::none);
5841     __ emit_data64(0x8080808080808080, relocInfo::none);
5842     __ emit_data64(0x3f8080803e808080, relocInfo::none);
5843     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5844     __ emit_data64(0x8080808080803d3c, relocInfo::none);
5845     return start;
5846   }
5847 
5848   address base64_vbmi_lookup_hi_addr() {
5849     __ align64();
5850     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64");
5851     address start = __ pc();
5852     assert(((unsigned long long)start & 0x3f) == 0,
5853            "Alignment problem (0x%08llx)", (unsigned long long)start);
5854     __ emit_data64(0x0605040302010080, relocInfo::none);
5855     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5856     __ emit_data64(0x161514131211100f, relocInfo::none);
5857     __ emit_data64(0x8080808080191817, relocInfo::none);
5858     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5859     __ emit_data64(0x2827262524232221, relocInfo::none);
5860     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5861     __ emit_data64(0x8080808080333231, relocInfo::none);
5862     return start;
5863   }
5864   address base64_vbmi_lookup_lo_url_addr() {
5865     __ align64();
5866     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64url");
5867     address start = __ pc();
5868     assert(((unsigned long long)start & 0x3f) == 0,
5869            "Alignment problem (0x%08llx)", (unsigned long long)start);
5870     __ emit_data64(0x8080808080808080, relocInfo::none);
5871     __ emit_data64(0x8080808080808080, relocInfo::none);
5872     __ emit_data64(0x8080808080808080, relocInfo::none);
5873     __ emit_data64(0x8080808080808080, relocInfo::none);
5874     __ emit_data64(0x8080808080808080, relocInfo::none);
5875     __ emit_data64(0x80803e8080808080, relocInfo::none);
5876     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5877     __ emit_data64(0x8080808080803d3c, relocInfo::none);
5878     return start;
5879   }
5880 
5881   address base64_vbmi_lookup_hi_url_addr() {
5882     __ align64();
5883     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64url");
5884     address start = __ pc();
5885     assert(((unsigned long long)start & 0x3f) == 0,
5886            "Alignment problem (0x%08llx)", (unsigned long long)start);
5887     __ emit_data64(0x0605040302010080, relocInfo::none);
5888     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5889     __ emit_data64(0x161514131211100f, relocInfo::none);
5890     __ emit_data64(0x3f80808080191817, relocInfo::none);
5891     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5892     __ emit_data64(0x2827262524232221, relocInfo::none);
5893     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5894     __ emit_data64(0x8080808080333231, relocInfo::none);
5895     return start;
5896   }
5897 
5898   address base64_vbmi_pack_vec_addr() {
5899     __ align64();
5900     StubCodeMark mark(this, "StubRoutines", "pack_vec_base64");
5901     address start = __ pc();
5902     assert(((unsigned long long)start & 0x3f) == 0,
5903            "Alignment problem (0x%08llx)", (unsigned long long)start);
5904     __ emit_data64(0x090a040506000102, relocInfo::none);
5905     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
5906     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5907     __ emit_data64(0x292a242526202122, relocInfo::none);
5908     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5909     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5910     __ emit_data64(0x0000000000000000, relocInfo::none);
5911     __ emit_data64(0x0000000000000000, relocInfo::none);
5912     return start;
5913   }
5914 
5915   address base64_vbmi_join_0_1_addr() {
5916     __ align64();
5917     StubCodeMark mark(this, "StubRoutines", "join_0_1_base64");
5918     address start = __ pc();
5919     assert(((unsigned long long)start & 0x3f) == 0,
5920            "Alignment problem (0x%08llx)", (unsigned long long)start);
5921     __ emit_data64(0x090a040506000102, relocInfo::none);
5922     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
5923     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5924     __ emit_data64(0x292a242526202122, relocInfo::none);
5925     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5926     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5927     __ emit_data64(0x494a444546404142, relocInfo::none);
5928     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
5929     return start;
5930   }
5931 
5932   address base64_vbmi_join_1_2_addr() {
5933     __ align64();
5934     StubCodeMark mark(this, "StubRoutines", "join_1_2_base64");
5935     address start = __ pc();
5936     assert(((unsigned long long)start & 0x3f) == 0,
5937            "Alignment problem (0x%08llx)", (unsigned long long)start);
5938     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5939     __ emit_data64(0x292a242526202122, relocInfo::none);
5940     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5941     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5942     __ emit_data64(0x494a444546404142, relocInfo::none);
5943     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
5944     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
5945     __ emit_data64(0x696a646566606162, relocInfo::none);
5946     return start;
5947   }
5948 
5949   address base64_vbmi_join_2_3_addr() {
5950     __ align64();
5951     StubCodeMark mark(this, "StubRoutines", "join_2_3_base64");
5952     address start = __ pc();
5953     assert(((unsigned long long)start & 0x3f) == 0,
5954            "Alignment problem (0x%08llx)", (unsigned long long)start);
5955     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5956     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5957     __ emit_data64(0x494a444546404142, relocInfo::none);
5958     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
5959     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
5960     __ emit_data64(0x696a646566606162, relocInfo::none);
5961     __ emit_data64(0x767071726c6d6e68, relocInfo::none);
5962     __ emit_data64(0x7c7d7e78797a7475, relocInfo::none);
5963     return start;
5964   }
5965 
5966   address base64_decoding_table_addr() {
5967     StubCodeMark mark(this, "StubRoutines", "decoding_table_base64");
5968     address start = __ pc();
5969     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5970     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5971     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5972     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5973     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5974     __ emit_data64(0x3fffffff3effffff, relocInfo::none);
5975     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5976     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
5977     __ emit_data64(0x06050403020100ff, relocInfo::none);
5978     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5979     __ emit_data64(0x161514131211100f, relocInfo::none);
5980     __ emit_data64(0xffffffffff191817, relocInfo::none);
5981     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
5982     __ emit_data64(0x2827262524232221, relocInfo::none);
5983     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5984     __ emit_data64(0xffffffffff333231, relocInfo::none);
5985     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5986     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5987     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5988     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5989     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5990     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5991     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5992     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5993     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5994     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5995     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5996     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5997     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5998     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5999     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6000     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6001 
6002     // URL table
6003     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6004     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6005     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6006     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6007     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6008     __ emit_data64(0xffff3effffffffff, relocInfo::none);
6009     __ emit_data64(0x3b3a393837363534, relocInfo::none);
6010     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
6011     __ emit_data64(0x06050403020100ff, relocInfo::none);
6012     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6013     __ emit_data64(0x161514131211100f, relocInfo::none);
6014     __ emit_data64(0x3fffffffff191817, relocInfo::none);
6015     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
6016     __ emit_data64(0x2827262524232221, relocInfo::none);
6017     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6018     __ emit_data64(0xffffffffff333231, relocInfo::none);
6019     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6020     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6021     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6022     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6023     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6024     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6025     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6026     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6027     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6028     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6029     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6030     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6031     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6032     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6033     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6034     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6035     return start;
6036   }
6037 
6038 
6039 // Code for generating Base64 decoding.
6040 //
6041 // Based on the article (and associated code) from https://arxiv.org/abs/1910.05109.
6042 //
6043 // Intrinsic function prototype in Base64.java:
6044 // private void decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME) {
6045   address generate_base64_decodeBlock() {
6046     __ align(CodeEntryAlignment);
6047     StubCodeMark mark(this, "StubRoutines", "implDecode");
6048     address start = __ pc();
6049     __ enter();
6050 
6051     // Save callee-saved registers before using them
6052     __ push(r12);
6053     __ push(r13);
6054     __ push(r14);
6055     __ push(r15);
6056     __ push(rbx);
6057 
6058     // arguments
6059     const Register source = c_rarg0; // Source Array
6060     const Register start_offset = c_rarg1; // start offset
6061     const Register end_offset = c_rarg2; // end offset
6062     const Register dest = c_rarg3; // destination array
6063     const Register isMIME = rbx;
6064 
6065 #ifndef _WIN64
6066     const Register dp = c_rarg4;  // Position for writing to dest array
6067     const Register isURL = c_rarg5; // Base64 or URL character set
6068     __ movl(isMIME, Address(rbp, 2 * wordSize));
6069 #else
6070     const Address dp_mem(rbp, 6 * wordSize);  // dp is passed on the stack on Win64
6071     const Address isURL_mem(rbp, 7 * wordSize);
6072     const Register isURL = r10;      // pick the volatile windows register
6073     const Register dp = r12;
6074     __ movl(dp, dp_mem);
6075     __ movl(isURL, isURL_mem);
6076     __ movl(isMIME, Address(rbp, 8 * wordSize));
6077 #endif
6078 
6079     const XMMRegister lookup_lo = xmm5;
6080     const XMMRegister lookup_hi = xmm6;
6081     const XMMRegister errorvec = xmm7;
6082     const XMMRegister pack16_op = xmm9;
6083     const XMMRegister pack32_op = xmm8;
6084     const XMMRegister input0 = xmm3;
6085     const XMMRegister input1 = xmm20;
6086     const XMMRegister input2 = xmm21;
6087     const XMMRegister input3 = xmm19;
6088     const XMMRegister join01 = xmm12;
6089     const XMMRegister join12 = xmm11;
6090     const XMMRegister join23 = xmm10;
6091     const XMMRegister translated0 = xmm2;
6092     const XMMRegister translated1 = xmm1;
6093     const XMMRegister translated2 = xmm0;
6094     const XMMRegister translated3 = xmm4;
6095 
6096     const XMMRegister merged0 = xmm2;
6097     const XMMRegister merged1 = xmm1;
6098     const XMMRegister merged2 = xmm0;
6099     const XMMRegister merged3 = xmm4;
6100     const XMMRegister merge_ab_bc0 = xmm2;
6101     const XMMRegister merge_ab_bc1 = xmm1;
6102     const XMMRegister merge_ab_bc2 = xmm0;
6103     const XMMRegister merge_ab_bc3 = xmm4;
6104 
6105     const XMMRegister pack24bits = xmm4;
6106 
6107     const Register length = r14;
6108     const Register output_size = r13;
6109     const Register output_mask = r15;
6110     const KRegister input_mask = k1;
6111 
6112     const XMMRegister input_initial_valid_b64 = xmm0;
6113     const XMMRegister tmp = xmm10;
6114     const XMMRegister mask = xmm0;
6115     const XMMRegister invalid_b64 = xmm1;
6116 
6117     Label L_process256, L_process64, L_process64Loop, L_exit, L_processdata, L_loadURL;
6118     Label L_continue, L_finalBit, L_padding, L_donePadding, L_bruteForce;
6119     Label L_forceLoop, L_bottomLoop, L_checkMIME, L_exit_no_vzero;
6120 
6121     // calculate length from offsets
6122     __ movl(length, end_offset);
6123     __ subl(length, start_offset);
6124     __ push(dest);          // Save for return value calc
6125 
6126     // If AVX512 VBMI not supported, just compile non-AVX code
6127     if (VM_Version::supports_avx512_vbmi() &&
6128        VM_Version::supports_avx512bw()) {
6129       __ cmpl(length, 128);     // 128-bytes is break-even for AVX-512
6130       __ jcc(Assembler::lessEqual, L_bruteForce);
6131 
6132       __ cmpl(isMIME, 0);
6133       __ jcc(Assembler::notEqual, L_bruteForce);
6134 
6135       // Load lookup tables based on isURL
6136       __ cmpl(isURL, 0);
6137       __ jcc(Assembler::notZero, L_loadURL);
6138 
6139       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_addr()), Assembler::AVX_512bit, r13);
6140       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_addr()), Assembler::AVX_512bit, r13);
6141 
6142       __ BIND(L_continue);
6143 
6144       __ movl(r15, 0x01400140);
6145       __ evpbroadcastd(pack16_op, r15, Assembler::AVX_512bit);
6146 
6147       __ movl(r15, 0x00011000);
6148       __ evpbroadcastd(pack32_op, r15, Assembler::AVX_512bit);
6149 
6150       __ cmpl(length, 0xff);
6151       __ jcc(Assembler::lessEqual, L_process64);
6152 
6153       // load masks required for decoding data
6154       __ BIND(L_processdata);
6155       __ evmovdquq(join01, ExternalAddress(StubRoutines::x86::base64_vbmi_join_0_1_addr()), Assembler::AVX_512bit,r13);
6156       __ evmovdquq(join12, ExternalAddress(StubRoutines::x86::base64_vbmi_join_1_2_addr()), Assembler::AVX_512bit, r13);
6157       __ evmovdquq(join23, ExternalAddress(StubRoutines::x86::base64_vbmi_join_2_3_addr()), Assembler::AVX_512bit, r13);
6158 
6159       __ align32();
6160       __ BIND(L_process256);
6161       // Grab input data
6162       __ evmovdquq(input0, Address(source, start_offset, Address::times_1, 0x00), Assembler::AVX_512bit);
6163       __ evmovdquq(input1, Address(source, start_offset, Address::times_1, 0x40), Assembler::AVX_512bit);
6164       __ evmovdquq(input2, Address(source, start_offset, Address::times_1, 0x80), Assembler::AVX_512bit);
6165       __ evmovdquq(input3, Address(source, start_offset, Address::times_1, 0xc0), Assembler::AVX_512bit);
6166 
6167       // Copy the low part of the lookup table into the destination of the permutation
6168       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6169       __ evmovdquq(translated1, lookup_lo, Assembler::AVX_512bit);
6170       __ evmovdquq(translated2, lookup_lo, Assembler::AVX_512bit);
6171       __ evmovdquq(translated3, lookup_lo, Assembler::AVX_512bit);
6172 
6173       // Translate the base64 input into "decoded" bytes
6174       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6175       __ evpermt2b(translated1, input1, lookup_hi, Assembler::AVX_512bit);
6176       __ evpermt2b(translated2, input2, lookup_hi, Assembler::AVX_512bit);
6177       __ evpermt2b(translated3, input3, lookup_hi, Assembler::AVX_512bit);
6178 
6179       // OR all of the translations together to check for errors (high-order bit of byte set)
6180       __ vpternlogd(input0, 0xfe, input1, input2, Assembler::AVX_512bit);
6181 
6182       __ vpternlogd(input3, 0xfe, translated0, translated1, Assembler::AVX_512bit);
6183       __ vpternlogd(input0, 0xfe, translated2, translated3, Assembler::AVX_512bit);
6184       __ vpor(errorvec, input3, input0, Assembler::AVX_512bit);
6185 
6186       // Check if there was an error - if so, try 64-byte chunks
6187       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6188       __ kortestql(k3, k3);
6189       __ jcc(Assembler::notZero, L_process64);
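      // (The lookup tables hold 0x80 for characters outside the alphabet, and
      // raw input bytes >= 0x80 also carry the sign bit, so evpmovb2m yields
      // one mask bit per offending byte of the 256-byte chunk.)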
6190 
6191       // The merging and shuffling happens here
6192       // We multiply each byte pair [00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa]
6193       // Multiply [00cccccc] by 2^6 added to [00dddddd] to get [0000cccc | ccdddddd]
6194       // The pack16_op is a vector of 0x01400140, so multiply D by 1 and C by 0x40
6195       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6196       __ vpmaddubsw(merge_ab_bc1, translated1, pack16_op, Assembler::AVX_512bit);
6197       __ vpmaddubsw(merge_ab_bc2, translated2, pack16_op, Assembler::AVX_512bit);
6198       __ vpmaddubsw(merge_ab_bc3, translated3, pack16_op, Assembler::AVX_512bit);
6199 
6200       // Now do the same with packed 16-bit values.
6201       // We start with [0000cccc | ccdddddd | 0000aaaa | aabbbbbb]
6202       // pack32_op is 0x00011000 (2^12, 1), so this multiplies [0000aaaa | aabbbbbb] by 2^12
6203       // and adds [0000cccc | ccdddddd] to yield [00000000 | aaaaaabb | bbbbcccc | ccdddddd]
6204       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6205       __ vpmaddwd(merged1, merge_ab_bc1, pack32_op, Assembler::AVX_512bit);
6206       __ vpmaddwd(merged2, merge_ab_bc2, pack32_op, Assembler::AVX_512bit);
6207       __ vpmaddwd(merged3, merge_ab_bc3, pack32_op, Assembler::AVX_512bit);
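      // Worked example: decoding "TWFu" gives the indices 19, 22, 5, 46, and
      // the two multiply-add steps combine them into
      // (19 << 18) | (22 << 12) | (5 << 6) | 46 = 0x4D616E, i.e. the bytes
      // 'M', 'a', 'n' once the join permutation below drops each dword's zero
      // byte and puts the remaining three in output order.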
6208 
6209       // The join vectors specify which byte from which vector goes into the outputs
6210       // One of every 4 bytes in the extended vector is zero, so we pack them into their
6211       // final positions in the register for storing (256 bytes in, 192 bytes out)
6212       __ evpermt2b(merged0, join01, merged1, Assembler::AVX_512bit);
6213       __ evpermt2b(merged1, join12, merged2, Assembler::AVX_512bit);
6214       __ evpermt2b(merged2, join23, merged3, Assembler::AVX_512bit);
6215 
6216       // Store result
6217       __ evmovdquq(Address(dest, dp, Address::times_1, 0x00), merged0, Assembler::AVX_512bit);
6218       __ evmovdquq(Address(dest, dp, Address::times_1, 0x40), merged1, Assembler::AVX_512bit);
6219       __ evmovdquq(Address(dest, dp, Address::times_1, 0x80), merged2, Assembler::AVX_512bit);
6220 
6221       __ addptr(source, 0x100);
6222       __ addptr(dest, 0xc0);
6223       __ subl(length, 0x100);
6224       __ cmpl(length, 64 * 4);
6225       __ jcc(Assembler::greaterEqual, L_process256);
6226 
6227       // At this point, we've decoded 64 * 4 * n bytes.
6228       // The remaining length will be <= 64 * 4 - 1.
6229       // UNLESS there was an error decoding the first 256-byte chunk.  In this
6230       // case, the length will be arbitrarily long.
6231       //
6232       // Note that this will be the path for MIME-encoded strings.
6233 
6234       __ BIND(L_process64);
6235 
6236       __ evmovdquq(pack24bits, ExternalAddress(StubRoutines::x86::base64_vbmi_pack_vec_addr()), Assembler::AVX_512bit, r13);
6237 
6238       __ cmpl(length, 63);
6239       __ jcc(Assembler::lessEqual, L_finalBit);
6240 
6241       __ mov64(rax, 0x0000ffffffffffff);
6242       __ kmovql(k2, rax);
6243 
6244       __ align32();
6245       __ BIND(L_process64Loop);
6246 
6247       // Handle first 64-byte block
6248 
6249       __ evmovdquq(input0, Address(source, start_offset), Assembler::AVX_512bit);
6250       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6251       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6252 
6253       __ vpor(errorvec, translated0, input0, Assembler::AVX_512bit);
6254 
6255       // Check for error and bomb out before updating dest
6256       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6257       __ kortestql(k3, k3);
6258       __ jcc(Assembler::notZero, L_exit);
6259 
6260       // Pack output register, selecting correct byte ordering
6261       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6262       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6263       __ vpermb(merged0, pack24bits, merged0, Assembler::AVX_512bit);
6264 
6265       __ evmovdqub(Address(dest, dp), k2, merged0, true, Assembler::AVX_512bit);
6266 
6267       __ subl(length, 64);
6268       __ addptr(source, 64);
6269       __ addptr(dest, 48);
6270 
6271       __ cmpl(length, 64);
6272       __ jcc(Assembler::greaterEqual, L_process64Loop);
6273 
6274       __ cmpl(length, 0);
6275       __ jcc(Assembler::lessEqual, L_exit);
6276 
6277       __ BIND(L_finalBit);
6278       // Now have 1 to 63 bytes left to decode
6279 
6280       // We could let Java take care of the final fragment; however, it
6281       // would repeatedly call this routine for every 4 bytes of input
6282       // data, so handle the rest here instead.
6283       __ movq(rax, -1);
6284       __ bzhiq(rax, rax, length);    // Input mask in rax
6285 
6286       __ movl(output_size, length);
6287       __ shrl(output_size, 2);   // Find (len / 4) * 3 (output length)
6288       __ lea(output_size, Address(output_size, output_size, Address::times_2, 0));
6289       // output_size in r13
6290 
6291       // Strip pad characters, if any, and adjust length and mask
6292       __ cmpb(Address(source, length, Address::times_1, -1), '=');
6293       __ jcc(Assembler::equal, L_padding);
6294 
6295       __ BIND(L_donePadding);
6296 
6297       // output_mask gets its low output_size bits set, i.e. (all 1s) >> (64 - output_size).
6298       __ kmovql(input_mask, rax);
6299       __ movq(output_mask, -1);
6300       __ bzhiq(output_mask, output_mask, output_size);
6301 
6302       // Load initial input with all valid base64 characters.  Will be used
6303       // in merging source bytes to avoid masking when determining if an error occurred.
6304       __ movl(rax, 0x61616161);
6305       __ evpbroadcastd(input_initial_valid_b64, rax, Assembler::AVX_512bit);
6306 
6307       // A register containing all invalid base64 decoded values
6308       __ movl(rax, 0x80808080);
6309       __ evpbroadcastd(invalid_b64, rax, Assembler::AVX_512bit);
6310 
6311       // input_mask is in k1
6312       // output_size is in r13
6313       // output_mask is in r15
6314       // zmm0 - free
6315       // zmm1 - 0x00011000
6316       // zmm2 - 0x01400140
6317       // zmm3 - errorvec
6318       // zmm4 - pack vector
6319       // zmm5 - lookup_lo
6320       // zmm6 - lookup_hi
6321       // zmm7 - errorvec
6322       // zmm8 - 0x61616161
6323       // zmm9 - 0x80808080
6324 
6325       // Load only the bytes from source, merging into our "fully-valid" register
6326       __ evmovdqub(input_initial_valid_b64, input_mask, Address(source, start_offset, Address::times_1, 0x0), true, Assembler::AVX_512bit);
6327 
6328       // Decode all bytes within our merged input
6329       __ evmovdquq(tmp, lookup_lo, Assembler::AVX_512bit);
6330       __ evpermt2b(tmp, input_initial_valid_b64, lookup_hi, Assembler::AVX_512bit);
6331       __ vporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);
6332 
6333       // Check for error.  Compare (decoded | initial) to all invalid.
6334       // If any bytes have their high-order bit set, then we have an error.
6335       __ evptestmb(k2, mask, invalid_b64, Assembler::AVX_512bit);
6336       __ kortestql(k2, k2);
6337 
6338       // If we have an error, use the brute force loop to decode what we can (4-byte chunks).
6339       __ jcc(Assembler::notZero, L_bruteForce);
6340 
6341       // Shuffle output bytes
6342       __ vpmaddubsw(tmp, tmp, pack16_op, Assembler::AVX_512bit);
6343       __ vpmaddwd(tmp, tmp, pack32_op, Assembler::AVX_512bit);
6344 
6345       __ vpermb(tmp, pack24bits, tmp, Assembler::AVX_512bit);
6346       __ kmovql(k1, output_mask);
6347       __ evmovdqub(Address(dest, dp), k1, tmp, true, Assembler::AVX_512bit);
6348 
6349       __ addptr(dest, output_size);
6350 
6351       __ BIND(L_exit);
6352       __ vzeroupper();
6353       __ pop(rax);             // Get original dest value
6354       __ subptr(dest, rax);      // Number of bytes converted
6355       __ movptr(rax, dest);
6356       __ pop(rbx);
6357       __ pop(r15);
6358       __ pop(r14);
6359       __ pop(r13);
6360       __ pop(r12);
6361       __ leave();
6362       __ ret(0);
6363 
6364       __ BIND(L_loadURL);
6365       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_url_addr()), Assembler::AVX_512bit, r13);
6366       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_url_addr()), Assembler::AVX_512bit, r13);
6367       __ jmp(L_continue);
6368 
6369       __ BIND(L_padding);
6370       __ decrementq(output_size, 1);
6371       __ shrq(rax, 1);
6372 
6373       __ cmpb(Address(source, length, Address::times_1, -2), '=');
6374       __ jcc(Assembler::notEqual, L_donePadding);
6375 
6376       __ decrementq(output_size, 1);
6377       __ shrq(rax, 1);
6378       __ jmp(L_donePadding);
6379 
6380       __ align32();
6381       __ BIND(L_bruteForce);
6382     }   // End of if(avx512_vbmi)
6383 
6384     // Use non-AVX code to decode 4-byte chunks into 3 bytes of output
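    // Roughly equivalent C for one 4-byte chunk (sketch only; names are
    // illustrative, and invalid characters decode to 0xff in the table):
    //   int b1 = table[src[0]], b2 = table[src[1]],
    //       b3 = table[src[2]], b4 = table[src[3]];
    //   if ((b1 | b2 | b3 | b4) < 0) break;  // sign-extended 0xff => stop
    //   int v = (b1 << 18) | (b2 << 12) | (b3 << 6) | b4;
    //   dst[0] = v >> 16; dst[1] = v >> 8; dst[2] = v;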
6385 
6386     // Register state (Linux):
6387     // r12-15 - saved on stack
6388     // rdi - src
6389     // rsi - sp
6390     // rdx - sl
6391     // rcx - dst
6392     // r8 - dp
6393     // r9 - isURL
6394 
6395     // Register state (Windows):
6396     // r12-15 - saved on stack
6397     // rcx - src
6398     // rdx - sp
6399     // r8 - sl
6400     // r9 - dst
6401     // r12 - dp
6402     // r10 - isURL
6403 
6404     // Registers (common):
6405     // length (r14) - bytes in src
6406 
6407     const Register decode_table = r11;
6408     const Register out_byte_count = rbx;
6409     const Register byte1 = r13;
6410     const Register byte2 = r15;
6411     const Register byte3 = WINDOWS_ONLY(r8) NOT_WINDOWS(rdx);
6412     const Register byte4 = WINDOWS_ONLY(r10) NOT_WINDOWS(r9);
6413 
6414     __ shrl(length, 2);    // Multiple of 4 bytes only - length is # 4-byte chunks
6415     __ cmpl(length, 0);
6416     __ jcc(Assembler::lessEqual, L_exit_no_vzero);
6417 
6418     __ shll(isURL, 8);    // index into decode table based on isURL
6419     __ lea(decode_table, ExternalAddress(StubRoutines::x86::base64_decoding_table_addr()));
6420     __ addptr(decode_table, isURL);
6421 
6422     __ jmp(L_bottomLoop);
6423 
6424     __ align32();
6425     __ BIND(L_forceLoop);
6426     __ shll(byte1, 18);
6427     __ shll(byte2, 12);
6428     __ shll(byte3, 6);
6429     __ orl(byte1, byte2);
6430     __ orl(byte1, byte3);
6431     __ orl(byte1, byte4);
6432 
6433     __ addptr(source, 4);
6434 
6435     __ movb(Address(dest, dp, Address::times_1, 2), byte1);
6436     __ shrl(byte1, 8);
6437     __ movb(Address(dest, dp, Address::times_1, 1), byte1);
6438     __ shrl(byte1, 8);
6439     __ movb(Address(dest, dp, Address::times_1, 0), byte1);
6440 
6441     __ addptr(dest, 3);
6442     __ decrementl(length, 1);
6443     __ jcc(Assembler::zero, L_exit_no_vzero);
6444 
6445     __ BIND(L_bottomLoop);
6446     __ load_unsigned_byte(byte1, Address(source, start_offset, Address::times_1, 0x00));
6447     __ load_unsigned_byte(byte2, Address(source, start_offset, Address::times_1, 0x01));
6448     __ load_signed_byte(byte1, Address(decode_table, byte1));
6449     __ load_signed_byte(byte2, Address(decode_table, byte2));
6450     __ load_unsigned_byte(byte3, Address(source, start_offset, Address::times_1, 0x02));
6451     __ load_unsigned_byte(byte4, Address(source, start_offset, Address::times_1, 0x03));
6452     __ load_signed_byte(byte3, Address(decode_table, byte3));
6453     __ load_signed_byte(byte4, Address(decode_table, byte4));
6454 
6455     __ mov(rax, byte1);
6456     __ orl(rax, byte2);
6457     __ orl(rax, byte3);
6458     __ orl(rax, byte4);
6459     __ jcc(Assembler::positive, L_forceLoop);
6460 
6461     __ BIND(L_exit_no_vzero);
6462     __ pop(rax);             // Get original dest value
6463     __ subptr(dest, rax);      // Number of bytes converted
6464     __ movptr(rax, dest);
6465     __ pop(rbx);
6466     __ pop(r15);
6467     __ pop(r14);
6468     __ pop(r13);
6469     __ pop(r12);
6470     __ leave();
6471     __ ret(0);
6472 
6473     return start;
6474   }
6475 
6476 
6477   /**
6478    *  Arguments:
6479    *
6480    * Inputs:
6481    *   c_rarg0   - int crc
6482    *   c_rarg1   - byte* buf
6483    *   c_rarg2   - int length
6484    *
6485    * Output:
6486    *       rax   - int crc result
6487    */
6488   address generate_updateBytesCRC32() {
6489     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
6490 
6491     __ align(CodeEntryAlignment);
6492     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
6493 
6494     address start = __ pc();
6495     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6496     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6497     // rscratch1: r10
6498     const Register crc   = c_rarg0;  // crc
6499     const Register buf   = c_rarg1;  // source java byte array address
6500     const Register len   = c_rarg2;  // length
6501     const Register table = c_rarg3;  // crc_table address (reuse register)
6502     const Register tmp1   = r11;
6503     const Register tmp2   = r10;
6504     assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax);
6505 
6506     BLOCK_COMMENT("Entry:");
6507     __ enter(); // required for proper stackwalking of RuntimeStub frame
6508 
6509     if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6510         VM_Version::supports_avx512bw() &&
6511         VM_Version::supports_avx512vl()) {
6512       // The constants used in the CRC32 algorithm require the 1's complement of the initial crc value.
6513       // However, the constant table for CRC32-C assumes the original crc value.  Account for this
6514       // difference before calling and after returning.
6515       __ lea(table, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
6516       __ notl(crc);
6517       __ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2);
6518       __ notl(crc);
6519     } else {
6520       __ kernel_crc32(crc, buf, len, table, tmp1);
6521     }
6522 
6523     __ movl(rax, crc);
6524     __ vzeroupper();
6525     __ leave(); // required for proper stackwalking of RuntimeStub frame
6526     __ ret(0);
6527 
6528     return start;
6529   }
6530 
6531   /**
6532   *  Arguments:
6533   *
6534   * Inputs:
6535   *   c_rarg0   - int crc
6536   *   c_rarg1   - byte* buf
6537   *   c_rarg2   - long length
6538   *   c_rarg3   - table_start - optional (present only when doing a library_call,
6539   *              not used by x86 algorithm)
6540   *
6541   * Output:
6542   *       rax   - int crc result
6543   */
6544   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
6545       assert(UseCRC32CIntrinsics, "need SSE4_2");
6546       __ align(CodeEntryAlignment);
6547       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
6548       address start = __ pc();
6549       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
6550       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
6551       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
6552       const Register crc = c_rarg0;  // crc
6553       const Register buf = c_rarg1;  // source java byte array address
6554       const Register len = c_rarg2;  // length
6555       const Register a = rax;
6556       const Register j = r9;
6557       const Register k = r10;
6558       const Register l = r11;
6559 #ifdef _WIN64
6560       const Register y = rdi;
6561       const Register z = rsi;
6562 #else
6563       const Register y = rcx;
6564       const Register z = r8;
6565 #endif
6566       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
6567 
6568       BLOCK_COMMENT("Entry:");
6569       __ enter(); // required for proper stackwalking of RuntimeStub frame
6570       if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6571           VM_Version::supports_avx512bw() &&
6572           VM_Version::supports_avx512vl()) {
6573         __ lea(j, ExternalAddress(StubRoutines::x86::crc32c_table_avx512_addr()));
6574         __ kernel_crc32_avx512(crc, buf, len, j, l, k);
6575       } else {
6576 #ifdef _WIN64
6577         __ push(y);
6578         __ push(z);
6579 #endif
6580         __ crc32c_ipl_alg2_alt2(crc, buf, len,
6581                                 a, j, k,
6582                                 l, y, z,
6583                                 c_farg0, c_farg1, c_farg2,
6584                                 is_pclmulqdq_supported);
6585 #ifdef _WIN64
6586         __ pop(z);
6587         __ pop(y);
6588 #endif
6589       }
6590       __ movl(rax, crc);
6591       __ vzeroupper();
6592       __ leave(); // required for proper stackwalking of RuntimeStub frame
6593       __ ret(0);
6594 
6595       return start;
6596   }
6597 
6598 
6599   /**
6600    *  Arguments:
6601    *
6602    *  Inputs:
6603    *   c_rarg0   - int   adler
6604    *   c_rarg1   - byte* buff
6605    *   c_rarg2   - int   len
6606    *
6607    * Output:
6608    *   rax   - int adler result
6609    */
6610 
6611   address generate_updateBytesAdler32() {
6612       assert(UseAdler32Intrinsics, "need AVX2");
6613 
6614       __ align(CodeEntryAlignment);
6615       StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
6616 
6617       address start = __ pc();
6618 
6619       const Register data = r9;
6620       const Register size = r10;
6621 
6622       const XMMRegister yshuf0 = xmm6;
6623       const XMMRegister yshuf1 = xmm7;
6624       assert_different_registers(c_rarg0, c_rarg1, c_rarg2, data, size);
6625 
6626       BLOCK_COMMENT("Entry:");
6627       __ enter(); // required for proper stackwalking of RuntimeStub frame
6628 
6629       __ vmovdqu(yshuf0, ExternalAddress((address) StubRoutines::x86::_adler32_shuf0_table), r9);
6630       __ vmovdqu(yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_shuf1_table), r9);
6631       __ movptr(data, c_rarg1); //data
6632       __ movl(size, c_rarg2); //length
6633       __ updateBytesAdler32(c_rarg0, data, size, yshuf0, yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_ascale_table));
6634       __ leave();
6635       __ ret(0);
6636       return start;
6637   }
6638 
6639   /**
6640    *  Arguments:
6641    *
6642    *  Input:
6643    *    c_rarg0   - x address
6644    *    c_rarg1   - x length
6645    *    c_rarg2   - y address
6646    *    c_rarg3   - y length
6647    * not Win64
6648    *    c_rarg4   - z address
6649    *    c_rarg5   - z length
6650    * Win64
6651    *    rsp+40    - z address
6652    *    rsp+48    - z length
6653    */
6654   address generate_multiplyToLen() {
6655     __ align(CodeEntryAlignment);
6656     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
6657 
6658     address start = __ pc();
6659     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6660     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6661     const Register x     = rdi;
6662     const Register xlen  = rax;
6663     const Register y     = rsi;
6664     const Register ylen  = rcx;
6665     const Register z     = r8;
6666     const Register zlen  = r11;
6667 
6668     // Next registers will be saved on stack in multiply_to_len().
6669     const Register tmp1  = r12;
6670     const Register tmp2  = r13;
6671     const Register tmp3  = r14;
6672     const Register tmp4  = r15;
6673     const Register tmp5  = rbx;
6674 
6675     BLOCK_COMMENT("Entry:");
6676     __ enter(); // required for proper stackwalking of RuntimeStub frame
6677 
6678 #ifndef _WIN64
6679     __ movptr(zlen, r9); // Save r9 in r11 - zlen
6680 #endif
6681     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
6682                        // ylen => rcx, z => r8, zlen => r11
6683                        // r9 and r10 may be used to save non-volatile registers
6684 #ifdef _WIN64
6685     // last 2 arguments (#4, #5) are on stack on Win64
6686     __ movptr(z, Address(rsp, 6 * wordSize));
6687     __ movptr(zlen, Address(rsp, 7 * wordSize));
6688 #endif
6689 
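         // setup_arg_regs left xlen in rsi and y in rdx; move them into their assigned registers (rax and rsi).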
6690     __ movptr(xlen, rsi);
6691     __ movptr(y,    rdx);
6692     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
6693 
6694     restore_arg_regs();
6695 
6696     __ leave(); // required for proper stackwalking of RuntimeStub frame
6697     __ ret(0);
6698 
6699     return start;
6700   }
6701 
6702   /**
6703   *  Arguments:
6704   *
6705   *  Input:
6706   *    c_rarg0   - obja     address
6707   *    c_rarg1   - objb     address
6708   *    c_rarg2   - length   length
6709   *    c_rarg3   - scale    log2 array index scale
6710   *
6711   *  Output:
6712   *        rax   - >= 0: index of the first mismatch; < 0: bitwise complement of the number of remaining tail elements
6713   */
6714   address generate_vectorizedMismatch() {
6715     __ align(CodeEntryAlignment);
6716     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
6717     address start = __ pc();
6718 
6719     BLOCK_COMMENT("Entry:");
6720     __ enter();
6721 
6722 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6723     const Register scale = c_rarg0;  //rcx, will exchange with r9
6724     const Register objb = c_rarg1;   //rdx
6725     const Register length = c_rarg2; //r8
6726     const Register obja = c_rarg3;   //r9
6727     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
6728 
6729     const Register tmp1 = r10;
6730     const Register tmp2 = r11;
6731 #endif
6732 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6733     const Register obja = c_rarg0;   //U:rdi
6734     const Register objb = c_rarg1;   //U:rsi
6735     const Register length = c_rarg2; //U:rdx
6736     const Register scale = c_rarg3;  //U:rcx
6737     const Register tmp1 = r8;
6738     const Register tmp2 = r9;
6739 #endif
6740     const Register result = rax; //return value
6741     const XMMRegister vec0 = xmm0;
6742     const XMMRegister vec1 = xmm1;
6743     const XMMRegister vec2 = xmm2;
6744 
6745     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
6746 
6747     __ vzeroupper();
6748     __ leave();
6749     __ ret(0);
6750 
6751     return start;
6752   }
6753 
6754   /**
6755    *  Arguments:
6756    *
6757    *  Input:
6758    *    c_rarg0   - x address
6759    *    c_rarg1   - x length
6760    *    c_rarg2   - z address
6761    *    c_rarg3   - z length
6762    *
6763    */
6764   address generate_squareToLen() {
6765 
6766     __ align(CodeEntryAlignment);
6767     StubCodeMark mark(this, "StubRoutines", "squareToLen");
6768 
6769     address start = __ pc();
6770     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6771     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
6772     const Register x      = rdi;
6773     const Register len    = rsi;
6774     const Register z      = r8;
6775     const Register zlen   = rcx;
6776 
6777     const Register tmp1      = r12;
6778     const Register tmp2      = r13;
6779     const Register tmp3      = r14;
6780     const Register tmp4      = r15;
6781     const Register tmp5      = rbx;
6782 
6783     BLOCK_COMMENT("Entry:");
6784     __ enter(); // required for proper stackwalking of RuntimeStub frame
6785 
6786     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
6787                        // zlen => rcx
6788                        // r9 and r10 may be used to save non-volatile registers
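         // setup_arg_regs left z in rdx; move it to r8, where square_to_len expects it.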
6789     __ movptr(r8, rdx);
6790     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
6791 
6792     restore_arg_regs();
6793 
6794     __ leave(); // required for proper stackwalking of RuntimeStub frame
6795     __ ret(0);
6796 
6797     return start;
6798   }
6799 
6800   address generate_method_entry_barrier() {
6801     __ align(CodeEntryAlignment);
6802     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
6803 
6804     Label deoptimize_label;
6805 
6806     address start = __ pc();
6807 
6808     __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
6809 
6810     BLOCK_COMMENT("Entry:");
6811     __ enter(); // save rbp
6812 
6813     // save c_rarg0, because we want to use that value.
6814     // We could do without it but then we depend on the number of slots used by pusha
6815     __ push(c_rarg0);
6816 
6817     __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
6818 
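         // Save all general-purpose registers: the nmethod's arguments (and any other live values) must survive the runtime call.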
6819     __ pusha();
6820 
6821     // The method may have floats as arguments, and we must spill them before calling
6822     // the VM runtime.
6823     assert(Argument::n_float_register_parameters_j == 8, "Assumption");
6824     const int xmm_size = wordSize * 2;
6825     const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
6826     __ subptr(rsp, xmm_spill_size);
6827     __ movdqu(Address(rsp, xmm_size * 7), xmm7);
6828     __ movdqu(Address(rsp, xmm_size * 6), xmm6);
6829     __ movdqu(Address(rsp, xmm_size * 5), xmm5);
6830     __ movdqu(Address(rsp, xmm_size * 4), xmm4);
6831     __ movdqu(Address(rsp, xmm_size * 3), xmm3);
6832     __ movdqu(Address(rsp, xmm_size * 2), xmm2);
6833     __ movdqu(Address(rsp, xmm_size * 1), xmm1);
6834     __ movdqu(Address(rsp, xmm_size * 0), xmm0);
6835 
6836     __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);
6837 
6838     __ movdqu(xmm0, Address(rsp, xmm_size * 0));
6839     __ movdqu(xmm1, Address(rsp, xmm_size * 1));
6840     __ movdqu(xmm2, Address(rsp, xmm_size * 2));
6841     __ movdqu(xmm3, Address(rsp, xmm_size * 3));
6842     __ movdqu(xmm4, Address(rsp, xmm_size * 4));
6843     __ movdqu(xmm5, Address(rsp, xmm_size * 5));
6844     __ movdqu(xmm6, Address(rsp, xmm_size * 6));
6845     __ movdqu(xmm7, Address(rsp, xmm_size * 7));
6846     __ addptr(rsp, xmm_spill_size);
6847 
6848     __ cmpl(rax, 1); // 1 means deoptimize
6849     __ jcc(Assembler::equal, deoptimize_label);
6850 
6851     __ popa();
6852     __ pop(c_rarg0);
6853 
6854     __ leave();
6855 
6856     __ addptr(rsp, 1 * wordSize); // cookie
6857     __ ret(0);
6858 
6859 
6860     __ BIND(deoptimize_label);
6861 
6862     __ popa();
6863     __ pop(c_rarg0);
6864 
6865     __ leave();
6866 
6867     // This can be taken out, but it is good for verification purposes: getting a SIGSEGV
6868     // here while still having a correct stack is valuable.
6869     __ testptr(rsp, Address(rsp, 0));
6870 
6871     __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
6872     __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be the caller's verified_entry_point
6873 
6874     return start;
6875   }
6876 
6877    /**
6878    *  Arguments:
6879    *
6880    *  Input:
6881    *    c_rarg0   - out address
6882    *    c_rarg1   - in address
6883    *    c_rarg2   - offset
6884    *    c_rarg3   - len
6885    * not Win64
6886    *    c_rarg4   - k
6887    * Win64
6888    *    rsp+40    - k
6889    */
6890   address generate_mulAdd() {
6891     __ align(CodeEntryAlignment);
6892     StubCodeMark mark(this, "StubRoutines", "mulAdd");
6893 
6894     address start = __ pc();
6895     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6896     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6897     const Register out     = rdi;
6898     const Register in      = rsi;
6899     const Register offset  = r11;
6900     const Register len     = rcx;
6901     const Register k       = r8;
6902 
6903     // Next registers will be saved on stack in mul_add().
6904     const Register tmp1  = r12;
6905     const Register tmp2  = r13;
6906     const Register tmp3  = r14;
6907     const Register tmp4  = r15;
6908     const Register tmp5  = rbx;
6909 
6910     BLOCK_COMMENT("Entry:");
6911     __ enter(); // required for proper stackwalking of RuntimeStub frame
6912 
6913     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
6914                        // len => rcx, k => r8
6915                        // r9 and r10 may be used to save non-volatile registers
6916 #ifdef _WIN64
6917     // last argument is on stack on Win64
6918     __ movl(k, Address(rsp, 6 * wordSize));
6919 #endif
6920     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
6921     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
6922 
6923     restore_arg_regs();
6924 
6925     __ leave(); // required for proper stackwalking of RuntimeStub frame
6926     __ ret(0);
6927 
6928     return start;
6929   }
6930 
6931   address generate_bigIntegerRightShift() {
6932     __ align(CodeEntryAlignment);
6933     StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
6934 
6935     address start = __ pc();
6936     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
6937     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
6938     const Register newArr = rdi;
6939     const Register oldArr = rsi;
6940     const Register newIdx = rdx;
6941     const Register shiftCount = rcx;  // shiftCount is intentionally in rcx since it is used implicitly by the shift instructions.
6942     const Register totalNumIter = r8;
6943 
6944     // For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
6945     // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
6946     const Register tmp1 = r11;                    // Caller save.
6947     const Register tmp2 = rax;                    // Caller save.
6948     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
6949     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
6950     const Register tmp5 = r14;                    // Callee save.
6951     const Register tmp6 = r15;
6952 
6953     const XMMRegister x0 = xmm0;
6954     const XMMRegister x1 = xmm1;
6955     const XMMRegister x2 = xmm2;
6956 
6957     BLOCK_COMMENT("Entry:");
6958     __ enter(); // required for proper stackwalking of RuntimeStub frame
6959 
6960 #ifdef _WINDOWS
6961     setup_arg_regs(4);
6962     // For windows, since last argument is on stack, we need to move it to the appropriate register.
6963     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
6964     // Save callee save registers.
6965     __ push(tmp3);
6966     __ push(tmp4);
6967 #endif
6968     __ push(tmp5);
6969 
6970     // Rename temps used throughout the code.
6971     const Register idx = tmp1;
6972     const Register nIdx = tmp2;
6973 
6974     __ xorl(idx, idx);
6975 
6976     // Start right shift from end of the array.
6977     // For example, if #iterations = 4 and newIdx = 1
6978     // then dest[4] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
6979     // and if #iterations = 4 and newIdx = 0
6980     // then dest[3] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
6981     __ movl(idx, totalNumIter);
6982     __ movl(nIdx, idx);
6983     __ addl(nIdx, newIdx);
6984 
6985     // If vectorization is enabled, check if the number of iterations is at least AVX3Threshold/64 (64 by default).
6986     // If not, go to ShiftTwo, which processes two elements per iteration.
6987     if (VM_Version::supports_avx512_vbmi2()) {
6988       __ cmpptr(totalNumIter, (AVX3Threshold/64));
6989       __ jcc(Assembler::less, ShiftTwo);
6990 
6991       if (AVX3Threshold < 16 * 64) {
6992         __ cmpl(totalNumIter, 16);
6993         __ jcc(Assembler::less, ShiftTwo);
6994       }
6995       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
6996       __ subl(idx, 16);
6997       __ subl(nIdx, 16);
6998       __ BIND(Shift512Loop);
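           // Each iteration processes 16 dwords: funnel-shift adjacent source limbs right by shiftCount (VPSHRDVD) and store 16 result dwords.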
6999       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 4), Assembler::AVX_512bit);
7000       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7001       __ vpshrdvd(x2, x1, x0, Assembler::AVX_512bit);
7002       __ evmovdqul(Address(newArr, nIdx, Address::times_4), x2, Assembler::AVX_512bit);
7003       __ subl(nIdx, 16);
7004       __ subl(idx, 16);
7005       __ jcc(Assembler::greaterEqual, Shift512Loop);
7006       __ addl(idx, 16);
7007       __ addl(nIdx, 16);
7008     }
7009     __ BIND(ShiftTwo);
7010     __ cmpl(idx, 2);
7011     __ jcc(Assembler::less, ShiftOne);
7012     __ subl(idx, 2);
7013     __ subl(nIdx, 2);
7014     __ BIND(ShiftTwoLoop);
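         // Scalar path: shrdl shifts each limb right by shiftCount (in cl), pulling in bits from the adjacent limb.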
7015     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 8));
7016     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7017     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7018     __ shrdl(tmp5, tmp4);
7019     __ shrdl(tmp4, tmp3);
7020     __ movl(Address(newArr, nIdx, Address::times_4, 4), tmp5);
7021     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7022     __ subl(nIdx, 2);
7023     __ subl(idx, 2);
7024     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7025     __ addl(idx, 2);
7026     __ addl(nIdx, 2);
7027 
7028     // Do the last iteration
7029     __ BIND(ShiftOne);
7030     __ cmpl(idx, 1);
7031     __ jcc(Assembler::less, Exit);
7032     __ subl(idx, 1);
7033     __ subl(nIdx, 1);
7034     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7035     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7036     __ shrdl(tmp4, tmp3);
7037     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7038     __ BIND(Exit);
7039     __ vzeroupper();
7040     // Restore callee save registers.
7041     __ pop(tmp5);
7042 #ifdef _WINDOWS
7043     __ pop(tmp4);
7044     __ pop(tmp3);
7045     restore_arg_regs();
7046 #endif
7047     __ leave(); // required for proper stackwalking of RuntimeStub frame
7048     __ ret(0);
7049     return start;
7050   }
7051 
7052    /**
7053    *  Arguments:
7054    *
7055    *  Input:
7056    *    c_rarg0   - newArr address
7057    *    c_rarg1   - oldArr address
7058    *    c_rarg2   - newIdx
7059    *    c_rarg3   - shiftCount
7060    * not Win64
7061    *    c_rarg4   - numIter
7062    * Win64
7063    *    rsp+40   - numIter
7064    */
7065   address generate_bigIntegerLeftShift() {
7066     __ align(CodeEntryAlignment);
7067     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
7068     address start = __ pc();
7069     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
7070     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
7071     const Register newArr = rdi;
7072     const Register oldArr = rsi;
7073     const Register newIdx = rdx;
7074     const Register shiftCount = rcx;  // shiftCount is intentionally in rcx since it is used implicitly by the shift instructions.
7075     const Register totalNumIter = r8;
7076     // For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
7077     // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
7078     const Register tmp1 = r11;                    // Caller save.
7079     const Register tmp2 = rax;                    // Caller save.
7080     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
7081     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
7082     const Register tmp5 = r14;                    // Callee save.
7083 
7084     const XMMRegister x0 = xmm0;
7085     const XMMRegister x1 = xmm1;
7086     const XMMRegister x2 = xmm2;
7087     BLOCK_COMMENT("Entry:");
7088     __ enter(); // required for proper stackwalking of RuntimeStub frame
7089 
7090 #ifdef _WINDOWS
7091     setup_arg_regs(4);
7092     // For windows, since last argument is on stack, we need to move it to the appropriate register.
7093     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
7094     // Save callee save registers.
7095     __ push(tmp3);
7096     __ push(tmp4);
7097 #endif
7098     __ push(tmp5);
7099 
7100     // Rename temps used throughout the code
7101     const Register idx = tmp1;
7102     const Register numIterTmp = tmp2;
7103 
7104     // Start idx from zero.
7105     __ xorl(idx, idx);
7106     // Compute interior pointer for new array. We do this so that we can use the same index for both the old and new arrays.
7107     __ lea(newArr, Address(newArr, newIdx, Address::times_4));
7108     __ movl(numIterTmp, totalNumIter);
7109 
7110     // If vectorization is enabled, check if the number of iterations is at least AVX3Threshold/64 (64 by default).
7111     // If not, go to ShiftTwo, shifting two elements at a time.
7112     if (VM_Version::supports_avx512_vbmi2()) {
7113       __ cmpl(totalNumIter, (AVX3Threshold/64));
7114       __ jcc(Assembler::less, ShiftTwo);
7115 
7116       if (AVX3Threshold < 16 * 64) {
7117         __ cmpl(totalNumIter, 16);
7118         __ jcc(Assembler::less, ShiftTwo);
7119       }
7120       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
7121       __ subl(numIterTmp, 16);
7122       __ BIND(Shift512Loop);
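           // Each iteration processes 16 dwords: funnel-shift adjacent source limbs left by shiftCount (VPSHLDVD) and store 16 result dwords.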
7123       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7124       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 0x4), Assembler::AVX_512bit);
7125       __ vpshldvd(x1, x2, x0, Assembler::AVX_512bit);
7126       __ evmovdqul(Address(newArr, idx, Address::times_4), x1, Assembler::AVX_512bit);
7127       __ addl(idx, 16);
7128       __ subl(numIterTmp, 16);
7129       __ jcc(Assembler::greaterEqual, Shift512Loop);
7130       __ addl(numIterTmp, 16);
7131     }
7132     __ BIND(ShiftTwo);
7133     __ cmpl(totalNumIter, 1);
7134     __ jcc(Assembler::less, Exit);
7135     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7136     __ subl(numIterTmp, 2);
7137     __ jcc(Assembler::less, ShiftOne);
7138 
7139     __ BIND(ShiftTwoLoop);
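         // Scalar path: shldl shifts each limb left by shiftCount (in cl), pulling in bits from the adjacent limb.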
7140     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7141     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 0x8));
7142     __ shldl(tmp3, tmp4);
7143     __ shldl(tmp4, tmp5);
7144     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7145     __ movl(Address(newArr, idx, Address::times_4, 0x4), tmp4);
7146     __ movl(tmp3, tmp5);
7147     __ addl(idx, 2);
7148     __ subl(numIterTmp, 2);
7149     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7150 
7151     // Do the last iteration
7152     __ BIND(ShiftOne);
7153     __ addl(numIterTmp, 2);
7154     __ cmpl(numIterTmp, 1);
7155     __ jcc(Assembler::less, Exit);
7156     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7157     __ shldl(tmp3, tmp4);
7158     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7159 
7160     __ BIND(Exit);
7161     __ vzeroupper();
7162     // Restore callee save registers.
7163     __ pop(tmp5);
7164 #ifdef _WINDOWS
7165     __ pop(tmp4);
7166     __ pop(tmp3);
7167     restore_arg_regs();
7168 #endif
7169     __ leave(); // required for proper stackwalking of RuntimeStub frame
7170     __ ret(0);
7171     return start;
7172   }
7173 
7174   address generate_libmExp() {
7175     StubCodeMark mark(this, "StubRoutines", "libmExp");
7176 
7177     address start = __ pc();
7178 
7179     const XMMRegister x0  = xmm0;
7180     const XMMRegister x1  = xmm1;
7181     const XMMRegister x2  = xmm2;
7182     const XMMRegister x3  = xmm3;
7183 
7184     const XMMRegister x4  = xmm4;
7185     const XMMRegister x5  = xmm5;
7186     const XMMRegister x6  = xmm6;
7187     const XMMRegister x7  = xmm7;
7188 
7189     const Register tmp   = r11;
7190 
7191     BLOCK_COMMENT("Entry:");
7192     __ enter(); // required for proper stackwalking of RuntimeStub frame
7193 
7194     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7195 
7196     __ leave(); // required for proper stackwalking of RuntimeStub frame
7197     __ ret(0);
7198 
7199     return start;
7200 
7201   }
7202 
7203   address generate_libmLog() {
7204     StubCodeMark mark(this, "StubRoutines", "libmLog");
7205 
7206     address start = __ pc();
7207 
7208     const XMMRegister x0 = xmm0;
7209     const XMMRegister x1 = xmm1;
7210     const XMMRegister x2 = xmm2;
7211     const XMMRegister x3 = xmm3;
7212 
7213     const XMMRegister x4 = xmm4;
7214     const XMMRegister x5 = xmm5;
7215     const XMMRegister x6 = xmm6;
7216     const XMMRegister x7 = xmm7;
7217 
7218     const Register tmp1 = r11;
7219     const Register tmp2 = r8;
7220 
7221     BLOCK_COMMENT("Entry:");
7222     __ enter(); // required for proper stackwalking of RuntimeStub frame
7223 
7224     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
7225 
7226     __ leave(); // required for proper stackwalking of RuntimeStub frame
7227     __ ret(0);
7228 
7229     return start;
7230 
7231   }
7232 
7233   address generate_libmLog10() {
7234     StubCodeMark mark(this, "StubRoutines", "libmLog10");
7235 
7236     address start = __ pc();
7237 
7238     const XMMRegister x0 = xmm0;
7239     const XMMRegister x1 = xmm1;
7240     const XMMRegister x2 = xmm2;
7241     const XMMRegister x3 = xmm3;
7242 
7243     const XMMRegister x4 = xmm4;
7244     const XMMRegister x5 = xmm5;
7245     const XMMRegister x6 = xmm6;
7246     const XMMRegister x7 = xmm7;
7247 
7248     const Register tmp = r11;
7249 
7250     BLOCK_COMMENT("Entry:");
7251     __ enter(); // required for proper stackwalking of RuntimeStub frame
7252 
7253     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7254 
7255     __ leave(); // required for proper stackwalking of RuntimeStub frame
7256     __ ret(0);
7257 
7258     return start;
7259 
7260   }
7261 
7262   address generate_libmPow() {
7263     StubCodeMark mark(this, "StubRoutines", "libmPow");
7264 
7265     address start = __ pc();
7266 
7267     const XMMRegister x0 = xmm0;
7268     const XMMRegister x1 = xmm1;
7269     const XMMRegister x2 = xmm2;
7270     const XMMRegister x3 = xmm3;
7271 
7272     const XMMRegister x4 = xmm4;
7273     const XMMRegister x5 = xmm5;
7274     const XMMRegister x6 = xmm6;
7275     const XMMRegister x7 = xmm7;
7276 
7277     const Register tmp1 = r8;
7278     const Register tmp2 = r9;
7279     const Register tmp3 = r10;
7280     const Register tmp4 = r11;
7281 
7282     BLOCK_COMMENT("Entry:");
7283     __ enter(); // required for proper stackwalking of RuntimeStub frame
7284 
7285     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7286 
7287     __ leave(); // required for proper stackwalking of RuntimeStub frame
7288     __ ret(0);
7289 
7290     return start;
7291 
7292   }
7293 
7294   address generate_libmSin() {
7295     StubCodeMark mark(this, "StubRoutines", "libmSin");
7296 
7297     address start = __ pc();
7298 
7299     const XMMRegister x0 = xmm0;
7300     const XMMRegister x1 = xmm1;
7301     const XMMRegister x2 = xmm2;
7302     const XMMRegister x3 = xmm3;
7303 
7304     const XMMRegister x4 = xmm4;
7305     const XMMRegister x5 = xmm5;
7306     const XMMRegister x6 = xmm6;
7307     const XMMRegister x7 = xmm7;
7308 
7309     const Register tmp1 = r8;
7310     const Register tmp2 = r9;
7311     const Register tmp3 = r10;
7312     const Register tmp4 = r11;
7313 
7314     BLOCK_COMMENT("Entry:");
7315     __ enter(); // required for proper stackwalking of RuntimeStub frame
7316 
7317 #ifdef _WIN64
7318     __ push(rsi);
7319     __ push(rdi);
7320 #endif
7321     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7322 
7323 #ifdef _WIN64
7324     __ pop(rdi);
7325     __ pop(rsi);
7326 #endif
7327 
7328     __ leave(); // required for proper stackwalking of RuntimeStub frame
7329     __ ret(0);
7330 
7331     return start;
7332 
7333   }
7334 
7335   address generate_libmCos() {
7336     StubCodeMark mark(this, "StubRoutines", "libmCos");
7337 
7338     address start = __ pc();
7339 
7340     const XMMRegister x0 = xmm0;
7341     const XMMRegister x1 = xmm1;
7342     const XMMRegister x2 = xmm2;
7343     const XMMRegister x3 = xmm3;
7344 
7345     const XMMRegister x4 = xmm4;
7346     const XMMRegister x5 = xmm5;
7347     const XMMRegister x6 = xmm6;
7348     const XMMRegister x7 = xmm7;
7349 
7350     const Register tmp1 = r8;
7351     const Register tmp2 = r9;
7352     const Register tmp3 = r10;
7353     const Register tmp4 = r11;
7354 
7355     BLOCK_COMMENT("Entry:");
7356     __ enter(); // required for proper stackwalking of RuntimeStub frame
7357 
7358 #ifdef _WIN64
7359     __ push(rsi);
7360     __ push(rdi);
7361 #endif
7362     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7363 
7364 #ifdef _WIN64
7365     __ pop(rdi);
7366     __ pop(rsi);
7367 #endif
7368 
7369     __ leave(); // required for proper stackwalking of RuntimeStub frame
7370     __ ret(0);
7371 
7372     return start;
7373 
7374   }
7375 
7376   address generate_libmTan() {
7377     StubCodeMark mark(this, "StubRoutines", "libmTan");
7378 
7379     address start = __ pc();
7380 
7381     const XMMRegister x0 = xmm0;
7382     const XMMRegister x1 = xmm1;
7383     const XMMRegister x2 = xmm2;
7384     const XMMRegister x3 = xmm3;
7385 
7386     const XMMRegister x4 = xmm4;
7387     const XMMRegister x5 = xmm5;
7388     const XMMRegister x6 = xmm6;
7389     const XMMRegister x7 = xmm7;
7390 
7391     const Register tmp1 = r8;
7392     const Register tmp2 = r9;
7393     const Register tmp3 = r10;
7394     const Register tmp4 = r11;
7395 
7396     BLOCK_COMMENT("Entry:");
7397     __ enter(); // required for proper stackwalking of RuntimeStub frame
7398 
7399 #ifdef _WIN64
7400     __ push(rsi);
7401     __ push(rdi);
7402 #endif
7403     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7404 
7405 #ifdef _WIN64
7406     __ pop(rdi);
7407     __ pop(rsi);
7408 #endif
7409 
7410     __ leave(); // required for proper stackwalking of RuntimeStub frame
7411     __ ret(0);
7412 
7413     return start;
7414 
7415   }
7416 
7417 #undef __
7418 #define __ masm->
7419 
7420   // Continuation point for throwing of implicit exceptions that are
7421   // not handled in the current activation. Fabricates an exception
7422   // oop and initiates normal exception dispatching in this
7423   // frame. Since we need to preserve callee-saved values (currently
7424   // only for C2, but done for C1 as well) we need a callee-saved oop
7425   // map and therefore have to make these stubs into RuntimeStubs
7426   // rather than BufferBlobs.  If the compiler needs all registers to
7427   // be preserved between the fault point and the exception handler
7428   // then it must assume responsibility for that in
7429   // AbstractCompiler::continuation_for_implicit_null_exception or
7430   // continuation_for_implicit_division_by_zero_exception. All other
7431   // implicit exceptions (e.g., NullPointerException or
7432   // AbstractMethodError on entry) are either at call sites or
7433   // otherwise assume that stack unwinding will be initiated, so
7434   // caller saved registers were assumed volatile in the compiler.
7435   address generate_throw_exception(const char* name,
7436                                    address runtime_entry,
7437                                    Register arg1 = noreg,
7438                                    Register arg2 = noreg) {
7439     // Information about frame layout at time of blocking runtime call.
7440     // Note that we only have to preserve callee-saved registers since
7441     // the compilers are responsible for supplying a continuation point
7442     // if they expect all registers to be preserved.
7443     enum layout {
7444       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
7445       rbp_off2,
7446       return_off,
7447       return_off2,
7448       framesize // inclusive of return address
7449     };
7450 
7451     int insts_size = 512;
7452     int locs_size  = 64;
7453 
7454     CodeBuffer code(name, insts_size, locs_size);
7455     OopMapSet* oop_maps  = new OopMapSet();
7456     MacroAssembler* masm = new MacroAssembler(&code);
7457 
7458     address start = __ pc();
7459 
7460     // This is an inlined and slightly modified version of call_VM
7461     // which has the ability to fetch the return PC out of
7462     // thread-local storage and also sets up last_Java_sp slightly
7463     // differently than the real call_VM
7464 
7465     __ enter(); // required for proper stackwalking of RuntimeStub frame
7466 
7467     assert(is_even(framesize/2), "sp not 16-byte aligned");
7468 
7469     // return address and rbp are already in place
7470     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
7471 
7472     int frame_complete = __ pc() - start;
7473 
7474     // Set up last_Java_sp and last_Java_fp
7475     address the_pc = __ pc();
7476     __ set_last_Java_frame(rsp, rbp, the_pc);
7477     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
7478 
7479     // Call runtime
7480     if (arg1 != noreg) {
7481       assert(arg2 != c_rarg1, "clobbered");
7482       __ movptr(c_rarg1, arg1);
7483     }
7484     if (arg2 != noreg) {
7485       __ movptr(c_rarg2, arg2);
7486     }
7487     __ movptr(c_rarg0, r15_thread);
7488     BLOCK_COMMENT("call runtime_entry");
7489     __ call(RuntimeAddress(runtime_entry));
7490 
7491     // Generate oop map
7492     OopMap* map = new OopMap(framesize, 0);
7493 
7494     oop_maps->add_gc_map(the_pc - start, map);
7495 
7496     __ reset_last_Java_frame(true);
7497 
7498     __ leave(); // required for proper stackwalking of RuntimeStub frame
7499 
7500     // check for pending exceptions
7501 #ifdef ASSERT
7502     Label L;
7503     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
7504             (int32_t) NULL_WORD);
7505     __ jcc(Assembler::notEqual, L);
7506     __ should_not_reach_here();
7507     __ bind(L);
7508 #endif // ASSERT
7509     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7510 
7511 
7512     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7513     RuntimeStub* stub =
7514       RuntimeStub::new_runtime_stub(name,
7515                                     &code,
7516                                     frame_complete,
7517                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7518                                     oop_maps, false);
7519     return stub->entry_point();
7520   }
7521 
7522   void create_control_words() {
7523     // Round to nearest, 64-bit mode, exceptions masked
7524     StubRoutines::x86::_mxcsr_std = 0x1F80;
7525   }
7526 
7527   // Initialization
7528   void generate_initial() {
7529     // Generates all stubs and initializes the entry points
7530 
7531     // These platform-specific settings are needed by generate_call_stub()
7532     create_control_words();
7533 
7534     // entry points that exist in all platforms. Note: This is code
7535     // that could be shared among different platforms - however the
7536     // benefit seems to be smaller than the disadvantage of having a
7537     // much more complicated generator structure. See also comment in
7538     // stubRoutines.hpp.
7539 
7540     StubRoutines::_forward_exception_entry = generate_forward_exception();
7541 
7542     StubRoutines::_call_stub_entry =
7543       generate_call_stub(StubRoutines::_call_stub_return_address);
7544 
7545     // is referenced by megamorphic call
7546     StubRoutines::_catch_exception_entry = generate_catch_exception();
7547 
7548     // atomic calls
7549     StubRoutines::_fence_entry                = generate_orderaccess_fence();
7550 
7551     // platform dependent
7552     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
7553 
7554     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
7555 
7556     StubRoutines::x86::_f2i_fixup             = generate_f2i_fixup();
7557     StubRoutines::x86::_f2l_fixup             = generate_f2l_fixup();
7558     StubRoutines::x86::_d2i_fixup             = generate_d2i_fixup();
7559     StubRoutines::x86::_d2l_fixup             = generate_d2l_fixup();
7560 
7561     StubRoutines::x86::_float_sign_mask       = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
7562     StubRoutines::x86::_float_sign_flip       = generate_fp_mask("float_sign_flip",  0x8000000080000000);
7563     StubRoutines::x86::_double_sign_mask      = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7564     StubRoutines::x86::_double_sign_flip      = generate_fp_mask("double_sign_flip", 0x8000000000000000);
7565 
7566     // Build this early so it's available for the interpreter.
7567     StubRoutines::_throw_StackOverflowError_entry =
7568       generate_throw_exception("StackOverflowError throw_exception",
7569                                CAST_FROM_FN_PTR(address,
7570                                                 SharedRuntime::
7571                                                 throw_StackOverflowError));
7572     StubRoutines::_throw_delayed_StackOverflowError_entry =
7573       generate_throw_exception("delayed StackOverflowError throw_exception",
7574                                CAST_FROM_FN_PTR(address,
7575                                                 SharedRuntime::
7576                                                 throw_delayed_StackOverflowError));
7577     if (UseCRC32Intrinsics) {
7578       // set table address before generating the stubs that use it
7579       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
7580       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7581     }
7582 
7583     if (UseCRC32CIntrinsics) {
7584       bool supports_clmul = VM_Version::supports_clmul();
7585       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
7586       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
7587       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
7588     }
7589 
7590     if (UseAdler32Intrinsics) {
7591        StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7592     }
7593 
7594     if (UseLibmIntrinsic && InlineIntrinsics) {
7595       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
7596           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
7597           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7598         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
7599         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
7600         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
7601         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
7602         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
7603         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
7604         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
7605         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
7606         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
7607         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
7608         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
7609         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
7610         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
7611         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
7612       }
7613       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
7614         StubRoutines::_dexp = generate_libmExp();
7615       }
7616       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7617         StubRoutines::_dlog = generate_libmLog();
7618       }
7619       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
7620         StubRoutines::_dlog10 = generate_libmLog10();
7621       }
7622       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
7623         StubRoutines::_dpow = generate_libmPow();
7624       }
7625       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7626         StubRoutines::_dsin = generate_libmSin();
7627       }
7628       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7629         StubRoutines::_dcos = generate_libmCos();
7630       }
7631       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7632         StubRoutines::_dtan = generate_libmTan();
7633       }
7634     }
7635   }
7636 
7637   void generate_all() {
7638     // Generates all stubs and initializes the entry points
7639 
7640     // These entry points require SharedInfo::stack0 to be set up in
7641     // non-core builds and need to be relocatable, so they each
7642     // fabricate a RuntimeStub internally.
7643     StubRoutines::_throw_AbstractMethodError_entry =
7644       generate_throw_exception("AbstractMethodError throw_exception",
7645                                CAST_FROM_FN_PTR(address,
7646                                                 SharedRuntime::
7647                                                 throw_AbstractMethodError));
7648 
7649     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7650       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7651                                CAST_FROM_FN_PTR(address,
7652                                                 SharedRuntime::
7653                                                 throw_IncompatibleClassChangeError));
7654 
7655     StubRoutines::_throw_NullPointerException_at_call_entry =
7656       generate_throw_exception("NullPointerException at call throw_exception",
7657                                CAST_FROM_FN_PTR(address,
7658                                                 SharedRuntime::
7659                                                 throw_NullPointerException_at_call));
7660 
7661     // entry points that are platform specific
7662     StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
7663     StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000);
7664     StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7665     StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000);
7666     StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF);
7667     StubRoutines::x86::_vector_int_mask_cmp_bits = generate_vector_mask("vector_int_mask_cmp_bits", 0x0000000100000001);
7668     StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
7669     StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
7670     StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff000000ff);
7671     StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff0000ffff);
7672     StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
7673                                                                         0xFFFFFFFF, 0, 0, 0);
7674     StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
7675                                                                         0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
7676     StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x0302010003020100);
7677     StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
7678     StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x0100010001000100);
7679     StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask("vector_long_shuffle_mask", 0x0000000100000000);
7680     StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
7681     StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
7682 
7683     if (UsePopCountInstruction && VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
7684       // LUT implementation influenced by the counting-1s algorithm from section 5-1 of Hacker's Delight.
7685       StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
7686     }
7687 
7688     // support for verify_oop (must happen after universe_init)
7689     if (VerifyOops) {
7690       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
7691     }
7692 
7693     // data cache line writeback
7694     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7695     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7696 
7697     // arraycopy stubs used by compilers
7698     generate_arraycopy_stubs();
7699 
7700     // don't bother generating these AES intrinsic stubs unless global flag is set
7701     if (UseAESIntrinsics) {
7702       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
7703       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7704       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7705       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7706       if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
7707         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
7708         StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt();
7709         StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt();
7710         StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
7711         StubRoutines::x86::_ghash_poly512_addr = ghash_polynomial512_addr();
7712         StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
7713         StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7714       } else {
7715         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
7716       }
7717     }
7718 
7719     if (UseAESCTRIntrinsics) {
7720       if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512bw() && VM_Version::supports_avx512vl()) {
7721         if (StubRoutines::x86::_counter_mask_addr == NULL) {
7722           StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
7723         }
7724         StubRoutines::_counterMode_AESCrypt = generate_counterMode_VectorAESCrypt();
7725       } else {
7726         StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
7727         StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
7728       }
7729     }
7730 
7731     if (UseMD5Intrinsics) {
7732       StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
7733       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
7734     }
7735     if (UseSHA1Intrinsics) {
7736       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
7737       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
7738       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
7739       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
7740     }
7741     if (UseSHA256Intrinsics) {
7742       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
7743       char* dst = (char*)StubRoutines::x86::_k256_W;
7744       char* src = (char*)StubRoutines::x86::_k256;
7745       for (int ii = 0; ii < 16; ++ii) {
7746         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
7747         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
7748       }
7749       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
7750       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
7751       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
7752       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
7753     }
7754     if (UseSHA512Intrinsics) {
7755       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
7756       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
7757       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
7758       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
7759     }
7760 
7761     // Generate GHASH intrinsics code
7762     if (UseGHASHIntrinsics) {
7763       if (StubRoutines::x86::_ghash_long_swap_mask_addr == NULL) {
7764         StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
7765       }
7766       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
7767       if (VM_Version::supports_avx()) {
7768         StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
7769         StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
7770         StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
7771       } else {
7772         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7773       }
7774     }
7775 
7776 
7777     if (UseBASE64Intrinsics) {
7778       if (VM_Version::supports_avx2() &&
7779          VM_Version::supports_avx512bw() &&
7780          VM_Version::supports_avx512vl()) {
7781         StubRoutines::x86::_avx2_shuffle_base64 = base64_avx2_shuffle_addr();
7782         StubRoutines::x86::_avx2_input_mask_base64 = base64_avx2_input_mask_addr();
7783         StubRoutines::x86::_avx2_lut_base64 = base64_avx2_lut_addr();
7784       }
7785       StubRoutines::x86::_encoding_table_base64 = base64_encoding_table_addr();
7786       if (VM_Version::supports_avx512_vbmi()) {
7787         StubRoutines::x86::_shuffle_base64 = base64_shuffle_addr();
7788         StubRoutines::x86::_lookup_lo_base64 = base64_vbmi_lookup_lo_addr();
7789         StubRoutines::x86::_lookup_hi_base64 = base64_vbmi_lookup_hi_addr();
7790         StubRoutines::x86::_lookup_lo_base64url = base64_vbmi_lookup_lo_url_addr();
7791         StubRoutines::x86::_lookup_hi_base64url = base64_vbmi_lookup_hi_url_addr();
7792         StubRoutines::x86::_pack_vec_base64 = base64_vbmi_pack_vec_addr();
7793         StubRoutines::x86::_join_0_1_base64 = base64_vbmi_join_0_1_addr();
7794         StubRoutines::x86::_join_1_2_base64 = base64_vbmi_join_1_2_addr();
7795         StubRoutines::x86::_join_2_3_base64 = base64_vbmi_join_2_3_addr();
7796       }
7797       StubRoutines::x86::_decoding_table_base64 = base64_decoding_table_addr();
7798       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7799       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7800     }
7801 
7802     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7803     if (bs_nm != NULL) {
7804       StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
7805     }
7806 #ifdef COMPILER2
7807     if (UseMultiplyToLenIntrinsic) {
7808       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7809     }
7810     if (UseSquareToLenIntrinsic) {
7811       StubRoutines::_squareToLen = generate_squareToLen();
7812     }
7813     if (UseMulAddIntrinsic) {
7814       StubRoutines::_mulAdd = generate_mulAdd();
7815     }
7816     if (VM_Version::supports_avx512_vbmi2()) {
7817       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7818       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
7819     }
7820     if (UseMontgomeryMultiplyIntrinsic) {
7821       StubRoutines::_montgomeryMultiply
7822         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
7823     }
7824     if (UseMontgomerySquareIntrinsic) {
7825       StubRoutines::_montgomerySquare
7826         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
7827     }
7828 
7829     // Get svml stub routine addresses
7830     void *libjsvml = NULL;
7831     char ebuf[1024];
7832     char dll_name[JVM_MAXPATHLEN];
7833     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "jsvml")) {
7834       libjsvml = os::dll_load(dll_name, ebuf, sizeof ebuf);
7835     }
7836     if (libjsvml != NULL) {
7837       // SVML method naming convention
7838       //   All the methods are named __jsvml_<op><T><N>_ha_<VV>
7839       //   where:
7840       //      ha stands for high accuracy
7841       //      <T> optionally indicates float or double:
7842       //              set to 'f' for a vector float operation
7843       //              omitted for a vector double operation
7844       //      <N> is the number of elements in the vector:
7845       //              1, 2, 4, 8, 16
7846       //              e.g. a 128-bit float vector has 4 float elements
7847       //      <VV> indicates the AVX/SSE level:
7848       //              z0 is AVX-512, l9 is AVX2, e9 is AVX1, and ex is SSE2
7849       //      e.g. __jsvml_expf16_ha_z0 computes a 16-element vector float exp using AVX-512 instructions
7850       //           __jsvml_exp8_ha_z0 computes an 8-element vector double exp using AVX-512 instructions
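           //      As a rough illustration (assuming "sin" appears in VectorSupport::svmlname[]),
           //      the 256-bit AVX2 (l9) lookups below would be built roughly like this:
           //
           //          snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf8_ha_%s", "sin", "l9"); // -> "__jsvml_sinf8_ha_l9" (8 floats)
           //          snprintf(ebuf, sizeof(ebuf), "__jsvml_%s4_ha_%s",  "sin", "l9"); // -> "__jsvml_sin4_ha_l9"  (4 doubles)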
7851 
7852       log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "jsvml" JNI_LIB_SUFFIX, p2i(libjsvml));
7853       if (UseAVX > 2) {
7854         for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
7855           int vop = VectorSupport::VECTOR_OP_SVML_START + op;
7856           if ((!VM_Version::supports_avx512dq()) &&
7857               (vop == VectorSupport::VECTOR_OP_LOG || vop == VectorSupport::VECTOR_OP_LOG10 || vop == VectorSupport::VECTOR_OP_POW)) {
7858             continue;
7859           }
7860           snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf16_ha_z0", VectorSupport::svmlname[op]);
7861           StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libjsvml, ebuf);
7862 
7863           snprintf(ebuf, sizeof(ebuf), "__jsvml_%s8_ha_z0", VectorSupport::svmlname[op]);
7864           StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libjsvml, ebuf);
7865         }
7866       }
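           // The AVX-512 (z0) entry points, if any, were resolved above; this loop fills the
           // 64/128/256-bit table slots using the SSE2/AVX1/AVX2 suffix selected just below.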
7867       const char* avx_sse_str = (UseAVX >= 2) ? "l9" : ((UseAVX == 1) ? "e9" : "ex");
7868       for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
7869         int vop = VectorSupport::VECTOR_OP_SVML_START + op;
7870         if (vop == VectorSupport::VECTOR_OP_POW) {
7871           continue;
7872         }
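             // The 64-bit float vector slot (2 float lanes) reuses the 4-element (f4) entry
             // point; presumably the library provides no dedicated 2-element float variant.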
7873         snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7874         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libjsvml, ebuf);
7875 
7876         snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7877         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libjsvml, ebuf);
7878 
7879         snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf8_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7880         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libjsvml, ebuf);
7881 
7882         snprintf(ebuf, sizeof(ebuf), "__jsvml_%s1_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7883         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libjsvml, ebuf);
7884 
7885         snprintf(ebuf, sizeof(ebuf), "__jsvml_%s2_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7886         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libjsvml, ebuf);
7887 
7888         snprintf(ebuf, sizeof(ebuf), "__jsvml_%s4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7889         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libjsvml, ebuf);
7890       }
7891     }
7892 #endif // COMPILER2
7893 
7894     if (UseVectorizedMismatchIntrinsic) {
7895       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
7896     }
7897   }
7898 
7899  public:
7900   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7901     if (all) {
7902       generate_all();
7903     } else {
7904       generate_initial();
7905     }
7906   }
7907 }; // end class declaration
7908 
7909 #define UCM_TABLE_MAX_ENTRIES 16
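     // StubGenerator_generate is invoked twice during VM startup: early with all == false to
     // generate the initial stubs, and again later with all == true for the remaining stubs
     // (driven by the two StubRoutines initialization phases). The UnsafeCopyMemory table is
     // created lazily on the first call.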
7910 void StubGenerator_generate(CodeBuffer* code, bool all) {
7911   if (UnsafeCopyMemory::_table == NULL) {
7912     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7913   }
7914   StubGenerator g(code, all);
7915 }
--- EOF ---