1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "ci/ciUtilities.hpp"
  29 #include "compiler/oopMap.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/barrierSetNMethod.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "interpreter/interpreter.hpp"
  35 #include "memory/universe.hpp"
  36 #include "nativeInst_x86.hpp"
  37 #include "oops/instanceOop.hpp"
  38 #include "oops/method.hpp"
  39 #include "oops/objArrayKlass.hpp"
  40 #include "oops/oop.inline.hpp"
  41 #include "prims/methodHandles.hpp"
  42 #include "runtime/arguments.hpp"
  43 #include "runtime/frame.inline.hpp"
  44 #include "runtime/handles.inline.hpp"
  45 #include "runtime/sharedRuntime.hpp"
  46 #include "runtime/stubCodeGenerator.hpp"
  47 #include "runtime/stubRoutines.hpp"
  48 #include "runtime/thread.inline.hpp"
  49 #ifdef COMPILER2
  50 #include "opto/runtime.hpp"
  51 #endif
  52 #if INCLUDE_JVMCI
  53 #include "jvmci/jvmci_globals.hpp"
  54 #endif
  55 #if INCLUDE_ZGC
  56 #include "gc/z/zThreadLocalData.hpp"
  57 #endif
  58 
  59 // Declaration and definition of StubGenerator (no .hpp file).
  60 // For a more detailed description of the stub routine structure
  61 // see the comment in stubRoutines.hpp
  62 
  63 #define __ _masm->
  64 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  65 #define a__ ((Assembler*)_masm)->
  66 
  67 #ifdef PRODUCT
  68 #define BLOCK_COMMENT(str) /* nothing */
  69 #else
  70 #define BLOCK_COMMENT(str) __ block_comment(str)
  71 #endif
  72 
  73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  74 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
  75 
  76 // Stub Code definitions
  77 
  78 class StubGenerator: public StubCodeGenerator {
  79  private:
  80 
  81 #ifdef PRODUCT
  82 #define inc_counter_np(counter) ((void)0)
  83 #else
  84   void inc_counter_np_(int& counter) {
  85     // This can destroy rscratch1 if counter is far from the code cache
  86     __ incrementl(ExternalAddress((address)&counter));
  87   }
  88 #define inc_counter_np(counter) \
  89   BLOCK_COMMENT("inc_counter " #counter); \
  90   inc_counter_np_(counter);
  91 #endif
  92 
  93   // Call stubs are used to call Java from C
  94   //
  95   // Linux Arguments:
  96   //    c_rarg0:   call wrapper address                   address
  97   //    c_rarg1:   result                                 address
  98   //    c_rarg2:   result type                            BasicType
  99   //    c_rarg3:   method                                 Method*
 100   //    c_rarg4:   (interpreter) entry point              address
 101   //    c_rarg5:   parameters                             intptr_t*
 102   //    16(rbp): parameter size (in words)              int
 103   //    24(rbp): thread                                 Thread*
 104   //
 105   //     [ return_from_Java     ] <--- rsp
 106   //     [ argument word n      ]
 107   //      ...
 108   // -12 [ argument word 1      ]
 109   // -11 [ saved r15            ] <--- rsp_after_call
 110   // -10 [ saved r14            ]
 111   //  -9 [ saved r13            ]
 112   //  -8 [ saved r12            ]
 113   //  -7 [ saved rbx            ]
 114   //  -6 [ call wrapper         ]
 115   //  -5 [ result               ]
 116   //  -4 [ result type          ]
 117   //  -3 [ method               ]
 118   //  -2 [ entry point          ]
 119   //  -1 [ parameters           ]
 120   //   0 [ saved rbp            ] <--- rbp
 121   //   1 [ return address       ]
 122   //   2 [ parameter size       ]
 123   //   3 [ thread               ]
 124   //
 125   // Windows Arguments:
 126   //    c_rarg0:   call wrapper address                   address
 127   //    c_rarg1:   result                                 address
 128   //    c_rarg2:   result type                            BasicType
 129   //    c_rarg3:   method                                 Method*
 130   //    48(rbp): (interpreter) entry point              address
 131   //    56(rbp): parameters                             intptr_t*
 132   //    64(rbp): parameter size (in words)              int
 133   //    72(rbp): thread                                 Thread*
 134   //
 135   //     [ return_from_Java     ] <--- rsp
 136   //     [ argument word n      ]
 137   //      ...
 138   // -60 [ argument word 1      ]
  139   // -59 [ saved xmm31          ] <--- rsp_after_call
 140   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 141   // -27 [ saved xmm15          ]
 142   //     [ saved xmm7-xmm14     ]
 143   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 144   //  -7 [ saved r15            ]
 145   //  -6 [ saved r14            ]
 146   //  -5 [ saved r13            ]
 147   //  -4 [ saved r12            ]
 148   //  -3 [ saved rdi            ]
 149   //  -2 [ saved rsi            ]
 150   //  -1 [ saved rbx            ]
 151   //   0 [ saved rbp            ] <--- rbp
 152   //   1 [ return address       ]
 153   //   2 [ call wrapper         ]
 154   //   3 [ result               ]
 155   //   4 [ result type          ]
 156   //   5 [ method               ]
 157   //   6 [ entry point          ]
 158   //   7 [ parameters           ]
 159   //   8 [ parameter size       ]
 160   //   9 [ thread               ]
 161   //
  162   //    Windows reserves the caller's stack space for arguments 1-4.
 163   //    We spill c_rarg0-c_rarg3 to this space.
 164 
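  // For illustration only: the VM reaches this stub through the CallStub
  // function pointer type declared in stubRoutines.hpp. Roughly, the call
  // site (a sketch of JavaCalls::call_helper; names abbreviated) looks like:
  //
  //   StubRoutines::call_stub()(
  //       (address)&link,               // call wrapper
  //       result_val_address,           // result
  //       result_type,                  // result BasicType
  //       method(),                     // Method*
  //       entry_point,                  // (interpreter) entry point
  //       args->parameters(),           // intptr_t* parameter words
  //       args->size_of_parameters(),   // parameter size in words
  //       thread);                      // current thread (via TRAPS/CHECK)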
 165   // Call stub stack layout word offsets from rbp
 166   enum call_stub_layout {
 167 #ifdef _WIN64
 168     xmm_save_first     = 6,  // save from xmm6
 169     xmm_save_last      = 31, // to xmm31
 170     xmm_save_base      = -9,
  171     rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59
 172     r15_off            = -7,
 173     r14_off            = -6,
 174     r13_off            = -5,
 175     r12_off            = -4,
 176     rdi_off            = -3,
 177     rsi_off            = -2,
 178     rbx_off            = -1,
 179     rbp_off            =  0,
 180     retaddr_off        =  1,
 181     call_wrapper_off   =  2,
 182     result_off         =  3,
 183     result_type_off    =  4,
 184     method_off         =  5,
 185     entry_point_off    =  6,
 186     parameters_off     =  7,
 187     parameter_size_off =  8,
 188     thread_off         =  9
 189 #else
 190     rsp_after_call_off = -12,
 191     mxcsr_off          = rsp_after_call_off,
 192     r15_off            = -11,
 193     r14_off            = -10,
 194     r13_off            = -9,
 195     r12_off            = -8,
 196     rbx_off            = -7,
 197     call_wrapper_off   = -6,
 198     result_off         = -5,
 199     result_type_off    = -4,
 200     method_off         = -3,
 201     entry_point_off    = -2,
 202     parameters_off     = -1,
 203     rbp_off            =  0,
 204     retaddr_off        =  1,
 205     parameter_size_off =  2,
 206     thread_off         =  3
 207 #endif
 208   };
 209 
 210 #ifdef _WIN64
 211   Address xmm_save(int reg) {
 212     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 213     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 214   }
 215 #endif
 216 
 217   address generate_call_stub(address& return_address) {
 218     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 219            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 220            "adjust this code");
 221     StubCodeMark mark(this, "StubRoutines", "call_stub");
 222     address start = __ pc();
 223 
 224     // same as in generate_catch_exception()!
 225     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 226 
 227     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 228     const Address result        (rbp, result_off         * wordSize);
 229     const Address result_type   (rbp, result_type_off    * wordSize);
 230     const Address method        (rbp, method_off         * wordSize);
 231     const Address entry_point   (rbp, entry_point_off    * wordSize);
 232     const Address parameters    (rbp, parameters_off     * wordSize);
 233     const Address parameter_size(rbp, parameter_size_off * wordSize);
 234 
 235     // same as in generate_catch_exception()!
 236     const Address thread        (rbp, thread_off         * wordSize);
 237 
 238     const Address r15_save(rbp, r15_off * wordSize);
 239     const Address r14_save(rbp, r14_off * wordSize);
 240     const Address r13_save(rbp, r13_off * wordSize);
 241     const Address r12_save(rbp, r12_off * wordSize);
 242     const Address rbx_save(rbp, rbx_off * wordSize);
 243 
 244     // stub code
 245     __ enter();
 246     __ subptr(rsp, -rsp_after_call_off * wordSize);
 247 
 248     // save register parameters
 249 #ifndef _WIN64
 250     __ movptr(parameters,   c_rarg5); // parameters
 251     __ movptr(entry_point,  c_rarg4); // entry_point
 252 #endif
 253 
 254     __ movptr(method,       c_rarg3); // method
 255     __ movl(result_type,  c_rarg2);   // result type
 256     __ movptr(result,       c_rarg1); // result
 257     __ movptr(call_wrapper, c_rarg0); // call wrapper
 258 
 259     // save regs belonging to calling function
 260     __ movptr(rbx_save, rbx);
 261     __ movptr(r12_save, r12);
 262     __ movptr(r13_save, r13);
 263     __ movptr(r14_save, r14);
 264     __ movptr(r15_save, r15);
 265 
 266 #ifdef _WIN64
 267     int last_reg = 15;
 268     if (UseAVX > 2) {
 269       last_reg = 31;
 270     }
 271     if (VM_Version::supports_evex()) {
 272       for (int i = xmm_save_first; i <= last_reg; i++) {
 273         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 274       }
 275     } else {
 276       for (int i = xmm_save_first; i <= last_reg; i++) {
 277         __ movdqu(xmm_save(i), as_XMMRegister(i));
 278       }
 279     }
 280 
 281     const Address rdi_save(rbp, rdi_off * wordSize);
 282     const Address rsi_save(rbp, rsi_off * wordSize);
 283 
 284     __ movptr(rsi_save, rsi);
 285     __ movptr(rdi_save, rdi);
 286 #else
 287     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 288     {
 289       Label skip_ldmx;
 290       __ stmxcsr(mxcsr_save);
 291       __ movl(rax, mxcsr_save);
 292       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 293       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 294       __ cmp32(rax, mxcsr_std);
 295       __ jcc(Assembler::equal, skip_ldmx);
 296       __ ldmxcsr(mxcsr_std);
 297       __ bind(skip_ldmx);
 298     }
 299 #endif
 300 
 301     // Load up thread register
 302     __ movptr(r15_thread, thread);
 303     __ reinit_heapbase();
 304 
 305 #ifdef ASSERT
 306     // make sure we have no pending exceptions
 307     {
 308       Label L;
 309       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 310       __ jcc(Assembler::equal, L);
 311       __ stop("StubRoutines::call_stub: entered with pending exception");
 312       __ bind(L);
 313     }
 314 #endif
 315 
 316     // pass parameters if any
 317     BLOCK_COMMENT("pass parameters if any");
 318     Label parameters_done;
 319     __ movl(c_rarg3, parameter_size);
 320     __ testl(c_rarg3, c_rarg3);
 321     __ jcc(Assembler::zero, parameters_done);
 322 
 323     Label loop;
 324     __ movptr(c_rarg2, parameters);       // parameter pointer
 325     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 326     __ BIND(loop);
 327     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 328     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 329     __ decrementl(c_rarg1);             // decrement counter
 330     __ push(rax);                       // pass parameter
 331     __ jcc(Assembler::notZero, loop);
 332 
 333     // call Java function
 334     __ BIND(parameters_done);
 335     __ movptr(rbx, method);             // get Method*
 336     __ movptr(c_rarg1, entry_point);    // get entry_point
 337     __ mov(r13, rsp);                   // set sender sp
 338     BLOCK_COMMENT("call Java function");
 339     __ call(c_rarg1);
 340 
 341     BLOCK_COMMENT("call_stub_return_address:");
 342     return_address = __ pc();
 343 
 344     // store result depending on type (everything that is not
 345     // T_OBJECT, T_INLINE_TYPE, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 346     __ movptr(r13, result);
 347     Label is_long, is_float, is_double, is_value, exit;
 348     __ movl(rbx, result_type);
 349     __ cmpl(rbx, T_OBJECT);
 350     __ jcc(Assembler::equal, is_long);
 351     __ cmpl(rbx, T_INLINE_TYPE);
 352     __ jcc(Assembler::equal, is_value);
 353     __ cmpl(rbx, T_LONG);
 354     __ jcc(Assembler::equal, is_long);
 355     __ cmpl(rbx, T_FLOAT);
 356     __ jcc(Assembler::equal, is_float);
 357     __ cmpl(rbx, T_DOUBLE);
 358     __ jcc(Assembler::equal, is_double);
 359 
 360     // handle T_INT case
 361     __ movl(Address(r13, 0), rax);
 362 
 363     __ BIND(exit);
 364 
 365     // pop parameters
 366     __ lea(rsp, rsp_after_call);
 367 
 368 #ifdef ASSERT
 369     // verify that threads correspond
 370     {
  371       Label L1, L2, L3;
 372       __ cmpptr(r15_thread, thread);
 373       __ jcc(Assembler::equal, L1);
 374       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 375       __ bind(L1);
 376       __ get_thread(rbx);
 377       __ cmpptr(r15_thread, thread);
 378       __ jcc(Assembler::equal, L2);
 379       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 380       __ bind(L2);
 381       __ cmpptr(r15_thread, rbx);
 382       __ jcc(Assembler::equal, L3);
 383       __ stop("StubRoutines::call_stub: threads must correspond");
 384       __ bind(L3);
 385     }
 386 #endif
 387 
 388     // restore regs belonging to calling function
 389 #ifdef _WIN64
 390     // emit the restores for xmm regs
 391     if (VM_Version::supports_evex()) {
 392       for (int i = xmm_save_first; i <= last_reg; i++) {
 393         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 394       }
 395     } else {
 396       for (int i = xmm_save_first; i <= last_reg; i++) {
 397         __ movdqu(as_XMMRegister(i), xmm_save(i));
 398       }
 399     }
 400 #endif
 401     __ movptr(r15, r15_save);
 402     __ movptr(r14, r14_save);
 403     __ movptr(r13, r13_save);
 404     __ movptr(r12, r12_save);
 405     __ movptr(rbx, rbx_save);
 406 
 407 #ifdef _WIN64
 408     __ movptr(rdi, rdi_save);
 409     __ movptr(rsi, rsi_save);
 410 #else
 411     __ ldmxcsr(mxcsr_save);
 412 #endif
 413 
 414     // restore rsp
 415     __ addptr(rsp, -rsp_after_call_off * wordSize);
 416 
 417     // return
 418     __ vzeroupper();
 419     __ pop(rbp);
 420     __ ret(0);
 421 
 422     // handle return types different from T_INT
 423     __ BIND(is_value);
 424     if (InlineTypeReturnedAsFields) {
 425       // Check for flattened return value
 426       __ testptr(rax, 1);
 427       __ jcc(Assembler::zero, is_long);
 428       // Load pack handler address
 429       __ andptr(rax, -2);
 430       __ movptr(rax, Address(rax, InstanceKlass::adr_inlineklass_fixed_block_offset()));
 431       __ movptr(rbx, Address(rax, InlineKlass::pack_handler_jobject_offset()));
 432       // Call pack handler to initialize the buffer
 433       __ call(rbx);
 434       __ jmp(exit);
 435     }
 436     __ BIND(is_long);
 437     __ movq(Address(r13, 0), rax);
 438     __ jmp(exit);
 439 
 440     __ BIND(is_float);
 441     __ movflt(Address(r13, 0), xmm0);
 442     __ jmp(exit);
 443 
 444     __ BIND(is_double);
 445     __ movdbl(Address(r13, 0), xmm0);
 446     __ jmp(exit);
 447 
 448     return start;
 449   }
 450 
 451   // Return point for a Java call if there's an exception thrown in
 452   // Java code.  The exception is caught and transformed into a
 453   // pending exception stored in JavaThread that can be tested from
 454   // within the VM.
 455   //
 456   // Note: Usually the parameters are removed by the callee. In case
 457   // of an exception crossing an activation frame boundary, that is
 458   // not the case if the callee is compiled code => need to setup the
 459   // rsp.
 460   //
 461   // rax: exception oop
 462 
 463   address generate_catch_exception() {
 464     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 465     address start = __ pc();
 466 
 467     // same as in generate_call_stub():
 468     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 469     const Address thread        (rbp, thread_off         * wordSize);
 470 
 471 #ifdef ASSERT
 472     // verify that threads correspond
 473     {
 474       Label L1, L2, L3;
 475       __ cmpptr(r15_thread, thread);
 476       __ jcc(Assembler::equal, L1);
 477       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 478       __ bind(L1);
 479       __ get_thread(rbx);
 480       __ cmpptr(r15_thread, thread);
 481       __ jcc(Assembler::equal, L2);
 482       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 483       __ bind(L2);
 484       __ cmpptr(r15_thread, rbx);
 485       __ jcc(Assembler::equal, L3);
 486       __ stop("StubRoutines::catch_exception: threads must correspond");
 487       __ bind(L3);
 488     }
 489 #endif
 490 
 491     // set pending exception
 492     __ verify_oop(rax);
 493 
 494     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 495     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 496     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 497     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 498 
 499     // complete return to VM
 500     assert(StubRoutines::_call_stub_return_address != NULL,
 501            "_call_stub_return_address must have been generated before");
 502     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 503 
 504     return start;
 505   }
 506 
 507   // Continuation point for runtime calls returning with a pending
 508   // exception.  The pending exception check happened in the runtime
 509   // or native call stub.  The pending exception in Thread is
 510   // converted into a Java-level exception.
 511   //
 512   // Contract with Java-level exception handlers:
 513   // rax: exception
 514   // rdx: throwing pc
 515   //
 516   // NOTE: At entry of this stub, exception-pc must be on stack !!
 517 
 518   address generate_forward_exception() {
 519     StubCodeMark mark(this, "StubRoutines", "forward exception");
 520     address start = __ pc();
 521 
 522     // Upon entry, the sp points to the return address returning into
 523     // Java (interpreted or compiled) code; i.e., the return address
 524     // becomes the throwing pc.
 525     //
 526     // Arguments pushed before the runtime call are still on the stack
 527     // but the exception handler will reset the stack pointer ->
 528     // ignore them.  A potential result in registers can be ignored as
 529     // well.
 530 
 531 #ifdef ASSERT
 532     // make sure this code is only executed if there is a pending exception
 533     {
 534       Label L;
  535       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 536       __ jcc(Assembler::notEqual, L);
 537       __ stop("StubRoutines::forward exception: no pending exception (1)");
 538       __ bind(L);
 539     }
 540 #endif
 541 
 542     // compute exception handler into rbx
 543     __ movptr(c_rarg0, Address(rsp, 0));
 544     BLOCK_COMMENT("call exception_handler_for_return_address");
 545     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 546                          SharedRuntime::exception_handler_for_return_address),
 547                     r15_thread, c_rarg0);
 548     __ mov(rbx, rax);
 549 
 550     // setup rax & rdx, remove return address & clear pending exception
 551     __ pop(rdx);
 552     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 553     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 554 
 555 #ifdef ASSERT
 556     // make sure exception is set
 557     {
 558       Label L;
 559       __ testptr(rax, rax);
 560       __ jcc(Assembler::notEqual, L);
 561       __ stop("StubRoutines::forward exception: no pending exception (2)");
 562       __ bind(L);
 563     }
 564 #endif
 565 
 566     // continue at exception handler (return address removed)
 567     // rax: exception
 568     // rbx: exception handler
 569     // rdx: throwing pc
 570     __ verify_oop(rax);
 571     __ jmp(rbx);
 572 
 573     return start;
 574   }
 575 
  576   // Support for intptr_t OrderAccess::fence()
  577   //
  578   // Arguments: none
  579   //
  580   // Result: none
 581   address generate_orderaccess_fence() {
 582     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 583     address start = __ pc();
 584     __ membar(Assembler::StoreLoad);
 585     __ ret(0);
 586 
 587     return start;
 588   }
 589 
 590 
 591   // Support for intptr_t get_previous_sp()
 592   //
 593   // This routine is used to find the previous stack pointer for the
 594   // caller.
 595   address generate_get_previous_sp() {
 596     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
 597     address start = __ pc();
 598 
 599     __ movptr(rax, rsp);
 600     __ addptr(rax, 8); // return address is at the top of the stack.
 601     __ ret(0);
 602 
 603     return start;
 604   }
 605 
 606   //----------------------------------------------------------------------------------------------------
 607   // Support for void verify_mxcsr()
 608   //
 609   // This routine is used with -Xcheck:jni to verify that native
 610   // JNI code does not return to Java code without restoring the
 611   // MXCSR register to our expected state.
 612 
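  // For illustration: the check below is only armed when CheckJNICalls is on,
  // which -Xcheck:jni enables, e.g. (MyApp is a placeholder):
  //
  //   java -Xcheck:jni MyApp
  //   java -Xcheck:jni -XX:+RestoreMXCSROnJNICall MyApp
  //
  // Without -Xcheck:jni the stub degenerates to a plain ret.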
 613   address generate_verify_mxcsr() {
 614     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 615     address start = __ pc();
 616 
 617     const Address mxcsr_save(rsp, 0);
 618 
 619     if (CheckJNICalls) {
 620       Label ok_ret;
 621       ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
 622       __ push(rax);
 623       __ subptr(rsp, wordSize);      // allocate a temp location
 624       __ stmxcsr(mxcsr_save);
 625       __ movl(rax, mxcsr_save);
 626       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 627       __ cmp32(rax, mxcsr_std);
 628       __ jcc(Assembler::equal, ok_ret);
 629 
 630       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 631 
 632       __ ldmxcsr(mxcsr_std);
 633 
 634       __ bind(ok_ret);
 635       __ addptr(rsp, wordSize);
 636       __ pop(rax);
 637     }
 638 
 639     __ ret(0);
 640 
 641     return start;
 642   }
 643 
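  // The four fixup stubs below (f2i/f2l/d2i/d2l) act as slow paths for the
  // SSE cvttss2si/cvttsd2si conversions when those produce the "integer
  // indefinite" value, and supply the Java narrowing semantics for NaN and
  // out-of-range inputs. Roughly, for the float->int case (a sketch with a
  // hypothetical helper name; the l/d variants clamp to jlong / use the
  // double bounds analogously):
  //
  //   static jint f2i_semantics(jfloat f) {
  //     if (f != f)                 return 0;         // NaN -> 0
  //     if (f >= (jfloat)max_jint)  return max_jint;  // too large -> max_jint
  //     if (f <= (jfloat)min_jint)  return min_jint;  // too small -> min_jint
  //     return (jint)f;                               // in range: truncate
  //   }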
 644   address generate_f2i_fixup() {
 645     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
 646     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 647 
 648     address start = __ pc();
 649 
 650     Label L;
 651 
 652     __ push(rax);
 653     __ push(c_rarg3);
 654     __ push(c_rarg2);
 655     __ push(c_rarg1);
 656 
 657     __ movl(rax, 0x7f800000);
 658     __ xorl(c_rarg3, c_rarg3);
 659     __ movl(c_rarg2, inout);
 660     __ movl(c_rarg1, c_rarg2);
 661     __ andl(c_rarg1, 0x7fffffff);
 662     __ cmpl(rax, c_rarg1); // NaN? -> 0
 663     __ jcc(Assembler::negative, L);
 664     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
 665     __ movl(c_rarg3, 0x80000000);
 666     __ movl(rax, 0x7fffffff);
 667     __ cmovl(Assembler::positive, c_rarg3, rax);
 668 
 669     __ bind(L);
 670     __ movptr(inout, c_rarg3);
 671 
 672     __ pop(c_rarg1);
 673     __ pop(c_rarg2);
 674     __ pop(c_rarg3);
 675     __ pop(rax);
 676 
 677     __ ret(0);
 678 
 679     return start;
 680   }
 681 
 682   address generate_f2l_fixup() {
 683     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
 684     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 685     address start = __ pc();
 686 
 687     Label L;
 688 
 689     __ push(rax);
 690     __ push(c_rarg3);
 691     __ push(c_rarg2);
 692     __ push(c_rarg1);
 693 
 694     __ movl(rax, 0x7f800000);
 695     __ xorl(c_rarg3, c_rarg3);
 696     __ movl(c_rarg2, inout);
 697     __ movl(c_rarg1, c_rarg2);
 698     __ andl(c_rarg1, 0x7fffffff);
 699     __ cmpl(rax, c_rarg1); // NaN? -> 0
 700     __ jcc(Assembler::negative, L);
 701     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
 702     __ mov64(c_rarg3, 0x8000000000000000);
 703     __ mov64(rax, 0x7fffffffffffffff);
 704     __ cmov(Assembler::positive, c_rarg3, rax);
 705 
 706     __ bind(L);
 707     __ movptr(inout, c_rarg3);
 708 
 709     __ pop(c_rarg1);
 710     __ pop(c_rarg2);
 711     __ pop(c_rarg3);
 712     __ pop(rax);
 713 
 714     __ ret(0);
 715 
 716     return start;
 717   }
 718 
 719   address generate_d2i_fixup() {
 720     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
 721     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 722 
 723     address start = __ pc();
 724 
 725     Label L;
 726 
 727     __ push(rax);
 728     __ push(c_rarg3);
 729     __ push(c_rarg2);
 730     __ push(c_rarg1);
 731     __ push(c_rarg0);
 732 
 733     __ movl(rax, 0x7ff00000);
 734     __ movq(c_rarg2, inout);
 735     __ movl(c_rarg3, c_rarg2);
 736     __ mov(c_rarg1, c_rarg2);
 737     __ mov(c_rarg0, c_rarg2);
 738     __ negl(c_rarg3);
 739     __ shrptr(c_rarg1, 0x20);
 740     __ orl(c_rarg3, c_rarg2);
 741     __ andl(c_rarg1, 0x7fffffff);
 742     __ xorl(c_rarg2, c_rarg2);
 743     __ shrl(c_rarg3, 0x1f);
 744     __ orl(c_rarg1, c_rarg3);
 745     __ cmpl(rax, c_rarg1);
 746     __ jcc(Assembler::negative, L); // NaN -> 0
 747     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
 748     __ movl(c_rarg2, 0x80000000);
 749     __ movl(rax, 0x7fffffff);
 750     __ cmov(Assembler::positive, c_rarg2, rax);
 751 
 752     __ bind(L);
 753     __ movptr(inout, c_rarg2);
 754 
 755     __ pop(c_rarg0);
 756     __ pop(c_rarg1);
 757     __ pop(c_rarg2);
 758     __ pop(c_rarg3);
 759     __ pop(rax);
 760 
 761     __ ret(0);
 762 
 763     return start;
 764   }
 765 
 766   address generate_d2l_fixup() {
 767     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
 768     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 769 
 770     address start = __ pc();
 771 
 772     Label L;
 773 
 774     __ push(rax);
 775     __ push(c_rarg3);
 776     __ push(c_rarg2);
 777     __ push(c_rarg1);
 778     __ push(c_rarg0);
 779 
 780     __ movl(rax, 0x7ff00000);
 781     __ movq(c_rarg2, inout);
 782     __ movl(c_rarg3, c_rarg2);
 783     __ mov(c_rarg1, c_rarg2);
 784     __ mov(c_rarg0, c_rarg2);
 785     __ negl(c_rarg3);
 786     __ shrptr(c_rarg1, 0x20);
 787     __ orl(c_rarg3, c_rarg2);
 788     __ andl(c_rarg1, 0x7fffffff);
 789     __ xorl(c_rarg2, c_rarg2);
 790     __ shrl(c_rarg3, 0x1f);
 791     __ orl(c_rarg1, c_rarg3);
 792     __ cmpl(rax, c_rarg1);
 793     __ jcc(Assembler::negative, L); // NaN -> 0
 794     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
 795     __ mov64(c_rarg2, 0x8000000000000000);
 796     __ mov64(rax, 0x7fffffffffffffff);
 797     __ cmovq(Assembler::positive, c_rarg2, rax);
 798 
 799     __ bind(L);
 800     __ movq(inout, c_rarg2);
 801 
 802     __ pop(c_rarg0);
 803     __ pop(c_rarg1);
 804     __ pop(c_rarg2);
 805     __ pop(c_rarg3);
 806     __ pop(rax);
 807 
 808     __ ret(0);
 809 
 810     return start;
 811   }
 812 
 813   address generate_iota_indices(const char *stub_name) {
 814     __ align(CodeEntryAlignment);
 815     StubCodeMark mark(this, "StubRoutines", stub_name);
 816     address start = __ pc();
 817     __ emit_data64(0x0706050403020100, relocInfo::none);
 818     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 819     __ emit_data64(0x1716151413121110, relocInfo::none);
 820     __ emit_data64(0x1F1E1D1C1B1A1918, relocInfo::none);
 821     __ emit_data64(0x2726252423222120, relocInfo::none);
 822     __ emit_data64(0x2F2E2D2C2B2A2928, relocInfo::none);
 823     __ emit_data64(0x3736353433323130, relocInfo::none);
 824     __ emit_data64(0x3F3E3D3C3B3A3938, relocInfo::none);
 825     return start;
 826   }
 827 
 828   address generate_vector_byte_shuffle_mask(const char *stub_name) {
 829     __ align(CodeEntryAlignment);
 830     StubCodeMark mark(this, "StubRoutines", stub_name);
 831     address start = __ pc();
 832     __ emit_data64(0x7070707070707070, relocInfo::none);
 833     __ emit_data64(0x7070707070707070, relocInfo::none);
 834     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 835     __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
 836     return start;
 837   }
 838 
 839   address generate_fp_mask(const char *stub_name, int64_t mask) {
 840     __ align(CodeEntryAlignment);
 841     StubCodeMark mark(this, "StubRoutines", stub_name);
 842     address start = __ pc();
 843 
 844     __ emit_data64( mask, relocInfo::none );
 845     __ emit_data64( mask, relocInfo::none );
 846 
 847     return start;
 848   }
 849 
 850   address generate_vector_mask(const char *stub_name, int64_t mask) {
 851     __ align(CodeEntryAlignment);
 852     StubCodeMark mark(this, "StubRoutines", stub_name);
 853     address start = __ pc();
 854 
 855     __ emit_data64(mask, relocInfo::none);
 856     __ emit_data64(mask, relocInfo::none);
 857     __ emit_data64(mask, relocInfo::none);
 858     __ emit_data64(mask, relocInfo::none);
 859     __ emit_data64(mask, relocInfo::none);
 860     __ emit_data64(mask, relocInfo::none);
 861     __ emit_data64(mask, relocInfo::none);
 862     __ emit_data64(mask, relocInfo::none);
 863 
 864     return start;
 865   }
 866 
 867   address generate_vector_byte_perm_mask(const char *stub_name) {
 868     __ align(CodeEntryAlignment);
 869     StubCodeMark mark(this, "StubRoutines", stub_name);
 870     address start = __ pc();
 871 
 872     __ emit_data64(0x0000000000000001, relocInfo::none);
 873     __ emit_data64(0x0000000000000003, relocInfo::none);
 874     __ emit_data64(0x0000000000000005, relocInfo::none);
 875     __ emit_data64(0x0000000000000007, relocInfo::none);
 876     __ emit_data64(0x0000000000000000, relocInfo::none);
 877     __ emit_data64(0x0000000000000002, relocInfo::none);
 878     __ emit_data64(0x0000000000000004, relocInfo::none);
 879     __ emit_data64(0x0000000000000006, relocInfo::none);
 880 
 881     return start;
 882   }
 883 
 884   address generate_vector_fp_mask(const char *stub_name, int64_t mask) {
 885     __ align(CodeEntryAlignment);
 886     StubCodeMark mark(this, "StubRoutines", stub_name);
 887     address start = __ pc();
 888 
 889     __ emit_data64(mask, relocInfo::none);
 890     __ emit_data64(mask, relocInfo::none);
 891     __ emit_data64(mask, relocInfo::none);
 892     __ emit_data64(mask, relocInfo::none);
 893     __ emit_data64(mask, relocInfo::none);
 894     __ emit_data64(mask, relocInfo::none);
 895     __ emit_data64(mask, relocInfo::none);
 896     __ emit_data64(mask, relocInfo::none);
 897 
 898     return start;
 899   }
 900 
 901   address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
 902                                      int32_t val0, int32_t val1, int32_t val2, int32_t val3,
 903                                      int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
 904                                      int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
 905                                      int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
 906     __ align(CodeEntryAlignment);
 907     StubCodeMark mark(this, "StubRoutines", stub_name);
 908     address start = __ pc();
 909 
 910     assert(len != Assembler::AVX_NoVec, "vector len must be specified");
 911     __ emit_data(val0, relocInfo::none, 0);
 912     __ emit_data(val1, relocInfo::none, 0);
 913     __ emit_data(val2, relocInfo::none, 0);
 914     __ emit_data(val3, relocInfo::none, 0);
 915     if (len >= Assembler::AVX_256bit) {
 916       __ emit_data(val4, relocInfo::none, 0);
 917       __ emit_data(val5, relocInfo::none, 0);
 918       __ emit_data(val6, relocInfo::none, 0);
 919       __ emit_data(val7, relocInfo::none, 0);
 920       if (len >= Assembler::AVX_512bit) {
 921         __ emit_data(val8, relocInfo::none, 0);
 922         __ emit_data(val9, relocInfo::none, 0);
 923         __ emit_data(val10, relocInfo::none, 0);
 924         __ emit_data(val11, relocInfo::none, 0);
 925         __ emit_data(val12, relocInfo::none, 0);
 926         __ emit_data(val13, relocInfo::none, 0);
 927         __ emit_data(val14, relocInfo::none, 0);
 928         __ emit_data(val15, relocInfo::none, 0);
 929       }
 930     }
 931 
 932     return start;
 933   }
 934 
 935   // Non-destructive plausibility checks for oops
 936   //
 937   // Arguments:
 938   //    all args on stack!
 939   //
 940   // Stack after saving c_rarg3:
 941   //    [tos + 0]: saved c_rarg3
 942   //    [tos + 1]: saved c_rarg2
 943   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
 944   //    [tos + 3]: saved flags
 945   //    [tos + 4]: return address
 946   //  * [tos + 5]: error message (char*)
 947   //  * [tos + 6]: object to verify (oop)
 948   //  * [tos + 7]: saved rax - saved by caller and bashed
 949   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
 950   //  * = popped on exit
 951   address generate_verify_oop() {
 952     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 953     address start = __ pc();
 954 
 955     Label exit, error;
 956 
 957     __ pushf();
 958     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 959 
 960     __ push(r12);
 961 
 962     // save c_rarg2 and c_rarg3
 963     __ push(c_rarg2);
 964     __ push(c_rarg3);
 965 
 966     enum {
 967            // After previous pushes.
 968            oop_to_verify = 6 * wordSize,
 969            saved_rax     = 7 * wordSize,
 970            saved_r10     = 8 * wordSize,
 971 
 972            // Before the call to MacroAssembler::debug(), see below.
 973            return_addr   = 16 * wordSize,
 974            error_msg     = 17 * wordSize
 975     };
 976 
 977     // get object
 978     __ movptr(rax, Address(rsp, oop_to_verify));
 979 
 980     // make sure object is 'reasonable'
 981     __ testptr(rax, rax);
 982     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
 983 
 984 #if INCLUDE_ZGC
 985     if (UseZGC) {
 986       // Check if metadata bits indicate a bad oop
 987       __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
 988       __ jcc(Assembler::notZero, error);
 989     }
 990 #endif
 991 
 992     // Check if the oop is in the right area of memory
 993     __ movptr(c_rarg2, rax);
 994     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 995     __ andptr(c_rarg2, c_rarg3);
 996     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 997     __ cmpptr(c_rarg2, c_rarg3);
 998     __ jcc(Assembler::notZero, error);
 999 
1000     // make sure klass is 'reasonable', which is not zero.
1001     __ load_klass(rax, rax, rscratch1);  // get klass
1002     __ testptr(rax, rax);
1003     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1004 
1005     // return if everything seems ok
1006     __ bind(exit);
1007     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1008     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1009     __ pop(c_rarg3);                             // restore c_rarg3
1010     __ pop(c_rarg2);                             // restore c_rarg2
1011     __ pop(r12);                                 // restore r12
1012     __ popf();                                   // restore flags
1013     __ ret(4 * wordSize);                        // pop caller saved stuff
1014 
1015     // handle errors
1016     __ bind(error);
1017     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1018     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1019     __ pop(c_rarg3);                             // get saved c_rarg3 back
1020     __ pop(c_rarg2);                             // get saved c_rarg2 back
1021     __ pop(r12);                                 // get saved r12 back
1022     __ popf();                                   // get saved flags off stack --
1023                                                  // will be ignored
1024 
1025     __ pusha();                                  // push registers
 1026                                                  // (rip is
 1027                                                  // already pushed)
1028     // debug(char* msg, int64_t pc, int64_t regs[])
1029     // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
1030     // pushed all the registers, so now the stack looks like:
1031     //     [tos +  0] 16 saved registers
1032     //     [tos + 16] return address
1033     //   * [tos + 17] error message (char*)
1034     //   * [tos + 18] object to verify (oop)
1035     //   * [tos + 19] saved rax - saved by caller and bashed
1036     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1037     //   * = popped on exit
1038 
1039     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1040     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1041     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1042     __ mov(r12, rsp);                               // remember rsp
1043     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1044     __ andptr(rsp, -16);                            // align stack as required by ABI
1045     BLOCK_COMMENT("call MacroAssembler::debug");
1046     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1047     __ hlt();
1048     return start;
1049   }
1050 
1051   //
1052   // Verify that a register contains clean 32-bits positive value
1053   // (high 32-bits are 0) so it could be used in 64-bits shifts.
1054   //
1055   //  Input:
1056   //    Rint  -  32-bits value
1057   //    Rtmp  -  scratch
1058   //
1059   void assert_clean_int(Register Rint, Register Rtmp) {
1060 #ifdef ASSERT
1061     Label L;
1062     assert_different_registers(Rtmp, Rint);
1063     __ movslq(Rtmp, Rint);
1064     __ cmpq(Rtmp, Rint);
1065     __ jcc(Assembler::equal, L);
1066     __ stop("high 32-bits of int value are not 0");
1067     __ bind(L);
1068 #endif
1069   }
1070 
1071   //  Generate overlap test for array copy stubs
1072   //
1073   //  Input:
1074   //     c_rarg0 - from
1075   //     c_rarg1 - to
1076   //     c_rarg2 - element count
1077   //
1078   //  Output:
 1079   //     rax   - &from[element count] (exclusive end of the source range)
1080   //
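  //  Conceptually the test is (a sketch, with elem_size = 1 << sf):
  //
  //    if (to <= from || to >= from + count * elem_size) {
  //      goto no_overlap;   // a forward (disjoint) copy is safe
  //    }
  //    // otherwise fall through to the conjoint (backward) copy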
1081   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1082     assert(no_overlap_target != NULL, "must be generated");
1083     array_overlap_test(no_overlap_target, NULL, sf);
1084   }
1085   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1086     array_overlap_test(NULL, &L_no_overlap, sf);
1087   }
1088   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1089     const Register from     = c_rarg0;
1090     const Register to       = c_rarg1;
1091     const Register count    = c_rarg2;
1092     const Register end_from = rax;
1093 
1094     __ cmpptr(to, from);
1095     __ lea(end_from, Address(from, count, sf, 0));
1096     if (NOLp == NULL) {
1097       ExternalAddress no_overlap(no_overlap_target);
1098       __ jump_cc(Assembler::belowEqual, no_overlap);
1099       __ cmpptr(to, end_from);
1100       __ jump_cc(Assembler::aboveEqual, no_overlap);
1101     } else {
1102       __ jcc(Assembler::belowEqual, (*NOLp));
1103       __ cmpptr(to, end_from);
1104       __ jcc(Assembler::aboveEqual, (*NOLp));
1105     }
1106   }
1107 
1108   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1109   //
1110   // Outputs:
1111   //    rdi - rcx
1112   //    rsi - rdx
1113   //    rdx - r8
1114   //    rcx - r9
1115   //
 1116   // Registers r9 and r10 are used to save rdi and rsi, which are non-volatile
 1117   // on Windows.  r9 and r10 should not be used by the caller.
1118   //
1119   DEBUG_ONLY(bool regs_in_thread;)
1120 
1121   void setup_arg_regs(int nargs = 3) {
1122     const Register saved_rdi = r9;
1123     const Register saved_rsi = r10;
1124     assert(nargs == 3 || nargs == 4, "else fix");
1125 #ifdef _WIN64
1126     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1127            "unexpected argument registers");
1128     if (nargs >= 4)
1129       __ mov(rax, r9);  // r9 is also saved_rdi
1130     __ movptr(saved_rdi, rdi);
1131     __ movptr(saved_rsi, rsi);
1132     __ mov(rdi, rcx); // c_rarg0
1133     __ mov(rsi, rdx); // c_rarg1
1134     __ mov(rdx, r8);  // c_rarg2
1135     if (nargs >= 4)
1136       __ mov(rcx, rax); // c_rarg3 (via rax)
1137 #else
1138     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1139            "unexpected argument registers");
1140 #endif
1141     DEBUG_ONLY(regs_in_thread = false;)
1142   }
1143 
1144   void restore_arg_regs() {
1145     assert(!regs_in_thread, "wrong call to restore_arg_regs");
1146     const Register saved_rdi = r9;
1147     const Register saved_rsi = r10;
1148 #ifdef _WIN64
1149     __ movptr(rdi, saved_rdi);
1150     __ movptr(rsi, saved_rsi);
1151 #endif
1152   }
1153 
1154   // This is used in places where r10 is a scratch register, and can
1155   // be adapted if r9 is needed also.
1156   void setup_arg_regs_using_thread() {
1157     const Register saved_r15 = r9;
1158 #ifdef _WIN64
1159     __ mov(saved_r15, r15);  // r15 is callee saved and needs to be restored
1160     __ get_thread(r15_thread);
1161     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1162            "unexpected argument registers");
1163     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
1164     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);
1165 
1166     __ mov(rdi, rcx); // c_rarg0
1167     __ mov(rsi, rdx); // c_rarg1
1168     __ mov(rdx, r8);  // c_rarg2
1169 #else
1170     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1171            "unexpected argument registers");
1172 #endif
1173     DEBUG_ONLY(regs_in_thread = true;)
1174   }
1175 
1176   void restore_arg_regs_using_thread() {
 1177     assert(regs_in_thread, "wrong call to restore_arg_regs_using_thread");
1178     const Register saved_r15 = r9;
1179 #ifdef _WIN64
1180     __ get_thread(r15_thread);
1181     __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
1182     __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
1183     __ mov(r15, saved_r15);  // r15 is callee saved and needs to be restored
1184 #endif
1185   }
1186 
1187   // Copy big chunks forward
1188   //
1189   // Inputs:
 1190   //   end_from     - source array's end address
 1191   //   end_to       - destination array's end address
 1192   //   qword_count  - 64-bit element count, negative
1193   //   to           - scratch
1194   //   L_copy_bytes - entry label
1195   //   L_copy_8_bytes  - exit  label
1196   //
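  // Callers pass end pointers into the arrays and a negated element count, so
  // the loops below walk a negative qword index up toward zero. Schematically,
  // the non-AVX path is roughly (a sketch):
  //
  //   while ((qword_count += 4) <= 0) {
  //     // copy qwords end_from[qword_count-3 .. qword_count]
  //     //         to  end_to  [qword_count-3 .. qword_count]
  //   }
  //   qword_count -= 4;   // 1..3 trailing qwords, if any, are left to the
  //                       // caller's L_copy_8_bytes loop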
1197   void copy_bytes_forward(Register end_from, Register end_to,
1198                              Register qword_count, Register to,
1199                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1200     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1201     Label L_loop;
1202     __ align(OptoLoopAlignment);
1203     if (UseUnalignedLoadStores) {
1204       Label L_end;
1205       __ BIND(L_loop);
1206       if (UseAVX >= 2) {
1207         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1208         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1209         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1210         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1211       } else {
1212         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1213         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1214         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1215         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1216         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1217         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1218         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1219         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1220       }
1221 
1222       __ BIND(L_copy_bytes);
1223       __ addptr(qword_count, 8);
1224       __ jcc(Assembler::lessEqual, L_loop);
1225       __ subptr(qword_count, 4);  // sub(8) and add(4)
1226       __ jccb(Assembler::greater, L_end);
1227       // Copy trailing 32 bytes
1228       if (UseAVX >= 2) {
1229         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1230         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1231       } else {
1232         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1233         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1234         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1235         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1236       }
1237       __ addptr(qword_count, 4);
1238       __ BIND(L_end);
1239       if (UseAVX >= 2) {
1240         // clean upper bits of YMM registers
1241         __ vpxor(xmm0, xmm0);
1242         __ vpxor(xmm1, xmm1);
1243       }
1244     } else {
1245       // Copy 32-bytes per iteration
1246       __ BIND(L_loop);
1247       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1248       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1249       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1250       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1251       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1252       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1253       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1254       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1255 
1256       __ BIND(L_copy_bytes);
1257       __ addptr(qword_count, 4);
1258       __ jcc(Assembler::lessEqual, L_loop);
1259     }
1260     __ subptr(qword_count, 4);
1261     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1262   }
1263 
1264   // Copy big chunks backward
1265   //
1266   // Inputs:
 1267   //   from         - source array address
 1268   //   dest         - destination array address
 1269   //   qword_count  - 64-bit element count
1270   //   to           - scratch
1271   //   L_copy_bytes - entry label
1272   //   L_copy_8_bytes  - exit  label
1273   //
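  // Mirror image of copy_bytes_forward: here the qword index starts at the
  // (positive) element count and steps down toward zero. Schematically, the
  // non-AVX path is roughly (a sketch):
  //
  //   while ((qword_count -= 4) >= 0) {
  //     // copy qwords from[qword_count .. qword_count+3]
  //     //         to  dest[qword_count .. qword_count+3]
  //   }
  //   qword_count += 4;   // 1..3 leading qwords, if any, are left to the
  //                       // caller's L_copy_8_bytes loop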
1274   void copy_bytes_backward(Register from, Register dest,
1275                               Register qword_count, Register to,
1276                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1277     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1278     Label L_loop;
1279     __ align(OptoLoopAlignment);
1280     if (UseUnalignedLoadStores) {
1281       Label L_end;
1282       __ BIND(L_loop);
1283       if (UseAVX >= 2) {
1284         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1285         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1286         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1287         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1288       } else {
1289         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1290         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1291         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1292         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1293         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1294         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1295         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1296         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1297       }
1298 
1299       __ BIND(L_copy_bytes);
1300       __ subptr(qword_count, 8);
1301       __ jcc(Assembler::greaterEqual, L_loop);
1302 
1303       __ addptr(qword_count, 4);  // add(8) and sub(4)
1304       __ jccb(Assembler::less, L_end);
1305       // Copy trailing 32 bytes
1306       if (UseAVX >= 2) {
1307         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1308         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1309       } else {
1310         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1311         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1312         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1313         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1314       }
1315       __ subptr(qword_count, 4);
1316       __ BIND(L_end);
1317       if (UseAVX >= 2) {
1318         // clean upper bits of YMM registers
1319         __ vpxor(xmm0, xmm0);
1320         __ vpxor(xmm1, xmm1);
1321       }
1322     } else {
1323       // Copy 32-bytes per iteration
1324       __ BIND(L_loop);
1325       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1326       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1327       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1328       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1329       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1330       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1331       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1332       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1333 
1334       __ BIND(L_copy_bytes);
1335       __ subptr(qword_count, 4);
1336       __ jcc(Assembler::greaterEqual, L_loop);
1337     }
1338     __ addptr(qword_count, 4);
1339     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1340   }
1341 
1342 #ifndef PRODUCT
1343     int& get_profile_ctr(int shift) {
1344       if ( 0 == shift)
1345         return SharedRuntime::_jbyte_array_copy_ctr;
1346       else if(1 == shift)
1347         return SharedRuntime::_jshort_array_copy_ctr;
1348       else if(2 == shift)
1349         return SharedRuntime::_jint_array_copy_ctr;
1350       else
1351         return SharedRuntime::_jlong_array_copy_ctr;
1352     }
1353 #endif
1354 
1355   void setup_argument_regs(BasicType type) {
1356     if (type == T_BYTE || type == T_SHORT) {
1357       setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1358                         // r9 and r10 may be used to save non-volatile registers
1359     } else {
1360       setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1361                                      // r9 is used to save r15_thread
1362     }
1363   }
1364 
1365   void restore_argument_regs(BasicType type) {
1366     if (type == T_BYTE || type == T_SHORT) {
1367       restore_arg_regs();
1368     } else {
1369       restore_arg_regs_using_thread();
1370     }
1371   }
1372 
1373 #if COMPILER2_OR_JVMCI
 1374   // Note: The following rules apply to the AVX3 optimized arraycopy stubs:
 1375   // - If the target supports AVX3 features (BW+VL+F) then the implementation uses 32 byte
 1376   //   vectors (YMMs) for both the special cases (various small block sizes) and the aligned
 1377   //   copy loop. This is the default configuration.
 1378   // - If the copy length is above AVX3Threshold, then the implementation uses 64 byte vectors
 1379   //   (ZMMs) for the main copy loop (and the subsequent tail), since the bulk of the cycles is consumed there.
 1380   // - If the user forces MaxVectorSize=32 then, above 4096 bytes, REP MOVS shows better
 1381   //   performance for disjoint copies; for conjoint/backward copies a vector based copy performs better.
 1382   // - If the user sets AVX3Threshold=0, then the special cases for small block sizes also
 1383   //   operate over 64 byte vector registers (ZMMs); see the masked-copy sketch below.
 1384   //
1385 
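  // The small-block special cases and the tails lean on AVX-512 masked loads
  // and stores, so a partial vector can be moved with a single load/store pair.
  // Conceptually (a sketch only; the real code lives in
  // MacroAssembler::arraycopy_avx3_special_cases and copy32_masked_avx):
  //
  //   mask = (1 << remaining_element_count) - 1;          // one bit per vector lane
  //   kmovq(k2, mask);
  //   evmovdqu(type, k2, xmm1, Address(from, off), ...);  // masked load
  //   evmovdqu(type, k2, Address(to, off), xmm1, ...);    // masked store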
1386   // Inputs:
1387   //   c_rarg0   - source array address
1388   //   c_rarg1   - destination array address
1389   //   c_rarg2   - element count, treated as ssize_t, can be zero
1390   //
1391   //
1392   // Side Effects:
1393   //   disjoint_copy_avx3_masked is set to the no-overlap entry point
1394   //   used by generate_conjoint_[byte/int/short/long]_copy().
1395   //
1396 
1397   address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1398                                              bool aligned, bool is_oop, bool dest_uninitialized) {
1399     __ align(CodeEntryAlignment);
1400     StubCodeMark mark(this, "StubRoutines", name);
1401     address start = __ pc();
1402 
1403     bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
1404     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1405     Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1406     const Register from        = rdi;  // source array address
1407     const Register to          = rsi;  // destination array address
1408     const Register count       = rdx;  // elements count
1409     const Register temp1       = r8;
1410     const Register temp2       = r11;
1411     const Register temp3       = rax;
1412     const Register temp4       = rcx;
1413     // End pointers are inclusive, and if count is not zero they point
1414     // to the last unit copied:  end_to[0] := end_from[0]
1415 
1416     __ enter(); // required for proper stackwalking of RuntimeStub frame
1417     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1418 
1419     if (entry != NULL) {
1420       *entry = __ pc();
1421        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1422       BLOCK_COMMENT("Entry:");
1423     }
1424 
1425     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1426     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1427 
1428     setup_argument_regs(type);
1429 
1430     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1431     if (dest_uninitialized) {
1432       decorators |= IS_DEST_UNINITIALIZED;
1433     }
1434     if (aligned) {
1435       decorators |= ARRAYCOPY_ALIGNED;
1436     }
1437     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1438     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1439 
1440     {
1441       // Type(shift)           byte(0), short(1), int(2),   long(3)
1442       int loop_size[]        = { 192,     96,       48,      24};
1443       int threshold[]        = { 4096,    2048,     1024,    512};
1444 
1445       // UnsafeCopyMemory page error: continue after ucm
1446       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1447       // 'from', 'to' and 'count' are now valid
1448 
1449       // temp1 holds remaining count and temp4 holds running count used to compute
1450       // next address offset for start of to/from addresses (temp4 * scale).
1451       __ mov64(temp4, 0);
1452       __ movq(temp1, count);
1453 
1454       // Zero length check.
1455       __ BIND(L_tail);
1456       __ cmpq(temp1, 0);
1457       __ jcc(Assembler::lessEqual, L_exit);
1458 
1459       // Special cases using 32 byte [masked] vector copy operations.
1460       __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1461                                       temp4, temp3, use64byteVector, L_entry, L_exit);
1462 
1463       // PRE-MAIN-POST loop for aligned copy.
1464       __ BIND(L_entry);
1465 
1466       if (AVX3Threshold != 0) {
1467         __ cmpq(count, threshold[shift]);
1468         if (MaxVectorSize == 64) {
1469           // Copy using 64 byte vectors.
1470           __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1471         } else {
1472           assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
1473           // REP MOVS offers a faster copy path.
1474           __ jcc(Assembler::greaterEqual, L_repmovs);
1475         }
1476       }
1477 
1478       if (MaxVectorSize < 64  || AVX3Threshold != 0) {
1479         // Partial copy to make dst address 32 byte aligned.
1480         __ movq(temp2, to);
1481         __ andq(temp2, 31);
1482         __ jcc(Assembler::equal, L_main_pre_loop);
1483 
1484         __ negptr(temp2);
1485         __ addq(temp2, 32);
1486         if (shift) {
1487           __ shrq(temp2, shift);
1488         }
1489         __ movq(temp3, temp2);
1490         __ copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
1491         __ movq(temp4, temp2);
1492         __ movq(temp1, count);
1493         __ subq(temp1, temp2);
1494 
1495         __ cmpq(temp1, loop_size[shift]);
1496         __ jcc(Assembler::less, L_tail);
1497 
1498         __ BIND(L_main_pre_loop);
1499         __ subq(temp1, loop_size[shift]);
1500 
1501         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1502         __ align32();
1503         __ BIND(L_main_loop);
1504            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
1505            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
1506            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128);
1507            __ addptr(temp4, loop_size[shift]);
1508            __ subq(temp1, loop_size[shift]);
1509            __ jcc(Assembler::greater, L_main_loop);
1510 
1511         __ addq(temp1, loop_size[shift]);
1512 
1513         // Tail loop.
1514         __ jmp(L_tail);
1515 
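      // Note (illustrative, hedged): the REP MOVS path below copies whole quad
      // words only. With 'count' in elements and 'shift' = log2(element size),
      // the arithmetic performed by the code below is, in plain C terms:
      //
      //   size_t qwords    = count >> (3 - shift);    // quad words for REP MOVSQ
      //   size_t copied    = (qwords << 3) >> shift;  // elements actually copied
      //   size_t remaining = count - copied;          // tail elements, finished at L_tail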
1516         __ BIND(L_repmovs);
1517           __ movq(temp2, temp1);
1518           // Swap to(RSI) and from(RDI) addresses to comply with REP MOVS semantics.
1519           __ movq(temp3, to);
1520           __ movq(to,  from);
1521           __ movq(from, temp3);
1522           // Save to/from for restoration post rep_mov.
1523           __ movq(temp1, to);
1524           __ movq(temp3, from);
1525           if (shift < 3) {
1526             __ shrq(temp2, 3-shift);     // quad word count
1527           }
1528           __ movq(temp4, temp2);         // move quad word count into temp4 (RCX).
1529           __ rep_mov();
1530           __ shlq(temp2, 3);             // convert quad words into byte count.
1531           if (shift) {
1532             __ shrq(temp2, shift);       // type specific count.
1533           }
1534           // Restore original addresses in to/from.
1535           __ movq(to, temp3);
1536           __ movq(from, temp1);
1537           __ movq(temp4, temp2);
1538           __ movq(temp1, count);
1539           __ subq(temp1, temp2);         // trailing part (less than a quad word).
1540           __ jmp(L_tail);
1541       }
1542 
1543       if (MaxVectorSize > 32) {
1544         __ BIND(L_pre_main_post_64);
1545         // Partial copy to make dst address 64 byte aligned.
1546         __ movq(temp2, to);
1547         __ andq(temp2, 63);
1548         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1549 
1550         __ negptr(temp2);
1551         __ addq(temp2, 64);
1552         if (shift) {
1553           __ shrq(temp2, shift);
1554         }
1555         __ movq(temp3, temp2);
1556         __ copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
1557         __ movq(temp4, temp2);
1558         __ movq(temp1, count);
1559         __ subq(temp1, temp2);
1560 
1561         __ cmpq(temp1, loop_size[shift]);
1562         __ jcc(Assembler::less, L_tail64);
1563 
1564         __ BIND(L_main_pre_loop_64bytes);
1565         __ subq(temp1, loop_size[shift]);
1566 
1567         // Main loop with aligned copy block size of 192 bytes at
1568         // 64 byte copy granularity.
1569         __ align32();
1570         __ BIND(L_main_loop_64bytes);
1571            __ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
1572            __ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
1573            __ copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
1574            __ addptr(temp4, loop_size[shift]);
1575            __ subq(temp1, loop_size[shift]);
1576            __ jcc(Assembler::greater, L_main_loop_64bytes);
1577 
1578         __ addq(temp1, loop_size[shift]);
1579         // Zero length check.
1580         __ jcc(Assembler::lessEqual, L_exit);
1581 
1582         __ BIND(L_tail64);
1583 
1584         // Tail handling using 64 byte [masked] vector copy operations.
1585         use64byteVector = true;
1586         __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1587                                         temp4, temp3, use64byteVector, L_entry, L_exit);
1588       }
1589       __ BIND(L_exit);
1590     }
1591 
1592     address ucme_exit_pc = __ pc();
1593     // When called from generic_arraycopy, r11 contains specific values
1594     // used during the arraycopy epilogue, so re-initialize r11 here.
1595     if (is_oop) {
1596       __ movq(r11, shift == 3 ? count : to);
1597     }
1598     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1599     restore_argument_regs(type);
1600     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1601     __ xorptr(rax, rax); // return 0
1602     __ vzeroupper();
1603     __ leave(); // required for proper stackwalking of RuntimeStub frame
1604     __ ret(0);
1605     return start;
1606   }
1607 
1608   // Inputs:
1609   //   c_rarg0   - source array address
1610   //   c_rarg1   - destination array address
1611   //   c_rarg2   - element count, treated as ssize_t, can be zero
1612   //
1613   //
1614   address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1615                                              address nooverlap_target, bool aligned, bool is_oop,
1616                                              bool dest_uninitialized) {
1617     __ align(CodeEntryAlignment);
1618     StubCodeMark mark(this, "StubRoutines", name);
1619     address start = __ pc();
1620 
1621     bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;
1622 
1623     Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1624     Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1625     const Register from        = rdi;  // source array address
1626     const Register to          = rsi;  // destination array address
1627     const Register count       = rdx;  // elements count
1628     const Register temp1       = r8;
1629     const Register temp2       = rcx;
1630     const Register temp3       = r11;
1631     const Register temp4       = rax;
1632     // End pointers are inclusive, and if count is not zero they point
1633     // to the last unit copied:  end_to[0] := end_from[0]
1634 
1635     __ enter(); // required for proper stackwalking of RuntimeStub frame
1636     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1637 
1638     if (entry != NULL) {
1639       *entry = __ pc();
1640        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1641       BLOCK_COMMENT("Entry:");
1642     }
1643 
1644     array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
1645 
1646     BasicType type_vec[] = { T_BYTE,  T_SHORT,  T_INT,   T_LONG};
1647     BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1648 
1649     setup_argument_regs(type);
1650 
1651     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1652     if (dest_uninitialized) {
1653       decorators |= IS_DEST_UNINITIALIZED;
1654     }
1655     if (aligned) {
1656       decorators |= ARRAYCOPY_ALIGNED;
1657     }
1658     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1659     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1660     {
1661       // Type(shift)       byte(0), short(1), int(2),   long(3)
1662       int loop_size[]   = { 192,     96,       48,      24};
1663       int threshold[]   = { 4096,    2048,     1024,    512};
1664 
1665       // UnsafeCopyMemory page error: continue after ucm
1666       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1667       // 'from', 'to' and 'count' are now valid
1668 
1669       // temp1 holds remaining count.
1670       __ movq(temp1, count);
1671 
1672       // Zero length check.
1673       __ BIND(L_tail);
1674       __ cmpq(temp1, 0);
1675       __ jcc(Assembler::lessEqual, L_exit);
1676 
1677       __ mov64(temp2, 0);
1678       __ movq(temp3, temp1);
1679       // Special cases using 32 byte [masked] vector copy operations.
1680       __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1681                                                temp4, use64byteVector, L_entry, L_exit);
1682 
1683       // PRE-MAIN-POST loop for aligned copy.
1684       __ BIND(L_entry);
1685 
1686       if (MaxVectorSize > 32 && AVX3Threshold != 0) {
1687         __ cmpq(temp1, threshold[shift]);
1688         __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1689       }
1690 
1691       if (MaxVectorSize < 64  || AVX3Threshold != 0) {
1692         // Partial copy to make dst address 32 byte aligned.
1693         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1694         __ andq(temp2, 31);
1695         __ jcc(Assembler::equal, L_main_pre_loop);
1696 
1697         if (shift) {
1698           __ shrq(temp2, shift);
1699         }
1700         __ subq(temp1, temp2);
1701         __ copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
1702 
1703         __ cmpq(temp1, loop_size[shift]);
1704         __ jcc(Assembler::less, L_tail);
1705 
1706         __ BIND(L_main_pre_loop);
1707 
1708         // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1709         __ align32();
1710         __ BIND(L_main_loop);
1711            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
1712            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
1713            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192);
1714            __ subptr(temp1, loop_size[shift]);
1715            __ cmpq(temp1, loop_size[shift]);
1716            __ jcc(Assembler::greater, L_main_loop);
1717 
1718         // Tail loop.
1719         __ jmp(L_tail);
1720       }
1721 
1722       if (MaxVectorSize > 32) {
1723         __ BIND(L_pre_main_post_64);
1724         // Partial copy to make dst address 64 byte aligned.
1725         __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1726         __ andq(temp2, 63);
1727         __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1728 
1729         if (shift) {
1730           __ shrq(temp2, shift);
1731         }
1732         __ subq(temp1, temp2);
1733         __ copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
1734 
1735         __ cmpq(temp1, loop_size[shift]);
1736         __ jcc(Assembler::less, L_tail64);
1737 
1738         __ BIND(L_main_pre_loop_64bytes);
1739 
1740         // Main loop with aligned copy block size of 192 bytes at
1741         // 64 byte copy granularity.
1742         __ align32();
1743         __ BIND(L_main_loop_64bytes);
1744            __ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
1745            __ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
1746            __ copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
1747            __ subq(temp1, loop_size[shift]);
1748            __ cmpq(temp1, loop_size[shift]);
1749            __ jcc(Assembler::greater, L_main_loop_64bytes);
1750 
1751         // Zero length check.
1752         __ cmpq(temp1, 0);
1753         __ jcc(Assembler::lessEqual, L_exit);
1754 
1755         __ BIND(L_tail64);
1756 
1757         // Tail handling using 64 byte [masked] vector copy operations.
1758         use64byteVector = true;
1759         __ mov64(temp2, 0);
1760         __ movq(temp3, temp1);
1761         __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1762                                                  temp4, use64byteVector, L_entry, L_exit);
1763       }
1764       __ BIND(L_exit);
1765     }
1766     address ucme_exit_pc = __ pc();
1767     // When called from generic_arraycopy, r11 contains specific values
1768     // used during the arraycopy epilogue, so re-initialize r11 here.
1769     if (is_oop) {
1770       __ movq(r11, count);
1771     }
1772     bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1773     restore_argument_regs(type);
1774     inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1775     __ xorptr(rax, rax); // return 0
1776     __ vzeroupper();
1777     __ leave(); // required for proper stackwalking of RuntimeStub frame
1778     __ ret(0);
1779     return start;
1780   }
1781 #endif // COMPILER2_OR_JVMCI
1782 
1783 
1784   // Arguments:
1785   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1786   //             ignored
1787   //   name    - stub name string
1788   //
1789   // Inputs:
1790   //   c_rarg0   - source array address
1791   //   c_rarg1   - destination array address
1792   //   c_rarg2   - element count, treated as ssize_t, can be zero
1793   //
1794   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1795   // we let the hardware handle it.  The one to eight bytes within words,
1796   // dwords or qwords that span cache line boundaries will still be loaded
1797   // and stored atomically.
1798   //
1799   // Side Effects:
1800   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1801   //   used by generate_conjoint_byte_copy().
1802   //
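  // Illustrative sketch only (plain C, hedged): the copy order used by the
  // non-AVX3 path below, ignoring the UnsafeCopyMemory bookkeeping. The helper
  // name is hypothetical and is not part of the stub (assumes <string.h>).
  //
  //   static void byte_copy_forward_sketch(char* to, const char* from, size_t count) {
  //     size_t bulk = count & ~(size_t)7;                                  // whole qwords, copied forward
  //     memcpy(to, from, bulk);
  //     if (count & 4) { memcpy(to + bulk, from + bulk, 4); bulk += 4; }   // trailing dword
  //     if (count & 2) { memcpy(to + bulk, from + bulk, 2); bulk += 2; }   // trailing word
  //     if (count & 1) { to[bulk] = from[bulk]; }                          // trailing byte
  //   }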
1803   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1804 #if COMPILER2_OR_JVMCI
1805     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1806        return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
1807                                                  aligned, false, false);
1808     }
1809 #endif
1810     __ align(CodeEntryAlignment);
1811     StubCodeMark mark(this, "StubRoutines", name);
1812     address start = __ pc();
1813 
1814     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1815     Label L_copy_byte, L_exit;
1816     const Register from        = rdi;  // source array address
1817     const Register to          = rsi;  // destination array address
1818     const Register count       = rdx;  // elements count
1819     const Register byte_count  = rcx;
1820     const Register qword_count = count;
1821     const Register end_from    = from; // source array end address
1822     const Register end_to      = to;   // destination array end address
1823     // End pointers are inclusive, and if count is not zero they point
1824     // to the last unit copied:  end_to[0] := end_from[0]
1825 
1826     __ enter(); // required for proper stackwalking of RuntimeStub frame
1827     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1828 
1829     if (entry != NULL) {
1830       *entry = __ pc();
1831        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1832       BLOCK_COMMENT("Entry:");
1833     }
1834 
1835     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1836                       // r9 and r10 may be used to save non-volatile registers
1837 
1838     {
1839       // UnsafeCopyMemory page error: continue after ucm
1840       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1841       // 'from', 'to' and 'count' are now valid
1842       __ movptr(byte_count, count);
1843       __ shrptr(count, 3); // count => qword_count
1844 
1845       // Copy from low to high addresses.  Use 'to' as scratch.
1846       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1847       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1848       __ negptr(qword_count); // make the count negative
1849       __ jmp(L_copy_bytes);
1850 
1851       // Copy trailing qwords
1852     __ BIND(L_copy_8_bytes);
1853       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1854       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1855       __ increment(qword_count);
1856       __ jcc(Assembler::notZero, L_copy_8_bytes);
1857 
1858       // Check for and copy trailing dword
1859     __ BIND(L_copy_4_bytes);
1860       __ testl(byte_count, 4);
1861       __ jccb(Assembler::zero, L_copy_2_bytes);
1862       __ movl(rax, Address(end_from, 8));
1863       __ movl(Address(end_to, 8), rax);
1864 
1865       __ addptr(end_from, 4);
1866       __ addptr(end_to, 4);
1867 
1868       // Check for and copy trailing word
1869     __ BIND(L_copy_2_bytes);
1870       __ testl(byte_count, 2);
1871       __ jccb(Assembler::zero, L_copy_byte);
1872       __ movw(rax, Address(end_from, 8));
1873       __ movw(Address(end_to, 8), rax);
1874 
1875       __ addptr(end_from, 2);
1876       __ addptr(end_to, 2);
1877 
1878       // Check for and copy trailing byte
1879     __ BIND(L_copy_byte);
1880       __ testl(byte_count, 1);
1881       __ jccb(Assembler::zero, L_exit);
1882       __ movb(rax, Address(end_from, 8));
1883       __ movb(Address(end_to, 8), rax);
1884     }
1885   __ BIND(L_exit);
1886     address ucme_exit_pc = __ pc();
1887     restore_arg_regs();
1888     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1889     __ xorptr(rax, rax); // return 0
1890     __ vzeroupper();
1891     __ leave(); // required for proper stackwalking of RuntimeStub frame
1892     __ ret(0);
1893 
1894     {
1895       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
1896       // Copy in multi-byte chunks
1897       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1898       __ jmp(L_copy_4_bytes);
1899     }
1900     return start;
1901   }
1902 
1903   // Arguments:
1904   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1905   //             ignored
1906   //   name    - stub name string
1907   //
1908   // Inputs:
1909   //   c_rarg0   - source array address
1910   //   c_rarg1   - destination array address
1911   //   c_rarg2   - element count, treated as ssize_t, can be zero
1912   //
1913   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1914   // we let the hardware handle it.  The one to eight bytes within words,
1915   // dwords or qwords that span cache line boundaries will still be loaded
1916   // and stored atomically.
1917   //
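  // Illustrative sketch only (plain C, hedged): when the destination overlaps the
  // source at a higher address, the stub below copies from high addresses to low
  // ones, tails first, so no source byte is overwritten before it has been copied.
  // The helper name is hypothetical and is not part of the stub (assumes <string.h>).
  //
  //   static void byte_copy_backward_sketch(char* to, const char* from, size_t count) {
  //     if (count & 1) to[count - 1] = from[count - 1];                                      // trailing byte
  //     if (count & 2) memmove(to + (count & ~(size_t)3), from + (count & ~(size_t)3), 2);   // trailing word
  //     if (count & 4) memmove(to + (count & ~(size_t)7), from + (count & ~(size_t)7), 4);   // trailing dword
  //     for (size_t q = count >> 3; q > 0; q--)                                              // qwords, high to low
  //       memmove(to + (q - 1) * 8, from + (q - 1) * 8, 8);
  //   }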
1918   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1919                                       address* entry, const char *name) {
1920 #if COMPILER2_OR_JVMCI
1921     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
1922        return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
1923                                                  nooverlap_target, aligned, false, false);
1924     }
1925 #endif
1926     __ align(CodeEntryAlignment);
1927     StubCodeMark mark(this, "StubRoutines", name);
1928     address start = __ pc();
1929 
1930     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1931     const Register from        = rdi;  // source array address
1932     const Register to          = rsi;  // destination array address
1933     const Register count       = rdx;  // elements count
1934     const Register byte_count  = rcx;
1935     const Register qword_count = count;
1936 
1937     __ enter(); // required for proper stackwalking of RuntimeStub frame
1938     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1939 
1940     if (entry != NULL) {
1941       *entry = __ pc();
1942       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1943       BLOCK_COMMENT("Entry:");
1944     }
1945 
1946     array_overlap_test(nooverlap_target, Address::times_1);
1947     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1948                       // r9 and r10 may be used to save non-volatile registers
1949 
1950     {
1951       // UnsafeCopyMemory page error: continue after ucm
1952       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1953       // 'from', 'to' and 'count' are now valid
1954       __ movptr(byte_count, count);
1955       __ shrptr(count, 3);   // count => qword_count
1956 
1957       // Copy from high to low addresses.
1958 
1959       // Check for and copy trailing byte
1960       __ testl(byte_count, 1);
1961       __ jcc(Assembler::zero, L_copy_2_bytes);
1962       __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1963       __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1964       __ decrement(byte_count); // Adjust for possible trailing word
1965 
1966       // Check for and copy trailing word
1967     __ BIND(L_copy_2_bytes);
1968       __ testl(byte_count, 2);
1969       __ jcc(Assembler::zero, L_copy_4_bytes);
1970       __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1971       __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1972 
1973       // Check for and copy trailing dword
1974     __ BIND(L_copy_4_bytes);
1975       __ testl(byte_count, 4);
1976       __ jcc(Assembler::zero, L_copy_bytes);
1977       __ movl(rax, Address(from, qword_count, Address::times_8));
1978       __ movl(Address(to, qword_count, Address::times_8), rax);
1979       __ jmp(L_copy_bytes);
1980 
1981       // Copy trailing qwords
1982     __ BIND(L_copy_8_bytes);
1983       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1984       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1985       __ decrement(qword_count);
1986       __ jcc(Assembler::notZero, L_copy_8_bytes);
1987     }
1988     restore_arg_regs();
1989     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1990     __ xorptr(rax, rax); // return 0
1991     __ vzeroupper();
1992     __ leave(); // required for proper stackwalking of RuntimeStub frame
1993     __ ret(0);
1994 
1995     {
1996       // UnsafeCopyMemory page error: continue after ucm
1997       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1998       // Copy in multi-byte chunks
1999       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2000     }
2001     restore_arg_regs();
2002     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
2003     __ xorptr(rax, rax); // return 0
2004     __ vzeroupper();
2005     __ leave(); // required for proper stackwalking of RuntimeStub frame
2006     __ ret(0);
2007 
2008     return start;
2009   }
2010 
2011   // Arguments:
2012   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2013   //             ignored
2014   //   name    - stub name string
2015   //
2016   // Inputs:
2017   //   c_rarg0   - source array address
2018   //   c_rarg1   - destination array address
2019   //   c_rarg2   - element count, treated as ssize_t, can be zero
2020   //
2021   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2022   // let the hardware handle it.  The two or four words within dwords
2023   // or qwords that span cache line boundaries will still be loaded
2024   // and stored atomically.
2025   //
2026   // Side Effects:
2027   //   disjoint_short_copy_entry is set to the no-overlap entry point
2028   //   used by generate_conjoint_short_copy().
2029   //
2030   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
2031 #if COMPILER2_OR_JVMCI
2032     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2033        return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
2034                                                  aligned, false, false);
2035     }
2036 #endif
2037 
2038     __ align(CodeEntryAlignment);
2039     StubCodeMark mark(this, "StubRoutines", name);
2040     address start = __ pc();
2041 
2042     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
2043     const Register from        = rdi;  // source array address
2044     const Register to          = rsi;  // destination array address
2045     const Register count       = rdx;  // elements count
2046     const Register word_count  = rcx;
2047     const Register qword_count = count;
2048     const Register end_from    = from; // source array end address
2049     const Register end_to      = to;   // destination array end address
2050     // End pointers are inclusive, and if count is not zero they point
2051     // to the last unit copied:  end_to[0] := end_from[0]
2052 
2053     __ enter(); // required for proper stackwalking of RuntimeStub frame
2054     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2055 
2056     if (entry != NULL) {
2057       *entry = __ pc();
2058       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2059       BLOCK_COMMENT("Entry:");
2060     }
2061 
2062     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2063                       // r9 and r10 may be used to save non-volatile registers
2064 
2065     {
2066       // UnsafeCopyMemory page error: continue after ucm
2067       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2068       // 'from', 'to' and 'count' are now valid
2069       __ movptr(word_count, count);
2070       __ shrptr(count, 2); // count => qword_count
2071 
2072       // Copy from low to high addresses.  Use 'to' as scratch.
2073       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2074       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2075       __ negptr(qword_count);
2076       __ jmp(L_copy_bytes);
2077 
2078       // Copy trailing qwords
2079     __ BIND(L_copy_8_bytes);
2080       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2081       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2082       __ increment(qword_count);
2083       __ jcc(Assembler::notZero, L_copy_8_bytes);
2084 
2085       // Original 'dest' is trashed, so we can't use it as a
2086       // base register for a possible trailing word copy
2087 
2088       // Check for and copy trailing dword
2089     __ BIND(L_copy_4_bytes);
2090       __ testl(word_count, 2);
2091       __ jccb(Assembler::zero, L_copy_2_bytes);
2092       __ movl(rax, Address(end_from, 8));
2093       __ movl(Address(end_to, 8), rax);
2094 
2095       __ addptr(end_from, 4);
2096       __ addptr(end_to, 4);
2097 
2098       // Check for and copy trailing word
2099     __ BIND(L_copy_2_bytes);
2100       __ testl(word_count, 1);
2101       __ jccb(Assembler::zero, L_exit);
2102       __ movw(rax, Address(end_from, 8));
2103       __ movw(Address(end_to, 8), rax);
2104     }
2105   __ BIND(L_exit);
2106     address ucme_exit_pc = __ pc();
2107     restore_arg_regs();
2108     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2109     __ xorptr(rax, rax); // return 0
2110     __ vzeroupper();
2111     __ leave(); // required for proper stackwalking of RuntimeStub frame
2112     __ ret(0);
2113 
2114     {
2115       UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
2116       // Copy in multi-byte chunks
2117       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2118       __ jmp(L_copy_4_bytes);
2119     }
2120 
2121     return start;
2122   }
2123 
2124   address generate_fill(BasicType t, bool aligned, const char *name) {
2125     __ align(CodeEntryAlignment);
2126     StubCodeMark mark(this, "StubRoutines", name);
2127     address start = __ pc();
2128 
2129     BLOCK_COMMENT("Entry:");
2130 
2131     const Register to       = c_rarg0;  // destination array address
2132     const Register value    = c_rarg1;  // value
2133     const Register count    = c_rarg2;  // elements count
2134 
2135     __ enter(); // required for proper stackwalking of RuntimeStub frame
2136 
2137     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
2138 
2139     __ vzeroupper();
2140     __ leave(); // required for proper stackwalking of RuntimeStub frame
2141     __ ret(0);
2142     return start;
2143   }
2144 
2145   // Arguments:
2146   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2147   //             ignored
2148   //   name    - stub name string
2149   //
2150   // Inputs:
2151   //   c_rarg0   - source array address
2152   //   c_rarg1   - destination array address
2153   //   c_rarg2   - element count, treated as ssize_t, can be zero
2154   //
2155   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2156   // let the hardware handle it.  The two or four words within dwords
2157   // or qwords that span cache line boundaries will still be loaded
2158   // and stored atomically.
2159   //
2160   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
2161                                        address *entry, const char *name) {
2162 #if COMPILER2_OR_JVMCI
2163     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2164        return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
2165                                                  nooverlap_target, aligned, false, false);
2166     }
2167 #endif
2168     __ align(CodeEntryAlignment);
2169     StubCodeMark mark(this, "StubRoutines", name);
2170     address start = __ pc();
2171 
2172     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2173     const Register from        = rdi;  // source array address
2174     const Register to          = rsi;  // destination array address
2175     const Register count       = rdx;  // elements count
2176     const Register word_count  = rcx;
2177     const Register qword_count = count;
2178 
2179     __ enter(); // required for proper stackwalking of RuntimeStub frame
2180     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2181 
2182     if (entry != NULL) {
2183       *entry = __ pc();
2184       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2185       BLOCK_COMMENT("Entry:");
2186     }
2187 
2188     array_overlap_test(nooverlap_target, Address::times_2);
2189     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2190                       // r9 and r10 may be used to save non-volatile registers
2191 
2192     {
2193       // UnsafeCopyMemory page error: continue after ucm
2194       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2195       // 'from', 'to' and 'count' are now valid
2196       __ movptr(word_count, count);
2197       __ shrptr(count, 2); // count => qword_count
2198 
2199       // Copy from high to low addresses.  Use 'to' as scratch.
2200 
2201       // Check for and copy trailing word
2202       __ testl(word_count, 1);
2203       __ jccb(Assembler::zero, L_copy_4_bytes);
2204       __ movw(rax, Address(from, word_count, Address::times_2, -2));
2205       __ movw(Address(to, word_count, Address::times_2, -2), rax);
2206 
2207      // Check for and copy trailing dword
2208     __ BIND(L_copy_4_bytes);
2209       __ testl(word_count, 2);
2210       __ jcc(Assembler::zero, L_copy_bytes);
2211       __ movl(rax, Address(from, qword_count, Address::times_8));
2212       __ movl(Address(to, qword_count, Address::times_8), rax);
2213       __ jmp(L_copy_bytes);
2214 
2215       // Copy trailing qwords
2216     __ BIND(L_copy_8_bytes);
2217       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2218       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2219       __ decrement(qword_count);
2220       __ jcc(Assembler::notZero, L_copy_8_bytes);
2221     }
2222     restore_arg_regs();
2223     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2224     __ xorptr(rax, rax); // return 0
2225     __ vzeroupper();
2226     __ leave(); // required for proper stackwalking of RuntimeStub frame
2227     __ ret(0);
2228 
2229     {
2230       // UnsafeCopyMemory page error: continue after ucm
2231       UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2232       // Copy in multi-byte chunks
2233       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2234     }
2235     restore_arg_regs();
2236     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2237     __ xorptr(rax, rax); // return 0
2238     __ vzeroupper();
2239     __ leave(); // required for proper stackwalking of RuntimeStub frame
2240     __ ret(0);
2241 
2242     return start;
2243   }
2244 
2245   // Arguments:
2246   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2247   //             ignored
2248   //   is_oop  - true => oop array, so generate store check code
2249   //   name    - stub name string
2250   //
2251   // Inputs:
2252   //   c_rarg0   - source array address
2253   //   c_rarg1   - destination array address
2254   //   c_rarg2   - element count, treated as ssize_t, can be zero
2255   //
2256   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2257   // the hardware handle it.  The two dwords within qwords that span
2258   // cache line boundaries will still be loaded and stored atomically.
2259   //
2260   // Side Effects:
2261   //   disjoint_int_copy_entry is set to the no-overlap entry point
2262   //   used by generate_conjoint_int_oop_copy().
2263   //
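  // For the is_oop case the raw copy below is bracketed by GC barrier code.
  // Conceptually (hedged sketch; the details depend on the collector in use):
  //
  //   bs->arraycopy_prologue(...);   // e.g. pre-write barrier work before the copy
  //   ... raw dword/qword copy ...
  //   bs->arraycopy_epilogue(...);   // e.g. card marking / post-write barrier for the copied range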
2264   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2265                                          const char *name, bool dest_uninitialized = false) {
2266 #if COMPILER2_OR_JVMCI
2267     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2268        return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
2269                                                  aligned, is_oop, dest_uninitialized);
2270     }
2271 #endif
2272 
2273     __ align(CodeEntryAlignment);
2274     StubCodeMark mark(this, "StubRoutines", name);
2275     address start = __ pc();
2276 
2277     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2278     const Register from        = rdi;  // source array address
2279     const Register to          = rsi;  // destination array address
2280     const Register count       = rdx;  // elements count
2281     const Register dword_count = rcx;
2282     const Register qword_count = count;
2283     const Register end_from    = from; // source array end address
2284     const Register end_to      = to;   // destination array end address
2285     // End pointers are inclusive, and if count is not zero they point
2286     // to the last unit copied:  end_to[0] := end_from[0]
2287 
2288     __ enter(); // required for proper stackwalking of RuntimeStub frame
2289     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2290 
2291     if (entry != NULL) {
2292       *entry = __ pc();
2293       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2294       BLOCK_COMMENT("Entry:");
2295     }
2296 
2297     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2298                                    // r9 is used to save r15_thread
2299 
2300     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2301     if (dest_uninitialized) {
2302       decorators |= IS_DEST_UNINITIALIZED;
2303     }
2304     if (aligned) {
2305       decorators |= ARRAYCOPY_ALIGNED;
2306     }
2307 
2308     BasicType type = is_oop ? T_OBJECT : T_INT;
2309     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2310     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2311 
2312     {
2313       // UnsafeCopyMemory page error: continue after ucm
2314       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2315       // 'from', 'to' and 'count' are now valid
2316       __ movptr(dword_count, count);
2317       __ shrptr(count, 1); // count => qword_count
2318 
2319       // Copy from low to high addresses.  Use 'to' as scratch.
2320       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2321       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2322       __ negptr(qword_count);
2323       __ jmp(L_copy_bytes);
2324 
2325       // Copy trailing qwords
2326     __ BIND(L_copy_8_bytes);
2327       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2328       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2329       __ increment(qword_count);
2330       __ jcc(Assembler::notZero, L_copy_8_bytes);
2331 
2332       // Check for and copy trailing dword
2333     __ BIND(L_copy_4_bytes);
2334       __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2335       __ jccb(Assembler::zero, L_exit);
2336       __ movl(rax, Address(end_from, 8));
2337       __ movl(Address(end_to, 8), rax);
2338     }
2339   __ BIND(L_exit);
2340     address ucme_exit_pc = __ pc();
2341     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2342     restore_arg_regs_using_thread();
2343     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2344     __ vzeroupper();
2345     __ xorptr(rax, rax); // return 0
2346     __ leave(); // required for proper stackwalking of RuntimeStub frame
2347     __ ret(0);
2348 
2349     {
2350       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
2351       // Copy in multi-byte chunks
2352       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2353       __ jmp(L_copy_4_bytes);
2354     }
2355 
2356     return start;
2357   }
2358 
2359   // Arguments:
2360   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2361   //             ignored
2362   //   is_oop  - true => oop array, so generate store check code
2363   //   name    - stub name string
2364   //
2365   // Inputs:
2366   //   c_rarg0   - source array address
2367   //   c_rarg1   - destination array address
2368   //   c_rarg2   - element count, treated as ssize_t, can be zero
2369   //
2370   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2371   // the hardware handle it.  The two dwords within qwords that span
2372   // cache line boundaries will still be loaded and stored atomically.
2373   //
2374   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2375                                          address *entry, const char *name,
2376                                          bool dest_uninitialized = false) {
2377 #if COMPILER2_OR_JVMCI
2378     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2379        return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
2380                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
2381     }
2382 #endif
2383     __ align(CodeEntryAlignment);
2384     StubCodeMark mark(this, "StubRoutines", name);
2385     address start = __ pc();
2386 
2387     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2388     const Register from        = rdi;  // source array address
2389     const Register to          = rsi;  // destination array address
2390     const Register count       = rdx;  // elements count
2391     const Register dword_count = rcx;
2392     const Register qword_count = count;
2393 
2394     __ enter(); // required for proper stackwalking of RuntimeStub frame
2395     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2396 
2397     if (entry != NULL) {
2398       *entry = __ pc();
2399        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2400       BLOCK_COMMENT("Entry:");
2401     }
2402 
2403     array_overlap_test(nooverlap_target, Address::times_4);
2404     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2405                                    // r9 is used to save r15_thread
2406 
2407     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2408     if (dest_uninitialized) {
2409       decorators |= IS_DEST_UNINITIALIZED;
2410     }
2411     if (aligned) {
2412       decorators |= ARRAYCOPY_ALIGNED;
2413     }
2414 
2415     BasicType type = is_oop ? T_OBJECT : T_INT;
2416     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2417     // no registers are destroyed by this call
2418     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2419 
2420     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2421     {
2422       // UnsafeCopyMemory page error: continue after ucm
2423       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2424       // 'from', 'to' and 'count' are now valid
2425       __ movptr(dword_count, count);
2426       __ shrptr(count, 1); // count => qword_count
2427 
2428       // Copy from high to low addresses.  Use 'to' as scratch.
2429 
2430       // Check for and copy trailing dword
2431       __ testl(dword_count, 1);
2432       __ jcc(Assembler::zero, L_copy_bytes);
2433       __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2434       __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2435       __ jmp(L_copy_bytes);
2436 
2437       // Copy trailing qwords
2438     __ BIND(L_copy_8_bytes);
2439       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2440       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2441       __ decrement(qword_count);
2442       __ jcc(Assembler::notZero, L_copy_8_bytes);
2443     }
2444     if (is_oop) {
2445       __ jmp(L_exit);
2446     }
2447     restore_arg_regs_using_thread();
2448     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2449     __ xorptr(rax, rax); // return 0
2450     __ vzeroupper();
2451     __ leave(); // required for proper stackwalking of RuntimeStub frame
2452     __ ret(0);
2453 
2454     {
2455       // UnsafeCopyMemory page error: continue after ucm
2456       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2457       // Copy in multi-byte chunks
2458       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2459     }
2460 
2461   __ BIND(L_exit);
2462     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2463     restore_arg_regs_using_thread();
2464     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2465     __ xorptr(rax, rax); // return 0
2466     __ vzeroupper();
2467     __ leave(); // required for proper stackwalking of RuntimeStub frame
2468     __ ret(0);
2469 
2470     return start;
2471   }
2472 
2473   // Arguments:
2474   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2475   //             ignored
2476   //   is_oop  - true => oop array, so generate store check code
2477   //   name    - stub name string
2478   //
2479   // Inputs:
2480   //   c_rarg0   - source array address
2481   //   c_rarg1   - destination array address
2482   //   c_rarg2   - element count, treated as ssize_t, can be zero
2483   //
2484   // Side Effects:
2485   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2486   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2487   //
2488   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2489                                           const char *name, bool dest_uninitialized = false) {
2490 #if COMPILER2_OR_JVMCI
2491     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2492        return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
2493                                                  aligned, is_oop, dest_uninitialized);
2494     }
2495 #endif
2496     __ align(CodeEntryAlignment);
2497     StubCodeMark mark(this, "StubRoutines", name);
2498     address start = __ pc();
2499 
2500     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2501     const Register from        = rdi;  // source array address
2502     const Register to          = rsi;  // destination array address
2503     const Register qword_count = rdx;  // elements count
2504     const Register end_from    = from; // source array end address
2505     const Register end_to      = rcx;  // destination array end address
2506     const Register saved_count = r11;
2507     // End pointers are inclusive, and if count is not zero they point
2508     // to the last unit copied:  end_to[0] := end_from[0]
2509 
2510     __ enter(); // required for proper stackwalking of RuntimeStub frame
2511     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2512     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2513 
2514     if (entry != NULL) {
2515       *entry = __ pc();
2516       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2517       BLOCK_COMMENT("Entry:");
2518     }
2519 
2520     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2521                                      // r9 is used to save r15_thread
2522     // 'from', 'to' and 'qword_count' are now valid
2523 
2524     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2525     if (dest_uninitialized) {
2526       decorators |= IS_DEST_UNINITIALIZED;
2527     }
2528     if (aligned) {
2529       decorators |= ARRAYCOPY_ALIGNED;
2530     }
2531 
2532     BasicType type = is_oop ? T_OBJECT : T_LONG;
2533     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2534     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2535     {
2536       // UnsafeCopyMemory page error: continue after ucm
2537       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2538 
2539       // Copy from low to high addresses.  Use 'to' as scratch.
2540       __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2541       __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2542       __ negptr(qword_count);
2543       __ jmp(L_copy_bytes);
2544 
2545       // Copy trailing qwords
2546     __ BIND(L_copy_8_bytes);
2547       __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2548       __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2549       __ increment(qword_count);
2550       __ jcc(Assembler::notZero, L_copy_8_bytes);
2551     }
2552     if (is_oop) {
2553       __ jmp(L_exit);
2554     } else {
2555       restore_arg_regs_using_thread();
2556       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2557       __ xorptr(rax, rax); // return 0
2558       __ vzeroupper();
2559       __ leave(); // required for proper stackwalking of RuntimeStub frame
2560       __ ret(0);
2561     }
2562 
2563     {
2564       // UnsafeCopyMemory page error: continue after ucm
2565       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2566       // Copy in multi-byte chunks
2567       copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2568     }
2569 
2570     __ BIND(L_exit);
2571     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2572     restore_arg_regs_using_thread();
2573     if (is_oop) {
2574       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2575     } else {
2576       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2577     }
2578     __ vzeroupper();
2579     __ xorptr(rax, rax); // return 0
2580     __ leave(); // required for proper stackwalking of RuntimeStub frame
2581     __ ret(0);
2582 
2583     return start;
2584   }
2585 
2586   // Arguments:
2587   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2588   //             ignored
2589   //   is_oop  - true => oop array, so generate store check code
2590   //   name    - stub name string
2591   //
2592   // Inputs:
2593   //   c_rarg0   - source array address
2594   //   c_rarg1   - destination array address
2595   //   c_rarg2   - element count, treated as ssize_t, can be zero
2596   //
2597   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2598                                           address nooverlap_target, address *entry,
2599                                           const char *name, bool dest_uninitialized = false) {
2600 #if COMPILER2_OR_JVMCI
2601     if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize  >= 32) {
2602        return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
2603                                                  nooverlap_target, aligned, is_oop, dest_uninitialized);
2604     }
2605 #endif
2606     __ align(CodeEntryAlignment);
2607     StubCodeMark mark(this, "StubRoutines", name);
2608     address start = __ pc();
2609 
2610     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2611     const Register from        = rdi;  // source array address
2612     const Register to          = rsi;  // destination array address
2613     const Register qword_count = rdx;  // elements count
2614     const Register saved_count = rcx;
2615 
2616     __ enter(); // required for proper stackwalking of RuntimeStub frame
2617     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2618 
2619     if (entry != NULL) {
2620       *entry = __ pc();
2621       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2622       BLOCK_COMMENT("Entry:");
2623     }
2624 
2625     array_overlap_test(nooverlap_target, Address::times_8);
2626     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2627                                    // r9 is used to save r15_thread
2628     // 'from', 'to' and 'qword_count' are now valid
2629 
2630     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2631     if (dest_uninitialized) {
2632       decorators |= IS_DEST_UNINITIALIZED;
2633     }
2634     if (aligned) {
2635       decorators |= ARRAYCOPY_ALIGNED;
2636     }
2637 
2638     BasicType type = is_oop ? T_OBJECT : T_LONG;
2639     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2640     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2641     {
2642       // UnsafeCopyMemory page error: continue after ucm
2643       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2644 
2645       __ jmp(L_copy_bytes);
2646 
2647       // Copy trailing qwords
2648     __ BIND(L_copy_8_bytes);
2649       __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2650       __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2651       __ decrement(qword_count);
2652       __ jcc(Assembler::notZero, L_copy_8_bytes);
2653     }
2654     if (is_oop) {
2655       __ jmp(L_exit);
2656     } else {
2657       restore_arg_regs_using_thread();
2658       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2659       __ xorptr(rax, rax); // return 0
2660       __ vzeroupper();
2661       __ leave(); // required for proper stackwalking of RuntimeStub frame
2662       __ ret(0);
2663     }
2664     {
2665       // UnsafeCopyMemory page error: continue after ucm
2666       UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2667 
2668       // Copy in multi-byte chunks
2669       copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2670     }
2671     __ BIND(L_exit);
2672     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2673     restore_arg_regs_using_thread();
2674     if (is_oop) {
2675       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2676     } else {
2677       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2678     }
2679     __ vzeroupper();
2680     __ xorptr(rax, rax); // return 0
2681     __ leave(); // required for proper stackwalking of RuntimeStub frame
2682     __ ret(0);
2683 
2684     return start;
2685   }
2686 
2687 
2688   // Helper for generating a dynamic type check.
2689   // Smashes no registers.
2690   void generate_type_check(Register sub_klass,
2691                            Register super_check_offset,
2692                            Register super_klass,
2693                            Label& L_success) {
2694     assert_different_registers(sub_klass, super_check_offset, super_klass);
2695 
2696     BLOCK_COMMENT("type_check:");
2697 
2698     Label L_miss;
2699 
2700     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2701                                      super_check_offset);
2702     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2703 
2704     // Fall through on failure!
2705     __ BIND(L_miss);
2706   }
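  // Usage sketch (illustrative, hedged): a caller branches to its success label
  // when sub_klass is a subtype of super_klass and falls through otherwise, e.g.
  //
  //   Label L_ok;
  //   generate_type_check(r11_klass, ckoff, ckval, L_ok);
  //   // ... not a subtype: handle the failing element here ...
  //   __ BIND(L_ok);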
2707 
2708   //
2709   //  Generate checkcasting array copy stub
2710   //
2711   //  Input:
2712   //    c_rarg0   - source array address
2713   //    c_rarg1   - destination array address
2714   //    c_rarg2   - element count, treated as ssize_t, can be zero
2715   //    c_rarg3   - size_t ckoff (super_check_offset)
2716   // not Win64
2717   //    c_rarg4   - oop ckval (super_klass)
2718   // Win64
2719   //    rsp+40    - oop ckval (super_klass)
2720   //
2721   //  Output:
2722   //    rax ==  0  -  success
2723   //    rax == -1^K - failure, where K is partial transfer count
2724   //
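  //  Caller-side sketch (illustrative, hedged): interpreting the return value,
  //  where 'result' is a hypothetical variable holding rax after the call:
  //
  //    if (result != 0) {
  //      size_t copied = ~result;   // == K, elements transferred before the failing one
  //      // ... report the failure, accounting for the 'copied' elements ...
  //    }
  //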
2725   address generate_checkcast_copy(const char *name, address *entry,
2726                                   bool dest_uninitialized = false) {
2727 
2728     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2729 
2730     // Input registers (after setup_arg_regs)
2731     const Register from        = rdi;   // source array address
2732     const Register to          = rsi;   // destination array address
2733     const Register length      = rdx;   // elements count
2734     const Register ckoff       = rcx;   // super_check_offset
2735     const Register ckval       = r8;    // super_klass
2736 
2737     // Registers used as temps (r13, r14 are save-on-entry)
2738     const Register end_from    = from;  // source array end address
2739     const Register end_to      = r13;   // destination array end address
2740     const Register count       = rdx;   // -(count_remaining)
2741     const Register r14_length  = r14;   // saved copy of length
2742     // End pointers are inclusive, and if length is not zero they point
2743     // to the last unit copied:  end_to[0] := end_from[0]
2744 
2745     const Register rax_oop    = rax;    // actual oop copied
2746     const Register r11_klass  = r11;    // oop._klass
2747 
2748     //---------------------------------------------------------------
2749     // Assembler stub will be used for this call to arraycopy
2750     // if the two arrays are subtypes of Object[] but the
2751     // destination array type is not equal to or a supertype
2752     // of the source type.  Each element must be separately
2753     // checked.
2754 
2755     __ align(CodeEntryAlignment);
2756     StubCodeMark mark(this, "StubRoutines", name);
2757     address start = __ pc();
2758 
2759     __ enter(); // required for proper stackwalking of RuntimeStub frame
2760 
2761 #ifdef ASSERT
2762     // caller guarantees that the arrays really are different
2763     // otherwise, we would have to make conjoint checks
2764     { Label L;
2765       array_overlap_test(L, TIMES_OOP);
2766       __ stop("checkcast_copy within a single array");
2767       __ bind(L);
2768     }
2769 #endif //ASSERT
2770 
2771     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2772                        // ckoff => rcx, ckval => r8
2773                        // r9 and r10 may be used to save non-volatile registers
2774 #ifdef _WIN64
2775     // last argument (#4) is on stack on Win64
2776     __ movptr(ckval, Address(rsp, 6 * wordSize));
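         // (6 * wordSize past rsp after enter(): saved rbp + return address +
         //  the four Win64 register-home slots = the fifth argument's stack slot)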
2777 #endif
2778 
2779     // Caller of this entry point must set up the argument registers.
2780     if (entry != NULL) {
2781       *entry = __ pc();
2782       BLOCK_COMMENT("Entry:");
2783     }
2784 
2785     // allocate spill slots for r13, r14
2786     enum {
2787       saved_r13_offset,
2788       saved_r14_offset,
2789       saved_r10_offset,
2790       saved_rbp_offset
2791     };
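         // saved_rbp_offset (== 3) serves only as the slot count: three words are
         // reserved below to spill r13, r14 and r10; rbp was already saved by enter().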
2792     __ subptr(rsp, saved_rbp_offset * wordSize);
2793     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2794     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2795     __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2796 
2797 #ifdef ASSERT
2798       Label L2;
2799       __ get_thread(r14);
2800       __ cmpptr(r15_thread, r14);
2801       __ jcc(Assembler::equal, L2);
2802       __ stop("StubRoutines::checkcast_copy: r15_thread is modified by call");
2803       __ bind(L2);
2804 #endif // ASSERT
2805 
2806     // check that int operands are properly extended to size_t
2807     assert_clean_int(length, rax);
2808     assert_clean_int(ckoff, rax);
2809 
2810 #ifdef ASSERT
2811     BLOCK_COMMENT("assert consistent ckoff/ckval");
2812     // The ckoff and ckval must be mutually consistent,
2813     // even though caller generates both.
2814     { Label L;
2815       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2816       __ cmpl(ckoff, Address(ckval, sco_offset));
2817       __ jcc(Assembler::equal, L);
2818       __ stop("super_check_offset inconsistent");
2819       __ bind(L);
2820     }
2821 #endif //ASSERT
2822 
2823     // Loop-invariant addresses.  They are exclusive end pointers.
2824     Address end_from_addr(from, length, TIMES_OOP, 0);
2825     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2826     // Loop-variant addresses.  They assume post-incremented count < 0.
2827     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2828     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2829 
2830     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2831     if (dest_uninitialized) {
2832       decorators |= IS_DEST_UNINITIALIZED;
2833     }
2834 
2835     BasicType type = T_OBJECT;
2836     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2837     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2838 
2839     // Copy from low to high addresses, indexed from the end of each array.
2840     __ lea(end_from, end_from_addr);
2841     __ lea(end_to,   end_to_addr);
2842     __ movptr(r14_length, length);        // save a copy of the length
2843     assert(length == count, "");          // else fix next line:
2844     __ negptr(count);                     // negate and test the length
2845     __ jcc(Assembler::notZero, L_load_element);
2846 
2847     // Empty array:  Nothing to do.
2848     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2849     __ jmp(L_done);
2850 
2851     // ======== begin loop ========
2852     // (Loop is rotated; its entry is L_load_element.)
2853     // Loop control:
2854     //   for (count = -count; count != 0; count++)
2855     // Base pointers src, dst are biased by 8*(count-1), to the last element.
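         // Illustrative C-style sketch of the rotated loop below (pseudocode only,
         // not real HotSpot calls):
         //   for (count = -length; count != 0; count++) {
         //     oop o = end_from[count];
         //     if (o != NULL && !is_subtype(klass_of(o), dest_elem_klass)) break;  // failure path
         //     end_to[count] = o;
         //   }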
2856     __ align(OptoLoopAlignment);
2857 
2858     __ BIND(L_store_element);
2859     __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
2860     __ increment(count);               // increment the count toward zero
2861     __ jcc(Assembler::zero, L_do_card_marks);
2862 
2863     // ======== loop entry is here ========
2864     __ BIND(L_load_element);
2865     __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2866     __ testptr(rax_oop, rax_oop);
2867     __ jcc(Assembler::zero, L_store_element);
2868 
2869     __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
2870     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2871     // ======== end loop ========
2872 
2873     // It was a real error; we must depend on the caller to finish the job.
2874     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2875     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2876     // and report their number to the caller.
2877     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2878     Label L_post_barrier;
2879     __ addptr(r14_length, count);     // K = (original - remaining) oops
2880     __ movptr(rax, r14_length);       // save the value
2881     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2882     __ jccb(Assembler::notZero, L_post_barrier);
2883     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2884 
2885     // Come here on success only.
2886     __ BIND(L_do_card_marks);
2887     __ xorptr(rax, rax);              // return 0 on success
2888 
2889     __ BIND(L_post_barrier);
2890     bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2891 
2892     // Common exit point (success or failure).
2893     __ BIND(L_done);
2894     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2895     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2896     __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2897     restore_arg_regs();
2898     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2899     __ leave(); // required for proper stackwalking of RuntimeStub frame
2900     __ ret(0);
2901 
2902     return start;
2903   }
2904 
2905   //
2906   //  Generate 'unsafe' array copy stub
2907   //  Though just as safe as the other stubs, it takes an unscaled
2908   //  size_t argument instead of an element count.
2909   //
2910   //  Input:
2911   //    c_rarg0   - source array address
2912   //    c_rarg1   - destination array address
2913   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2914   //
2915   // Examines the alignment of the operands and dispatches
2916   // to a long, int, short, or byte copy loop.
2917   //
2918   address generate_unsafe_copy(const char *name,
2919                                address byte_copy_entry, address short_copy_entry,
2920                                address int_copy_entry, address long_copy_entry) {
2921 
2922     Label L_long_aligned, L_int_aligned, L_short_aligned;
2923 
2924     // Input registers (before setup_arg_regs)
2925     const Register from        = c_rarg0;  // source array address
2926     const Register to          = c_rarg1;  // destination array address
2927     const Register size        = c_rarg2;  // byte count (size_t)
2928 
2929     // Register used as a temp
2930     const Register bits        = rax;      // test copy of low bits
2931 
2932     __ align(CodeEntryAlignment);
2933     StubCodeMark mark(this, "StubRoutines", name);
2934     address start = __ pc();
2935 
2936     __ enter(); // required for proper stackwalking of RuntimeStub frame
2937 
2938     // bump this on entry, not on exit:
2939     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2940 
2941     __ mov(bits, from);
2942     __ orptr(bits, to);
2943     __ orptr(bits, size);
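         // 'bits' now holds (from | to | size); a set low-order bit in any of the
         // three operands rules out the corresponding alignment.  Illustrative:
         // from=0x1008, to=0x2010, size=0x20 -> low byte 0x38, so the copy is
         // 8-byte aligned and dispatches to the long copy below.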
2944 
2945     __ testb(bits, BytesPerLong-1);
2946     __ jccb(Assembler::zero, L_long_aligned);
2947 
2948     __ testb(bits, BytesPerInt-1);
2949     __ jccb(Assembler::zero, L_int_aligned);
2950 
2951     __ testb(bits, BytesPerShort-1);
2952     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2953 
2954     __ BIND(L_short_aligned);
2955     __ shrptr(size, LogBytesPerShort); // size => short_count
2956     __ jump(RuntimeAddress(short_copy_entry));
2957 
2958     __ BIND(L_int_aligned);
2959     __ shrptr(size, LogBytesPerInt); // size => int_count
2960     __ jump(RuntimeAddress(int_copy_entry));
2961 
2962     __ BIND(L_long_aligned);
2963     __ shrptr(size, LogBytesPerLong); // size => qword_count
2964     __ jump(RuntimeAddress(long_copy_entry));
2965 
2966     return start;
2967   }
2968 
2969   // Perform range checks on the proposed arraycopy.
2970   // Kills temp, but nothing else.
2971   // Also, clean the sign bits of src_pos and dst_pos.
2972   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2973                               Register src_pos, // source position (c_rarg1)
2974                               Register dst,     // destination array oop (c_rarg2)
2975                               Register dst_pos, // destination position (c_rarg3)
2976                               Register length,
2977                               Register temp,
2978                               Label& L_failed) {
2979     BLOCK_COMMENT("arraycopy_range_checks:");
2980 
2981     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2982     __ movl(temp, length);
2983     __ addl(temp, src_pos);             // src_pos + length
2984     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2985     __ jcc(Assembler::above, L_failed);
2986 
2987     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2988     __ movl(temp, length);
2989     __ addl(temp, dst_pos);             // dst_pos + length
2990     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2991     __ jcc(Assembler::above, L_failed);
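         // Note: 'above' is an unsigned compare, so a 32-bit sum that wrapped
         // around to a negative value also takes the failure path.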
2992 
2993     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2994     // A move with sign extension can be used since both values are known to be positive.
2995     __ movslq(src_pos, src_pos);
2996     __ movslq(dst_pos, dst_pos);
2997 
2998     BLOCK_COMMENT("arraycopy_range_checks done");
2999   }
3000 
3001   //
3002   //  Generate generic array copy stubs
3003   //
3004   //  Input:
3005   //    c_rarg0    -  src oop
3006   //    c_rarg1    -  src_pos (32-bits)
3007   //    c_rarg2    -  dst oop
3008   //    c_rarg3    -  dst_pos (32-bits)
3009   // not Win64
3010   //    c_rarg4    -  element count (32-bits)
3011   // Win64
3012   //    rsp+40     -  element count (32-bits)
3013   //
3014   //  Output:
3015   //    rax ==  0  -  success
3016   //    rax == -1^K - failure, where K is partial transfer count
3017   //
3018   address generate_generic_copy(const char *name,
3019                                 address byte_copy_entry, address short_copy_entry,
3020                                 address int_copy_entry, address oop_copy_entry,
3021                                 address long_copy_entry, address checkcast_copy_entry) {
3022 
3023     Label L_failed, L_failed_0, L_objArray;
3024     Label L_copy_shorts, L_copy_ints, L_copy_longs;
3025 
3026     // Input registers
3027     const Register src        = c_rarg0;  // source array oop
3028     const Register src_pos    = c_rarg1;  // source position
3029     const Register dst        = c_rarg2;  // destination array oop
3030     const Register dst_pos    = c_rarg3;  // destination position
3031 #ifndef _WIN64
3032     const Register length     = c_rarg4;
3033     const Register rklass_tmp = r9;  // load_klass
3034 #else
3035     const Address  length(rsp, 7 * wordSize);  // elements count is on stack on Win64
3036     const Register rklass_tmp = rdi;  // load_klass
3037 #endif
3038 
3039     { int modulus = CodeEntryAlignment;
3040       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
3041       int advance = target - (__ offset() % modulus);
3042       if (advance < 0)  advance += modulus;
3043       if (advance > 0)  __ nop(advance);
3044     }
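         // The padding above positions the 5-byte jmp emitted at L_failed_0 so that
         // it ends exactly on a CodeEntryAlignment boundary; the stub entry that
         // follows then needs no further padding (see the assert below).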
3045     StubCodeMark mark(this, "StubRoutines", name);
3046 
3047     // Short-hop target to L_failed.  Makes for denser prologue code.
3048     __ BIND(L_failed_0);
3049     __ jmp(L_failed);
3050     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
3051 
3052     __ align(CodeEntryAlignment);
3053     address start = __ pc();
3054 
3055     __ enter(); // required for proper stackwalking of RuntimeStub frame
3056 
3057 #ifdef _WIN64
3058     __ push(rklass_tmp); // rdi is callee-save on Windows
3059 #endif
3060 
3061     // bump this on entry, not on exit:
3062     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
3063 
3064     //-----------------------------------------------------------------------
3065     // Assembler stub will be used for this call to arraycopy
3066     // if the following conditions are met:
3067     //
3068     // (1) src and dst must not be null.
3069     // (2) src_pos must not be negative.
3070     // (3) dst_pos must not be negative.
3071     // (4) length  must not be negative.
3072     // (5) src klass and dst klass should be the same and not NULL.
3073     // (6) src and dst should be arrays.
3074     // (7) src_pos + length must not exceed length of src.
3075     // (8) dst_pos + length must not exceed length of dst.
3076     //
3077 
3078     //  if (src == NULL) return -1;
3079     __ testptr(src, src);         // src oop
3080     size_t j1off = __ offset();
3081     __ jccb(Assembler::zero, L_failed_0);
3082 
3083     //  if (src_pos < 0) return -1;
3084     __ testl(src_pos, src_pos); // src_pos (32-bits)
3085     __ jccb(Assembler::negative, L_failed_0);
3086 
3087     //  if (dst == NULL) return -1;
3088     __ testptr(dst, dst);         // dst oop
3089     __ jccb(Assembler::zero, L_failed_0);
3090 
3091     //  if (dst_pos < 0) return -1;
3092     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3093     size_t j4off = __ offset();
3094     __ jccb(Assembler::negative, L_failed_0);
3095 
3096     // The first four tests are very dense code,
3097     // but not quite dense enough to put four
3098     // jumps in a 16-byte instruction fetch buffer.
3099     // That's good, because some branch predictors
3100     // do not like jumps so close together.
3101     // Make sure of this.
3102     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
3103 
3104     // registers used as temp
3105     const Register r11_length    = r11; // elements count to copy
3106     const Register r10_src_klass = r10; // array klass
3107 
3108     //  if (length < 0) return -1;
3109     __ movl(r11_length, length);        // length (elements count, 32-bits value)
3110     __ testl(r11_length, r11_length);
3111     __ jccb(Assembler::negative, L_failed_0);
3112 
3113     __ load_klass(r10_src_klass, src, rklass_tmp);
3114 #ifdef ASSERT
3115     //  assert(src->klass() != NULL);
3116     {
3117       BLOCK_COMMENT("assert klasses not null {");
3118       Label L1, L2;
3119       __ testptr(r10_src_klass, r10_src_klass);
3120       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
3121       __ bind(L1);
3122       __ stop("broken null klass");
3123       __ bind(L2);
3124       __ load_klass(rax, dst, rklass_tmp);
3125       __ cmpq(rax, 0);
3126       __ jcc(Assembler::equal, L1);     // this would be broken also
3127       BLOCK_COMMENT("} assert klasses not null done");
3128     }
3129 #endif
3130 
3131     // Load layout helper (32-bits)
3132     //
3133     //  |array_tag|     | header_size | element_type |     |log2_element_size|
3134     // 32        30    24            16              8     2                 0
3135     //
3136     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3137     //
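         // Illustrative decoding of the fields used below (mirrors the shift/mask
         // operations applied to rax_lh further down, not executed here):
         //   array_offset = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
         //   log2_esize   =  lh &  _lh_log2_element_size_mask;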
3138 
3139     const int lh_offset = in_bytes(Klass::layout_helper_offset());
3140 
3141     // Handle objArrays completely differently...
3142     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3143     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3144     __ jcc(Assembler::equal, L_objArray);
3145 
3146     //  if (src->klass() != dst->klass()) return -1;
3147     __ load_klass(rax, dst, rklass_tmp);
3148     __ cmpq(r10_src_klass, rax);
3149     __ jcc(Assembler::notEqual, L_failed);
3150 
3151     const Register rax_lh = rax;  // layout helper
3152     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3153 
3154     // Check for flat inline type array -> return -1
3155     __ testl(rax_lh, Klass::_lh_array_tag_vt_value_bit_inplace);
3156     __ jcc(Assembler::notZero, L_failed);
3157 
3158     // Check for null-free (non-flat) inline type array -> handle as object array
3159     __ testl(rax_lh, Klass::_lh_null_free_bit_inplace);
3160     __ jcc(Assembler::notZero, L_objArray);
3161 
3162     //  if (!src->is_Array()) return -1;
3163     __ cmpl(rax_lh, Klass::_lh_neutral_value);
3164     __ jcc(Assembler::greaterEqual, L_failed);
3165 
3166     // At this point, it is known to be a typeArray (array_tag 0x3).
3167 #ifdef ASSERT
3168     {
3169       BLOCK_COMMENT("assert primitive array {");
3170       Label L;
3171       __ movl(rklass_tmp, rax_lh);
3172       __ sarl(rklass_tmp, Klass::_lh_array_tag_shift);
3173       __ cmpl(rklass_tmp, Klass::_lh_array_tag_type_value);
3174       __ jcc(Assembler::equal, L);
3175       __ stop("must be a primitive array");
3176       __ bind(L);
3177       BLOCK_COMMENT("} assert primitive array done");
3178     }
3179 #endif
3180 
3181     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3182                            r10, L_failed);
3183 
3184     // TypeArrayKlass
3185     //
3186     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3187     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3188     //
3189 
3190     const Register r10_offset = r10;    // array offset
3191     const Register rax_elsize = rax_lh; // element size
3192 
3193     __ movl(r10_offset, rax_lh);
3194     __ shrl(r10_offset, Klass::_lh_header_size_shift);
3195     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
3196     __ addptr(src, r10_offset);           // src array offset
3197     __ addptr(dst, r10_offset);           // dst array offset
3198     BLOCK_COMMENT("choose copy loop based on element size");
3199     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3200 
3201 #ifdef _WIN64
3202     __ pop(rklass_tmp); // Restore callee-save rdi
3203 #endif
3204 
3205     // The next registers must be set before the jump to the corresponding stub.
3206     const Register from     = c_rarg0;  // source array address
3207     const Register to       = c_rarg1;  // destination array address
3208     const Register count    = c_rarg2;  // elements count
3209 
3210     // The 'from', 'to', 'count' registers must be set in this order
3211     // since they alias 'src', 'src_pos', 'dst' respectively.
3212 
3213     __ cmpl(rax_elsize, 0);
3214     __ jccb(Assembler::notEqual, L_copy_shorts);
3215     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3216     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3217     __ movl2ptr(count, r11_length); // length
3218     __ jump(RuntimeAddress(byte_copy_entry));
3219 
3220   __ BIND(L_copy_shorts);
3221     __ cmpl(rax_elsize, LogBytesPerShort);
3222     __ jccb(Assembler::notEqual, L_copy_ints);
3223     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3224     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3225     __ movl2ptr(count, r11_length); // length
3226     __ jump(RuntimeAddress(short_copy_entry));
3227 
3228   __ BIND(L_copy_ints);
3229     __ cmpl(rax_elsize, LogBytesPerInt);
3230     __ jccb(Assembler::notEqual, L_copy_longs);
3231     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3232     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3233     __ movl2ptr(count, r11_length); // length
3234     __ jump(RuntimeAddress(int_copy_entry));
3235 
3236   __ BIND(L_copy_longs);
3237 #ifdef ASSERT
3238     {
3239       BLOCK_COMMENT("assert long copy {");
3240       Label L;
3241       __ cmpl(rax_elsize, LogBytesPerLong);
3242       __ jcc(Assembler::equal, L);
3243       __ stop("must be long copy, but elsize is wrong");
3244       __ bind(L);
3245       BLOCK_COMMENT("} assert long copy done");
3246     }
3247 #endif
3248     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3249     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3250     __ movl2ptr(count, r11_length); // length
3251     __ jump(RuntimeAddress(long_copy_entry));
3252 
3253     // ObjArrayKlass
3254   __ BIND(L_objArray);
3255     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3256 
3257     Label L_plain_copy, L_checkcast_copy;
3258     //  test array classes for subtyping
3259     __ load_klass(rax, dst, rklass_tmp);
3260     __ cmpq(r10_src_klass, rax); // usual case is exact equality
3261     __ jcc(Assembler::notEqual, L_checkcast_copy);
3262 
3263     // Identically typed arrays can be copied without element-wise checks.
3264     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3265                            r10, L_failed);
3266 
3267     __ lea(from, Address(src, src_pos, TIMES_OOP,
3268                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3269     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3270                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3271     __ movl2ptr(count, r11_length); // length
3272   __ BIND(L_plain_copy);
3273 #ifdef _WIN64
3274     __ pop(rklass_tmp); // Restore callee-save rdi
3275 #endif
3276     __ jump(RuntimeAddress(oop_copy_entry));
3277 
3278   __ BIND(L_checkcast_copy);
3279     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3280     {
3281       // Before looking at dst.length, make sure dst is also an objArray.
3282       // This check also fails for flat/null-free arrays which are not supported.
3283       __ cmpl(Address(rax, lh_offset), objArray_lh);
3284       __ jcc(Assembler::notEqual, L_failed);
3285 
3286 #ifdef ASSERT
3287       {
3288         BLOCK_COMMENT("assert not null-free array {");
3289         Label L;
3290         __ movl(rklass_tmp, Address(rax, lh_offset));
3291         __ testl(rklass_tmp, Klass::_lh_null_free_bit_inplace);
3292         __ jcc(Assembler::zero, L);
3293         __ stop("unexpected null-free array");
3294         __ bind(L);
3295         BLOCK_COMMENT("} assert not null-free array");
3296       }
3297 #endif
3298 
3299       // It is safe to examine both src.length and dst.length.
3300       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3301                              rax, L_failed);
3302 
3303       const Register r11_dst_klass = r11;
3304       __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3305 
3306       // Marshal the base address arguments now, freeing registers.
3307       __ lea(from, Address(src, src_pos, TIMES_OOP,
3308                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3309       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3310                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3311       __ movl(count, length);           // length (reloaded)
3312       Register sco_temp = c_rarg3;      // this register is free now
3313       assert_different_registers(from, to, count, sco_temp,
3314                                  r11_dst_klass, r10_src_klass);
3315       assert_clean_int(count, sco_temp);
3316 
3317       // Generate the type check.
3318       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3319       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3320       assert_clean_int(sco_temp, rax);
3321       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3322 
3323       // Fetch destination element klass from the ObjArrayKlass header.
3324       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3325       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3326       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3327       assert_clean_int(sco_temp, rax);
3328 
3329 #ifdef _WIN64
3330       __ pop(rklass_tmp); // Restore callee-save rdi
3331 #endif
3332 
3333       // the checkcast_copy loop needs two extra arguments:
3334       assert(c_rarg3 == sco_temp, "#3 already in place");
3335       // Set up arguments for checkcast_copy_entry.
3336       setup_arg_regs(4);
3337       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3338       __ jump(RuntimeAddress(checkcast_copy_entry));
3339     }
3340 
3341   __ BIND(L_failed);
3342 #ifdef _WIN64
3343     __ pop(rklass_tmp); // Restore callee-save rdi
3344 #endif
3345     __ xorptr(rax, rax);
3346     __ notptr(rax); // return -1
3347     __ leave();   // required for proper stackwalking of RuntimeStub frame
3348     __ ret(0);
3349 
3350     return start;
3351   }
3352 
3353   address generate_data_cache_writeback() {
3354     const Register src        = c_rarg0;  // source address
3355 
3356     __ align(CodeEntryAlignment);
3357 
3358     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
3359 
3360     address start = __ pc();
3361     __ enter();
3362     __ cache_wb(Address(src, 0));
3363     __ leave();
3364     __ ret(0);
3365 
3366     return start;
3367   }
3368 
3369   address generate_data_cache_writeback_sync() {
3370     const Register is_pre    = c_rarg0;  // pre or post sync
3371 
3372     __ align(CodeEntryAlignment);
3373 
3374     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
3375 
3376     // pre wbsync is a no-op
3377     // post wbsync translates to an sfence
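         // (is_pre != 0 branches over the barrier below; is_pre == 0 falls through
         //  to cache_wbsync(false), which emits the post-sync fence noted above)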
3378 
3379     Label skip;
3380     address start = __ pc();
3381     __ enter();
3382     __ cmpl(is_pre, 0);
3383     __ jcc(Assembler::notEqual, skip);
3384     __ cache_wbsync(false);
3385     __ bind(skip);
3386     __ leave();
3387     __ ret(0);
3388 
3389     return start;
3390   }
3391 
3392   void generate_arraycopy_stubs() {
3393     address entry;
3394     address entry_jbyte_arraycopy;
3395     address entry_jshort_arraycopy;
3396     address entry_jint_arraycopy;
3397     address entry_oop_arraycopy;
3398     address entry_jlong_arraycopy;
3399     address entry_checkcast_arraycopy;
3400 
3401     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
3402                                                                            "jbyte_disjoint_arraycopy");
3403     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3404                                                                            "jbyte_arraycopy");
3405 
3406     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3407                                                                             "jshort_disjoint_arraycopy");
3408     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3409                                                                             "jshort_arraycopy");
3410 
3411     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
3412                                                                               "jint_disjoint_arraycopy");
3413     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
3414                                                                               &entry_jint_arraycopy, "jint_arraycopy");
3415 
3416     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
3417                                                                                "jlong_disjoint_arraycopy");
3418     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
3419                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
3420 
3421 
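    // With compressed oops, heap references are 4 bytes wide, so the oop stubs
    // reuse the int copy loops; otherwise they reuse the long copy loops.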
3422     if (UseCompressedOops) {
3423       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
3424                                                                               "oop_disjoint_arraycopy");
3425       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
3426                                                                               &entry_oop_arraycopy, "oop_arraycopy");
3427       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
3428                                                                                      "oop_disjoint_arraycopy_uninit",
3429                                                                                      /*dest_uninitialized*/true);
3430       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
3431                                                                                      NULL, "oop_arraycopy_uninit",
3432                                                                                      /*dest_uninitialized*/true);
3433     } else {
3434       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
3435                                                                                "oop_disjoint_arraycopy");
3436       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
3437                                                                                &entry_oop_arraycopy, "oop_arraycopy");
3438       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
3439                                                                                       "oop_disjoint_arraycopy_uninit",
3440                                                                                       /*dest_uninitialized*/true);
3441       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
3442                                                                                       NULL, "oop_arraycopy_uninit",
3443                                                                                       /*dest_uninitialized*/true);
3444     }
3445 
3446     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3447     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3448                                                                         /*dest_uninitialized*/true);
3449 
3450     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3451                                                               entry_jbyte_arraycopy,
3452                                                               entry_jshort_arraycopy,
3453                                                               entry_jint_arraycopy,
3454                                                               entry_jlong_arraycopy);
3455     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3456                                                                entry_jbyte_arraycopy,
3457                                                                entry_jshort_arraycopy,
3458                                                                entry_jint_arraycopy,
3459                                                                entry_oop_arraycopy,
3460                                                                entry_jlong_arraycopy,
3461                                                                entry_checkcast_arraycopy);
3462 
3463     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3464     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3465     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3466     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3467     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3468     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3469 
3470     // We don't generate specialized code for HeapWord-aligned source
3471     // arrays, so just use the code we've already generated
3472     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3473     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3474 
3475     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3476     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3477 
3478     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3479     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3480 
3481     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3482     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3483 
3484     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3485     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3486 
3487     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3488     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3489   }
3490 
3491   // AES intrinsic stubs
3492   enum {AESBlockSize = 16};
3493 
3494   address generate_key_shuffle_mask() {
3495     __ align(16);
3496     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3497     address start = __ pc();
3498     __ emit_data64( 0x0405060700010203, relocInfo::none );
3499     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
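         // The mask reverses the byte order within each 32-bit lane; pshufb with it
         // (see load_key) converts the little-endian ints of the Java expanded key
         // into the byte order the AES round instructions expect.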
3500     return start;
3501   }
3502 
3503   address generate_counter_shuffle_mask() {
3504     __ align(16);
3505     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3506     address start = __ pc();
3507     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3508     __ emit_data64(0x0001020304050607, relocInfo::none);
3509     return start;
3510   }
3511 
3512   // Utility routine for loading a 128-bit key word in little-endian format;
3513   // the shuffle mask may optionally be supplied already loaded in an XMM register.
3514   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3515     __ movdqu(xmmdst, Address(key, offset));
3516     if (xmm_shuf_mask != NULL) {
3517       __ pshufb(xmmdst, xmm_shuf_mask);
3518     } else {
3519       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3520     }
3521   }
3522 
3523   // Utility routine for incrementing the 128-bit counter (the IV in CTR mode)
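       // (Logically: treat the 128-bit value as two 64-bit halves; low += inc_delta,
       //  and only on carry-out is the high half incremented by one.)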
3524   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3525     __ pextrq(reg, xmmdst, 0x0);
3526     __ addq(reg, inc_delta);
3527     __ pinsrq(xmmdst, reg, 0x0);
3528     __ jcc(Assembler::carryClear, next_block); // jump if no carry
3529     __ pextrq(reg, xmmdst, 0x01); // Carry
3530     __ addq(reg, 0x01);
3531     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3532     __ BIND(next_block);          // next instruction
3533   }
3534 
3535   // Arguments:
3536   //
3537   // Inputs:
3538   //   c_rarg0   - source byte array address
3539   //   c_rarg1   - destination byte array address
3540   //   c_rarg2   - K (key) in little endian int array
3541   //
3542   address generate_aescrypt_encryptBlock() {
3543     assert(UseAES, "need AES instructions and misaligned SSE support");
3544     __ align(CodeEntryAlignment);
3545     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3546     Label L_doLast;
3547     address start = __ pc();
3548 
3549     const Register from        = c_rarg0;  // source array address
3550     const Register to          = c_rarg1;  // destination array address
3551     const Register key         = c_rarg2;  // key array address
3552     const Register keylen      = rax;
3553 
3554     const XMMRegister xmm_result = xmm0;
3555     const XMMRegister xmm_key_shuf_mask = xmm1;
3556     // On Win64, xmm6-xmm15 must be preserved, so don't use them.
3557     const XMMRegister xmm_temp1  = xmm2;
3558     const XMMRegister xmm_temp2  = xmm3;
3559     const XMMRegister xmm_temp3  = xmm4;
3560     const XMMRegister xmm_temp4  = xmm5;
3561 
3562     __ enter(); // required for proper stackwalking of RuntimeStub frame
3563 
3564     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
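         // (i.e. AES-128/192/256 expanded keys: (rounds + 1) * 4 ints with 10/12/14 rounds)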
3565     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3566 
3567     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3568     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3569 
3570     // For encryption, the Java expanded key ordering is just what we need.
3571     // We don't know if the key is aligned, hence we do not use the load-execute form.
3572 
3573     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3574     __ pxor(xmm_result, xmm_temp1);
3575 
3576     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3577     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3578     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3579     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3580 
3581     __ aesenc(xmm_result, xmm_temp1);
3582     __ aesenc(xmm_result, xmm_temp2);
3583     __ aesenc(xmm_result, xmm_temp3);
3584     __ aesenc(xmm_result, xmm_temp4);
3585 
3586     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3587     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3588     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3589     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3590 
3591     __ aesenc(xmm_result, xmm_temp1);
3592     __ aesenc(xmm_result, xmm_temp2);
3593     __ aesenc(xmm_result, xmm_temp3);
3594     __ aesenc(xmm_result, xmm_temp4);
3595 
3596     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3597     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3598 
3599     __ cmpl(keylen, 44);
3600     __ jccb(Assembler::equal, L_doLast);
3601 
3602     __ aesenc(xmm_result, xmm_temp1);
3603     __ aesenc(xmm_result, xmm_temp2);
3604 
3605     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3606     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3607 
3608     __ cmpl(keylen, 52);
3609     __ jccb(Assembler::equal, L_doLast);
3610 
3611     __ aesenc(xmm_result, xmm_temp1);
3612     __ aesenc(xmm_result, xmm_temp2);
3613 
3614     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3615     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3616 
3617     __ BIND(L_doLast);
3618     __ aesenc(xmm_result, xmm_temp1);
3619     __ aesenclast(xmm_result, xmm_temp2);
3620     __ movdqu(Address(to, 0), xmm_result);        // store the result
3621     __ xorptr(rax, rax); // return 0
3622     __ leave(); // required for proper stackwalking of RuntimeStub frame
3623     __ ret(0);
3624 
3625     return start;
3626   }
3627 
3628 
3629   // Arguments:
3630   //
3631   // Inputs:
3632   //   c_rarg0   - source byte array address
3633   //   c_rarg1   - destination byte array address
3634   //   c_rarg2   - K (key) in little endian int array
3635   //
3636   address generate_aescrypt_decryptBlock() {
3637     assert(UseAES, "need AES instructions and misaligned SSE support");
3638     __ align(CodeEntryAlignment);
3639     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3640     Label L_doLast;
3641     address start = __ pc();
3642 
3643     const Register from        = c_rarg0;  // source array address
3644     const Register to          = c_rarg1;  // destination array address
3645     const Register key         = c_rarg2;  // key array address
3646     const Register keylen      = rax;
3647 
3648     const XMMRegister xmm_result = xmm0;
3649     const XMMRegister xmm_key_shuf_mask = xmm1;
3650     // On Win64, xmm6-xmm15 must be preserved, so don't use them.
3651     const XMMRegister xmm_temp1  = xmm2;
3652     const XMMRegister xmm_temp2  = xmm3;
3653     const XMMRegister xmm_temp3  = xmm4;
3654     const XMMRegister xmm_temp4  = xmm5;
3655 
3656     __ enter(); // required for proper stackwalking of RuntimeStub frame
3657 
3658     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3659     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3660 
3661     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3662     __ movdqu(xmm_result, Address(from, 0));
3663 
3664     // For decryption, the Java expanded key ordering is rotated one position from what we want,
3665     // so we start from 0x10 here and hit 0x00 last.
3666     // We don't know if the key is aligned, hence we do not use the load-execute form.
3667     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3668     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3669     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3670     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3671 
3672     __ pxor  (xmm_result, xmm_temp1);
3673     __ aesdec(xmm_result, xmm_temp2);
3674     __ aesdec(xmm_result, xmm_temp3);
3675     __ aesdec(xmm_result, xmm_temp4);
3676 
3677     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3678     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3679     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3680     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3681 
3682     __ aesdec(xmm_result, xmm_temp1);
3683     __ aesdec(xmm_result, xmm_temp2);
3684     __ aesdec(xmm_result, xmm_temp3);
3685     __ aesdec(xmm_result, xmm_temp4);
3686 
3687     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3688     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3689     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3690 
3691     __ cmpl(keylen, 44);
3692     __ jccb(Assembler::equal, L_doLast);
3693 
3694     __ aesdec(xmm_result, xmm_temp1);
3695     __ aesdec(xmm_result, xmm_temp2);
3696 
3697     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3698     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3699 
3700     __ cmpl(keylen, 52);
3701     __ jccb(Assembler::equal, L_doLast);
3702 
3703     __ aesdec(xmm_result, xmm_temp1);
3704     __ aesdec(xmm_result, xmm_temp2);
3705 
3706     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3707     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3708 
3709     __ BIND(L_doLast);
3710     __ aesdec(xmm_result, xmm_temp1);
3711     __ aesdec(xmm_result, xmm_temp2);
3712 
3713     // for decryption the aesdeclast operation is always on key+0x00
3714     __ aesdeclast(xmm_result, xmm_temp3);
3715     __ movdqu(Address(to, 0), xmm_result);  // store the result
3716     __ xorptr(rax, rax); // return 0
3717     __ leave(); // required for proper stackwalking of RuntimeStub frame
3718     __ ret(0);
3719 
3720     return start;
3721   }
3722 
3723 
3724   // Arguments:
3725   //
3726   // Inputs:
3727   //   c_rarg0   - source byte array address
3728   //   c_rarg1   - destination byte array address
3729   //   c_rarg2   - K (key) in little endian int array
3730   //   c_rarg3   - r vector byte array address
3731   //   c_rarg4   - input length
3732   //
3733   // Output:
3734   //   rax       - input length
3735   //
3736   address generate_cipherBlockChaining_encryptAESCrypt() {
3737     assert(UseAES, "need AES instructions and misaligned SSE support");
3738     __ align(CodeEntryAlignment);
3739     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3740     address start = __ pc();
3741 
3742     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3743     const Register from        = c_rarg0;  // source array address
3744     const Register to          = c_rarg1;  // destination array address
3745     const Register key         = c_rarg2;  // key array address
3746     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3747                                            // and left with the results of the last encryption block
3748 #ifndef _WIN64
3749     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3750 #else
3751     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3752     const Register len_reg     = r11;      // pick the volatile windows register
3753 #endif
3754     const Register pos         = rax;
3755 
3756     // xmm register assignments for the loops below
3757     const XMMRegister xmm_result = xmm0;
3758     const XMMRegister xmm_temp   = xmm1;
3759     // keys 0-10 preloaded into xmm2-xmm12
3760     const int XMM_REG_NUM_KEY_FIRST = 2;
3761     const int XMM_REG_NUM_KEY_LAST  = 15;
3762     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3763     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3764     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3765     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3766     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3767 
3768     __ enter(); // required for proper stackwalking of RuntimeStub frame
3769 
3770 #ifdef _WIN64
3771     // on win64, fill len_reg from stack position
3772     __ movl(len_reg, len_mem);
3773 #else
3774     __ push(len_reg); // Save
3775 #endif
3776 
3777     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3778     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3779     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3780     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3781       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3782       offset += 0x10;
3783     }
3784     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3785 
3786     // Now split into different paths depending on the key length (length in ints of the AESCrypt.KLE array: 44=128-bit, 52=192-bit, 60=256-bit).
3787     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3788     __ cmpl(rax, 44);
3789     __ jcc(Assembler::notEqual, L_key_192_256);
3790 
3791     // 128 bit code follows here
3792     __ movptr(pos, 0);
3793     __ align(OptoLoopAlignment);
3794 
3795     __ BIND(L_loopTop_128);
3796     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3797     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3798     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3799     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3800       __ aesenc(xmm_result, as_XMMRegister(rnum));
3801     }
3802     __ aesenclast(xmm_result, xmm_key10);
3803     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3804     // no need to store r to memory until we exit
3805     __ addptr(pos, AESBlockSize);
3806     __ subptr(len_reg, AESBlockSize);
3807     __ jcc(Assembler::notEqual, L_loopTop_128);
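         // The loop above exits when len_reg reaches exactly zero: the length is a
         // multiple of AESBlockSize (see the note at len_reg above), and 'notEqual'
         // tests the ZF left by the subtraction.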
3808 
3809     __ BIND(L_exit);
3810     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3811 
3812 #ifdef _WIN64
3813     __ movl(rax, len_mem);
3814 #else
3815     __ pop(rax); // return length
3816 #endif
3817     __ leave(); // required for proper stackwalking of RuntimeStub frame
3818     __ ret(0);
3819 
3820     __ BIND(L_key_192_256);
3821     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3822     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3823     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3824     __ cmpl(rax, 52);
3825     __ jcc(Assembler::notEqual, L_key_256);
3826 
3827     // 192-bit code follows here (could be changed to use more xmm registers)
3828     __ movptr(pos, 0);
3829     __ align(OptoLoopAlignment);
3830 
3831     __ BIND(L_loopTop_192);
3832     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3833     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3834     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3835     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3836       __ aesenc(xmm_result, as_XMMRegister(rnum));
3837     }
3838     __ aesenclast(xmm_result, xmm_key12);
3839     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3840     // no need to store r to memory until we exit
3841     __ addptr(pos, AESBlockSize);
3842     __ subptr(len_reg, AESBlockSize);
3843     __ jcc(Assembler::notEqual, L_loopTop_192);
3844     __ jmp(L_exit);
3845 
3846     __ BIND(L_key_256);
3847     // 256-bit code follows here (could be changed to use more xmm registers)
3848     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3849     __ movptr(pos, 0);
3850     __ align(OptoLoopAlignment);
3851 
3852     __ BIND(L_loopTop_256);
3853     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3854     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3855     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3856     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3857       __ aesenc(xmm_result, as_XMMRegister(rnum));
3858     }
3859     load_key(xmm_temp, key, 0xe0);
3860     __ aesenclast(xmm_result, xmm_temp);
3861     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3862     // no need to store r to memory until we exit
3863     __ addptr(pos, AESBlockSize);
3864     __ subptr(len_reg, AESBlockSize);
3865     __ jcc(Assembler::notEqual, L_loopTop_256);
3866     __ jmp(L_exit);
3867 
3868     return start;
3869   }
3870 
3871   // Safefetch stubs.
3872   void generate_safefetch(const char* name, int size, address* entry,
3873                           address* fault_pc, address* continuation_pc) {
3874     // safefetch signatures:
3875     //   int      SafeFetch32(int*      adr, int      errValue);
3876     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3877     //
3878     // arguments:
3879     //   c_rarg0 = adr
3880     //   c_rarg1 = errValue
3881     //
3882     // result:
3883     //   rax      = *adr or errValue
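         //
         // If the load at *fault_pc faults, the VM's signal handler transfers
         // control to *continuation_pc with c_rarg1 still holding errValue, so the
         // caller observes errValue instead of a crash (sketch of the mechanism).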
3884 
3885     StubCodeMark mark(this, "StubRoutines", name);
3886 
3887     // Entry point, pc or function descriptor.
3888     *entry = __ pc();
3889 
3890     // Load *adr into c_rarg1, may fault.
3891     *fault_pc = __ pc();
3892     switch (size) {
3893       case 4:
3894         // int32_t
3895         __ movl(c_rarg1, Address(c_rarg0, 0));
3896         break;
3897       case 8:
3898         // int64_t
3899         __ movq(c_rarg1, Address(c_rarg0, 0));
3900         break;
3901       default:
3902         ShouldNotReachHere();
3903     }
3904 
3905     // return errValue or *adr
3906     *continuation_pc = __ pc();
3907     __ movq(rax, c_rarg1);
3908     __ ret(0);
3909   }
3910 
3911   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3912   // to hide instruction latency
3913   //
3914   // Arguments:
3915   //
3916   // Inputs:
3917   //   c_rarg0   - source byte array address
3918   //   c_rarg1   - destination byte array address
3919   //   c_rarg2   - K (key) in little endian int array
3920   //   c_rarg3   - r vector byte array address
3921   //   c_rarg4   - input length
3922   //
3923   // Output:
3924   //   rax       - input length
3925   //
3926   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3927     assert(UseAES, "need AES instructions and misaligned SSE support");
3928     __ align(CodeEntryAlignment);
3929     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3930     address start = __ pc();
3931 
3932     const Register from        = c_rarg0;  // source array address
3933     const Register to          = c_rarg1;  // destination array address
3934     const Register key         = c_rarg2;  // key array address
3935     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3936                                            // and left with the results of the last encryption block
3937 #ifndef _WIN64
3938     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3939 #else
3940     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3941     const Register len_reg     = r11;      // pick the volatile windows register
3942 #endif
3943     const Register pos         = rax;
3944 
3945     const int PARALLEL_FACTOR = 4;
3946     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3947 
3948     Label L_exit;
3949     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3950     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3951     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3952     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3953     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3954 
3955     // keys 0-10 preloaded into xmm5-xmm15
3956     const int XMM_REG_NUM_KEY_FIRST = 5;
3957     const int XMM_REG_NUM_KEY_LAST  = 15;
3958     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3959     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3960 
3961     __ enter(); // required for proper stackwalking of RuntimeStub frame
3962 
3963 #ifdef _WIN64
3964     // on win64, fill len_reg from stack position
3965     __ movl(len_reg, len_mem);
3966 #else
3967     __ push(len_reg); // Save
3968 #endif
3969     __ push(rbx);
3970     // The Java expanded key ordering is rotated one position from what we want,
3971     // so we start from 0x10 here and hit 0x00 last.
3972     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3973     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3974     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3975     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3976       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3977       offset += 0x10;
3978     }
3979     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3980 
3981     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3982 
3983     // registers holding the four results in the parallelized loop
3984     const XMMRegister xmm_result0 = xmm0;
3985     const XMMRegister xmm_result1 = xmm2;
3986     const XMMRegister xmm_result2 = xmm3;
3987     const XMMRegister xmm_result3 = xmm4;
3988 
3989     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3990 
3991     __ xorptr(pos, pos);
3992 
    // now split into different paths depending on the key length (length in ints of the AESCrypt.KLE array: 44 = 128-bit, 52 = 192-bit, 60 = 256-bit)
3994     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3995     __ cmpl(rbx, 52);
3996     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3997     __ cmpl(rbx, 60);
3998     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
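    // CBC decryption has no chaining dependency between output blocks
    // (P[i] = D_K(C[i]) xor C[i-1]), so the multi-block loops below decrypt
    // PARALLEL_FACTOR (4) blocks per iteration to hide aesdec latency;
    // DoFour issues one round instruction for each of the four blocks in flight.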
3999 
4000 #define DoFour(opc, src_reg)           \
4001   __ opc(xmm_result0, src_reg);         \
4002   __ opc(xmm_result1, src_reg);         \
4003   __ opc(xmm_result2, src_reg);         \
  __ opc(xmm_result3, src_reg);
4005 
4006     for (int k = 0; k < 3; ++k) {
4007       __ BIND(L_multiBlock_loopTopHead[k]);
4008       if (k != 0) {
4009         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
4010         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
4011       }
4012       if (k == 1) {
4013         __ subptr(rsp, 6 * wordSize);
4014         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
4015         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
4016         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
4017         load_key(xmm1, key, 0xc0);  // 0xc0;
4018         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
4019       } else if (k == 2) {
4020         __ subptr(rsp, 10 * wordSize);
4021         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
        load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
4023         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
4024         load_key(xmm1, key, 0xe0);  // 0xe0;
4025         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
4026         load_key(xmm15, key, 0xb0); // 0xb0;
4027         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
4028         load_key(xmm1, key, 0xc0);  // 0xc0;
4029         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
4030       }
4031       __ align(OptoLoopAlignment);
4032       __ BIND(L_multiBlock_loopTop[k]);
4033       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
4034       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
4035 
4036       if  (k != 0) {
4037         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
4038         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
4039       }
4040 
4041       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
4042       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4043       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4044       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4045 
4046       DoFour(pxor, xmm_key_first);
4047       if (k == 0) {
4048         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
4049           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4050         }
4051         DoFour(aesdeclast, xmm_key_last);
4052       } else if (k == 1) {
4053         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
4054           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4055         }
4056         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
4057         DoFour(aesdec, xmm1);  // key : 0xc0
4058         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
4059         DoFour(aesdeclast, xmm_key_last);
4060       } else if (k == 2) {
4061         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
4062           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4063         }
4064         DoFour(aesdec, xmm1);  // key : 0xc0
4065         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
4066         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
4067         DoFour(aesdec, xmm15);  // key : 0xd0
4068         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
4069         DoFour(aesdec, xmm1);  // key : 0xe0
4070         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
4071         DoFour(aesdeclast, xmm_key_last);
4072       }
4073 
4074       // for each result, xor with the r vector of previous cipher block
4075       __ pxor(xmm_result0, xmm_prev_block_cipher);
4076       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4077       __ pxor(xmm_result1, xmm_prev_block_cipher);
4078       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4079       __ pxor(xmm_result2, xmm_prev_block_cipher);
4080       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4081       __ pxor(xmm_result3, xmm_prev_block_cipher);
4082       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
4083       if (k != 0) {
4084         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
4085       }
4086 
4087       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
4088       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4089       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4090       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4091 
4092       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
4093       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
4094       __ jmp(L_multiBlock_loopTop[k]);
4095 
4096       // registers used in the non-parallelized loops
4097       // xmm register assignments for the loops below
4098       const XMMRegister xmm_result = xmm0;
4099       const XMMRegister xmm_prev_block_cipher_save = xmm2;
4100       const XMMRegister xmm_key11 = xmm3;
4101       const XMMRegister xmm_key12 = xmm4;
4102       const XMMRegister key_tmp = xmm4;
4103 
4104       __ BIND(L_singleBlock_loopTopHead[k]);
4105       if (k == 1) {
4106         __ addptr(rsp, 6 * wordSize);
4107       } else if (k == 2) {
4108         __ addptr(rsp, 10 * wordSize);
4109       }
4110       __ cmpptr(len_reg, 0); // any blocks left??
4111       __ jcc(Assembler::equal, L_exit);
4112       __ BIND(L_singleBlock_loopTopHead2[k]);
4113       if (k == 1) {
        load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
        load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
4116       }
4117       if (k == 2) {
        load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
4119       }
4120       __ align(OptoLoopAlignment);
4121       __ BIND(L_singleBlock_loopTop[k]);
4122       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
4123       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
4124       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
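      // rounds 1..9 use the keys preloaded in xmm6..xmm14 (offsets 0x20..0xa0);
      // the additional 192/256-bit rounds are handled below, and aesdeclast always
      // uses the key from offset 0x00 kept in xmm15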
4125       for (int rnum = 1; rnum <= 9 ; rnum++) {
4126           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4127       }
4128       if (k == 1) {
4129         __ aesdec(xmm_result, xmm_key11);
4130         __ aesdec(xmm_result, xmm_key12);
4131       }
4132       if (k == 2) {
4133         __ aesdec(xmm_result, xmm_key11);
4134         load_key(key_tmp, key, 0xc0);
4135         __ aesdec(xmm_result, key_tmp);
4136         load_key(key_tmp, key, 0xd0);
4137         __ aesdec(xmm_result, key_tmp);
4138         load_key(key_tmp, key, 0xe0);
4139         __ aesdec(xmm_result, key_tmp);
4140       }
4141 
4142       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
4143       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
4144       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
4145       // no need to store r to memory until we exit
4146       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
4147       __ addptr(pos, AESBlockSize);
4148       __ subptr(len_reg, AESBlockSize);
4149       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
4150       if (k != 2) {
4151         __ jmp(L_exit);
4152       }
4153     } //for 128/192/256
4154 
4155     __ BIND(L_exit);
4156     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
4157     __ pop(rbx);
4158 #ifdef _WIN64
4159     __ movl(rax, len_mem);
4160 #else
4161     __ pop(rax); // return length
4162 #endif
4163     __ leave(); // required for proper stackwalking of RuntimeStub frame
4164     __ ret(0);
4165     return start;
4166 }
4167 
4168   address generate_electronicCodeBook_encryptAESCrypt() {
4169     __ align(CodeEntryAlignment);
4170     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_encryptAESCrypt");
4171     address start = __ pc();
4172     const Register from = c_rarg0;  // source array address
4173     const Register to = c_rarg1;  // destination array address
4174     const Register key = c_rarg2;  // key array address
4175     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4176     __ enter(); // required for proper stackwalking of RuntimeStub frame
4177     __ aesecb_encrypt(from, to, key, len);
4178     __ leave(); // required for proper stackwalking of RuntimeStub frame
4179     __ ret(0);
4180     return start;
4181  }
4182 
4183   address generate_electronicCodeBook_decryptAESCrypt() {
4184     __ align(CodeEntryAlignment);
4185     StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_decryptAESCrypt");
4186     address start = __ pc();
4187     const Register from = c_rarg0;  // source array address
4188     const Register to = c_rarg1;  // destination array address
4189     const Register key = c_rarg2;  // key array address
4190     const Register len = c_rarg3;  // src len (must be multiple of blocksize 16)
4191     __ enter(); // required for proper stackwalking of RuntimeStub frame
4192     __ aesecb_decrypt(from, to, key, len);
4193     __ leave(); // required for proper stackwalking of RuntimeStub frame
4194     __ ret(0);
4195     return start;
4196   }
4197 
  // ofs and limit are used for multi-block byte arrays.
4199   // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
4200   address generate_md5_implCompress(bool multi_block, const char *name) {
4201     __ align(CodeEntryAlignment);
4202     StubCodeMark mark(this, "StubRoutines", name);
4203     address start = __ pc();
4204 
4205     const Register buf_param = r15;
4206     const Address state_param(rsp, 0 * wordSize);
4207     const Address ofs_param  (rsp, 1 * wordSize    );
4208     const Address limit_param(rsp, 1 * wordSize + 4);
4209 
4210     __ enter();
4211     __ push(rbx);
4212     __ push(rdi);
4213     __ push(rsi);
4214     __ push(r15);
4215     __ subptr(rsp, 2 * wordSize);
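    // two-word scratch area (see the Address constants above):
    // [rsp + 0] = state pointer, [rsp + 8] = ofs, [rsp + 12] = limit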
4216 
4217     __ movptr(buf_param, c_rarg0);
4218     __ movptr(state_param, c_rarg1);
4219     if (multi_block) {
4220       __ movl(ofs_param, c_rarg2);
4221       __ movl(limit_param, c_rarg3);
4222     }
4223     __ fast_md5(buf_param, state_param, ofs_param, limit_param, multi_block);
4224 
4225     __ addptr(rsp, 2 * wordSize);
4226     __ pop(r15);
4227     __ pop(rsi);
4228     __ pop(rdi);
4229     __ pop(rbx);
4230     __ leave();
4231     __ ret(0);
4232     return start;
4233   }
4234 
4235   address generate_upper_word_mask() {
4236     __ align64();
4237     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
4238     address start = __ pc();
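    // 128-bit constant with only the most-significant dword set (consumed by the SHA-1 stub code)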
4239     __ emit_data64(0x0000000000000000, relocInfo::none);
4240     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
4241     return start;
4242   }
4243 
4244   address generate_shuffle_byte_flip_mask() {
4245     __ align64();
4246     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
4247     address start = __ pc();
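    // pshufb control that reverses all 16 bytes of an xmm register (consumed by the SHA-1 stub code)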
4248     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4249     __ emit_data64(0x0001020304050607, relocInfo::none);
4250     return start;
4251   }
4252 
  // ofs and limit are used for multi-block byte arrays.
4254   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4255   address generate_sha1_implCompress(bool multi_block, const char *name) {
4256     __ align(CodeEntryAlignment);
4257     StubCodeMark mark(this, "StubRoutines", name);
4258     address start = __ pc();
4259 
4260     Register buf = c_rarg0;
4261     Register state = c_rarg1;
4262     Register ofs = c_rarg2;
4263     Register limit = c_rarg3;
4264 
4265     const XMMRegister abcd = xmm0;
4266     const XMMRegister e0 = xmm1;
4267     const XMMRegister e1 = xmm2;
4268     const XMMRegister msg0 = xmm3;
4269 
4270     const XMMRegister msg1 = xmm4;
4271     const XMMRegister msg2 = xmm5;
4272     const XMMRegister msg3 = xmm6;
4273     const XMMRegister shuf_mask = xmm7;
4274 
4275     __ enter();
4276 
4277     __ subptr(rsp, 4 * wordSize);
4278 
4279     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
4280       buf, state, ofs, limit, rsp, multi_block);
4281 
4282     __ addptr(rsp, 4 * wordSize);
4283 
4284     __ leave();
4285     __ ret(0);
4286     return start;
4287   }
4288 
4289   address generate_pshuffle_byte_flip_mask() {
4290     __ align64();
4291     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
4292     address start = __ pc();
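    // pshufb control that byte-swaps each 32-bit word; the AVX2 branch below
    // appends a second 128-bit copy (for ymm use) plus the _SHUF_00BA / _SHUF_DC00
    // shuffle constants consumed by the AVX2 SHA-256 implementation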
4293     __ emit_data64(0x0405060700010203, relocInfo::none);
4294     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4295 
4296     if (VM_Version::supports_avx2()) {
4297       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
4298       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4299       // _SHUF_00BA
4300       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4301       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4302       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4303       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4304       // _SHUF_DC00
4305       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4306       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4307       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4308       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4309     }
4310 
4311     return start;
4312   }
4313 
  // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
4315   address generate_pshuffle_byte_flip_mask_sha512() {
4316     __ align32();
4317     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
4318     address start = __ pc();
4319     if (VM_Version::supports_avx2()) {
4320       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
4321       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4322       __ emit_data64(0x1011121314151617, relocInfo::none);
4323       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
4324       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
4325       __ emit_data64(0x0000000000000000, relocInfo::none);
4326       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4327       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4328     }
4329 
4330     return start;
4331   }
4332 
  // ofs and limit are used for multi-block byte arrays.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4335   address generate_sha256_implCompress(bool multi_block, const char *name) {
4336     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
4337     __ align(CodeEntryAlignment);
4338     StubCodeMark mark(this, "StubRoutines", name);
4339     address start = __ pc();
4340 
4341     Register buf = c_rarg0;
4342     Register state = c_rarg1;
4343     Register ofs = c_rarg2;
4344     Register limit = c_rarg3;
4345 
4346     const XMMRegister msg = xmm0;
4347     const XMMRegister state0 = xmm1;
4348     const XMMRegister state1 = xmm2;
4349     const XMMRegister msgtmp0 = xmm3;
4350 
4351     const XMMRegister msgtmp1 = xmm4;
4352     const XMMRegister msgtmp2 = xmm5;
4353     const XMMRegister msgtmp3 = xmm6;
4354     const XMMRegister msgtmp4 = xmm7;
4355 
4356     const XMMRegister shuf_mask = xmm8;
4357 
4358     __ enter();
4359 
4360     __ subptr(rsp, 4 * wordSize);
4361 
4362     if (VM_Version::supports_sha()) {
4363       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4364         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4365     } else if (VM_Version::supports_avx2()) {
4366       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4367         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4368     }
4369     __ addptr(rsp, 4 * wordSize);
4370     __ vzeroupper();
4371     __ leave();
4372     __ ret(0);
4373     return start;
4374   }
4375 
4376   address generate_sha512_implCompress(bool multi_block, const char *name) {
4377     assert(VM_Version::supports_avx2(), "");
4378     assert(VM_Version::supports_bmi2(), "");
4379     __ align(CodeEntryAlignment);
4380     StubCodeMark mark(this, "StubRoutines", name);
4381     address start = __ pc();
4382 
4383     Register buf = c_rarg0;
4384     Register state = c_rarg1;
4385     Register ofs = c_rarg2;
4386     Register limit = c_rarg3;
4387 
4388     const XMMRegister msg = xmm0;
4389     const XMMRegister state0 = xmm1;
4390     const XMMRegister state1 = xmm2;
4391     const XMMRegister msgtmp0 = xmm3;
4392     const XMMRegister msgtmp1 = xmm4;
4393     const XMMRegister msgtmp2 = xmm5;
4394     const XMMRegister msgtmp3 = xmm6;
4395     const XMMRegister msgtmp4 = xmm7;
4396 
4397     const XMMRegister shuf_mask = xmm8;
4398 
4399     __ enter();
4400 
4401     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4402     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4403 
4404     __ vzeroupper();
4405     __ leave();
4406     __ ret(0);
4407     return start;
4408   }
4409 
4410   address ghash_polynomial512_addr() {
4411     __ align(CodeEntryAlignment);
4412     StubCodeMark mark(this, "StubRoutines", "_ghash_poly512_addr");
4413     address start = __ pc();
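    // layout (byte offsets from start): +0  four 128-bit copies of the reduction constant
    //                                   +64 POLY
    //                                   +80 TWOONE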
4414     __ emit_data64(0x00000001C2000000, relocInfo::none); // POLY for reduction
4415     __ emit_data64(0xC200000000000000, relocInfo::none);
4416     __ emit_data64(0x00000001C2000000, relocInfo::none);
4417     __ emit_data64(0xC200000000000000, relocInfo::none);
4418     __ emit_data64(0x00000001C2000000, relocInfo::none);
4419     __ emit_data64(0xC200000000000000, relocInfo::none);
4420     __ emit_data64(0x00000001C2000000, relocInfo::none);
4421     __ emit_data64(0xC200000000000000, relocInfo::none);
4422     __ emit_data64(0x0000000000000001, relocInfo::none); // POLY
4423     __ emit_data64(0xC200000000000000, relocInfo::none);
4424     __ emit_data64(0x0000000000000001, relocInfo::none); // TWOONE
4425     __ emit_data64(0x0000000100000000, relocInfo::none);
4426     return start;
4427 }
4428 
4429   // Vector AES Galois Counter Mode implementation. Parameters:
4430   // Windows regs            |  Linux regs
  // in = c_rarg0 (rcx)      |  c_rarg0 (rdi)
  // len = c_rarg1 (rdx)     |  c_rarg1 (rsi)
4433   // ct = c_rarg2 (r8)       |  c_rarg2 (rdx)
4434   // out = c_rarg3 (r9)      |  c_rarg3 (rcx)
4435   // key = r10               |  c_rarg4 (r8)
4436   // state = r13             |  c_rarg5 (r9)
4437   // subkeyHtbl = r14        |  r11
4438   // counter = rsi           |  r12
4439   // return - number of processed bytes
4440   address generate_galoisCounterMode_AESCrypt() {
4441     __ align(CodeEntryAlignment);
4442     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
4443     address start = __ pc();
4444     const Register in = c_rarg0;
4445     const Register len = c_rarg1;
4446     const Register ct = c_rarg2;
4447     const Register out = c_rarg3;
    // counter (loaded below) is updated with the incremented counter value at the end
4449 #ifndef _WIN64
4450     const Register key = c_rarg4;
4451     const Register state = c_rarg5;
4452     const Address subkeyH_mem(rbp, 2 * wordSize);
4453     const Register subkeyHtbl = r11;
4454     const Address avx512_subkeyH_mem(rbp, 3 * wordSize);
4455     const Register avx512_subkeyHtbl = r13;
4456     const Address counter_mem(rbp, 4 * wordSize);
4457     const Register counter = r12;
4458 #else
4459     const Address key_mem(rbp, 6 * wordSize);
4460     const Register key = r10;
4461     const Address state_mem(rbp, 7 * wordSize);
4462     const Register state = r13;
4463     const Address subkeyH_mem(rbp, 8 * wordSize);
4464     const Register subkeyHtbl = r14;
4465     const Address avx512_subkeyH_mem(rbp, 9 * wordSize);
4466     const Register avx512_subkeyHtbl = r12;
4467     const Address counter_mem(rbp, 10 * wordSize);
4468     const Register counter = rsi;
4469 #endif
4470     __ enter();
4471    // Save state before entering routine
4472     __ push(r12);
4473     __ push(r13);
4474     __ push(r14);
4475     __ push(r15);
4476     __ push(rbx);
4477 #ifdef _WIN64
4478     // on win64, fill len_reg from stack position
4479     __ push(rsi);
4480     __ movptr(key, key_mem);
4481     __ movptr(state, state_mem);
4482 #endif
4483     __ movptr(subkeyHtbl, subkeyH_mem);
4484     __ movptr(avx512_subkeyHtbl, avx512_subkeyH_mem);
4485     __ movptr(counter, counter_mem);
4486 
4487     __ aesgcm_encrypt(in, len, ct, out, key, state, subkeyHtbl, avx512_subkeyHtbl, counter);
4488 
4489     // Restore state before leaving routine
4490 #ifdef _WIN64
4491     __ pop(rsi);
4492 #endif
4493     __ pop(rbx);
4494     __ pop(r15);
4495     __ pop(r14);
4496     __ pop(r13);
4497     __ pop(r12);
4498 
4499     __ leave(); // required for proper stackwalking of RuntimeStub frame
4500     __ ret(0);
    return start;
4502   }
4503 
  // This table holds the byte-swap mask and the constants used for incrementing the counter value (linc0, linc4, etc.)
4505   address counter_mask_addr() {
4506     __ align64();
4507     StubCodeMark mark(this, "StubRoutines", "counter_mask_addr");
4508     address start = __ pc();
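    // layout (byte offsets): +0 byte-swap mask (4 copies), +64 linc0 = per-lane offsets {0,1,2,3},
    // +128 linc4, +192 linc8, +256 linc32, +320 linc16; each entry is a 512-bit vector
    // of 128-bit counter increments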
4509     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);//lbswapmask
4510     __ emit_data64(0x0001020304050607, relocInfo::none);
4511     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4512     __ emit_data64(0x0001020304050607, relocInfo::none);
4513     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4514     __ emit_data64(0x0001020304050607, relocInfo::none);
4515     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4516     __ emit_data64(0x0001020304050607, relocInfo::none);
4517     __ emit_data64(0x0000000000000000, relocInfo::none);//linc0 = counter_mask_addr+64
4518     __ emit_data64(0x0000000000000000, relocInfo::none);
4519     __ emit_data64(0x0000000000000001, relocInfo::none);//counter_mask_addr() + 80
4520     __ emit_data64(0x0000000000000000, relocInfo::none);
4521     __ emit_data64(0x0000000000000002, relocInfo::none);
4522     __ emit_data64(0x0000000000000000, relocInfo::none);
4523     __ emit_data64(0x0000000000000003, relocInfo::none);
4524     __ emit_data64(0x0000000000000000, relocInfo::none);
4525     __ emit_data64(0x0000000000000004, relocInfo::none);//linc4 = counter_mask_addr() + 128
4526     __ emit_data64(0x0000000000000000, relocInfo::none);
4527     __ emit_data64(0x0000000000000004, relocInfo::none);
4528     __ emit_data64(0x0000000000000000, relocInfo::none);
4529     __ emit_data64(0x0000000000000004, relocInfo::none);
4530     __ emit_data64(0x0000000000000000, relocInfo::none);
4531     __ emit_data64(0x0000000000000004, relocInfo::none);
4532     __ emit_data64(0x0000000000000000, relocInfo::none);
4533     __ emit_data64(0x0000000000000008, relocInfo::none);//linc8 = counter_mask_addr() + 192
4534     __ emit_data64(0x0000000000000000, relocInfo::none);
4535     __ emit_data64(0x0000000000000008, relocInfo::none);
4536     __ emit_data64(0x0000000000000000, relocInfo::none);
4537     __ emit_data64(0x0000000000000008, relocInfo::none);
4538     __ emit_data64(0x0000000000000000, relocInfo::none);
4539     __ emit_data64(0x0000000000000008, relocInfo::none);
4540     __ emit_data64(0x0000000000000000, relocInfo::none);
4541     __ emit_data64(0x0000000000000020, relocInfo::none);//linc32 = counter_mask_addr() + 256
4542     __ emit_data64(0x0000000000000000, relocInfo::none);
4543     __ emit_data64(0x0000000000000020, relocInfo::none);
4544     __ emit_data64(0x0000000000000000, relocInfo::none);
4545     __ emit_data64(0x0000000000000020, relocInfo::none);
4546     __ emit_data64(0x0000000000000000, relocInfo::none);
4547     __ emit_data64(0x0000000000000020, relocInfo::none);
4548     __ emit_data64(0x0000000000000000, relocInfo::none);
4549     __ emit_data64(0x0000000000000010, relocInfo::none);//linc16 = counter_mask_addr() + 320
4550     __ emit_data64(0x0000000000000000, relocInfo::none);
4551     __ emit_data64(0x0000000000000010, relocInfo::none);
4552     __ emit_data64(0x0000000000000000, relocInfo::none);
4553     __ emit_data64(0x0000000000000010, relocInfo::none);
4554     __ emit_data64(0x0000000000000000, relocInfo::none);
4555     __ emit_data64(0x0000000000000010, relocInfo::none);
4556     __ emit_data64(0x0000000000000000, relocInfo::none);
4557     return start;
4558   }
4559 
  // Vector AES Counter-mode (CTR) implementation
4561   address generate_counterMode_VectorAESCrypt()  {
4562     __ align(CodeEntryAlignment);
4563     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4564     address start = __ pc();
4565     const Register from = c_rarg0; // source array address
4566     const Register to = c_rarg1; // destination array address
4567     const Register key = c_rarg2; // key array address r8
4568     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4569     // and updated with the incremented counter in the end
4570 #ifndef _WIN64
4571     const Register len_reg = c_rarg4;
4572     const Register saved_encCounter_start = c_rarg5;
4573     const Register used_addr = r10;
4574     const Address  used_mem(rbp, 2 * wordSize);
4575     const Register used = r11;
4576 #else
4577     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4578     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
4579     const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4580     const Register len_reg = r10; // pick the first volatile windows register
4581     const Register saved_encCounter_start = r11;
4582     const Register used_addr = r13;
4583     const Register used = r14;
4584 #endif
4585     __ enter();
4586    // Save state before entering routine
4587     __ push(r12);
4588     __ push(r13);
4589     __ push(r14);
4590     __ push(r15);
4591 #ifdef _WIN64
4592     // on win64, fill len_reg from stack position
4593     __ movl(len_reg, len_mem);
4594     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4595     __ movptr(used_addr, used_mem);
4596     __ movl(used, Address(used_addr, 0));
4597 #else
4598     __ push(len_reg); // Save
4599     __ movptr(used_addr, used_mem);
4600     __ movl(used, Address(used_addr, 0));
4601 #endif
4602     __ push(rbx);
4603     __ aesctr_encrypt(from, to, key, counter, len_reg, used, used_addr, saved_encCounter_start);
4604     // Restore state before leaving routine
4605     __ pop(rbx);
4606 #ifdef _WIN64
4607     __ movl(rax, len_mem); // return length
4608 #else
4609     __ pop(rax); // return length
4610 #endif
4611     __ pop(r15);
4612     __ pop(r14);
4613     __ pop(r13);
4614     __ pop(r12);
4615 
4616     __ leave(); // required for proper stackwalking of RuntimeStub frame
4617     __ ret(0);
4618     return start;
4619   }
4620 
4621   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
4622   // to hide instruction latency
4623   //
4624   // Arguments:
4625   //
4626   // Inputs:
4627   //   c_rarg0   - source byte array address
4628   //   c_rarg1   - destination byte array address
4629   //   c_rarg2   - K (key) in little endian int array
4630   //   c_rarg3   - counter vector byte array address
4631   //   Linux
4632   //     c_rarg4   -          input length
4633   //     c_rarg5   -          saved encryptedCounter start
4634   //     rbp + 6 * wordSize - saved used length
4635   //   Windows
4636   //     rbp + 6 * wordSize - input length
4637   //     rbp + 7 * wordSize - saved encryptedCounter start
4638   //     rbp + 8 * wordSize - saved used length
4639   //
4640   // Output:
4641   //   rax       - input length
4642   //
4643   address generate_counterMode_AESCrypt_Parallel() {
4644     assert(UseAES, "need AES instructions and misaligned SSE support");
4645     __ align(CodeEntryAlignment);
4646     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4647     address start = __ pc();
4648     const Register from = c_rarg0; // source array address
4649     const Register to = c_rarg1; // destination array address
4650     const Register key = c_rarg2; // key array address
4651     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4652                                       // and updated with the incremented counter in the end
4653 #ifndef _WIN64
4654     const Register len_reg = c_rarg4;
4655     const Register saved_encCounter_start = c_rarg5;
4656     const Register used_addr = r10;
4657     const Address  used_mem(rbp, 2 * wordSize);
4658     const Register used = r11;
4659 #else
4660     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
    const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4663     const Register len_reg = r10; // pick the first volatile windows register
4664     const Register saved_encCounter_start = r11;
4665     const Register used_addr = r13;
4666     const Register used = r14;
4667 #endif
4668     const Register pos = rax;
4669 
4670     const int PARALLEL_FACTOR = 6;
4671     const XMMRegister xmm_counter_shuf_mask = xmm0;
4672     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
4673     const XMMRegister xmm_curr_counter = xmm2;
4674 
4675     const XMMRegister xmm_key_tmp0 = xmm3;
4676     const XMMRegister xmm_key_tmp1 = xmm4;
4677 
4678     // registers holding the four results in the parallelized loop
4679     const XMMRegister xmm_result0 = xmm5;
4680     const XMMRegister xmm_result1 = xmm6;
4681     const XMMRegister xmm_result2 = xmm7;
4682     const XMMRegister xmm_result3 = xmm8;
4683     const XMMRegister xmm_result4 = xmm9;
4684     const XMMRegister xmm_result5 = xmm10;
4685 
4686     const XMMRegister xmm_from0 = xmm11;
4687     const XMMRegister xmm_from1 = xmm12;
4688     const XMMRegister xmm_from2 = xmm13;
    const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; it must be preserved on Win64
    const XMMRegister xmm_from4 = xmm3;  // reuse xmm3-xmm4: xmm_key_tmp0/1 are no longer needed once the input text is loaded
4691     const XMMRegister xmm_from5 = xmm4;
4692 
4693     //for key_128, key_192, key_256
4694     const int rounds[3] = {10, 12, 14};
4695     Label L_exit_preLoop, L_preLoop_start;
4696     Label L_multiBlock_loopTop[3];
4697     Label L_singleBlockLoopTop[3];
4698     Label L__incCounter[3][6]; //for 6 blocks
4699     Label L__incCounter_single[3]; //for single block, key128, key192, key256
4700     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
4701     Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
4702 
4703     Label L_exit;
4704 
4705     __ enter(); // required for proper stackwalking of RuntimeStub frame
4706 
4707 #ifdef _WIN64
4708     // allocate spill slots for r13, r14
4709     enum {
4710         saved_r13_offset,
4711         saved_r14_offset
4712     };
4713     __ subptr(rsp, 2 * wordSize);
4714     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
4715     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
4716 
4717     // on win64, fill len_reg from stack position
4718     __ movl(len_reg, len_mem);
4719     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4720     __ movptr(used_addr, used_mem);
4721     __ movl(used, Address(used_addr, 0));
4722 #else
4723     __ push(len_reg); // Save
4724     __ movptr(used_addr, used_mem);
4725     __ movl(used, Address(used_addr, 0));
4726 #endif
4727 
4728     __ push(rbx); // Save RBX
4729     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
4730     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
4731     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
4732     __ movptr(pos, 0);
4733 
    // Use the partially consumed encrypted counter from the last invocation
4735     __ BIND(L_preLoop_start);
4736     __ cmpptr(used, 16);
4737     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
4738       __ cmpptr(len_reg, 0);
4739       __ jcc(Assembler::lessEqual, L_exit_preLoop);
4740       __ movb(rbx, Address(saved_encCounter_start, used));
4741       __ xorb(rbx, Address(from, pos));
4742       __ movb(Address(to, pos), rbx);
4743       __ addptr(pos, 1);
4744       __ addptr(used, 1);
4745       __ subptr(len_reg, 1);
4746 
4747     __ jmp(L_preLoop_start);
4748 
4749     __ BIND(L_exit_preLoop);
4750     __ movl(Address(used_addr, 0), used);
4751 
4752     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
4753     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
4754     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4755     __ cmpl(rbx, 52);
4756     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
4757     __ cmpl(rbx, 60);
4758     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
4759 
4760 #define CTR_DoSix(opc, src_reg)                \
4761     __ opc(xmm_result0, src_reg);              \
4762     __ opc(xmm_result1, src_reg);              \
4763     __ opc(xmm_result2, src_reg);              \
4764     __ opc(xmm_result3, src_reg);              \
4765     __ opc(xmm_result4, src_reg);              \
4766     __ opc(xmm_result5, src_reg);
4767 
4768     // k == 0 :  generate code for key_128
4769     // k == 1 :  generate code for key_192
4770     // k == 2 :  generate code for key_256
4771     for (int k = 0; k < 3; ++k) {
4772       //multi blocks starts here
4773       __ align(OptoLoopAlignment);
4774       __ BIND(L_multiBlock_loopTop[k]);
4775       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
4776       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
4777       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4778 
4779       //load, then increase counters
4780       CTR_DoSix(movdqa, xmm_curr_counter);
4781       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
4782       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
4783       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
4784       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
4785       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
4786       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
4787       CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
4788       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
4789 
4790       //load two ROUND_KEYs at a time
4791       for (int i = 1; i < rounds[k]; ) {
4792         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4793         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4794         CTR_DoSix(aesenc, xmm_key_tmp1);
4795         i++;
4796         if (i != rounds[k]) {
4797           CTR_DoSix(aesenc, xmm_key_tmp0);
4798         } else {
4799           CTR_DoSix(aesenclast, xmm_key_tmp0);
4800         }
4801         i++;
4802       }
4803 
4804       // get next PARALLEL_FACTOR blocks into xmm_result registers
4805       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4806       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4807       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4808       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4809       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4810       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4811 
4812       __ pxor(xmm_result0, xmm_from0);
4813       __ pxor(xmm_result1, xmm_from1);
4814       __ pxor(xmm_result2, xmm_from2);
4815       __ pxor(xmm_result3, xmm_from3);
4816       __ pxor(xmm_result4, xmm_from4);
4817       __ pxor(xmm_result5, xmm_from5);
4818 
      // store 6 results into the next 96 bytes of output
4820       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4821       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4822       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4823       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4824       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4825       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4826 
4827       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
4828       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4829       __ jmp(L_multiBlock_loopTop[k]);
4830 
4831       // singleBlock starts here
4832       __ align(OptoLoopAlignment);
4833       __ BIND(L_singleBlockLoopTop[k]);
4834       __ cmpptr(len_reg, 0);
4835       __ jcc(Assembler::lessEqual, L_exit);
4836       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4837       __ movdqa(xmm_result0, xmm_curr_counter);
4838       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4839       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4840       __ pxor(xmm_result0, xmm_key_tmp0);
4841       for (int i = 1; i < rounds[k]; i++) {
4842         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4843         __ aesenc(xmm_result0, xmm_key_tmp0);
4844       }
4845       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4846       __ aesenclast(xmm_result0, xmm_key_tmp0);
4847       __ cmpptr(len_reg, AESBlockSize);
4848       __ jcc(Assembler::less, L_processTail_insr[k]);
4849         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4850         __ pxor(xmm_result0, xmm_from0);
4851         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4852         __ addptr(pos, AESBlockSize);
4853         __ subptr(len_reg, AESBlockSize);
4854         __ jmp(L_singleBlockLoopTop[k]);
4855       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4856         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4857         __ testptr(len_reg, 8);
4858         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4859           __ subptr(pos,8);
4860           __ pinsrq(xmm_from0, Address(from, pos), 0);
4861         __ BIND(L_processTail_4_insr[k]);
4862         __ testptr(len_reg, 4);
4863         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4864           __ subptr(pos,4);
4865           __ pslldq(xmm_from0, 4);
4866           __ pinsrd(xmm_from0, Address(from, pos), 0);
4867         __ BIND(L_processTail_2_insr[k]);
4868         __ testptr(len_reg, 2);
4869         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4870           __ subptr(pos, 2);
4871           __ pslldq(xmm_from0, 2);
4872           __ pinsrw(xmm_from0, Address(from, pos), 0);
4873         __ BIND(L_processTail_1_insr[k]);
4874         __ testptr(len_reg, 1);
4875         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4876           __ subptr(pos, 1);
4877           __ pslldq(xmm_from0, 1);
4878           __ pinsrb(xmm_from0, Address(from, pos), 0);
4879         __ BIND(L_processTail_exit_insr[k]);
4880 
        __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Save the encrypted counter for the next invocation, then
        __ pxor(xmm_result0, xmm_from0);                             //    xor it with the input bytes gathered in step 1.
4883 
4884         __ testptr(len_reg, 8);
4885         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4886           __ pextrq(Address(to, pos), xmm_result0, 0);
4887           __ psrldq(xmm_result0, 8);
4888           __ addptr(pos, 8);
4889         __ BIND(L_processTail_4_extr[k]);
4890         __ testptr(len_reg, 4);
4891         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4892           __ pextrd(Address(to, pos), xmm_result0, 0);
4893           __ psrldq(xmm_result0, 4);
4894           __ addptr(pos, 4);
4895         __ BIND(L_processTail_2_extr[k]);
4896         __ testptr(len_reg, 2);
4897         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4898           __ pextrw(Address(to, pos), xmm_result0, 0);
4899           __ psrldq(xmm_result0, 2);
4900           __ addptr(pos, 2);
4901         __ BIND(L_processTail_1_extr[k]);
4902         __ testptr(len_reg, 1);
4903         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4904           __ pextrb(Address(to, pos), xmm_result0, 0);
4905 
4906         __ BIND(L_processTail_exit_extr[k]);
4907         __ movl(Address(used_addr, 0), len_reg);
4908         __ jmp(L_exit);
4909 
4910     }
4911 
4912     __ BIND(L_exit);
4913     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4914     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4915     __ pop(rbx); // pop the saved RBX.
4916 #ifdef _WIN64
4917     __ movl(rax, len_mem);
4918     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4919     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4920     __ addptr(rsp, 2 * wordSize);
4921 #else
4922     __ pop(rax); // return 'len'
4923 #endif
4924     __ leave(); // required for proper stackwalking of RuntimeStub frame
4925     __ ret(0);
4926     return start;
4927   }
4928 
4929 void roundDec(XMMRegister xmm_reg) {
4930   __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4931   __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4932   __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4933   __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4934   __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4935   __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4936   __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4937   __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4938 }
4939 
4940 void roundDeclast(XMMRegister xmm_reg) {
4941   __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4942   __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4943   __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4944   __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4945   __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4946   __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4947   __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4948   __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4949 }
4950 
4951   void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
4952     __ movdqu(xmmdst, Address(key, offset));
4953     if (xmm_shuf_mask != NULL) {
4954       __ pshufb(xmmdst, xmm_shuf_mask);
4955     } else {
4956       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4957     }
4958     __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
4959 
4960   }
4961 
4962 address generate_cipherBlockChaining_decryptVectorAESCrypt() {
    assert(VM_Version::supports_avx512_vaes(), "need vector AES (VAES) instructions");
4964     __ align(CodeEntryAlignment);
4965     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4966     address start = __ pc();
4967 
4968     const Register from = c_rarg0;  // source array address
4969     const Register to = c_rarg1;  // destination array address
4970     const Register key = c_rarg2;  // key array address
4971     const Register rvec = c_rarg3;  // r byte array initialized from initvector array address
4972     // and left with the results of the last encryption block
4973 #ifndef _WIN64
4974     const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
4975 #else
4976     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
4977     const Register len_reg = r11;      // pick the volatile windows register
4978 #endif
4979 
4980     Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
4981           Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
4982 
4983     __ enter();
4984 
4985 #ifdef _WIN64
4986   // on win64, fill len_reg from stack position
4987     __ movl(len_reg, len_mem);
4988 #else
4989     __ push(len_reg); // Save
4990 #endif
4991     __ push(rbx);
4992     __ vzeroupper();
4993 
4994     // Temporary variable declaration for swapping key bytes
4995     const XMMRegister xmm_key_shuf_mask = xmm1;
4996     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4997 
    // Determine the AES round count from the expanded key length in ints: 44 ints = 10 rounds, 52 = 12 rounds, 60 = 14 rounds
4999     const Register rounds = rbx;
5000     __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
5001 
5002     const XMMRegister IV = xmm0;
5003     // Load IV and broadcast value to 512-bits
5004     __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
5005 
5006     // Temporary variables for storing round keys
5007     const XMMRegister RK0 = xmm30;
5008     const XMMRegister RK1 = xmm9;
5009     const XMMRegister RK2 = xmm18;
5010     const XMMRegister RK3 = xmm19;
5011     const XMMRegister RK4 = xmm20;
5012     const XMMRegister RK5 = xmm21;
5013     const XMMRegister RK6 = xmm22;
5014     const XMMRegister RK7 = xmm23;
5015     const XMMRegister RK8 = xmm24;
5016     const XMMRegister RK9 = xmm25;
5017     const XMMRegister RK10 = xmm26;
5018 
5019      // Load and shuffle key
5020     // the java expanded key ordering is rotated one position from what we want
5021     // so we start from 1*16 here and hit 0*16 last
5022     ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
5023     ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
5024     ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
5025     ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
5026     ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
5027     ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
5028     ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
5029     ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
5030     ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
5031     ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
5032     ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
5033 
5034     // Variables for storing source cipher text
5035     const XMMRegister S0 = xmm10;
5036     const XMMRegister S1 = xmm11;
5037     const XMMRegister S2 = xmm12;
5038     const XMMRegister S3 = xmm13;
5039     const XMMRegister S4 = xmm14;
5040     const XMMRegister S5 = xmm15;
5041     const XMMRegister S6 = xmm16;
5042     const XMMRegister S7 = xmm17;
5043 
5044     // Variables for storing decrypted text
5045     const XMMRegister B0 = xmm1;
5046     const XMMRegister B1 = xmm2;
5047     const XMMRegister B2 = xmm3;
5048     const XMMRegister B3 = xmm4;
5049     const XMMRegister B4 = xmm5;
5050     const XMMRegister B5 = xmm6;
5051     const XMMRegister B6 = xmm7;
5052     const XMMRegister B7 = xmm8;
5053 
5054     __ cmpl(rounds, 44);
5055     __ jcc(Assembler::greater, KEY_192);
5056     __ jmp(Loop);
5057 
5058     __ BIND(KEY_192);
5059     const XMMRegister RK11 = xmm27;
5060     const XMMRegister RK12 = xmm28;
5061     ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
5062     ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
5063 
5064     __ cmpl(rounds, 52);
5065     __ jcc(Assembler::greater, KEY_256);
5066     __ jmp(Loop);
5067 
5068     __ BIND(KEY_256);
5069     const XMMRegister RK13 = xmm29;
5070     const XMMRegister RK14 = xmm31;
5071     ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
5072     ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
5073 
5074     __ BIND(Loop);
5075     __ cmpl(len_reg, 512);
5076     __ jcc(Assembler::below, Lcbc_dec_rem);
5077     __ BIND(Loop1);
5078     __ subl(len_reg, 512);
5079     __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
5080     __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
5081     __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
5082     __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
5083     __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
5084     __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
5085     __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
5086     __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
5087     __ leaq(from, Address(from, 8 * 64));
5088 
5089     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5090     __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
5091     __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
5092     __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
5093     __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
5094     __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
5095     __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
5096     __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
5097 
5098     __ evalignq(IV, S0, IV, 0x06);
5099     __ evalignq(S0, S1, S0, 0x06);
5100     __ evalignq(S1, S2, S1, 0x06);
5101     __ evalignq(S2, S3, S2, 0x06);
5102     __ evalignq(S3, S4, S3, 0x06);
5103     __ evalignq(S4, S5, S4, 0x06);
5104     __ evalignq(S5, S6, S5, 0x06);
5105     __ evalignq(S6, S7, S6, 0x06);
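    // Each evalignq shifts the concatenation {higher reg : lower reg} right by 6 qwords,
    // so every 128-bit lane of IV, S0..S6 now holds the ciphertext block that immediately
    // precedes the block decrypted into the corresponding lane of B0..B7 (S7 is left
    // untouched and becomes the next IV); these vectors are xored in at Loop2.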
5106 
5107     roundDec(RK2);
5108     roundDec(RK3);
5109     roundDec(RK4);
5110     roundDec(RK5);
5111     roundDec(RK6);
5112     roundDec(RK7);
5113     roundDec(RK8);
5114     roundDec(RK9);
5115     roundDec(RK10);
5116 
5117     __ cmpl(rounds, 44);
5118     __ jcc(Assembler::belowEqual, L_128);
5119     roundDec(RK11);
5120     roundDec(RK12);
5121 
5122     __ cmpl(rounds, 52);
5123     __ jcc(Assembler::belowEqual, L_192);
5124     roundDec(RK13);
5125     roundDec(RK14);
5126 
5127     __ BIND(L_256);
5128     roundDeclast(RK0);
5129     __ jmp(Loop2);
5130 
5131     __ BIND(L_128);
5132     roundDeclast(RK0);
5133     __ jmp(Loop2);
5134 
5135     __ BIND(L_192);
5136     roundDeclast(RK0);
5137 
5138     __ BIND(Loop2);
5139     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5140     __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
5141     __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
5142     __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
5143     __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
5144     __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
5145     __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
5146     __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
5147     __ evmovdquq(IV, S7, Assembler::AVX_512bit);
5148 
5149     __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
5150     __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
5151     __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
5152     __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
5153     __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
5154     __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
5155     __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
5156     __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
5157     __ leaq(to, Address(to, 8 * 64));
5158     __ jmp(Loop);
5159 
5160     __ BIND(Lcbc_dec_rem);
5161     __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
5162 
5163     __ BIND(Lcbc_dec_rem_loop);
5164     __ subl(len_reg, 16);
5165     __ jcc(Assembler::carrySet, Lcbc_dec_ret);
5166 
5167     __ movdqu(S0, Address(from, 0));
5168     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5169     __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
5170     __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
5171     __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
5172     __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
5173     __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
5174     __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
5175     __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
5176     __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
5177     __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
5178     __ cmpl(rounds, 44);
5179     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5180 
5181     __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
5182     __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
5183     __ cmpl(rounds, 52);
5184     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5185 
5186     __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
5187     __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
5188 
5189     __ BIND(Lcbc_dec_rem_last);
5190     __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
5191 
5192     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5193     __ evmovdquq(IV, S0, Assembler::AVX_512bit);
5194     __ movdqu(Address(to, 0), B0);
5195     __ leaq(from, Address(from, 16));
5196     __ leaq(to, Address(to, 16));
5197     __ jmp(Lcbc_dec_rem_loop);
5198 
5199     __ BIND(Lcbc_dec_ret);
5200     __ movdqu(Address(rvec, 0), IV);
5201 
5202     // Zero out the round keys
5203     __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
5204     __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
5205     __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
5206     __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
5207     __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
5208     __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
5209     __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
5210     __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
5211     __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
5212     __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
5213     __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
5214     __ cmpl(rounds, 44);
5215     __ jcc(Assembler::belowEqual, Lcbc_exit);
5216     __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
5217     __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
5218     __ cmpl(rounds, 52);
5219     __ jcc(Assembler::belowEqual, Lcbc_exit);
5220     __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
5221     __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
5222 
5223     __ BIND(Lcbc_exit);
5224     __ pop(rbx);
5225 #ifdef _WIN64
5226     __ movl(rax, len_mem);
5227 #else
5228     __ pop(rax); // return length
5229 #endif
5230     __ leave(); // required for proper stackwalking of RuntimeStub frame
5231     __ ret(0);
5232     return start;
5233 }
5234 
5235 // Polynomial x^128+x^127+x^126+x^121+1
5236 address ghash_polynomial_addr() {
5237     __ align(CodeEntryAlignment);
5238     StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
5239     address start = __ pc();
5240     __ emit_data64(0x0000000000000001, relocInfo::none);
5241     __ emit_data64(0xc200000000000000, relocInfo::none);
5242     return start;
5243 }
5244 
5245 address ghash_shufflemask_addr() {
5246     __ align(CodeEntryAlignment);
5247     StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
5248     address start = __ pc();
5249     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5250     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5251     return start;
5252 }
5253 
5254 // Ghash single and multi block operations using AVX instructions
5255 address generate_avx_ghash_processBlocks() {
5256     __ align(CodeEntryAlignment);
5257 
5258     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5259     address start = __ pc();
5260 
5261     // arguments
5262     const Register state = c_rarg0;
5263     const Register htbl = c_rarg1;
5264     const Register data = c_rarg2;
5265     const Register blocks = c_rarg3;
5266     __ enter();
5267     // The GHASH computation itself is done by MacroAssembler::avx_ghash
5268     __ avx_ghash(state, htbl, data, blocks);
5269     __ leave(); // required for proper stackwalking of RuntimeStub frame
5270     __ ret(0);
5271     return start;
5272 }
5273 
5274   // byte swap x86 long
5275   address generate_ghash_long_swap_mask() {
5276     __ align(CodeEntryAlignment);
5277     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
5278     address start = __ pc();
5279     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
5280     __ emit_data64(0x0706050403020100, relocInfo::none );
5281     return start;
5282   }
5283 
5284   // byte swap x86 byte array
5285   address generate_ghash_byte_swap_mask() {
5286     __ align(CodeEntryAlignment);
5287     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
5288     address start = __ pc();
5289     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
5290     __ emit_data64(0x0001020304050607, relocInfo::none );
5291     return start;
5292   }
5293 
5294   /* Single and multi-block ghash operations */
5295   address generate_ghash_processBlocks() {
5296     __ align(CodeEntryAlignment);
5297     Label L_ghash_loop, L_exit;
5298     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5299     address start = __ pc();
5300 
5301     const Register state        = c_rarg0;
5302     const Register subkeyH      = c_rarg1;
5303     const Register data         = c_rarg2;
5304     const Register blocks       = c_rarg3;
5305 
5306     const XMMRegister xmm_temp0 = xmm0;
5307     const XMMRegister xmm_temp1 = xmm1;
5308     const XMMRegister xmm_temp2 = xmm2;
5309     const XMMRegister xmm_temp3 = xmm3;
5310     const XMMRegister xmm_temp4 = xmm4;
5311     const XMMRegister xmm_temp5 = xmm5;
5312     const XMMRegister xmm_temp6 = xmm6;
5313     const XMMRegister xmm_temp7 = xmm7;
5314     const XMMRegister xmm_temp8 = xmm8;
5315     const XMMRegister xmm_temp9 = xmm9;
5316     const XMMRegister xmm_temp10 = xmm10;
5317 
5318     __ enter();
5319 
5320     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
5321 
5322     __ movdqu(xmm_temp0, Address(state, 0));
5323     __ pshufb(xmm_temp0, xmm_temp10);
5324 
5325 
5326     __ BIND(L_ghash_loop);
5327     __ movdqu(xmm_temp2, Address(data, 0));
5328     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
5329 
5330     __ movdqu(xmm_temp1, Address(subkeyH, 0));
5331     __ pshufb(xmm_temp1, xmm_temp10);
5332 
5333     __ pxor(xmm_temp0, xmm_temp2);
5334 
5335     //
5336     // Multiply with the hash key
5337     //
5338     __ movdqu(xmm_temp3, xmm_temp0);
5339     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
5340     __ movdqu(xmm_temp4, xmm_temp0);
5341     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
5342 
5343     __ movdqu(xmm_temp5, xmm_temp0);
5344     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
5345     __ movdqu(xmm_temp6, xmm_temp0);
5346     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
5347 
5348     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
5349 
5350     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
5351     __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
5352     __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
5353     __ pxor(xmm_temp3, xmm_temp5);
5354     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
5355                                         // of the carry-less multiplication of
5356                                         // xmm0 by xmm1.
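         // This assembles the schoolbook carry-less product: with a = a1:a0 and
         // b = b1:b0 split into 64-bit halves, a*b = a1*b1*x^128 ^
         // (a0*b1 ^ a1*b0)*x^64 ^ a0*b0, with the middle term split across the
         // low (xmm3) and high (xmm6) halves.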
5357 
5358     // We shift the result of the multiplication by one bit position
5359     // to the left to account for the fact that the bits are reversed.
5360     __ movdqu(xmm_temp7, xmm_temp3);
5361     __ movdqu(xmm_temp8, xmm_temp6);
5362     __ pslld(xmm_temp3, 1);
5363     __ pslld(xmm_temp6, 1);
5364     __ psrld(xmm_temp7, 31);
5365     __ psrld(xmm_temp8, 31);
5366     __ movdqu(xmm_temp9, xmm_temp7);
5367     __ pslldq(xmm_temp8, 4);
5368     __ pslldq(xmm_temp7, 4);
5369     __ psrldq(xmm_temp9, 12);
5370     __ por(xmm_temp3, xmm_temp7);
5371     __ por(xmm_temp6, xmm_temp8);
5372     __ por(xmm_temp6, xmm_temp9);
5373 
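         // The 256-bit product <xmm6:xmm3> is now reduced modulo the GHASH
         // polynomial g(x) = x^128 + x^7 + x^2 + x + 1.  In the bit-reflected
         // representation used here, the reduction is carried out with left
         // shifts by 31, 30 and 25 (first phase) and right shifts by 1, 2 and 7
         // (second phase), as in Intel's carry-less multiplication (GCM) white
         // paper.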
5374     //
5375     // First phase of the reduction
5376     //
5377     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
5378     // independently.
5379     __ movdqu(xmm_temp7, xmm_temp3);
5380     __ movdqu(xmm_temp8, xmm_temp3);
5381     __ movdqu(xmm_temp9, xmm_temp3);
5382     __ pslld(xmm_temp7, 31);    // packed left shift by 31
5383     __ pslld(xmm_temp8, 30);    // packed left shift by 30
5384     __ pslld(xmm_temp9, 25);    // packed left shift by 25
5385     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
5386     __ pxor(xmm_temp7, xmm_temp9);
5387     __ movdqu(xmm_temp8, xmm_temp7);
5388     __ pslldq(xmm_temp7, 12);
5389     __ psrldq(xmm_temp8, 4);
5390     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
5391 
5392     //
5393     // Second phase of the reduction
5394     //
5395     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
5396     // shift operations.
5397     __ movdqu(xmm_temp2, xmm_temp3);
5398     __ movdqu(xmm_temp4, xmm_temp3);
5399     __ movdqu(xmm_temp5, xmm_temp3);
5400     __ psrld(xmm_temp2, 1);     // packed right shift by 1
5401     __ psrld(xmm_temp4, 2);     // packed right shift by 2
5402     __ psrld(xmm_temp5, 7);     // packed right shift by 7
5403     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
5404     __ pxor(xmm_temp2, xmm_temp5);
5405     __ pxor(xmm_temp2, xmm_temp8);
5406     __ pxor(xmm_temp3, xmm_temp2);
5407     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
5408 
5409     __ decrement(blocks);
5410     __ jcc(Assembler::zero, L_exit);
5411     __ movdqu(xmm_temp0, xmm_temp6);
5412     __ addptr(data, 16);
5413     __ jmp(L_ghash_loop);
5414 
5415     __ BIND(L_exit);
5416     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
5417     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
5418     __ leave();
5419     __ ret(0);
5420     return start;
5421   }
5422 
5423   address base64_shuffle_addr()
5424   {
5425     __ align64();
5426     StubCodeMark mark(this, "StubRoutines", "shuffle_base64");
5427     address start = __ pc();
5428     assert(((unsigned long long)start & 0x3f) == 0,
5429            "Alignment problem (0x%08llx)", (unsigned long long)start);
5430     __ emit_data64(0x0405030401020001, relocInfo::none);
5431     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5432     __ emit_data64(0x10110f100d0e0c0d, relocInfo::none);
5433     __ emit_data64(0x1617151613141213, relocInfo::none);
5434     __ emit_data64(0x1c1d1b1c191a1819, relocInfo::none);
5435     __ emit_data64(0x222321221f201e1f, relocInfo::none);
5436     __ emit_data64(0x2829272825262425, relocInfo::none);
5437     __ emit_data64(0x2e2f2d2e2b2c2a2b, relocInfo::none);
5438     return start;
5439   }
5440 
5441   address base64_avx2_shuffle_addr()
5442   {
5443     __ align32();
5444     StubCodeMark mark(this, "StubRoutines", "avx2_shuffle_base64");
5445     address start = __ pc();
5446     __ emit_data64(0x0809070805060405, relocInfo::none);
5447     __ emit_data64(0x0e0f0d0e0b0c0a0b, relocInfo::none);
5448     __ emit_data64(0x0405030401020001, relocInfo::none);
5449     __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5450     return start;
5451   }
5452 
5453   address base64_avx2_input_mask_addr()
5454   {
5455     __ align32();
5456     StubCodeMark mark(this, "StubRoutines", "avx2_input_mask_base64");
5457     address start = __ pc();
5458     __ emit_data64(0x8000000000000000, relocInfo::none);
5459     __ emit_data64(0x8000000080000000, relocInfo::none);
5460     __ emit_data64(0x8000000080000000, relocInfo::none);
5461     __ emit_data64(0x8000000080000000, relocInfo::none);
5462     return start;
5463   }
5464 
5465   address base64_avx2_lut_addr()
5466   {
5467     __ align32();
5468     StubCodeMark mark(this, "StubRoutines", "avx2_lut_base64");
5469     address start = __ pc();
5470     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5471     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5472     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5473     __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5474 
5475     // URL LUT
5476     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5477     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5478     __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5479     __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5480     return start;
5481   }
5482 
5483   address base64_encoding_table_addr()
5484   {
5485     __ align64();
5486     StubCodeMark mark(this, "StubRoutines", "encoding_table_base64");
5487     address start = __ pc();
5488     assert(((unsigned long long)start & 0x3f) == 0, "Alignment problem (0x%08llx)", (unsigned long long)start);
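         // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
         // emitted as little-endian qwords; the URL-safe table below differs
         // only in the last two characters ('-' and '_').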
5489     __ emit_data64(0x4847464544434241, relocInfo::none);
5490     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5491     __ emit_data64(0x5857565554535251, relocInfo::none);
5492     __ emit_data64(0x6665646362615a59, relocInfo::none);
5493     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5494     __ emit_data64(0x767574737271706f, relocInfo::none);
5495     __ emit_data64(0x333231307a797877, relocInfo::none);
5496     __ emit_data64(0x2f2b393837363534, relocInfo::none);
5497 
5498     // URL table
5499     __ emit_data64(0x4847464544434241, relocInfo::none);
5500     __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5501     __ emit_data64(0x5857565554535251, relocInfo::none);
5502     __ emit_data64(0x6665646362615a59, relocInfo::none);
5503     __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5504     __ emit_data64(0x767574737271706f, relocInfo::none);
5505     __ emit_data64(0x333231307a797877, relocInfo::none);
5506     __ emit_data64(0x5f2d393837363534, relocInfo::none);
5507     return start;
5508   }
5509 
5510   // Code for generating Base64 encoding.
5511   // Intrinsic function prototype in Base64.java:
5512   // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp,
5513   // boolean isURL) {
5514   address generate_base64_encodeBlock()
5515   {
5516     __ align(CodeEntryAlignment);
5517     StubCodeMark mark(this, "StubRoutines", "implEncode");
5518     address start = __ pc();
5519     __ enter();
5520 
5521     // Save callee-saved registers before using them
5522     __ push(r12);
5523     __ push(r13);
5524     __ push(r14);
5525     __ push(r15);
5526 
5527     // arguments
5528     const Register source = c_rarg0;       // Source Array
5529     const Register start_offset = c_rarg1; // start offset
5530     const Register end_offset = c_rarg2;   // end offset
5531     const Register dest = c_rarg3;   // destination array
5532 
5533 #ifndef _WIN64
5534     const Register dp = c_rarg4;    // Position for writing to dest array
5535     const Register isURL = c_rarg5; // Base64 or URL character set
5536 #else
5537     const Address dp_mem(rbp, 6 * wordSize); // dp is passed on the stack on Win64
5538     const Address isURL_mem(rbp, 7 * wordSize);
5539     const Register isURL = r10; // pick the volatile windows register
5540     const Register dp = r12;
5541     __ movl(dp, dp_mem);
5542     __ movl(isURL, isURL_mem);
5543 #endif
5544 
5545     const Register length = r14;
5546     const Register encode_table = r13;
5547     Label L_process3, L_exit, L_processdata, L_vbmiLoop, L_not512, L_32byteLoop;
5548 
5549     // calculate length from offsets
5550     __ movl(length, end_offset);
5551     __ subl(length, start_offset);
5552     __ cmpl(length, 0);
5553     __ jcc(Assembler::lessEqual, L_exit);
5554 
5555     // Code for 512-bit VBMI encoding.  Encodes 48 input bytes into 64
5556     // output bytes. We read 64 input bytes and ignore the last 16, so be
5557     // sure not to read past the end of the input buffer.
5558     if (VM_Version::supports_avx512_vbmi()) {
5559       __ cmpl(length, 64); // Do not overrun input buffer.
5560       __ jcc(Assembler::below, L_not512);
5561 
5562       __ shll(isURL, 6); // index into the encoding table based on isURL
5563       __ lea(encode_table, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5564       __ addptr(encode_table, isURL);
5565       __ shrl(isURL, 6); // restore isURL
5566 
5567       __ mov64(rax, 0x3036242a1016040aull); // Shifts
5568       __ evmovdquq(xmm3, ExternalAddress(StubRoutines::x86::base64_shuffle_addr()), Assembler::AVX_512bit, r15);
5569       __ evmovdquq(xmm2, Address(encode_table, 0), Assembler::AVX_512bit);
5570       __ evpbroadcastq(xmm1, rax, Assembler::AVX_512bit);
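           // How the constants above work together (each qword holds two 3-byte
           // groups once the vpermb below has shuffled them into place): the
           // per-byte bit offsets 10, 4, 22, 16 (plus 32 for the second group)
           // drive vpmultishiftqb, which copies the 8 bits starting at each
           // offset into the corresponding output byte, leaving one 6-bit value
           // in the low bits of every byte; vpermb then uses those low 6 bits
           // to index the 64-byte encoding table in xmm2.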
5571 
5572       __ align32();
5573       __ BIND(L_vbmiLoop);
5574 
5575       __ vpermb(xmm0, xmm3, Address(source, start_offset), Assembler::AVX_512bit);
5576       __ subl(length, 48);
5577 
5578       // Put the input bytes into the proper lanes for writing, then
5579       // encode them.
5580       __ evpmultishiftqb(xmm0, xmm1, xmm0, Assembler::AVX_512bit);
5581       __ vpermb(xmm0, xmm0, xmm2, Assembler::AVX_512bit);
5582 
5583       // Write to destination
5584       __ evmovdquq(Address(dest, dp), xmm0, Assembler::AVX_512bit);
5585 
5586       __ addptr(dest, 64);
5587       __ addptr(source, 48);
5588       __ cmpl(length, 64);
5589       __ jcc(Assembler::aboveEqual, L_vbmiLoop);
5590 
5591       __ vzeroupper();
5592     }
5593 
5594     __ BIND(L_not512);
5595     if (VM_Version::supports_avx2()
5596         && VM_Version::supports_avx512vlbw()) {
5597       /*
5598       ** This AVX2 encoder is based on the paper at:
5599       **      https://dl.acm.org/doi/10.1145/3132709
5600       **
5601       ** We use AVX2 SIMD instructions to encode 24 bytes into 32
5602       ** output bytes.
5603       **
5604       */
5605       // Lengths under 32 bytes are handled by the scalar routine
5606       __ cmpl(length, 31);
5607       __ jcc(Assembler::belowEqual, L_process3);
5608 
5609       // Set up supporting constant table data
5610       __ vmovdqu(xmm9, ExternalAddress(StubRoutines::x86::base64_avx2_shuffle_addr()), rax);
5611       // 6-bit mask for 2nd and 4th (and multiples) 6-bit values
5612       __ movl(rax, 0x0fc0fc00);
5613       __ vmovdqu(xmm1, ExternalAddress(StubRoutines::x86::base64_avx2_input_mask_addr()), rax);
5614       __ evpbroadcastd(xmm8, rax, Assembler::AVX_256bit);
5615 
5616       // Multiplication constant for "shifting" right by 6 and 10
5617       // bits
5618       __ movl(rax, 0x04000040);
5619 
5620       __ subl(length, 24);
5621       __ evpbroadcastd(xmm7, rax, Assembler::AVX_256bit);
5622 
5623       // For the first load, we mask off reading of the first 4
5624       // bytes into the register. This is so we can get 4 3-byte
5625       // chunks into each lane of the register, avoiding having to
5626       // handle end conditions.  We then shuffle these bytes into a
5627       // specific order so that manipulation is easier.
5628       //
5629       // The initial read loads the XMM register like this:
5630       //
5631       // Lower 128-bit lane:
5632       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5633       // | XX | XX | XX | XX | A0 | A1 | A2 | B0 | B1 | B2 | C0 | C1
5634       // | C2 | D0 | D1 | D2 |
5635       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5636       //
5637       // Upper 128-bit lane:
5638       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5639       // | E0 | E1 | E2 | F0 | F1 | F2 | G0 | G1 | G2 | H0 | H1 | H2
5640       // | XX | XX | XX | XX |
5641       // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5642       //
5643       // Where A0 is the first input byte, B0 is the fourth, etc.
5644       // The alphabetical significance denotes the 3 bytes to be
5645       // consumed and encoded into 4 bytes.
5646       //
5647       // We then shuffle the register so each 32-bit word contains
5648       // the sequence:
5649       //    A1 A0 A2 A1, B1, B0, B2, B1, etc.
5650       // Each of these byte sequences are then manipulated into 4
5651       // 6-bit values ready for encoding.
5652       //
5653       // If we focus on one set of 3-byte chunks, changing the
5654       // nomenclature such that A0 => a, A1 => b, and A2 => c, we
5655       // shuffle such that each 24-bit chunk contains:
5656       //
5657       // b7 b6 b5 b4 b3 b2 b1 b0 | a7 a6 a5 a4 a3 a2 a1 a0 | c7 c6
5658       // c5 c4 c3 c2 c1 c0 | b7 b6 b5 b4 b3 b2 b1 b0
5659       // (Below, the same word is shown again, with a..d now naming the four 6-bit output groups rather than the three input bytes.)
5660       // b3 b2 b1 b0 c5 c4 c3 c2 | c1 c0 d5 d4 d3 d2 d1 d0 | a5 a4
5661       // a3 a2 a1 a0 b5 b4 | b3 b2 b1 b0 c5 c4 c3 c2
5662       //
5663       // We first mask off all but bits 4-9 and 16-21 (c5..c0 and
5664       // a5..a0) and shift them using a vector multiplication
5665       // operation (vpmulhuw) which effectively shifts c right by 6
5666       // bits and a right by 10 bits.  We similarly mask bits 10-15
5667       // (d5..d0) and 22-27 (b5..b0) and shift them left by 8 and 4
5668       // bits respectively.  This is done using vpmullw.  We end up
5669       // with 4 6-bit values, thus splitting the 3 input bytes,
5670       // ready for encoding:
5671       //    0 0 d5..d0 0 0 c5..c0 0 0 b5..b0 0 0 a5..a0
5672       //
5673       // For translation, we recognize that there are 5 distinct
5674       // ranges of legal Base64 characters as below:
5675       //
5676       //   +-------------+-------------+------------+
5677       //   | 6-bit value | ASCII range |   offset   |
5678       //   +-------------+-------------+------------+
5679       //   |    0..25    |    A..Z     |     65     |
5680       //   |   26..51    |    a..z     |     71     |
5681       //   |   52..61    |    0..9     |     -4     |
5682       //   |     62      |   + or -    | -19 or -17 |
5683       //   |     63      |   / or _    | -16 or 32  |
5684       //   +-------------+-------------+------------+
5685       //
5686       // We note that vpshufb does a parallel lookup in a
5687       // destination register using the lower 4 bits of bytes from a
5688       // source register.  If we use a saturated subtraction and
5689       // subtract 51 from each 6-bit value, bytes from [0,51]
5690       // saturate to 0, and [52,63] map to a range of [1,12].  We then
5691       // distinguish the [0,25] and [26,51] ranges by adding one to the
5692       // reduced value wherever the 6-bit value is greater than 25 (a
5693       // signed byte compare against 25).  We end up with:
5694       //
5695       //   +-------------+-------------+------------+
5696       //   | 6-bit value |   Reduced   |   offset   |
5697       //   +-------------+-------------+------------+
5698       //   |    0..25    |      0      |     65     |
5699       //   |   26..51    |      1      |     71     |
5700       //   |   52..61    |    2..11    |     -4     |
5701       //   |     62      |     12      | -19 or -17 |
5702       //   |     63      |     13      | -16 or 32  |
5703       //   +-------------+-------------+------------+
5704       //
5705       // We then use a final vpshufb to add the appropriate offset,
5706       // translating the bytes.
5707       //
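           // A rough scalar equivalent of the translation (illustration only,
           // not emitted code):
           //   int reduced = max(value - 51, 0) + (value > 25 ? 1 : 0);
           //   output     = value + offset_lut[reduced];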
5708       // Load input bytes - only 28 bytes.  Mask the first load to
5709       // not load into the full register.
5710       __ vpmaskmovd(xmm1, xmm1, Address(source, start_offset, Address::times_1, -4), Assembler::AVX_256bit);
5711 
5712       // Move 3-byte chunks of input (12 bytes) into 16 bytes,
5713       // ordering by:
5714       //   1, 0, 2, 1; 4, 3, 5, 4; etc.  This groups 6-bit chunks
5715       //   for easy masking
5716       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5717 
5718       __ addl(start_offset, 24);
5719 
5720       // Load masking register for first and third (and multiples)
5721       // 6-bit values.
5722       __ movl(rax, 0x003f03f0);
5723       __ evpbroadcastd(xmm6, rax, Assembler::AVX_256bit);
5724       // Multiplication constant for "shifting" left by 4 and 8 bits
5725       __ movl(rax, 0x01000010);
5726       __ evpbroadcastd(xmm5, rax, Assembler::AVX_256bit);
5727 
5728       // Isolate 6-bit chunks of interest
5729       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5730 
5731       // Load constants for encoding
5732       __ movl(rax, 0x19191919);
5733       __ evpbroadcastd(xmm3, rax, Assembler::AVX_256bit);
5734       __ movl(rax, 0x33333333);
5735       __ evpbroadcastd(xmm4, rax, Assembler::AVX_256bit);
5736 
5737       // Shift output bytes 0 and 2 into proper lanes
5738       __ vpmulhuw(xmm2, xmm0, xmm7, Assembler::AVX_256bit);
5739 
5740       // Mask and shift output bytes 1 and 3 into proper lanes and
5741       // combine
5742       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5743       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5744       __ vpor(xmm0, xmm0, xmm2, Assembler::AVX_256bit);
5745 
5746       // Find out which are 0..25.  This indicates which input
5747       // values fall in the range of 'A'-'Z', which require an
5748       // additional offset (see comments above)
5749       __ vpcmpgtb(xmm2, xmm0, xmm3, Assembler::AVX_256bit);
5750       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5751       __ vpsubb(xmm1, xmm1, xmm2, Assembler::AVX_256bit);
5752 
5753       // Load the proper lookup table
5754       __ lea(r11, ExternalAddress(StubRoutines::x86::base64_avx2_lut_addr()));
5755       __ movl(r15, isURL);
5756       __ shll(r15, 5);
5757       __ vmovdqu(xmm2, Address(r11, r15));
5758 
5759       // Shuffle the offsets based on the range calculation done
5760       // above. This allows us to add the correct offset to the
5761       // 6-bit value corresponding to the range documented above.
5762       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5763       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5764 
5765       // Store the encoded bytes
5766       __ vmovdqu(Address(dest, dp), xmm0);
5767       __ addl(dp, 32);
5768 
5769       __ cmpl(length, 31);
5770       __ jcc(Assembler::belowEqual, L_process3);
5771 
5772       __ align32();
5773       __ BIND(L_32byteLoop);
5774 
5775       // Get next 32 bytes
5776       __ vmovdqu(xmm1, Address(source, start_offset, Address::times_1, -4));
5777 
5778       __ subl(length, 24);
5779       __ addl(start_offset, 24);
5780 
5781       // This logic is identical to the above, with only constant
5782       // register loads removed.  Shuffle the input, mask off 6-bit
5783       // chunks, shift them into place, then add the offset to
5784       // encode.
5785       __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5786 
5787       __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5788       __ vpmulhuw(xmm10, xmm0, xmm7, Assembler::AVX_256bit);
5789       __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5790       __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5791       __ vpor(xmm0, xmm0, xmm10, Assembler::AVX_256bit);
5792       __ vpcmpgtb(xmm10, xmm0, xmm3, Assembler::AVX_256bit);
5793       __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5794       __ vpsubb(xmm1, xmm1, xmm10, Assembler::AVX_256bit);
5795       __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5796       __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5797 
5798       // Store the encoded bytes
5799       __ vmovdqu(Address(dest, dp), xmm0);
5800       __ addl(dp, 32);
5801 
5802       __ cmpl(length, 31);
5803       __ jcc(Assembler::above, L_32byteLoop);
5804 
5805       __ BIND(L_process3);
5806       __ vzeroupper();
5807     } else {
5808       __ BIND(L_process3);
5809     }
5810 
5811     __ cmpl(length, 3);
5812     __ jcc(Assembler::below, L_exit);
5813 
5814     // Load the encoding table based on isURL
5815     __ lea(r11, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5816     __ movl(r15, isURL);
5817     __ shll(r15, 6);
5818     __ addptr(r11, r15);
5819 
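         // Scalar loop: encode 3 input bytes into 4 output characters per pass.
         // For example, the input "Man" (0x4d, 0x61, 0x6e) splits into the 6-bit
         // values 19, 22, 5, 46, which the table lookups turn into "TWFu".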
5820     __ BIND(L_processdata);
5821 
5822     // Load 3 bytes
5823     __ load_unsigned_byte(r15, Address(source, start_offset));
5824     __ load_unsigned_byte(r10, Address(source, start_offset, Address::times_1, 1));
5825     __ load_unsigned_byte(r13, Address(source, start_offset, Address::times_1, 2));
5826 
5827     // Build a 32-bit word with bytes 1, 2, 0, 1
5828     __ movl(rax, r10);
5829     __ shll(r10, 24);
5830     __ orl(rax, r10);
5831 
5832     __ subl(length, 3);
5833 
5834     __ shll(r15, 8);
5835     __ shll(r13, 16);
5836     __ orl(rax, r15);
5837 
5838     __ addl(start_offset, 3);
5839 
5840     __ orl(rax, r13);
5841     // At this point, rax contains | byte1 | byte2 | byte0 | byte1
5842     // r13 has byte2 << 16 - need low-order 6 bits to translate.
5843     // This translated byte is the fourth output byte.
5844     __ shrl(r13, 16);
5845     __ andl(r13, 0x3f);
5846 
5847     // The high-order 6 bits of r15 (byte0) are translated.
5848     // The translated byte is the first output byte.
5849     __ shrl(r15, 10);
5850 
5851     __ load_unsigned_byte(r13, Address(r11, r13));
5852     __ load_unsigned_byte(r15, Address(r11, r15));
5853 
5854     __ movb(Address(dest, dp, Address::times_1, 3), r13);
5855 
5856     // Extract high-order 4 bits of byte1 and low-order 2 bits of byte0.
5857     // This translated byte is the second output byte.
5858     __ shrl(rax, 4);
5859     __ movl(r10, rax);
5860     __ andl(rax, 0x3f);
5861 
5862     __ movb(Address(dest, dp, Address::times_1, 0), r15);
5863 
5864     __ load_unsigned_byte(rax, Address(r11, rax));
5865 
5866     // Extract the low-order 4 bits of byte1 and the high-order 2 bits of byte2.
5867     // This translated byte is the third output byte.
5868     __ shrl(r10, 18);
5869     __ andl(r10, 0x3f);
5870 
5871     __ load_unsigned_byte(r10, Address(r11, r10));
5872 
5873     __ movb(Address(dest, dp, Address::times_1, 1), rax);
5874     __ movb(Address(dest, dp, Address::times_1, 2), r10);
5875 
5876     __ addl(dp, 4);
5877     __ cmpl(length, 3);
5878     __ jcc(Assembler::aboveEqual, L_processdata);
5879 
5880     __ BIND(L_exit);
5881     __ pop(r15);
5882     __ pop(r14);
5883     __ pop(r13);
5884     __ pop(r12);
5885     __ leave();
5886     __ ret(0);
5887     return start;
5888   }
5889 
5890   // base64 AVX512vbmi tables
5891   address base64_vbmi_lookup_lo_addr() {
5892     __ align64();
5893     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64");
5894     address start = __ pc();
5895     assert(((unsigned long long)start & 0x3f) == 0,
5896            "Alignment problem (0x%08llx)", (unsigned long long)start);
5897     __ emit_data64(0x8080808080808080, relocInfo::none);
5898     __ emit_data64(0x8080808080808080, relocInfo::none);
5899     __ emit_data64(0x8080808080808080, relocInfo::none);
5900     __ emit_data64(0x8080808080808080, relocInfo::none);
5901     __ emit_data64(0x8080808080808080, relocInfo::none);
5902     __ emit_data64(0x3f8080803e808080, relocInfo::none);
5903     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5904     __ emit_data64(0x8080808080803d3c, relocInfo::none);
5905     return start;
5906   }
5907 
5908   address base64_vbmi_lookup_hi_addr() {
5909     __ align64();
5910     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64");
5911     address start = __ pc();
5912     assert(((unsigned long long)start & 0x3f) == 0,
5913            "Alignment problem (0x%08llx)", (unsigned long long)start);
5914     __ emit_data64(0x0605040302010080, relocInfo::none);
5915     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5916     __ emit_data64(0x161514131211100f, relocInfo::none);
5917     __ emit_data64(0x8080808080191817, relocInfo::none);
5918     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5919     __ emit_data64(0x2827262524232221, relocInfo::none);
5920     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5921     __ emit_data64(0x8080808080333231, relocInfo::none);
5922     return start;
5923   }
5924   address base64_vbmi_lookup_lo_url_addr() {
5925     __ align64();
5926     StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64url");
5927     address start = __ pc();
5928     assert(((unsigned long long)start & 0x3f) == 0,
5929            "Alignment problem (0x%08llx)", (unsigned long long)start);
5930     __ emit_data64(0x8080808080808080, relocInfo::none);
5931     __ emit_data64(0x8080808080808080, relocInfo::none);
5932     __ emit_data64(0x8080808080808080, relocInfo::none);
5933     __ emit_data64(0x8080808080808080, relocInfo::none);
5934     __ emit_data64(0x8080808080808080, relocInfo::none);
5935     __ emit_data64(0x80803e8080808080, relocInfo::none);
5936     __ emit_data64(0x3b3a393837363534, relocInfo::none);
5937     __ emit_data64(0x8080808080803d3c, relocInfo::none);
5938     return start;
5939   }
5940 
5941   address base64_vbmi_lookup_hi_url_addr() {
5942     __ align64();
5943     StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64url");
5944     address start = __ pc();
5945     assert(((unsigned long long)start & 0x3f) == 0,
5946            "Alignment problem (0x%08llx)", (unsigned long long)start);
5947     __ emit_data64(0x0605040302010080, relocInfo::none);
5948     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5949     __ emit_data64(0x161514131211100f, relocInfo::none);
5950     __ emit_data64(0x3f80808080191817, relocInfo::none);
5951     __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5952     __ emit_data64(0x2827262524232221, relocInfo::none);
5953     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5954     __ emit_data64(0x8080808080333231, relocInfo::none);
5955     return start;
5956   }
5957 
5958   address base64_vbmi_pack_vec_addr() {
5959     __ align64();
5960     StubCodeMark mark(this, "StubRoutines", "pack_vec_base64");
5961     address start = __ pc();
5962     assert(((unsigned long long)start & 0x3f) == 0,
5963            "Alignment problem (0x%08llx)", (unsigned long long)start);
5964     __ emit_data64(0x090a040506000102, relocInfo::none);
5965     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
5966     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5967     __ emit_data64(0x292a242526202122, relocInfo::none);
5968     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5969     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5970     __ emit_data64(0x0000000000000000, relocInfo::none);
5971     __ emit_data64(0x0000000000000000, relocInfo::none);
5972     return start;
5973   }
5974 
5975   address base64_vbmi_join_0_1_addr() {
5976     __ align64();
5977     StubCodeMark mark(this, "StubRoutines", "join_0_1_base64");
5978     address start = __ pc();
5979     assert(((unsigned long long)start & 0x3f) == 0,
5980            "Alignment problem (0x%08llx)", (unsigned long long)start);
5981     __ emit_data64(0x090a040506000102, relocInfo::none);
5982     __ emit_data64(0x161011120c0d0e08, relocInfo::none);
5983     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5984     __ emit_data64(0x292a242526202122, relocInfo::none);
5985     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5986     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5987     __ emit_data64(0x494a444546404142, relocInfo::none);
5988     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
5989     return start;
5990   }
5991 
5992   address base64_vbmi_join_1_2_addr() {
5993     __ align64();
5994     StubCodeMark mark(this, "StubRoutines", "join_1_2_base64");
5995     address start = __ pc();
5996     assert(((unsigned long long)start & 0x3f) == 0,
5997            "Alignment problem (0x%08llx)", (unsigned long long)start);
5998     __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5999     __ emit_data64(0x292a242526202122, relocInfo::none);
6000     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
6001     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
6002     __ emit_data64(0x494a444546404142, relocInfo::none);
6003     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
6004     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
6005     __ emit_data64(0x696a646566606162, relocInfo::none);
6006     return start;
6007   }
6008 
6009   address base64_vbmi_join_2_3_addr() {
6010     __ align64();
6011     StubCodeMark mark(this, "StubRoutines", "join_2_3_base64");
6012     address start = __ pc();
6013     assert(((unsigned long long)start & 0x3f) == 0,
6014            "Alignment problem (0x%08llx)", (unsigned long long)start);
6015     __ emit_data64(0x363031322c2d2e28, relocInfo::none);
6016     __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
6017     __ emit_data64(0x494a444546404142, relocInfo::none);
6018     __ emit_data64(0x565051524c4d4e48, relocInfo::none);
6019     __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
6020     __ emit_data64(0x696a646566606162, relocInfo::none);
6021     __ emit_data64(0x767071726c6d6e68, relocInfo::none);
6022     __ emit_data64(0x7c7d7e78797a7475, relocInfo::none);
6023     return start;
6024   }
6025 
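       // Maps an ASCII code to its 6-bit value; 0xff marks bytes outside the
       // Base64 alphabet.  A second 256-byte table for the URL-safe alphabet
       // ('-' and '_' instead of '+' and '/') follows at offset 256.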
6026   address base64_decoding_table_addr() {
6027     StubCodeMark mark(this, "StubRoutines", "decoding_table_base64");
6028     address start = __ pc();
6029     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6030     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6031     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6032     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6033     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6034     __ emit_data64(0x3fffffff3effffff, relocInfo::none);
6035     __ emit_data64(0x3b3a393837363534, relocInfo::none);
6036     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
6037     __ emit_data64(0x06050403020100ff, relocInfo::none);
6038     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6039     __ emit_data64(0x161514131211100f, relocInfo::none);
6040     __ emit_data64(0xffffffffff191817, relocInfo::none);
6041     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
6042     __ emit_data64(0x2827262524232221, relocInfo::none);
6043     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6044     __ emit_data64(0xffffffffff333231, relocInfo::none);
6045     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6046     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6047     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6048     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6049     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6050     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6051     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6052     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6053     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6054     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6055     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6056     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6057     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6058     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6059     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6060     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6061 
6062     // URL table
6063     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6064     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6065     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6066     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6067     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6068     __ emit_data64(0xffff3effffffffff, relocInfo::none);
6069     __ emit_data64(0x3b3a393837363534, relocInfo::none);
6070     __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
6071     __ emit_data64(0x06050403020100ff, relocInfo::none);
6072     __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6073     __ emit_data64(0x161514131211100f, relocInfo::none);
6074     __ emit_data64(0x3fffffffff191817, relocInfo::none);
6075     __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
6076     __ emit_data64(0x2827262524232221, relocInfo::none);
6077     __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6078     __ emit_data64(0xffffffffff333231, relocInfo::none);
6079     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6080     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6081     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6082     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6083     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6084     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6085     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6086     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6087     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6088     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6089     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6090     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6091     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6092     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6093     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6094     __ emit_data64(0xffffffffffffffff, relocInfo::none);
6095     return start;
6096   }
6097 
6098 
6099 // Code for generating Base64 decoding.
6100 //
6101 // Based on the article (and associated code) from https://arxiv.org/abs/1910.05109.
6102 //
6103 // Intrinsic function prototype in Base64.java:
6104 // private int decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME) {
6105   address generate_base64_decodeBlock() {
6106     __ align(CodeEntryAlignment);
6107     StubCodeMark mark(this, "StubRoutines", "implDecode");
6108     address start = __ pc();
6109     __ enter();
6110 
6111     // Save callee-saved registers before using them
6112     __ push(r12);
6113     __ push(r13);
6114     __ push(r14);
6115     __ push(r15);
6116     __ push(rbx);
6117 
6118     // arguments
6119     const Register source = c_rarg0; // Source Array
6120     const Register start_offset = c_rarg1; // start offset
6121     const Register end_offset = c_rarg2; // end offset
6122     const Register dest = c_rarg3; // destination array
6123     const Register isMIME = rbx;
6124 
6125 #ifndef _WIN64
6126     const Register dp = c_rarg4;  // Position for writing to dest array
6127     const Register isURL = c_rarg5;// Base64 or URL character set
6128     __ movl(isMIME, Address(rbp, 2 * wordSize));
6129 #else
6130     const Address  dp_mem(rbp, 6 * wordSize);  // dp is passed on the stack on Win64
6131     const Address isURL_mem(rbp, 7 * wordSize);
6132     const Register isURL = r10;      // pick the volatile windows register
6133     const Register dp = r12;
6134     __ movl(dp, dp_mem);
6135     __ movl(isURL, isURL_mem);
6136     __ movl(isMIME, Address(rbp, 8 * wordSize));
6137 #endif
6138 
6139     const XMMRegister lookup_lo = xmm5;
6140     const XMMRegister lookup_hi = xmm6;
6141     const XMMRegister errorvec = xmm7;
6142     const XMMRegister pack16_op = xmm9;
6143     const XMMRegister pack32_op = xmm8;
6144     const XMMRegister input0 = xmm3;
6145     const XMMRegister input1 = xmm20;
6146     const XMMRegister input2 = xmm21;
6147     const XMMRegister input3 = xmm19;
6148     const XMMRegister join01 = xmm12;
6149     const XMMRegister join12 = xmm11;
6150     const XMMRegister join23 = xmm10;
6151     const XMMRegister translated0 = xmm2;
6152     const XMMRegister translated1 = xmm1;
6153     const XMMRegister translated2 = xmm0;
6154     const XMMRegister translated3 = xmm4;
6155 
6156     const XMMRegister merged0 = xmm2;
6157     const XMMRegister merged1 = xmm1;
6158     const XMMRegister merged2 = xmm0;
6159     const XMMRegister merged3 = xmm4;
6160     const XMMRegister merge_ab_bc0 = xmm2;
6161     const XMMRegister merge_ab_bc1 = xmm1;
6162     const XMMRegister merge_ab_bc2 = xmm0;
6163     const XMMRegister merge_ab_bc3 = xmm4;
6164 
6165     const XMMRegister pack24bits = xmm4;
6166 
6167     const Register length = r14;
6168     const Register output_size = r13;
6169     const Register output_mask = r15;
6170     const KRegister input_mask = k1;
6171 
6172     const XMMRegister input_initial_valid_b64 = xmm0;
6173     const XMMRegister tmp = xmm10;
6174     const XMMRegister mask = xmm0;
6175     const XMMRegister invalid_b64 = xmm1;
6176 
6177     Label L_process256, L_process64, L_process64Loop, L_exit, L_processdata, L_loadURL;
6178     Label L_continue, L_finalBit, L_padding, L_donePadding, L_bruteForce;
6179     Label L_forceLoop, L_bottomLoop, L_checkMIME, L_exit_no_vzero;
6180 
6181     // calculate length from offsets
6182     __ movl(length, end_offset);
6183     __ subl(length, start_offset);
6184     __ push(dest);          // Save for return value calc
6185 
6186     // If AVX512 VBMI is not supported, only the non-AVX code below is generated
6187     if (VM_Version::supports_avx512_vbmi() &&
6188        VM_Version::supports_avx512bw()) {
6189       __ cmpl(length, 128);     // 128 bytes is the break-even point for AVX-512
6190       __ jcc(Assembler::lessEqual, L_bruteForce);
6191 
6192       __ cmpl(isMIME, 0);
6193       __ jcc(Assembler::notEqual, L_bruteForce);
6194 
6195       // Load lookup tables based on isURL
6196       __ cmpl(isURL, 0);
6197       __ jcc(Assembler::notZero, L_loadURL);
6198 
6199       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_addr()), Assembler::AVX_512bit, r13);
6200       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_addr()), Assembler::AVX_512bit, r13);
6201 
6202       __ BIND(L_continue);
6203 
6204       __ movl(r15, 0x01400140);
6205       __ evpbroadcastd(pack16_op, r15, Assembler::AVX_512bit);
6206 
6207       __ movl(r15, 0x00011000);
6208       __ evpbroadcastd(pack32_op, r15, Assembler::AVX_512bit);
6209 
6210       __ cmpl(length, 0xff);
6211       __ jcc(Assembler::lessEqual, L_process64);
6212 
6213       // load masks required for decoding data
6214       __ BIND(L_processdata);
6215       __ evmovdquq(join01, ExternalAddress(StubRoutines::x86::base64_vbmi_join_0_1_addr()), Assembler::AVX_512bit, r13);
6216       __ evmovdquq(join12, ExternalAddress(StubRoutines::x86::base64_vbmi_join_1_2_addr()), Assembler::AVX_512bit, r13);
6217       __ evmovdquq(join23, ExternalAddress(StubRoutines::x86::base64_vbmi_join_2_3_addr()), Assembler::AVX_512bit, r13);
6218 
6219       __ align32();
6220       __ BIND(L_process256);
6221       // Grab input data
6222       __ evmovdquq(input0, Address(source, start_offset, Address::times_1, 0x00), Assembler::AVX_512bit);
6223       __ evmovdquq(input1, Address(source, start_offset, Address::times_1, 0x40), Assembler::AVX_512bit);
6224       __ evmovdquq(input2, Address(source, start_offset, Address::times_1, 0x80), Assembler::AVX_512bit);
6225       __ evmovdquq(input3, Address(source, start_offset, Address::times_1, 0xc0), Assembler::AVX_512bit);
6226 
6227       // Copy the low part of the lookup table into the destination of the permutation
6228       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6229       __ evmovdquq(translated1, lookup_lo, Assembler::AVX_512bit);
6230       __ evmovdquq(translated2, lookup_lo, Assembler::AVX_512bit);
6231       __ evmovdquq(translated3, lookup_lo, Assembler::AVX_512bit);
6232 
6233       // Translate the base64 input into "decoded" bytes
6234       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6235       __ evpermt2b(translated1, input1, lookup_hi, Assembler::AVX_512bit);
6236       __ evpermt2b(translated2, input2, lookup_hi, Assembler::AVX_512bit);
6237       __ evpermt2b(translated3, input3, lookup_hi, Assembler::AVX_512bit);
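           // evpermt2b treats lookup_lo:lookup_hi as one 128-byte table indexed
           // by the low 7 bits of each input character.  Valid characters map to
           // their 6-bit value, invalid ones map to 0x80, and input bytes >= 0x80
           // contribute their own high bit, so the OR-combining below exposes any
           // error in the sign bit of some byte.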
6238 
6239       // OR all of the translations together to check for errors (high-order bit of byte set)
6240       __ vpternlogd(input0, 0xfe, input1, input2, Assembler::AVX_512bit);
6241 
6242       __ vpternlogd(input3, 0xfe, translated0, translated1, Assembler::AVX_512bit);
6243       __ vpternlogd(input0, 0xfe, translated2, translated3, Assembler::AVX_512bit);
6244       __ vpor(errorvec, input3, input0, Assembler::AVX_512bit);
6245 
6246       // Check if there was an error - if so, try 64-byte chunks
6247       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6248       __ kortestql(k3, k3);
6249       __ jcc(Assembler::notZero, L_process64);
6250 
6251       // The merging and shuffling happens here
6252       // We multiply each byte pair [00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa]
6253       // Multiply [00cccccc] by 2^6 added to [00dddddd] to get [0000cccc | ccdddddd]
6254       // The pack16_op is a vector of 0x01400140, so multiply D by 1 and C by 0x40
6255       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6256       __ vpmaddubsw(merge_ab_bc1, translated1, pack16_op, Assembler::AVX_512bit);
6257       __ vpmaddubsw(merge_ab_bc2, translated2, pack16_op, Assembler::AVX_512bit);
6258       __ vpmaddubsw(merge_ab_bc3, translated3, pack16_op, Assembler::AVX_512bit);
6259 
6260       // Now do the same with packed 16-bit values.
6261       // We start with [0000cccc | ccdddddd | 0000aaaa | aabbbbbb]
6262       // pack32_op is 0x00011000 (2^12, 1), so this multiplies [0000aaaa | aabbbbbb] by 2^12
6263       // and adds [0000cccc | ccdddddd] to yield [00000000 | aaaaaabb | bbbbcccc | ccdddddd]
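           // Worked example for one dword (decoding "TWFu" back to "Man"): the
           // translated bytes are 0x13, 0x16, 0x05, 0x2e (low to high).
           // vpmaddubsw gives the words 0x13*0x40 + 0x16 = 0x04d6 and
           // 0x05*0x40 + 0x2e = 0x016e; vpmaddwd then gives
           // 0x04d6*0x1000 + 0x016e = 0x004d616e, i.e. "Man" plus a zero byte
           // that the join permutation below discards.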
6264       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6265       __ vpmaddwd(merged1, merge_ab_bc1, pack32_op, Assembler::AVX_512bit);
6266       __ vpmaddwd(merged2, merge_ab_bc2, pack32_op, Assembler::AVX_512bit);
6267       __ vpmaddwd(merged3, merge_ab_bc3, pack32_op, Assembler::AVX_512bit);
6268 
6269       // The join vectors specify which byte from which vector goes into the outputs
6270       // One of every 4 bytes in the extended vector is zero, so we pack them into their
6271       // final positions in the register for storing (256 bytes in, 192 bytes out)
6272       __ evpermt2b(merged0, join01, merged1, Assembler::AVX_512bit);
6273       __ evpermt2b(merged1, join12, merged2, Assembler::AVX_512bit);
6274       __ evpermt2b(merged2, join23, merged3, Assembler::AVX_512bit);
6275 
6276       // Store result
6277       __ evmovdquq(Address(dest, dp, Address::times_1, 0x00), merged0, Assembler::AVX_512bit);
6278       __ evmovdquq(Address(dest, dp, Address::times_1, 0x40), merged1, Assembler::AVX_512bit);
6279       __ evmovdquq(Address(dest, dp, Address::times_1, 0x80), merged2, Assembler::AVX_512bit);
6280 
6281       __ addptr(source, 0x100);
6282       __ addptr(dest, 0xc0);
6283       __ subl(length, 0x100);
6284       __ cmpl(length, 64 * 4);
6285       __ jcc(Assembler::greaterEqual, L_process256);
6286 
6287       // At this point, we've decoded 64 * 4 * n bytes.
6288       // The remaining length will be <= 64 * 4 - 1.
6289       // UNLESS there was an error decoding the first 256-byte chunk.  In this
6290       // case, the length will be arbitrarily long.
6291       //
6292       // Note that this will be the path for MIME-encoded strings.
6293 
6294       __ BIND(L_process64);
6295 
6296       __ evmovdquq(pack24bits, ExternalAddress(StubRoutines::x86::base64_vbmi_pack_vec_addr()), Assembler::AVX_512bit, r13);
6297 
6298       __ cmpl(length, 63);
6299       __ jcc(Assembler::lessEqual, L_finalBit);
6300 
6301       __ align32();
6302       __ BIND(L_process64Loop);
6303 
6304       // Handle first 64-byte block
6305 
6306       __ evmovdquq(input0, Address(source, start_offset), Assembler::AVX_512bit);
6307       __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6308       __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6309 
6310       __ vpor(errorvec, translated0, input0, Assembler::AVX_512bit);
6311 
6312       // Check for error and bomb out before updating dest
6313       __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6314       __ kortestql(k3, k3);
6315       __ jcc(Assembler::notZero, L_exit);
6316 
6317       // Pack output register, selecting correct byte ordering
6318       __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6319       __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6320       __ vpermb(merged0, pack24bits, merged0, Assembler::AVX_512bit);
6321 
6322       __ evmovdquq(Address(dest, dp), merged0, Assembler::AVX_512bit);
6323 
6324       __ subl(length, 64);
6325       __ addptr(source, 64);
6326       __ addptr(dest, 48);
6327 
6328       __ cmpl(length, 64);
6329       __ jcc(Assembler::greaterEqual, L_process64Loop);
6330 
6331       __ cmpl(length, 0);
6332       __ jcc(Assembler::lessEqual, L_exit);
6333 
6334       __ BIND(L_finalBit);
6335       // Now have 1 to 63 bytes left to decode
6336 
6337       // We could leave the final fragment to the Java code, but it would
6338       // call this routine repeatedly for every 4 bytes of input data, so
6339       // handle the remainder here instead.
6340       __ movq(rax, -1);
6341       __ bzhiq(rax, rax, length);    // Input mask in rax
6342 
6343       __ movl(output_size, length);
6344       __ shrl(output_size, 2);   // Find (len / 4) * 3 (output length)
6345       __ lea(output_size, Address(output_size, output_size, Address::times_2, 0));
6346       // output_size in r13
6347 
6348       // Strip pad characters, if any, and adjust the output size and input mask
6349       __ cmpb(Address(source, length, Address::times_1, -1), '=');
6350       __ jcc(Assembler::equal, L_padding);
6351 
6352       __ BIND(L_donePadding);
6353 
6354       // The output mask has the low output_size bits set (all 1s >> (64 - output_size)).
6355       __ kmovql(input_mask, rax);
6356       __ movq(output_mask, -1);
6357       __ bzhiq(output_mask, output_mask, output_size);
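           // bzhi clears all but the low output_size bits of -1, so output_mask
           // has exactly output_size bits set (e.g. output_size == 3 gives 0b111);
           // it becomes the byte-granular store mask moved into k1 further down.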
6358 
6359       // Load initial input with all valid base64 characters.  Will be used
6360       // in merging source bytes to avoid masking when determining if an error occurred.
6361       __ movl(rax, 0x61616161);
6362       __ evpbroadcastd(input_initial_valid_b64, rax, Assembler::AVX_512bit);
6363 
6364       // A register containing all invalid base64 decoded values
6365       __ movl(rax, 0x80808080);
6366       __ evpbroadcastd(invalid_b64, rax, Assembler::AVX_512bit);
6367 
6368       // input_mask is in k1
6369       // output_size is in r13
6370       // output_mask is in r15
6371       // zmm0 - free
6372       // zmm1 - 0x00011000
6373       // zmm2 - 0x01400140
6374       // zmm3 - errorvec
6375       // zmm4 - pack vector
6376       // zmm5 - lookup_lo
6377       // zmm6 - lookup_hi
6378       // zmm7 - errorvec
6379       // zmm8 - 0x61616161
6380       // zmm9 - 0x80808080
6381 
6382       // Load only the bytes from source, merging into our "fully-valid" register
6383       __ evmovdqub(input_initial_valid_b64, input_mask, Address(source, start_offset, Address::times_1, 0x0), true, Assembler::AVX_512bit);
6384 
6385       // Decode all bytes within our merged input
6386       __ evmovdquq(tmp, lookup_lo, Assembler::AVX_512bit);
6387       __ evpermt2b(tmp, input_initial_valid_b64, lookup_hi, Assembler::AVX_512bit);
6388       __ vporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);
6389 
6390       // Check for error.  Compare (decoded | initial) to all invalid.
6391       // If any bytes have their high-order bit set, then we have an error.
6392       __ evptestmb(k2, mask, invalid_b64, Assembler::AVX_512bit);
6393       __ kortestql(k2, k2);
6394 
6395       // If we have an error, use the brute force loop to decode what we can (4-byte chunks).
6396       __ jcc(Assembler::notZero, L_bruteForce);
6397 
6398       // Shuffle output bytes
6399       __ vpmaddubsw(tmp, tmp, pack16_op, Assembler::AVX_512bit);
6400       __ vpmaddwd(tmp, tmp, pack32_op, Assembler::AVX_512bit);
6401 
6402       __ vpermb(tmp, pack24bits, tmp, Assembler::AVX_512bit);
6403       __ kmovql(k1, output_mask);
6404       __ evmovdqub(Address(dest, dp), k1, tmp, true, Assembler::AVX_512bit);
6405 
6406       __ addptr(dest, output_size);
6407 
6408       __ BIND(L_exit);
6409       __ vzeroupper();
6410       __ pop(rax);             // Get original dest value
6411       __ subptr(dest, rax);      // Number of bytes converted
6412       __ movptr(rax, dest);
6413       __ pop(rbx);
6414       __ pop(r15);
6415       __ pop(r14);
6416       __ pop(r13);
6417       __ pop(r12);
6418       __ leave();
6419       __ ret(0);
6420 
6421       __ BIND(L_loadURL);
6422       __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_url_addr()), Assembler::AVX_512bit, r13);
6423       __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_url_addr()), Assembler::AVX_512bit, r13);
6424       __ jmp(L_continue);
6425 
6426       __ BIND(L_padding);
6427       __ decrementq(output_size, 1);
6428       __ shrq(rax, 1);
6429 
6430       __ cmpb(Address(source, length, Address::times_1, -2), '=');
6431       __ jcc(Assembler::notEqual, L_donePadding);
6432 
6433       __ decrementq(output_size, 1);
6434       __ shrq(rax, 1);
6435       __ jmp(L_donePadding);
6436 
6437       __ align32();
6438       __ BIND(L_bruteForce);
6439     }   // End of if(avx512_vbmi)
6440 
6441     // Use non-AVX code to decode 4-byte chunks into 3 bytes of output
6442 
6443     // Register state (Linux):
6444     // r12-15 - saved on stack
6445     // rdi - src
6446     // rsi - sp
6447     // rdx - sl
6448     // rcx - dst
6449     // r8 - dp
6450     // r9 - isURL
6451 
6452     // Register state (Windows):
6453     // r12-15 - saved on stack
6454     // rcx - src
6455     // rdx - sp
6456     // r8 - sl
6457     // r9 - dst
6458     // r12 - dp
6459     // r10 - isURL
6460 
6461     // Registers (common):
6462     // length (r14) - bytes in src
6463 
6464     const Register decode_table = r11;
6465     const Register out_byte_count = rbx;
6466     const Register byte1 = r13;
6467     const Register byte2 = r15;
6468     const Register byte3 = WINDOWS_ONLY(r8) NOT_WINDOWS(rdx);
6469     const Register byte4 = WINDOWS_ONLY(r10) NOT_WINDOWS(r9);
6470 
6471     __ shrl(length, 2);    // Multiple of 4 bytes only - length is # 4-byte chunks
6472     __ cmpl(length, 0);
6473     __ jcc(Assembler::lessEqual, L_exit_no_vzero);
6474 
6475     __ shll(isURL, 8);    // index into decode table based on isURL
6476     __ lea(decode_table, ExternalAddress(StubRoutines::x86::base64_decoding_table_addr()));
6477     __ addptr(decode_table, isURL);
6478 
6479     __ jmp(L_bottomLoop);
6480 
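         // Each iteration below combines four translated 6-bit values into one
         // 24-bit group, byte1 << 18 | byte2 << 12 | byte3 << 6 | byte4, and
         // stores its three bytes most-significant byte first.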
6481     __ align32();
6482     __ BIND(L_forceLoop);
6483     __ shll(byte1, 18);
6484     __ shll(byte2, 12);
6485     __ shll(byte3, 6);
6486     __ orl(byte1, byte2);
6487     __ orl(byte1, byte3);
6488     __ orl(byte1, byte4);
6489 
6490     __ addptr(source, 4);
6491 
6492     __ movb(Address(dest, dp, Address::times_1, 2), byte1);
6493     __ shrl(byte1, 8);
6494     __ movb(Address(dest, dp, Address::times_1, 1), byte1);
6495     __ shrl(byte1, 8);
6496     __ movb(Address(dest, dp, Address::times_1, 0), byte1);
6497 
6498     __ addptr(dest, 3);
6499     __ decrementl(length, 1);
6500     __ jcc(Assembler::zero, L_exit_no_vzero);
6501 
6502     __ BIND(L_bottomLoop);
6503     __ load_unsigned_byte(byte1, Address(source, start_offset, Address::times_1, 0x00));
6504     __ load_unsigned_byte(byte2, Address(source, start_offset, Address::times_1, 0x01));
6505     __ load_signed_byte(byte1, Address(decode_table, byte1));
6506     __ load_signed_byte(byte2, Address(decode_table, byte2));
6507     __ load_unsigned_byte(byte3, Address(source, start_offset, Address::times_1, 0x02));
6508     __ load_unsigned_byte(byte4, Address(source, start_offset, Address::times_1, 0x03));
6509     __ load_signed_byte(byte3, Address(decode_table, byte3));
6510     __ load_signed_byte(byte4, Address(decode_table, byte4));
6511 
6512     __ mov(rax, byte1);
6513     __ orl(rax, byte2);
6514     __ orl(rax, byte3);
6515     __ orl(rax, byte4);
6516     __ jcc(Assembler::positive, L_forceLoop);
6517 
6518     __ BIND(L_exit_no_vzero);
6519     __ pop(rax);             // Get original dest value
6520     __ subptr(dest, rax);      // Number of bytes converted
6521     __ movptr(rax, dest);
6522     __ pop(rbx);
6523     __ pop(r15);
6524     __ pop(r14);
6525     __ pop(r13);
6526     __ pop(r12);
6527     __ leave();
6528     __ ret(0);
6529 
6530     return start;
6531   }
6532 
6533 
6534   /**
6535    *  Arguments:
6536    *
6537    * Inputs:
6538    *   c_rarg0   - int crc
6539    *   c_rarg1   - byte* buf
6540    *   c_rarg2   - int length
6541    *
6542    * Output:
6543    *       rax   - int crc result
6544    */
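  // For reference (illustration only, not used by the stub): kernel_crc32 and its
  // AVX-512 variant compute the same result as the classic table-driven CRC-32
  // with the reflected polynomial 0xEDB88320, i.e. (up to where the bit
  // complements are applied):
  //   crc = ~crc;
  //   for (int i = 0; i < length; i++) {
  //     crc = crc_table[(crc ^ buf[i]) & 0xFF] ^ (crc >> 8);
  //   }
  //   return ~crc;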
6545   address generate_updateBytesCRC32() {
6546     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
6547 
6548     __ align(CodeEntryAlignment);
6549     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
6550 
6551     address start = __ pc();
6552     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6553     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6554     // rscratch1: r10
6555     const Register crc   = c_rarg0;  // crc
6556     const Register buf   = c_rarg1;  // source java byte array address
6557     const Register len   = c_rarg2;  // length
6558     const Register table = c_rarg3;  // crc_table address (reuse register)
6559     const Register tmp1   = r11;
6560     const Register tmp2   = r10;
6561     assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax);
6562 
6563     BLOCK_COMMENT("Entry:");
6564     __ enter(); // required for proper stackwalking of RuntimeStub frame
6565 
6566     if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6567         VM_Version::supports_avx512bw() &&
6568         VM_Version::supports_avx512vl()) {
6569       __ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2);
6570     } else {
6571       __ kernel_crc32(crc, buf, len, table, tmp1);
6572     }
6573 
6574     __ movl(rax, crc);
6575     __ vzeroupper();
6576     __ leave(); // required for proper stackwalking of RuntimeStub frame
6577     __ ret(0);
6578 
6579     return start;
6580   }
6581 
6582   /**
6583   *  Arguments:
6584   *
6585   * Inputs:
6586   *   c_rarg0   - int crc
6587   *   c_rarg1   - byte* buf
6588   *   c_rarg2   - long length
6589   *   c_rarg3   - table_start - optional (present only when doing a library_call,
6590   *              not used by x86 algorithm)
6591   *
6592   * Output:
6593   *       rax   - int crc result
6594   */
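  // For reference: CRC-32C uses the Castagnoli polynomial 0x1EDC6F41
  // (0x82F63B78 reflected), which is what the SSE4.2 crc32 instruction computes.
  // Ignoring how the bit complements are handled by the caller, a per-byte
  // scalar equivalent of what crc32c_ipl_alg2_alt2 accelerates is:
  //   while (len--) crc = _mm_crc32_u8(crc, *buf++);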
6595   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
6596       assert(UseCRC32CIntrinsics, "need SSE4_2");
6597       __ align(CodeEntryAlignment);
6598       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
6599       address start = __ pc();
6600       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
6601       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
6602       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
6603       const Register crc = c_rarg0;  // crc
6604       const Register buf = c_rarg1;  // source java byte array address
6605       const Register len = c_rarg2;  // length
6606       const Register a = rax;
6607       const Register j = r9;
6608       const Register k = r10;
6609       const Register l = r11;
6610 #ifdef _WIN64
6611       const Register y = rdi;
6612       const Register z = rsi;
6613 #else
6614       const Register y = rcx;
6615       const Register z = r8;
6616 #endif
6617       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
6618 
6619       BLOCK_COMMENT("Entry:");
6620       __ enter(); // required for proper stackwalking of RuntimeStub frame
6621 #ifdef _WIN64
6622       __ push(y);
6623       __ push(z);
6624 #endif
6625       __ crc32c_ipl_alg2_alt2(crc, buf, len,
6626                               a, j, k,
6627                               l, y, z,
6628                               c_farg0, c_farg1, c_farg2,
6629                               is_pclmulqdq_supported);
6630       __ movl(rax, crc);
6631 #ifdef _WIN64
6632       __ pop(z);
6633       __ pop(y);
6634 #endif
6635       __ vzeroupper();
6636       __ leave(); // required for proper stackwalking of RuntimeStub frame
6637       __ ret(0);
6638 
6639       return start;
6640   }
6641 
6642 
6643   /***
6644    *  Arguments:
6645    *
6646    *  Inputs:
6647    *   c_rarg0   - int   adler
6648    *   c_rarg1   - byte* buff
6649    *   c_rarg2   - int   len
6650    *
6651    * Output:
6652    *   rax   - int adler result
6653    */
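  // For reference (illustration only): the scalar Adler-32 that this stub
  // vectorizes keeps two running sums modulo 65521:
  //   uint32_t s1 = adler & 0xFFFF, s2 = (adler >> 16) & 0xFFFF;
  //   for (int i = 0; i < len; i++) {
  //     s1 = (s1 + (buff[i] & 0xFF)) % 65521;
  //     s2 = (s2 + s1) % 65521;
  //   }
  //   return (s2 << 16) | s1;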
6654 
6655   address generate_updateBytesAdler32() {
6656       assert(UseAdler32Intrinsics, "need AVX2");
6657 
6658       __ align(CodeEntryAlignment);
6659       StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
6660 
6661       address start = __ pc();
6662 
6663       const Register data = r9;
6664       const Register size = r10;
6665 
6666       const XMMRegister yshuf0 = xmm6;
6667       const XMMRegister yshuf1 = xmm7;
6668       assert_different_registers(c_rarg0, c_rarg1, c_rarg2, data, size);
6669 
6670       BLOCK_COMMENT("Entry:");
6671       __ enter(); // required for proper stackwalking of RuntimeStub frame
6672 
6673       __ vmovdqu(yshuf0, ExternalAddress((address) StubRoutines::x86::_adler32_shuf0_table), r9);
6674       __ vmovdqu(yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_shuf1_table), r9);
6675       __ movptr(data, c_rarg1); //data
6676       __ movl(size, c_rarg2); //length
6677       __ updateBytesAdler32(c_rarg0, data, size, yshuf0, yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_ascale_table));
6678       __ leave();
6679       __ ret(0);
6680       return start;
6681   }
6682 
6683   /**
6684    *  Arguments:
6685    *
6686    *  Input:
6687    *    c_rarg0   - x address
6688    *    c_rarg1   - x length
6689    *    c_rarg2   - y address
6690    *    c_rarg3   - y length
6691    * not Win64
6692    *    c_rarg4   - z address
6693    *    c_rarg5   - z length
6694    * Win64
6695    *    rsp+40    - z address
6696    *    rsp+48    - z length
6697    */
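  // For reference: multiply_to_len performs the same schoolbook product as
  // BigInteger.multiplyToLen. A sketch in 32-bit words (most significant word
  // first; z assumed zero-initialized here, whereas the real code special-cases
  // the first row instead):
  //   for (int i = xlen - 1; i >= 0; i--) {
  //     uint64_t carry = 0;
  //     for (int j = ylen - 1, k = ylen + i; j >= 0; j--, k--) {
  //       uint64_t product = (uint64_t)y[j] * x[i] + z[k] + carry;
  //       z[k]  = (uint32_t)product;
  //       carry = product >> 32;
  //     }
  //     z[i] = (uint32_t)carry;
  //   }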
6698   address generate_multiplyToLen() {
6699     __ align(CodeEntryAlignment);
6700     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
6701 
6702     address start = __ pc();
6703     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6704     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6705     const Register x     = rdi;
6706     const Register xlen  = rax;
6707     const Register y     = rsi;
6708     const Register ylen  = rcx;
6709     const Register z     = r8;
6710     const Register zlen  = r11;
6711 
6712     // Next registers will be saved on stack in multiply_to_len().
6713     const Register tmp1  = r12;
6714     const Register tmp2  = r13;
6715     const Register tmp3  = r14;
6716     const Register tmp4  = r15;
6717     const Register tmp5  = rbx;
6718 
6719     BLOCK_COMMENT("Entry:");
6720     __ enter(); // required for proper stackwalking of RuntimeStub frame
6721 
6722 #ifndef _WIN64
6723     __ movptr(zlen, r9); // Save r9 in r11 - zlen
6724 #endif
6725     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
6726                        // ylen => rcx, z => r8, zlen => r11
6727                        // r9 and r10 may be used to save non-volatile registers
6728 #ifdef _WIN64
6729     // last 2 arguments (#4, #5) are on stack on Win64
6730     __ movptr(z, Address(rsp, 6 * wordSize));
6731     __ movptr(zlen, Address(rsp, 7 * wordSize));
6732 #endif
6733 
6734     __ movptr(xlen, rsi);
6735     __ movptr(y,    rdx);
6736     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
6737 
6738     restore_arg_regs();
6739 
6740     __ leave(); // required for proper stackwalking of RuntimeStub frame
6741     __ ret(0);
6742 
6743     return start;
6744   }
6745 
6746   /**
6747   *  Arguments:
6748   *
6749   *  Input:
6750   *    c_rarg0   - obja     address
6751   *    c_rarg1   - objb     address
6752   *    c_rarg2   - length   length
6753   *    c_rarg3   - scale    log2_array_indxscale
6754   *
6755   *  Output:
6756   *        rax   - int; >= 0: index of first mismatch, < 0: bitwise complement of remaining tail length
6757   */
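  // A scalar sketch of the contract (illustration; elements are 1 << scale
  // bytes wide, e.g. scale == 3 for a long[] comparison):
  //   for (int64_t i = 0; i < length; i++) {
  //     if (memcmp((char*)obja + (i << scale), (char*)objb + (i << scale), (size_t)1 << scale) != 0) {
  //       return i;
  //     }
  //   }
  // A negative result encodes, as a bitwise complement, how many trailing
  // elements were left unchecked for the caller to compare.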
6758   address generate_vectorizedMismatch() {
6759     __ align(CodeEntryAlignment);
6760     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
6761     address start = __ pc();
6762 
6763     BLOCK_COMMENT("Entry:");
6764     __ enter();
6765 
6766 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6767     const Register scale = c_rarg0;  //rcx, will exchange with r9
6768     const Register objb = c_rarg1;   //rdx
6769     const Register length = c_rarg2; //r8
6770     const Register obja = c_rarg3;   //r9
6771     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
6772 
6773     const Register tmp1 = r10;
6774     const Register tmp2 = r11;
6775 #endif
6776 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6777     const Register obja = c_rarg0;   //U:rdi
6778     const Register objb = c_rarg1;   //U:rsi
6779     const Register length = c_rarg2; //U:rdx
6780     const Register scale = c_rarg3;  //U:rcx
6781     const Register tmp1 = r8;
6782     const Register tmp2 = r9;
6783 #endif
6784     const Register result = rax; //return value
6785     const XMMRegister vec0 = xmm0;
6786     const XMMRegister vec1 = xmm1;
6787     const XMMRegister vec2 = xmm2;
6788 
6789     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
6790 
6791     __ vzeroupper();
6792     __ leave();
6793     __ ret(0);
6794 
6795     return start;
6796   }
6797 
6798   /**
6799    *  Arguments:
6800    *
6801    *  Input:
6802    *    c_rarg0   - x address
6803    *    c_rarg1   - x length
6804    *    c_rarg2   - z address
6805    *    c_rarg3   - z length
6806    *
6807    */
6808   address generate_squareToLen() {
6809 
6810     __ align(CodeEntryAlignment);
6811     StubCodeMark mark(this, "StubRoutines", "squareToLen");
6812 
6813     address start = __ pc();
6814     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6815     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
6816     const Register x      = rdi;
6817     const Register len    = rsi;
6818     const Register z      = r8;
6819     const Register zlen   = rcx;
6820 
6821    const Register tmp1      = r12;
6822    const Register tmp2      = r13;
6823    const Register tmp3      = r14;
6824    const Register tmp4      = r15;
6825    const Register tmp5      = rbx;
6826 
6827     BLOCK_COMMENT("Entry:");
6828     __ enter(); // required for proper stackwalking of RuntimeStub frame
6829 
6830     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
6831                        // zlen => rcx
6832                        // r9 and r10 may be used to save non-volatile registers
6833     __ movptr(r8, rdx);
6834     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
6835 
6836     restore_arg_regs();
6837 
6838     __ leave(); // required for proper stackwalking of RuntimeStub frame
6839     __ ret(0);
6840 
6841     return start;
6842   }
6843 
6844   address generate_method_entry_barrier() {
6845     __ align(CodeEntryAlignment);
6846     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
6847 
6848     Label deoptimize_label;
6849 
6850     address start = __ pc();
6851 
6852     __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
6853 
6854     BLOCK_COMMENT("Entry:");
6855     __ enter(); // save rbp
6856 
6857     // save c_rarg0, because we want to use that value.
6858     // We could do without it but then we depend on the number of slots used by pusha
6859     __ push(c_rarg0);
6860 
6861     __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
6862 
6863     __ pusha();
6864 
6865     // The method may have floats as arguments, and we must spill them before calling
6866     // the VM runtime.
6867     assert(Argument::n_float_register_parameters_j == 8, "Assumption");
6868     const int xmm_size = wordSize * 2;
6869     const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
6870     __ subptr(rsp, xmm_spill_size);
6871     __ movdqu(Address(rsp, xmm_size * 7), xmm7);
6872     __ movdqu(Address(rsp, xmm_size * 6), xmm6);
6873     __ movdqu(Address(rsp, xmm_size * 5), xmm5);
6874     __ movdqu(Address(rsp, xmm_size * 4), xmm4);
6875     __ movdqu(Address(rsp, xmm_size * 3), xmm3);
6876     __ movdqu(Address(rsp, xmm_size * 2), xmm2);
6877     __ movdqu(Address(rsp, xmm_size * 1), xmm1);
6878     __ movdqu(Address(rsp, xmm_size * 0), xmm0);
6879 
6880     __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);
6881 
6882     __ movdqu(xmm0, Address(rsp, xmm_size * 0));
6883     __ movdqu(xmm1, Address(rsp, xmm_size * 1));
6884     __ movdqu(xmm2, Address(rsp, xmm_size * 2));
6885     __ movdqu(xmm3, Address(rsp, xmm_size * 3));
6886     __ movdqu(xmm4, Address(rsp, xmm_size * 4));
6887     __ movdqu(xmm5, Address(rsp, xmm_size * 5));
6888     __ movdqu(xmm6, Address(rsp, xmm_size * 6));
6889     __ movdqu(xmm7, Address(rsp, xmm_size * 7));
6890     __ addptr(rsp, xmm_spill_size);
6891 
6892     __ cmpl(rax, 1); // 1 means deoptimize
6893     __ jcc(Assembler::equal, deoptimize_label);
6894 
6895     __ popa();
6896     __ pop(c_rarg0);
6897 
6898     __ leave();
6899 
6900     __ addptr(rsp, 1 * wordSize); // cookie
6901     __ ret(0);
6902 
6903 
6904     __ BIND(deoptimize_label);
6905 
6906     __ popa();
6907     __ pop(c_rarg0);
6908 
6909     __ leave();
6910 
6911     // This can be taken out, but it is good for verification purposes: getting a SIGSEGV
6912     // here while still having a correct stack is valuable.
6913     __ testptr(rsp, Address(rsp, 0));
6914 
6915     __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
6916     __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point
6917 
6918     return start;
6919   }
6920 
6921    /**
6922    *  Arguments:
6923    *
6924    *  Input:
6925    *    c_rarg0   - out address
6926    *    c_rarg1   - in address
6927    *    c_rarg2   - offset
6928    *    c_rarg3   - len
6929    * not Win64
6930    *    c_rarg4   - k
6931    * Win64
6932    *    rsp+40    - k
6933    */
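  // For reference: mul_add performs, roughly, the same update as
  // BigInteger.implMulAdd, i.e. a multiply-accumulate of 'in' scaled by 'k'
  // into 'out' with carry propagation. A sketch in 32-bit words ('out_len' is
  // a hypothetical name for the length of 'out'; as in implMulAdd, 'offset' is
  // first converted to an index from the end of the array):
  //   uint64_t carry = 0;
  //   int o = out_len - offset - 1;
  //   for (int j = len - 1; j >= 0; j--) {
  //     uint64_t product = (uint64_t)in[j] * k + out[o] + carry;
  //     out[o--] = (uint32_t)product;
  //     carry    = product >> 32;
  //   }
  //   return (uint32_t)carry;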
6934   address generate_mulAdd() {
6935     __ align(CodeEntryAlignment);
6936     StubCodeMark mark(this, "StubRoutines", "mulAdd");
6937 
6938     address start = __ pc();
6939     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6940     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6941     const Register out     = rdi;
6942     const Register in      = rsi;
6943     const Register offset  = r11;
6944     const Register len     = rcx;
6945     const Register k       = r8;
6946 
6947     // Next registers will be saved on stack in mul_add().
6948     const Register tmp1  = r12;
6949     const Register tmp2  = r13;
6950     const Register tmp3  = r14;
6951     const Register tmp4  = r15;
6952     const Register tmp5  = rbx;
6953 
6954     BLOCK_COMMENT("Entry:");
6955     __ enter(); // required for proper stackwalking of RuntimeStub frame
6956 
6957     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
6958                        // len => rcx, k => r8
6959                        // r9 and r10 may be used to save non-volatile registers
6960 #ifdef _WIN64
6961     // last argument is on stack on Win64
6962     __ movl(k, Address(rsp, 6 * wordSize));
6963 #endif
6964     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
6965     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
6966 
6967     restore_arg_regs();
6968 
6969     __ leave(); // required for proper stackwalking of RuntimeStub frame
6970     __ ret(0);
6971 
6972     return start;
6973   }
6974 
6975   address generate_bigIntegerRightShift() {
6976     __ align(CodeEntryAlignment);
6977     StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
6978 
6979     address start = __ pc();
6980     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
6981     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
6982     const Register newArr = rdi;
6983     const Register oldArr = rsi;
6984     const Register newIdx = rdx;
6985     const Register shiftCount = rcx;  // It was intentional to have shiftCount in rcx since it is used implicitly for shift.
6986     const Register totalNumIter = r8;
6987 
6988     // For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
6989     // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
6990     const Register tmp1 = r11;                    // Caller save.
6991     const Register tmp2 = rax;                    // Caller save.
6992     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
6993     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
6994     const Register tmp5 = r14;                    // Callee save.
6995     const Register tmp6 = r15;
6996 
6997     const XMMRegister x0 = xmm0;
6998     const XMMRegister x1 = xmm1;
6999     const XMMRegister x2 = xmm2;
7000 
7001     BLOCK_COMMENT("Entry:");
7002     __ enter(); // required for proper stackwalking of RuntimeStub frame
7003 
7004 #ifdef _WINDOWS
7005     setup_arg_regs(4);
7006     // For windows, since last argument is on stack, we need to move it to the appropriate register.
7007     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
7008     // Save callee save registers.
7009     __ push(tmp3);
7010     __ push(tmp4);
7011 #endif
7012     __ push(tmp5);
7013 
7014     // Rename temps used throughout the code.
7015     const Register idx = tmp1;
7016     const Register nIdx = tmp2;
7017 
7018     __ xorl(idx, idx);
7019 
7020     // Start right shift from end of the array.
7021     // For example, if #iteration = 4 and newIdx = 1
7022     // then dest[4] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
7023     // if #iteration = 4 and newIdx = 0
7024     // then dest[3] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
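    // (Illustration) shrdl(dst, fill) below computes
    //   dst = (dst >> shiftCount) | (fill << (32 - shiftCount)),
    // with the shift count taken implicitly from cl. For example, with
    // shiftCount == 8, src[4] == 0xAABBCCDD and src[3] == 0x11223344 (made-up
    // values), the stored word becomes 0x44AABBCC.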
7025     __ movl(idx, totalNumIter);
7026     __ movl(nIdx, idx);
7027     __ addl(nIdx, newIdx);
7028 
7029     // If vectorization is enabled, check if the number of iterations is at least 64.
7030     // If not, go to ShiftTwo, which processes 2 iterations at a time.
7031     if (VM_Version::supports_avx512_vbmi2()) {
7032       __ cmpptr(totalNumIter, (AVX3Threshold/64));
7033       __ jcc(Assembler::less, ShiftTwo);
7034 
7035       if (AVX3Threshold < 16 * 64) {
7036         __ cmpl(totalNumIter, 16);
7037         __ jcc(Assembler::less, ShiftTwo);
7038       }
7039       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
7040       __ subl(idx, 16);
7041       __ subl(nIdx, 16);
7042       __ BIND(Shift512Loop);
7043       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 4), Assembler::AVX_512bit);
7044       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7045       __ vpshrdvd(x2, x1, x0, Assembler::AVX_512bit);
7046       __ evmovdqul(Address(newArr, nIdx, Address::times_4), x2, Assembler::AVX_512bit);
7047       __ subl(nIdx, 16);
7048       __ subl(idx, 16);
7049       __ jcc(Assembler::greaterEqual, Shift512Loop);
7050       __ addl(idx, 16);
7051       __ addl(nIdx, 16);
7052     }
7053     __ BIND(ShiftTwo);
7054     __ cmpl(idx, 2);
7055     __ jcc(Assembler::less, ShiftOne);
7056     __ subl(idx, 2);
7057     __ subl(nIdx, 2);
7058     __ BIND(ShiftTwoLoop);
7059     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 8));
7060     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7061     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7062     __ shrdl(tmp5, tmp4);
7063     __ shrdl(tmp4, tmp3);
7064     __ movl(Address(newArr, nIdx, Address::times_4, 4), tmp5);
7065     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7066     __ subl(nIdx, 2);
7067     __ subl(idx, 2);
7068     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7069     __ addl(idx, 2);
7070     __ addl(nIdx, 2);
7071 
7072     // Do the last iteration
7073     __ BIND(ShiftOne);
7074     __ cmpl(idx, 1);
7075     __ jcc(Assembler::less, Exit);
7076     __ subl(idx, 1);
7077     __ subl(nIdx, 1);
7078     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7079     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7080     __ shrdl(tmp4, tmp3);
7081     __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7082     __ BIND(Exit);
7083     // Restore callee save registers.
7084     __ pop(tmp5);
7085 #ifdef _WINDOWS
7086     __ pop(tmp4);
7087     __ pop(tmp3);
7088     restore_arg_regs();
7089 #endif
7090     __ leave(); // required for proper stackwalking of RuntimeStub frame
7091     __ ret(0);
7092     return start;
7093   }
7094 
7095    /**
7096    *  Arguments:
7097    *
7098    *  Input:
7099    *    c_rarg0   - newArr address
7100    *    c_rarg1   - oldArr address
7101    *    c_rarg2   - newIdx
7102    *    c_rarg3   - shiftCount
7103    * not Win64
7104    *    c_rarg4   - numIter
7105    * Win64
7106    *    rsp+40    - numIter
7107    */
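  // For reference: this is the mirror image of the right shift above.
  // shldl(dst, fill) computes dst = (dst << shiftCount) | (fill >> (32 - shiftCount)),
  // so each output word is newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >> (32 - shiftCount)).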
7108   address generate_bigIntegerLeftShift() {
7109     __ align(CodeEntryAlignment);
7110     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
7111     address start = __ pc();
7112     Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
7113     // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
7114     const Register newArr = rdi;
7115     const Register oldArr = rsi;
7116     const Register newIdx = rdx;
7117     const Register shiftCount = rcx;  // It was intentional to have shiftCount in rcx since it is used implicitly for shift.
7118     const Register totalNumIter = r8;
7119     // For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
7120     // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
7121     const Register tmp1 = r11;                    // Caller save.
7122     const Register tmp2 = rax;                    // Caller save.
7123     const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: Callee save. Linux: Caller save.
7124     const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: Callee save. Linux: Caller save.
7125     const Register tmp5 = r14;                    // Callee save.
7126 
7127     const XMMRegister x0 = xmm0;
7128     const XMMRegister x1 = xmm1;
7129     const XMMRegister x2 = xmm2;
7130     BLOCK_COMMENT("Entry:");
7131     __ enter(); // required for proper stackwalking of RuntimeStub frame
7132 
7133 #ifdef _WINDOWS
7134     setup_arg_regs(4);
7135     // For windows, since last argument is on stack, we need to move it to the appropriate register.
7136     __ movl(totalNumIter, Address(rsp, 6 * wordSize));
7137     // Save callee save registers.
7138     __ push(tmp3);
7139     __ push(tmp4);
7140 #endif
7141     __ push(tmp5);
7142 
7143     // Rename temps used throughout the code
7144     const Register idx = tmp1;
7145     const Register numIterTmp = tmp2;
7146 
7147     // Start idx from zero.
7148     __ xorl(idx, idx);
7149     // Compute interior pointer for new array. We do this so that we can use the same index for both the old and new arrays.
7150     __ lea(newArr, Address(newArr, newIdx, Address::times_4));
7151     __ movl(numIterTmp, totalNumIter);
7152 
7153     // If vectorization is enabled, check if the number of iterations is at least 64
7154     // If not, then go to ShiftTwo shifting two numbers at a time
7155     if (VM_Version::supports_avx512_vbmi2()) {
7156       __ cmpl(totalNumIter, (AVX3Threshold/64));
7157       __ jcc(Assembler::less, ShiftTwo);
7158 
7159       if (AVX3Threshold < 16 * 64) {
7160         __ cmpl(totalNumIter, 16);
7161         __ jcc(Assembler::less, ShiftTwo);
7162       }
7163       __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
7164       __ subl(numIterTmp, 16);
7165       __ BIND(Shift512Loop);
7166       __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7167       __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 0x4), Assembler::AVX_512bit);
7168       __ vpshldvd(x1, x2, x0, Assembler::AVX_512bit);
7169       __ evmovdqul(Address(newArr, idx, Address::times_4), x1, Assembler::AVX_512bit);
7170       __ addl(idx, 16);
7171       __ subl(numIterTmp, 16);
7172       __ jcc(Assembler::greaterEqual, Shift512Loop);
7173       __ addl(numIterTmp, 16);
7174     }
7175     __ BIND(ShiftTwo);
7176     __ cmpl(totalNumIter, 1);
7177     __ jcc(Assembler::less, Exit);
7178     __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7179     __ subl(numIterTmp, 2);
7180     __ jcc(Assembler::less, ShiftOne);
7181 
7182     __ BIND(ShiftTwoLoop);
7183     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7184     __ movl(tmp5, Address(oldArr, idx, Address::times_4, 0x8));
7185     __ shldl(tmp3, tmp4);
7186     __ shldl(tmp4, tmp5);
7187     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7188     __ movl(Address(newArr, idx, Address::times_4, 0x4), tmp4);
7189     __ movl(tmp3, tmp5);
7190     __ addl(idx, 2);
7191     __ subl(numIterTmp, 2);
7192     __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7193 
7194     // Do the last iteration
7195     __ BIND(ShiftOne);
7196     __ addl(numIterTmp, 2);
7197     __ cmpl(numIterTmp, 1);
7198     __ jcc(Assembler::less, Exit);
7199     __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7200     __ shldl(tmp3, tmp4);
7201     __ movl(Address(newArr, idx, Address::times_4), tmp3);
7202 
7203     __ BIND(Exit);
7204     // Restore callee save registers.
7205     __ pop(tmp5);
7206 #ifdef _WINDOWS
7207     __ pop(tmp4);
7208     __ pop(tmp3);
7209     restore_arg_regs();
7210 #endif
7211     __ leave(); // required for proper stackwalking of RuntimeStub frame
7212     __ ret(0);
7213     return start;
7214   }
7215 
7216   address generate_libmExp() {
7217     StubCodeMark mark(this, "StubRoutines", "libmExp");
7218 
7219     address start = __ pc();
7220 
7221     const XMMRegister x0  = xmm0;
7222     const XMMRegister x1  = xmm1;
7223     const XMMRegister x2  = xmm2;
7224     const XMMRegister x3  = xmm3;
7225 
7226     const XMMRegister x4  = xmm4;
7227     const XMMRegister x5  = xmm5;
7228     const XMMRegister x6  = xmm6;
7229     const XMMRegister x7  = xmm7;
7230 
7231     const Register tmp   = r11;
7232 
7233     BLOCK_COMMENT("Entry:");
7234     __ enter(); // required for proper stackwalking of RuntimeStub frame
7235 
7236     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7237 
7238     __ leave(); // required for proper stackwalking of RuntimeStub frame
7239     __ ret(0);
7240 
7241     return start;
7242 
7243   }
7244 
7245   address generate_libmLog() {
7246     StubCodeMark mark(this, "StubRoutines", "libmLog");
7247 
7248     address start = __ pc();
7249 
7250     const XMMRegister x0 = xmm0;
7251     const XMMRegister x1 = xmm1;
7252     const XMMRegister x2 = xmm2;
7253     const XMMRegister x3 = xmm3;
7254 
7255     const XMMRegister x4 = xmm4;
7256     const XMMRegister x5 = xmm5;
7257     const XMMRegister x6 = xmm6;
7258     const XMMRegister x7 = xmm7;
7259 
7260     const Register tmp1 = r11;
7261     const Register tmp2 = r8;
7262 
7263     BLOCK_COMMENT("Entry:");
7264     __ enter(); // required for proper stackwalking of RuntimeStub frame
7265 
7266     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
7267 
7268     __ leave(); // required for proper stackwalking of RuntimeStub frame
7269     __ ret(0);
7270 
7271     return start;
7272 
7273   }
7274 
7275   address generate_libmLog10() {
7276     StubCodeMark mark(this, "StubRoutines", "libmLog10");
7277 
7278     address start = __ pc();
7279 
7280     const XMMRegister x0 = xmm0;
7281     const XMMRegister x1 = xmm1;
7282     const XMMRegister x2 = xmm2;
7283     const XMMRegister x3 = xmm3;
7284 
7285     const XMMRegister x4 = xmm4;
7286     const XMMRegister x5 = xmm5;
7287     const XMMRegister x6 = xmm6;
7288     const XMMRegister x7 = xmm7;
7289 
7290     const Register tmp = r11;
7291 
7292     BLOCK_COMMENT("Entry:");
7293     __ enter(); // required for proper stackwalking of RuntimeStub frame
7294 
7295     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7296 
7297     __ leave(); // required for proper stackwalking of RuntimeStub frame
7298     __ ret(0);
7299 
7300     return start;
7301 
7302   }
7303 
7304   address generate_libmPow() {
7305     StubCodeMark mark(this, "StubRoutines", "libmPow");
7306 
7307     address start = __ pc();
7308 
7309     const XMMRegister x0 = xmm0;
7310     const XMMRegister x1 = xmm1;
7311     const XMMRegister x2 = xmm2;
7312     const XMMRegister x3 = xmm3;
7313 
7314     const XMMRegister x4 = xmm4;
7315     const XMMRegister x5 = xmm5;
7316     const XMMRegister x6 = xmm6;
7317     const XMMRegister x7 = xmm7;
7318 
7319     const Register tmp1 = r8;
7320     const Register tmp2 = r9;
7321     const Register tmp3 = r10;
7322     const Register tmp4 = r11;
7323 
7324     BLOCK_COMMENT("Entry:");
7325     __ enter(); // required for proper stackwalking of RuntimeStub frame
7326 
7327     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7328 
7329     __ leave(); // required for proper stackwalking of RuntimeStub frame
7330     __ ret(0);
7331 
7332     return start;
7333 
7334   }
7335 
7336   address generate_libmSin() {
7337     StubCodeMark mark(this, "StubRoutines", "libmSin");
7338 
7339     address start = __ pc();
7340 
7341     const XMMRegister x0 = xmm0;
7342     const XMMRegister x1 = xmm1;
7343     const XMMRegister x2 = xmm2;
7344     const XMMRegister x3 = xmm3;
7345 
7346     const XMMRegister x4 = xmm4;
7347     const XMMRegister x5 = xmm5;
7348     const XMMRegister x6 = xmm6;
7349     const XMMRegister x7 = xmm7;
7350 
7351     const Register tmp1 = r8;
7352     const Register tmp2 = r9;
7353     const Register tmp3 = r10;
7354     const Register tmp4 = r11;
7355 
7356     BLOCK_COMMENT("Entry:");
7357     __ enter(); // required for proper stackwalking of RuntimeStub frame
7358 
7359 #ifdef _WIN64
7360     __ push(rsi);
7361     __ push(rdi);
7362 #endif
7363     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7364 
7365 #ifdef _WIN64
7366     __ pop(rdi);
7367     __ pop(rsi);
7368 #endif
7369 
7370     __ leave(); // required for proper stackwalking of RuntimeStub frame
7371     __ ret(0);
7372 
7373     return start;
7374 
7375   }
7376 
7377   address generate_libmCos() {
7378     StubCodeMark mark(this, "StubRoutines", "libmCos");
7379 
7380     address start = __ pc();
7381 
7382     const XMMRegister x0 = xmm0;
7383     const XMMRegister x1 = xmm1;
7384     const XMMRegister x2 = xmm2;
7385     const XMMRegister x3 = xmm3;
7386 
7387     const XMMRegister x4 = xmm4;
7388     const XMMRegister x5 = xmm5;
7389     const XMMRegister x6 = xmm6;
7390     const XMMRegister x7 = xmm7;
7391 
7392     const Register tmp1 = r8;
7393     const Register tmp2 = r9;
7394     const Register tmp3 = r10;
7395     const Register tmp4 = r11;
7396 
7397     BLOCK_COMMENT("Entry:");
7398     __ enter(); // required for proper stackwalking of RuntimeStub frame
7399 
7400 #ifdef _WIN64
7401     __ push(rsi);
7402     __ push(rdi);
7403 #endif
7404     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7405 
7406 #ifdef _WIN64
7407     __ pop(rdi);
7408     __ pop(rsi);
7409 #endif
7410 
7411     __ leave(); // required for proper stackwalking of RuntimeStub frame
7412     __ ret(0);
7413 
7414     return start;
7415 
7416   }
7417 
7418   address generate_libmTan() {
7419     StubCodeMark mark(this, "StubRoutines", "libmTan");
7420 
7421     address start = __ pc();
7422 
7423     const XMMRegister x0 = xmm0;
7424     const XMMRegister x1 = xmm1;
7425     const XMMRegister x2 = xmm2;
7426     const XMMRegister x3 = xmm3;
7427 
7428     const XMMRegister x4 = xmm4;
7429     const XMMRegister x5 = xmm5;
7430     const XMMRegister x6 = xmm6;
7431     const XMMRegister x7 = xmm7;
7432 
7433     const Register tmp1 = r8;
7434     const Register tmp2 = r9;
7435     const Register tmp3 = r10;
7436     const Register tmp4 = r11;
7437 
7438     BLOCK_COMMENT("Entry:");
7439     __ enter(); // required for proper stackwalking of RuntimeStub frame
7440 
7441 #ifdef _WIN64
7442     __ push(rsi);
7443     __ push(rdi);
7444 #endif
7445     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7446 
7447 #ifdef _WIN64
7448     __ pop(rdi);
7449     __ pop(rsi);
7450 #endif
7451 
7452     __ leave(); // required for proper stackwalking of RuntimeStub frame
7453     __ ret(0);
7454 
7455     return start;
7456 
7457   }
7458 
7459 #undef __
7460 #define __ masm->
7461 
7462   // Continuation point for throwing of implicit exceptions that are
7463   // not handled in the current activation. Fabricates an exception
7464   // oop and initiates normal exception dispatching in this
7465   // frame. Since we need to preserve callee-saved values (currently
7466   // only for C2, but done for C1 as well) we need a callee-saved oop
7467   // map and therefore have to make these stubs into RuntimeStubs
7468   // rather than BufferBlobs.  If the compiler needs all registers to
7469   // be preserved between the fault point and the exception handler
7470   // then it must assume responsibility for that in
7471   // AbstractCompiler::continuation_for_implicit_null_exception or
7472   // continuation_for_implicit_division_by_zero_exception. All other
7473   // implicit exceptions (e.g., NullPointerException or
7474   // AbstractMethodError on entry) are either at call sites or
7475   // otherwise assume that stack unwinding will be initiated, so
7476   // caller saved registers were assumed volatile in the compiler.
7477   address generate_throw_exception(const char* name,
7478                                    address runtime_entry,
7479                                    Register arg1 = noreg,
7480                                    Register arg2 = noreg) {
7481     // Information about frame layout at time of blocking runtime call.
7482     // Note that we only have to preserve callee-saved registers since
7483     // the compilers are responsible for supplying a continuation point
7484     // if they expect all registers to be preserved.
7485     enum layout {
7486       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
7487       rbp_off2,
7488       return_off,
7489       return_off2,
7490       framesize // inclusive of return address
7491     };
7492 
7493     int insts_size = 512;
7494     int locs_size  = 64;
7495 
7496     CodeBuffer code(name, insts_size, locs_size);
7497     OopMapSet* oop_maps  = new OopMapSet();
7498     MacroAssembler* masm = new MacroAssembler(&code);
7499 
7500     address start = __ pc();
7501 
7502     // This is an inlined and slightly modified version of call_VM
7503     // which has the ability to fetch the return PC out of
7504     // thread-local storage and also sets up last_Java_sp slightly
7505     // differently than the real call_VM
7506 
7507     __ enter(); // required for proper stackwalking of RuntimeStub frame
7508 
7509     assert(is_even(framesize/2), "sp not 16-byte aligned");
7510 
7511     // return address and rbp are already in place
7512     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
7513 
7514     int frame_complete = __ pc() - start;
7515 
7516     // Set up last_Java_sp and last_Java_fp
7517     address the_pc = __ pc();
7518     __ set_last_Java_frame(rsp, rbp, the_pc);
7519     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
7520 
7521     // Call runtime
7522     if (arg1 != noreg) {
7523       assert(arg2 != c_rarg1, "clobbered");
7524       __ movptr(c_rarg1, arg1);
7525     }
7526     if (arg2 != noreg) {
7527       __ movptr(c_rarg2, arg2);
7528     }
7529     __ movptr(c_rarg0, r15_thread);
7530     BLOCK_COMMENT("call runtime_entry");
7531     __ call(RuntimeAddress(runtime_entry));
7532 
7533     // Generate oop map
7534     OopMap* map = new OopMap(framesize, 0);
7535 
7536     oop_maps->add_gc_map(the_pc - start, map);
7537 
7538     __ reset_last_Java_frame(true);
7539 
7540     __ leave(); // required for proper stackwalking of RuntimeStub frame
7541 
7542     // check for pending exceptions
7543 #ifdef ASSERT
7544     Label L;
7545     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
7546             (int32_t) NULL_WORD);
7547     __ jcc(Assembler::notEqual, L);
7548     __ should_not_reach_here();
7549     __ bind(L);
7550 #endif // ASSERT
7551     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7552 
7553 
7554     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7555     RuntimeStub* stub =
7556       RuntimeStub::new_runtime_stub(name,
7557                                     &code,
7558                                     frame_complete,
7559                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7560                                     oop_maps, false);
7561     return stub->entry_point();
7562   }
7563 
7564   void create_control_words() {
7565     // Round to nearest, all exceptions masked
7566     StubRoutines::x86::_mxcsr_std = 0x1F80;
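    // 0x1F80 decodes as: exception mask bits 7..12 all set (IM, DM, ZM, OM, UM, PM),
    // rounding control (bits 13..14) == 00 i.e. round to nearest, and FTZ (bit 15)
    // and DAZ (bit 6) clear.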
7567   }
7568 
7569   // Call here from the interpreter or compiled code to either load
7570   // multiple returned values from the inline type instance being
7571   // returned to registers or to store returned values to a newly
7572   // allocated inline type instance.
7573   address generate_return_value_stub(address destination, const char* name, bool has_res) {
7574     // We need to save all registers the calling convention may use so
7575     // the runtime calls read or update those registers. This needs to
7576     // be in sync with SharedRuntime::java_return_convention().
7577     enum layout {
7578       pad_off = frame::arg_reg_save_area_bytes/BytesPerInt, pad_off_2,
7579       rax_off, rax_off_2,
7580       j_rarg5_off, j_rarg5_2,
7581       j_rarg4_off, j_rarg4_2,
7582       j_rarg3_off, j_rarg3_2,
7583       j_rarg2_off, j_rarg2_2,
7584       j_rarg1_off, j_rarg1_2,
7585       j_rarg0_off, j_rarg0_2,
7586       j_farg0_off, j_farg0_2,
7587       j_farg1_off, j_farg1_2,
7588       j_farg2_off, j_farg2_2,
7589       j_farg3_off, j_farg3_2,
7590       j_farg4_off, j_farg4_2,
7591       j_farg5_off, j_farg5_2,
7592       j_farg6_off, j_farg6_2,
7593       j_farg7_off, j_farg7_2,
7594       rbp_off, rbp_off_2,
7595       return_off, return_off_2,
7596 
7597       framesize
7598     };
7599 
7600     CodeBuffer buffer(name, 1000, 512);
7601     MacroAssembler* masm = new MacroAssembler(&buffer);
7602 
7603     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
7604     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
7605     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
7606     int frame_size_in_words = frame_size_in_bytes / wordSize;
7607 
7608     OopMapSet *oop_maps = new OopMapSet();
7609     OopMap* map = new OopMap(frame_size_in_slots, 0);
7610 
7611     map->set_callee_saved(VMRegImpl::stack2reg(rax_off), rax->as_VMReg());
7612     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
7613     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
7614     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
7615     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
7616     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
7617     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
7618     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
7619     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
7620     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
7621     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
7622     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
7623     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
7624     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
7625     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
7626 
7627     int start = __ offset();
7628 
7629     __ subptr(rsp, frame_size_in_bytes - 8 /* return address*/);
7630 
7631     __ movptr(Address(rsp, rbp_off * BytesPerInt), rbp);
7632     __ movdbl(Address(rsp, j_farg7_off * BytesPerInt), j_farg7);
7633     __ movdbl(Address(rsp, j_farg6_off * BytesPerInt), j_farg6);
7634     __ movdbl(Address(rsp, j_farg5_off * BytesPerInt), j_farg5);
7635     __ movdbl(Address(rsp, j_farg4_off * BytesPerInt), j_farg4);
7636     __ movdbl(Address(rsp, j_farg3_off * BytesPerInt), j_farg3);
7637     __ movdbl(Address(rsp, j_farg2_off * BytesPerInt), j_farg2);
7638     __ movdbl(Address(rsp, j_farg1_off * BytesPerInt), j_farg1);
7639     __ movdbl(Address(rsp, j_farg0_off * BytesPerInt), j_farg0);
7640 
7641     __ movptr(Address(rsp, j_rarg0_off * BytesPerInt), j_rarg0);
7642     __ movptr(Address(rsp, j_rarg1_off * BytesPerInt), j_rarg1);
7643     __ movptr(Address(rsp, j_rarg2_off * BytesPerInt), j_rarg2);
7644     __ movptr(Address(rsp, j_rarg3_off * BytesPerInt), j_rarg3);
7645     __ movptr(Address(rsp, j_rarg4_off * BytesPerInt), j_rarg4);
7646     __ movptr(Address(rsp, j_rarg5_off * BytesPerInt), j_rarg5);
7647     __ movptr(Address(rsp, rax_off * BytesPerInt), rax);
7648 
7649     int frame_complete = __ offset();
7650 
7651     __ set_last_Java_frame(noreg, noreg, NULL);
7652 
7653     __ mov(c_rarg0, r15_thread);
7654     __ mov(c_rarg1, rax);
7655 
7656     __ call(RuntimeAddress(destination));
7657 
7658     // Set an oopmap for the call site.
7659 
7660     oop_maps->add_gc_map( __ offset() - start, map);
7661 
7662     // clear last_Java_sp
7663     __ reset_last_Java_frame(false);
7664 
7665     __ movptr(rbp, Address(rsp, rbp_off * BytesPerInt));
7666     __ movdbl(j_farg7, Address(rsp, j_farg7_off * BytesPerInt));
7667     __ movdbl(j_farg6, Address(rsp, j_farg6_off * BytesPerInt));
7668     __ movdbl(j_farg5, Address(rsp, j_farg5_off * BytesPerInt));
7669     __ movdbl(j_farg4, Address(rsp, j_farg4_off * BytesPerInt));
7670     __ movdbl(j_farg3, Address(rsp, j_farg3_off * BytesPerInt));
7671     __ movdbl(j_farg2, Address(rsp, j_farg2_off * BytesPerInt));
7672     __ movdbl(j_farg1, Address(rsp, j_farg1_off * BytesPerInt));
7673     __ movdbl(j_farg0, Address(rsp, j_farg0_off * BytesPerInt));
7674 
7675     __ movptr(j_rarg0, Address(rsp, j_rarg0_off * BytesPerInt));
7676     __ movptr(j_rarg1, Address(rsp, j_rarg1_off * BytesPerInt));
7677     __ movptr(j_rarg2, Address(rsp, j_rarg2_off * BytesPerInt));
7678     __ movptr(j_rarg3, Address(rsp, j_rarg3_off * BytesPerInt));
7679     __ movptr(j_rarg4, Address(rsp, j_rarg4_off * BytesPerInt));
7680     __ movptr(j_rarg5, Address(rsp, j_rarg5_off * BytesPerInt));
7681     __ movptr(rax, Address(rsp, rax_off * BytesPerInt));
7682 
7683     __ addptr(rsp, frame_size_in_bytes-8);
7684 
7685     // check for pending exceptions
7686     Label pending;
7687     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
7688     __ jcc(Assembler::notEqual, pending);
7689 
7690     if (has_res) {
7691       __ get_vm_result(rax, r15_thread);
7692     }
7693 
7694     __ ret(0);
7695 
7696     __ bind(pending);
7697 
7698     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
7699     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7700 
7701     // -------------
7702     // make sure all code is generated
7703     masm->flush();
7704 
7705     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, false);
7706     return stub->entry_point();
7707   }
7708 
7709   // Initialization
7710   void generate_initial() {
7711     // Generates all stubs and initializes the entry points
7712 
7713     // These platform-specific settings are needed by generate_call_stub()
7714     create_control_words();
7715 
7716     // entry points that exist in all platforms. Note: This is code
7717     // that could be shared among different platforms - however the
7718     // benefit seems to be smaller than the disadvantage of having a
7719     // much more complicated generator structure. See also comment in
7720     // stubRoutines.hpp.
7721 
7722     StubRoutines::_forward_exception_entry = generate_forward_exception();
7723 
7724     // Generate these first because they are called from other stubs
7725     if (InlineTypeReturnedAsFields) {
7726       StubRoutines::_load_inline_type_fields_in_regs =
7727         generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
7728       StubRoutines::_store_inline_type_fields_to_buf =
7729         generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
7730     }
7731     StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
7732 
7733     // is referenced by megamorphic call
7734     StubRoutines::_catch_exception_entry = generate_catch_exception();
7735 
7736     // atomic calls
7737     StubRoutines::_fence_entry                = generate_orderaccess_fence();
7738 
7739     // platform dependent
7740     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
7741 
7742     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
7743 
7744     StubRoutines::x86::_f2i_fixup             = generate_f2i_fixup();
7745     StubRoutines::x86::_f2l_fixup             = generate_f2l_fixup();
7746     StubRoutines::x86::_d2i_fixup             = generate_d2i_fixup();
7747     StubRoutines::x86::_d2l_fixup             = generate_d2l_fixup();
7748 
7749     StubRoutines::x86::_float_sign_mask       = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
7750     StubRoutines::x86::_float_sign_flip       = generate_fp_mask("float_sign_flip",  0x8000000080000000);
7751     StubRoutines::x86::_double_sign_mask      = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7752     StubRoutines::x86::_double_sign_flip      = generate_fp_mask("double_sign_flip", 0x8000000000000000);
7753 
7754     // Build this early so it's available for the interpreter.
7755     StubRoutines::_throw_StackOverflowError_entry =
7756       generate_throw_exception("StackOverflowError throw_exception",
7757                                CAST_FROM_FN_PTR(address,
7758                                                 SharedRuntime::
7759                                                 throw_StackOverflowError));
7760     StubRoutines::_throw_delayed_StackOverflowError_entry =
7761       generate_throw_exception("delayed StackOverflowError throw_exception",
7762                                CAST_FROM_FN_PTR(address,
7763                                                 SharedRuntime::
7764                                                 throw_delayed_StackOverflowError));
7765     if (UseCRC32Intrinsics) {
7766       // set table address before stub generation which use it
7767       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
7768       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7769     }
7770 
7771     if (UseCRC32CIntrinsics) {
7772       bool supports_clmul = VM_Version::supports_clmul();
7773       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
7774       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
7775       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
7776     }
7777 
7778     if (UseAdler32Intrinsics) {
7779        StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7780     }
7781 
7782     if (UseLibmIntrinsic && InlineIntrinsics) {
7783       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
7784           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
7785           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7786         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
7787         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
7788         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
7789         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
7790         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
7791         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
7792         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
7793         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
7794         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
7795         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
7796         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
7797         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
7798         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
7799         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
7800       }
7801       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
7802         StubRoutines::_dexp = generate_libmExp();
7803       }
7804       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7805         StubRoutines::_dlog = generate_libmLog();
7806       }
7807       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
7808         StubRoutines::_dlog10 = generate_libmLog10();
7809       }
7810       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
7811         StubRoutines::_dpow = generate_libmPow();
7812       }
7813       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7814         StubRoutines::_dsin = generate_libmSin();
7815       }
7816       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7817         StubRoutines::_dcos = generate_libmCos();
7818       }
7819       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7820         StubRoutines::_dtan = generate_libmTan();
7821       }
7822     }
7823 
7824     // Safefetch stubs.
7825     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7826                                                        &StubRoutines::_safefetch32_fault_pc,
7827                                                        &StubRoutines::_safefetch32_continuation_pc);
7828     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7829                                                        &StubRoutines::_safefetchN_fault_pc,
7830                                                        &StubRoutines::_safefetchN_continuation_pc);
7831   }
7832 
7833   void generate_all() {
7834     // Generates all stubs and initializes the entry points
7835 
7836     // These entry points require SharedInfo::stack0 to be set up in
7837     // non-core builds and need to be relocatable, so they each
7838     // fabricate a RuntimeStub internally.
7839     StubRoutines::_throw_AbstractMethodError_entry =
7840       generate_throw_exception("AbstractMethodError throw_exception",
7841                                CAST_FROM_FN_PTR(address,
7842                                                 SharedRuntime::
7843                                                 throw_AbstractMethodError));
7844 
7845     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7846       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7847                                CAST_FROM_FN_PTR(address,
7848                                                 SharedRuntime::
7849                                                 throw_IncompatibleClassChangeError));
7850 
7851     StubRoutines::_throw_NullPointerException_at_call_entry =
7852       generate_throw_exception("NullPointerException at call throw_exception",
7853                                CAST_FROM_FN_PTR(address,
7854                                                 SharedRuntime::
7855                                                 throw_NullPointerException_at_call));
7856 
7857     // entry points that are platform specific
7858     StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
7859     StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000);
7860     StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7861     StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000);
7862     StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF);
7863     StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
7864     StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
7865     StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff000000ff);
7866     StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff0000ffff);
7867     StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
7868                                                                         0xFFFFFFFF, 0, 0, 0);
7869     StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
7870                                                                         0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
7871     StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x0302010003020100);
7872     StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
7873     StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x0100010001000100);
7874     StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask("vector_long_shuffle_mask", 0x0000000100000000);
7875     StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
7876     StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
7877 
7878     // support for verify_oop (must happen after universe_init)
7879     if (VerifyOops) {
7880       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
7881     }
7882 
7883     // data cache line writeback
7884     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7885     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7886 
7887     // arraycopy stubs used by compilers
7888     generate_arraycopy_stubs();
7889 
7890     // don't bother generating these AES intrinsic stubs unless global flag is set
7891     if (UseAESIntrinsics) {
7892       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
7893       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7894       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7895       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7896       if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
7897         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
7898         StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt();
7899         StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt();
7900         StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
7901         StubRoutines::x86::_ghash_poly512_addr = ghash_polynomial512_addr();
7902         StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
7903         StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7904       } else {
7905         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
7906       }
7907     }
7908 
7909     if (UseAESCTRIntrinsics) {
7910       if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512bw() && VM_Version::supports_avx512vl()) {
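             // The counter mask may already have been generated by the AES-GCM stubs above.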
7911         if (StubRoutines::x86::_counter_mask_addr == NULL) {
7912           StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
7913         }
7914         StubRoutines::_counterMode_AESCrypt = generate_counterMode_VectorAESCrypt();
7915       } else {
7916         StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
7917         StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
7918       }
7919     }
7920 
7921     if (UseMD5Intrinsics) {
7922       StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
7923       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
7924     }
7925     if (UseSHA1Intrinsics) {
7926       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
7927       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
7928       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
7929       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
7930     }
7931     if (UseSHA256Intrinsics) {
7932       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
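           // Build k256_W from k256: duplicate each 16-byte group of round constants
           // into both halves of a 32-byte entry, the double-lane layout used by the
           // AVX2 SHA-256 implementation.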
7933       char* dst = (char*)StubRoutines::x86::_k256_W;
7934       char* src = (char*)StubRoutines::x86::_k256;
7935       for (int ii = 0; ii < 16; ++ii) {
7936         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
7937         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
7938       }
7939       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
7940       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
7941       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
7942       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
7943     }
7944     if (UseSHA512Intrinsics) {
7945       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
7946       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
7947       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
7948       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
7949     }
7950 
7951     // Generate GHASH intrinsics code
7952     if (UseGHASHIntrinsics) {
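           // The long swap mask may already have been generated by the AES-GCM stubs above.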
7953       if (StubRoutines::x86::_ghash_long_swap_mask_addr == NULL) {
7954         StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
7955       }
7956       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
7957       if (VM_Version::supports_avx()) {
7958         StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
7959         StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
7960         StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
7961       } else {
7962         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7963       }
7964     }
7965 
7967     if (UseBASE64Intrinsics) {
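           // Constant lookup tables for the Base64 stubs; the AVX2 and AVX-512 VBMI
           // tables are only generated when the corresponding CPU features are available.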
7968       if (VM_Version::supports_avx2() &&
7969          VM_Version::supports_avx512bw() &&
7970          VM_Version::supports_avx512vl()) {
7971         StubRoutines::x86::_avx2_shuffle_base64 = base64_avx2_shuffle_addr();
7972         StubRoutines::x86::_avx2_input_mask_base64 = base64_avx2_input_mask_addr();
7973         StubRoutines::x86::_avx2_lut_base64 = base64_avx2_lut_addr();
7974       }
7975       StubRoutines::x86::_encoding_table_base64 = base64_encoding_table_addr();
7976       if (VM_Version::supports_avx512_vbmi()) {
7977         StubRoutines::x86::_shuffle_base64 = base64_shuffle_addr();
7978         StubRoutines::x86::_lookup_lo_base64 = base64_vbmi_lookup_lo_addr();
7979         StubRoutines::x86::_lookup_hi_base64 = base64_vbmi_lookup_hi_addr();
7980         StubRoutines::x86::_lookup_lo_base64url = base64_vbmi_lookup_lo_url_addr();
7981         StubRoutines::x86::_lookup_hi_base64url = base64_vbmi_lookup_hi_url_addr();
7982         StubRoutines::x86::_pack_vec_base64 = base64_vbmi_pack_vec_addr();
7983         StubRoutines::x86::_join_0_1_base64 = base64_vbmi_join_0_1_addr();
7984         StubRoutines::x86::_join_1_2_base64 = base64_vbmi_join_1_2_addr();
7985         StubRoutines::x86::_join_2_3_base64 = base64_vbmi_join_2_3_addr();
7986       }
7987       StubRoutines::x86::_decoding_table_base64 = base64_decoding_table_addr();
7988       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7989       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7990     }
7991 
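         // nmethod entry barrier stub, generated only when the GC's barrier set arms nmethod entries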
7992     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7993     if (bs_nm != NULL) {
7994       StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
7995     }
7996 #ifdef COMPILER2
7997     if (UseMultiplyToLenIntrinsic) {
7998       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7999     }
8000     if (UseSquareToLenIntrinsic) {
8001       StubRoutines::_squareToLen = generate_squareToLen();
8002     }
8003     if (UseMulAddIntrinsic) {
8004       StubRoutines::_mulAdd = generate_mulAdd();
8005     }
8006     if (VM_Version::supports_avx512_vbmi2()) {
8007       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8008       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
8009     }
8010     if (UseMontgomeryMultiplyIntrinsic) {
8011       StubRoutines::_montgomeryMultiply
8012         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
8013     }
8014     if (UseMontgomerySquareIntrinsic) {
8015       StubRoutines::_montgomerySquare
8016         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
8017     }
8018 
8019     // Get SVML stub routine addresses
8020     void *libsvml = NULL;
8021     char ebuf[1024];
8022     char dll_name[JVM_MAXPATHLEN];
8023     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "svml")) {
8024       libsvml = os::dll_load(dll_name, ebuf, sizeof(ebuf));
8025     }
8026     if (libsvml != NULL) {
8027       // SVML method naming convention
8028       //   All the methods are named __svml_<op><T><N>_ha_<VV>
8029       //   Where:
8030       //      <op> is the operation name and ha stands for high accuracy
8031       //      <T> is an optional type marker:
8032       //              f for a vector float operation
8033       //              omitted for a vector double operation
8034       //      <N> is the number of elements in the vector:
8035       //              1, 2, 4, 8 or 16
8036       //              e.g. a 128-bit float vector has 4 float elements
8037       //      <VV> indicates the AVX/SSE level:
8038       //              z0 is AVX-512, l9 is AVX2, e9 is AVX1 and ex is SSE2
8039       //      e.g. __svml_expf16_ha_z0 computes exp for a 16-element float vector using AVX-512 instructions
8040       //           __svml_exp8_ha_z0 computes exp for an 8-element double vector using AVX-512 instructions
8041 
8042       log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "svml" JNI_LIB_SUFFIX, p2i(libsvml));
8043       if (UseAVX > 2) {
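             // 512-bit ("z0") variants; the log/log10/pow lookups additionally require AVX-512DQ.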
8044         for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
8045           int vop = VectorSupport::VECTOR_OP_SVML_START + op;
8046           if ((!VM_Version::supports_avx512dq()) &&
8047               (vop == VectorSupport::VECTOR_OP_LOG || vop == VectorSupport::VECTOR_OP_LOG10 || vop == VectorSupport::VECTOR_OP_POW)) {
8048             continue;
8049           }
8050           snprintf(ebuf, sizeof(ebuf), "__svml_%sf16_ha_z0", VectorSupport::svmlname[op]);
8051           StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libsvml, ebuf);
8052 
8053           snprintf(ebuf, sizeof(ebuf), "__svml_%s8_ha_z0", VectorSupport::svmlname[op]);
8054           StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libsvml, ebuf);
8055         }
8056       }
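           // Suffix for the narrower (64/128/256-bit) variants: l9 = AVX2, e9 = AVX1, ex = SSE2.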
8057       const char* avx_sse_str = (UseAVX >= 2) ? "l9" : ((UseAVX == 1) ? "e9" : "ex");
8058       for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
8059         int vop = VectorSupport::VECTOR_OP_SVML_START + op;
8060         if (vop == VectorSupport::VECTOR_OP_POW) {
8061           continue;
8062         }
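             // 64-bit (2-element) float vectors map to the same 4-element ("f4") entry points as 128-bit vectors.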
8063         snprintf(ebuf, sizeof(ebuf), "__svml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8064         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsvml, ebuf);
8065 
8066         snprintf(ebuf, sizeof(ebuf), "__svml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8067         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsvml, ebuf);
8068 
8069         snprintf(ebuf, sizeof(ebuf), "__svml_%sf8_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8070         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libsvml, ebuf);
8071 
8072         snprintf(ebuf, sizeof(ebuf), "__svml_%s1_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8073         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsvml, ebuf);
8074 
8075         snprintf(ebuf, sizeof(ebuf), "__svml_%s2_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8076         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsvml, ebuf);
8077 
8078         snprintf(ebuf, sizeof(ebuf), "__svml_%s4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
8079         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libsvml, ebuf);
8080       }
8081     }
8082 #endif // COMPILER2
8083 
8084     if (UseVectorizedMismatchIntrinsic) {
8085       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
8086     }
8087   }
8088 
8089  public:
8090   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
8091     if (all) {
8092       generate_all();
8093     } else {
8094       generate_initial();
8095     }
8096   }
8097 }; // end class StubGenerator
8098 
8099 #define UCM_TABLE_MAX_ENTRIES 16
8100 void StubGenerator_generate(CodeBuffer* code, bool all) {
8101   if (UnsafeCopyMemory::_table == NULL) {
8102     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
8103   }
8104   StubGenerator g(code, all);
8105 }