1 /*
   2  * Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/oopMap.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "gc/shared/barrierSetNMethod.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/universe.hpp"
  34 #include "nativeInst_x86.hpp"
  35 #include "oops/instanceOop.hpp"
  36 #include "oops/method.hpp"
  37 #include "oops/objArrayKlass.hpp"
  38 #include "oops/oop.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "runtime/frame.inline.hpp"
  41 #include "runtime/handles.inline.hpp"
  42 #include "runtime/sharedRuntime.hpp"
  43 #include "runtime/stubCodeGenerator.hpp"
  44 #include "runtime/stubRoutines.hpp"
  45 #include "runtime/thread.inline.hpp"
  46 #ifdef COMPILER2
  47 #include "opto/runtime.hpp"
  48 #endif
  49 
  50 // Declaration and definition of StubGenerator (no .hpp file).
  51 // For a more detailed description of the stub routine structure
  52 // see the comment in stubRoutines.hpp
  53 
  54 #define __ _masm->
  55 #define a__ ((Assembler*)_masm)->
  56 
  57 #ifdef PRODUCT
  58 #define BLOCK_COMMENT(str) /* nothing */
  59 #else
  60 #define BLOCK_COMMENT(str) __ block_comment(str)
  61 #endif
  62 
  63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  64 
  65 const int MXCSR_MASK  = 0xFFC0;  // Mask out any pending exceptions
  66 const int FPU_CNTRL_WRD_MASK = 0xFFFF;
  67 
  68 // -------------------------------------------------------------------------------------------------------------------------
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ incrementl(ExternalAddress((address)&counter));
  79   }
  80 #define inc_counter_np(counter) \
  81   BLOCK_COMMENT("inc_counter " #counter); \
  82   inc_counter_np_(counter);
  83 #endif //PRODUCT
  84 
  85   void inc_copy_counter_np(BasicType t) {
  86 #ifndef PRODUCT
  87     switch (t) {
  88     case T_BYTE:    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); return;
  89     case T_SHORT:   inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); return;
  90     case T_INT:     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); return;
  91     case T_LONG:    inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); return;
  92     case T_OBJECT:  inc_counter_np(SharedRuntime::_oop_array_copy_ctr); return;
  93     default:        ShouldNotReachHere();
  94     }
  95 #endif //PRODUCT
  96   }
  97 
  98   //------------------------------------------------------------------------------------------------------------------------
  99   // Call stubs are used to call Java from C
 100   //
 101   //    [ return_from_Java     ] <--- rsp
 102   //    [ argument word n      ]
 103   //      ...
 104   // -N [ argument word 1      ]
 105   // -7 [ Possible padding for stack alignment ]
 106   // -6 [ Possible padding for stack alignment ]
 107   // -5 [ Possible padding for stack alignment ]
 108   // -4 [ mxcsr save           ] <--- rsp_after_call
  // -3 [ saved rbx            ]
  // -2 [ saved rsi            ]
  // -1 [ saved rdi            ]
  //  0 [ saved rbp            ] <--- rbp
 113   //  1 [ return address       ]
 114   //  2 [ ptr. to call wrapper ]
 115   //  3 [ result               ]
 116   //  4 [ result_type          ]
 117   //  5 [ method               ]
 118   //  6 [ entry_point          ]
 119   //  7 [ parameters           ]
 120   //  8 [ parameter_size       ]
 121   //  9 [ thread               ]
 122 
 123 
  // Generate the call stub used to call Java from C (entry-frame layout
  // pictured above).  Copies the Java arguments onto the stack in reverse
  // order, calls the method's entry point, and stores the typed result
  // through the 'result' slot.  'return_address' receives the pc just
  // after the Java call, which generate_catch_exception() jumps back to.
  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // stub code parameters / addresses
    // (offsets correspond to the entry-frame layout pictured above)
    assert(frame::entry_frame_call_wrapper_offset == 2, "adjust this code");
    bool  sse_save = false;
    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_catch_exception()!
    const int     locals_count_in_bytes  (4*wordSize);
    const Address mxcsr_save    (rbp, -4 * wordSize);
    const Address saved_rbx     (rbp, -3 * wordSize);
    const Address saved_rsi     (rbp, -2 * wordSize);
    const Address saved_rdi     (rbp, -1 * wordSize);
    const Address result        (rbp,  3 * wordSize);
    const Address result_type   (rbp,  4 * wordSize);
    const Address method        (rbp,  5 * wordSize);
    const Address entry_point   (rbp,  6 * wordSize);
    const Address parameters    (rbp,  7 * wordSize);
    const Address parameter_size(rbp,  8 * wordSize);
    const Address thread        (rbp,  9 * wordSize); // same as in generate_catch_exception()!
    sse_save =  UseSSE > 0;

    // stub code
    __ enter();
    __ movptr(rcx, parameter_size);              // parameter counter
    __ shlptr(rcx, Interpreter::logStackElementSize); // convert parameter count to bytes
    __ addptr(rcx, locals_count_in_bytes);       // reserve space for register saves
    __ subptr(rsp, rcx);
    __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack

    // save rdi, rsi, & rbx according to C calling conventions
    __ movptr(saved_rdi, rdi);
    __ movptr(saved_rsi, rsi);
    __ movptr(saved_rbx, rbx);

    // save and initialize %mxcsr
    // (only reload the VM-standard value when the caller's control/mask
    //  bits, selected by MXCSR_MASK, differ from it)
    if (sse_save) {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }

    // make sure the x87 control word is correct.
    __ fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_std()));

#ifdef ASSERT
    // make sure we have no pending exceptions
    { Label L;
      __ movptr(rcx, thread);
      __ cmpptr(Address(rcx, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(rcx, parameter_size);  // parameter counter
    __ testl(rcx, rcx);
    __ jcc(Assembler::zero, parameters_done);

    // parameter passing loop

    Label loop;
    // Copy Java parameters in reverse order (receiver last)
    // Note that the argument order is inverted in the process
    // source is rdx[rcx: N-1..0]
    // dest   is rsp[rbx: 0..N-1]

    __ movptr(rdx, parameters);          // parameter pointer
    __ xorptr(rbx, rbx);                 // dest index starts at 0

    __ BIND(loop);

    // get parameter (rcx counts down from N, so -wordSize addresses slot rcx-1)
    __ movptr(rax, Address(rdx, rcx, Interpreter::stackElementScale(), -wordSize));
    __ movptr(Address(rsp, rbx, Interpreter::stackElementScale(),
                    Interpreter::expr_offset_in_bytes(0)), rax);          // store parameter
    __ increment(rbx);
    __ decrement(rcx);
    __ jcc(Assembler::notZero, loop);

    // call Java function
    __ BIND(parameters_done);
    __ movptr(rbx, method);           // get Method*
    __ movptr(rax, entry_point);      // get entry_point
    __ mov(rsi, rsp);                 // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(rax);

    BLOCK_COMMENT("call_stub_return_address:");
    return_address = __ pc();

#ifdef COMPILER2
    {
      Label L_skip;
      if (UseSSE >= 2) {
        __ verify_FPU(0, "call_stub_return");
      } else {
        // Free all x87 stack slots except st(0), which may hold a
        // T_DOUBLE (or, with UseSSE == 0, T_FLOAT) return value.
        for (int i = 1; i < 8; i++) {
          __ ffree(i);
        }

        // UseSSE <= 1 so double result should be left on TOS
        __ movl(rsi, result_type);
        __ cmpl(rsi, T_DOUBLE);
        __ jcc(Assembler::equal, L_skip);
        if (UseSSE == 0) {
          // UseSSE == 0 so float result should be left on TOS
          __ cmpl(rsi, T_FLOAT);
          __ jcc(Assembler::equal, L_skip);
        }
        // not a floating-point result: st(0) holds no value to keep
        __ ffree(0);
      }
      __ BIND(L_skip);
    }
#endif // COMPILER2

    // store result depending on type
    // (everything that is not T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ movptr(rdi, result);
    Label is_long, is_float, is_double, exit;
    __ movl(rsi, result_type);
    __ cmpl(rsi, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(rsi, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(rsi, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);

    // handle T_INT case
    __ movl(Address(rdi, 0), rax);
    __ BIND(exit);

    // check that FPU stack is empty
    __ verify_FPU(0, "generate_call_stub");

    // pop parameters
    __ lea(rsp, rsp_after_call);

    // restore %mxcsr to the caller's saved value
    if (sse_save) {
      __ ldmxcsr(mxcsr_save);
    }

    // restore rdi, rsi and rbx
    __ movptr(rbx, saved_rbx);
    __ movptr(rsi, saved_rsi);
    __ movptr(rdi, saved_rdi);
    __ addptr(rsp, 4*wordSize);

    // return
    __ pop(rbp);
    __ ret(0);

    // handle return types different from T_INT
    // T_LONG: the two result halves are in rax (word 0) and rdx (word 1)
    __ BIND(is_long);
    __ movl(Address(rdi, 0 * wordSize), rax);
    __ movl(Address(rdi, 1 * wordSize), rdx);
    __ jmp(exit);

    __ BIND(is_float);
    // interpreter uses xmm0 for return values
    if (UseSSE >= 1) {
      __ movflt(Address(rdi, 0), xmm0);
    } else {
      __ fstp_s(Address(rdi, 0));    // pop the x87 TOS into the result slot
    }
    __ jmp(exit);

    __ BIND(is_double);
    // interpreter uses xmm0 for return values
    if (UseSSE >= 2) {
      __ movdbl(Address(rdi, 0), xmm0);
    } else {
      __ fstp_d(Address(rdi, 0));    // pop the x87 TOS into the result slot
    }
    __ jmp(exit);

    return start;
  }
 313 
 314 
 315   //------------------------------------------------------------------------------------------------------------------------
 316   // Return point for a Java call if there's an exception thrown in Java code.
 317   // The exception is caught and transformed into a pending exception stored in
 318   // JavaThread that can be tested from within the VM.
 319   //
 320   // Note: Usually the parameters are removed by the callee. In case of an exception
 321   //       crossing an activation frame boundary, that is not the case if the callee
 322   //       is compiled code => need to setup the rsp.
 323   //
 324   // rax,: exception oop
 325 
  // Generate the stub that a Java call returns to when it terminates with
  // an exception (see the comment above).  Records the exception oop as a
  // pending exception on the thread and resumes in the call stub, which
  // unwinds the entry frame normally.  rax holds the exception oop.
  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    // Frame offsets must match generate_call_stub(): we are still inside
    // the entry frame it set up.
    // (rsp_after_call is declared but not referenced below — presumably
    //  kept to document the shared layout; confirm before removing)
    const Address rsp_after_call(rbp, -4 * wordSize); // same as in generate_call_stub()!
    const Address thread        (rbp,  9 * wordSize); // same as in generate_call_stub()!
    address start = __ pc();

    // get thread directly from the entry-frame slot
    __ movptr(rcx, thread);
#ifdef ASSERT
    // verify that the frame slot and the current thread correspond
    { Label L;
      __ get_thread(rbx);
      __ cmpptr(rbx, rcx);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif
    // set pending exception, recording this file/line for diagnostics
    __ verify_oop(rax);
    __ movptr(Address(rcx, Thread::pending_exception_offset()), rax          );
    __ lea(Address(rcx, Thread::exception_file_offset   ()),
           ExternalAddress((address)__FILE__));
    __ movl(Address(rcx, Thread::exception_line_offset   ()), __LINE__ );
    // complete return to VM by jumping back into the call stub just after
    // its Java call
    assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));

    return start;
  }
 356 
 357 
 358   //------------------------------------------------------------------------------------------------------------------------
 359   // Continuation point for runtime calls returning with a pending exception.
 360   // The pending exception check happened in the runtime or native call stub.
 361   // The pending exception in Thread is converted into a Java-level exception.
 362   //
 363   // Contract with Java-level exception handlers:
 364   // rax: exception
 365   // rdx: throwing pc
 366   //
 367   // NOTE: At entry of this stub, exception-pc must be on stack !!
 368 
  // Generate the continuation stub for runtime calls returning with a
  // pending exception (see the contract comment above).  Converts the
  // thread's pending exception into rax/rdx and jumps to the Java-level
  // handler for the throwing pc found on top of the stack.
  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();
    const Register thread = rcx;

    // other registers used in this stub
    const Register exception_oop = rax;
    const Register handler_addr  = rbx;
    const Register exception_pc  = rdx;

    // Upon entry, the sp points to the return address returning into Java
    // (interpreted or compiled) code; i.e., the return address becomes the
    // throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack but
    // the exception handler will reset the stack pointer -> ignore them.
    // A potential result in registers can be ignored as well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    { Label L;
      __ get_thread(thread);
      __ cmpptr(Address(thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into rbx
    // (the throwing pc is the return address currently on top of the stack)
    __ get_thread(thread);
    __ movptr(exception_pc, Address(rsp, 0));
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), thread, exception_pc);
    __ mov(handler_addr, rax);

    // setup rax & rdx, remove return address & clear pending exception
    // (re-load the thread: rcx may not have survived the leaf call above)
    __ get_thread(thread);
    __ pop(exception_pc);
    __ movptr(exception_oop, Address(thread, Thread::pending_exception_offset()));
    __ movptr(Address(thread, Thread::pending_exception_offset()), NULL_WORD);

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ testptr(exception_oop, exception_oop);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // Verify that there is really a valid exception in RAX.
    __ verify_oop(exception_oop);

    // continue at exception handler (return address removed)
    // rax: exception
    // rbx: exception handler
    // rdx: throwing pc
    __ jmp(handler_addr);

    return start;
  }
 432 
 433   //----------------------------------------------------------------------------------------------------
 434   // Support for void verify_mxcsr()
 435   //
 436   // This routine is used with -Xcheck:jni to verify that native
 437   // JNI code does not return to Java code without restoring the
 438   // MXCSR register to our expected state.
 439 
 440 
  // Generate the -Xcheck:jni MXCSR verifier (see the comment above):
  // compares the live MXCSR control/mask bits against the VM-standard
  // value, warns and restores the standard value on mismatch.  When the
  // checks are disabled the stub degenerates to a bare 'ret'.
  address generate_verify_mxcsr() {
    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
    address start = __ pc();

    const Address mxcsr_save(rsp, 0);

    if (CheckJNICalls && UseSSE > 0 ) {
      Label ok_ret;
      ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
      __ push(rax);                  // preserve rax, used as scratch below
      __ subptr(rsp, wordSize);      // allocate a temp location
      __ stmxcsr(mxcsr_save);        // store live MXCSR into the temp slot
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);      // only compare control and mask bits
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, ok_ret);

      __ warn("MXCSR changed by native JNI code.");

      // restore the expected MXCSR before resuming Java code
      __ ldmxcsr(mxcsr_std);

      __ bind(ok_ret);
      __ addptr(rsp, wordSize);      // release the temp slot
      __ pop(rax);
    }

    __ ret(0);

    return start;
  }
 471 
 472 
 473   //---------------------------------------------------------------------------
 474   // Support for void verify_fpu_cntrl_wrd()
 475   //
 476   // This routine is used with -Xcheck:jni to verify that native
 477   // JNI code does not return to Java code without restoring the
 478   // FP control word to our expected state.
 479 
  // Generate the -Xcheck:jni x87 control word verifier (see the comment
  // above): compares the live FPU control word against the VM-standard
  // value, warns and restores it on mismatch.  When CheckJNICalls is off
  // the stub is a bare 'ret'.
  // NOTE(review): the StubCodeMark name "verify_spcw" differs from the
  // function name — presumably historical; confirm before renaming.
  address generate_verify_fpu_cntrl_wrd() {
    StubCodeMark mark(this, "StubRoutines", "verify_spcw");
    address start = __ pc();

    const Address fpu_cntrl_wrd_save(rsp, 0);

    if (CheckJNICalls) {
      Label ok_ret;
      __ push(rax);                  // preserve rax, used as scratch below
      __ subptr(rsp, wordSize);      // allocate a temp location
      __ fnstcw(fpu_cntrl_wrd_save); // store live control word in temp slot
      __ movl(rax, fpu_cntrl_wrd_save);
      __ andl(rax, FPU_CNTRL_WRD_MASK);
      ExternalAddress fpu_std(StubRoutines::x86::addr_fpu_cntrl_wrd_std());
      __ cmp32(rax, fpu_std);
      __ jcc(Assembler::equal, ok_ret);

      __ warn("Floating point control word changed by native JNI code.");

      // restore the expected control word before resuming Java code
      __ fldcw(fpu_std);

      __ bind(ok_ret);
      __ addptr(rsp, wordSize);      // release the temp slot
      __ pop(rax);
    }

    __ ret(0);

    return start;
  }
 510 
 511   //---------------------------------------------------------------------------
 512   // Wrapper for slow-case handling of double-to-integer conversion
 513   // d2i or f2i fast case failed either because it is nan or because
 514   // of under/overflow.
 515   // Input:  FPU TOS: float value
 516   // Output: rax, (rdx): integer (long) result
 517 
  // Generate the slow-path wrapper for d2i/d2l/f2i/f2l (see the comment
  // above).  't' selects T_INT vs T_LONG; 'fcn' is the C helper to call.
  // Input:  x87 TOS holds the value.  Output: rax (and rdx for T_LONG).
  address generate_d2i_wrapper(BasicType t, address fcn) {
    StubCodeMark mark(this, "StubRoutines", "d2i_wrapper");
    address start = __ pc();

  // Capture info about frame layout
  // (word offsets from rsp once all the saves below are done; after the
  //  five register pushes plus push_FPU_state(), the argument spilled at
  //  entry sits saved_argument_off words above rsp)
  enum layout { FPUState_off         = 0,
                rbp_off              = FPUStateSizeInWords,
                rdi_off,
                rsi_off,
                rcx_off,
                rbx_off,
                saved_argument_off,
                saved_argument_off2, // 2nd half of double
                framesize
  };

  assert(FPUStateSizeInWords == 27, "update stack layout");

    // Save outgoing argument to stack across push_FPU_state()
    // (fstp_d pops the value off the x87 stack)
    __ subptr(rsp, wordSize * 2);
    __ fstp_d(Address(rsp, 0));

    // Save CPU & FPU state
    __ push(rbx);
    __ push(rcx);
    __ push(rsi);
    __ push(rdi);
    __ push(rbp);
    __ push_FPU_state();

    // push_FPU_state() resets the FP top of stack
    // Load original double into FP top of stack
    __ fld_d(Address(rsp, saved_argument_off * wordSize));
    // Store double into stack as outgoing argument for the C helper
    __ subptr(rsp, wordSize*2);
    __ fst_d(Address(rsp, 0));

    // Prepare FPU for doing math in C-land
    __ empty_FPU_stack();
    // Call the C code to massage the double.  Result in EAX
    // (the 2 is the number of argument words — the double at [rsp])
    if (t == T_INT)
      { BLOCK_COMMENT("SharedRuntime::d2i"); }
    else if (t == T_LONG)
      { BLOCK_COMMENT("SharedRuntime::d2l"); }
    __ call_VM_leaf( fcn, 2 );

    // Restore CPU & FPU state
    __ pop_FPU_state();
    __ pop(rbp);
    __ pop(rdi);
    __ pop(rsi);
    __ pop(rcx);
    __ pop(rbx);
    __ addptr(rsp, wordSize * 2);   // discard the spill slots from entry

    __ ret(0);

    return start;
  }
 577   //---------------------------------------------------------------------------------------------------
 578 
 579   address generate_vector_mask(const char *stub_name, int32_t mask) {
 580     __ align(CodeEntryAlignment);
 581     StubCodeMark mark(this, "StubRoutines", stub_name);
 582     address start = __ pc();
 583 
 584     for (int i = 0; i < 16; i++) {
 585       __ emit_data(mask, relocInfo::none, 0);
 586     }
 587 
 588     return start;
 589   }
 590 
 591   address generate_count_leading_zeros_lut(const char *stub_name) {
 592     __ align64();
 593     StubCodeMark mark(this, "StubRoutines", stub_name);
 594     address start = __ pc();
 595     __ emit_data(0x02020304, relocInfo::none, 0);
 596     __ emit_data(0x01010101, relocInfo::none, 0);
 597     __ emit_data(0x00000000, relocInfo::none, 0);
 598     __ emit_data(0x00000000, relocInfo::none, 0);
 599     __ emit_data(0x02020304, relocInfo::none, 0);
 600     __ emit_data(0x01010101, relocInfo::none, 0);
 601     __ emit_data(0x00000000, relocInfo::none, 0);
 602     __ emit_data(0x00000000, relocInfo::none, 0);
 603     __ emit_data(0x02020304, relocInfo::none, 0);
 604     __ emit_data(0x01010101, relocInfo::none, 0);
 605     __ emit_data(0x00000000, relocInfo::none, 0);
 606     __ emit_data(0x00000000, relocInfo::none, 0);
 607     __ emit_data(0x02020304, relocInfo::none, 0);
 608     __ emit_data(0x01010101, relocInfo::none, 0);
 609     __ emit_data(0x00000000, relocInfo::none, 0);
 610     __ emit_data(0x00000000, relocInfo::none, 0);
 611     return start;
 612   }
 613 
 614 
 615   address generate_popcount_avx_lut(const char *stub_name) {
 616     __ align64();
 617     StubCodeMark mark(this, "StubRoutines", stub_name);
 618     address start = __ pc();
 619     __ emit_data(0x02010100, relocInfo::none, 0);
 620     __ emit_data(0x03020201, relocInfo::none, 0);
 621     __ emit_data(0x03020201, relocInfo::none, 0);
 622     __ emit_data(0x04030302, relocInfo::none, 0);
 623     __ emit_data(0x02010100, relocInfo::none, 0);
 624     __ emit_data(0x03020201, relocInfo::none, 0);
 625     __ emit_data(0x03020201, relocInfo::none, 0);
 626     __ emit_data(0x04030302, relocInfo::none, 0);
 627     __ emit_data(0x02010100, relocInfo::none, 0);
 628     __ emit_data(0x03020201, relocInfo::none, 0);
 629     __ emit_data(0x03020201, relocInfo::none, 0);
 630     __ emit_data(0x04030302, relocInfo::none, 0);
 631     __ emit_data(0x02010100, relocInfo::none, 0);
 632     __ emit_data(0x03020201, relocInfo::none, 0);
 633     __ emit_data(0x03020201, relocInfo::none, 0);
 634     __ emit_data(0x04030302, relocInfo::none, 0);
 635     return start;
 636   }
 637 
 638 
 639   address generate_iota_indices(const char *stub_name) {
 640     __ align(CodeEntryAlignment);
 641     StubCodeMark mark(this, "StubRoutines", stub_name);
 642     address start = __ pc();
 643     __ emit_data(0x03020100, relocInfo::none, 0);
 644     __ emit_data(0x07060504, relocInfo::none, 0);
 645     __ emit_data(0x0B0A0908, relocInfo::none, 0);
 646     __ emit_data(0x0F0E0D0C, relocInfo::none, 0);
 647     __ emit_data(0x13121110, relocInfo::none, 0);
 648     __ emit_data(0x17161514, relocInfo::none, 0);
 649     __ emit_data(0x1B1A1918, relocInfo::none, 0);
 650     __ emit_data(0x1F1E1D1C, relocInfo::none, 0);
 651     __ emit_data(0x23222120, relocInfo::none, 0);
 652     __ emit_data(0x27262524, relocInfo::none, 0);
 653     __ emit_data(0x2B2A2928, relocInfo::none, 0);
 654     __ emit_data(0x2F2E2D2C, relocInfo::none, 0);
 655     __ emit_data(0x33323130, relocInfo::none, 0);
 656     __ emit_data(0x37363534, relocInfo::none, 0);
 657     __ emit_data(0x3B3A3938, relocInfo::none, 0);
 658     __ emit_data(0x3F3E3D3C, relocInfo::none, 0);
 659     return start;
 660   }
 661 
 662   address generate_vector_reverse_bit_lut(const char *stub_name) {
 663     __ align(CodeEntryAlignment);
 664     StubCodeMark mark(this, "StubRoutines", stub_name);
 665     address start = __ pc();
 666     __ emit_data(0x0C040800, relocInfo::none, 0);
 667     __ emit_data(0x0E060A02, relocInfo::none, 0);
 668     __ emit_data(0x0D050901, relocInfo::none, 0);
 669     __ emit_data(0x0F070B03, relocInfo::none, 0);
 670     __ emit_data(0x0C040800, relocInfo::none, 0);
 671     __ emit_data(0x0E060A02, relocInfo::none, 0);
 672     __ emit_data(0x0D050901, relocInfo::none, 0);
 673     __ emit_data(0x0F070B03, relocInfo::none, 0);
 674     __ emit_data(0x0C040800, relocInfo::none, 0);
 675     __ emit_data(0x0E060A02, relocInfo::none, 0);
 676     __ emit_data(0x0D050901, relocInfo::none, 0);
 677     __ emit_data(0x0F070B03, relocInfo::none, 0);
 678     __ emit_data(0x0C040800, relocInfo::none, 0);
 679     __ emit_data(0x0E060A02, relocInfo::none, 0);
 680     __ emit_data(0x0D050901, relocInfo::none, 0);
 681     __ emit_data(0x0F070B03, relocInfo::none, 0);
 682     return start;
 683   }
 684 
 685   address generate_vector_reverse_byte_perm_mask_long(const char *stub_name) {
 686     __ align(CodeEntryAlignment);
 687     StubCodeMark mark(this, "StubRoutines", stub_name);
 688     address start = __ pc();
 689     __ emit_data(0x04050607, relocInfo::none, 0);
 690     __ emit_data(0x00010203, relocInfo::none, 0);
 691     __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
 692     __ emit_data(0x08090A0B, relocInfo::none, 0);
 693     __ emit_data(0x04050607, relocInfo::none, 0);
 694     __ emit_data(0x00010203, relocInfo::none, 0);
 695     __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
 696     __ emit_data(0x08090A0B, relocInfo::none, 0);
 697     __ emit_data(0x04050607, relocInfo::none, 0);
 698     __ emit_data(0x00010203, relocInfo::none, 0);
 699     __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
 700     __ emit_data(0x08090A0B, relocInfo::none, 0);
 701     __ emit_data(0x04050607, relocInfo::none, 0);
 702     __ emit_data(0x00010203, relocInfo::none, 0);
 703     __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
 704     __ emit_data(0x08090A0B, relocInfo::none, 0);
 705     return start;
 706   }
 707 
 708   address generate_vector_reverse_byte_perm_mask_int(const char *stub_name) {
 709     __ align(CodeEntryAlignment);
 710     StubCodeMark mark(this, "StubRoutines", stub_name);
 711     address start = __ pc();
 712     __ emit_data(0x00010203, relocInfo::none, 0);
 713     __ emit_data(0x04050607, relocInfo::none, 0);
 714     __ emit_data(0x08090A0B, relocInfo::none, 0);
 715     __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
 716     __ emit_data(0x00010203, relocInfo::none, 0);
 717     __ emit_data(0x04050607, relocInfo::none, 0);
 718     __ emit_data(0x08090A0B, relocInfo::none, 0);
 719     __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
 720     __ emit_data(0x00010203, relocInfo::none, 0);
 721     __ emit_data(0x04050607, relocInfo::none, 0);
 722     __ emit_data(0x08090A0B, relocInfo::none, 0);
 723     __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
 724     __ emit_data(0x00010203, relocInfo::none, 0);
 725     __ emit_data(0x04050607, relocInfo::none, 0);
 726     __ emit_data(0x08090A0B, relocInfo::none, 0);
 727     __ emit_data(0x0C0D0E0F, relocInfo::none, 0);
 728     return start;
 729   }
 730 
 731   address generate_vector_reverse_byte_perm_mask_short(const char *stub_name) {
 732     __ align(CodeEntryAlignment);
 733     StubCodeMark mark(this, "StubRoutines", stub_name);
 734     address start = __ pc();
 735     __ emit_data(0x02030001, relocInfo::none, 0);
 736     __ emit_data(0x06070405, relocInfo::none, 0);
 737     __ emit_data(0x0A0B0809, relocInfo::none, 0);
 738     __ emit_data(0x0E0F0C0D, relocInfo::none, 0);
 739     __ emit_data(0x02030001, relocInfo::none, 0);
 740     __ emit_data(0x06070405, relocInfo::none, 0);
 741     __ emit_data(0x0A0B0809, relocInfo::none, 0);
 742     __ emit_data(0x0E0F0C0D, relocInfo::none, 0);
 743     __ emit_data(0x02030001, relocInfo::none, 0);
 744     __ emit_data(0x06070405, relocInfo::none, 0);
 745     __ emit_data(0x0A0B0809, relocInfo::none, 0);
 746     __ emit_data(0x0E0F0C0D, relocInfo::none, 0);
 747     __ emit_data(0x02030001, relocInfo::none, 0);
 748     __ emit_data(0x06070405, relocInfo::none, 0);
 749     __ emit_data(0x0A0B0809, relocInfo::none, 0);
 750     __ emit_data(0x0E0F0C0D, relocInfo::none, 0);
 751     return start;
 752   }
 753 
 754   address generate_vector_byte_shuffle_mask(const char *stub_name) {
 755     __ align(CodeEntryAlignment);
 756     StubCodeMark mark(this, "StubRoutines", stub_name);
 757     address start = __ pc();
 758     __ emit_data(0x70707070, relocInfo::none, 0);
 759     __ emit_data(0x70707070, relocInfo::none, 0);
 760     __ emit_data(0x70707070, relocInfo::none, 0);
 761     __ emit_data(0x70707070, relocInfo::none, 0);
 762     __ emit_data(0xF0F0F0F0, relocInfo::none, 0);
 763     __ emit_data(0xF0F0F0F0, relocInfo::none, 0);
 764     __ emit_data(0xF0F0F0F0, relocInfo::none, 0);
 765     __ emit_data(0xF0F0F0F0, relocInfo::none, 0);
 766     return start;
 767   }
 768 
 769   address generate_vector_mask_long_double(const char *stub_name, int32_t maskhi, int32_t masklo) {
 770     __ align(CodeEntryAlignment);
 771     StubCodeMark mark(this, "StubRoutines", stub_name);
 772     address start = __ pc();
 773 
 774     for (int i = 0; i < 8; i++) {
 775       __ emit_data(masklo, relocInfo::none, 0);
 776       __ emit_data(maskhi, relocInfo::none, 0);
 777     }
 778 
 779     return start;
 780   }
 781 
 782   //----------------------------------------------------------------------------------------------------
 783 
 784   address generate_vector_byte_perm_mask(const char *stub_name) {
 785     __ align(CodeEntryAlignment);
 786     StubCodeMark mark(this, "StubRoutines", stub_name);
 787     address start = __ pc();
 788 
 789     __ emit_data(0x00000001, relocInfo::none, 0);
 790     __ emit_data(0x00000000, relocInfo::none, 0);
 791     __ emit_data(0x00000003, relocInfo::none, 0);
 792     __ emit_data(0x00000000, relocInfo::none, 0);
 793     __ emit_data(0x00000005, relocInfo::none, 0);
 794     __ emit_data(0x00000000, relocInfo::none, 0);
 795     __ emit_data(0x00000007, relocInfo::none, 0);
 796     __ emit_data(0x00000000, relocInfo::none, 0);
 797     __ emit_data(0x00000000, relocInfo::none, 0);
 798     __ emit_data(0x00000000, relocInfo::none, 0);
 799     __ emit_data(0x00000002, relocInfo::none, 0);
 800     __ emit_data(0x00000000, relocInfo::none, 0);
 801     __ emit_data(0x00000004, relocInfo::none, 0);
 802     __ emit_data(0x00000000, relocInfo::none, 0);
 803     __ emit_data(0x00000006, relocInfo::none, 0);
 804     __ emit_data(0x00000000, relocInfo::none, 0);
 805 
 806     return start;
 807   }
 808 
 809   address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
 810                                      int32_t val0, int32_t val1, int32_t val2, int32_t val3,
 811                                      int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
 812                                      int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
 813                                      int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
 814     __ align(CodeEntryAlignment);
 815     StubCodeMark mark(this, "StubRoutines", stub_name);
 816     address start = __ pc();
 817 
 818     assert(len != Assembler::AVX_NoVec, "vector len must be specified");
 819     __ emit_data(val0, relocInfo::none, 0);
 820     __ emit_data(val1, relocInfo::none, 0);
 821     __ emit_data(val2, relocInfo::none, 0);
 822     __ emit_data(val3, relocInfo::none, 0);
 823     if (len >= Assembler::AVX_256bit) {
 824       __ emit_data(val4, relocInfo::none, 0);
 825       __ emit_data(val5, relocInfo::none, 0);
 826       __ emit_data(val6, relocInfo::none, 0);
 827       __ emit_data(val7, relocInfo::none, 0);
 828       if (len >= Assembler::AVX_512bit) {
 829         __ emit_data(val8, relocInfo::none, 0);
 830         __ emit_data(val9, relocInfo::none, 0);
 831         __ emit_data(val10, relocInfo::none, 0);
 832         __ emit_data(val11, relocInfo::none, 0);
 833         __ emit_data(val12, relocInfo::none, 0);
 834         __ emit_data(val13, relocInfo::none, 0);
 835         __ emit_data(val14, relocInfo::none, 0);
 836         __ emit_data(val15, relocInfo::none, 0);
 837       }
 838     }
 839 
 840     return start;
 841   }
 842 
 843   //----------------------------------------------------------------------------------------------------
 844   // Non-destructive plausibility checks for oops
 845 
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    // Non-destructive plausibility check for an oop: verifies that the
    // pointer matches Universe::verify_oop_mask()/verify_oop_bits() and that
    // the object's klass pointer is non-NULL.  On failure all registers are
    // pushed, MacroAssembler::debug32 is called, and the VM is halted.
    //
    // Incoming arguments on stack after saving rax,:
    //
    // [tos    ]: saved rdx
    // [tos + 1]: saved EFLAGS
    // [tos + 2]: return address
    // [tos + 3]: char* error message
    // [tos + 4]: oop   object to verify
    // [tos + 5]: saved rax, - saved by caller and bashed

    Label exit, error;
    __ pushf();
    // bump the diagnostic counter of verify_oop invocations
    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ push(rdx);                                // save rdx
    // make sure object is 'reasonable'
    __ movptr(rax, Address(rsp, 4 * wordSize));    // get object
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, exit);               // if obj is NULL it is ok

    // Check if the oop is in the right area of memory
    const int oop_mask = Universe::verify_oop_mask();
    const int oop_bits = Universe::verify_oop_bits();
    __ mov(rdx, rax);
    __ andptr(rdx, oop_mask);
    __ cmpptr(rdx, oop_bits);
    __ jcc(Assembler::notZero, error);

    // make sure klass is 'reasonable', which is not zero.
    __ movptr(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, error);              // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);
    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
    __ pop(rdx);                                 // restore rdx
    __ popf();                                   // restore EFLAGS
    __ ret(3 * wordSize);                        // pop arguments

    // handle errors
    __ bind(error);
    __ movptr(rax, Address(rsp, 5 * wordSize));  // get saved rax, back
    __ pop(rdx);                                 // get saved rdx back
    __ popf();                                   // get saved EFLAGS off stack -- will be ignored
    __ pusha();                                  // push registers (eip = return address & msg are already pushed)
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
    __ hlt();                                    // never returns
    return start;
  }
 899 
 900 
  // Copy 64 bytes chunks
  //
  // Inputs:
  //   from        - source array address
  //   to_from     - destination array address - from
  //   qword_count - 8-bytes element count, positive
  //                 (the loop iterates while qword_count - 8 >= 0 and the
  //                 tail loop while qword_count > 0, so the count must be
  //                 non-negative on entry)
  //
  // The destination address of every store is computed as from + to_from,
  // i.e. the original 'to'.  Clobbers xmm0..xmm7 (fewer registers for the
  // AVX flavors).
  //
  void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
    assert( UseSSE >= 2, "supported cpu only" );
    Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;

    // Copy 64-byte chunks
    __ jmpb(L_copy_64_bytes);  // enter the loop at the count check
    __ align(OptoLoopAlignment);
  __ BIND(L_copy_64_bytes_loop);

    if (UseUnalignedLoadStores) {
      if (UseAVX > 2) {
        // one 64-byte ZMM move
        __ evmovdqul(xmm0, Address(from, 0), Assembler::AVX_512bit);
        __ evmovdqul(Address(from, to_from, Address::times_1, 0), xmm0, Assembler::AVX_512bit);
      } else if (UseAVX == 2) {
        // two 32-byte YMM moves
        __ vmovdqu(xmm0, Address(from,  0));
        __ vmovdqu(Address(from, to_from, Address::times_1,  0), xmm0);
        __ vmovdqu(xmm1, Address(from, 32));
        __ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
      } else {
        // four 16-byte XMM moves
        __ movdqu(xmm0, Address(from, 0));
        __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
        __ movdqu(xmm1, Address(from, 16));
        __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
        __ movdqu(xmm2, Address(from, 32));
        __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
        __ movdqu(xmm3, Address(from, 48));
        __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
      }
    } else {
      // eight 8-byte moves through XMM registers (no unaligned 16-byte ops)
      __ movq(xmm0, Address(from, 0));
      __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
      __ movq(xmm1, Address(from, 8));
      __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
      __ movq(xmm2, Address(from, 16));
      __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
      __ movq(xmm3, Address(from, 24));
      __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
      __ movq(xmm4, Address(from, 32));
      __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
      __ movq(xmm5, Address(from, 40));
      __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
      __ movq(xmm6, Address(from, 48));
      __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
      __ movq(xmm7, Address(from, 56));
      __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
    }

    __ addl(from, 64);  // advance source; to_from keeps destination in sync
  __ BIND(L_copy_64_bytes);
    __ subl(qword_count, 8);
    __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);  // while >= 8 qwords remain

    if (UseUnalignedLoadStores && (UseAVX == 2)) {
      // clean upper bits of YMM registers
      __ vpxor(xmm0, xmm0);
      __ vpxor(xmm1, xmm1);
    }
    __ addl(qword_count, 8);  // undo last subtraction: 0..7 qwords left
    __ jccb(Assembler::zero, L_exit);
    //
    // length is too short, just copy qwords
    //
  __ BIND(L_copy_8_bytes);
    __ movq(xmm0, Address(from, 0));
    __ movq(Address(from, to_from, Address::times_1), xmm0);
    __ addl(from, 8);
    __ decrement(qword_count);
    __ jcc(Assembler::greater, L_copy_8_bytes);
  __ BIND(L_exit);
  }
 978 
  // Generate a stub that copies a disjoint (non-overlapping) array region
  // of element type 't' from low to high addresses.
  //
  // Arguments on stack (C calling convention):
  //    4(rsp) - source array address
  //    8(rsp) - destination array address
  //   12(rsp) - element count
  //
  //  t       - element type being copied
  //  aligned - if true, the byte/short misalignment fix-up is skipped and
  //            ARRAYCOPY_ALIGNED is passed to the GC barrier
  //  sf      - address scale factor of one element
  //  entry   - if non-NULL, receives a no-overlap entry point (used by the
  //            matching conjoint stub, which sets up the arguments itself)
  //  name    - stub name for StubCodeMark
  //  dest_uninitialized - destination is known uninitialized; only affects
  //            the GC barrier decorators
  //
  // The generated code returns 0 in rax.
  address generate_disjoint_copy(BasicType t, bool aligned,
                                 Address::ScaleFactor sf,
                                 address* entry, const char *name,
                                 bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_64_bytes;

    // shift converts an element count into a (partial) byte count
    int shift = Address::times_ptr - sf;

    const Register from     = rsi;  // source array address
    const Register to       = rdi;  // destination array address
    const Register count    = rcx;  // elements count
    const Register to_from  = to;   // (to - from)
    const Register saved_to = rdx;  // saved destination array address

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    // 12 = saved rbp (enter), rsi, rdi; +4 skips the return address
    __ movptr(from , Address(rsp, 12+ 4));
    __ movptr(to   , Address(rsp, 12+ 8));
    __ movl(count, Address(rsp, 12+ 12));

    if (entry != NULL) {
      *entry = __ pc(); // Entry point from conjoint arraycopy stub.
      BLOCK_COMMENT("Entry:");
    }

    if (t == T_OBJECT) {
      // skip barriers and copy work entirely for zero-length oop arrays
      __ testl(count, count);
      __ jcc(Assembler::zero, L_0_count);
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, t, from, to, count);
    {
      // Unsafe copy-memory fault handling only applies to primitive copies
      // that can come from Unsafe (not oops, not known-aligned sub-word)
      bool add_entry = (t != T_OBJECT && (!aligned || t == T_INT));
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      __ subptr(to, from); // to --> to_from
      __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
      __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
      if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
        // align source address at 4 bytes address boundary
        if (t == T_BYTE) {
          // One byte misalignment happens only for byte arrays
          __ testl(from, 1);
          __ jccb(Assembler::zero, L_skip_align1);
          __ movb(rax, Address(from, 0));
          __ movb(Address(from, to_from, Address::times_1, 0), rax);
          __ increment(from);
          __ decrement(count);
        __ BIND(L_skip_align1);
        }
        // Two bytes misalignment happens only for byte and short (char) arrays
        __ testl(from, 2);
        __ jccb(Assembler::zero, L_skip_align2);
        __ movw(rax, Address(from, 0));
        __ movw(Address(from, to_from, Address::times_1, 0), rax);
        __ addptr(from, 2);
        __ subl(count, 1<<(shift-1));
      __ BIND(L_skip_align2);
      }
      if (!UseXMMForArrayCopy) {
        // bulk-copy aligned dwords with rep movs
        __ mov(rax, count);      // save 'count'
        __ shrl(count, shift); // bytes count
        __ addptr(to_from, from);// restore 'to'
        __ rep_mov();
        __ subptr(to_from, from);// restore 'to_from'
        __ mov(count, rax);      // restore 'count'
        __ jmpb(L_copy_2_bytes); // all dwords were copied
      } else {
        if (!UseUnalignedLoadStores) {
          // align to 8 bytes, we know we are 4 byte aligned to start
          __ testptr(from, 4);
          __ jccb(Assembler::zero, L_copy_64_bytes);
          __ movl(rax, Address(from, 0));
          __ movl(Address(from, to_from, Address::times_1, 0), rax);
          __ addptr(from, 4);
          __ subl(count, 1<<shift);
        }
      __ BIND(L_copy_64_bytes);
        __ mov(rax, count);
        __ shrl(rax, shift+1);  // 8 bytes chunk count
        //
        // Copy 8-byte chunks through XMM registers, 8 per iteration of the loop
        //
        xmm_copy_forward(from, to_from, rax);
      }
      // copy tailing dword
    __ BIND(L_copy_4_bytes);
      __ testl(count, 1<<shift);
      __ jccb(Assembler::zero, L_copy_2_bytes);
      __ movl(rax, Address(from, 0));
      __ movl(Address(from, to_from, Address::times_1, 0), rax);
      if (t == T_BYTE || t == T_SHORT) {
        __ addptr(from, 4);
      __ BIND(L_copy_2_bytes);
        // copy tailing word
        __ testl(count, 1<<(shift-1));
        __ jccb(Assembler::zero, L_copy_byte);
        __ movw(rax, Address(from, 0));
        __ movw(Address(from, to_from, Address::times_1, 0), rax);
        if (t == T_BYTE) {
          __ addptr(from, 2);
        __ BIND(L_copy_byte);
          // copy tailing byte
          __ testl(count, 1);
          __ jccb(Assembler::zero, L_exit);
          __ movb(rax, Address(from, 0));
          __ movb(Address(from, to_from, Address::times_1, 0), rax);
        __ BIND(L_exit);
        } else {
        __ BIND(L_copy_byte);
        }
      } else {
      __ BIND(L_copy_2_bytes);
      }
    }

    __ movl(count, Address(rsp, 12+12)); // reread 'count'
    bs->arraycopy_epilogue(_masm, decorators, t, from, to, count);

    if (t == T_OBJECT) {
    __ BIND(L_0_count);
    }
    inc_copy_counter_np(t);
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ vzeroupper();
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }
1125 
1126 
  // Generate a stub that fills an array with a given value.
  //
  // Arguments on stack:
  //    4(rsp) - destination array address
  //    8(rsp) - fill value
  //   12(rsp) - element count
  //
  //  t       - element type
  //  aligned - whether the destination is known to be suitably aligned
  //            (forwarded to MacroAssembler::generate_fill)
  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to       = rdi;  // destination array address
    const Register value    = rdx;  // fill value
    const Register count    = rsi;  // elements count

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    // 12 = saved rbp (enter), rsi, rdi; +4 skips the return address
    __ movptr(to   , Address(rsp, 12+ 4));
    __ movl(value, Address(rsp, 12+ 8));
    __ movl(count, Address(rsp, 12+ 12));

    // MacroAssembler emits the fill loop; rax and xmm0 are scratch
    __ generate_fill(t, aligned, to, value, count, rax, xmm0);

    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);
    return start;
  }
1153 
  // Generate a stub copying a possibly-overlapping array region of element
  // type 't'.  If the regions do not overlap, control jumps to
  // 'nooverlap_target' (the matching disjoint stub, which expects its
  // arguments in rsi/rdi); otherwise elements are copied from high
  // addresses down to low so a forward overlap is handled safely.
  //
  // Arguments on stack:
  //    4(rsp) - source array address
  //    8(rsp) - destination array address
  //   12(rsp) - element count
  //
  // Other parameters match generate_disjoint_copy.  The generated code
  // returns 0 in rax.
  address generate_conjoint_copy(BasicType t, bool aligned,
                                 Address::ScaleFactor sf,
                                 address nooverlap_target,
                                 address* entry, const char *name,
                                 bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_0_count, L_exit, L_skip_align1, L_skip_align2, L_copy_byte;
    Label L_copy_2_bytes, L_copy_4_bytes, L_copy_8_bytes, L_copy_8_bytes_loop;

    // shift converts an element count into a (partial) byte count
    int shift = Address::times_ptr - sf;

    const Register src   = rax;  // source array address
    const Register dst   = rdx;  // destination array address
    const Register from  = rsi;  // source array address
    const Register to    = rdi;  // destination array address
    const Register count = rcx;  // elements count
    const Register end   = rax;  // array end address

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);
    // 12 = saved rbp (enter), rsi, rdi; +4 skips the return address
    __ movptr(src  , Address(rsp, 12+ 4));   // from
    __ movptr(dst  , Address(rsp, 12+ 8));   // to
    __ movl2ptr(count, Address(rsp, 12+12)); // count

    if (entry != NULL) {
      *entry = __ pc(); // Entry point from generic arraycopy stub.
      BLOCK_COMMENT("Entry:");
    }

    // nooverlap_target expects arguments in rsi and rdi.
    __ mov(from, src);
    __ mov(to  , dst);

    // arrays overlap test: dispatch to disjoint stub if necessary.
    // No overlap when dst <= src, or when dst >= src + count*elem_size.
    RuntimeAddress nooverlap(nooverlap_target);
    __ cmpptr(dst, src);
    __ lea(end, Address(src, count, sf, 0)); // src + count * elem_size
    __ jump_cc(Assembler::belowEqual, nooverlap);
    __ cmpptr(dst, end);
    __ jump_cc(Assembler::aboveEqual, nooverlap);

    if (t == T_OBJECT) {
      // skip barriers and copy work entirely for zero-length oop arrays
      __ testl(count, count);
      __ jcc(Assembler::zero, L_0_count);
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, t, from, to, count);

    {
      // Unsafe copy-memory fault handling only applies to primitive copies
      // that can come from Unsafe (not oops, not known-aligned sub-word)
      bool add_entry = (t != T_OBJECT && (!aligned || t == T_INT));
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      // copy from high to low
      __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
      __ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
      if (t == T_BYTE || t == T_SHORT) {
        // Align the end of destination array at 4 bytes address boundary
        __ lea(end, Address(dst, count, sf, 0));
        if (t == T_BYTE) {
          // One byte misalignment happens only for byte arrays
          __ testl(end, 1);
          __ jccb(Assembler::zero, L_skip_align1);
          __ decrement(count);
          __ movb(rdx, Address(from, count, sf, 0));
          __ movb(Address(to, count, sf, 0), rdx);
        __ BIND(L_skip_align1);
        }
        // Two bytes misalignment happens only for byte and short (char) arrays
        __ testl(end, 2);
        __ jccb(Assembler::zero, L_skip_align2);
        __ subptr(count, 1<<(shift-1));
        __ movw(rdx, Address(from, count, sf, 0));
        __ movw(Address(to, count, sf, 0), rdx);
      __ BIND(L_skip_align2);
        __ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
        __ jcc(Assembler::below, L_copy_4_bytes);
      }

      if (!UseXMMForArrayCopy) {
        // backwards bulk copy: set the direction flag for rep movs and
        // point rsi/rdi at the last dword of each region
        __ std();
        __ mov(rax, count); // Save 'count'
        __ mov(rdx, to);    // Save 'to'
        __ lea(rsi, Address(from, count, sf, -4));
        __ lea(rdi, Address(to  , count, sf, -4));
        __ shrptr(count, shift); // bytes count
        __ rep_mov();
        __ cld();           // restore direction flag
        __ mov(count, rax); // restore 'count'
        __ andl(count, (1<<shift)-1);      // mask the number of rest elements
        __ movptr(from, Address(rsp, 12+4)); // reread 'from'
        __ mov(to, rdx);   // restore 'to'
        __ jmpb(L_copy_2_bytes); // all dword were copied
      } else {
        // Align to 8 bytes the end of array. It is aligned to 4 bytes already.
        __ testptr(end, 4);
        __ jccb(Assembler::zero, L_copy_8_bytes);
        __ subl(count, 1<<shift);
        __ movl(rdx, Address(from, count, sf, 0));
        __ movl(Address(to, count, sf, 0), rdx);
        __ jmpb(L_copy_8_bytes);

        __ align(OptoLoopAlignment);
        // Move 8 bytes
      __ BIND(L_copy_8_bytes_loop);
        __ movq(xmm0, Address(from, count, sf, 0));
        __ movq(Address(to, count, sf, 0), xmm0);
      __ BIND(L_copy_8_bytes);
        __ subl(count, 2<<shift);
        __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
        __ addl(count, 2<<shift);  // restore the 0..1 remaining qword-count
      }
    __ BIND(L_copy_4_bytes);
      // copy prefix qword
      __ testl(count, 1<<shift);
      __ jccb(Assembler::zero, L_copy_2_bytes);
      __ movl(rdx, Address(from, count, sf, -4));
      __ movl(Address(to, count, sf, -4), rdx);

      if (t == T_BYTE || t == T_SHORT) {
          __ subl(count, (1<<shift));
        __ BIND(L_copy_2_bytes);
          // copy prefix dword
          __ testl(count, 1<<(shift-1));
          __ jccb(Assembler::zero, L_copy_byte);
          __ movw(rdx, Address(from, count, sf, -2));
          __ movw(Address(to, count, sf, -2), rdx);
          if (t == T_BYTE) {
            __ subl(count, 1<<(shift-1));
          __ BIND(L_copy_byte);
            // copy prefix byte
            __ testl(count, 1);
            __ jccb(Assembler::zero, L_exit);
            __ movb(rdx, Address(from, 0));
            __ movb(Address(to, 0), rdx);
          __ BIND(L_exit);
          } else {
          __ BIND(L_copy_byte);
          }
      } else {
      __ BIND(L_copy_2_bytes);
      }
    }

    __ movl2ptr(count, Address(rsp, 12+12)); // reread count
    bs->arraycopy_epilogue(_masm, decorators, t, from, to, count);

    if (t == T_OBJECT) {
    __ BIND(L_0_count);
    }
    inc_copy_counter_np(t);
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }
1324 
1325 
  // Generate a stub copying a disjoint array of 64-bit elements.
  //
  // Arguments on stack (read at 8+x because only the saved rbp from enter()
  // and the return address sit above them):
  //    4(rsp) - source array address
  //    8(rsp) - destination array address
  //   12(rsp) - element count
  //
  //  entry - receives the no-overlap entry point used by the conjoint stub
  //  name  - stub name for StubCodeMark
  //
  // The generated code returns 0 in rax.
  address generate_disjoint_long_copy(address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_8_bytes, L_copy_8_bytes_loop;
    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // elements count
    const Register to_from    = rdx;  // (to - from)

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ movptr(from , Address(rsp, 8+0));       // from
    __ movptr(to   , Address(rsp, 8+4));       // to
    __ movl2ptr(count, Address(rsp, 8+8));     // count

    *entry = __ pc(); // Entry point from conjoint arraycopy stub.
    BLOCK_COMMENT("Entry:");

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, true, true);
      __ subptr(to, from); // to --> to_from
      if (UseXMMForArrayCopy) {
        xmm_copy_forward(from, to_from, count);
      } else {
        // no XMM: move each long through the x87 stack with 64-bit
        // integer load/store (fild/fistp) — presumably for single-access
        // 64-bit moves; TODO confirm atomicity rationale
        __ jmpb(L_copy_8_bytes);
        __ align(OptoLoopAlignment);
      __ BIND(L_copy_8_bytes_loop);
        __ fild_d(Address(from, 0));
        __ fistp_d(Address(from, to_from, Address::times_1));
        __ addptr(from, 8);
      __ BIND(L_copy_8_bytes);
        __ decrement(count);
        __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
      }
    }
    inc_copy_counter_np(T_LONG);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ vzeroupper();
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }
1370 
  // Generate a stub copying a possibly-overlapping array of 64-bit
  // elements.  Jumps to 'nooverlap_target' (the disjoint stub) when the
  // regions do not overlap; otherwise copies from high to low addresses.
  //
  // Arguments on stack:
  //    4(rsp) - source array address
  //    8(rsp) - destination array address
  //   12(rsp) - element count
  //
  // The generated code returns 0 in rax.
  address generate_conjoint_long_copy(address nooverlap_target,
                                      address* entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_copy_8_bytes, L_copy_8_bytes_loop;
    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // elements count
    const Register end_from   = rax;  // source array end address

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ movptr(from , Address(rsp, 8+0));       // from
    __ movptr(to   , Address(rsp, 8+4));       // to
    __ movl2ptr(count, Address(rsp, 8+8));     // count

    *entry = __ pc(); // Entry point from generic arraycopy stub.
    BLOCK_COMMENT("Entry:");

    // arrays overlap test: no overlap when to <= from or to >= from + size
    __ cmpptr(to, from);
    RuntimeAddress nooverlap(nooverlap_target);
    __ jump_cc(Assembler::belowEqual, nooverlap);
    __ lea(end_from, Address(from, count, Address::times_8, 0));
    __ cmpptr(to, end_from);
    __ movptr(from, Address(rsp, 8));  // from (reread: end_from aliased rax)
    __ jump_cc(Assembler::aboveEqual, nooverlap);

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, true, true);

      // copy backwards, indexing down from count-1 to 0
      __ jmpb(L_copy_8_bytes);

      __ align(OptoLoopAlignment);
    __ BIND(L_copy_8_bytes_loop);
      if (UseXMMForArrayCopy) {
        __ movq(xmm0, Address(from, count, Address::times_8));
        __ movq(Address(to, count, Address::times_8), xmm0);
      } else {
        // 64-bit x87 integer load/store moves each long in one access
        __ fild_d(Address(from, count, Address::times_8));
        __ fistp_d(Address(to, count, Address::times_8));
      }
    __ BIND(L_copy_8_bytes);
      __ decrement(count);
      __ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);

    }
    inc_copy_counter_np(T_LONG);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ xorptr(rax, rax); // return 0
    __ ret(0);
    return start;
  }
1426 
1427 
  // Helper for generating a dynamic type check.
  // The sub_klass must be one of {rbx, rdx, rsi}.
  // The temp is killed.
  //
  // Control transfers to *L_success when sub_klass is a subtype of the
  // super klass, to *L_failure when it is not; a NULL label means "fall
  // through" for that outcome.  The super klass and its check offset are
  // taken from memory (super_klass_addr / super_check_offset_addr) rather
  // than registers, which saves one register versus
  // check_klass_subtype_fast_path.
  void generate_type_check(Register sub_klass,
                           Address& super_check_offset_addr,
                           Address& super_klass_addr,
                           Register temp,
                           Label* L_success, Label* L_failure) {
    BLOCK_COMMENT("type_check:");

    Label L_fallthrough;
    // Jump to the caller's label if provided, else fall through locally.
#define LOCAL_JCC(assembler_con, label_ptr)                             \
    if (label_ptr != NULL)  __ jcc(assembler_con, *(label_ptr));        \
    else                    __ jcc(assembler_con, L_fallthrough) /*omit semi*/

    // The following is a strange variation of the fast path which requires
    // one less register, because needed values are on the argument stack.
    // __ check_klass_subtype_fast_path(sub_klass, *super_klass*, temp,
    //                                  L_success, L_failure, NULL);
    assert_different_registers(sub_klass, temp);

    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());

    // if the pointers are equal, we are done (e.g., String[] elements)
    __ cmpptr(sub_klass, super_klass_addr);
    LOCAL_JCC(Assembler::equal, L_success);

    // check the supertype display:
    __ movl2ptr(temp, super_check_offset_addr);
    Address super_check_addr(sub_klass, temp, Address::times_1, 0);
    __ movptr(temp, super_check_addr); // load displayed supertype
    __ cmpptr(temp, super_klass_addr); // test the super type
    LOCAL_JCC(Assembler::equal, L_success);

    // if it was a primary super, we can just fail immediately
    __ cmpl(super_check_offset_addr, sc_offset);
    LOCAL_JCC(Assembler::notEqual, L_failure);

    // The repne_scan instruction uses fixed registers, which will get spilled.
    // We happen to know this works best when super_klass is in rax.
    Register super_klass = temp;
    __ movptr(super_klass, super_klass_addr);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg,
                                     L_success, L_failure);

    __ bind(L_fallthrough);

    // Mark the fall-through spots in the disassembly for readability.
    if (L_success == NULL) { BLOCK_COMMENT("L_success:"); }
    if (L_failure == NULL) { BLOCK_COMMENT("L_failure:"); }

#undef LOCAL_JCC
  }
1480 
1481   //
1482   //  Generate checkcasting array copy stub
1483   //
1484   //  Input:
1485   //    4(rsp)   - source array address
1486   //    8(rsp)   - destination array address
1487   //   12(rsp)   - element count, can be zero
1488   //   16(rsp)   - size_t ckoff (super_check_offset)
1489   //   20(rsp)   - oop ckval (super_klass)
1490   //
1491   //  Output:
1492   //    rax, ==  0  -  success
1493   //    rax, == -1^K - failure, where K is partial transfer count
1494   //
  // Generate stub for a type-checked oop arraycopy: each source element is
  // individually subtype-checked against the destination element klass
  // before being stored.
  //
  //  Incoming stack arguments (before the three pushes below):
  //     4(rsp)  - source array address
  //     8(rsp)  - destination array address
  //    12(rsp)  - element count, can be zero
  //    16(rsp)  - super_check_offset (for the element subtype check)
  //    20(rsp)  - super_klass (destination array element klass)
  //
  //  Output:
  //    rax == 0    -  success
  //    rax == -1^K -  failure, where K is the partial transfer count
  //                   (elements already copied; see the ~count trick below)
  //
  // If 'entry' is non-NULL, *entry receives a secondary entry point (used by
  // the generic arraycopy stub) that skips the argument reloads.
  address generate_checkcast_copy(const char *name, address* entry, bool dest_uninitialized = false) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_load_element, L_store_element, L_do_card_marks, L_done;

    // register use:
    //  rax, rdx, rcx -- loop control (end_from, end_to, count)
    //  rdi, rsi      -- element access (oop, klass)
    //  rbx,           -- temp
    const Register from       = rax;    // source array address
    const Register to         = rdx;    // destination array address
    const Register length     = rcx;    // elements count
    const Register elem       = rdi;    // each oop copied
    const Register elem_klass = rsi;    // each elem._klass (sub_klass)
    const Register temp       = rbx;    // lone remaining temp

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // Preserve callee-saved registers used as scratch below.
    __ push(rsi);
    __ push(rdi);
    __ push(rbx);

    // Incoming arguments, now 16 bytes deeper: return address + 3 pushes.
    Address   from_arg(rsp, 16+ 4);     // from
    Address     to_arg(rsp, 16+ 8);     // to
    Address length_arg(rsp, 16+12);     // elements count
    Address  ckoff_arg(rsp, 16+16);     // super_check_offset
    Address  ckval_arg(rsp, 16+20);     // super_klass

    // Load up:
    __ movptr(from,     from_arg);
    __ movptr(to,         to_arg);
    __ movl2ptr(length, length_arg);

    if (entry != NULL) {
      *entry = __ pc(); // Entry point from generic arraycopy stub.
      BLOCK_COMMENT("Entry:");
    }

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    // Loop-invariant addresses.  They are exclusive end pointers.
    Address end_from_addr(from, length, Address::times_ptr, 0);
    Address   end_to_addr(to,   length, Address::times_ptr, 0);

    Register end_from = from;           // re-use
    Register end_to   = to;             // re-use
    Register count    = length;         // re-use

    // Loop-variant addresses.  They assume post-incremented count < 0.
    Address from_element_addr(end_from, count, Address::times_ptr, 0);
    Address   to_element_addr(end_to,   count, Address::times_ptr, 0);
    Address elem_klass_addr(elem, oopDesc::klass_offset_in_bytes());

    // GC barrier decorators: heap array copy with per-element checkcast.
    // IS_DEST_UNINITIALIZED presumably lets the barrier skip work for a
    // freshly allocated destination — see BarrierSetAssembler for details.
    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    BasicType type = T_OBJECT;
    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, type, from, to, count);

    // Copy from low to high addresses, indexed from the end of each array.
    __ lea(end_from, end_from_addr);
    __ lea(end_to,   end_to_addr);
    assert(length == count, "");        // else fix next line:
    __ negptr(count);                   // negate and test the length
    __ jccb(Assembler::notZero, L_load_element);

    // Empty array:  Nothing to do.
    __ xorptr(rax, rax);                  // return 0 on (trivial) success
    __ jmp(L_done);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (count = -count; count != 0; count++)
    // Base pointers src, dst are biased by 8*count,to last element.
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ movptr(to_element_addr, elem);     // store the oop
    __ increment(count);                // increment the count toward zero
    __ jccb(Assembler::zero, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ movptr(elem, from_element_addr);   // load the oop
    __ testptr(elem, elem);
    __ jccb(Assembler::zero, L_store_element);  // null needs no type check

    // (Could do a trick here:  Remember last successful non-null
    // element stored and make a quick oop equality check on it.)

    __ movptr(elem_klass, elem_klass_addr); // query the object klass
    // Branches back to L_store_element on success; falls through on failure.
    generate_type_check(elem_klass, ckoff_arg, ckval_arg, temp,
                        &L_store_element, NULL);
    // (On fall-through, we have failed the element type check.)
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register "count" = -1 * number of *remaining* oops, length_arg = *total* oops.
    // Emit GC store barriers for the oops we have copied (length_arg + count),
    // and report their number to the caller.
    assert_different_registers(to, count, rax);
    Label L_post_barrier;
    __ addl(count, length_arg);         // transfers = (length - remaining)
    __ movl2ptr(rax, count);            // save the value
    __ notptr(rax);                     // report (-1^K) to caller (does not affect flags)
    __ jccb(Assembler::notZero, L_post_barrier);
    __ jmp(L_done); // K == 0, nothing was copied, skip post barrier

    // Come here on success only.
    __ BIND(L_do_card_marks);
    __ xorptr(rax, rax);                // return 0 on success
    __ movl2ptr(count, length_arg);

    __ BIND(L_post_barrier);
    __ movptr(to, to_arg);              // reload
    bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);

    // Common exit point (success or failure).
    __ BIND(L_done);
    __ pop(rbx);
    __ pop(rdi);
    __ pop(rsi);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
1634 
1635   //
1636   //  Generate 'unsafe' array copy stub
1637   //  Though just as safe as the other stubs, it takes an unscaled
1638   //  size_t argument instead of an element count.
1639   //
1640   //  Input:
1641   //    4(rsp)   - source array address
1642   //    8(rsp)   - destination array address
1643   //   12(rsp)   - byte count, can be zero
1644   //
1645   //  Output:
1646   //    rax, ==  0  -  success
1647   //    rax, == -1  -  need to call System.arraycopy
1648   //
1649   // Examines the alignment of the operands and dispatches
1650   // to a long, int, short, or byte copy loop.
1651   //
  // Generate the 'unsafe' array copy stub (byte-count interface).  Computes
  // the common alignment of from/to/size and tail-calls the widest
  // element-copy stub (long/int/short/byte) that the alignment permits.
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {

    Label L_long_aligned, L_int_aligned, L_short_aligned;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    const Register from       = rax;  // source array address
    const Register to         = rdx;  // destination array address
    const Register count      = rcx;  // elements count

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    // rsi/rdi stay pushed when tail-jumping to the byte/short/int entries;
    // only the long-copy path pops them here (see comment below).
    __ push(rsi);
    __ push(rdi);
    // Incoming arguments, 12 bytes deep: return address + 2 pushes.
    Address  from_arg(rsp, 12+ 4);      // from
    Address    to_arg(rsp, 12+ 8);      // to
    Address count_arg(rsp, 12+12);      // byte count

    // Load up:
    __ movptr(from ,  from_arg);
    __ movptr(to   ,    to_arg);
    __ movl2ptr(count, count_arg);

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    // OR the three operands together: the low bits of the result reflect
    // the coarsest alignment common to all of them.
    const Register bits = rsi;
    __ mov(bits, from);
    __ orptr(bits, to);
    __ orptr(bits, count);

    __ testl(bits, BytesPerLong-1);
    __ jccb(Assembler::zero, L_long_aligned);

    __ testl(bits, BytesPerInt-1);
    __ jccb(Assembler::zero, L_int_aligned);

    __ testl(bits, BytesPerShort-1);
    __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));

    __ BIND(L_short_aligned);
    __ shrptr(count, LogBytesPerShort); // size => short_count
    __ movl(count_arg, count);          // update 'count'
    __ jump(RuntimeAddress(short_copy_entry));

    __ BIND(L_int_aligned);
    __ shrptr(count, LogBytesPerInt); // size => int_count
    __ movl(count_arg, count);          // update 'count'
    __ jump(RuntimeAddress(int_copy_entry));

    __ BIND(L_long_aligned);
    __ shrptr(count, LogBytesPerLong); // size => qword_count
    __ movl(count_arg, count);          // update 'count'
    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
    __ pop(rsi);
    __ jump(RuntimeAddress(long_copy_entry));

    return start;
  }
1716 
1717 
1718   // Perform range checks on the proposed arraycopy.
1719   // Smashes src_pos and dst_pos.  (Uses them up for temps.)
1720   void arraycopy_range_checks(Register src,
1721                               Register src_pos,
1722                               Register dst,
1723                               Register dst_pos,
1724                               Address& length,
1725                               Label& L_failed) {
1726     BLOCK_COMMENT("arraycopy_range_checks:");
1727     const Register src_end = src_pos;   // source array end position
1728     const Register dst_end = dst_pos;   // destination array end position
1729     __ addl(src_end, length); // src_pos + length
1730     __ addl(dst_end, length); // dst_pos + length
1731 
1732     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
1733     __ cmpl(src_end, Address(src, arrayOopDesc::length_offset_in_bytes()));
1734     __ jcc(Assembler::above, L_failed);
1735 
1736     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
1737     __ cmpl(dst_end, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1738     __ jcc(Assembler::above, L_failed);
1739 
1740     BLOCK_COMMENT("arraycopy_range_checks done");
1741   }
1742 
1743 
1744   //
1745   //  Generate generic array copy stubs
1746   //
1747   //  Input:
1748   //     4(rsp)    -  src oop
1749   //     8(rsp)    -  src_pos
1750   //    12(rsp)    -  dst oop
1751   //    16(rsp)    -  dst_pos
1752   //    20(rsp)    -  element count
1753   //
1754   //  Output:
1755   //    rax, ==  0  -  success
1756   //    rax, == -1^K - failure, where K is partial transfer count
1757   //
  // Generate the generic arraycopy stub: validates the (src, src_pos, dst,
  // dst_pos, length) tuple, then dispatches to the matching specialized
  // stub (byte/short/int/long, plain oop, or checkcast oop copy).
  // Returns -1 in rax (via L_failed) when the arguments cannot be handled
  // here, so the caller falls back to System.arraycopy.
  address generate_generic_copy(const char *name,
                                address entry_jbyte_arraycopy,
                                address entry_jshort_arraycopy,
                                address entry_jint_arraycopy,
                                address entry_oop_arraycopy,
                                address entry_jlong_arraycopy,
                                address entry_checkcast_arraycopy) {
    Label L_failed, L_failed_0, L_objArray;

    // Pad so that the jmp(L_failed) emitted just below ends exactly on a
    // CodeEntryAlignment boundary; 'start' then needs no further padding.
    { int modulus = CodeEntryAlignment;
      int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
      int advance = target - (__ offset() % modulus);
      if (advance < 0)  advance += modulus;
      if (advance > 0)  __ nop(advance);
    }
    StubCodeMark mark(this, "StubRoutines", name);

    // Short-hop target to L_failed.  Makes for denser prologue code.
    // (All the jccb's below are 8-bit jumps; L_failed itself is too far.)
    __ BIND(L_failed_0);
    __ jmp(L_failed);
    assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(rsi);
    __ push(rdi);

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    // Input values (12 = return address + 2 pushes)
    Address SRC     (rsp, 12+ 4);
    Address SRC_POS (rsp, 12+ 8);
    Address DST     (rsp, 12+12);
    Address DST_POS (rsp, 12+16);
    Address LENGTH  (rsp, 12+20);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    const Register src     = rax;       // source array oop
    const Register src_pos = rsi;
    const Register dst     = rdx;       // destination array oop
    const Register dst_pos = rdi;
    const Register length  = rcx;       // transfer count

    //  if (src == NULL) return -1;
    __ movptr(src, SRC);      // src oop
    __ testptr(src, src);
    __ jccb(Assembler::zero, L_failed_0);

    //  if (src_pos < 0) return -1;
    __ movl2ptr(src_pos, SRC_POS);  // src_pos
    __ testl(src_pos, src_pos);
    __ jccb(Assembler::negative, L_failed_0);

    //  if (dst == NULL) return -1;
    __ movptr(dst, DST);      // dst oop
    __ testptr(dst, dst);
    __ jccb(Assembler::zero, L_failed_0);

    //  if (dst_pos < 0) return -1;
    __ movl2ptr(dst_pos, DST_POS);  // dst_pos
    __ testl(dst_pos, dst_pos);
    __ jccb(Assembler::negative, L_failed_0);

    //  if (length < 0) return -1;
    __ movl2ptr(length, LENGTH);   // length
    __ testl(length, length);
    __ jccb(Assembler::negative, L_failed_0);

    //  if (src->klass() == NULL) return -1;
    Address src_klass_addr(src, oopDesc::klass_offset_in_bytes());
    Address dst_klass_addr(dst, oopDesc::klass_offset_in_bytes());
    const Register rcx_src_klass = rcx;    // array klass
    __ movptr(rcx_src_klass, Address(src, oopDesc::klass_offset_in_bytes()));

#ifdef ASSERT
    //  assert(src->klass() != NULL);
    BLOCK_COMMENT("assert klasses not null");
    { Label L1, L2;
      __ testptr(rcx_src_klass, rcx_src_klass);
      __ jccb(Assembler::notZero, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ cmpptr(dst_klass_addr, (int32_t)NULL_WORD);
      __ jccb(Assembler::equal, L1);      // this would be broken also
      BLOCK_COMMENT("assert done");
    }
#endif //ASSERT

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    int lh_offset = in_bytes(Klass::layout_helper_offset());
    Address src_klass_lh_addr(rcx_src_klass, lh_offset);

    // Handle objArrays completely differently...
    jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ cmpl(src_klass_lh_addr, objArray_lh);
    __ jcc(Assembler::equal, L_objArray);

    //  if (src->klass() != dst->klass()) return -1;
    __ cmpptr(rcx_src_klass, dst_klass_addr);
    __ jccb(Assembler::notEqual, L_failed_0);

    const Register rcx_lh = rcx;  // layout helper
    assert(rcx_lh == rcx_src_klass, "known alias");
    __ movl(rcx_lh, src_klass_lh_addr);

    //  if (!src->is_Array()) return -1;
    // Array layout helpers are negative (tag in the sign bits).
    __ cmpl(rcx_lh, Klass::_lh_neutral_value);
    __ jcc(Assembler::greaterEqual, L_failed_0); // signed cmp

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    { Label L;
      __ cmpl(rcx_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
      __ jcc(Assembler::greaterEqual, L); // signed cmp
      __ stop("must be a primitive array");
      __ bind(L);
    }
#endif

    assert_different_registers(src, src_pos, dst, dst_pos, rcx_lh);
    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //
    const Register rsi_offset = rsi; // array offset
    const Register src_array  = src; // src array offset
    const Register dst_array  = dst; // dst array offset
    const Register rdi_elsize = rdi; // log2 element size

    // Extract the header size from the layout helper; both arrays share
    // the same klass here, so one offset serves for src and dst.
    __ mov(rsi_offset, rcx_lh);
    __ shrptr(rsi_offset, Klass::_lh_header_size_shift);
    __ andptr(rsi_offset, Klass::_lh_header_size_mask);   // array_offset
    __ addptr(src_array, rsi_offset);  // src array offset
    __ addptr(dst_array, rsi_offset);  // dst array offset
    __ andptr(rcx_lh, Klass::_lh_log2_element_size_mask); // log2 elsize

    // next registers should be set before the jump to corresponding stub
    const Register from       = src; // source array address
    const Register to         = dst; // destination array address
    const Register count      = rcx; // elements count
    // some of them should be duplicated on stack
#define FROM   Address(rsp, 12+ 4)
#define TO     Address(rsp, 12+ 8)   // Not used now
#define COUNT  Address(rsp, 12+12)   // Only for oop arraycopy

    BLOCK_COMMENT("scale indexes to element size");
    // shlptr with no count operand shifts by cl (= rcx_lh = log2 elsize).
    __ movl2ptr(rsi, SRC_POS);  // src_pos
    __ shlptr(rsi);             // src_pos << rcx (log2 elsize)
    assert(src_array == from, "");
    __ addptr(from, rsi);       // from = src_array + SRC_POS << log2 elsize
    __ movl2ptr(rdi, DST_POS);  // dst_pos
    __ shlptr(rdi);             // dst_pos << rcx (log2 elsize)
    assert(dst_array == to, "");
    __ addptr(to,  rdi);        // to   = dst_array + DST_POS << log2 elsize
    __ movptr(FROM, from);      // src_addr
    __ mov(rdi_elsize, rcx_lh); // log2 elsize
    __ movl2ptr(count, LENGTH); // elements count

    BLOCK_COMMENT("choose copy loop based on element size");
    __ cmpl(rdi_elsize, 0);

    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jbyte_arraycopy));
    __ cmpl(rdi_elsize, LogBytesPerShort);
    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jshort_arraycopy));
    __ cmpl(rdi_elsize, LogBytesPerInt);
    __ jump_cc(Assembler::equal, RuntimeAddress(entry_jint_arraycopy));
#ifdef ASSERT
    __ cmpl(rdi_elsize, LogBytesPerLong);
    __ jccb(Assembler::notEqual, L_failed);
#endif
    __ pop(rdi); // Do pops here since jlong_arraycopy stub does not do it.
    __ pop(rsi);
    __ jump(RuntimeAddress(entry_jlong_arraycopy));

  __ BIND(L_failed);
    __ xorptr(rax, rax);
    __ notptr(rax); // return -1
    __ pop(rdi);
    __ pop(rsi);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    // ObjArrayKlass
  __ BIND(L_objArray);
    // live at this point:  rcx_src_klass, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    //  test array classes for subtyping
    __ cmpptr(rcx_src_klass, dst_klass_addr); // usual case is exact equality
    __ jccb(Assembler::notEqual, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    assert_different_registers(src, src_pos, dst, dst_pos, rcx_src_klass);
    arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);

  __ BIND(L_plain_copy);
    __ movl2ptr(count, LENGTH); // elements count
    __ movl2ptr(src_pos, SRC_POS);  // reload src_pos
    __ lea(from, Address(src, src_pos, Address::times_ptr,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
    __ movl2ptr(dst_pos, DST_POS);  // reload dst_pos
    __ lea(to,   Address(dst, dst_pos, Address::times_ptr,
                 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
    __ movptr(FROM,  from);   // src_addr
    __ movptr(TO,    to);     // dst_addr
    __ movl(COUNT, count);  // count
    __ jump(RuntimeAddress(entry_oop_arraycopy));

  __ BIND(L_checkcast_copy);
    // live at this point:  rcx_src_klass, dst[_pos], src[_pos]
    {
      // Handy offsets:
      int  ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      int sco_offset = in_bytes(Klass::super_check_offset_offset());

      Register rsi_dst_klass = rsi;
      Register rdi_temp      = rdi;
      assert(rsi_dst_klass == src_pos, "expected alias w/ src_pos");
      assert(rdi_temp      == dst_pos, "expected alias w/ dst_pos");
      Address dst_klass_lh_addr(rsi_dst_klass, lh_offset);

      // Before looking at dst.length, make sure dst is also an objArray.
      __ movptr(rsi_dst_klass, dst_klass_addr);
      __ cmpl(dst_klass_lh_addr, objArray_lh);
      __ jccb(Assembler::notEqual, L_failed);

      // It is safe to examine both src.length and dst.length.
      __ movl2ptr(src_pos, SRC_POS);        // reload rsi
      arraycopy_range_checks(src, src_pos, dst, dst_pos, LENGTH, L_failed);
      // (Now src_pos and dst_pos are killed, but not src and dst.)

      // We'll need this temp (don't forget to pop it after the type check).
      __ push(rbx);
      Register rbx_src_klass = rbx;

      __ mov(rbx_src_klass, rcx_src_klass); // spill away from rcx
      __ movptr(rsi_dst_klass, dst_klass_addr);
      Address super_check_offset_addr(rsi_dst_klass, sco_offset);
      Label L_fail_array_check;
      // Quick check: if src's element type is a subtype of dst's,
      // a plain (unchecked) oop copy suffices.
      generate_type_check(rbx_src_klass,
                          super_check_offset_addr, dst_klass_addr,
                          rdi_temp, NULL, &L_fail_array_check);
      // (On fall-through, we have passed the array type check.)
      __ pop(rbx);
      __ jmp(L_plain_copy);

      __ BIND(L_fail_array_check);
      // Reshuffle arguments so we can call checkcast_arraycopy:

      // match initial saves for checkcast_arraycopy
      // push(rsi);    // already done; see above
      // push(rdi);    // already done; see above
      // push(rbx);    // already done; see above

      // Marshal outgoing arguments now, freeing registers.
      Address   from_arg(rsp, 16+ 4);   // from
      Address     to_arg(rsp, 16+ 8);   // to
      Address length_arg(rsp, 16+12);   // elements count
      Address  ckoff_arg(rsp, 16+16);   // super_check_offset
      Address  ckval_arg(rsp, 16+20);   // super_klass

      // Incoming args, re-addressed past the extra push(rbx).
      Address SRC_POS_arg(rsp, 16+ 8);
      Address DST_POS_arg(rsp, 16+16);
      Address  LENGTH_arg(rsp, 16+20);
      // push rbx, changed the incoming offsets (why not just use rbp,??)
      // assert(SRC_POS_arg.disp() == SRC_POS.disp() + 4, "");

      __ movptr(rbx, Address(rsi_dst_klass, ek_offset));
      // Read the incoming slots before they are overwritten below: the
      // outgoing ckoff/ckval/to slots alias DST_POS/LENGTH/SRC_POS.
      __ movl2ptr(length, LENGTH_arg);    // reload elements count
      __ movl2ptr(src_pos, SRC_POS_arg);  // reload src_pos
      __ movl2ptr(dst_pos, DST_POS_arg);  // reload dst_pos

      __ movptr(ckval_arg, rbx);          // destination element type
      __ movl(rbx, Address(rbx, sco_offset));
      __ movl(ckoff_arg, rbx);          // corresponding class check offset

      __ movl(length_arg, length);      // outgoing length argument

      __ lea(from, Address(src, src_pos, Address::times_ptr,
                            arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
      __ movptr(from_arg, from);

      __ lea(to, Address(dst, dst_pos, Address::times_ptr,
                          arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
      __ movptr(to_arg, to);
      __ jump(RuntimeAddress(entry_checkcast_arraycopy));
    }

    return start;
  }
2075 
  // Generate all arraycopy stubs and publish them in StubRoutines.
  // Ordering matters: each disjoint generator writes its no-overlap entry
  // point into the local 'entry', which the immediately following conjoint
  // generator consumes; the entry_* locals captured along the way feed the
  // unsafe and generic dispatch stubs generated last.
  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;

    StubRoutines::_arrayof_jbyte_disjoint_arraycopy =
        generate_disjoint_copy(T_BYTE,  true, Address::times_1, &entry,
                               "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy =
        generate_conjoint_copy(T_BYTE,  true, Address::times_1,  entry,
                               NULL, "arrayof_jbyte_arraycopy");
    StubRoutines::_jbyte_disjoint_arraycopy =
        generate_disjoint_copy(T_BYTE, false, Address::times_1, &entry,
                               "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy =
        generate_conjoint_copy(T_BYTE, false, Address::times_1,  entry,
                               &entry_jbyte_arraycopy, "jbyte_arraycopy");

    StubRoutines::_arrayof_jshort_disjoint_arraycopy =
        generate_disjoint_copy(T_SHORT,  true, Address::times_2, &entry,
                               "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy =
        generate_conjoint_copy(T_SHORT,  true, Address::times_2,  entry,
                               NULL, "arrayof_jshort_arraycopy");
    StubRoutines::_jshort_disjoint_arraycopy =
        generate_disjoint_copy(T_SHORT, false, Address::times_2, &entry,
                               "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy =
        generate_conjoint_copy(T_SHORT, false, Address::times_2,  entry,
                               &entry_jshort_arraycopy, "jshort_arraycopy");

    // Next arrays are always aligned on 4 bytes at least.
    StubRoutines::_jint_disjoint_arraycopy =
        generate_disjoint_copy(T_INT, true, Address::times_4, &entry,
                               "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy =
        generate_conjoint_copy(T_INT, true, Address::times_4,  entry,
                               &entry_jint_arraycopy, "jint_arraycopy");

    StubRoutines::_oop_disjoint_arraycopy =
        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
                               "oop_disjoint_arraycopy");
    StubRoutines::_oop_arraycopy =
        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
                               &entry_oop_arraycopy, "oop_arraycopy");

    StubRoutines::_oop_disjoint_arraycopy_uninit =
        generate_disjoint_copy(T_OBJECT, true, Address::times_ptr, &entry,
                               "oop_disjoint_arraycopy_uninit",
                               /*dest_uninitialized*/true);
    StubRoutines::_oop_arraycopy_uninit =
        generate_conjoint_copy(T_OBJECT, true, Address::times_ptr,  entry,
                               NULL, "oop_arraycopy_uninit",
                               /*dest_uninitialized*/true);

    StubRoutines::_jlong_disjoint_arraycopy =
        generate_disjoint_long_copy(&entry, "jlong_disjoint_arraycopy");
    StubRoutines::_jlong_arraycopy =
        generate_conjoint_long_copy(entry, &entry_jlong_arraycopy,
                                    "jlong_arraycopy");

    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");

    // int/oop/long copies are already alignment-agnostic on x86_32, so the
    // arrayof_* variants share the same stubs.
    StubRoutines::_arrayof_jint_disjoint_arraycopy       = StubRoutines::_jint_disjoint_arraycopy;
    StubRoutines::_arrayof_oop_disjoint_arraycopy        = StubRoutines::_oop_disjoint_arraycopy;
    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
    StubRoutines::_arrayof_jlong_disjoint_arraycopy      = StubRoutines::_jlong_disjoint_arraycopy;

    StubRoutines::_arrayof_jint_arraycopy       = StubRoutines::_jint_arraycopy;
    StubRoutines::_arrayof_oop_arraycopy        = StubRoutines::_oop_arraycopy;
    StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
    StubRoutines::_arrayof_jlong_arraycopy      = StubRoutines::_jlong_arraycopy;

    StubRoutines::_checkcast_arraycopy =
        generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit =
        generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, /*dest_uninitialized*/true);

    StubRoutines::_unsafe_arraycopy =
        generate_unsafe_copy("unsafe_arraycopy",
                               entry_jbyte_arraycopy,
                               entry_jshort_arraycopy,
                               entry_jint_arraycopy,
                               entry_jlong_arraycopy);

    StubRoutines::_generic_arraycopy =
        generate_generic_copy("generic_arraycopy",
                               entry_jbyte_arraycopy,
                               entry_jshort_arraycopy,
                               entry_jint_arraycopy,
                               entry_oop_arraycopy,
                               entry_jlong_arraycopy,
                               entry_checkcast_arraycopy);
  }
2179 
  // AES intrinsic stubs
  // AES always operates on 128-bit (16-byte) blocks, regardless of key size.
  enum {AESBlockSize = 16};
2182 
  // Emit the 16-byte pshufb mask used to byte-reverse AES key words
  // (see load_key).  The data is emitted inline in the stub area; the
  // byte order of these four dwords is significant and must not change.
  address generate_key_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
    address start = __ pc();
    __ emit_data(0x00010203, relocInfo::none, 0 );
    __ emit_data(0x04050607, relocInfo::none, 0 );
    __ emit_data(0x08090a0b, relocInfo::none, 0 );
    __ emit_data(0x0c0d0e0f, relocInfo::none, 0 );
    return start;
  }
2193 
  // Emit the 16-byte pshufb mask used to reverse the byte order of the
  // 128-bit CTR-mode counter.  Note the dwords are emitted in the reverse
  // order of generate_key_shuffle_mask above; this ordering is significant.
  address generate_counter_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
    address start = __ pc();
    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
    __ emit_data(0x08090a0b, relocInfo::none, 0);
    __ emit_data(0x04050607, relocInfo::none, 0);
    __ emit_data(0x00010203, relocInfo::none, 0);
    return start;
  }
2204 
2205   // Utility routine for loading a 128-bit key word in little endian format
2206   // can optionally specify that the shuffle mask is already in an xmmregister
2207   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2208     __ movdqu(xmmdst, Address(key, offset));
2209     if (xmm_shuf_mask != NULL) {
2210       __ pshufb(xmmdst, xmm_shuf_mask);
2211     } else {
2212       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2213     }
2214   }
2215 
2216   // aesenc using specified key+offset
2217   // can optionally specify that the shuffle mask is already in an xmmregister
  // aesenc using specified key+offset
  // can optionally specify that the shuffle mask is already in an xmmregister
  // Emits one AES encryption round on xmmdst; xmmtmp is clobbered with the
  // loaded round key.
  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesenc(xmmdst, xmmtmp);
  }
2222 
2223   // aesdec using specified key+offset
2224   // can optionally specify that the shuffle mask is already in an xmmregister
  // aesdec using specified key+offset
  // can optionally specify that the shuffle mask is already in an xmmregister
  // Emits one AES decryption round on xmmdst; xmmtmp is clobbered with the
  // loaded round key.
  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesdec(xmmdst, xmmtmp);
  }
2229 
2230   // Utility routine for increase 128bit counter (iv in CTR mode)
2231   //  XMM_128bit,  D3, D2, D1, D0
  // Utility routine for increase 128bit counter (iv in CTR mode)
  //  XMM_128bit,  D3, D2, D1, D0
  // Adds inc_delta to the low dword (D0) and ripple-carries into D1..D3.
  // 'reg' is a general-purpose scratch register.  The SSE insert/extract
  // instructions do not modify EFLAGS, so each jcc below still observes
  // the carry flag produced by the preceding addl.  Binds next_block at
  // the end, so all paths fall through to the caller's next instruction.
  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
    __ pextrd(reg, xmmdst, 0x0);
    __ addl(reg, inc_delta);
    __ pinsrd(xmmdst, reg, 0x0);
    __ jcc(Assembler::carryClear, next_block); // jump if no carry

    __ pextrd(reg, xmmdst, 0x01); // Carry-> D1
    __ addl(reg, 0x01);
    __ pinsrd(xmmdst, reg, 0x01);
    __ jcc(Assembler::carryClear, next_block); // jump if no carry

    __ pextrd(reg, xmmdst, 0x02); // Carry-> D2
    __ addl(reg, 0x01);
    __ pinsrd(xmmdst, reg, 0x02);
    __ jcc(Assembler::carryClear, next_block); // jump if no carry

    __ pextrd(reg, xmmdst, 0x03); // Carry -> D3
    __ addl(reg, 0x01);
    __ pinsrd(xmmdst, reg, 0x03);

    __ BIND(next_block);          // next instruction
  }
2254 
2255 
  // Single-block AES encryption stub (AES-NI).
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  // Encrypts exactly one 16-byte block.  The number of rounds (10/12/14 for
  // AES-128/192/256) is derived from the key array length (44/52/60 ints).
  // Returns 0 in rax.
  address generate_aescrypt_encryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();

    // NOTE: 'from' and 'to' deliberately share rdx - the source is fully
    // consumed (loaded into xmm_result) before 'to' is loaded from the stack.
    const Register from        = rdx;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register keylen      = rax;
    // 32-bit calling convention: incoming args on the stack above the saved
    // rbp (pushed by enter()) and the return address, hence the 8-byte base.
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    const XMMRegister xmm_temp1  = xmm2;
    const XMMRegister xmm_temp2  = xmm3;
    const XMMRegister xmm_temp3  = xmm4;
    const XMMRegister xmm_temp4  = xmm5;

    __ enter();   // required for proper stackwalking of RuntimeStub frame

    __ movptr(from, from_param);
    __ movptr(key, key_param);

    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
    __ movptr(to, to_param);                  // now safe to overwrite rdx

    // For encryption, the java expanded key ordering is just what we need

    // round 0: whiten the input with the first key
    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
    __ pxor(xmm_result, xmm_temp1);

    // rounds 1-4 (keys are preloaded in batches of four to overlap the
    // loads with the aesenc dependency chain)
    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    // rounds 5-8
    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);

    // AES-128 (keylen 44): keys 0x90/0xa0 are the last two rounds
    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);

    // AES-192 (keylen 52): keys 0xb0/0xc0 are the last two rounds
    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);

    // AES-256 (keylen 60): keys 0xd0/0xe0 are the last two rounds
    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesenc(xmm_result, xmm_temp1);
    __ aesenclast(xmm_result, xmm_temp2);
    __ movdqu(Address(to, 0), xmm_result);        // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
2353 
2354 
  // Single-block AES decryption stub (AES-NI).
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  // Decrypts exactly one 16-byte block.  The number of rounds (10/12/14 for
  // AES-128/192/256) is derived from the key array length (44/52/60 ints).
  // Returns 0 in rax.
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
    address start = __ pc();

    // NOTE: 'from' and 'to' deliberately share rdx - the source is fully
    // consumed (loaded into xmm_result) before 'to' is loaded from the stack.
    const Register from        = rdx;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register keylen      = rax;
    // 32-bit calling convention: incoming args on the stack above the saved
    // rbp (pushed by enter()) and the return address, hence the 8-byte base.
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    const XMMRegister xmm_temp1  = xmm2;
    const XMMRegister xmm_temp2  = xmm3;
    const XMMRegister xmm_temp3  = xmm4;
    const XMMRegister xmm_temp4  = xmm5;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ movptr(from, from_param);
    __ movptr(key, key_param);

    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));
    __ movptr(to, to_param);                  // now safe to overwrite rdx

    // for decryption java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    // we don't know if the key is aligned, hence not using load-execute form
    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);

    // round 0 whitening (pxor) plus dec rounds with keys 0x20-0x40
    __ pxor  (xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    __ aesdec(xmm_result, xmm_temp3);
    __ aesdec(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);
    __ aesdec(xmm_result, xmm_temp3);
    __ aesdec(xmm_result, xmm_temp4);

    // xmm_temp3 holds key 0x00 for the final aesdeclast on all key sizes
    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);

    // AES-128 (keylen 44): keys 0x90/0xa0 are the last two dec rounds
    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);

    // AES-192 (keylen 52): keys 0xb0/0xc0 are the last two dec rounds
    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);

    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    // AES-256 (keylen 60): keys 0xd0/0xe0 are the last two dec rounds
    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    __ aesdec(xmm_result, xmm_temp1);
    __ aesdec(xmm_result, xmm_temp2);

    // for decryption the aesdeclast operation is always on key+0x00
    __ aesdeclast(xmm_result, xmm_temp3);
    __ movdqu(Address(to, 0), xmm_result);  // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
2454 
  // Save (saving == true) or restore (saving == false) the save-on-entry
  // registers rbx, rsi and rdi in/from fixed rbp-relative slots.  The stub
  // prologue calls this with true right after enter(); the epilogue calls it
  // with false before leave().  Note the frame is 4 words although only 3
  // registers are stored - presumably the extra word keeps the stack
  // adjustment a convenient size; TODO confirm.  The restore path does not
  // pop the frame itself; leave() restores rsp from rbp.
  void handleSOERegisters(bool saving) {
    const int saveFrameSizeInBytes = 4 * wordSize;
    const Address saved_rbx     (rbp, -3 * wordSize);
    const Address saved_rsi     (rbp, -2 * wordSize);
    const Address saved_rdi     (rbp, -1 * wordSize);

    if (saving) {
      __ subptr(rsp, saveFrameSizeInBytes);
      __ movptr(saved_rsi, rsi);
      __ movptr(saved_rdi, rdi);
      __ movptr(saved_rbx, rbx);
    } else {
      // restoring
      __ movptr(rsi, saved_rsi);
      __ movptr(rdi, saved_rdi);
      __ movptr(rbx, saved_rbx);
    }
  }
2473 
  // CBC AES encryption stub (AES-NI).  Encryption is inherently serial in
  // CBC mode (each block chains off the previous ciphertext), so this loops
  // one 16-byte block at a time with three specializations by key size.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   rax       - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
    const Register from        = rsi;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register rvec        = rdi;      // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
    const Register pos         = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    // first 6 keys preloaded into xmm2-xmm7
    const int XMM_REG_NUM_KEY_FIRST = 2;
    const int XMM_REG_NUM_KEY_LAST  = 7;
    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    handleSOERegisters(true /*saving*/);   // rsi, rdi, rbx are save-on-entry

    // load registers from incoming parameters
    // (32-bit convention: stack args above saved rbp and return address)
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);
    const Address  rvec_param (rbp, 8+12);
    const Address  len_param  (rbp, 8+16);
    __ movptr(from , from_param);
    __ movptr(to   , to_param);
    __ movptr(key  , key_param);
    __ movptr(rvec , rvec_param);
    __ movptr(len_reg , len_param);

    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 2 thru 7 with keys 0-5
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }

    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);

    // 128 bit code follows here
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    // rounds 1-5 use the preloaded keys in xmm3-xmm7 ...
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    // ... remaining rounds reload each key from memory (xmm_temp as scratch)
    for (int key_offset = 0x60; key_offset <= 0x90; key_offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0xa0);                                  // final round key for AES-128
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_128);

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object

    handleSOERegisters(false /*restoring*/);
    __ movptr(rax, len_param); // return length
    __ leave();                                  // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    for (int key_offset = 0x60; key_offset <= 0xb0; key_offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0xc0);                                  // final round key for AES-192
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be changed to use more xmm registers)
    __ movl(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);                                // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);                                // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    for (int key_offset = 0x60; key_offset <= 0xd0; key_offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, key_offset);
    }
    load_key(xmm_temp, key, 0xe0);                                  // final round key for AES-256
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);   // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_256);
    __ jmp(L_exit);

    return start;
  }
2626 
2627 
  // CBC AES Decryption.
  // Unlike CBC encryption, decryption can be parallelized (each plaintext
  // block depends only on two ciphertext blocks).  In this 32-bit stub the
  // limited register set caps the parallelism at 4 blocks at a time, with a
  // single-block tail loop for the remainder.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   rax       - input length
  //

  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();

    const Register from        = rsi;      // source array address
    const Register to          = rdx;      // destination array address
    const Register key         = rcx;      // key array address
    const Register rvec        = rdi;      // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = rbx;      // src len (must be multiple of blocksize 16)
    const Register pos         = rax;

    const int PARALLEL_FACTOR = 4;
    const int ROUNDS[3] = { 10, 12, 14 }; //aes rounds for key128, key192, key256

    Label L_exit;
    Label L_singleBlock_loopTop[3]; //128, 192, 256
    Label L_multiBlock_loopTop[3]; //128, 192, 256

    const XMMRegister xmm_prev_block_cipher = xmm0; // holds cipher of previous block
    const XMMRegister xmm_key_shuf_mask = xmm1;

    const XMMRegister xmm_key_tmp0 = xmm2;
    const XMMRegister xmm_key_tmp1 = xmm3;

    // registers holding the four results in the parallelized loop
    const XMMRegister xmm_result0 = xmm4;
    const XMMRegister xmm_result1 = xmm5;
    const XMMRegister xmm_result2 = xmm6;
    const XMMRegister xmm_result3 = xmm7;

    __ enter(); // required for proper stackwalking of RuntimeStub frame
    handleSOERegisters(true /*saving*/);   // rsi, rdi, rbx are save-on-entry

    // load registers from incoming parameters
    // (32-bit convention: stack args above saved rbp and return address)
    const Address  from_param(rbp, 8+0);
    const Address  to_param  (rbp, 8+4);
    const Address  key_param (rbp, 8+8);
    const Address  rvec_param (rbp, 8+12);
    const Address  len_param  (rbp, 8+16);

    __ movptr(from , from_param);
    __ movptr(to   , to_param);
    __ movptr(key  , key_param);
    __ movptr(rvec , rvec_param);
    __ movptr(len_reg , len_param);

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec

    __ xorptr(pos, pos);

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    // rvec is reused (restored from rvec_param again at L_exit)
    __ movl(rvec, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rvec, 52);
    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
    __ cmpl(rvec, 60);
    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
    // keylen 44 (AES-128) falls through into L_multiBlock_loopTop[0]

// emit the same instruction for each of the four result registers
#define DoFour(opc, src_reg)           \
  __ opc(xmm_result0, src_reg);         \
  __ opc(xmm_result1, src_reg);         \
  __ opc(xmm_result2, src_reg);         \
  __ opc(xmm_result3, src_reg);         \

    // k == 0 : code for key_128; k == 1 : key_192; k == 2 : key_256
    for (int k = 0; k < 3; ++k) {
      __ align(OptoLoopAlignment);
      __ BIND(L_multiBlock_loopTop[k]);
      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
      __ jcc(Assembler::less, L_singleBlock_loopTop[k]);

      __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
      __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
      __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));

      // the java expanded key ordering is rotated one position from what we want
      // so we start from 0x10 here and hit 0x00 last
      load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
      DoFour(pxor, xmm_key_tmp0); //xor with first key
      // do the aes dec rounds
      for (int rnum = 1; rnum <= ROUNDS[k];) {
        //load two keys at a time
        //k1->0x20, ..., k9->0xa0, k10->0x00
        load_key(xmm_key_tmp1, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
        load_key(xmm_key_tmp0, key, ((rnum + 2) % (ROUNDS[k] + 1)) * 0x10, xmm_key_shuf_mask); // hit 0x00 last!
        DoFour(aesdec, xmm_key_tmp1);
        rnum++;
        if (rnum != ROUNDS[k]) {
          DoFour(aesdec, xmm_key_tmp0);
        }
        else {
          // last round uses key 0x00 (wrapped by the modulo above)
          DoFour(aesdeclast, xmm_key_tmp0);
        }
        rnum++;
      }

      // for each result, xor with the r vector of previous cipher block
      // (the corresponding ciphertext block is re-read from 'from')
      __ pxor(xmm_result0, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
      __ pxor(xmm_result1, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
      __ pxor(xmm_result2, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
      __ pxor(xmm_result3, xmm_prev_block_cipher);
      __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks

            // store 4 results into the next 64 bytes of output
       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);

       __ addptr(pos, 4 * AESBlockSize);
       __ subptr(len_reg, 4 * AESBlockSize);
       __ jmp(L_multiBlock_loopTop[k]);

       //singleBlock starts here
       __ align(OptoLoopAlignment);
       __ BIND(L_singleBlock_loopTop[k]);
       __ cmpptr(len_reg, 0); // any blocks left?
       __ jcc(Assembler::equal, L_exit);
       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
       __ movdqa(xmm_result1, xmm_result0); // keep a copy: it becomes the next r vector

       load_key(xmm_key_tmp0, key, 0x10, xmm_key_shuf_mask);
       __ pxor(xmm_result0, xmm_key_tmp0);
       // do the aes dec rounds
       for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
         // the java expanded key ordering is rotated one position from what we want
         load_key(xmm_key_tmp0, key, (rnum + 1) * 0x10, xmm_key_shuf_mask);
         __ aesdec(xmm_result0, xmm_key_tmp0);
       }
       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); // last round key is always 0x00
       __ aesdeclast(xmm_result0, xmm_key_tmp0);
       __ pxor(xmm_result0, xmm_prev_block_cipher); // xor with the current r vector
       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result0); // store into the next 16 bytes of output
       // no need to store r to memory until we exit
       __ movdqa(xmm_prev_block_cipher, xmm_result1); // set up next r vector with cipher input from this block

       __ addptr(pos, AESBlockSize);
       __ subptr(len_reg, AESBlockSize);
       __ jmp(L_singleBlock_loopTop[k]);
    }//for 128/192/256

    __ BIND(L_exit);
    __ movptr(rvec, rvec_param);                        // restore this since reused earlier
    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
    handleSOERegisters(false /*restoring*/);
    __ movptr(rax, len_param);                          // return length
    __ leave();                                         // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
2802 
2803   // CTR AES crypt.
2804   // In 32-bit stub, parallelize 4 blocks at a time
2805   // Arguments:
2806   //
2807   // Inputs:
2808   //   c_rarg0   - source byte array address
2809   //   c_rarg1   - destination byte array address
2810   //   c_rarg2   - K (key) in little endian int array
2811   //   c_rarg3   - counter vector byte array address
2812   //   c_rarg4   - input length
2813   //
2814   // Output:
2815   //   rax       - input length
2816   //
2817   address generate_counterMode_AESCrypt_Parallel() {
2818     assert(UseAES, "need AES instructions and misaligned SSE support");
2819     __ align(CodeEntryAlignment);
2820     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2821     address start = __ pc();
2822     const Register from        = rsi;      // source array address
2823     const Register to          = rdx;      // destination array address
2824     const Register key         = rcx;      // key array address
2825     const Register counter     = rdi;      // counter byte array initialized from initvector array address
2826                                            // and updated with the incremented counter in the end
2827     const Register len_reg     = rbx;
2828     const Register pos         = rax;
2829 
2830     __ enter(); // required for proper stackwalking of RuntimeStub frame
2831     handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi
2832 
2833     // load registers from incoming parameters
2834     const Address  from_param(rbp, 8+0);
2835     const Address  to_param  (rbp, 8+4);
2836     const Address  key_param (rbp, 8+8);
2837     const Address  rvec_param (rbp, 8+12);
2838     const Address  len_param  (rbp, 8+16);
2839     const Address  saved_counter_param(rbp, 8 + 20);
2840     const Address  used_addr_param(rbp, 8 + 24);
2841 
2842     __ movptr(from , from_param);
2843     __ movptr(to   , to_param);
2844     __ movptr(len_reg , len_param);
2845 
2846     // Use the partially used encrpyted counter from last invocation
2847     Label L_exit_preLoop, L_preLoop_start;
2848 
2849     // Use the registers 'counter' and 'key' here in this preloop
2850     // to hold of last 2 params 'used' and 'saved_encCounter_start'
2851     Register used = counter;
2852     Register saved_encCounter_start = key;
2853     Register used_addr = saved_encCounter_start;
2854 
2855     __ movptr(used_addr, used_addr_param);
2856     __ movptr(used, Address(used_addr, 0));
2857     __ movptr(saved_encCounter_start, saved_counter_param);
2858 
2859     __ BIND(L_preLoop_start);
2860     __ cmpptr(used, 16);
2861     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
2862     __ cmpptr(len_reg, 0);
2863     __ jcc(Assembler::lessEqual, L_exit_preLoop);
2864     __ movb(rax, Address(saved_encCounter_start, used));
2865     __ xorb(rax, Address(from, 0));
2866     __ movb(Address(to, 0), rax);
2867     __ addptr(from, 1);
2868     __ addptr(to, 1);
2869     __ addptr(used, 1);
2870     __ subptr(len_reg, 1);
2871 
2872     __ jmp(L_preLoop_start);
2873 
2874     __ BIND(L_exit_preLoop);
2875     __ movptr(used_addr, used_addr_param);
2876     __ movptr(used_addr, used_addr_param);
2877     __ movl(Address(used_addr, 0), used);
2878 
2879     // load the parameters 'key' and 'counter'
2880     __ movptr(key, key_param);
2881     __ movptr(counter, rvec_param);
2882 
2883     // xmm register assignments for the loops below
2884     const XMMRegister xmm_curr_counter      = xmm0;
2885     const XMMRegister xmm_counter_shuf_mask = xmm1;  // need to be reloaded
2886     const XMMRegister xmm_key_shuf_mask     = xmm2;  // need to be reloaded
2887     const XMMRegister xmm_key               = xmm3;
2888     const XMMRegister xmm_result0           = xmm4;
2889     const XMMRegister xmm_result1           = xmm5;
2890     const XMMRegister xmm_result2           = xmm6;
2891     const XMMRegister xmm_result3           = xmm7;
2892     const XMMRegister xmm_from0             = xmm1;   //reuse XMM register
2893     const XMMRegister xmm_from1             = xmm2;
2894     const XMMRegister xmm_from2             = xmm3;
2895     const XMMRegister xmm_from3             = xmm4;
2896 
2897     //for key_128, key_192, key_256
2898     const int rounds[3] = {10, 12, 14};
2899     Label L_singleBlockLoopTop[3];
2900     Label L_multiBlock_loopTop[3];
2901     Label L_key192_top, L_key256_top;
2902     Label L_incCounter[3][4]; // 3: different key length,  4: 4 blocks at a time
2903     Label L_incCounter_single[3]; //for single block, key128, key192, key256
2904     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
2905     Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
2906 
2907     Label L_exit;
2908     const int PARALLEL_FACTOR = 4;  //because of the limited register number
2909 
2910     // initialize counter with initial counter
2911     __ movdqu(xmm_curr_counter, Address(counter, 0x00));
2912     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2913     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled for increase
2914 
2915     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2916     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2917     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2918     __ cmpl(rax, 52);
2919     __ jcc(Assembler::equal, L_key192_top);
2920     __ cmpl(rax, 60);
2921     __ jcc(Assembler::equal, L_key256_top);
2922 
2923     //key128 begins here
2924     __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
2925 
2926 #define CTR_DoFour(opc, src_reg)               \
2927     __ opc(xmm_result0, src_reg);              \
2928     __ opc(xmm_result1, src_reg);              \
2929     __ opc(xmm_result2, src_reg);              \
2930     __ opc(xmm_result3, src_reg);
2931 
2932     // k == 0 :  generate code for key_128
2933     // k == 1 :  generate code for key_192
2934     // k == 2 :  generate code for key_256
2935     for (int k = 0; k < 3; ++k) {
2936       //multi blocks starts here
2937       __ align(OptoLoopAlignment);
2938       __ BIND(L_multiBlock_loopTop[k]);
2939       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
2940       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
2941 
2942       __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2943       __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2944 
2945       //load, then increase counters
2946       CTR_DoFour(movdqa, xmm_curr_counter);
2947       __ push(rbx);
2948       inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]);
2949       inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]);
2950       inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]);
2951       inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]);
2952       __ pop (rbx);
2953 
2954       load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. interleaving for better performance
2955 
2956       CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
2957       CTR_DoFour(pxor, xmm_key);   //PXOR with Round 0 key
2958 
2959       for (int i = 1; i < rounds[k]; ++i) {
2960         load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
2961         CTR_DoFour(aesenc, xmm_key);
2962       }
2963       load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
2964       CTR_DoFour(aesenclast, xmm_key);
2965 
2966       // get next PARALLEL_FACTOR blocks into xmm_from registers
2967       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
2968       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
2969       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
2970 
2971       // PXOR with input text
2972       __ pxor(xmm_result0, xmm_from0); //result0 is xmm4
2973       __ pxor(xmm_result1, xmm_from1);
2974       __ pxor(xmm_result2, xmm_from2);
2975 
2976       // store PARALLEL_FACTOR results into the next 64 bytes of output
2977       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
2978       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
2979       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
2980 
2981       // do it here after xmm_result0 is saved, because xmm_from3 reuse the same register of xmm_result0.
2982       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
2983       __ pxor(xmm_result3, xmm_from3);
2984       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
2985 
2986       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
2987       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
2988       __ jmp(L_multiBlock_loopTop[k]);
2989 
2990       // singleBlock starts here
2991       __ align(OptoLoopAlignment);
2992       __ BIND(L_singleBlockLoopTop[k]);
2993       __ cmpptr(len_reg, 0);
2994       __ jcc(Assembler::equal, L_exit);
2995       __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2996       __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
2997       __ movdqa(xmm_result0, xmm_curr_counter);
2998       load_key(xmm_key, key, 0x00, xmm_key_shuf_mask);
2999       __ push(rbx);//rbx is used for increasing counter
3000       inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]);
3001       __ pop (rbx);
3002       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
3003       __ pxor(xmm_result0, xmm_key);
3004       for (int i = 1; i < rounds[k]; i++) {
3005         load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
3006         __ aesenc(xmm_result0, xmm_key);
3007       }
3008       load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
3009       __ aesenclast(xmm_result0, xmm_key);
3010       __ cmpptr(len_reg, AESBlockSize);
3011       __ jcc(Assembler::less, L_processTail_insr[k]);
3012         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3013         __ pxor(xmm_result0, xmm_from0);
3014         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
3015         __ addptr(pos, AESBlockSize);
3016         __ subptr(len_reg, AESBlockSize);
3017         __ jmp(L_singleBlockLoopTop[k]);
3018 
3019       __ BIND(L_processTail_insr[k]);                                               // Process the tail part of the input array
3020         __ addptr(pos, len_reg);                                                    // 1. Insert bytes from src array into xmm_from0 register
3021         __ testptr(len_reg, 8);
3022         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
3023           __ subptr(pos,8);
3024           __ pinsrd(xmm_from0, Address(from, pos), 0);
3025           __ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1);
3026         __ BIND(L_processTail_4_insr[k]);
3027         __ testptr(len_reg, 4);
3028         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
3029           __ subptr(pos,4);
3030           __ pslldq(xmm_from0, 4);
3031           __ pinsrd(xmm_from0, Address(from, pos), 0);
3032         __ BIND(L_processTail_2_insr[k]);
3033         __ testptr(len_reg, 2);
3034         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
3035           __ subptr(pos, 2);
3036           __ pslldq(xmm_from0, 2);
3037           __ pinsrw(xmm_from0, Address(from, pos), 0);
3038         __ BIND(L_processTail_1_insr[k]);
3039         __ testptr(len_reg, 1);
3040         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
3041           __ subptr(pos, 1);
3042           __ pslldq(xmm_from0, 1);
3043           __ pinsrb(xmm_from0, Address(from, pos), 0);
3044         __ BIND(L_processTail_exit_insr[k]);
3045 
3046         __ movptr(saved_encCounter_start, saved_counter_param);
3047         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);               // 2. Perform pxor of the encrypted counter and plaintext Bytes.
3048         __ pxor(xmm_result0, xmm_from0);                                          //    Also the encrypted counter is saved for next invocation.
3049 
3050         __ testptr(len_reg, 8);
3051         __ jcc(Assembler::zero, L_processTail_4_extr[k]);                        // 3. Extract bytes from xmm_result0 into the dest. array
3052           __ pextrd(Address(to, pos), xmm_result0, 0);
3053           __ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1);
3054           __ psrldq(xmm_result0, 8);
3055           __ addptr(pos, 8);
3056         __ BIND(L_processTail_4_extr[k]);
3057         __ testptr(len_reg, 4);
3058         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
3059           __ pextrd(Address(to, pos), xmm_result0, 0);
3060           __ psrldq(xmm_result0, 4);
3061           __ addptr(pos, 4);
3062         __ BIND(L_processTail_2_extr[k]);
3063         __ testptr(len_reg, 2);
3064         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
3065           __ pextrb(Address(to, pos), xmm_result0, 0);
3066           __ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1);
3067           __ psrldq(xmm_result0, 2);
3068           __ addptr(pos, 2);
3069         __ BIND(L_processTail_1_extr[k]);
3070         __ testptr(len_reg, 1);
3071         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
3072           __ pextrb(Address(to, pos), xmm_result0, 0);
3073 
3074         __ BIND(L_processTail_exit_extr[k]);
3075         __ movptr(used_addr, used_addr_param);
3076         __ movl(Address(used_addr, 0), len_reg);
3077         __ jmp(L_exit);
3078     }
3079 
3080     __ BIND(L_exit);
3081     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
3082     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
3083     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
3084     handleSOERegisters(false /*restoring*/);
3085     __ movptr(rax, len_param); // return length
3086     __ leave();                // required for proper stackwalking of RuntimeStub frame
3087     __ ret(0);
3088 
3089     __ BIND (L_key192_top);
3090     __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
3091     __ jmp(L_multiBlock_loopTop[1]); //key192
3092 
3093     __ BIND (L_key256_top);
3094     __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
3095     __ jmp(L_multiBlock_loopTop[2]); //key192
3096 
3097     return start;
3098   }
3099 
  // ofs and limit are used for multi-block byte arrays.
3101   // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
  address generate_md5_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    // rbp doubles as the buf pointer once the incoming arguments have been
    // copied off; the remaining parameters live in three fresh stack slots
    // so that fast_md5() can address them relative to rsp.
    const Register buf_param = rbp;
    const Address state_param(rsp, 0 * wordSize);
    const Address ofs_param  (rsp, 1 * wordSize);
    const Address limit_param(rsp, 2 * wordSize);

    __ enter();
    // Preserve callee-saved registers clobbered below / by fast_md5.
    __ push(rbx);
    __ push(rdi);
    __ push(rsi);
    __ push(rbp);
    __ subptr(rsp, 3 * wordSize); // scratch slots for state/ofs/limit

    // Incoming C args are at rbp + 8 + 4*i (ret addr + saved ebp precede them).
    __ movptr(rsi, Address(rbp, 8 + 4));
    __ movptr(state_param, rsi);
    if (multi_block) {
      // ofs/limit are only passed for the multi-block entry point.
      __ movptr(rsi, Address(rbp, 8 + 8));
      __ movptr(ofs_param, rsi);
      __ movptr(rsi, Address(rbp, 8 + 12));
      __ movptr(limit_param, rsi);
    }
    __ movptr(buf_param, Address(rbp, 8 + 0)); // do this last because it overwrites rbp
    __ fast_md5(buf_param, state_param, ofs_param, limit_param, multi_block);

    // Tear down in reverse order; rbp is restored before being used by leave().
    __ addptr(rsp, 3 * wordSize);
    __ pop(rbp);
    __ pop(rsi);
    __ pop(rdi);
    __ pop(rbx);
    __ leave();
    __ ret(0);
    return start;
  }
3139 
3140   address generate_upper_word_mask() {
3141     __ align64();
3142     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3143     address start = __ pc();
3144     __ emit_data(0x00000000, relocInfo::none, 0);
3145     __ emit_data(0x00000000, relocInfo::none, 0);
3146     __ emit_data(0x00000000, relocInfo::none, 0);
3147     __ emit_data(0xFFFFFFFF, relocInfo::none, 0);
3148     return start;
3149   }
3150 
3151   address generate_shuffle_byte_flip_mask() {
3152     __ align64();
3153     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3154     address start = __ pc();
3155     __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3156     __ emit_data(0x08090a0b, relocInfo::none, 0);
3157     __ emit_data(0x04050607, relocInfo::none, 0);
3158     __ emit_data(0x00010203, relocInfo::none, 0);
3159     return start;
3160   }
3161 
  // ofs and limit are used for multi-block byte arrays.
3163   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    // GPRs handed to fast_sha1; loaded from the stack arguments below.
    Register buf   = rax;
    Register state = rdx;
    Register ofs   = rcx;
    Register limit = rdi;

    // Incoming C args at rbp + 8 + 4*i (ret addr + saved ebp precede them).
    const Address  buf_param(rbp, 8 + 0);
    const Address  state_param(rbp, 8 + 4);
    const Address  ofs_param(rbp, 8 + 8);
    const Address  limit_param(rbp, 8 + 12);

    // XMM working set for the SHA-NI round computation.
    const XMMRegister abcd = xmm0;
    const XMMRegister e0 = xmm1;
    const XMMRegister e1 = xmm2;
    const XMMRegister msg0 = xmm3;

    const XMMRegister msg1 = xmm4;
    const XMMRegister msg2 = xmm5;
    const XMMRegister msg3 = xmm6;
    const XMMRegister shuf_mask = xmm7;

    __ enter();
    __ subptr(rsp, 8 * wordSize);      // scratch area used by fast_sha1 (via rsp)
    handleSOERegisters(true /*saving*/);

    __ movptr(buf, buf_param);
    __ movptr(state, state_param);
    if (multi_block) {
      // ofs/limit are only meaningful for the multi-block entry point.
      __ movptr(ofs, ofs_param);
      __ movptr(limit, limit_param);
    }

    __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
      buf, state, ofs, limit, rsp, multi_block);

    handleSOERegisters(false /*restoring*/);
    __ addptr(rsp, 8 * wordSize);
    __ leave();
    __ ret(0);
    return start;
  }
3209 
3210   address generate_pshuffle_byte_flip_mask() {
3211     __ align64();
3212     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3213     address start = __ pc();
3214     __ emit_data(0x00010203, relocInfo::none, 0);
3215     __ emit_data(0x04050607, relocInfo::none, 0);
3216     __ emit_data(0x08090a0b, relocInfo::none, 0);
3217     __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3218     return start;
3219   }
3220 
  // ofs and limit are used for multi-block byte arrays.
3222   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
 address generate_sha256_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    // GPRs handed to fast_sha256; loaded from the stack arguments below.
    Register buf = rbx;
    Register state = rsi;
    Register ofs = rdx;
    Register limit = rcx;

    // Incoming C args at rbp + 8 + 4*i (ret addr + saved ebp precede them).
    const Address  buf_param(rbp, 8 + 0);
    const Address  state_param(rbp, 8 + 4);
    const Address  ofs_param(rbp, 8 + 8);
    const Address  limit_param(rbp, 8 + 12);

    // XMM working set for the SHA-NI round computation.
    const XMMRegister msg = xmm0;
    const XMMRegister state0 = xmm1;
    const XMMRegister state1 = xmm2;
    const XMMRegister msgtmp0 = xmm3;

    const XMMRegister msgtmp1 = xmm4;
    const XMMRegister msgtmp2 = xmm5;
    const XMMRegister msgtmp3 = xmm6;
    const XMMRegister msgtmp4 = xmm7;

    __ enter();
    __ subptr(rsp, 8 * wordSize);      // scratch area used by fast_sha256 (via rsp)
    handleSOERegisters(true /*saving*/);
    __ movptr(buf, buf_param);
    __ movptr(state, state_param);
    if (multi_block) {
     // ofs/limit are only meaningful for the multi-block entry point.
     __ movptr(ofs, ofs_param);
     __ movptr(limit, limit_param);
    }

    __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
      buf, state, ofs, limit, rsp, multi_block);

    handleSOERegisters(false /*restoring*/);
    __ addptr(rsp, 8 * wordSize);
    __ leave();
    __ ret(0);
    return start;
  }
3267 
3268   // byte swap x86 long
3269   address generate_ghash_long_swap_mask() {
3270     __ align(CodeEntryAlignment);
3271     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
3272     address start = __ pc();
3273     __ emit_data(0x0b0a0908, relocInfo::none, 0);
3274     __ emit_data(0x0f0e0d0c, relocInfo::none, 0);
3275     __ emit_data(0x03020100, relocInfo::none, 0);
3276     __ emit_data(0x07060504, relocInfo::none, 0);
3277 
3278   return start;
3279   }
3280 
3281   // byte swap x86 byte array
3282   address generate_ghash_byte_swap_mask() {
3283     __ align(CodeEntryAlignment);
3284     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
3285     address start = __ pc();
3286     __ emit_data(0x0c0d0e0f, relocInfo::none, 0);
3287     __ emit_data(0x08090a0b, relocInfo::none, 0);
3288     __ emit_data(0x04050607, relocInfo::none, 0);
3289     __ emit_data(0x00010203, relocInfo::none, 0);
3290   return start;
3291   }
3292 
3293   /* Single and multi-block ghash operations */
  address generate_ghash_processBlocks() {
    assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
    __ align(CodeEntryAlignment);
    Label L_ghash_loop, L_exit;
    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    address start = __ pc();

    // GPRs loaded from the four incoming stack arguments below.
    const Register state        = rdi;
    const Register subkeyH      = rsi;
    const Register data         = rdx;
    const Register blocks       = rcx;

    // Incoming C args at rbp + 8 + 4*i (ret addr + saved ebp precede them).
    const Address  state_param(rbp, 8+0);
    const Address  subkeyH_param(rbp, 8+4);
    const Address  data_param(rbp, 8+8);
    const Address  blocks_param(rbp, 8+12);

    const XMMRegister xmm_temp0 = xmm0;
    const XMMRegister xmm_temp1 = xmm1;
    const XMMRegister xmm_temp2 = xmm2;
    const XMMRegister xmm_temp3 = xmm3;
    const XMMRegister xmm_temp4 = xmm4;
    const XMMRegister xmm_temp5 = xmm5;
    const XMMRegister xmm_temp6 = xmm6;
    const XMMRegister xmm_temp7 = xmm7;

    __ enter();
    handleSOERegisters(true);  // Save registers

    __ movptr(state, state_param);
    __ movptr(subkeyH, subkeyH_param);
    __ movptr(data, data_param);
    __ movptr(blocks, blocks_param);

    // xmm0 = current hash state, xmm1 = hash subkey H; both are brought
    // into the bit order expected by the carry-less multiply below.
    __ movdqu(xmm_temp0, Address(state, 0));
    __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));

    __ movdqu(xmm_temp1, Address(subkeyH, 0));
    __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));

    // Per-block loop: state = (state ^ block) * H in GF(2^128).
    __ BIND(L_ghash_loop);
    __ movdqu(xmm_temp2, Address(data, 0));
    __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));

    __ pxor(xmm_temp0, xmm_temp2);

    //
    // Multiply with the hash key (schoolbook 64x64 carry-less multiply:
    // four PCLMULQDQ partial products combined into a 256-bit result)
    //
    __ movdqu(xmm_temp3, xmm_temp0);
    __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
    __ movdqu(xmm_temp4, xmm_temp0);
    __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
    __ movdqu(xmm_temp5, xmm_temp0);
    __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
    __ movdqu(xmm_temp6, xmm_temp0);
    __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1

    __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0

    // Split the middle term and add its halves into the low (xmm3) and
    // high (xmm6) 128-bit halves of the product.
    __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
    __ psrldq(xmm_temp4, 8);    // shift by xmm4 64 bits to the right
    __ pslldq(xmm_temp5, 8);    // shift by xmm5 64 bits to the left
    __ pxor(xmm_temp3, xmm_temp5);
    __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
                                        // of the carry-less multiplication of
                                        // xmm0 by xmm1.

    // We shift the result of the multiplication by one bit position
    // to the left to cope for the fact that the bits are reversed.
    // The carry out of each dword is recovered via a 31-bit right shift
    // and OR-ed into the next dword.
    __ movdqu(xmm_temp7, xmm_temp3);
    __ movdqu(xmm_temp4, xmm_temp6);
    __ pslld (xmm_temp3, 1);
    __ pslld(xmm_temp6, 1);
    __ psrld(xmm_temp7, 31);
    __ psrld(xmm_temp4, 31);
    __ movdqu(xmm_temp5, xmm_temp7);
    __ pslldq(xmm_temp4, 4);
    __ pslldq(xmm_temp7, 4);
    __ psrldq(xmm_temp5, 12);  // carry from the low half into the high half
    __ por(xmm_temp3, xmm_temp7);
    __ por(xmm_temp6, xmm_temp4);
    __ por(xmm_temp6, xmm_temp5);

    //
    // First phase of the reduction (modulo the GHASH polynomial
    // x^128 + x^7 + x^2 + x + 1, operating on bit-reflected data)
    //
    // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
    // independently.
    __ movdqu(xmm_temp7, xmm_temp3);
    __ movdqu(xmm_temp4, xmm_temp3);
    __ movdqu(xmm_temp5, xmm_temp3);
    __ pslld(xmm_temp7, 31);    // packed left shift of each dword by 31
    __ pslld(xmm_temp4, 30);    // packed left shift of each dword by 30
    __ pslld(xmm_temp5, 25);    // packed left shift of each dword by 25
    __ pxor(xmm_temp7, xmm_temp4);      // xor the shifted versions
    __ pxor(xmm_temp7, xmm_temp5);
    __ movdqu(xmm_temp4, xmm_temp7);
    __ pslldq(xmm_temp7, 12);
    __ psrldq(xmm_temp4, 4);
    __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete

    //
    // Second phase of the reduction
    //
    // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
    // shift operations.
    __ movdqu(xmm_temp2, xmm_temp3);
    __ movdqu(xmm_temp7, xmm_temp3);
    __ movdqu(xmm_temp5, xmm_temp3);
    __ psrld(xmm_temp2, 1);     // packed right shift of each dword by 1
    __ psrld(xmm_temp7, 2);     // packed right shift of each dword by 2
    __ psrld(xmm_temp5, 7);     // packed right shift of each dword by 7
    __ pxor(xmm_temp2, xmm_temp7);      // xor the shifted versions
    __ pxor(xmm_temp2, xmm_temp5);
    __ pxor(xmm_temp2, xmm_temp4);
    __ pxor(xmm_temp3, xmm_temp2);
    __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6

    // Loop control: advance to the next 16-byte block, or exit.
    __ decrement(blocks);
    __ jcc(Assembler::zero, L_exit);
    __ movdqu(xmm_temp0, xmm_temp6);    // new state feeds the next iteration
    __ addptr(data, 16);
    __ jmp(L_ghash_loop);

    __ BIND(L_exit);
       // Byte swap 16-byte result
    __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
    __ movdqu(Address(state, 0), xmm_temp6);   // store the result

    handleSOERegisters(false);  // restore registers
    __ leave();
    __ ret(0);
    return start;
  }
3430 
3431   /**
3432    *  Arguments:
3433    *
3434    * Inputs:
3435    *   rsp(4)   - int crc
3436    *   rsp(8)   - byte* buf
3437    *   rsp(12)  - int length
3438    *
3439    * Output:
3440    *       rax   - int crc result
3441    */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc   = rdx;  // crc
    const Register buf   = rsi;  // source java byte array address
    const Register len   = rcx;  // length
    const Register table = rdi;  // crc_table address (reuse register)
    const Register tmp   = rbx;
    assert_different_registers(crc, buf, len, table, tmp, rax);

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    // Preserve the callee-saved registers used as scratch above.
    __ push(rsi);
    __ push(rdi);
    __ push(rbx);

    // Incoming C args at rbp + 8 + 4*i (ret addr + saved ebp precede them).
    Address crc_arg(rbp, 8 + 0);
    Address buf_arg(rbp, 8 + 4);
    Address len_arg(rbp, 8 + 8);

    // Load up:
    __ movl(crc,   crc_arg);
    __ movptr(buf, buf_arg);
    __ movl(len,   len_arg);

    __ kernel_crc32(crc, buf, len, table, tmp);

    __ movl(rax, crc);  // int result is returned in eax
    __ pop(rbx);
    __ pop(rdi);
    __ pop(rsi);
    __ vzeroupper();    // avoid AVX->SSE transition penalty in the caller
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
3484 
3485   /**
3486   *  Arguments:
3487   *
3488   * Inputs:
3489   *   rsp(4)   - int crc
3490   *   rsp(8)   - byte* buf
3491   *   rsp(12)  - int length
  *   rsp(16)  - table_start - optional (present only when doing a library_call,
3493   *              not used by x86 algorithm)
3494   *
3495   * Output:
3496   *       rax  - int crc result
3497   */
  address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
    assert(UseCRC32CIntrinsics, "need SSE4_2");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
    address start = __ pc();
    const Register crc = rax;  // crc
    const Register buf = rcx;  // source java byte array address
    const Register len = rdx;  // length
    const Register d = rbx;    // scratch for crc32c_ipl_alg2_alt2
    const Register g = rsi;
    const Register h = rdi;
    const Register empty = 0; // will never be used; kept so the signature
                              // of crc32c_IPL_Alg2_Alt2 stays identical
                              // between the 64- and 32-bit generators
    assert_different_registers(crc, buf, len, d, g, h);

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    Address crc_arg(rsp, 4 + 4 + 0); // ESP+4 plus an additional 4 because
                                     // __ enter has just pushed ebp on
                                     // the stack
    Address buf_arg(rsp, 4 + 4 + 4);
    Address len_arg(rsp, 4 + 4 + 8);
      // Load up:
      __ movl(crc, crc_arg);
      __ movl(buf, buf_arg);
      __ movl(len, len_arg);
      // Preserve the callee-saved scratch registers around the kernel.
      __ push(d);
      __ push(g);
      __ push(h);
      __ crc32c_ipl_alg2_alt2(crc, buf, len,
                              d, g, h,
                              empty, empty, empty,
                              xmm0, xmm1, xmm2,
                              is_pclmulqdq_supported);
      __ pop(h);
      __ pop(g);
      __ pop(d);
    __ vzeroupper();  // avoid AVX->SSE transition penalty in the caller
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);  // int result is already in eax (crc)

    return start;
  }
3542 
3543  address generate_libmExp() {
3544     StubCodeMark mark(this, "StubRoutines", "libmExp");
3545 
3546     address start = __ pc();
3547 
3548     const XMMRegister x0  = xmm0;
3549     const XMMRegister x1  = xmm1;
3550     const XMMRegister x2  = xmm2;
3551     const XMMRegister x3  = xmm3;
3552 
3553     const XMMRegister x4  = xmm4;
3554     const XMMRegister x5  = xmm5;
3555     const XMMRegister x6  = xmm6;
3556     const XMMRegister x7  = xmm7;
3557 
3558     const Register tmp   = rbx;
3559 
3560     BLOCK_COMMENT("Entry:");
3561     __ enter(); // required for proper stackwalking of RuntimeStub frame
3562     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3563     __ leave(); // required for proper stackwalking of RuntimeStub frame
3564     __ ret(0);
3565 
3566     return start;
3567 
3568   }
3569 
3570  address generate_libmLog() {
3571    StubCodeMark mark(this, "StubRoutines", "libmLog");
3572 
3573    address start = __ pc();
3574 
3575    const XMMRegister x0 = xmm0;
3576    const XMMRegister x1 = xmm1;
3577    const XMMRegister x2 = xmm2;
3578    const XMMRegister x3 = xmm3;
3579 
3580    const XMMRegister x4 = xmm4;
3581    const XMMRegister x5 = xmm5;
3582    const XMMRegister x6 = xmm6;
3583    const XMMRegister x7 = xmm7;
3584 
3585    const Register tmp = rbx;
3586 
3587    BLOCK_COMMENT("Entry:");
3588    __ enter(); // required for proper stackwalking of RuntimeStub frame
3589    __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3590    __ leave(); // required for proper stackwalking of RuntimeStub frame
3591    __ ret(0);
3592 
3593    return start;
3594 
3595  }
3596 
3597  address generate_libmLog10() {
3598    StubCodeMark mark(this, "StubRoutines", "libmLog10");
3599 
3600    address start = __ pc();
3601 
3602    const XMMRegister x0 = xmm0;
3603    const XMMRegister x1 = xmm1;
3604    const XMMRegister x2 = xmm2;
3605    const XMMRegister x3 = xmm3;
3606 
3607    const XMMRegister x4 = xmm4;
3608    const XMMRegister x5 = xmm5;
3609    const XMMRegister x6 = xmm6;
3610    const XMMRegister x7 = xmm7;
3611 
3612    const Register tmp = rbx;
3613 
3614    BLOCK_COMMENT("Entry:");
3615    __ enter(); // required for proper stackwalking of RuntimeStub frame
3616    __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3617    __ leave(); // required for proper stackwalking of RuntimeStub frame
3618    __ ret(0);
3619 
3620    return start;
3621 
3622  }
3623 
3624  address generate_libmPow() {
3625    StubCodeMark mark(this, "StubRoutines", "libmPow");
3626 
3627    address start = __ pc();
3628 
3629    const XMMRegister x0 = xmm0;
3630    const XMMRegister x1 = xmm1;
3631    const XMMRegister x2 = xmm2;
3632    const XMMRegister x3 = xmm3;
3633 
3634    const XMMRegister x4 = xmm4;
3635    const XMMRegister x5 = xmm5;
3636    const XMMRegister x6 = xmm6;
3637    const XMMRegister x7 = xmm7;
3638 
3639    const Register tmp = rbx;
3640 
3641    BLOCK_COMMENT("Entry:");
3642    __ enter(); // required for proper stackwalking of RuntimeStub frame
3643    __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3644    __ leave(); // required for proper stackwalking of RuntimeStub frame
3645    __ ret(0);
3646 
3647    return start;
3648 
3649  }
3650 
 address generate_libm_reduce_pi04l() {
   StubCodeMark mark(this, "StubRoutines", "libm_reduce_pi04l");

   address start = __ pc();

   BLOCK_COMMENT("Entry:");
   // Emit the argument-reduction helper used by the trig stubs, with the
   // full GPR set available as scratch.  Note there is no enter()/leave()
   // frame here — presumably the emitted code manages its own frame;
   // confirm against MacroAssembler::libm_reduce_pi04l.
   __ libm_reduce_pi04l(rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);

   return start;

 }
3662 
3663  address generate_libm_sin_cos_huge() {
3664    StubCodeMark mark(this, "StubRoutines", "libm_sin_cos_huge");
3665 
3666    address start = __ pc();
3667 
3668    const XMMRegister x0 = xmm0;
3669    const XMMRegister x1 = xmm1;
3670 
3671    BLOCK_COMMENT("Entry:");
3672    __ libm_sincos_huge(x0, x1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3673 
3674    return start;
3675 
3676  }
3677 
3678  address generate_libmSin() {
3679    StubCodeMark mark(this, "StubRoutines", "libmSin");
3680 
3681    address start = __ pc();
3682 
3683    const XMMRegister x0 = xmm0;
3684    const XMMRegister x1 = xmm1;
3685    const XMMRegister x2 = xmm2;
3686    const XMMRegister x3 = xmm3;
3687 
3688    const XMMRegister x4 = xmm4;
3689    const XMMRegister x5 = xmm5;
3690    const XMMRegister x6 = xmm6;
3691    const XMMRegister x7 = xmm7;
3692 
3693    BLOCK_COMMENT("Entry:");
3694    __ enter(); // required for proper stackwalking of RuntimeStub frame
3695    __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rdx);
3696    __ leave(); // required for proper stackwalking of RuntimeStub frame
3697    __ ret(0);
3698 
3699    return start;
3700 
3701  }
3702 
3703  address generate_libmCos() {
3704    StubCodeMark mark(this, "StubRoutines", "libmCos");
3705 
3706    address start = __ pc();
3707 
3708    const XMMRegister x0 = xmm0;
3709    const XMMRegister x1 = xmm1;
3710    const XMMRegister x2 = xmm2;
3711    const XMMRegister x3 = xmm3;
3712 
3713    const XMMRegister x4 = xmm4;
3714    const XMMRegister x5 = xmm5;
3715    const XMMRegister x6 = xmm6;
3716    const XMMRegister x7 = xmm7;
3717 
3718    const Register tmp = rbx;
3719 
3720    BLOCK_COMMENT("Entry:");
3721    __ enter(); // required for proper stackwalking of RuntimeStub frame
3722    __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3723    __ leave(); // required for proper stackwalking of RuntimeStub frame
3724    __ ret(0);
3725 
3726    return start;
3727 
3728  }
3729 
3730  address generate_libm_tan_cot_huge() {
3731    StubCodeMark mark(this, "StubRoutines", "libm_tan_cot_huge");
3732 
3733    address start = __ pc();
3734 
3735    const XMMRegister x0 = xmm0;
3736    const XMMRegister x1 = xmm1;
3737 
3738    BLOCK_COMMENT("Entry:");
3739    __ libm_tancot_huge(x0, x1, rax, rcx, rdx, rbx, rsi, rdi, rbp, rsp);
3740 
3741    return start;
3742 
3743  }
3744 
3745  address generate_libmTan() {
3746    StubCodeMark mark(this, "StubRoutines", "libmTan");
3747 
3748    address start = __ pc();
3749 
3750    const XMMRegister x0 = xmm0;
3751    const XMMRegister x1 = xmm1;
3752    const XMMRegister x2 = xmm2;
3753    const XMMRegister x3 = xmm3;
3754 
3755    const XMMRegister x4 = xmm4;
3756    const XMMRegister x5 = xmm5;
3757    const XMMRegister x6 = xmm6;
3758    const XMMRegister x7 = xmm7;
3759 
3760    const Register tmp = rbx;
3761 
3762    BLOCK_COMMENT("Entry:");
3763    __ enter(); // required for proper stackwalking of RuntimeStub frame
3764    __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
3765    __ leave(); // required for proper stackwalking of RuntimeStub frame
3766    __ ret(0);
3767 
3768    return start;
3769 
3770  }
3771 
  // nmethod entry barrier stub (32-bit): calls into the runtime's
  // BarrierSetNMethod barrier to decide whether the nmethod being entered
  // is still valid. A return value of 1 requests deoptimization; in that
  // case the barrier has written a replacement rsp through the cookie slot
  // pushed below.
  address generate_method_entry_barrier() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");

    Label deoptimize_label;

    address start = __ pc();

    __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing

    BLOCK_COMMENT("Entry:");
    __ enter(); // save rbp

    // save rbx, because we want to use that value.
    // We could do without it but then we depend on the number of slots used by pusha
    __ push(rbx);

    __ lea(rbx, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for rbx - this should be the return address

    __ pusha();

    // xmm0 and xmm1 may be used for passing float/double arguments

    if (UseSSE >= 2) {
      // SSE2 available: spill xmm0/xmm1 as doubles.
      const int xmm_size = wordSize * 4;
      __ subptr(rsp, xmm_size * 2);
      __ movdbl(Address(rsp, xmm_size * 1), xmm1);
      __ movdbl(Address(rsp, xmm_size * 0), xmm0);
    } else if (UseSSE >= 1) {
      // SSE1 only: spill xmm0/xmm1 as floats.
      const int xmm_size = wordSize * 2;
      __ subptr(rsp, xmm_size * 2);
      __ movflt(Address(rsp, xmm_size * 1), xmm1);
      __ movflt(Address(rsp, xmm_size * 0), xmm0);
    }

    // Call the barrier with rbx = address of the return-address slot
    // (computed above); result comes back in rax.
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), rbx);

    // Restore the spilled xmm argument registers (mirror of the spill above).
    if (UseSSE >= 2) {
      const int xmm_size = wordSize * 4;
      __ movdbl(xmm0, Address(rsp, xmm_size * 0));
      __ movdbl(xmm1, Address(rsp, xmm_size * 1));
      __ addptr(rsp, xmm_size * 2);
    } else if (UseSSE >= 1) {
      const int xmm_size = wordSize * 2;
      __ movflt(xmm0, Address(rsp, xmm_size * 0));
      __ movflt(xmm1, Address(rsp, xmm_size * 1));
      __ addptr(rsp, xmm_size * 2);
    }

    __ cmpl(rax, 1); // 1 means deoptimize
    __ jcc(Assembler::equal, deoptimize_label);

    // Normal path: unwind the prologue (pusha/rbx/frame/cookie) and return.
    __ popa();
    __ pop(rbx);

    __ leave();

    __ addptr(rsp, 1 * wordSize); // cookie
    __ ret(0);

    // Deoptimize path: unwind registers, then switch to the rsp the barrier
    // wrote into the cookie slot and jump onward.
    __ BIND(deoptimize_label);

    __ popa();
    __ pop(rbx);

    __ leave();

    // this can be taken out, but is good for verification purposes. getting a SIGSEGV
    // here while still having a correct stack is valuable
    __ testptr(rsp, Address(rsp, 0));

    __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
    __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point

    return start;
  }
3848 
3849  public:
3850   // Information about frame layout at time of blocking runtime call.
3851   // Note that we only have to preserve callee-saved registers since
3852   // the compilers are responsible for supplying a continuation point
3853   // if they expect all registers to be preserved.
  // Stack-slot indices (in words, from rsp) of the frame built by
  // generate_throw_exception(), plus its total size in words.
  enum layout {
    thread_off,    // last_java_sp
    arg1_off,
    arg2_off,
    rbp_off,       // callee saved register
    ret_pc,
    framesize      // total frame size in words, including pushed pc and rbp
  };
3862 
3863  private:
3864 
3865 #undef  __
3866 #define __ masm->
3867 
3868   //------------------------------------------------------------------------------------------------------------------------
3869   // Continuation point for throwing of implicit exceptions that are not handled in
3870   // the current activation. Fabricates an exception oop and initiates normal
3871   // exception dispatching in this frame.
3872   //
3873   // Previously the compiler (c2) allowed for callee save registers on Java calls.
3874   // This is no longer true after adapter frames were removed but could possibly
3875   // be brought back in the future if the interpreter code was reworked and it
3876   // was deemed worthwhile. The comment below was left to describe what must
3877   // happen here if callee saves were resurrected. As it stands now this stub
  // could actually be a vanilla BufferBlob and have no oopMap at all.
3879   // Since it doesn't make much difference we've chosen to leave it the
3880   // way it was in the callee save days and keep the comment.
3881 
3882   // If we need to preserve callee-saved values we need a callee-saved oop map and
3883   // therefore have to make these stubs into RuntimeStubs rather than BufferBlobs.
3884   // If the compiler needs all registers to be preserved between the fault
3885   // point and the exception handler then it must assume responsibility for that in
3886   // AbstractCompiler::continuation_for_implicit_null_exception or
3887   // continuation_for_implicit_division_by_zero_exception. All other implicit
3888   // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
3889   // either at call sites or otherwise assume that stack unwinding will be initiated,
3890   // so caller saved registers were assumed volatile in the compiler.
  // Build a RuntimeStub that raises an exception by calling runtime_entry
  // with the current thread (and up to two optional register arguments
  // passed via stack slots), then dispatches the resulting pending
  // exception through StubRoutines::forward_exception_entry().
  //
  //   name          - stub name (used for the CodeBuffer and RuntimeStub)
  //   runtime_entry - C entry point that creates and posts the exception
  //   arg1, arg2    - optional extra arguments (arg2 requires arg1)
  //
  // Returns the stub's entry point. The runtime call is expected to leave
  // a pending exception (checked under ASSERT).
  address generate_throw_exception(const char* name, address runtime_entry,
                                   Register arg1 = noreg, Register arg2 = noreg) {

    int insts_size = 256;
    int locs_size  = 32;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM
    Register java_thread = rbx;
    __ get_thread(java_thread);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // pc and rbp, already pushed
    __ subptr(rsp, (framesize-2) * wordSize); // prolog

    // Frame is now completed as far as size and linkage.

    int frame_complete = __ pc() - start;

    // push java thread (becomes first argument of C function)
    __ movptr(Address(rsp, thread_off * wordSize), java_thread);
    if (arg1 != noreg) {
      __ movptr(Address(rsp, arg1_off * wordSize), arg1);
    }
    if (arg2 != noreg) {
      assert(arg1 != noreg, "missing reg arg");
      __ movptr(Address(rsp, arg2_off * wordSize), arg2);
    }

    // Set up last_Java_sp and last_Java_fp
    __ set_last_Java_frame(java_thread, rsp, rbp, NULL);

    // Call runtime
    BLOCK_COMMENT("call runtime_entry");
    __ call(RuntimeAddress(runtime_entry));
    // Generate oop map
    OopMap* map =  new OopMap(framesize, 0);
    oop_maps->add_gc_map(__ pc() - start, map);

    // restore the thread (cannot use the pushed argument since arguments
    // may be overwritten by C code generated by an optimizing compiler);
    // however can use the register value directly if it is callee saved.
    __ get_thread(java_thread);

    __ reset_last_Java_frame(java_thread, true);

    __ leave(); // required for proper stackwalking of RuntimeStub frame

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
    __ jcc(Assembler::notEqual, L);
    __ should_not_reach_here();
    __ bind(L);
#endif /* ASSERT */
    __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, framesize, oop_maps, false);
    return stub->entry_point();
  }
3962 
3963 
  // Initialize the x87/SSE control-word constants and the 80-bit subnormal
  // bias values that runtime code loads when switching FP modes.
  void create_control_words() {
    // Round to nearest, 53-bit mode, exceptions masked
    StubRoutines::x86::_fpu_cntrl_wrd_std   = 0x027F;
    // Round to zero, 53-bit mode, exceptions masked
    StubRoutines::x86::_fpu_cntrl_wrd_trunc = 0x0D7F;
    // Round to nearest, 24-bit mode, exceptions masked
    StubRoutines::x86::_fpu_cntrl_wrd_24    = 0x007F;
    // Round to nearest, 64-bit mode, exceptions masked
    StubRoutines::x86::_mxcsr_std           = 0x1F80;
    // Note: the following two constants are 80-bit values
    //       layout is critical for correct loading by FPU.
    // Bias for strict fp multiply/divide
    StubRoutines::x86::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
    StubRoutines::x86::_fpu_subnormal_bias1[1]= 0x80000000;
    StubRoutines::x86::_fpu_subnormal_bias1[2]= 0x03ff;
    // Un-Bias for strict fp multiply/divide
    StubRoutines::x86::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
    StubRoutines::x86::_fpu_subnormal_bias2[1]= 0x80000000;
    StubRoutines::x86::_fpu_subnormal_bias2[2]= 0x7bff;
  }
3984 
3985   //---------------------------------------------------------------------------
3986   // Initialization
3987 
  // Generates the initial subset of stubs (those needed early in startup,
  // e.g. by the interpreter) and initializes their entry points. The
  // remaining stubs are produced later by generate_all().
  void generate_initial() {

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that exist in all platforms
    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
    StubRoutines::_forward_exception_entry      = generate_forward_exception();

    StubRoutines::_call_stub_entry              =
      generate_call_stub(StubRoutines::_call_stub_return_address);
    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry        = generate_catch_exception();

    // platform dependent
    create_control_words();

    StubRoutines::x86::_verify_mxcsr_entry         = generate_verify_mxcsr();
    StubRoutines::x86::_verify_fpu_cntrl_wrd_entry = generate_verify_fpu_cntrl_wrd();
    StubRoutines::x86::_d2i_wrapper                = generate_d2i_wrapper(T_INT,  CAST_FROM_FN_PTR(address, SharedRuntime::d2i));
    StubRoutines::x86::_d2l_wrapper                = generate_d2i_wrapper(T_LONG, CAST_FROM_FN_PTR(address, SharedRuntime::d2l));

    // Build this early so it's available for the interpreter
    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",
                                                                                      CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry  = generate_throw_exception("delayed StackOverflowError throw_exception",
                                                                                      CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));

    if (UseCRC32Intrinsics) {
      // set table address before stub generation which use it
      StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      bool supports_clmul = VM_Version::supports_clmul();
      StubRoutines::x86::generate_CRC32C_table(supports_clmul);
      StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
    }
    // Transcendental math stubs: require SSE2 plus libm intrinsics enabled.
    if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
      // Constant tables shared by the sin/cos/tan kernels - publish the
      // addresses before generating any stub that reads them.
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
          vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
          vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
        StubRoutines::x86::_L_2il0floatpacket_0_adr = (address)StubRoutines::x86::_L_2il0floatpacket_0;
        StubRoutines::x86::_Pi4Inv_adr = (address)StubRoutines::x86::_Pi4Inv;
        StubRoutines::x86::_Pi4x3_adr = (address)StubRoutines::x86::_Pi4x3;
        StubRoutines::x86::_Pi4x4_adr = (address)StubRoutines::x86::_Pi4x4;
        StubRoutines::x86::_ones_adr = (address)StubRoutines::x86::_ones;
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
        StubRoutines::_dexp = generate_libmExp();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
        StubRoutines::_dlog = generate_libmLog();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
        StubRoutines::_dlog10 = generate_libmLog10();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
        StubRoutines::_dpow = generate_libmPow();
      }
      // Shared slow-path helpers must be generated before/alongside the
      // fast-path sin/cos/tan stubs that branch to them.
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
        StubRoutines::_dlibm_reduce_pi04l = generate_libm_reduce_pi04l();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
        StubRoutines::_dlibm_sin_cos_huge = generate_libm_sin_cos_huge();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
        StubRoutines::_dsin = generate_libmSin();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
        StubRoutines::_dcos = generate_libmCos();
      }
      if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
        StubRoutines::_dlibm_tan_cot_huge = generate_libm_tan_cot_huge();
        StubRoutines::_dtan = generate_libmTan();
      }
    }
  }
4071 
  // Generates the remaining stubs (run in the second generation phase,
  // after universe init) and initializes their entry points.
  void generate_all() {

    // These entry points require SharedInfo::stack0 to be set up in non-core builds
    // and need to be relocatable, so they each fabricate a RuntimeStub internally.
    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that are platform specific

    // Constant masks, shuffle tables and lookup tables used by vectorized code.
    StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF);
    StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x80000000);
    StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask_long_double("vector_double_sign_mask", 0x7FFFFFFF, 0xFFFFFFFF);
    StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask_long_double("vector_double_sign_flip", 0x80000000, 0x00000000);
    StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff);
    StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff);
    StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff);
    StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
                                                                        0xFFFFFFFF, 0, 0, 0);
    StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
                                                                        0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
    StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x03020100);
    StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
    StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x01000100);
    StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask_long_double("vector_long_shuffle_mask", 0x00000001, 0x0);
    StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
    StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask_long_double("vector_long_sign_mask", 0x80000000, 0x00000000);
    StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFF);
    StubRoutines::x86::_vector_int_mask_cmp_bits = generate_vector_mask("vector_int_mask_cmp_bits", 0x00000001);
    StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
    StubRoutines::x86::_vector_count_leading_zeros_lut = generate_count_leading_zeros_lut("count_leading_zeros_lut");
    StubRoutines::x86::_vector_reverse_bit_lut = generate_vector_reverse_bit_lut("reverse_bit_lut");
    StubRoutines::x86::_vector_reverse_byte_perm_mask_long = generate_vector_reverse_byte_perm_mask_long("perm_mask_long");
    StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int");
    StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short");

    if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
      // lut implementation influenced by counting 1s algorithm from section 5-1 of Hackers' Delight.
      StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
    }

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // don't bother generating these AES intrinsic stubs unless global flag is set
    if (UseAESIntrinsics) {
      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // might be needed by the others

      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
    }

    if (UseAESCTRIntrinsics) {
      StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
    }

    // Message-digest intrinsic stubs (MD5 / SHA-1 / SHA-256), each gated on
    // its own flag; the constant tables are published first where needed.
    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
      StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
      StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
      StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }

    // Generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    // nmethod entry barrier stub, only when the active GC's barrier set uses one.
    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != NULL) {
      StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
    }
  }
4165 
4166 
4167  public:
4168   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4169     if (all) {
4170       generate_all();
4171     } else {
4172       generate_initial();
4173     }
4174   }
4175 }; // end class declaration
4176 
4177 #define UCM_TABLE_MAX_ENTRIES 8
4178 void StubGenerator_generate(CodeBuffer* code, bool all) {
4179   if (UnsafeCopyMemory::_table == NULL) {
4180     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
4181   }
4182   StubGenerator g(code, all);
4183 }