1 /*
   2  * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "atomic_aarch64.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/interpreter.hpp"
  36 #include "memory/universe.hpp"
  37 #include "nativeInst_aarch64.hpp"
  38 #include "oops/instanceOop.hpp"
  39 #include "oops/method.hpp"
  40 #include "oops/objArrayKlass.hpp"
  41 #include "oops/oop.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "runtime/atomic.hpp"
  44 #include "runtime/frame.inline.hpp"
  45 #include "runtime/handles.inline.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubCodeGenerator.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "runtime/thread.inline.hpp"
  50 #include "utilities/align.hpp"
  51 #include "utilities/powerOfTwo.hpp"
  52 #ifdef COMPILER2
  53 #include "opto/runtime.hpp"
  54 #endif
  55 #if INCLUDE_ZGC
  56 #include "gc/z/zThreadLocalData.hpp"
  57 #endif
  58 
  59 // Declaration and definition of StubGenerator (no .hpp file).
  60 // For a more detailed description of the stub routine structure
  61 // see the comment in stubRoutines.hpp
  62 
  63 #undef __
  64 #define __ _masm->
  65 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  66 
  67 #ifdef PRODUCT
  68 #define BLOCK_COMMENT(str) /* nothing */
  69 #else
  70 #define BLOCK_COMMENT(str) __ block_comment(str)
  71 #endif
  72 
  73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  74 
  75 // Stub Code definitions
  76 
  77 class StubGenerator: public StubCodeGenerator {
  78  private:
  79 
  80 #ifdef PRODUCT
  81 #define inc_counter_np(counter) ((void)0)
  82 #else
  83   void inc_counter_np_(int& counter) {
  84     __ lea(rscratch2, ExternalAddress((address)&counter));
  85     __ ldrw(rscratch1, Address(rscratch2));
  86     __ addw(rscratch1, rscratch1, 1);
  87     __ strw(rscratch1, Address(rscratch2));
  88   }
  89 #define inc_counter_np(counter) \
  90   BLOCK_COMMENT("inc_counter " #counter); \
  91   inc_counter_np_(counter);
  92 #endif
  93 
  94   // Call stubs are used to call Java from C
  95   //
  96   // Arguments:
  97   //    c_rarg0:   call wrapper address                   address
  98   //    c_rarg1:   result                                 address
  99   //    c_rarg2:   result type                            BasicType
 100   //    c_rarg3:   method                                 Method*
 101   //    c_rarg4:   (interpreter) entry point              address
 102   //    c_rarg5:   parameters                             intptr_t*
 103   //    c_rarg6:   parameter size (in words)              int
 104   //    c_rarg7:   thread                                 Thread*
 105   //
 106   // There is no return from the stub itself as any Java result
 107   // is written to result
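       //
       // For reference, the C++ side reaches this stub through a function
       // pointer whose shape is roughly the following (see the CallStub
       // typedef in stubRoutines.hpp for the authoritative declaration;
       // the thread argument arrives via TRAPS):
       //
       //   typedef void (*CallStub)(address   call_wrapper_address,
       //                            intptr_t* result,
       //                            BasicType result_type,
       //                            Method*   method,
       //                            address   entry_point,
       //                            intptr_t* parameters,
       //                            int       size_of_parameters,
       //                            TRAPS);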
 108   //
  109   // we save r30 (lr) as the return PC at the base of the frame and
  110   // link r29 (fp) below it as the frame pointer, then install sp
  111   // (r31) into fp.
 112   //
 113   // we save r0-r7, which accounts for all the c arguments.
 114   //
 115   // TODO: strictly do we need to save them all? they are treated as
 116   // volatile by C so could we omit saving the ones we are going to
 117   // place in global registers (thread? method?) or those we only use
 118   // during setup of the Java call?
 119   //
 120   // we don't need to save r8 which C uses as an indirect result location
 121   // return register.
 122   //
 123   // we don't need to save r9-r15 which both C and Java treat as
 124   // volatile
 125   //
  126   // we don't need to save r16-r18 because Java does not use them
 127   //
 128   // we save r19-r28 which Java uses as scratch registers and C
 129   // expects to be callee-save
 130   //
 131   // we save the bottom 64 bits of each value stored in v8-v15; it is
 132   // the responsibility of the caller to preserve larger values.
 133   //
 134   // so the stub frame looks like this when we enter Java code
 135   //
 136   //     [ return_from_Java     ] <--- sp
 137   //     [ argument word n      ]
 138   //      ...
 139   // -27 [ argument word 1      ]
 140   // -26 [ saved v15            ] <--- sp_after_call
 141   // -25 [ saved v14            ]
 142   // -24 [ saved v13            ]
 143   // -23 [ saved v12            ]
 144   // -22 [ saved v11            ]
 145   // -21 [ saved v10            ]
 146   // -20 [ saved v9             ]
 147   // -19 [ saved v8             ]
 148   // -18 [ saved r28            ]
 149   // -17 [ saved r27            ]
 150   // -16 [ saved r26            ]
 151   // -15 [ saved r25            ]
 152   // -14 [ saved r24            ]
 153   // -13 [ saved r23            ]
 154   // -12 [ saved r22            ]
 155   // -11 [ saved r21            ]
 156   // -10 [ saved r20            ]
 157   //  -9 [ saved r19            ]
 158   //  -8 [ call wrapper    (r0) ]
 159   //  -7 [ result          (r1) ]
 160   //  -6 [ result type     (r2) ]
 161   //  -5 [ method          (r3) ]
 162   //  -4 [ entry point     (r4) ]
 163   //  -3 [ parameters      (r5) ]
 164   //  -2 [ parameter size  (r6) ]
 165   //  -1 [ thread (r7)          ]
 166   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 167   //   1 [ saved lr       (r30) ]
 168 
 169   // Call stub stack layout word offsets from fp
 170   enum call_stub_layout {
 171     sp_after_call_off = -26,
 172 
 173     d15_off            = -26,
 174     d13_off            = -24,
 175     d11_off            = -22,
 176     d9_off             = -20,
 177 
 178     r28_off            = -18,
 179     r26_off            = -16,
 180     r24_off            = -14,
 181     r22_off            = -12,
 182     r20_off            = -10,
 183     call_wrapper_off   =  -8,
 184     result_off         =  -7,
 185     result_type_off    =  -6,
 186     method_off         =  -5,
 187     entry_point_off    =  -4,
 188     parameter_size_off =  -2,
 189     thread_off         =  -1,
 190     fp_f               =   0,
 191     retaddr_off        =   1,
 192   };
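       // For example, with wordSize == 8, call_wrapper_off == -8 places the
       // call wrapper argument at rfp + (-8 * wordSize) == rfp - 64, i.e. the
       // slot labelled -8 in the diagram above; the Address constants built
       // in generate_call_stub() encode exactly these rfp-relative offsets.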
 193 
 194   address generate_call_stub(address& return_address) {
 195     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 196            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 197            "adjust this code");
 198 
 199     StubCodeMark mark(this, "StubRoutines", "call_stub");
 200     address start = __ pc();
 201 
 202     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 203 
 204     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 205     const Address result        (rfp, result_off         * wordSize);
 206     const Address result_type   (rfp, result_type_off    * wordSize);
 207     const Address method        (rfp, method_off         * wordSize);
 208     const Address entry_point   (rfp, entry_point_off    * wordSize);
 209     const Address parameter_size(rfp, parameter_size_off * wordSize);
 210 
 211     const Address thread        (rfp, thread_off         * wordSize);
 212 
 213     const Address d15_save      (rfp, d15_off * wordSize);
 214     const Address d13_save      (rfp, d13_off * wordSize);
 215     const Address d11_save      (rfp, d11_off * wordSize);
 216     const Address d9_save       (rfp, d9_off * wordSize);
 217 
 218     const Address r28_save      (rfp, r28_off * wordSize);
 219     const Address r26_save      (rfp, r26_off * wordSize);
 220     const Address r24_save      (rfp, r24_off * wordSize);
 221     const Address r22_save      (rfp, r22_off * wordSize);
 222     const Address r20_save      (rfp, r20_off * wordSize);
 223 
 224     // stub code
 225 
 226     address aarch64_entry = __ pc();
 227 
 228     // set up frame and move sp to end of save area
 229     __ enter();
 230     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 231 
 232     // save register parameters and Java scratch/global registers
 233     // n.b. we save thread even though it gets installed in
 234     // rthread because we want to sanity check rthread later
 235     __ str(c_rarg7,  thread);
 236     __ strw(c_rarg6, parameter_size);
 237     __ stp(c_rarg4, c_rarg5,  entry_point);
 238     __ stp(c_rarg2, c_rarg3,  result_type);
 239     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 240 
 241     __ stp(r20, r19,   r20_save);
 242     __ stp(r22, r21,   r22_save);
 243     __ stp(r24, r23,   r24_save);
 244     __ stp(r26, r25,   r26_save);
 245     __ stp(r28, r27,   r28_save);
 246 
 247     __ stpd(v9,  v8,   d9_save);
 248     __ stpd(v11, v10,  d11_save);
 249     __ stpd(v13, v12,  d13_save);
 250     __ stpd(v15, v14,  d15_save);
 251 
 252     // install Java thread in global register now we have saved
 253     // whatever value it held
 254     __ mov(rthread, c_rarg7);
 255     // And method
 256     __ mov(rmethod, c_rarg3);
 257 
 258     // set up the heapbase register
 259     __ reinit_heapbase();
 260 
 261 #ifdef ASSERT
 262     // make sure we have no pending exceptions
 263     {
 264       Label L;
 265       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 266       __ cmp(rscratch1, (u1)NULL_WORD);
 267       __ br(Assembler::EQ, L);
 268       __ stop("StubRoutines::call_stub: entered with pending exception");
 269       __ BIND(L);
 270     }
 271 #endif
 272     // pass parameters if any
 273     __ mov(esp, sp);
 274     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 275     __ andr(sp, rscratch1, -2 * wordSize);
 276 
 277     BLOCK_COMMENT("pass parameters if any");
 278     Label parameters_done;
 279     // parameter count is still in c_rarg6
 280     // and parameter pointer identifying param 1 is in c_rarg5
 281     __ cbzw(c_rarg6, parameters_done);
 282 
 283     address loop = __ pc();
 284     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 285     __ subsw(c_rarg6, c_rarg6, 1);
 286     __ push(rscratch1);
 287     __ br(Assembler::GT, loop);
 288 
 289     __ BIND(parameters_done);
 290 
  291     // call Java entry -- passing Method* and current sp
 292     //      rmethod: Method*
 293     //      r13: sender sp
 294     BLOCK_COMMENT("call Java function");
 295     __ mov(r13, sp);
 296     __ blr(c_rarg4);
 297 
 298     // we do this here because the notify will already have been done
 299     // if we get to the next instruction via an exception
 300     //
 301     // n.b. adding this instruction here affects the calculation of
 302     // whether or not a routine returns to the call stub (used when
 303     // doing stack walks) since the normal test is to check the return
 304     // pc against the address saved below. so we may need to allow for
 305     // this extra instruction in the check.
 306 
 307     // save current address for use by exception handling code
 308 
 309     return_address = __ pc();
 310 
 311     // store result depending on type (everything that is not
 312     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 313     // n.b. this assumes Java returns an integral result in r0
 314     // and a floating result in j_farg0
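         //
         // In effect the branches below implement this sketch (illustrative
         // C only; the generated code uses compares and branches):
         //
         //   switch (result_type) {
         //     case T_OBJECT:
         //     case T_LONG:   *(jlong*)   result = r0;       break;
         //     case T_FLOAT:  *(jfloat*)  result = j_farg0;  break;
         //     case T_DOUBLE: *(jdouble*) result = j_farg0;  break;
         //     default:       *(jint*)    result = (jint)r0; break; // incl. T_INT
         //   }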
 315     __ ldr(j_rarg2, result);
 316     Label is_long, is_float, is_double, exit;
 317     __ ldr(j_rarg1, result_type);
 318     __ cmp(j_rarg1, (u1)T_OBJECT);
 319     __ br(Assembler::EQ, is_long);
 320     __ cmp(j_rarg1, (u1)T_LONG);
 321     __ br(Assembler::EQ, is_long);
 322     __ cmp(j_rarg1, (u1)T_FLOAT);
 323     __ br(Assembler::EQ, is_float);
 324     __ cmp(j_rarg1, (u1)T_DOUBLE);
 325     __ br(Assembler::EQ, is_double);
 326 
 327     // handle T_INT case
 328     __ strw(r0, Address(j_rarg2));
 329 
 330     __ BIND(exit);
 331 
 332     // pop parameters
 333     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 334 
 335 #ifdef ASSERT
 336     // verify that threads correspond
 337     {
 338       Label L, S;
 339       __ ldr(rscratch1, thread);
 340       __ cmp(rthread, rscratch1);
 341       __ br(Assembler::NE, S);
 342       __ get_thread(rscratch1);
 343       __ cmp(rthread, rscratch1);
 344       __ br(Assembler::EQ, L);
 345       __ BIND(S);
 346       __ stop("StubRoutines::call_stub: threads must correspond");
 347       __ BIND(L);
 348     }
 349 #endif
 350 
 351     // restore callee-save registers
 352     __ ldpd(v15, v14,  d15_save);
 353     __ ldpd(v13, v12,  d13_save);
 354     __ ldpd(v11, v10,  d11_save);
 355     __ ldpd(v9,  v8,   d9_save);
 356 
 357     __ ldp(r28, r27,   r28_save);
 358     __ ldp(r26, r25,   r26_save);
 359     __ ldp(r24, r23,   r24_save);
 360     __ ldp(r22, r21,   r22_save);
 361     __ ldp(r20, r19,   r20_save);
 362 
 363     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 364     __ ldrw(c_rarg2, result_type);
 365     __ ldr(c_rarg3,  method);
 366     __ ldp(c_rarg4, c_rarg5,  entry_point);
 367     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 368 
 369     // leave frame and return to caller
 370     __ leave();
 371     __ ret(lr);
 372 
 373     // handle return types different from T_INT
 374 
 375     __ BIND(is_long);
 376     __ str(r0, Address(j_rarg2, 0));
 377     __ br(Assembler::AL, exit);
 378 
 379     __ BIND(is_float);
 380     __ strs(j_farg0, Address(j_rarg2, 0));
 381     __ br(Assembler::AL, exit);
 382 
 383     __ BIND(is_double);
 384     __ strd(j_farg0, Address(j_rarg2, 0));
 385     __ br(Assembler::AL, exit);
 386 
 387     return start;
 388   }
 389 
 390   // Return point for a Java call if there's an exception thrown in
 391   // Java code.  The exception is caught and transformed into a
 392   // pending exception stored in JavaThread that can be tested from
 393   // within the VM.
 394   //
 395   // Note: Usually the parameters are removed by the callee. In case
 396   // of an exception crossing an activation frame boundary, that is
 397   // not the case if the callee is compiled code => need to setup the
 398   // rsp.
 399   //
 400   // r0: exception oop
 401 
 402   address generate_catch_exception() {
 403     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 404     address start = __ pc();
 405 
 406     // same as in generate_call_stub():
 407     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 408     const Address thread        (rfp, thread_off         * wordSize);
 409 
 410 #ifdef ASSERT
 411     // verify that threads correspond
 412     {
 413       Label L, S;
 414       __ ldr(rscratch1, thread);
 415       __ cmp(rthread, rscratch1);
 416       __ br(Assembler::NE, S);
 417       __ get_thread(rscratch1);
 418       __ cmp(rthread, rscratch1);
 419       __ br(Assembler::EQ, L);
 420       __ bind(S);
 421       __ stop("StubRoutines::catch_exception: threads must correspond");
 422       __ bind(L);
 423     }
 424 #endif
 425 
 426     // set pending exception
 427     __ verify_oop(r0);
 428 
 429     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 430     __ mov(rscratch1, (address)__FILE__);
 431     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 432     __ movw(rscratch1, (int)__LINE__);
 433     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 434 
 435     // complete return to VM
 436     assert(StubRoutines::_call_stub_return_address != NULL,
 437            "_call_stub_return_address must have been generated before");
 438     __ b(StubRoutines::_call_stub_return_address);
 439 
 440     return start;
 441   }
 442 
 443   // Continuation point for runtime calls returning with a pending
 444   // exception.  The pending exception check happened in the runtime
 445   // or native call stub.  The pending exception in Thread is
 446   // converted into a Java-level exception.
 447   //
 448   // Contract with Java-level exception handlers:
 449   // r0: exception
 450   // r3: throwing pc
 451   //
 452   // NOTE: At entry of this stub, exception-pc must be in LR !!
 453 
 454   // NOTE: this is always used as a jump target within generated code
  455   // so it just needs to be generated code with no prolog
 456 
 457   address generate_forward_exception() {
 458     StubCodeMark mark(this, "StubRoutines", "forward exception");
 459     address start = __ pc();
 460 
 461     // Upon entry, LR points to the return address returning into
 462     // Java (interpreted or compiled) code; i.e., the return address
 463     // becomes the throwing pc.
 464     //
 465     // Arguments pushed before the runtime call are still on the stack
 466     // but the exception handler will reset the stack pointer ->
 467     // ignore them.  A potential result in registers can be ignored as
 468     // well.
 469 
 470 #ifdef ASSERT
 471     // make sure this code is only executed if there is a pending exception
 472     {
 473       Label L;
 474       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 475       __ cbnz(rscratch1, L);
 476       __ stop("StubRoutines::forward exception: no pending exception (1)");
 477       __ bind(L);
 478     }
 479 #endif
 480 
 481     // compute exception handler into r19
 482 
 483     // call the VM to find the handler address associated with the
 484     // caller address. pass thread in r0 and caller pc (ret address)
 485     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 486     // the stack.
 487     __ mov(c_rarg1, lr);
 488     // lr will be trashed by the VM call so we move it to R19
 489     // (callee-saved) because we also need to pass it to the handler
 490     // returned by this call.
 491     __ mov(r19, lr);
 492     BLOCK_COMMENT("call exception_handler_for_return_address");
 493     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 494                          SharedRuntime::exception_handler_for_return_address),
 495                     rthread, c_rarg1);
 496     // Reinitialize the ptrue predicate register, in case the external runtime
 497     // call clobbers ptrue reg, as we may return to SVE compiled code.
 498     __ reinitialize_ptrue();
 499 
 500     // we should not really care that lr is no longer the callee
 501     // address. we saved the value the handler needs in r19 so we can
 502     // just copy it to r3. however, the C2 handler will push its own
  503     // frame and then call into the VM, and the VM code asserts that
 504     // the PC for the frame above the handler belongs to a compiled
 505     // Java method. So, we restore lr here to satisfy that assert.
 506     __ mov(lr, r19);
 507     // setup r0 & r3 & clear pending exception
 508     __ mov(r3, r19);
 509     __ mov(r19, r0);
 510     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 511     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 512 
 513 #ifdef ASSERT
 514     // make sure exception is set
 515     {
 516       Label L;
 517       __ cbnz(r0, L);
 518       __ stop("StubRoutines::forward exception: no pending exception (2)");
 519       __ bind(L);
 520     }
 521 #endif
 522 
 523     // continue at exception handler
 524     // r0: exception
 525     // r3: throwing pc
 526     // r19: exception handler
 527     __ verify_oop(r0);
 528     __ br(r19);
 529 
 530     return start;
 531   }
 532 
 533   // Non-destructive plausibility checks for oops
 534   //
 535   // Arguments:
 536   //    r0: oop to verify
 537   //    rscratch1: error message
 538   //
 539   // Stack after saving c_rarg3:
 540   //    [tos + 0]: saved c_rarg3
 541   //    [tos + 1]: saved c_rarg2
 542   //    [tos + 2]: saved lr
 543   //    [tos + 3]: saved rscratch2
 544   //    [tos + 4]: saved r0
 545   //    [tos + 5]: saved rscratch1
 546   address generate_verify_oop() {
 547 
 548     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 549     address start = __ pc();
 550 
 551     Label exit, error;
 552 
 553     // save c_rarg2 and c_rarg3
 554     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 555 
 556     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 557     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 558     __ ldr(c_rarg3, Address(c_rarg2));
 559     __ add(c_rarg3, c_rarg3, 1);
 560     __ str(c_rarg3, Address(c_rarg2));
 561 
 562     // object is in r0
 563     // make sure object is 'reasonable'
 564     __ cbz(r0, exit); // if obj is NULL it is OK
 565 
 566 #if INCLUDE_ZGC
 567     if (UseZGC) {
 568       // Check if mask is good.
 569       // verifies that ZAddressBadMask & r0 == 0
 570       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 571       __ andr(c_rarg2, r0, c_rarg3);
 572       __ cbnz(c_rarg2, error);
 573     }
 574 #endif
 575 
 576     // Check if the oop is in the right area of memory
 577     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 578     __ andr(c_rarg2, r0, c_rarg3);
 579     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 580 
 581     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 582     // instruction here because the flags register is live.
 583     __ eor(c_rarg2, c_rarg2, c_rarg3);
 584     __ cbnz(c_rarg2, error);
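         // i.e. without touching the flags we have just checked that
         // (r0 & Universe::verify_oop_mask()) == Universe::verify_oop_bits()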
 585 
  586     // make sure klass is 'reasonable', i.e. not zero.
 587     __ load_klass(r0, r0);  // get klass
 588     __ cbz(r0, error);      // if klass is NULL it is broken
 589 
 590     // return if everything seems ok
 591     __ bind(exit);
 592 
 593     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 594     __ ret(lr);
 595 
 596     // handle errors
 597     __ bind(error);
 598     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 599 
 600     __ push(RegSet::range(r0, r29), sp);
 601     // debug(char* msg, int64_t pc, int64_t regs[])
 602     __ mov(c_rarg0, rscratch1);      // pass address of error message
 603     __ mov(c_rarg1, lr);             // pass return address
 604     __ mov(c_rarg2, sp);             // pass address of regs on stack
 605 #ifndef PRODUCT
 606     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 607 #endif
 608     BLOCK_COMMENT("call MacroAssembler::debug");
 609     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 610     __ blr(rscratch1);
 611     __ hlt(0);
 612 
 613     return start;
 614   }
 615 
 616   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 617 
 618   // Generate indices for iota vector.
 619   address generate_iota_indices(const char *stub_name) {
 620     __ align(CodeEntryAlignment);
 621     StubCodeMark mark(this, "StubRoutines", stub_name);
 622     address start = __ pc();
 623     __ emit_data64(0x0706050403020100, relocInfo::none);
 624     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 625     return start;
 626   }
 627 
 628   // The inner part of zero_words().  This is the bulk operation,
 629   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 630   // caller is responsible for zeroing the last few words.
 631   //
 632   // Inputs:
 633   // r10: the HeapWord-aligned base address of an array to zero.
 634   // r11: the count in HeapWords, r11 > 0.
 635   //
 636   // Returns r10 and r11, adjusted for the caller to clear.
 637   // r10: the base address of the tail of words left to clear.
 638   // r11: the number of words in the tail.
 639   //      r11 < MacroAssembler::zero_words_block_size.
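       //
       // A rough sketch of the intended use (the real caller is
       // MacroAssembler::zero_words(); the call sequence below is
       // illustrative only):
       //
       //   // r10 = base (HeapWord aligned), r11 = count in words
       //   bl zero_blocks                 // bulk-zero whole blocks
       //   // on return r11 < zero_words_block_size; the caller clears the
       //   // remaining r11 words at r10 with a handful of str/stp stores.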
 640 
 641   address generate_zero_blocks() {
 642     Label done;
 643     Label base_aligned;
 644 
 645     Register base = r10, cnt = r11;
 646 
 647     __ align(CodeEntryAlignment);
 648     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 649     address start = __ pc();
 650 
 651     if (UseBlockZeroing) {
 652       int zva_length = VM_Version::zva_length();
 653 
 654       // Ensure the ZVA length is divisible by 16. This is required by
 655       // the subsequent operations.
 656       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 657 
 658       __ tbz(base, 3, base_aligned);
 659       __ str(zr, Address(__ post(base, 8)));
 660       __ sub(cnt, cnt, 1);
 661       __ bind(base_aligned);
 662 
 663       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 664       // alignment.
 665       Label small;
 666       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 667       __ subs(rscratch1, cnt, low_limit >> 3);
 668       __ br(Assembler::LT, small);
 669       __ zero_dcache_blocks(base, cnt);
 670       __ bind(small);
 671     }
 672 
 673     {
 674       // Number of stp instructions we'll unroll
 675       const int unroll =
 676         MacroAssembler::zero_words_block_size / 2;
 677       // Clear the remaining blocks.
 678       Label loop;
 679       __ subs(cnt, cnt, unroll * 2);
 680       __ br(Assembler::LT, done);
 681       __ bind(loop);
 682       for (int i = 0; i < unroll; i++)
 683         __ stp(zr, zr, __ post(base, 16));
 684       __ subs(cnt, cnt, unroll * 2);
 685       __ br(Assembler::GE, loop);
 686       __ bind(done);
 687       __ add(cnt, cnt, unroll * 2);
 688     }
 689 
 690     __ ret(lr);
 691 
 692     return start;
 693   }
 694 
 695 
 696   typedef enum {
 697     copy_forwards = 1,
 698     copy_backwards = -1
 699   } copy_direction;
 700 
 701   // Bulk copy of blocks of 8 words.
 702   //
 703   // count is a count of words.
 704   //
 705   // Precondition: count >= 8
 706   //
 707   // Postconditions:
 708   //
 709   // The least significant bit of count contains the remaining count
 710   // of words to copy.  The rest of count is trash.
 711   //
 712   // s and d are adjusted to point to the remaining words to copy
 713   //
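       // In effect the stub behaves like this sketch (forwards case shown,
       // ignoring prefetch and SIMD; the copyNwords helpers are hypothetical):
       //
       //   while (count >= 8) { copy8words(d, s); s += 8; d += 8; count -= 8; }
       //   if (count & 4)     { copy4words(d, s); s += 4; d += 4; }
       //   if (count & 2)     { copy2words(d, s); s += 2; d += 2; }
       //   // bit 0 of count (at most one trailing word) is left to the caller
       //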
 714   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 715                            copy_direction direction) {
 716     int unit = wordSize * direction;
 717     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 718 
 719     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 720       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 721     const Register stride = r13;
 722 
 723     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 724     assert_different_registers(s, d, count, rscratch1);
 725 
 726     Label again, drain;
 727     const char *stub_name;
 728     if (direction == copy_forwards)
 729       stub_name = "forward_copy_longs";
 730     else
 731       stub_name = "backward_copy_longs";
 732 
 733     __ align(CodeEntryAlignment);
 734 
 735     StubCodeMark mark(this, "StubRoutines", stub_name);
 736 
 737     __ bind(start);
 738 
 739     Label unaligned_copy_long;
 740     if (AvoidUnalignedAccesses) {
 741       __ tbnz(d, 3, unaligned_copy_long);
 742     }
 743 
 744     if (direction == copy_forwards) {
 745       __ sub(s, s, bias);
 746       __ sub(d, d, bias);
 747     }
 748 
 749 #ifdef ASSERT
 750     // Make sure we are never given < 8 words
 751     {
 752       Label L;
 753       __ cmp(count, (u1)8);
 754       __ br(Assembler::GE, L);
 755       __ stop("generate_copy_longs called with < 8 words");
 756       __ bind(L);
 757     }
 758 #endif
 759 
 760     // Fill 8 registers
 761     if (UseSIMDForMemoryOps) {
 762       __ ldpq(v0, v1, Address(s, 4 * unit));
 763       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 764     } else {
 765       __ ldp(t0, t1, Address(s, 2 * unit));
 766       __ ldp(t2, t3, Address(s, 4 * unit));
 767       __ ldp(t4, t5, Address(s, 6 * unit));
 768       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 769     }
 770 
 771     __ subs(count, count, 16);
 772     __ br(Assembler::LO, drain);
 773 
 774     int prefetch = PrefetchCopyIntervalInBytes;
 775     bool use_stride = false;
 776     if (direction == copy_backwards) {
 777        use_stride = prefetch > 256;
 778        prefetch = -prefetch;
 779        if (use_stride) __ mov(stride, prefetch);
 780     }
 781 
 782     __ bind(again);
 783 
 784     if (PrefetchCopyIntervalInBytes > 0)
 785       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 786 
 787     if (UseSIMDForMemoryOps) {
 788       __ stpq(v0, v1, Address(d, 4 * unit));
 789       __ ldpq(v0, v1, Address(s, 4 * unit));
 790       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 791       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 792     } else {
 793       __ stp(t0, t1, Address(d, 2 * unit));
 794       __ ldp(t0, t1, Address(s, 2 * unit));
 795       __ stp(t2, t3, Address(d, 4 * unit));
 796       __ ldp(t2, t3, Address(s, 4 * unit));
 797       __ stp(t4, t5, Address(d, 6 * unit));
 798       __ ldp(t4, t5, Address(s, 6 * unit));
 799       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 800       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 801     }
 802 
 803     __ subs(count, count, 8);
 804     __ br(Assembler::HS, again);
 805 
 806     // Drain
 807     __ bind(drain);
 808     if (UseSIMDForMemoryOps) {
 809       __ stpq(v0, v1, Address(d, 4 * unit));
 810       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 811     } else {
 812       __ stp(t0, t1, Address(d, 2 * unit));
 813       __ stp(t2, t3, Address(d, 4 * unit));
 814       __ stp(t4, t5, Address(d, 6 * unit));
 815       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 816     }
 817 
 818     {
 819       Label L1, L2;
 820       __ tbz(count, exact_log2(4), L1);
 821       if (UseSIMDForMemoryOps) {
 822         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 823         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 824       } else {
 825         __ ldp(t0, t1, Address(s, 2 * unit));
 826         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 827         __ stp(t0, t1, Address(d, 2 * unit));
 828         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 829       }
 830       __ bind(L1);
 831 
 832       if (direction == copy_forwards) {
 833         __ add(s, s, bias);
 834         __ add(d, d, bias);
 835       }
 836 
 837       __ tbz(count, 1, L2);
 838       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 839       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 840       __ bind(L2);
 841     }
 842 
 843     __ ret(lr);
 844 
 845     if (AvoidUnalignedAccesses) {
 846       Label drain, again;
 847       // Register order for storing. Order is different for backward copy.
 848 
 849       __ bind(unaligned_copy_long);
 850 
 851       // source address is even aligned, target odd aligned
 852       //
 853       // when forward copying word pairs we read long pairs at offsets
 854       // {0, 2, 4, 6} (in long words). when backwards copying we read
 855       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 856       // address by -2 in the forwards case so we can compute the
 857       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 858       // or -1.
 859       //
 860       // when forward copying we need to store 1 word, 3 pairs and
 861       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 862       // zero offset we adjust the destination by -1, which means we
 863       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 864       //
 865       // When backwards copying we need to store 1 word, 3 pairs and
 866       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 867       // offsets {1, 3, 5, 7, 8} * unit.
 868 
 869       if (direction == copy_forwards) {
 870         __ sub(s, s, 16);
 871         __ sub(d, d, 8);
 872       }
 873 
 874       // Fill 8 registers
 875       //
 876       // for forwards copy s was offset by -16 from the original input
 877       // value of s so the register contents are at these offsets
 878       // relative to the 64 byte block addressed by that original input
 879       // and so on for each successive 64 byte block when s is updated
 880       //
 881       // t0 at offset 0,  t1 at offset 8
 882       // t2 at offset 16, t3 at offset 24
 883       // t4 at offset 32, t5 at offset 40
 884       // t6 at offset 48, t7 at offset 56
 885 
 886       // for backwards copy s was not offset so the register contents
 887       // are at these offsets into the preceding 64 byte block
 888       // relative to that original input and so on for each successive
 889       // preceding 64 byte block when s is updated. this explains the
 890       // slightly counter-intuitive looking pattern of register usage
 891       // in the stp instructions for backwards copy.
 892       //
 893       // t0 at offset -16, t1 at offset -8
 894       // t2 at offset -32, t3 at offset -24
 895       // t4 at offset -48, t5 at offset -40
 896       // t6 at offset -64, t7 at offset -56
 897 
 898       __ ldp(t0, t1, Address(s, 2 * unit));
 899       __ ldp(t2, t3, Address(s, 4 * unit));
 900       __ ldp(t4, t5, Address(s, 6 * unit));
 901       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 902 
 903       __ subs(count, count, 16);
 904       __ br(Assembler::LO, drain);
 905 
 906       int prefetch = PrefetchCopyIntervalInBytes;
 907       bool use_stride = false;
 908       if (direction == copy_backwards) {
 909          use_stride = prefetch > 256;
 910          prefetch = -prefetch;
 911          if (use_stride) __ mov(stride, prefetch);
 912       }
 913 
 914       __ bind(again);
 915 
 916       if (PrefetchCopyIntervalInBytes > 0)
 917         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 918 
 919       if (direction == copy_forwards) {
 920        // allowing for the offset of -8 the store instructions place
 921        // registers into the target 64 byte block at the following
 922        // offsets
 923        //
 924        // t0 at offset 0
 925        // t1 at offset 8,  t2 at offset 16
 926        // t3 at offset 24, t4 at offset 32
 927        // t5 at offset 40, t6 at offset 48
 928        // t7 at offset 56
 929 
 930         __ str(t0, Address(d, 1 * unit));
 931         __ stp(t1, t2, Address(d, 2 * unit));
 932         __ ldp(t0, t1, Address(s, 2 * unit));
 933         __ stp(t3, t4, Address(d, 4 * unit));
 934         __ ldp(t2, t3, Address(s, 4 * unit));
 935         __ stp(t5, t6, Address(d, 6 * unit));
 936         __ ldp(t4, t5, Address(s, 6 * unit));
 937         __ str(t7, Address(__ pre(d, 8 * unit)));
 938         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 939       } else {
 940        // d was not offset when we started so the registers are
 941        // written into the 64 bit block preceding d with the following
 942        // offsets
 943        //
 944        // t1 at offset -8
 945        // t3 at offset -24, t0 at offset -16
 946        // t5 at offset -40, t2 at offset -32
 947        // t7 at offset -56, t4 at offset -48
 948        //                   t6 at offset -64
 949        //
 950        // note that this matches the offsets previously noted for the
 951        // loads
 952 
 953         __ str(t1, Address(d, 1 * unit));
 954         __ stp(t3, t0, Address(d, 3 * unit));
 955         __ ldp(t0, t1, Address(s, 2 * unit));
 956         __ stp(t5, t2, Address(d, 5 * unit));
 957         __ ldp(t2, t3, Address(s, 4 * unit));
 958         __ stp(t7, t4, Address(d, 7 * unit));
 959         __ ldp(t4, t5, Address(s, 6 * unit));
 960         __ str(t6, Address(__ pre(d, 8 * unit)));
 961         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 962       }
 963 
 964       __ subs(count, count, 8);
 965       __ br(Assembler::HS, again);
 966 
 967       // Drain
 968       //
 969       // this uses the same pattern of offsets and register arguments
 970       // as above
 971       __ bind(drain);
 972       if (direction == copy_forwards) {
 973         __ str(t0, Address(d, 1 * unit));
 974         __ stp(t1, t2, Address(d, 2 * unit));
 975         __ stp(t3, t4, Address(d, 4 * unit));
 976         __ stp(t5, t6, Address(d, 6 * unit));
 977         __ str(t7, Address(__ pre(d, 8 * unit)));
 978       } else {
 979         __ str(t1, Address(d, 1 * unit));
 980         __ stp(t3, t0, Address(d, 3 * unit));
 981         __ stp(t5, t2, Address(d, 5 * unit));
 982         __ stp(t7, t4, Address(d, 7 * unit));
 983         __ str(t6, Address(__ pre(d, 8 * unit)));
 984       }
 985       // now we need to copy any remaining part block which may
 986       // include a 4 word subblock and/or a 2 word subblock.
 987       // bits 2 and 1 in the count are the tell-tale for whether we
 988       // have each such subblock
 989       {
 990         Label L1, L2;
 991         __ tbz(count, exact_log2(4), L1);
 992        // this is the same as above but copying only 4 longs hence
 993        // with only one intervening stp between the str instructions
 994        // but note that the offsets and registers still follow the
 995        // same pattern
 996         __ ldp(t0, t1, Address(s, 2 * unit));
 997         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 998         if (direction == copy_forwards) {
 999           __ str(t0, Address(d, 1 * unit));
1000           __ stp(t1, t2, Address(d, 2 * unit));
1001           __ str(t3, Address(__ pre(d, 4 * unit)));
1002         } else {
1003           __ str(t1, Address(d, 1 * unit));
1004           __ stp(t3, t0, Address(d, 3 * unit));
1005           __ str(t2, Address(__ pre(d, 4 * unit)));
1006         }
1007         __ bind(L1);
1008 
1009         __ tbz(count, 1, L2);
1010        // this is the same as above but copying only 2 longs hence
1011        // there is no intervening stp between the str instructions
1012        // but note that the offset and register patterns are still
1013        // the same
1014         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1015         if (direction == copy_forwards) {
1016           __ str(t0, Address(d, 1 * unit));
1017           __ str(t1, Address(__ pre(d, 2 * unit)));
1018         } else {
1019           __ str(t1, Address(d, 1 * unit));
1020           __ str(t0, Address(__ pre(d, 2 * unit)));
1021         }
1022         __ bind(L2);
1023 
1024        // for forwards copy we need to re-adjust the offsets we
1025        // applied so that s and d follow the last words written
1026 
1027        if (direction == copy_forwards) {
1028          __ add(s, s, 16);
1029          __ add(d, d, 8);
1030        }
1031 
1032       }
1033 
1034       __ ret(lr);
1035     }
1036   }
1037 
1038   // Small copy: less than 16 bytes.
1039   //
1040   // NB: Ignores all of the bits of count which represent more than 15
1041   // bytes, so a caller doesn't have to mask them.
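       //
       // For example, with granularity 1 (a byte copy) and count == 13
       // (0b1101): bit 3 moves 8 bytes, bit 2 moves 4 bytes, bit 1 is
       // clear, and bit 0 moves the final byte, 13 bytes in total.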
1042 
1043   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1044     bool is_backwards = step < 0;
1045     size_t granularity = uabs(step);
1046     int direction = is_backwards ? -1 : 1;
1047     int unit = wordSize * direction;
1048 
1049     Label Lword, Lint, Lshort, Lbyte;
1050 
1051     assert(granularity
1052            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1053 
1054     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1055 
1056     // ??? I don't know if this bit-test-and-branch is the right thing
1057     // to do.  It does a lot of jumping, resulting in several
1058     // mispredicted branches.  It might make more sense to do this
1059     // with something like Duff's device with a single computed branch.
1060 
1061     __ tbz(count, 3 - exact_log2(granularity), Lword);
1062     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1063     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1064     __ bind(Lword);
1065 
1066     if (granularity <= sizeof (jint)) {
1067       __ tbz(count, 2 - exact_log2(granularity), Lint);
1068       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1069       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1070       __ bind(Lint);
1071     }
1072 
1073     if (granularity <= sizeof (jshort)) {
1074       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1075       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1076       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1077       __ bind(Lshort);
1078     }
1079 
1080     if (granularity <= sizeof (jbyte)) {
1081       __ tbz(count, 0, Lbyte);
1082       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1083       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1084       __ bind(Lbyte);
1085     }
1086   }
1087 
1088   Label copy_f, copy_b;
1089 
1090   // All-singing all-dancing memory copy.
1091   //
1092   // Copy count units of memory from s to d.  The size of a unit is
1093   // step, which can be positive or negative depending on the direction
1094   // of copy.  If is_aligned is false, we align the source address.
1095   //
1096 
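       // In outline (sketch only):
       //
       //   if (bytes <= 80 /* or 96 with SIMD */) {
       //     copy inline with overlapping loads/stores from both ends;
       //   } else {
       //     copy_memory_small(...)   // ragged head, to 2-word align s
       //     bl copy_f / copy_b       // bulk copy of whole 8-word blocks
       //     copy_memory_small(...)   // remaining tail
       //   }
       //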
1097   void copy_memory(bool is_aligned, Register s, Register d,
1098                    Register count, Register tmp, int step) {
1099     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1100     bool is_backwards = step < 0;
1101     unsigned int granularity = uabs(step);
1102     const Register t0 = r3, t1 = r4;
1103 
1104     // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't matter
1105     // because we always load all the data before writing anything.
1106     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1107     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1108     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1109     const Register send = r17, dend = r16;
1110 
1111     if (PrefetchCopyIntervalInBytes > 0)
1112       __ prfm(Address(s, 0), PLDL1KEEP);
1113     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1114     __ br(Assembler::HI, copy_big);
1115 
1116     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1117     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1118 
1119     __ cmp(count, u1(16/granularity));
1120     __ br(Assembler::LS, copy16);
1121 
1122     __ cmp(count, u1(64/granularity));
1123     __ br(Assembler::HI, copy80);
1124 
1125     __ cmp(count, u1(32/granularity));
1126     __ br(Assembler::LS, copy32);
1127 
1128     // 33..64 bytes
1129     if (UseSIMDForMemoryOps) {
1130       __ ldpq(v0, v1, Address(s, 0));
1131       __ ldpq(v2, v3, Address(send, -32));
1132       __ stpq(v0, v1, Address(d, 0));
1133       __ stpq(v2, v3, Address(dend, -32));
1134     } else {
1135       __ ldp(t0, t1, Address(s, 0));
1136       __ ldp(t2, t3, Address(s, 16));
1137       __ ldp(t4, t5, Address(send, -32));
1138       __ ldp(t6, t7, Address(send, -16));
1139 
1140       __ stp(t0, t1, Address(d, 0));
1141       __ stp(t2, t3, Address(d, 16));
1142       __ stp(t4, t5, Address(dend, -32));
1143       __ stp(t6, t7, Address(dend, -16));
1144     }
1145     __ b(finish);
1146 
1147     // 17..32 bytes
1148     __ bind(copy32);
1149     __ ldp(t0, t1, Address(s, 0));
1150     __ ldp(t2, t3, Address(send, -16));
1151     __ stp(t0, t1, Address(d, 0));
1152     __ stp(t2, t3, Address(dend, -16));
1153     __ b(finish);
1154 
1155     // 65..80/96 bytes
1156     // (96 bytes if SIMD because we do 32 bytes per instruction)
1157     __ bind(copy80);
1158     if (UseSIMDForMemoryOps) {
1159       __ ldpq(v0, v1, Address(s, 0));
1160       __ ldpq(v2, v3, Address(s, 32));
1161       // Unaligned pointers can be an issue for copying.
1162       // The issue is more likely when the granularity of the data is
1163       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1164       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1165       // The largest performance drop has been seen for the range 65-80 bytes.
1166       // For such cases using a pair of ldp/stp instead of a third pair of
1167       // ldpq/stpq fixes the performance issue.
1168       if (granularity < sizeof (jint)) {
1169         Label copy96;
1170         __ cmp(count, u1(80/granularity));
1171         __ br(Assembler::HI, copy96);
1172         __ ldp(t0, t1, Address(send, -16));
1173 
1174         __ stpq(v0, v1, Address(d, 0));
1175         __ stpq(v2, v3, Address(d, 32));
1176         __ stp(t0, t1, Address(dend, -16));
1177         __ b(finish);
1178 
1179         __ bind(copy96);
1180       }
1181       __ ldpq(v4, v5, Address(send, -32));
1182 
1183       __ stpq(v0, v1, Address(d, 0));
1184       __ stpq(v2, v3, Address(d, 32));
1185       __ stpq(v4, v5, Address(dend, -32));
1186     } else {
1187       __ ldp(t0, t1, Address(s, 0));
1188       __ ldp(t2, t3, Address(s, 16));
1189       __ ldp(t4, t5, Address(s, 32));
1190       __ ldp(t6, t7, Address(s, 48));
1191       __ ldp(t8, t9, Address(send, -16));
1192 
1193       __ stp(t0, t1, Address(d, 0));
1194       __ stp(t2, t3, Address(d, 16));
1195       __ stp(t4, t5, Address(d, 32));
1196       __ stp(t6, t7, Address(d, 48));
1197       __ stp(t8, t9, Address(dend, -16));
1198     }
1199     __ b(finish);
1200 
1201     // 0..16 bytes
1202     __ bind(copy16);
1203     __ cmp(count, u1(8/granularity));
1204     __ br(Assembler::LO, copy8);
1205 
1206     // 8..16 bytes
1207     __ ldr(t0, Address(s, 0));
1208     __ ldr(t1, Address(send, -8));
1209     __ str(t0, Address(d, 0));
1210     __ str(t1, Address(dend, -8));
1211     __ b(finish);
1212 
1213     if (granularity < 8) {
1214       // 4..7 bytes
1215       __ bind(copy8);
1216       __ tbz(count, 2 - exact_log2(granularity), copy4);
1217       __ ldrw(t0, Address(s, 0));
1218       __ ldrw(t1, Address(send, -4));
1219       __ strw(t0, Address(d, 0));
1220       __ strw(t1, Address(dend, -4));
1221       __ b(finish);
1222       if (granularity < 4) {
1223         // 0..3 bytes
1224         __ bind(copy4);
1225         __ cbz(count, finish); // get rid of 0 case
1226         if (granularity == 2) {
1227           __ ldrh(t0, Address(s, 0));
1228           __ strh(t0, Address(d, 0));
1229         } else { // granularity == 1
1230           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1231           // the first and last byte.
1232           // Handle the 3 byte case by loading and storing base + count/2
1233           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1234           // This does mean that in the 1 byte case we load/store the same
1235           // byte 3 times.
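               // Worked example: for count == 3, count/2 == 1, so we load
               // s[0], s[2] and s[1] and store d[0], d[2] and d[1]; for
               // count == 2 the middle access aliases s[1]/d[1], and for
               // count == 1 all three accesses hit s[0]/d[0].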
1236           __ lsr(count, count, 1);
1237           __ ldrb(t0, Address(s, 0));
1238           __ ldrb(t1, Address(send, -1));
1239           __ ldrb(t2, Address(s, count));
1240           __ strb(t0, Address(d, 0));
1241           __ strb(t1, Address(dend, -1));
1242           __ strb(t2, Address(d, count));
1243         }
1244         __ b(finish);
1245       }
1246     }
1247 
1248     __ bind(copy_big);
1249     if (is_backwards) {
1250       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1251       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1252     }
1253 
1254     // Now that we've got the small case out of the way we can align the
1255     // source address on a 2-word boundary.
1256 
1257     Label aligned;
1258 
1259     if (is_aligned) {
1260       // We may have to adjust by 1 word to get s 2-word-aligned.
1261       __ tbz(s, exact_log2(wordSize), aligned);
1262       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1263       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1264       __ sub(count, count, wordSize/granularity);
1265     } else {
1266       if (is_backwards) {
1267         __ andr(rscratch2, s, 2 * wordSize - 1);
1268       } else {
1269         __ neg(rscratch2, s);
1270         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1271       }
1272       // rscratch2 is the byte adjustment needed to align s.
1273       __ cbz(rscratch2, aligned);
1274       int shift = exact_log2(granularity);
1275       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1276       __ sub(count, count, rscratch2);
1277 
1278 #if 0
1279       // ?? This code is only correct for a disjoint copy.  It may or
1280       // may not make sense to use it in that case.
1281 
1282       // Copy the first pair; s and d may not be aligned.
1283       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1284       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1285 
1286       // Align s and d, adjust count
1287       if (is_backwards) {
1288         __ sub(s, s, rscratch2);
1289         __ sub(d, d, rscratch2);
1290       } else {
1291         __ add(s, s, rscratch2);
1292         __ add(d, d, rscratch2);
1293       }
1294 #else
1295       copy_memory_small(s, d, rscratch2, rscratch1, step);
1296 #endif
1297     }
1298 
1299     __ bind(aligned);
1300 
1301     // s is now 2-word-aligned.
1302 
1303     // We have a count of units and some trailing bytes.  Adjust the
1304     // count and do a bulk copy of words.
1305     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1306     if (direction == copy_forwards)
1307       __ bl(copy_f);
1308     else
1309       __ bl(copy_b);
1310 
1311     // And the tail.
1312     copy_memory_small(s, d, count, tmp, step);
1313 
1314     if (granularity >= 8) __ bind(copy8);
1315     if (granularity >= 4) __ bind(copy4);
1316     __ bind(finish);
1317   }
1318 
1319 
1320   void clobber_registers() {
1321 #ifdef ASSERT
1322     RegSet clobbered
1323       = MacroAssembler::call_clobbered_registers() - rscratch1;
1324     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1325     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1326     for (RegSetIterator<> it = clobbered.begin(); *it != noreg; ++it) {
1327       __ mov(*it, rscratch1);
1328     }
1329 #endif
1330 
1331   }
1332 
1333   // Scan over array at a for count oops, verifying each one.
1334   // Preserves a and count, clobbers rscratch1 and rscratch2.
1335   void verify_oop_array (int size, Register a, Register count, Register temp) {
1336     Label loop, end;
1337     __ mov(rscratch1, a);
1338     __ mov(rscratch2, zr);
1339     __ bind(loop);
1340     __ cmp(rscratch2, count);
1341     __ br(Assembler::HS, end);
1342     if (size == wordSize) {
1343       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1344       __ verify_oop(temp);
1345     } else {
1346       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1347       __ decode_heap_oop(temp); // calls verify_oop
1348     }
1349     __ add(rscratch2, rscratch2, 1);
1350     __ b(loop);
1351     __ bind(end);
1352   }
1353 
1354   // Arguments:
1355   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1356   //             ignored
1357   //   is_oop  - true => oop array, so generate store check code
1358   //   name    - stub name string
1359   //
1360   // Inputs:
1361   //   c_rarg0   - source array address
1362   //   c_rarg1   - destination array address
1363   //   c_rarg2   - element count, treated as ssize_t, can be zero
1364   //
1365   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1366   // the hardware handle it.  The two dwords within qwords that span
1367   // cache line boundaries will still be loaded and stored atomically.
1368   //
1369   // Side Effects:
1370   //   disjoint_int_copy_entry is set to the no-overlap entry point
1371   //   used by generate_conjoint_int_oop_copy().
1372   //
1373   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1374                                   const char *name, bool dest_uninitialized = false) {
1375     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1376     RegSet saved_reg = RegSet::of(s, d, count);
1377     __ align(CodeEntryAlignment);
1378     StubCodeMark mark(this, "StubRoutines", name);
1379     address start = __ pc();
1380     __ enter();
1381 
1382     if (entry != NULL) {
1383       *entry = __ pc();
1384       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1385       BLOCK_COMMENT("Entry:");
1386     }
1387 
1388     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1389     if (dest_uninitialized) {
1390       decorators |= IS_DEST_UNINITIALIZED;
1391     }
1392     if (aligned) {
1393       decorators |= ARRAYCOPY_ALIGNED;
1394     }
1395 
1396     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1397     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1398 
1399     if (is_oop) {
1400       // save regs before copy_memory
1401       __ push(RegSet::of(d, count), sp);
1402     }
1403     {
1404       // UnsafeCopyMemory page error: continue after ucm
1405       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1406       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1407       copy_memory(aligned, s, d, count, rscratch1, size);
1408     }
1409 
1410     if (is_oop) {
1411       __ pop(RegSet::of(d, count), sp);
1412       if (VerifyOops)
1413         verify_oop_array(size, d, count, r16);
1414     }
1415 
1416     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1417 
1418     __ leave();
1419     __ mov(r0, zr); // return 0
1420     __ ret(lr);
1421     return start;
1422   }
1423 
1424   // Arguments:
1425   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1426   //             ignored
1427   //   is_oop  - true => oop array, so generate store check code
1428   //   name    - stub name string
1429   //
1430   // Inputs:
1431   //   c_rarg0   - source array address
1432   //   c_rarg1   - destination array address
1433   //   c_rarg2   - element count, treated as ssize_t, can be zero
1434   //
1435   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1436   // the hardware handle it.  The two dwords within qwords that span
1437   // cache line boundaries will still be loaded and stored atomically.
1438   //
1439   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1440                                  address *entry, const char *name,
1441                                  bool dest_uninitialized = false) {
1442     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1443     RegSet saved_regs = RegSet::of(s, d, count);
1444     StubCodeMark mark(this, "StubRoutines", name);
1445     address start = __ pc();
1446     __ enter();
1447 
1448     if (entry != NULL) {
1449       *entry = __ pc();
1450       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1451       BLOCK_COMMENT("Entry:");
1452     }
1453 
1454     // use fwd copy when (d-s) above_equal (count*size)
1455     __ sub(rscratch1, d, s);
1456     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1457     __ br(Assembler::HS, nooverlap_target);
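         // (If d is below s the unsigned subtraction wraps to a large value,
         // so the HS branch is also taken: a forward copy is always safe when
         // the destination starts below the source.)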
1458 
1459     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1460     if (dest_uninitialized) {
1461       decorators |= IS_DEST_UNINITIALIZED;
1462     }
1463     if (aligned) {
1464       decorators |= ARRAYCOPY_ALIGNED;
1465     }
1466 
1467     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1468     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1469 
1470     if (is_oop) {
1471       // save regs before copy_memory
1472       __ push(RegSet::of(d, count), sp);
1473     }
1474     {
1475       // UnsafeCopyMemory page error: continue after ucm
1476       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1477       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1478       copy_memory(aligned, s, d, count, rscratch1, -size);
1479     }
1480     if (is_oop) {
1481       __ pop(RegSet::of(d, count), sp);
1482       if (VerifyOops)
1483         verify_oop_array(size, d, count, r16);
1484     }
1485     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1486     __ leave();
1487     __ mov(r0, zr); // return 0
1488     __ ret(lr);
1489     return start;
1490   }
1491 
1492   // Arguments:
1493   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1494   //             ignored
1495   //   name    - stub name string
1496   //
1497   // Inputs:
1498   //   c_rarg0   - source array address
1499   //   c_rarg1   - destination array address
1500   //   c_rarg2   - element count, treated as ssize_t, can be zero
1501   //
1502   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1503   // we let the hardware handle it.  The one to eight bytes within words,
1504   // dwords or qwords that span cache line boundaries will still be loaded
1505   // and stored atomically.
1506   //
1507   // Side Effects:
1515   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1516   //   used by generate_conjoint_byte_copy().
1517   //
1518   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1519     const bool not_oop = false;
1520     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1521   }
1522 
1523   // Arguments:
1524   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1525   //             ignored
1526   //   name    - stub name string
1527   //
1528   // Inputs:
1529   //   c_rarg0   - source array address
1530   //   c_rarg1   - destination array address
1531   //   c_rarg2   - element count, treated as ssize_t, can be zero
1532   //
1533   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1534   // we let the hardware handle it.  The one to eight bytes within words,
1535   // dwords or qwords that span cache line boundaries will still be loaded
1536   // and stored atomically.
1537   //
1538   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1539                                       address* entry, const char *name) {
1540     const bool not_oop = false;
1541     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1542   }
1543 
1544   // Arguments:
1545   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1546   //             ignored
1547   //   name    - stub name string
1548   //
1549   // Inputs:
1550   //   c_rarg0   - source array address
1551   //   c_rarg1   - destination array address
1552   //   c_rarg2   - element count, treated as ssize_t, can be zero
1553   //
1554   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1555   // let the hardware handle it.  The two or four words within dwords
1556   // or qwords that span cache line boundaries will still be loaded
1557   // and stored atomically.
1558   //
1559   // Side Effects:
1560   //   disjoint_short_copy_entry is set to the no-overlap entry point
1561   //   used by generate_conjoint_short_copy().
1562   //
1563   address generate_disjoint_short_copy(bool aligned,
1564                                        address* entry, const char *name) {
1565     const bool not_oop = false;
1566     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1567   }
1568 
1569   // Arguments:
1570   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1571   //             ignored
1572   //   name    - stub name string
1573   //
1574   // Inputs:
1575   //   c_rarg0   - source array address
1576   //   c_rarg1   - destination array address
1577   //   c_rarg2   - element count, treated as ssize_t, can be zero
1578   //
1579   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1580   // let the hardware handle it.  The two or four words within dwords
1581   // or qwords that span cache line boundaries will still be loaded
1582   // and stored atomically.
1583   //
1584   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1585                                        address *entry, const char *name) {
1586     const bool not_oop = false;
1587     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1588   }
1589 
1590   // Arguments:
1591   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1592   //             ignored
1593   //   name    - stub name string
1594   //
1595   // Inputs:
1596   //   c_rarg0   - source array address
1597   //   c_rarg1   - destination array address
1598   //   c_rarg2   - element count, treated as ssize_t, can be zero
1599   //
1600   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1601   // the hardware handle it.  The two dwords within qwords that span
1602   // cache line boundaries will still be loaded and stored atomically.
1603   //
1604   // Side Effects:
1605   //   disjoint_int_copy_entry is set to the no-overlap entry point
1606   //   used by generate_conjoint_int_copy().
1607   //
1608   address generate_disjoint_int_copy(bool aligned, address *entry,
1609                                          const char *name, bool dest_uninitialized = false) {
1610     const bool not_oop = false;
1611     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1612   }
1613 
1614   // Arguments:
1615   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1616   //             ignored
1617   //   name    - stub name string
1618   //
1619   // Inputs:
1620   //   c_rarg0   - source array address
1621   //   c_rarg1   - destination array address
1622   //   c_rarg2   - element count, treated as ssize_t, can be zero
1623   //
1624   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1625   // the hardware handle it.  The two dwords within qwords that span
1626   // cache line boundaries will still be loaded and stored atomically.
1627   //
1628   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1629                                      address *entry, const char *name,
1630                                      bool dest_uninitialized = false) {
1631     const bool not_oop = false;
1632     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1633   }
1634 
1635 
1636   // Arguments:
1637   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1638   //             ignored
1639   //   name    - stub name string
1640   //
1641   // Inputs:
1642   //   c_rarg0   - source array address
1643   //   c_rarg1   - destination array address
1644   //   c_rarg2   - element count, treated as size_t, can be zero
1645   //
1646   // Side Effects:
1647   //   disjoint_long_copy_entry is set to the no-overlap entry point
1648   //   used by generate_conjoint_long_copy().
1649   //
1650   address generate_disjoint_long_copy(bool aligned, address *entry,
1651                                           const char *name, bool dest_uninitialized = false) {
1652     const bool not_oop = false;
1653     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1654   }
1655 
1656   // Arguments:
1657   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1658   //             ignored
1659   //   name    - stub name string
1660   //
1661   // Inputs:
1662   //   c_rarg0   - source array address
1663   //   c_rarg1   - destination array address
1664   //   c_rarg2   - element count, treated as size_t, can be zero
1665   //
1666   address generate_conjoint_long_copy(bool aligned,
1667                                       address nooverlap_target, address *entry,
1668                                       const char *name, bool dest_uninitialized = false) {
1669     const bool not_oop = false;
1670     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1671   }
1672 
1673   // Arguments:
1674   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1675   //             ignored
1676   //   name    - stub name string
1677   //
1678   // Inputs:
1679   //   c_rarg0   - source array address
1680   //   c_rarg1   - destination array address
1681   //   c_rarg2   - element count, treated as size_t, can be zero
1682   //
1683   // Side Effects:
1684   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1685   //   used by generate_conjoint_oop_copy().
1686   //
1687   address generate_disjoint_oop_copy(bool aligned, address *entry,
1688                                      const char *name, bool dest_uninitialized) {
1689     const bool is_oop = true;
1690     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1691     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1692   }
1693 
1694   // Arguments:
1695   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1696   //             ignored
1697   //   name    - stub name string
1698   //
1699   // Inputs:
1700   //   c_rarg0   - source array address
1701   //   c_rarg1   - destination array address
1702   //   c_rarg2   - element count, treated as size_t, can be zero
1703   //
1704   address generate_conjoint_oop_copy(bool aligned,
1705                                      address nooverlap_target, address *entry,
1706                                      const char *name, bool dest_uninitialized) {
1707     const bool is_oop = true;
1708     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1709     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1710                                   name, dest_uninitialized);
1711   }
1712 
1713 
1714   // Helper for generating a dynamic type check.
1715   // Smashes rscratch1, rscratch2.
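  // Branches to L_success if sub_klass is a subtype of super_klass
  // (fast path first, then the slow path); falls through on failure.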
1716   void generate_type_check(Register sub_klass,
1717                            Register super_check_offset,
1718                            Register super_klass,
1719                            Label& L_success) {
1720     assert_different_registers(sub_klass, super_check_offset, super_klass);
1721 
1722     BLOCK_COMMENT("type_check:");
1723 
1724     Label L_miss;
1725 
1726     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1727                                      super_check_offset);
1728     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1729 
1730     // Fall through on failure!
1731     __ BIND(L_miss);
1732   }
1733 
1734   //
1735   //  Generate checkcasting array copy stub
1736   //
1737   //  Input:
1738   //    c_rarg0   - source array address
1739   //    c_rarg1   - destination array address
1740   //    c_rarg2   - element count, treated as ssize_t, can be zero
1741   //    c_rarg3   - size_t ckoff (super_check_offset)
1742   //    c_rarg4   - oop ckval (super_klass)
1743   //
1744   //  Output:
1745   //    r0 ==  0  -  success
1746   //    r0 == -1^K - failure, where K is partial transfer count
1747   //
1748   address generate_checkcast_copy(const char *name, address *entry,
1749                                   bool dest_uninitialized = false) {
1750 
1751     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1752 
1753     // Input registers (after setup_arg_regs)
1754     const Register from        = c_rarg0;   // source array address
1755     const Register to          = c_rarg1;   // destination array address
1756     const Register count       = c_rarg2;   // elements count
1757     const Register ckoff       = c_rarg3;   // super_check_offset
1758     const Register ckval       = c_rarg4;   // super_klass
1759 
1760     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1761     RegSet wb_post_saved_regs = RegSet::of(count);
1762 
1763     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1764     const Register copied_oop  = r22;       // actual oop copied
1765     const Register count_save  = r21;       // orig elements count
1766     const Register start_to    = r20;       // destination array start address
1767     const Register r19_klass   = r19;       // oop._klass
1768 
1769     //---------------------------------------------------------------
1770     // Assembler stub will be used for this call to arraycopy
1771     // if the two arrays are subtypes of Object[] but the
1772     // destination array type is not equal to or a supertype
1773     // of the source type.  Each element must be separately
1774     // checked.
1775 
1776     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1777                                copied_oop, r19_klass, count_save);
1778 
1779     __ align(CodeEntryAlignment);
1780     StubCodeMark mark(this, "StubRoutines", name);
1781     address start = __ pc();
1782 
1783     __ enter(); // required for proper stackwalking of RuntimeStub frame
1784 
1785 #ifdef ASSERT
1786     // caller guarantees that the arrays really are different
1787     // otherwise, we would have to make conjoint checks
1788     { Label L;
1789       array_overlap_test(L, TIMES_OOP);
1790       __ stop("checkcast_copy within a single array");
1791       __ bind(L);
1792     }
1793 #endif //ASSERT
1794 
1795     // Caller of this entry point must set up the argument registers.
1796     if (entry != NULL) {
1797       *entry = __ pc();
1798       BLOCK_COMMENT("Entry:");
1799     }
1800 
1801      // Empty array:  Nothing to do.
1802     __ cbz(count, L_done);
1803     __ push(RegSet::of(r19, r20, r21, r22), sp);
1804 
1805 #ifdef ASSERT
1806     BLOCK_COMMENT("assert consistent ckoff/ckval");
1807     // The ckoff and ckval must be mutually consistent,
1808     // even though caller generates both.
1809     { Label L;
1810       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1811       __ ldrw(start_to, Address(ckval, sco_offset));
1812       __ cmpw(ckoff, start_to);
1813       __ br(Assembler::EQ, L);
1814       __ stop("super_check_offset inconsistent");
1815       __ bind(L);
1816     }
1817 #endif //ASSERT
1818 
1819     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1820     bool is_oop = true;
1821     if (dest_uninitialized) {
1822       decorators |= IS_DEST_UNINITIALIZED;
1823     }
1824 
1825     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1826     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1827 
1828     // save the original count
1829     __ mov(count_save, count);
1830 
1831     // Copy from low to high addresses
1832     __ mov(start_to, to);              // Save destination array start address
1833     __ b(L_load_element);
1834 
1835     // ======== begin loop ========
1836     // (Loop is rotated; its entry is L_load_element.)
1837     // Loop control:
1838     //   for (; count != 0; count--) {
1839     //     copied_oop = load_heap_oop(from++);
1840     //     ... generate_type_check ...;
1841     //     store_heap_oop(to++, copied_oop);
1842     //   }
1843     __ align(OptoLoopAlignment);
1844 
1845     __ BIND(L_store_element);
1846     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1847     __ sub(count, count, 1);
1848     __ cbz(count, L_do_card_marks);
1849 
1850     // ======== loop entry is here ========
1851     __ BIND(L_load_element);
1852     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1853     __ cbz(copied_oop, L_store_element);
1854 
1855     __ load_klass(r19_klass, copied_oop);// query the object klass
1856     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1857     // ======== end loop ========
1858 
1859     // It was a real error; we must depend on the caller to finish the job.
1860     // Register count = remaining oops, count_orig = total oops.
1861     // Emit GC store barriers for the oops we have copied and report
1862     // their number to the caller.
1863 
1864     __ subs(count, count_save, count);     // K = partially copied oop count
1865     __ eon(count, count, zr);                   // report (-1^K) to caller
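    // (eon with zr is a bitwise NOT, so r0 = ~K; a caller recovers the number
    //  of elements already copied as ~r0)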
1866     __ br(Assembler::EQ, L_done_pop);
1867 
1868     __ BIND(L_do_card_marks);
1869     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1870 
1871     __ bind(L_done_pop);
1872     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1873     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1874 
1875     __ bind(L_done);
1876     __ mov(r0, count);
1877     __ leave();
1878     __ ret(lr);
1879 
1880     return start;
1881   }
1882 
1883   // Perform range checks on the proposed arraycopy.
1884   // Kills temp, but nothing else.
1885   // Also, clean the sign bits of src_pos and dst_pos.
1886   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1887                               Register src_pos, // source position (c_rarg1)
1888                               Register dst,     // destination array oop (c_rarg2)
1889                               Register dst_pos, // destination position (c_rarg3)
1890                               Register length,
1891                               Register temp,
1892                               Label& L_failed) {
1893     BLOCK_COMMENT("arraycopy_range_checks:");
1894 
1895     assert_different_registers(rscratch1, temp);
1896 
1897     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1898     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1899     __ addw(temp, length, src_pos);
1900     __ cmpw(temp, rscratch1);
1901     __ br(Assembler::HI, L_failed);
1902 
1903     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1904     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1905     __ addw(temp, length, dst_pos);
1906     __ cmpw(temp, rscratch1);
1907     __ br(Assembler::HI, L_failed);
1908 
1909     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1910     __ movw(src_pos, src_pos);
1911     __ movw(dst_pos, dst_pos);
1912 
1913     BLOCK_COMMENT("arraycopy_range_checks done");
1914   }
1915 
1916   // These stubs get called from some dumb test routine.
1917   // I'll write them properly when they're called from
1918   // something that's actually doing something.
1919   static void fake_arraycopy_stub(address src, address dst, int count) {
1920     assert(count == 0, "huh?");
1921   }
1922 
1923 
1924   //
1925   //  Generate 'unsafe' array copy stub
1926   //  Though just as safe as the other stubs, it takes an unscaled
1927   //  size_t argument instead of an element count.
1928   //
1929   //  Input:
1930   //    c_rarg0   - source array address
1931   //    c_rarg1   - destination array address
1932   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1933   //
1934   // Examines the alignment of the operands and dispatches
1935   // to a long, int, short, or byte copy loop.
1936   //
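  // Dispatch sketch (illustrative only):
  //   bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if      ((bits & 7) == 0) long copy,  count >>= 3
  //   else if ((bits & 3) == 0) int copy,   count >>= 2
  //   else if ((bits & 1) == 0) short copy, count >>= 1
  //   else                      byte copy
  //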
1937   address generate_unsafe_copy(const char *name,
1938                                address byte_copy_entry,
1939                                address short_copy_entry,
1940                                address int_copy_entry,
1941                                address long_copy_entry) {
1942     Label L_long_aligned, L_int_aligned, L_short_aligned;
1943     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1944 
1945     __ align(CodeEntryAlignment);
1946     StubCodeMark mark(this, "StubRoutines", name);
1947     address start = __ pc();
1948     __ enter(); // required for proper stackwalking of RuntimeStub frame
1949 
1950     // bump this on entry, not on exit:
1951     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1952 
1953     __ orr(rscratch1, s, d);
1954     __ orr(rscratch1, rscratch1, count);
1955 
1956     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1957     __ cbz(rscratch1, L_long_aligned);
1958     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1959     __ cbz(rscratch1, L_int_aligned);
1960     __ tbz(rscratch1, 0, L_short_aligned);
1961     __ b(RuntimeAddress(byte_copy_entry));
1962 
1963     __ BIND(L_short_aligned);
1964     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1965     __ b(RuntimeAddress(short_copy_entry));
1966     __ BIND(L_int_aligned);
1967     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1968     __ b(RuntimeAddress(int_copy_entry));
1969     __ BIND(L_long_aligned);
1970     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1971     __ b(RuntimeAddress(long_copy_entry));
1972 
1973     return start;
1974   }
1975 
1976   //
1977   //  Generate generic array copy stubs
1978   //
1979   //  Input:
1980   //    c_rarg0    -  src oop
1981   //    c_rarg1    -  src_pos (32-bits)
1982   //    c_rarg2    -  dst oop
1983   //    c_rarg3    -  dst_pos (32-bits)
1984   //    c_rarg4    -  element count (32-bits)
1985   //
1986   //  Output:
1987   //    r0 ==  0  -  success
1988   //    r0 == -1^K - failure, where K is partial transfer count
1989   //
1990   address generate_generic_copy(const char *name,
1991                                 address byte_copy_entry, address short_copy_entry,
1992                                 address int_copy_entry, address oop_copy_entry,
1993                                 address long_copy_entry, address checkcast_copy_entry) {
1994 
1995     Label L_failed, L_objArray;
1996     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1997 
1998     // Input registers
1999     const Register src        = c_rarg0;  // source array oop
2000     const Register src_pos    = c_rarg1;  // source position
2001     const Register dst        = c_rarg2;  // destination array oop
2002     const Register dst_pos    = c_rarg3;  // destination position
2003     const Register length     = c_rarg4;
2004 
2005 
2006     // Registers used as temps
2007     const Register dst_klass  = c_rarg5;
2008 
2009     __ align(CodeEntryAlignment);
2010 
2011     StubCodeMark mark(this, "StubRoutines", name);
2012 
2013     address start = __ pc();
2014 
2015     __ enter(); // required for proper stackwalking of RuntimeStub frame
2016 
2017     // bump this on entry, not on exit:
2018     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2019 
2020     //-----------------------------------------------------------------------
2021     // Assembler stub will be used for this call to arraycopy
2022     // if the following conditions are met:
2023     //
2024     // (1) src and dst must not be null.
2025     // (2) src_pos must not be negative.
2026     // (3) dst_pos must not be negative.
2027     // (4) length  must not be negative.
2028     // (5) src klass and dst klass should be the same and not NULL.
2029     // (6) src and dst should be arrays.
2030     // (7) src_pos + length must not exceed length of src.
2031     // (8) dst_pos + length must not exceed length of dst.
2032     //
2033 
2034     //  if (src == NULL) return -1;
2035     __ cbz(src, L_failed);
2036 
2037     //  if (src_pos < 0) return -1;
2038     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2039 
2040     //  if (dst == NULL) return -1;
2041     __ cbz(dst, L_failed);
2042 
2043     //  if (dst_pos < 0) return -1;
2044     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2045 
2046     // registers used as temp
2047     const Register scratch_length    = r16; // elements count to copy
2048     const Register scratch_src_klass = r17; // array klass
2049     const Register lh                = r15; // layout helper
2050 
2051     //  if (length < 0) return -1;
2052     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2053     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2054 
2055     __ load_klass(scratch_src_klass, src);
2056 #ifdef ASSERT
2057     //  assert(src->klass() != NULL);
2058     {
2059       BLOCK_COMMENT("assert klasses not null {");
2060       Label L1, L2;
2061       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2062       __ bind(L1);
2063       __ stop("broken null klass");
2064       __ bind(L2);
2065       __ load_klass(rscratch1, dst);
2066       __ cbz(rscratch1, L1);     // this would be broken also
2067       BLOCK_COMMENT("} assert klasses not null done");
2068     }
2069 #endif
2070 
2071     // Load layout helper (32-bits)
2072     //
2073     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2074     // 32        30    24            16              8     2                 0
2075     //
2076     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2077     //
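    // Decoding sketch (shift/mask constants as defined in klass.hpp, listed
    // here only for orientation):
    //   array_tag    = (lh >> 30) & 0x3;   // 0x3 typeArray, 0x2 objArray
    //   header_size  = (lh >> 16) & 0xff;  // array header size in bytes
    //   element_type = (lh >>  8) & 0xff;  // BasicType of the elements
    //   log2_esize   =  lh & _lh_log2_element_size_mask;  // 0..3 for primitives
    //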
2078 
2079     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2080 
2081     // Handle objArrays completely differently...
2082     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2083     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2084     __ movw(rscratch1, objArray_lh);
2085     __ eorw(rscratch2, lh, rscratch1);
2086     __ cbzw(rscratch2, L_objArray);
2087 
2088     //  if (src->klass() != dst->klass()) return -1;
2089     __ load_klass(rscratch2, dst);
2090     __ eor(rscratch2, rscratch2, scratch_src_klass);
2091     __ cbnz(rscratch2, L_failed);
2092 
2093     //  if (!src->is_Array()) return -1;
2094     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2095 
2096     // At this point, it is known to be a typeArray (array_tag 0x3).
2097 #ifdef ASSERT
2098     {
2099       BLOCK_COMMENT("assert primitive array {");
2100       Label L;
2101       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2102       __ cmpw(lh, rscratch2);
2103       __ br(Assembler::GE, L);
2104       __ stop("must be a primitive array");
2105       __ bind(L);
2106       BLOCK_COMMENT("} assert primitive array done");
2107     }
2108 #endif
2109 
2110     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2111                            rscratch2, L_failed);
2112 
2113     // TypeArrayKlass
2114     //
2115     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2116     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2117     //
2118 
2119     const Register rscratch1_offset = rscratch1;    // array offset
2120     const Register r15_elsize = lh; // element size
2121 
2122     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2123            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2124     __ add(src, src, rscratch1_offset);           // src array offset
2125     __ add(dst, dst, rscratch1_offset);           // dst array offset
2126     BLOCK_COMMENT("choose copy loop based on element size");
2127 
2128     // next registers should be set before the jump to corresponding stub
2129     const Register from     = c_rarg0;  // source array address
2130     const Register to       = c_rarg1;  // destination array address
2131     const Register count    = c_rarg2;  // elements count
2132 
2133     // 'from', 'to' and 'count' must be set in this order: they alias
2134     // 'src', 'src_pos' and 'dst', and each write must come after the last use of the input it clobbers.
2135 
2136     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2137 
2138     // The possible values of elsize are 0-3, i.e. exact_log2(element
2139     // size in bytes).  We do a simple bitwise binary search.
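    //   elsize: 0 = byte, 1 = short, 2 = int, 3 = long.  Bit 1 splits
    //   {byte,short} from {int,long}; bit 0 then picks within each pair.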
2140   __ BIND(L_copy_bytes);
2141     __ tbnz(r15_elsize, 1, L_copy_ints);
2142     __ tbnz(r15_elsize, 0, L_copy_shorts);
2143     __ lea(from, Address(src, src_pos));// src_addr
2144     __ lea(to,   Address(dst, dst_pos));// dst_addr
2145     __ movw(count, scratch_length); // length
2146     __ b(RuntimeAddress(byte_copy_entry));
2147 
2148   __ BIND(L_copy_shorts);
2149     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2150     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2151     __ movw(count, scratch_length); // length
2152     __ b(RuntimeAddress(short_copy_entry));
2153 
2154   __ BIND(L_copy_ints);
2155     __ tbnz(r15_elsize, 0, L_copy_longs);
2156     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2157     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2158     __ movw(count, scratch_length); // length
2159     __ b(RuntimeAddress(int_copy_entry));
2160 
2161   __ BIND(L_copy_longs);
2162 #ifdef ASSERT
2163     {
2164       BLOCK_COMMENT("assert long copy {");
2165       Label L;
2166       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2167       __ cmpw(r15_elsize, LogBytesPerLong);
2168       __ br(Assembler::EQ, L);
2169       __ stop("must be long copy, but elsize is wrong");
2170       __ bind(L);
2171       BLOCK_COMMENT("} assert long copy done");
2172     }
2173 #endif
2174     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2175     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2176     __ movw(count, scratch_length); // length
2177     __ b(RuntimeAddress(long_copy_entry));
2178 
2179     // ObjArrayKlass
2180   __ BIND(L_objArray);
2181     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2182 
2183     Label L_plain_copy, L_checkcast_copy;
2184     //  test array classes for subtyping
2185     __ load_klass(r15, dst);
2186     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2187     __ br(Assembler::NE, L_checkcast_copy);
2188 
2189     // Identically typed arrays can be copied without element-wise checks.
2190     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2191                            rscratch2, L_failed);
2192 
2193     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2194     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2195     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2196     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2197     __ movw(count, scratch_length); // length
2198   __ BIND(L_plain_copy);
2199     __ b(RuntimeAddress(oop_copy_entry));
2200 
2201   __ BIND(L_checkcast_copy);
2202     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2203     {
2204       // Before looking at dst.length, make sure dst is also an objArray.
2205       __ ldrw(rscratch1, Address(r15, lh_offset));
2206       __ movw(rscratch2, objArray_lh);
2207       __ eorw(rscratch1, rscratch1, rscratch2);
2208       __ cbnzw(rscratch1, L_failed);
2209 
2210       // It is safe to examine both src.length and dst.length.
2211       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2212                              r15, L_failed);
2213 
2214       __ load_klass(dst_klass, dst); // reload
2215 
2216       // Marshal the base address arguments now, freeing registers.
2217       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2218       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2219       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2220       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2221       __ movw(count, length);           // length (reloaded)
2222       Register sco_temp = c_rarg3;      // this register is free now
2223       assert_different_registers(from, to, count, sco_temp,
2224                                  dst_klass, scratch_src_klass);
2225       // assert_clean_int(count, sco_temp);
2226 
2227       // Generate the type check.
2228       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2229       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2230 
2231       // Smashes rscratch1, rscratch2
2232       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2233 
2234       // Fetch destination element klass from the ObjArrayKlass header.
2235       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2236       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2237       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2238 
2239       // the checkcast_copy loop needs two extra arguments:
2240       assert(c_rarg3 == sco_temp, "#3 already in place");
2241       // Set up arguments for checkcast_copy_entry.
2242       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2243       __ b(RuntimeAddress(checkcast_copy_entry));
2244     }
2245 
2246   __ BIND(L_failed);
2247     __ mov(r0, -1);
2248     __ leave();   // required for proper stackwalking of RuntimeStub frame
2249     __ ret(lr);
2250 
2251     return start;
2252   }
2253 
2254   //
2255   // Generate stub for array fill. If "aligned" is true, the
2256   // "to" address is assumed to be heapword aligned.
2257   //
2258   // Arguments for generated stub:
2259   //   to:    c_rarg0
2260   //   value: c_rarg1
2261   //   count: c_rarg2 treated as signed
2262   //
2263   address generate_fill(BasicType t, bool aligned, const char *name) {
2264     __ align(CodeEntryAlignment);
2265     StubCodeMark mark(this, "StubRoutines", name);
2266     address start = __ pc();
2267 
2268     BLOCK_COMMENT("Entry:");
2269 
2270     const Register to        = c_rarg0;  // source array address
2271     const Register value     = c_rarg1;  // value
2272     const Register count     = c_rarg2;  // elements count
2273 
2274     const Register bz_base = r10;        // base for block_zero routine
2275     const Register cnt_words = r11;      // temp register
2276 
2277     __ enter();
2278 
2279     Label L_fill_elements, L_exit1;
2280 
2281     int shift = -1;
2282     switch (t) {
2283       case T_BYTE:
2284         shift = 0;
2285         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2286         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2287         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2288         __ br(Assembler::LO, L_fill_elements);
2289         break;
2290       case T_SHORT:
2291         shift = 1;
2292         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2293         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2294         __ br(Assembler::LO, L_fill_elements);
2295         break;
2296       case T_INT:
2297         shift = 2;
2298         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2299         __ br(Assembler::LO, L_fill_elements);
2300         break;
2301       default: ShouldNotReachHere();
2302     }
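    // At this point 'value' holds the fill pattern replicated to 32 bits
    // (e.g. a byte 0xAB has become 0xABABABAB); it is widened to 64 bits
    // with one more bfi just before the bulk fill below.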
2303 
2304     // Align source address at 8 bytes address boundary.
2305     Label L_skip_align1, L_skip_align2, L_skip_align4;
2306     if (!aligned) {
2307       switch (t) {
2308         case T_BYTE:
2309           // One byte misalignment happens only for byte arrays.
2310           __ tbz(to, 0, L_skip_align1);
2311           __ strb(value, Address(__ post(to, 1)));
2312           __ subw(count, count, 1);
2313           __ bind(L_skip_align1);
2314           // Fallthrough
2315         case T_SHORT:
2316           // Two bytes misalignment happens only for byte and short (char) arrays.
2317           __ tbz(to, 1, L_skip_align2);
2318           __ strh(value, Address(__ post(to, 2)));
2319           __ subw(count, count, 2 >> shift);
2320           __ bind(L_skip_align2);
2321           // Fallthrough
2322         case T_INT:
2323           // Align to 8 bytes, we know we are 4 byte aligned to start.
2324           __ tbz(to, 2, L_skip_align4);
2325           __ strw(value, Address(__ post(to, 4)));
2326           __ subw(count, count, 4 >> shift);
2327           __ bind(L_skip_align4);
2328           break;
2329         default: ShouldNotReachHere();
2330       }
2331     }
2332 
2333     //
2334     //  Fill large chunks
2335     //
2336     __ lsrw(cnt_words, count, 3 - shift); // number of words
2337     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2338     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
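    // cnt_words is the number of whole 8-byte words to fill; count has been
    // reduced to the tail elements that remain (less than 8 bytes' worth).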
2339     if (UseBlockZeroing) {
2340       Label non_block_zeroing, rest;
2341       // If the fill value is zero we can use the fast zero_words().
2342       __ cbnz(value, non_block_zeroing);
2343       __ mov(bz_base, to);
2344       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2345       __ zero_words(bz_base, cnt_words);
2346       __ b(rest);
2347       __ bind(non_block_zeroing);
2348       __ fill_words(to, cnt_words, value);
2349       __ bind(rest);
2350     } else {
2351       __ fill_words(to, cnt_words, value);
2352     }
2353 
2354     // Remaining count is less than 8 bytes. Fill it by a single store.
2355     // Note that the total length is no less than 8 bytes.
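    // This is safe because everything before the tail already holds the same
    // pattern: the doubleword stored at 'end - 8' rewrites filled bytes plus
    // the tail, never bytes outside the array.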
2356     if (t == T_BYTE || t == T_SHORT) {
2357       Label L_exit1;
2358       __ cbzw(count, L_exit1);
2359       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2360       __ str(value, Address(to, -8));    // overwrite some elements
2361       __ bind(L_exit1);
2362       __ leave();
2363       __ ret(lr);
2364     }
2365 
2366     // Handle copies less than 8 bytes.
2367     Label L_fill_2, L_fill_4, L_exit2;
2368     __ bind(L_fill_elements);
2369     switch (t) {
2370       case T_BYTE:
2371         __ tbz(count, 0, L_fill_2);
2372         __ strb(value, Address(__ post(to, 1)));
2373         __ bind(L_fill_2);
2374         __ tbz(count, 1, L_fill_4);
2375         __ strh(value, Address(__ post(to, 2)));
2376         __ bind(L_fill_4);
2377         __ tbz(count, 2, L_exit2);
2378         __ strw(value, Address(to));
2379         break;
2380       case T_SHORT:
2381         __ tbz(count, 0, L_fill_4);
2382         __ strh(value, Address(__ post(to, 2)));
2383         __ bind(L_fill_4);
2384         __ tbz(count, 1, L_exit2);
2385         __ strw(value, Address(to));
2386         break;
2387       case T_INT:
2388         __ cbzw(count, L_exit2);
2389         __ strw(value, Address(to));
2390         break;
2391       default: ShouldNotReachHere();
2392     }
2393     __ bind(L_exit2);
2394     __ leave();
2395     __ ret(lr);
2396     return start;
2397   }
2398 
2399   address generate_data_cache_writeback() {
2400     const Register line        = c_rarg0;  // address of line to write back
2401 
2402     __ align(CodeEntryAlignment);
2403 
2404     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2405 
2406     address start = __ pc();
2407     __ enter();
2408     __ cache_wb(Address(line, 0));
2409     __ leave();
2410     __ ret(lr);
2411 
2412     return start;
2413   }
2414 
2415   address generate_data_cache_writeback_sync() {
2416     const Register is_pre     = c_rarg0;  // pre or post sync
2417 
2418     __ align(CodeEntryAlignment);
2419 
2420     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2421 
2422     // pre wbsync is a no-op
2423   // post wbsync translates to a store-ordering barrier
2424 
2425     Label skip;
2426     address start = __ pc();
2427     __ enter();
2428     __ cbnz(is_pre, skip);
2429     __ cache_wbsync(false);
2430     __ bind(skip);
2431     __ leave();
2432     __ ret(lr);
2433 
2434     return start;
2435   }
2436 
2437   void generate_arraycopy_stubs() {
2438     address entry;
2439     address entry_jbyte_arraycopy;
2440     address entry_jshort_arraycopy;
2441     address entry_jint_arraycopy;
2442     address entry_oop_arraycopy;
2443     address entry_jlong_arraycopy;
2444     address entry_checkcast_arraycopy;
2445 
2446     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2447     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2448 
2449     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2450 
2451     //*** jbyte
2452     // Always need aligned and unaligned versions
2453     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2454                                                                                   "jbyte_disjoint_arraycopy");
2455     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2456                                                                                   &entry_jbyte_arraycopy,
2457                                                                                   "jbyte_arraycopy");
2458     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2459                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2460     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2461                                                                                   "arrayof_jbyte_arraycopy");
2462 
2463     //*** jshort
2464     // Always need aligned and unaligned versions
2465     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2466                                                                                     "jshort_disjoint_arraycopy");
2467     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2468                                                                                     &entry_jshort_arraycopy,
2469                                                                                     "jshort_arraycopy");
2470     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2471                                                                                     "arrayof_jshort_disjoint_arraycopy");
2472     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2473                                                                                     "arrayof_jshort_arraycopy");
2474 
2475     //*** jint
2476     // Aligned versions
2477     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2478                                                                                 "arrayof_jint_disjoint_arraycopy");
2479     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2480                                                                                 "arrayof_jint_arraycopy");
2481     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2482     // entry_jint_arraycopy always points to the unaligned version
2483     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2484                                                                                 "jint_disjoint_arraycopy");
2485     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2486                                                                                 &entry_jint_arraycopy,
2487                                                                                 "jint_arraycopy");
2488 
2489     //*** jlong
2490     // It is always aligned
2491     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2492                                                                                   "arrayof_jlong_disjoint_arraycopy");
2493     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2494                                                                                   "arrayof_jlong_arraycopy");
2495     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2496     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
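    // jlong elements are always 8-byte aligned, so the "unaligned" jlong entry
    // points simply reuse the aligned (arrayof_) stubs generated above.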
2497 
2498     //*** oops
2499     {
2500       // With compressed oops we need unaligned versions; notice that
2501       // we overwrite entry_oop_arraycopy.
2502       bool aligned = !UseCompressedOops;
2503 
2504       StubRoutines::_arrayof_oop_disjoint_arraycopy
2505         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2506                                      /*dest_uninitialized*/false);
2507       StubRoutines::_arrayof_oop_arraycopy
2508         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2509                                      /*dest_uninitialized*/false);
2510       // Aligned versions without pre-barriers
2511       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2512         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2513                                      /*dest_uninitialized*/true);
2514       StubRoutines::_arrayof_oop_arraycopy_uninit
2515         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2516                                      /*dest_uninitialized*/true);
2517     }
2518 
2519     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2520     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2521     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2522     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2523 
2524     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2525     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2526                                                                         /*dest_uninitialized*/true);
2527 
2528     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2529                                                               entry_jbyte_arraycopy,
2530                                                               entry_jshort_arraycopy,
2531                                                               entry_jint_arraycopy,
2532                                                               entry_jlong_arraycopy);
2533 
2534     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2535                                                                entry_jbyte_arraycopy,
2536                                                                entry_jshort_arraycopy,
2537                                                                entry_jint_arraycopy,
2538                                                                entry_oop_arraycopy,
2539                                                                entry_jlong_arraycopy,
2540                                                                entry_checkcast_arraycopy);
2541 
2542     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2543     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2544     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2545     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2546     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2547     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2548   }
2549 
2550   void generate_math_stubs() { Unimplemented(); }
2551 
2552   // Arguments:
2553   //
2554   // Inputs:
2555   //   c_rarg0   - source byte array address
2556   //   c_rarg1   - destination byte array address
2557   //   c_rarg2   - K (key) in little endian int array
2558   //
2559   address generate_aescrypt_encryptBlock() {
2560     __ align(CodeEntryAlignment);
2561     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2562 
2563     const Register from        = c_rarg0;  // source array address
2564     const Register to          = c_rarg1;  // destination array address
2565     const Register key         = c_rarg2;  // key array address
2566     const Register keylen      = rscratch1;
2567 
2568     address start = __ pc();
2569     __ enter();
2570 
2571     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
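    // 'key' points at the first element of the int[] key array, so adding
    // (length_offset - base_offset) reaches back to the arrayOop length field:
    // keylen is the expanded key size in 32-bit words.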
2572 
2573     __ aesenc_loadkeys(key, keylen);
2574     __ aesecb_encrypt(from, to, keylen);
2575 
2576     __ mov(r0, 0);
2577 
2578     __ leave();
2579     __ ret(lr);
2580 
2581     return start;
2582   }
2583 
2584   // Arguments:
2585   //
2586   // Inputs:
2587   //   c_rarg0   - source byte array address
2588   //   c_rarg1   - destination byte array address
2589   //   c_rarg2   - K (key) in little endian int array
2590   //
2591   address generate_aescrypt_decryptBlock() {
2592     assert(UseAES, "need AES cryptographic extension support");
2593     __ align(CodeEntryAlignment);
2594     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2595     Label L_doLast;
2596 
2597     const Register from        = c_rarg0;  // source array address
2598     const Register to          = c_rarg1;  // destination array address
2599     const Register key         = c_rarg2;  // key array address
2600     const Register keylen      = rscratch1;
2601 
2602     address start = __ pc();
2603     __ enter(); // required for proper stackwalking of RuntimeStub frame
2604 
2605     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2606 
2607     __ aesecb_decrypt(from, to, key, keylen);
2608 
2609     __ mov(r0, 0);
2610 
2611     __ leave();
2612     __ ret(lr);
2613 
2614     return start;
2615   }
2616 
2617   // Arguments:
2618   //
2619   // Inputs:
2620   //   c_rarg0   - source byte array address
2621   //   c_rarg1   - destination byte array address
2622   //   c_rarg2   - K (key) in little endian int array
2623   //   c_rarg3   - r vector byte array address
2624   //   c_rarg4   - input length
2625   //
2626   // Output:
2627   //   r0        - input length
2628   //
2629   address generate_cipherBlockChaining_encryptAESCrypt() {
2630     assert(UseAES, "need AES cryptographic extension support");
2631     __ align(CodeEntryAlignment);
2632     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2633 
2634     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2635 
2636     const Register from        = c_rarg0;  // source array address
2637     const Register to          = c_rarg1;  // destination array address
2638     const Register key         = c_rarg2;  // key array address
2639     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address,
2640                                            // and left holding the last ciphertext block produced
2641     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2642     const Register keylen      = rscratch1;
2643 
2644     address start = __ pc();
2645 
2646       __ enter();
2647 
2648       __ movw(rscratch2, len_reg);
2649 
2650       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2651 
2652       __ ld1(v0, __ T16B, rvec);
2653 
2654       __ cmpw(keylen, 52);
2655       __ br(Assembler::CC, L_loadkeys_44);
2656       __ br(Assembler::EQ, L_loadkeys_52);
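      // keylen is 44, 52 or 60 words for AES-128, AES-192 and AES-256
      // respectively; falling through loads the two extra round keys that
      // only AES-256 uses, then L_loadkeys_52 / L_loadkeys_44 load the rest.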
2657 
2658       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2659       __ rev32(v17, __ T16B, v17);
2660       __ rev32(v18, __ T16B, v18);
2661     __ BIND(L_loadkeys_52);
2662       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2663       __ rev32(v19, __ T16B, v19);
2664       __ rev32(v20, __ T16B, v20);
2665     __ BIND(L_loadkeys_44);
2666       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2667       __ rev32(v21, __ T16B, v21);
2668       __ rev32(v22, __ T16B, v22);
2669       __ rev32(v23, __ T16B, v23);
2670       __ rev32(v24, __ T16B, v24);
2671       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2672       __ rev32(v25, __ T16B, v25);
2673       __ rev32(v26, __ T16B, v26);
2674       __ rev32(v27, __ T16B, v27);
2675       __ rev32(v28, __ T16B, v28);
2676       __ ld1(v29, v30, v31, __ T16B, key);
2677       __ rev32(v29, __ T16B, v29);
2678       __ rev32(v30, __ T16B, v30);
2679       __ rev32(v31, __ T16B, v31);
2680 
2681     __ BIND(L_aes_loop);
2682       __ ld1(v1, __ T16B, __ post(from, 16));
2683       __ eor(v0, __ T16B, v0, v1);
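      // CBC: xor the plaintext block with the previous ciphertext (or the IV
      // for the first block) before encrypting; v0 carries the chaining value.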
2684 
2685       __ br(Assembler::CC, L_rounds_44);
2686       __ br(Assembler::EQ, L_rounds_52);
2687 
2688       __ aese(v0, v17); __ aesmc(v0, v0);
2689       __ aese(v0, v18); __ aesmc(v0, v0);
2690     __ BIND(L_rounds_52);
2691       __ aese(v0, v19); __ aesmc(v0, v0);
2692       __ aese(v0, v20); __ aesmc(v0, v0);
2693     __ BIND(L_rounds_44);
2694       __ aese(v0, v21); __ aesmc(v0, v0);
2695       __ aese(v0, v22); __ aesmc(v0, v0);
2696       __ aese(v0, v23); __ aesmc(v0, v0);
2697       __ aese(v0, v24); __ aesmc(v0, v0);
2698       __ aese(v0, v25); __ aesmc(v0, v0);
2699       __ aese(v0, v26); __ aesmc(v0, v0);
2700       __ aese(v0, v27); __ aesmc(v0, v0);
2701       __ aese(v0, v28); __ aesmc(v0, v0);
2702       __ aese(v0, v29); __ aesmc(v0, v0);
2703       __ aese(v0, v30);
2704       __ eor(v0, __ T16B, v0, v31);
2705 
2706       __ st1(v0, __ T16B, __ post(to, 16));
2707 
2708       __ subw(len_reg, len_reg, 16);
2709       __ cbnzw(len_reg, L_aes_loop);
2710 
2711       __ st1(v0, __ T16B, rvec);
2712 
2713       __ mov(r0, rscratch2);
2714 
2715       __ leave();
2716       __ ret(lr);
2717 
2718       return start;
2719   }
2720 
2721   // Arguments:
2722   //
2723   // Inputs:
2724   //   c_rarg0   - source byte array address
2725   //   c_rarg1   - destination byte array address
2726   //   c_rarg2   - K (key) in little endian int array
2727   //   c_rarg3   - r vector byte array address
2728   //   c_rarg4   - input length
2729   //
2730   // Output:
2731   //   r0        - input length
2732   //
2733   address generate_cipherBlockChaining_decryptAESCrypt() {
2734     assert(UseAES, "need AES cryptographic extension support");
2735     __ align(CodeEntryAlignment);
2736     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2737 
2738     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2739 
2740     const Register from        = c_rarg0;  // source array address
2741     const Register to          = c_rarg1;  // destination array address
2742     const Register key         = c_rarg2;  // key array address
2743     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address,
2744                                            // and left holding the last ciphertext block processed
2745     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2746     const Register keylen      = rscratch1;
2747 
2748     address start = __ pc();
2749 
2750       __ enter();
2751 
2752       __ movw(rscratch2, len_reg);
2753 
2754       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2755 
2756       __ ld1(v2, __ T16B, rvec);
2757 
2758       __ ld1(v31, __ T16B, __ post(key, 16));
2759       __ rev32(v31, __ T16B, v31);
2760 
2761       __ cmpw(keylen, 52);
2762       __ br(Assembler::CC, L_loadkeys_44);
2763       __ br(Assembler::EQ, L_loadkeys_52);
2764 
2765       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2766       __ rev32(v17, __ T16B, v17);
2767       __ rev32(v18, __ T16B, v18);
2768     __ BIND(L_loadkeys_52);
2769       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2770       __ rev32(v19, __ T16B, v19);
2771       __ rev32(v20, __ T16B, v20);
2772     __ BIND(L_loadkeys_44);
2773       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2774       __ rev32(v21, __ T16B, v21);
2775       __ rev32(v22, __ T16B, v22);
2776       __ rev32(v23, __ T16B, v23);
2777       __ rev32(v24, __ T16B, v24);
2778       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2779       __ rev32(v25, __ T16B, v25);
2780       __ rev32(v26, __ T16B, v26);
2781       __ rev32(v27, __ T16B, v27);
2782       __ rev32(v28, __ T16B, v28);
2783       __ ld1(v29, v30, __ T16B, key);
2784       __ rev32(v29, __ T16B, v29);
2785       __ rev32(v30, __ T16B, v30);
2786 
2787     __ BIND(L_aes_loop);
2788       __ ld1(v0, __ T16B, __ post(from, 16));
2789       __ orr(v1, __ T16B, v0, v0);
2790 
2791       __ br(Assembler::CC, L_rounds_44);
2792       __ br(Assembler::EQ, L_rounds_52);
2793 
2794       __ aesd(v0, v17); __ aesimc(v0, v0);
2795       __ aesd(v0, v18); __ aesimc(v0, v0);
2796     __ BIND(L_rounds_52);
2797       __ aesd(v0, v19); __ aesimc(v0, v0);
2798       __ aesd(v0, v20); __ aesimc(v0, v0);
2799     __ BIND(L_rounds_44);
2800       __ aesd(v0, v21); __ aesimc(v0, v0);
2801       __ aesd(v0, v22); __ aesimc(v0, v0);
2802       __ aesd(v0, v23); __ aesimc(v0, v0);
2803       __ aesd(v0, v24); __ aesimc(v0, v0);
2804       __ aesd(v0, v25); __ aesimc(v0, v0);
2805       __ aesd(v0, v26); __ aesimc(v0, v0);
2806       __ aesd(v0, v27); __ aesimc(v0, v0);
2807       __ aesd(v0, v28); __ aesimc(v0, v0);
2808       __ aesd(v0, v29); __ aesimc(v0, v0);
2809       __ aesd(v0, v30);
2810       __ eor(v0, __ T16B, v0, v31);
2811       __ eor(v0, __ T16B, v0, v2);
2812 
2813       __ st1(v0, __ T16B, __ post(to, 16));
2814       __ orr(v2, __ T16B, v1, v1);
2815 
2816       __ subw(len_reg, len_reg, 16);
2817       __ cbnzw(len_reg, L_aes_loop);
2818 
2819       __ st1(v2, __ T16B, rvec);
2820 
2821       __ mov(r0, rscratch2);
2822 
2823       __ leave();
2824       __ ret(lr);
2825 
2826     return start;
2827   }
2828 
2829   // CTR AES crypt.
2830   // Arguments:
2831   //
2832   // Inputs:
2833   //   c_rarg0   - source byte array address
2834   //   c_rarg1   - destination byte array address
2835   //   c_rarg2   - K (key) in little endian int array
2836   //   c_rarg3   - counter vector byte array address
2837   //   c_rarg4   - input length
2838   //   c_rarg5   - saved encryptedCounter start
2839   //   c_rarg6   - saved used length
2840   //
2841   // Output:
2842   //   r0       - input length
2843   //
2844   address generate_counterMode_AESCrypt() {
2845     const Register in = c_rarg0;
2846     const Register out = c_rarg1;
2847     const Register key = c_rarg2;
2848     const Register counter = c_rarg3;
2849     const Register saved_len = c_rarg4, len = r10;
2850     const Register saved_encrypted_ctr = c_rarg5;
2851     const Register used_ptr = c_rarg6, used = r12;
2852 
2853     const Register offset = r7;
2854     const Register keylen = r11;
2855 
2856     const unsigned char block_size = 16;
2857     const int bulk_width = 4;
2858     // NB: bulk_width can be 4 or 8. 8 gives slightly better
2859     // performance with larger data sizes, but it also means that the
2860     // fast path isn't used until there are at least 8 blocks, and up
2861     // to 127 bytes of data will be processed on the slow path. For
2862     // that reason, and also so as not to blow away too much icache, 4
2863     // blocks seems like a sensible compromise.
2864 
2865     // Algorithm:
2866     //
2867     //    if (len == 0) {
2868     //        goto DONE;
2869     //    }
2870     //    int result = len;
2871     //    do {
2872     //        if (used >= blockSize) {
2873     //            if (len >= bulk_width * blockSize) {
2874     //                CTR_large_block();
2875     //                if (len == 0)
2876     //                    goto DONE;
2877     //            }
2878     //            for (;;) {
2879     //                16ByteVector v0 = counter;
2880     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2881     //                used = 0;
2882     //                if (len < blockSize)
2883     //                    break;    /* goto NEXT */
2884     //                16ByteVector v1 = load16Bytes(in, offset);
2885     //                v1 = v1 ^ encryptedCounter;
2886     //                store16Bytes(out, offset);
2887     //                used = blockSize;
2888     //                offset += blockSize;
2889     //                len -= blockSize;
2890     //                if (len == 0)
2891     //                    goto DONE;
2892     //            }
2893     //        }
2894     //      NEXT:
2895     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2896     //        len--;
2897     //    } while (len != 0);
2898     //  DONE:
2899     //    return result;
2900     //
2901     // CTR_large_block()
2902     //    Wide bulk encryption of whole blocks.
2903 
2904     __ align(CodeEntryAlignment);
2905     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2906     const address start = __ pc();
2907     __ enter();
2908 
2909     Label DONE, CTR_large_block, large_block_return;
2910     __ ldrw(used, Address(used_ptr));
2911     __ cbzw(saved_len, DONE);
2912 
2913     __ mov(len, saved_len);
2914     __ mov(offset, 0);
2915 
2916     // Compute #rounds for AES based on the length of the key array
2917     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2918 
2919     __ aesenc_loadkeys(key, keylen);
2920 
2921     {
2922       Label L_CTR_loop, NEXT;
2923 
2924       __ bind(L_CTR_loop);
2925 
2926       __ cmp(used, block_size);
2927       __ br(__ LO, NEXT);
2928 
2929       // Maybe we have a lot of data
2930       __ subsw(rscratch1, len, bulk_width * block_size);
2931       __ br(__ HS, CTR_large_block);
2932       __ BIND(large_block_return);
2933       __ cbzw(len, DONE);
2934 
2935       // Setup the counter
2936       __ movi(v4, __ T4S, 0);
2937       __ movi(v5, __ T4S, 1);
2938       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
2939 
2940       __ ld1(v0, __ T16B, counter); // Load the counter into v0
2941       __ rev32(v16, __ T16B, v0);
2942       __ addv(v16, __ T4S, v16, v4);
2943       __ rev32(v16, __ T16B, v16);
2944       __ st1(v16, __ T16B, counter); // Save the incremented counter back
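           // Roughly, in C, the rev32/addv/rev32 sequence above does this
           // (a sketch; the counter is stored big-endian and only its last
           // 32-bit word is incremented, wrapping modulo 2^32; little-endian
           // host assumed, __builtin_bswap32 is the GCC/Clang intrinsic):
           //
           //   uint32_t lo;
           //   memcpy(&lo, counter + 12, 4);
           //   lo = __builtin_bswap32(__builtin_bswap32(lo) + 1);
           //   memcpy(counter + 12, &lo, 4);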
2945 
2946       {
2947         // We have fewer than bulk_width blocks of data left. Encrypt
2948         // them one by one until there is less than a full block
2949         // remaining, being careful to save both the encrypted counter
2950         // and the counter.
2951 
2952         Label inner_loop;
2953         __ bind(inner_loop);
2954         // Counter to encrypt is in v0
2955         __ aesecb_encrypt(noreg, noreg, keylen);
2956         __ st1(v0, __ T16B, saved_encrypted_ctr);
2957 
2958         // Do we have a remaining full block?
2959 
2960         __ mov(used, 0);
2961         __ cmp(len, block_size);
2962         __ br(__ LO, NEXT);
2963 
2964         // Yes, we have a full block
2965         __ ldrq(v1, Address(in, offset));
2966         __ eor(v1, __ T16B, v1, v0);
2967         __ strq(v1, Address(out, offset));
2968         __ mov(used, block_size);
2969         __ add(offset, offset, block_size);
2970 
2971         __ subw(len, len, block_size);
2972         __ cbzw(len, DONE);
2973 
2974         // Increment the counter, store it back
2975         __ orr(v0, __ T16B, v16, v16);
2976         __ rev32(v16, __ T16B, v16);
2977         __ addv(v16, __ T4S, v16, v4);
2978         __ rev32(v16, __ T16B, v16);
2979         __ st1(v16, __ T16B, counter); // Save the incremented counter back
2980 
2981         __ b(inner_loop);
2982       }
2983 
2984       __ BIND(NEXT);
2985 
2986       // Encrypt a single byte, and loop.
2987       // We expect this to be a rare event.
2988       __ ldrb(rscratch1, Address(in, offset));
2989       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
2990       __ eor(rscratch1, rscratch1, rscratch2);
2991       __ strb(rscratch1, Address(out, offset));
2992       __ add(offset, offset, 1);
2993       __ add(used, used, 1);
2994       __ subw(len, len, 1);
2995       __ cbnzw(len, L_CTR_loop);
2996     }
2997 
2998     __ bind(DONE);
2999     __ strw(used, Address(used_ptr));
3000     __ mov(r0, saved_len);
3001 
3002     __ leave(); // required for proper stackwalking of RuntimeStub frame
3003     __ ret(lr);
3004 
3005     // Bulk encryption
3006 
3007     __ BIND(CTR_large_block);
3008     assert(bulk_width == 4 || bulk_width == 8, "must be");
3009 
3010     if (bulk_width == 8) {
3011       __ sub(sp, sp, 4 * 16);
3012       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3013     }
3014     __ sub(sp, sp, 4 * 16);
3015     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3016     RegSet saved_regs = (RegSet::of(in, out, offset)
3017                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3018     __ push(saved_regs, sp);
3019     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3020     __ add(in, in, offset);
3021     __ add(out, out, offset);
3022 
3023     // Keys should already be loaded into the correct registers
3024 
3025     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3026     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3027 
3028     // AES/CTR loop
3029     {
3030       Label L_CTR_loop;
3031       __ BIND(L_CTR_loop);
3032 
3033       // Setup the counters
3034       __ movi(v8, __ T4S, 0);
3035       __ movi(v9, __ T4S, 1);
3036       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3037 
3038       for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
3039         __ rev32(f, __ T16B, v16);
3040         __ addv(v16, __ T4S, v16, v8);
3041       }
3042 
3043       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3044 
3045       // Encrypt the counters
3046       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3047 
3048       if (bulk_width == 8) {
3049         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3050       }
3051 
3052       // XOR the encrypted counters with the inputs
3053       for (int i = 0; i < bulk_width; i++) {
3054         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3055       }
3056 
3057       // Write the encrypted data
3058       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3059       if (bulk_width == 8) {
3060         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3061       }
3062 
3063       __ subw(len, len, 16 * bulk_width);
3064       __ cbnzw(len, L_CTR_loop);
3065     }
3066 
3067     // Save the counter back where it goes
3068     __ rev32(v16, __ T16B, v16);
3069     __ st1(v16, __ T16B, counter);
3070 
3071     __ pop(saved_regs, sp);
3072 
3073     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3074     if (bulk_width == 8) {
3075       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3076     }
3077 
3078     __ andr(rscratch1, len, -16 * bulk_width);
3079     __ sub(len, len, rscratch1);
3080     __ add(offset, offset, rscratch1);
3081     __ mov(used, 16);
3082     __ strw(used, Address(used_ptr));
3083     __ b(large_block_return);
3084 
3085     return start;
3086   }
3087 
3088   // Vector AES Galois Counter Mode implementation. Parameters:
3089   //
3090   // in = c_rarg0
3091   // len = c_rarg1
3092   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3093   // out = c_rarg3
3094   // key = c_rarg4
3095   // state = c_rarg5 - GHASH.state
3096   // subkeyHtbl = c_rarg6 - powers of H
3097   // subkeyHtbl_48_entries = c_rarg7 (not used)
3098   // counter = [sp, #0] pointer to 16 bytes of CTR
3099   // return - number of processed bytes
3100   address generate_galoisCounterMode_AESCrypt() {
3101     address ghash_polynomial = __ pc();
3102     __ emit_int64(0x87);  // The low-order bits of the field
3103                           // polynomial (i.e. p = z^7+z^2+z+1)
3104                           // repeated in the low and high parts of a
3105                           // 128-bit vector
3106     __ emit_int64(0x87);
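         // GHASH authenticates the ciphertext by repeated multiplication in
         // GF(2^128) with reduction polynomial x^128 + x^7 + x^2 + x + 1; the
         // 0x87 constant above encodes the low-order terms of that polynomial
         // for the reduction step. Per 16-byte ciphertext block C[i] the update
         // is, roughly (a sketch, '*' being carry-less GF(2^128) multiplication;
         // H is the hash subkey whose powers arrive in subkeyHtbl):
         //
         //   state = (state ^ C[i]) * H;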
3107 
3108     __ align(CodeEntryAlignment);
3109     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3110     address start = __ pc();
3111     __ enter();
3112 
3113     const Register in = c_rarg0;
3114     const Register len = c_rarg1;
3115     const Register ct = c_rarg2;
3116     const Register out = c_rarg3;
3117     // and updated with the incremented counter in the end
3118 
3119     const Register key = c_rarg4;
3120     const Register state = c_rarg5;
3121 
3122     const Register subkeyHtbl = c_rarg6;
3123 
3124     // Pointer to CTR is passed on the stack before the (fp, lr) pair.
3125     const Address counter_mem(sp, 2 * wordSize);
3126     const Register counter = c_rarg7;
3127     __ ldr(counter, counter_mem);
3128 
3129     const Register keylen = r10;
3130     // Save the callee-saved SIMD registers (v8..v15) before clobbering them
3131     __ sub(sp, sp, 4 * 16);
3132     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3133     __ sub(sp, sp, 4 * 16);
3134     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3135 
3136     // __ andr(len, len, -512);
3137     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3138     __ str(len, __ pre(sp, -2 * wordSize));
3139 
3140     Label DONE;
3141     __ cbz(len, DONE);
3142 
3143     // Compute #rounds for AES based on the length of the key array
3144     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3145 
3146     __ aesenc_loadkeys(key, keylen);
3147     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3148     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3149 
3150     // AES/CTR loop
3151     {
3152       Label L_CTR_loop;
3153       __ BIND(L_CTR_loop);
3154 
3155       // Setup the counters
3156       __ movi(v8, __ T4S, 0);
3157       __ movi(v9, __ T4S, 1);
3158       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3159       for (FloatRegister f = v0; f < v8; f++) {
3160         __ rev32(f, __ T16B, v16);
3161         __ addv(v16, __ T4S, v16, v8);
3162       }
3163 
3164       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3165 
3166       // Encrypt the counters
3167       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3168 
3169       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3170 
3171       // XOR the encrypted counters with the inputs
3172       for (int i = 0; i < 8; i++) {
3173         __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
3174       }
3175       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3176       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3177 
3178       __ subw(len, len, 16 * 8);
3179       __ cbnzw(len, L_CTR_loop);
3180     }
3181 
3182     __ rev32(v16, __ T16B, v16);
3183     __ st1(v16, __ T16B, counter);
3184 
3185     __ ldr(len, Address(sp));
3186     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3187 
3188     // GHASH/CTR loop
3189     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3190                                 len, /*unrolls*/4);
3191 
3192 #ifdef ASSERT
3193     { Label L;
3194       __ cmp(len, (unsigned char)0);
3195       __ br(Assembler::EQ, L);
3196       __ stop("stubGenerator: abort");
3197       __ bind(L);
3198     }
3199 #endif
3200 
3201     __ bind(DONE);
3202     // Return the number of bytes processed
3203     __ ldr(r0, __ post(sp, 2 * wordSize));
3204 
3205     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3206     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3207 
3208     __ leave(); // required for proper stackwalking of RuntimeStub frame
3209     __ ret(lr);
3210     return start;
3211   }
3212 
3213   // Arguments:
3214   //
3215   // Inputs:
3216   //   c_rarg0   - byte[]  source+offset
3217   //   c_rarg1   - int[]   SHA.state
3218   //   c_rarg2   - int     offset
3219   //   c_rarg3   - int     limit
3220   //
3221   address generate_sha1_implCompress(bool multi_block, const char *name) {
3222     __ align(CodeEntryAlignment);
3223     StubCodeMark mark(this, "StubRoutines", name);
3224     address start = __ pc();
3225 
3226     Register buf   = c_rarg0;
3227     Register state = c_rarg1;
3228     Register ofs   = c_rarg2;
3229     Register limit = c_rarg3;
3230 
3231     Label keys;
3232     Label sha1_loop;
3233 
3234     // load the keys into v0..v3
3235     __ adr(rscratch1, keys);
3236     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3237     // load 5 words state into v6, v7
3238     __ ldrq(v6, Address(state, 0));
3239     __ ldrs(v7, Address(state, 16));
3240 
3241 
3242     __ BIND(sha1_loop);
3243     // load 64 bytes of data into v16..v19
3244     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3245     __ rev32(v16, __ T16B, v16);
3246     __ rev32(v17, __ T16B, v17);
3247     __ rev32(v18, __ T16B, v18);
3248     __ rev32(v19, __ T16B, v19);
3249 
3250     // do the sha1
3251     __ addv(v4, __ T4S, v16, v0);
3252     __ orr(v20, __ T16B, v6, v6);
3253 
3254     FloatRegister d0 = v16;
3255     FloatRegister d1 = v17;
3256     FloatRegister d2 = v18;
3257     FloatRegister d3 = v19;
3258 
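         // Each iteration of the loop below retires four SHA-1 rounds, so 20
         // iterations cover all 80. The sha1c/sha1p/sha1m selection mirrors
         // the standard round functions (a reference sketch, not stub code):
         //
         //   Ch(b,c,d)     = (b & c) | (~b & d);            // rounds  0..19 -> sha1c
         //   Parity(b,c,d) = b ^ c ^ d;                     // rounds 20..39, 60..79 -> sha1p
         //   Maj(b,c,d)    = (b & c) | (b & d) | (c & d);   // rounds 40..59 -> sha1m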
3259     for (int round = 0; round < 20; round++) {
3260       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3261       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3262       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3263       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3264       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3265 
3266       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3267       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3268       __ sha1h(tmp2, __ T4S, v20);
3269       if (round < 5)
3270         __ sha1c(v20, __ T4S, tmp3, tmp4);
3271       else if (round < 10 || round >= 15)
3272         __ sha1p(v20, __ T4S, tmp3, tmp4);
3273       else
3274         __ sha1m(v20, __ T4S, tmp3, tmp4);
3275       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3276 
3277       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3278     }
3279 
3280     __ addv(v7, __ T2S, v7, v21);
3281     __ addv(v6, __ T4S, v6, v20);
3282 
3283     if (multi_block) {
3284       __ add(ofs, ofs, 64);
3285       __ cmp(ofs, limit);
3286       __ br(Assembler::LE, sha1_loop);
3287       __ mov(c_rarg0, ofs); // return ofs
3288     }
3289 
3290     __ strq(v6, Address(state, 0));
3291     __ strs(v7, Address(state, 16));
3292 
3293     __ ret(lr);
3294 
3295     __ bind(keys);
3296     __ emit_int32(0x5a827999);
3297     __ emit_int32(0x6ed9eba1);
3298     __ emit_int32(0x8f1bbcdc);
3299     __ emit_int32(0xca62c1d6);
3300 
3301     return start;
3302   }
3303 
3304 
3305   // Arguments:
3306   //
3307   // Inputs:
3308   //   c_rarg0   - byte[]  source+offset
3309   //   c_rarg1   - int[]   SHA.state
3310   //   c_rarg2   - int     offset
3311   //   c_rarg3   - int     limit
3312   //
3313   address generate_sha256_implCompress(bool multi_block, const char *name) {
3314     static const uint32_t round_consts[64] = {
3315       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3316       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3317       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3318       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3319       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3320       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3321       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3322       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3323       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3324       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3325       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3326       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3327       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3328       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3329       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3330       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3331     };
3332     __ align(CodeEntryAlignment);
3333     StubCodeMark mark(this, "StubRoutines", name);
3334     address start = __ pc();
3335 
3336     Register buf   = c_rarg0;
3337     Register state = c_rarg1;
3338     Register ofs   = c_rarg2;
3339     Register limit = c_rarg3;
3340 
3341     Label sha1_loop;
3342 
3343     __ stpd(v8, v9, __ pre(sp, -32));
3344     __ stpd(v10, v11, Address(sp, 16));
3345 
3346 // dga == v0
3347 // dgb == v1
3348 // dg0 == v2
3349 // dg1 == v3
3350 // dg2 == v4
3351 // t0 == v6
3352 // t1 == v7
3353 
3354     // load 16 keys to v16..v31
3355     __ lea(rscratch1, ExternalAddress((address)round_consts));
3356     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3357     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3358     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3359     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3360 
3361     // load 8 words (256 bits) state
3362     __ ldpq(v0, v1, state);
3363 
3364     __ BIND(sha1_loop);
3365     // load 64 bytes of data into v8..v11
3366     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3367     __ rev32(v8, __ T16B, v8);
3368     __ rev32(v9, __ T16B, v9);
3369     __ rev32(v10, __ T16B, v10);
3370     __ rev32(v11, __ T16B, v11);
3371 
3372     __ addv(v6, __ T4S, v8, v16);
3373     __ orr(v2, __ T16B, v0, v0);
3374     __ orr(v3, __ T16B, v1, v1);
3375 
3376     FloatRegister d0 = v8;
3377     FloatRegister d1 = v9;
3378     FloatRegister d2 = v10;
3379     FloatRegister d3 = v11;
3380 
3381 
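         // Each iteration of the loop below retires four SHA-256 rounds (16
         // iterations for all 64); sha256su0/sha256su1 extend the message
         // schedule four words at a time. The scalar schedule, for reference
         // (a sketch, not the stub's code):
         //
         //   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16];
         //   // sigma0(x) = ror(x, 7) ^ ror(x, 18) ^ (x >> 3)
         //   // sigma1(x) = ror(x, 17) ^ ror(x, 19) ^ (x >> 10)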
3382     for (int round = 0; round < 16; round++) {
3383       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3384       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3385       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3386       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3387 
3388       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3389        __ orr(v4, __ T16B, v2, v2);
3390       if (round < 15)
3391         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3392       __ sha256h(v2, __ T4S, v3, tmp2);
3393       __ sha256h2(v3, __ T4S, v4, tmp2);
3394       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3395 
3396       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3397     }
3398 
3399     __ addv(v0, __ T4S, v0, v2);
3400     __ addv(v1, __ T4S, v1, v3);
3401 
3402     if (multi_block) {
3403       __ add(ofs, ofs, 64);
3404       __ cmp(ofs, limit);
3405       __ br(Assembler::LE, sha1_loop);
3406       __ mov(c_rarg0, ofs); // return ofs
3407     }
3408 
3409     __ ldpd(v10, v11, Address(sp, 16));
3410     __ ldpd(v8, v9, __ post(sp, 32));
3411 
3412     __ stpq(v0, v1, state);
3413 
3414     __ ret(lr);
3415 
3416     return start;
3417   }
3418 
3419   // Arguments:
3420   //
3421   // Inputs:
3422   //   c_rarg0   - byte[]  source+offset
3423   //   c_rarg1   - int[]   SHA.state
3424   //   c_rarg2   - int     offset
3425   //   c_rarg3   - int     limit
3426   //
3427   address generate_sha512_implCompress(bool multi_block, const char *name) {
3428     static const uint64_t round_consts[80] = {
3429       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3430       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3431       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3432       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3433       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3434       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3435       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3436       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3437       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3438       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3439       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3440       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3441       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3442       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3443       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3444       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3445       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3446       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3447       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3448       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3449       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3450       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3451       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3452       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3453       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3454       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3455       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3456     };
3457 
3458     // Double rounds for sha512.
3459     #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
3460       if (dr < 36)                                                                   \
3461         __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16));                              \
3462       __ addv(v5, __ T2D, v##rc0, v##in0);                                           \
3463       __ ext(v6, __ T16B, v##i2, v##i3, 8);                                          \
3464       __ ext(v5, __ T16B, v5, v5, 8);                                                \
3465       __ ext(v7, __ T16B, v##i1, v##i2, 8);                                          \
3466       __ addv(v##i3, __ T2D, v##i3, v5);                                             \
3467       if (dr < 32) {                                                                 \
3468         __ ext(v5, __ T16B, v##in3, v##in4, 8);                                      \
3469         __ sha512su0(v##in0, __ T2D, v##in1);                                        \
3470       }                                                                              \
3471       __ sha512h(v##i3, __ T2D, v6, v7);                                             \
3472       if (dr < 32)                                                                   \
3473         __ sha512su1(v##in0, __ T2D, v##in2, v5);                                    \
3474       __ addv(v##i4, __ T2D, v##i1, v##i3);                                          \
3475       __ sha512h2(v##i3, __ T2D, v##i1, v##i0);                                      \
3476 
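         // Each sha512_dround invocation retires two SHA-512 rounds with
         // SHA512H/SHA512H2; the 40 invocations below cover all 80 rounds,
         // and the first 32 also extend the message schedule via
         // sha512su0/sha512su1. The scalar schedule, for reference (a sketch,
         // not the stub's code):
         //
         //   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16];
         //   // sigma0(x) = ror(x, 1) ^ ror(x, 8) ^ (x >> 7)
         //   // sigma1(x) = ror(x, 19) ^ ror(x, 61) ^ (x >> 6)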
3477     __ align(CodeEntryAlignment);
3478     StubCodeMark mark(this, "StubRoutines", name);
3479     address start = __ pc();
3480 
3481     Register buf   = c_rarg0;
3482     Register state = c_rarg1;
3483     Register ofs   = c_rarg2;
3484     Register limit = c_rarg3;
3485 
3486     __ stpd(v8, v9, __ pre(sp, -64));
3487     __ stpd(v10, v11, Address(sp, 16));
3488     __ stpd(v12, v13, Address(sp, 32));
3489     __ stpd(v14, v15, Address(sp, 48));
3490 
3491     Label sha512_loop;
3492 
3493     // load state
3494     __ ld1(v8, v9, v10, v11, __ T2D, state);
3495 
3496     // load first 4 round constants
3497     __ lea(rscratch1, ExternalAddress((address)round_consts));
3498     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3499 
3500     __ BIND(sha512_loop);
3501     // load 128B of data into v12..v19
3502     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3503     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3504     __ rev64(v12, __ T16B, v12);
3505     __ rev64(v13, __ T16B, v13);
3506     __ rev64(v14, __ T16B, v14);
3507     __ rev64(v15, __ T16B, v15);
3508     __ rev64(v16, __ T16B, v16);
3509     __ rev64(v17, __ T16B, v17);
3510     __ rev64(v18, __ T16B, v18);
3511     __ rev64(v19, __ T16B, v19);
3512 
3513     __ mov(rscratch2, rscratch1);
3514 
3515     __ mov(v0, __ T16B, v8);
3516     __ mov(v1, __ T16B, v9);
3517     __ mov(v2, __ T16B, v10);
3518     __ mov(v3, __ T16B, v11);
3519 
3520     sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17);
3521     sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18);
3522     sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19);
3523     sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12);
3524     sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13);
3525     sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14);
3526     sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15);
3527     sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16);
3528     sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17);
3529     sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18);
3530     sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19);
3531     sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12);
3532     sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13);
3533     sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14);
3534     sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15);
3535     sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16);
3536     sha512_dround(16, 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17);
3537     sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18);
3538     sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19);
3539     sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12);
3540     sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13);
3541     sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14);
3542     sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15);
3543     sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16);
3544     sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17);
3545     sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18);
3546     sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19);
3547     sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12);
3548     sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13);
3549     sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14);
3550     sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15);
3551     sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16);
3552     sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12,  0,  0,  0,  0);
3553     sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13,  0,  0,  0,  0);
3554     sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14,  0,  0,  0,  0);
3555     sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15,  0,  0,  0,  0);
3556     sha512_dround(36, 3, 0, 4, 2, 1, 24,  0, 16,  0,  0,  0,  0);
3557     sha512_dround(37, 2, 3, 1, 4, 0, 25,  0, 17,  0,  0,  0,  0);
3558     sha512_dround(38, 4, 2, 0, 1, 3, 26,  0, 18,  0,  0,  0,  0);
3559     sha512_dround(39, 1, 4, 3, 0, 2, 27,  0, 19,  0,  0,  0,  0);
3560 
3561     __ addv(v8, __ T2D, v8, v0);
3562     __ addv(v9, __ T2D, v9, v1);
3563     __ addv(v10, __ T2D, v10, v2);
3564     __ addv(v11, __ T2D, v11, v3);
3565 
3566     if (multi_block) {
3567       __ add(ofs, ofs, 128);
3568       __ cmp(ofs, limit);
3569       __ br(Assembler::LE, sha512_loop);
3570       __ mov(c_rarg0, ofs); // return ofs
3571     }
3572 
3573     __ st1(v8, v9, v10, v11, __ T2D, state);
3574 
3575     __ ldpd(v14, v15, Address(sp, 48));
3576     __ ldpd(v12, v13, Address(sp, 32));
3577     __ ldpd(v10, v11, Address(sp, 16));
3578     __ ldpd(v8, v9, __ post(sp, 64));
3579 
3580     __ ret(lr);
3581 
3582     return start;
3583   }
3584 
3585   // Arguments:
3586   //
3587   // Inputs:
3588   //   c_rarg0   - byte[]  source+offset
3589   //   c_rarg1   - byte[]   SHA.state
3590   //   c_rarg2   - int     digest_length
3591   //   c_rarg3   - int     offset
3592   //   c_rarg4   - int     limit
3593   //
3594   address generate_sha3_implCompress(bool multi_block, const char *name) {
3595     static const uint64_t round_consts[24] = {
3596       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3597       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3598       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3599       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3600       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3601       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3602       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3603       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3604     };
3605 
3606     __ align(CodeEntryAlignment);
3607     StubCodeMark mark(this, "StubRoutines", name);
3608     address start = __ pc();
3609 
3610     Register buf           = c_rarg0;
3611     Register state         = c_rarg1;
3612     Register digest_length = c_rarg2;
3613     Register ofs           = c_rarg3;
3614     Register limit         = c_rarg4;
3615 
3616     Label sha3_loop, rounds24_loop;
3617     Label sha3_512, sha3_384_or_224, sha3_256;
3618 
3619     __ stpd(v8, v9, __ pre(sp, -64));
3620     __ stpd(v10, v11, Address(sp, 16));
3621     __ stpd(v12, v13, Address(sp, 32));
3622     __ stpd(v14, v15, Address(sp, 48));
3623 
3624     // load state
3625     __ add(rscratch1, state, 32);
3626     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3627     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3628     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3629     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3630     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3631     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3632     __ ld1(v24, __ T1D, rscratch1);
3633 
3634     __ BIND(sha3_loop);
3635 
3636     // 24 keccak rounds
3637     __ movw(rscratch2, 24);
3638 
3639     // load round_constants base
3640     __ lea(rscratch1, ExternalAddress((address) round_consts));
3641 
3642     // load input
3643     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3644     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3645     __ eor(v0, __ T8B, v0, v25);
3646     __ eor(v1, __ T8B, v1, v26);
3647     __ eor(v2, __ T8B, v2, v27);
3648     __ eor(v3, __ T8B, v3, v28);
3649     __ eor(v4, __ T8B, v4, v29);
3650     __ eor(v5, __ T8B, v5, v30);
3651     __ eor(v6, __ T8B, v6, v31);
3652 
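         // The branches below absorb the remainder of one input block. The
         // block size is the Keccak rate, 200 - 2 * digest_length bytes:
         // 72 for SHA3-512, 104 for SHA3-384, 136 for SHA3-256 and 144 for
         // SHA3-224. The 56 bytes XORed in above are common to all four.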
3653     // digest_length == 64, SHA3-512
3654     __ tbnz(digest_length, 6, sha3_512);
3655 
3656     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3657     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3658     __ eor(v7, __ T8B, v7, v25);
3659     __ eor(v8, __ T8B, v8, v26);
3660     __ eor(v9, __ T8B, v9, v27);
3661     __ eor(v10, __ T8B, v10, v28);
3662     __ eor(v11, __ T8B, v11, v29);
3663     __ eor(v12, __ T8B, v12, v30);
3664 
3665     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3666     __ tbnz(digest_length, 4, sha3_384_or_224);
3667 
3668     // SHA3-256
3669     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3670     __ eor(v13, __ T8B, v13, v25);
3671     __ eor(v14, __ T8B, v14, v26);
3672     __ eor(v15, __ T8B, v15, v27);
3673     __ eor(v16, __ T8B, v16, v28);
3674     __ b(rounds24_loop);
3675 
3676     __ BIND(sha3_384_or_224);
3677     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA3-384
3678 
3679     // SHA3-224
3680     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3681     __ ld1(v29, __ T8B, __ post(buf, 8));
3682     __ eor(v13, __ T8B, v13, v25);
3683     __ eor(v14, __ T8B, v14, v26);
3684     __ eor(v15, __ T8B, v15, v27);
3685     __ eor(v16, __ T8B, v16, v28);
3686     __ eor(v17, __ T8B, v17, v29);
3687     __ b(rounds24_loop);
3688 
3689     __ BIND(sha3_512);
3690     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3691     __ eor(v7, __ T8B, v7, v25);
3692     __ eor(v8, __ T8B, v8, v26);
3693 
3694     __ BIND(rounds24_loop);
3695     __ subw(rscratch2, rscratch2, 1);
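         // What follows is one round of Keccak-f[1600], expressed with the
         // SHA3 extension instructions. For reference (a sketch in spec
         // terms; A is the 5x5 array of 64-bit lanes held in v0..v24):
         //
         //   theta: C[x] = A[x,0] ^ A[x,1] ^ ... ^ A[x,4];        // eor3
         //          D[x] = C[x-1] ^ rol(C[x+1], 1);               // rax1
         //          A[x,y] ^= D[x];  (folded into the rotations)  // xar / eor
         //   rho/pi: rotate each lane and permute positions       // xar
         //   chi:   A[x,y] ^= ~A[x+1,y] & A[x+2,y];               // bcax
         //   iota:  A[0,0] ^= RC[round];                          // eor with v31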
3696 
3697     __ eor3(v29, __ T16B, v4, v9, v14);
3698     __ eor3(v26, __ T16B, v1, v6, v11);
3699     __ eor3(v28, __ T16B, v3, v8, v13);
3700     __ eor3(v25, __ T16B, v0, v5, v10);
3701     __ eor3(v27, __ T16B, v2, v7, v12);
3702     __ eor3(v29, __ T16B, v29, v19, v24);
3703     __ eor3(v26, __ T16B, v26, v16, v21);
3704     __ eor3(v28, __ T16B, v28, v18, v23);
3705     __ eor3(v25, __ T16B, v25, v15, v20);
3706     __ eor3(v27, __ T16B, v27, v17, v22);
3707 
3708     __ rax1(v30, __ T2D, v29, v26);
3709     __ rax1(v26, __ T2D, v26, v28);
3710     __ rax1(v28, __ T2D, v28, v25);
3711     __ rax1(v25, __ T2D, v25, v27);
3712     __ rax1(v27, __ T2D, v27, v29);
3713 
3714     __ eor(v0, __ T16B, v0, v30);
3715     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3716     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3717     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3718     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3719     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3720     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3721     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3722     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3723     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3724     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3725     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3726     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3727     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3728     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3729     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3730     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3731     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3732     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3733     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3734     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3735     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3736     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3737     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3738     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3739 
3740     __ bcax(v20, __ T16B, v31, v22, v8);
3741     __ bcax(v21, __ T16B, v8,  v23, v22);
3742     __ bcax(v22, __ T16B, v22, v24, v23);
3743     __ bcax(v23, __ T16B, v23, v31, v24);
3744     __ bcax(v24, __ T16B, v24, v8,  v31);
3745 
3746     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3747 
3748     __ bcax(v17, __ T16B, v25, v19, v3);
3749     __ bcax(v18, __ T16B, v3,  v15, v19);
3750     __ bcax(v19, __ T16B, v19, v16, v15);
3751     __ bcax(v15, __ T16B, v15, v25, v16);
3752     __ bcax(v16, __ T16B, v16, v3,  v25);
3753 
3754     __ bcax(v10, __ T16B, v29, v12, v26);
3755     __ bcax(v11, __ T16B, v26, v13, v12);
3756     __ bcax(v12, __ T16B, v12, v14, v13);
3757     __ bcax(v13, __ T16B, v13, v29, v14);
3758     __ bcax(v14, __ T16B, v14, v26, v29);
3759 
3760     __ bcax(v7, __ T16B, v30, v9,  v4);
3761     __ bcax(v8, __ T16B, v4,  v5,  v9);
3762     __ bcax(v9, __ T16B, v9,  v6,  v5);
3763     __ bcax(v5, __ T16B, v5,  v30, v6);
3764     __ bcax(v6, __ T16B, v6,  v4,  v30);
3765 
3766     __ bcax(v3, __ T16B, v27, v0,  v28);
3767     __ bcax(v4, __ T16B, v28, v1,  v0);
3768     __ bcax(v0, __ T16B, v0,  v2,  v1);
3769     __ bcax(v1, __ T16B, v1,  v27, v2);
3770     __ bcax(v2, __ T16B, v2,  v28, v27);
3771 
3772     __ eor(v0, __ T16B, v0, v31);
3773 
3774     __ cbnzw(rscratch2, rounds24_loop);
3775 
3776     if (multi_block) {
3777       // block_size =  200 - 2 * digest_length, ofs += block_size
3778       __ add(ofs, ofs, 200);
3779       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3780 
3781       __ cmp(ofs, limit);
3782       __ br(Assembler::LE, sha3_loop);
3783       __ mov(c_rarg0, ofs); // return ofs
3784     }
3785 
3786     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3787     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3788     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3789     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3790     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3791     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3792     __ st1(v24, __ T1D, state);
3793 
3794     __ ldpd(v14, v15, Address(sp, 48));
3795     __ ldpd(v12, v13, Address(sp, 32));
3796     __ ldpd(v10, v11, Address(sp, 16));
3797     __ ldpd(v8, v9, __ post(sp, 64));
3798 
3799     __ ret(lr);
3800 
3801     return start;
3802   }
3803 
3804   // Safefetch stubs.
3805   void generate_safefetch(const char* name, int size, address* entry,
3806                           address* fault_pc, address* continuation_pc) {
3807     // safefetch signatures:
3808     //   int      SafeFetch32(int*      adr, int      errValue);
3809     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3810     //
3811     // arguments:
3812     //   c_rarg0 = adr
3813     //   c_rarg1 = errValue
3814     //
3815     // result:
3816     //   r0       = *adr or errValue
3817 
3818     StubCodeMark mark(this, "StubRoutines", name);
3819 
3820     // Entry point, pc or function descriptor.
3821     *entry = __ pc();
3822 
3823     // Load *adr into c_rarg1, may fault.
3824     *fault_pc = __ pc();
3825     switch (size) {
3826       case 4:
3827         // int32_t
3828         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3829         break;
3830       case 8:
3831         // int64_t
3832         __ ldr(c_rarg1, Address(c_rarg0, 0));
3833         break;
3834       default:
3835         ShouldNotReachHere();
3836     }
3837 
3838     // return errValue or *adr
3839     *continuation_pc = __ pc();
3840     __ mov(r0, c_rarg1);
3841     __ ret(lr);
3842   }
3843 
3844   /**
3845    *  Arguments:
3846    *
3847    * Inputs:
3848    *   c_rarg0   - int crc
3849    *   c_rarg1   - byte* buf
3850    *   c_rarg2   - int length
3851    *
3852    * Output:
3853    *       r0   - int crc result
3854    */
3855   address generate_updateBytesCRC32() {
3856     assert(UseCRC32Intrinsics, "what are we doing here?");
3857 
3858     __ align(CodeEntryAlignment);
3859     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3860 
3861     address start = __ pc();
3862 
3863     const Register crc   = c_rarg0;  // crc
3864     const Register buf   = c_rarg1;  // source java byte array address
3865     const Register len   = c_rarg2;  // length
3866     const Register table0 = c_rarg3; // crc_table address
3867     const Register table1 = c_rarg4;
3868     const Register table2 = c_rarg5;
3869     const Register table3 = c_rarg6;
3870     const Register tmp3 = c_rarg7;
3871 
3872     BLOCK_COMMENT("Entry:");
3873     __ enter(); // required for proper stackwalking of RuntimeStub frame
3874 
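         // The result must match java.util.zip.CRC32, i.e. the reflected
         // CRC-32 with polynomial 0xEDB88320. A bit-at-a-time reference model
         // (a sketch only; the generated kernel_crc32 code is much faster):
         //
         //   uint32_t c = crc ^ 0xFFFFFFFFu;
         //   for (int i = 0; i < len; i++) {
         //     c ^= buf[i];
         //     for (int k = 0; k < 8; k++)
         //       c = (c >> 1) ^ (0xEDB88320u & -(c & 1));
         //   }
         //   return c ^ 0xFFFFFFFFu;   // final result in r0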
3875     __ kernel_crc32(crc, buf, len,
3876               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3877 
3878     __ leave(); // required for proper stackwalking of RuntimeStub frame
3879     __ ret(lr);
3880 
3881     return start;
3882   }
3883 
3884   /**
3885    *  Arguments:
3886    *
3887    * Inputs:
3888    *   c_rarg0   - int crc
3889    *   c_rarg1   - byte* buf
3890    *   c_rarg2   - int length
3891    *   c_rarg3   - int* table
3892    *
3893    * Output:
3894    *       r0   - int crc result
3895    */
3896   address generate_updateBytesCRC32C() {
3897     assert(UseCRC32CIntrinsics, "what are we doing here?");
3898 
3899     __ align(CodeEntryAlignment);
3900     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3901 
3902     address start = __ pc();
3903 
3904     const Register crc   = c_rarg0;  // crc
3905     const Register buf   = c_rarg1;  // source java byte array address
3906     const Register len   = c_rarg2;  // length
3907     const Register table0 = c_rarg3; // crc_table address
3908     const Register table1 = c_rarg4;
3909     const Register table2 = c_rarg5;
3910     const Register table3 = c_rarg6;
3911     const Register tmp3 = c_rarg7;
3912 
3913     BLOCK_COMMENT("Entry:");
3914     __ enter(); // required for proper stackwalking of RuntimeStub frame
3915 
3916     __ kernel_crc32c(crc, buf, len,
3917               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3918 
3919     __ leave(); // required for proper stackwalking of RuntimeStub frame
3920     __ ret(lr);
3921 
3922     return start;
3923   }
3924 
3925   /**
3926    *  Arguments:
3927    *
3928    *  Inputs:
3929    *   c_rarg0   - int   adler
3930    *   c_rarg1   - byte* buff
3931    *   c_rarg2   - int   len
3932    *
3933    * Output:
3934    *   c_rarg0   - int adler result
3935    */
3936   address generate_updateBytesAdler32() {
3937     __ align(CodeEntryAlignment);
3938     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3939     address start = __ pc();
3940 
3941     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3942 
3943     // Aliases
3944     Register adler  = c_rarg0;
3945     Register s1     = c_rarg0;
3946     Register s2     = c_rarg3;
3947     Register buff   = c_rarg1;
3948     Register len    = c_rarg2;
3949     Register nmax  = r4;
3950     Register base  = r5;
3951     Register count = r6;
3952     Register temp0 = rscratch1;
3953     Register temp1 = rscratch2;
3954     FloatRegister vbytes = v0;
3955     FloatRegister vs1acc = v1;
3956     FloatRegister vs2acc = v2;
3957     FloatRegister vtable = v3;
3958 
3959     // Max number of bytes we can process before having to take the mod
3960     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3961     uint64_t BASE = 0xfff1;
3962     uint64_t NMAX = 0x15B0;
3963 
3964     __ mov(base, BASE);
3965     __ mov(nmax, NMAX);
3966 
3967     // Load accumulation coefficients for the upper 16 bits
3968     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3969     __ ld1(vtable, __ T16B, Address(temp0));
3970 
3971     // s1 is initialized to the lower 16 bits of adler
3972     // s2 is initialized to the upper 16 bits of adler
3973     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3974     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3975 
3976     // The pipelined loop needs at least 16 elements for one iteration.
3977     // The loop checks for this itself, but it is cheaper to branch straight to the cleanup code here.
3978     __ cmp(len, (u1)16);
3979     __ br(Assembler::HS, L_nmax);
3980     __ cbz(len, L_combine);
3981 
3982     __ bind(L_simple_by1_loop);
3983     __ ldrb(temp0, Address(__ post(buff, 1)));
3984     __ add(s1, s1, temp0);
3985     __ add(s2, s2, s1);
3986     __ subs(len, len, 1);
3987     __ br(Assembler::HI, L_simple_by1_loop);
3988 
3989     // s1 = s1 % BASE
3990     __ subs(temp0, s1, base);
3991     __ csel(s1, temp0, s1, Assembler::HS);
3992 
3993     // s2 = s2 % BASE
3994     __ lsr(temp0, s2, 16);
3995     __ lsl(temp1, temp0, 4);
3996     __ sub(temp1, temp1, temp0);
3997     __ add(s2, temp1, s2, ext::uxth);
3998 
3999     __ subs(temp0, s2, base);
4000     __ csel(s2, temp0, s2, Assembler::HS);
4001 
4002     __ b(L_combine);
4003 
4004     __ bind(L_nmax);
4005     __ subs(len, len, nmax);
4006     __ sub(count, nmax, 16);
4007     __ br(Assembler::LO, L_by16);
4008 
4009     __ bind(L_nmax_loop);
4010 
4011     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4012                                       vbytes, vs1acc, vs2acc, vtable);
4013 
4014     __ subs(count, count, 16);
4015     __ br(Assembler::HS, L_nmax_loop);
4016 
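         // The reductions below avoid a division: because 2^16 mod BASE == 15
         // (BASE == 65521), we have x == (x >> 16) * 15 + (x & 0xffff) (mod BASE).
         // Applying the identity twice to any 32-bit value leaves a result below
         // 2 * BASE, so one conditional subtract finishes the job. Roughly, in C
         // (a sketch; the same reduction is repeated after the byte loops below):
         //
         //   x = (x >> 16) * 15 + (x & 0xffff);
         //   x = (x >> 16) * 15 + (x & 0xffff);
         //   if (x >= 65521) x -= 65521;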
4017     // s1 = s1 % BASE
4018     __ lsr(temp0, s1, 16);
4019     __ lsl(temp1, temp0, 4);
4020     __ sub(temp1, temp1, temp0);
4021     __ add(temp1, temp1, s1, ext::uxth);
4022 
4023     __ lsr(temp0, temp1, 16);
4024     __ lsl(s1, temp0, 4);
4025     __ sub(s1, s1, temp0);
4026     __ add(s1, s1, temp1, ext::uxth);
4027 
4028     __ subs(temp0, s1, base);
4029     __ csel(s1, temp0, s1, Assembler::HS);
4030 
4031     // s2 = s2 % BASE
4032     __ lsr(temp0, s2, 16);
4033     __ lsl(temp1, temp0, 4);
4034     __ sub(temp1, temp1, temp0);
4035     __ add(temp1, temp1, s2, ext::uxth);
4036 
4037     __ lsr(temp0, temp1, 16);
4038     __ lsl(s2, temp0, 4);
4039     __ sub(s2, s2, temp0);
4040     __ add(s2, s2, temp1, ext::uxth);
4041 
4042     __ subs(temp0, s2, base);
4043     __ csel(s2, temp0, s2, Assembler::HS);
4044 
4045     __ subs(len, len, nmax);
4046     __ sub(count, nmax, 16);
4047     __ br(Assembler::HS, L_nmax_loop);
4048 
4049     __ bind(L_by16);
4050     __ adds(len, len, count);
4051     __ br(Assembler::LO, L_by1);
4052 
4053     __ bind(L_by16_loop);
4054 
4055     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4056                                       vbytes, vs1acc, vs2acc, vtable);
4057 
4058     __ subs(len, len, 16);
4059     __ br(Assembler::HS, L_by16_loop);
4060 
4061     __ bind(L_by1);
4062     __ adds(len, len, 15);
4063     __ br(Assembler::LO, L_do_mod);
4064 
4065     __ bind(L_by1_loop);
4066     __ ldrb(temp0, Address(__ post(buff, 1)));
4067     __ add(s1, temp0, s1);
4068     __ add(s2, s2, s1);
4069     __ subs(len, len, 1);
4070     __ br(Assembler::HS, L_by1_loop);
4071 
4072     __ bind(L_do_mod);
4073     // s1 = s1 % BASE
4074     __ lsr(temp0, s1, 16);
4075     __ lsl(temp1, temp0, 4);
4076     __ sub(temp1, temp1, temp0);
4077     __ add(temp1, temp1, s1, ext::uxth);
4078 
4079     __ lsr(temp0, temp1, 16);
4080     __ lsl(s1, temp0, 4);
4081     __ sub(s1, s1, temp0);
4082     __ add(s1, s1, temp1, ext::uxth);
4083 
4084     __ subs(temp0, s1, base);
4085     __ csel(s1, temp0, s1, Assembler::HS);
4086 
4087     // s2 = s2 % BASE
4088     __ lsr(temp0, s2, 16);
4089     __ lsl(temp1, temp0, 4);
4090     __ sub(temp1, temp1, temp0);
4091     __ add(temp1, temp1, s2, ext::uxth);
4092 
4093     __ lsr(temp0, temp1, 16);
4094     __ lsl(s2, temp0, 4);
4095     __ sub(s2, s2, temp0);
4096     __ add(s2, s2, temp1, ext::uxth);
4097 
4098     __ subs(temp0, s2, base);
4099     __ csel(s2, temp0, s2, Assembler::HS);
4100 
4101     // Combine lower bits and higher bits
4102     __ bind(L_combine);
4103     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4104 
4105     __ ret(lr);
4106 
4107     return start;
4108   }
4109 
4110   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4111           Register temp0, Register temp1, FloatRegister vbytes,
4112           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4113     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4114     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4115     // In non-vectorized code, we update s1 and s2 as:
4116     //   s1 <- s1 + b1
4117     //   s2 <- s2 + s1
4118     //   s1 <- s1 + b2
4119     //   s2 <- s2 + s1
4120     //   ...
4121     //   s1 <- s1 + b16
4122     //   s2 <- s2 + s1
4123     // Putting above assignments together, we have:
4124     //   s1_new = s1 + b1 + b2 + ... + b16
4125     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4126     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4127     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4128     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4129 
4130     // s2 = s2 + s1 * 16
4131     __ add(s2, s2, s1, Assembler::LSL, 4);
4132 
4133     // vs1acc = b1 + b2 + b3 + ... + b16
4134     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4135     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4136     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4137     __ uaddlv(vs1acc, __ T16B, vbytes);
4138     __ uaddlv(vs2acc, __ T8H, vs2acc);
4139 
4140     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4141     __ fmovd(temp0, vs1acc);
4142     __ fmovd(temp1, vs2acc);
4143     __ add(s1, s1, temp0);
4144     __ add(s2, s2, temp1);
4145   }
4146 
4147   /**
4148    *  Arguments:
4149    *
4150    *  Input:
4151    *    c_rarg0   - x address
4152    *    c_rarg1   - x length
4153    *    c_rarg2   - y address
4154    *    c_rarg3   - y length
4155    *    c_rarg4   - z address
4156    *    c_rarg5   - z length
4157    */
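       //
       // Conceptually the stub computes z = x * y with schoolbook long
       // multiplication over 32-bit limbs, most-significant limb first (a
       // Java-like sketch, assuming z starts out zeroed; the generated code
       // below is a heavily unrolled variant of the same idea):
       //
       //   for (int i = xlen - 1; i >= 0; i--) {
       //     long carry = 0;
       //     for (int j = ylen - 1, k = ylen + i; j >= 0; j--, k--) {
       //       long p = (x[i] & 0xffffffffL) * (y[j] & 0xffffffffL)
       //              + (z[k] & 0xffffffffL) + carry;
       //       z[k] = (int) p;
       //       carry = p >>> 32;
       //     }
       //     z[i] = (int) carry;
       //   }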
4158   address generate_multiplyToLen() {
4159     __ align(CodeEntryAlignment);
4160     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4161 
4162     address start = __ pc();
4163     const Register x     = r0;
4164     const Register xlen  = r1;
4165     const Register y     = r2;
4166     const Register ylen  = r3;
4167     const Register z     = r4;
4168     const Register zlen  = r5;
4169 
4170     const Register tmp1  = r10;
4171     const Register tmp2  = r11;
4172     const Register tmp3  = r12;
4173     const Register tmp4  = r13;
4174     const Register tmp5  = r14;
4175     const Register tmp6  = r15;
4176     const Register tmp7  = r16;
4177 
4178     BLOCK_COMMENT("Entry:");
4179     __ enter(); // required for proper stackwalking of RuntimeStub frame
4180     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4181     __ leave(); // required for proper stackwalking of RuntimeStub frame
4182     __ ret(lr);
4183 
4184     return start;
4185   }
4186 
4187   address generate_squareToLen() {
4188     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
4189     // faster than multiply_to_len on some CPUs and slower on others, but
4190     // multiply_to_len gives slightly better results overall.
4191     __ align(CodeEntryAlignment);
4192     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4193     address start = __ pc();
4194 
4195     const Register x     = r0;
4196     const Register xlen  = r1;
4197     const Register z     = r2;
4198     const Register zlen  = r3;
4199     const Register y     = r4; // == x
4200     const Register ylen  = r5; // == xlen
4201 
4202     const Register tmp1  = r10;
4203     const Register tmp2  = r11;
4204     const Register tmp3  = r12;
4205     const Register tmp4  = r13;
4206     const Register tmp5  = r14;
4207     const Register tmp6  = r15;
4208     const Register tmp7  = r16;
4209 
4210     RegSet spilled_regs = RegSet::of(y, ylen);
4211     BLOCK_COMMENT("Entry:");
4212     __ enter();
4213     __ push(spilled_regs, sp);
4214     __ mov(y, x);
4215     __ mov(ylen, xlen);
4216     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4217     __ pop(spilled_regs, sp);
4218     __ leave();
4219     __ ret(lr);
4220     return start;
4221   }
4222 
4223   address generate_mulAdd() {
4224     __ align(CodeEntryAlignment);
4225     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4226 
4227     address start = __ pc();
4228 
4229     const Register out     = r0;
4230     const Register in      = r1;
4231     const Register offset  = r2;
4232     const Register len     = r3;
4233     const Register k       = r4;
4234 
4235     BLOCK_COMMENT("Entry:");
4236     __ enter();
4237     __ mul_add(out, in, offset, len, k);
4238     __ leave();
4239     __ ret(lr);
4240 
4241     return start;
4242   }
4243 
4244   // Arguments:
4245   //
4246   // Input:
4247   //   c_rarg0   - newArr address
4248   //   c_rarg1   - oldArr address
4249   //   c_rarg2   - newIdx
4250   //   c_rarg3   - shiftCount
4251   //   c_rarg4   - numIter
4252   //
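       //
       // A scalar model of the computation (a sketch of the per-word update,
       // assuming 0 < shiftCount < 32; '>>>' denotes an unsigned shift):
       //
       //   for (int i = 0; i < numIter; i++) {
       //     newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
       //                        | (oldArr[i]     <<  (32 - shiftCount));
       //   }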
4253   address generate_bigIntegerRightShift() {
4254     __ align(CodeEntryAlignment);
4255     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4256     address start = __ pc();
4257 
4258     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4259 
4260     Register newArr        = c_rarg0;
4261     Register oldArr        = c_rarg1;
4262     Register newIdx        = c_rarg2;
4263     Register shiftCount    = c_rarg3;
4264     Register numIter       = c_rarg4;
4265     Register idx           = numIter;
4266 
4267     Register newArrCur     = rscratch1;
4268     Register shiftRevCount = rscratch2;
4269     Register oldArrCur     = r13;
4270     Register oldArrNext    = r14;
4271 
4272     FloatRegister oldElem0        = v0;
4273     FloatRegister oldElem1        = v1;
4274     FloatRegister newElem         = v2;
4275     FloatRegister shiftVCount     = v3;
4276     FloatRegister shiftVRevCount  = v4;
4277 
4278     __ cbz(idx, Exit);
4279 
4280     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4281 
4282     // left shift count
4283     __ movw(shiftRevCount, 32);
4284     __ subw(shiftRevCount, shiftRevCount, shiftCount);
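         // In effect (in terms of the incoming arguments), each output word combines two
         // adjacent input words:
         //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
         // where >>> is an unsigned (logical) right shift.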
4285 
4286     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4287     __ cmp(numIter, (u1)4);
4288     __ br(Assembler::LT, ShiftThree);
4289 
4290     __ dup(shiftVCount,    __ T4S, shiftCount);
4291     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4292     __ negr(shiftVCount,   __ T4S, shiftVCount);
4293 
4294     __ BIND(ShiftSIMDLoop);
4295 
4296     // Calculate the load addresses
4297     __ sub(idx, idx, 4);
4298     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4299     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4300     __ add(oldArrCur,  oldArrNext, 4);
4301 
4302     // Load 4 words and process
4303     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4304     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4305     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4306     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4307     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4308     __ st1(newElem,   __ T4S,  Address(newArrCur));
4309 
4310     __ cmp(idx, (u1)4);
4311     __ br(Assembler::LT, ShiftTwoLoop);
4312     __ b(ShiftSIMDLoop);
4313 
4314     __ BIND(ShiftTwoLoop);
4315     __ cbz(idx, Exit);
4316     __ cmp(idx, (u1)1);
4317     __ br(Assembler::EQ, ShiftOne);
4318 
4319     // Calculate the load addresses
4320     __ sub(idx, idx, 2);
4321     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4322     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4323     __ add(oldArrCur,  oldArrNext, 4);
4324 
4325     // Load 2 words and process
4326     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4327     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4328     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4329     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4330     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4331     __ st1(newElem,   __ T2S, Address(newArrCur));
4332     __ b(ShiftTwoLoop);
4333 
4334     __ BIND(ShiftThree);
4335     __ tbz(idx, 1, ShiftOne);
4336     __ tbz(idx, 0, ShiftTwo);
4337     __ ldrw(r10,  Address(oldArr, 12));
4338     __ ldrw(r11,  Address(oldArr, 8));
4339     __ lsrvw(r10, r10, shiftCount);
4340     __ lslvw(r11, r11, shiftRevCount);
4341     __ orrw(r12,  r10, r11);
4342     __ strw(r12,  Address(newArr, 8));
4343 
4344     __ BIND(ShiftTwo);
4345     __ ldrw(r10,  Address(oldArr, 8));
4346     __ ldrw(r11,  Address(oldArr, 4));
4347     __ lsrvw(r10, r10, shiftCount);
4348     __ lslvw(r11, r11, shiftRevCount);
4349     __ orrw(r12,  r10, r11);
4350     __ strw(r12,  Address(newArr, 4));
4351 
4352     __ BIND(ShiftOne);
4353     __ ldrw(r10,  Address(oldArr, 4));
4354     __ ldrw(r11,  Address(oldArr));
4355     __ lsrvw(r10, r10, shiftCount);
4356     __ lslvw(r11, r11, shiftRevCount);
4357     __ orrw(r12,  r10, r11);
4358     __ strw(r12,  Address(newArr));
4359 
4360     __ BIND(Exit);
4361     __ ret(lr);
4362 
4363     return start;
4364   }
4365 
4366   // Arguments:
4367   //
4368   // Input:
4369   //   c_rarg0   - newArr address
4370   //   c_rarg1   - oldArr address
4371   //   c_rarg2   - newIdx
4372   //   c_rarg3   - shiftCount
4373   //   c_rarg4   - numIter
4374   //
4375   address generate_bigIntegerLeftShift() {
4376     __ align(CodeEntryAlignment);
4377     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4378     address start = __ pc();
4379 
4380     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4381 
4382     Register newArr        = c_rarg0;
4383     Register oldArr        = c_rarg1;
4384     Register newIdx        = c_rarg2;
4385     Register shiftCount    = c_rarg3;
4386     Register numIter       = c_rarg4;
4387 
4388     Register shiftRevCount = rscratch1;
4389     Register oldArrNext    = rscratch2;
4390 
4391     FloatRegister oldElem0        = v0;
4392     FloatRegister oldElem1        = v1;
4393     FloatRegister newElem         = v2;
4394     FloatRegister shiftVCount     = v3;
4395     FloatRegister shiftVRevCount  = v4;
4396 
4397     __ cbz(numIter, Exit);
4398 
4399     __ add(oldArrNext, oldArr, 4);
4400     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4401 
4402     // right shift count
4403     __ movw(shiftRevCount, 32);
4404     __ subw(shiftRevCount, shiftRevCount, shiftCount);
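         // In effect (in terms of the incoming arguments), each output word combines two
         // adjacent input words:
         //   newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
         // where >>> is an unsigned (logical) right shift.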
4405 
4406     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4407     __ cmp(numIter, (u1)4);
4408     __ br(Assembler::LT, ShiftThree);
4409 
4410     __ dup(shiftVCount,     __ T4S, shiftCount);
4411     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4412     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4413 
4414     __ BIND(ShiftSIMDLoop);
4415 
4416     // load 4 words and process
4417     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4418     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4419     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4420     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4421     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4422     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4423     __ sub(numIter,   numIter, 4);
4424 
4425     __ cmp(numIter, (u1)4);
4426     __ br(Assembler::LT, ShiftTwoLoop);
4427     __ b(ShiftSIMDLoop);
4428 
4429     __ BIND(ShiftTwoLoop);
4430     __ cbz(numIter, Exit);
4431     __ cmp(numIter, (u1)1);
4432     __ br(Assembler::EQ, ShiftOne);
4433 
4434     // load 2 words and process
4435     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4436     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4437     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4438     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4439     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4440     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4441     __ sub(numIter,   numIter, 2);
4442     __ b(ShiftTwoLoop);
4443 
4444     __ BIND(ShiftThree);
4445     __ ldrw(r10,  __ post(oldArr, 4));
4446     __ ldrw(r11,  __ post(oldArrNext, 4));
4447     __ lslvw(r10, r10, shiftCount);
4448     __ lsrvw(r11, r11, shiftRevCount);
4449     __ orrw(r12,  r10, r11);
4450     __ strw(r12,  __ post(newArr, 4));
4451     __ tbz(numIter, 1, Exit);
4452     __ tbz(numIter, 0, ShiftOne);
4453 
4454     __ BIND(ShiftTwo);
4455     __ ldrw(r10,  __ post(oldArr, 4));
4456     __ ldrw(r11,  __ post(oldArrNext, 4));
4457     __ lslvw(r10, r10, shiftCount);
4458     __ lsrvw(r11, r11, shiftRevCount);
4459     __ orrw(r12,  r10, r11);
4460     __ strw(r12,  __ post(newArr, 4));
4461 
4462     __ BIND(ShiftOne);
4463     __ ldrw(r10,  Address(oldArr));
4464     __ ldrw(r11,  Address(oldArrNext));
4465     __ lslvw(r10, r10, shiftCount);
4466     __ lsrvw(r11, r11, shiftRevCount);
4467     __ orrw(r12,  r10, r11);
4468     __ strw(r12,  Address(newArr));
4469 
4470     __ BIND(Exit);
4471     __ ret(lr);
4472 
4473     return start;
4474   }
4475 
4476   address generate_has_negatives(address &has_negatives_long) {
4477     const u1 large_loop_size = 64;
4478     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
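         // A byte is "negative" iff its sign bit (0x80) is set, so OR-ing loaded words
         // together and testing the result against UPPER_BIT_MASK detects any negative byte.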
4479     int dcache_line = VM_Version::dcache_line_size();
4480 
4481     Register ary1 = r1, len = r2, result = r0;
4482 
4483     __ align(CodeEntryAlignment);
4484 
4485     StubCodeMark mark(this, "StubRoutines", "has_negatives");
4486 
4487     address entry = __ pc();
4488 
4489     __ enter();
4490 
4491   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
4492         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4493 
4494   __ cmp(len, (u1)15);
4495   __ br(Assembler::GT, LEN_OVER_15);
4496   // The only case in which execution falls into this code is when the pointer is
4497   // near the end of a memory page and we have to avoid reading the next page
4498   __ add(ary1, ary1, len);
4499   __ subs(len, len, 8);
4500   __ br(Assembler::GT, LEN_OVER_8);
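       // len <= 8 here: load the 8 bytes that end at the end of the range and shift off
       // the low-order bytes of that window that precede the range (little-endian load),
       // so only bytes belonging to the array are tested.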
4501   __ ldr(rscratch2, Address(ary1, -8));
4502   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4503   __ lsrv(rscratch2, rscratch2, rscratch1);
4504   __ tst(rscratch2, UPPER_BIT_MASK);
4505   __ cset(result, Assembler::NE);
4506   __ leave();
4507   __ ret(lr);
4508   __ bind(LEN_OVER_8);
4509   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4510   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
4511   __ tst(rscratch2, UPPER_BIT_MASK);
4512   __ br(Assembler::NE, RET_TRUE_NO_POP);
4513   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4514   __ lsrv(rscratch1, rscratch1, rscratch2);
4515   __ tst(rscratch1, UPPER_BIT_MASK);
4516   __ cset(result, Assembler::NE);
4517   __ leave();
4518   __ ret(lr);
4519 
4520   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4521   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4522 
4523   has_negatives_long = __ pc(); // 2nd entry point
4524 
4525   __ enter();
4526 
4527   __ bind(LEN_OVER_15);
4528     __ push(spilled_regs, sp);
4529     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4530     __ cbz(rscratch2, ALIGNED);
4531     __ ldp(tmp6, tmp1, Address(ary1));
4532     __ mov(tmp5, 16);
4533     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4534     __ add(ary1, ary1, rscratch1);
4535     __ sub(len, len, rscratch1);
4536     __ orr(tmp6, tmp6, tmp1);
4537     __ tst(tmp6, UPPER_BIT_MASK);
4538     __ br(Assembler::NE, RET_TRUE);
4539 
4540   __ bind(ALIGNED);
4541     __ cmp(len, large_loop_size);
4542     __ br(Assembler::LT, CHECK_16);
4543     // Perform a 16-byte load in the pre-loop as an early return, to handle the case
4544     // where an initially aligned large array has negative values in its first bytes;
4545     // otherwise LARGE_LOOP would do 4 reads instead of 1 (in the worst case), which is
4546     // slower. Cases with negative bytes further ahead won't be affected that
4547     // much. In fact, they will be faster due to the early loads, fewer instructions
4548     // and fewer branches in LARGE_LOOP.
4549     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4550     __ sub(len, len, 16);
4551     __ orr(tmp6, tmp6, tmp1);
4552     __ tst(tmp6, UPPER_BIT_MASK);
4553     __ br(Assembler::NE, RET_TRUE);
4554     __ cmp(len, large_loop_size);
4555     __ br(Assembler::LT, CHECK_16);
4556 
4557     if (SoftwarePrefetchHintDistance >= 0
4558         && SoftwarePrefetchHintDistance >= dcache_line) {
4559       // initial prefetch
4560       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4561     }
4562   __ bind(LARGE_LOOP);
4563     if (SoftwarePrefetchHintDistance >= 0) {
4564       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4565     }
4566     // Issue the load instructions first, since this can save a few CPU/memory cycles.
4567     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp), it is
4568     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
4569     // instructions and has fewer branches; however, this approach disables the
4570     // early return, so all 64 bytes are loaded and checked every time.
4571     __ ldp(tmp2, tmp3, Address(ary1));
4572     __ ldp(tmp4, tmp5, Address(ary1, 16));
4573     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4574     __ ldp(tmp6, tmp1, Address(ary1, 48));
4575     __ add(ary1, ary1, large_loop_size);
4576     __ sub(len, len, large_loop_size);
4577     __ orr(tmp2, tmp2, tmp3);
4578     __ orr(tmp4, tmp4, tmp5);
4579     __ orr(rscratch1, rscratch1, rscratch2);
4580     __ orr(tmp6, tmp6, tmp1);
4581     __ orr(tmp2, tmp2, tmp4);
4582     __ orr(rscratch1, rscratch1, tmp6);
4583     __ orr(tmp2, tmp2, rscratch1);
4584     __ tst(tmp2, UPPER_BIT_MASK);
4585     __ br(Assembler::NE, RET_TRUE);
4586     __ cmp(len, large_loop_size);
4587     __ br(Assembler::GE, LARGE_LOOP);
4588 
4589   __ bind(CHECK_16); // small 16-byte load pre-loop
4590     __ cmp(len, (u1)16);
4591     __ br(Assembler::LT, POST_LOOP16);
4592 
4593   __ bind(LOOP16); // small 16-byte load loop
4594     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4595     __ sub(len, len, 16);
4596     __ orr(tmp2, tmp2, tmp3);
4597     __ tst(tmp2, UPPER_BIT_MASK);
4598     __ br(Assembler::NE, RET_TRUE);
4599     __ cmp(len, (u1)16);
4600     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4601 
4602   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4603     __ cmp(len, (u1)8);
4604     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4605     __ ldr(tmp3, Address(__ post(ary1, 8)));
4606     __ sub(len, len, 8);
4607     __ tst(tmp3, UPPER_BIT_MASK);
4608     __ br(Assembler::NE, RET_TRUE);
4609 
4610   __ bind(POST_LOOP16_LOAD_TAIL);
4611     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
4612     __ ldr(tmp1, Address(ary1));
4613     __ mov(tmp2, 64);
4614     __ sub(tmp4, tmp2, len, __ LSL, 3);
4615     __ lslv(tmp1, tmp1, tmp4);
4616     __ tst(tmp1, UPPER_BIT_MASK);
4617     __ br(Assembler::NE, RET_TRUE);
4618     // Fallthrough
4619 
4620   __ bind(RET_FALSE);
4621     __ pop(spilled_regs, sp);
4622     __ leave();
4623     __ mov(result, zr);
4624     __ ret(lr);
4625 
4626   __ bind(RET_TRUE);
4627     __ pop(spilled_regs, sp);
4628   __ bind(RET_TRUE_NO_POP);
4629     __ leave();
4630     __ mov(result, 1);
4631     __ ret(lr);
4632 
4633   __ bind(DONE);
4634     __ pop(spilled_regs, sp);
4635     __ leave();
4636     __ ret(lr);
4637     return entry;
4638   }
4639 
4640   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4641         bool usePrefetch, Label &NOT_EQUAL) {
4642     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4643         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4644         tmp7 = r12, tmp8 = r13;
4645     Label LOOP;
4646 
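         // Software-pipelined loop: each iteration loads and compares 8 words (64 bytes)
         // from each array, OR-ing the XOR-ed pairs and branching to NOT_EQUAL on the
         // first difference.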
4647     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4648     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4649     __ bind(LOOP);
4650     if (usePrefetch) {
4651       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4652       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4653     }
4654     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4655     __ eor(tmp1, tmp1, tmp2);
4656     __ eor(tmp3, tmp3, tmp4);
4657     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4658     __ orr(tmp1, tmp1, tmp3);
4659     __ cbnz(tmp1, NOT_EQUAL);
4660     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4661     __ eor(tmp5, tmp5, tmp6);
4662     __ eor(tmp7, tmp7, tmp8);
4663     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4664     __ orr(tmp5, tmp5, tmp7);
4665     __ cbnz(tmp5, NOT_EQUAL);
4666     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4667     __ eor(tmp1, tmp1, tmp2);
4668     __ eor(tmp3, tmp3, tmp4);
4669     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4670     __ orr(tmp1, tmp1, tmp3);
4671     __ cbnz(tmp1, NOT_EQUAL);
4672     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4673     __ eor(tmp5, tmp5, tmp6);
4674     __ sub(cnt1, cnt1, 8 * wordSize);
4675     __ eor(tmp7, tmp7, tmp8);
4676     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4677     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4678     // cmp) because subs allows an unlimited range of immediate operands.
4679     __ subs(tmp6, cnt1, loopThreshold);
4680     __ orr(tmp5, tmp5, tmp7);
4681     __ cbnz(tmp5, NOT_EQUAL);
4682     __ br(__ GE, LOOP);
4683     // post-loop
4684     __ eor(tmp1, tmp1, tmp2);
4685     __ eor(tmp3, tmp3, tmp4);
4686     __ orr(tmp1, tmp1, tmp3);
4687     __ sub(cnt1, cnt1, 2 * wordSize);
4688     __ cbnz(tmp1, NOT_EQUAL);
4689   }
4690 
4691   void generate_large_array_equals_loop_simd(int loopThreshold,
4692         bool usePrefetch, Label &NOT_EQUAL) {
4693     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4694         tmp2 = rscratch2;
4695     Label LOOP;
4696 
4697     __ bind(LOOP);
4698     if (usePrefetch) {
4699       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4700       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4701     }
4702     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4703     __ sub(cnt1, cnt1, 8 * wordSize);
4704     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4705     __ subs(tmp1, cnt1, loopThreshold);
4706     __ eor(v0, __ T16B, v0, v4);
4707     __ eor(v1, __ T16B, v1, v5);
4708     __ eor(v2, __ T16B, v2, v6);
4709     __ eor(v3, __ T16B, v3, v7);
4710     __ orr(v0, __ T16B, v0, v1);
4711     __ orr(v1, __ T16B, v2, v3);
4712     __ orr(v0, __ T16B, v0, v1);
4713     __ umov(tmp1, v0, __ D, 0);
4714     __ umov(tmp2, v0, __ D, 1);
4715     __ orr(tmp1, tmp1, tmp2);
4716     __ cbnz(tmp1, NOT_EQUAL);
4717     __ br(__ GE, LOOP);
4718   }
4719 
4720   // a1 = r1 - array1 address
4721   // a2 = r2 - array2 address
4722   // result = r0 - return value. Already contains "false"
4723   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4724   // r3-r5 are reserved temporary registers
4725   address generate_large_array_equals() {
4726     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4727         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4728         tmp7 = r12, tmp8 = r13;
4729     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4730         SMALL_LOOP, POST_LOOP;
4731     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4732     // calculate if at least 32 prefetched bytes are used
4733     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4734     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4735     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4736     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4737         tmp5, tmp6, tmp7, tmp8);
4738 
4739     __ align(CodeEntryAlignment);
4740 
4741     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4742 
4743     address entry = __ pc();
4744     __ enter();
4745     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4746     // also advance pointers to use post-increment instead of pre-increment
4747     __ add(a1, a1, wordSize);
4748     __ add(a2, a2, wordSize);
4749     if (AvoidUnalignedAccesses) {
4750       // Both implementations (SIMD/non-SIMD) use relatively large load
4751       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution time)
4752       // on some CPUs when the address is not at least 16-byte aligned.
4753       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
4754       // load, if needed, for the 1st address to make it 16-byte aligned.
4755       Label ALIGNED16;
4756       __ tbz(a1, 3, ALIGNED16);
4757       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4758       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4759       __ sub(cnt1, cnt1, wordSize);
4760       __ eor(tmp1, tmp1, tmp2);
4761       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4762       __ bind(ALIGNED16);
4763     }
4764     if (UseSIMDForArrayEquals) {
4765       if (SoftwarePrefetchHintDistance >= 0) {
4766         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4767         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4768         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4769             /* prfm = */ true, NOT_EQUAL);
4770         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4771         __ br(__ LT, TAIL);
4772       }
4773       __ bind(NO_PREFETCH_LARGE_LOOP);
4774       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4775           /* prfm = */ false, NOT_EQUAL);
4776     } else {
4777       __ push(spilled_regs, sp);
4778       if (SoftwarePrefetchHintDistance >= 0) {
4779         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4780         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4781         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4782             /* prfm = */ true, NOT_EQUAL);
4783         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4784         __ br(__ LT, TAIL);
4785       }
4786       __ bind(NO_PREFETCH_LARGE_LOOP);
4787       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4788           /* prfm = */ false, NOT_EQUAL);
4789     }
4790     __ bind(TAIL);
4791       __ cbz(cnt1, EQUAL);
4792       __ subs(cnt1, cnt1, wordSize);
4793       __ br(__ LE, POST_LOOP);
4794     __ bind(SMALL_LOOP);
4795       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4796       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4797       __ subs(cnt1, cnt1, wordSize);
4798       __ eor(tmp1, tmp1, tmp2);
4799       __ cbnz(tmp1, NOT_EQUAL);
4800       __ br(__ GT, SMALL_LOOP);
4801     __ bind(POST_LOOP);
4802       __ ldr(tmp1, Address(a1, cnt1));
4803       __ ldr(tmp2, Address(a2, cnt1));
4804       __ eor(tmp1, tmp1, tmp2);
4805       __ cbnz(tmp1, NOT_EQUAL);
4806     __ bind(EQUAL);
4807       __ mov(result, true);
4808     __ bind(NOT_EQUAL);
4809       if (!UseSIMDForArrayEquals) {
4810         __ pop(spilled_regs, sp);
4811       }
4812     __ bind(NOT_EQUAL_NO_POP);
4813     __ leave();
4814     __ ret(lr);
4815     return entry;
4816   }
4817 
4818   address generate_dsin_dcos(bool isCos) {
4819     __ align(CodeEntryAlignment);
4820     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4821     address start = __ pc();
4822     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4823         (address)StubRoutines::aarch64::_two_over_pi,
4824         (address)StubRoutines::aarch64::_pio2,
4825         (address)StubRoutines::aarch64::_dsin_coef,
4826         (address)StubRoutines::aarch64::_dcos_coef);
4827     return start;
4828   }
4829 
4830   address generate_dlog() {
4831     __ align(CodeEntryAlignment);
4832     StubCodeMark mark(this, "StubRoutines", "dlog");
4833     address entry = __ pc();
4834     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4835         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4836     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4837     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4838         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4839     return entry;
4840   }
4841 
4842 
4843   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4844   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4845       Label &DIFF2) {
4846     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4847     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4848 
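         // vtmpZ is zero (cleared by the caller); zip1/zip2 with it interleave 0x00 bytes,
         // inflating the 16 Latin1 bytes in vtmp to 16 UTF-16 chars, which are then compared
         // 8 bytes (4 chars) at a time against the UTF-16 string.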
4849     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4850     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4851     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4852     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4853 
4854     __ fmovd(tmpL, vtmp3);
4855     __ eor(rscratch2, tmp3, tmpL);
4856     __ cbnz(rscratch2, DIFF2);
4857 
4858     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4859     __ umov(tmpL, vtmp3, __ D, 1);
4860     __ eor(rscratch2, tmpU, tmpL);
4861     __ cbnz(rscratch2, DIFF1);
4862 
4863     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4864     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4865     __ fmovd(tmpL, vtmp);
4866     __ eor(rscratch2, tmp3, tmpL);
4867     __ cbnz(rscratch2, DIFF2);
4868 
4869     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4870     __ umov(tmpL, vtmp, __ D, 1);
4871     __ eor(rscratch2, tmpU, tmpL);
4872     __ cbnz(rscratch2, DIFF1);
4873   }
4874 
4875   // r0  = result
4876   // r1  = str1
4877   // r2  = cnt1
4878   // r3  = str2
4879   // r4  = cnt2
4880   // r10 = tmp1
4881   // r11 = tmp2
4882   address generate_compare_long_string_different_encoding(bool isLU) {
4883     __ align(CodeEntryAlignment);
4884     StubCodeMark mark(this, "StubRoutines", isLU
4885         ? "compare_long_string_different_encoding LU"
4886         : "compare_long_string_different_encoding UL");
4887     address entry = __ pc();
4888     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4889         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4890         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4891     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4892         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4893     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4894     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4895 
4896     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
4897 
4898     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4899     // cnt2 == number of characters left to compare
4900     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
4901     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4902     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4903     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4904     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4905     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4906     __ eor(rscratch2, tmp1, tmp2);
4907     __ mov(rscratch1, tmp2);
4908     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4909     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4910              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4911     __ push(spilled_regs, sp);
4912     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4913     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4914 
4915     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4916 
4917     if (SoftwarePrefetchHintDistance >= 0) {
4918       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4919       __ br(__ LT, NO_PREFETCH);
4920       __ bind(LARGE_LOOP_PREFETCH);
4921         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4922         __ mov(tmp4, 2);
4923         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4924         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4925           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4926           __ subs(tmp4, tmp4, 1);
4927           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4928           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4929           __ mov(tmp4, 2);
4930         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4931           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4932           __ subs(tmp4, tmp4, 1);
4933           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4934           __ sub(cnt2, cnt2, 64);
4935           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4936           __ br(__ GE, LARGE_LOOP_PREFETCH);
4937     }
4938     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4939     __ bind(NO_PREFETCH);
4940     __ subs(cnt2, cnt2, 16);
4941     __ br(__ LT, TAIL);
4942     __ align(OptoLoopAlignment);
4943     __ bind(SMALL_LOOP); // smaller loop
4944       __ subs(cnt2, cnt2, 16);
4945       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4946       __ br(__ GE, SMALL_LOOP);
4947       __ cmn(cnt2, (u1)16);
4948       __ br(__ EQ, LOAD_LAST);
4949     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4950       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4951       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4952       __ ldr(tmp3, Address(cnt1, -8));
4953       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4954       __ b(LOAD_LAST);
4955     __ bind(DIFF2);
4956       __ mov(tmpU, tmp3);
4957     __ bind(DIFF1);
4958       __ pop(spilled_regs, sp);
4959       __ b(CALCULATE_DIFFERENCE);
4960     __ bind(LOAD_LAST);
4961       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4962       // No need to load them again
4963       __ mov(tmpU, tmp3);
4964       __ pop(spilled_regs, sp);
4965 
4966       // tmp2 points to the address of the last 4 Latin1 characters right now
4967       __ ldrs(vtmp, Address(tmp2));
4968       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4969       __ fmovd(tmpL, vtmp);
4970 
4971       __ eor(rscratch2, tmpU, tmpL);
4972       __ cbz(rscratch2, DONE);
4973 
4974     // Find the first different characters in the longwords and
4975     // compute their difference.
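         // rev + clz give the bit index of the lowest differing byte (the strings sit
         // little-endian in the registers); andr with -16 rounds that down to a 16-bit
         // char boundary, then both chars are shifted down, zero-extended with uxthw
         // and subtracted.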
4976     __ bind(CALCULATE_DIFFERENCE);
4977       __ rev(rscratch2, rscratch2);
4978       __ clz(rscratch2, rscratch2);
4979       __ andr(rscratch2, rscratch2, -16);
4980       __ lsrv(tmp1, tmp1, rscratch2);
4981       __ uxthw(tmp1, tmp1);
4982       __ lsrv(rscratch1, rscratch1, rscratch2);
4983       __ uxthw(rscratch1, rscratch1);
4984       __ subw(result, tmp1, rscratch1);
4985     __ bind(DONE);
4986       __ ret(lr);
4987     return entry;
4988   }
4989 
4990   address generate_method_entry_barrier() {
4991     __ align(CodeEntryAlignment);
4992     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
4993 
4994     Label deoptimize_label;
4995 
4996     address start = __ pc();
4997 
4998     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
4999 
5000     __ enter();
5001     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5002 
5003     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
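         // If the barrier requests deoptimization, these four slots supply the frame to
         // continue in; the deoptimize_label path below reloads sp, fp, lr and pc from them.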
5004 
5005     __ push_call_clobbered_registers();
5006 
5007     __ mov(c_rarg0, rscratch2);
5008     __ call_VM_leaf
5009          (CAST_FROM_FN_PTR
5010           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5011 
5012     __ reset_last_Java_frame(true);
5013 
5014     __ mov(rscratch1, r0);
5015 
5016     __ pop_call_clobbered_registers();
5017 
5018     __ cbnz(rscratch1, deoptimize_label);
5019 
5020     __ leave();
5021     __ ret(lr);
5022 
5023     __ BIND(deoptimize_label);
5024 
5025     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5026     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5027 
5028     __ mov(sp, rscratch1);
5029     __ br(rscratch2);
5030 
5031     return start;
5032   }
5033 
5034   // r0  = result
5035   // r1  = str1
5036   // r2  = cnt1
5037   // r3  = str2
5038   // r4  = cnt2
5039   // r10 = tmp1
5040   // r11 = tmp2
5041   address generate_compare_long_string_same_encoding(bool isLL) {
5042     __ align(CodeEntryAlignment);
5043     StubCodeMark mark(this, "StubRoutines", isLL
5044         ? "compare_long_string_same_encoding LL"
5045         : "compare_long_string_same_encoding UU");
5046     address entry = __ pc();
5047     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5048         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5049 
5050     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5051 
5052     // exit from the large loop when fewer than 64 bytes are left to read or we're
5053     // about to prefetch memory beyond the array border
5054     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5055 
5056     // the caller has already pre-loaded 8 bytes before jumping to the stub, so compare them directly
5057     __ eor(rscratch2, tmp1, tmp2);
5058     __ cbnz(rscratch2, CAL_DIFFERENCE);
5059 
5060     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5061     // update pointers, because of previous read
5062     __ add(str1, str1, wordSize);
5063     __ add(str2, str2, wordSize);
5064     if (SoftwarePrefetchHintDistance >= 0) {
5065       __ bind(LARGE_LOOP_PREFETCH);
5066         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5067         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5068 
5069         __ align(OptoLoopAlignment);
5070         for (int i = 0; i < 4; i++) {
5071           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5072           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5073           __ cmp(tmp1, tmp2);
5074           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5075           __ br(Assembler::NE, DIFF);
5076         }
5077         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5078         __ add(str1, str1, 64);
5079         __ add(str2, str2, 64);
5080         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5081         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5082         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5083     }
5084 
5085     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5086     __ br(Assembler::LE, LESS16);
5087     __ align(OptoLoopAlignment);
5088     __ bind(LOOP_COMPARE16);
5089       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5090       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5091       __ cmp(tmp1, tmp2);
5092       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5093       __ br(Assembler::NE, DIFF);
5094       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5095       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5096       __ br(Assembler::LT, LESS16);
5097 
5098       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5099       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5100       __ cmp(tmp1, tmp2);
5101       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5102       __ br(Assembler::NE, DIFF);
5103       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5104       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5105       __ br(Assembler::GE, LOOP_COMPARE16);
5106       __ cbz(cnt2, LENGTH_DIFF);
5107 
5108     __ bind(LESS16);
5109       // compare 8 bytes at a time
5110       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5111       __ br(Assembler::LE, LESS8);
5112       __ ldr(tmp1, Address(__ post(str1, 8)));
5113       __ ldr(tmp2, Address(__ post(str2, 8)));
5114       __ eor(rscratch2, tmp1, tmp2);
5115       __ cbnz(rscratch2, CAL_DIFFERENCE);
5116       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5117 
5118     __ bind(LESS8); // directly load last 8 bytes
5119       if (!isLL) {
5120         __ add(cnt2, cnt2, cnt2);
5121       }
5122       __ ldr(tmp1, Address(str1, cnt2));
5123       __ ldr(tmp2, Address(str2, cnt2));
5124       __ eor(rscratch2, tmp1, tmp2);
5125       __ cbz(rscratch2, LENGTH_DIFF);
5126       __ b(CAL_DIFFERENCE);
5127 
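         // DIFF: the 16-byte compare above found a mismatch; csel keeps the low 8-byte
         // halves if they differ (NE), otherwise the high halves, so CAL_DIFFERENCE works
         // on the differing pair.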
5128     __ bind(DIFF);
5129       __ cmp(tmp1, tmp2);
5130       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5131       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5132       // reuse rscratch2 register for the result of eor instruction
5133       __ eor(rscratch2, tmp1, tmp2);
5134 
5135     __ bind(CAL_DIFFERENCE);
5136       __ rev(rscratch2, rscratch2);
5137       __ clz(rscratch2, rscratch2);
5138       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5139       __ lsrv(tmp1, tmp1, rscratch2);
5140       __ lsrv(tmp2, tmp2, rscratch2);
5141       if (isLL) {
5142         __ uxtbw(tmp1, tmp1);
5143         __ uxtbw(tmp2, tmp2);
5144       } else {
5145         __ uxthw(tmp1, tmp1);
5146         __ uxthw(tmp2, tmp2);
5147       }
5148       __ subw(result, tmp1, tmp2);
5149 
5150     __ bind(LENGTH_DIFF);
5151       __ ret(lr);
5152     return entry;
5153   }
5154 
5155   void generate_compare_long_strings() {
5156       StubRoutines::aarch64::_compare_long_string_LL
5157           = generate_compare_long_string_same_encoding(true);
5158       StubRoutines::aarch64::_compare_long_string_UU
5159           = generate_compare_long_string_same_encoding(false);
5160       StubRoutines::aarch64::_compare_long_string_LU
5161           = generate_compare_long_string_different_encoding(true);
5162       StubRoutines::aarch64::_compare_long_string_UL
5163           = generate_compare_long_string_different_encoding(false);
5164   }
5165 
5166   // R0 = result
5167   // R1 = str2
5168   // R2 = cnt1
5169   // R3 = str1
5170   // R4 = cnt2
5171   // This generic linear code uses a few additional ideas that make it faster:
5172   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
5173   // in order to skip its initial loading (helps on systems with 1 load pipeline)
5174   // 2) we can use the "fast" single-character search algorithm to find the first
5175   // symbol with fewer branches (1 branch per loaded register instead of a branch
5176   // per symbol); this is where constants like
5177   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
5178   // 3) after the 1st register of the source string is loaded and analyzed, it can
5179   // be used to search for every occurrence of the 1st character, saving a few loads
5180   // compared with a "simpler-but-slower" implementation
5181   // 4) in order to avoid lots of push/pop operations, the code below heavily
5182   // re-uses/re-initializes/compresses register values, which makes the code
5183   // larger and a bit less readable; however, most of the extra operations are
5184   // issued during loads or branches, so the penalty is minimal
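       // The "fast" search in 2) is the classic SWAR zero-detection test: with
       // x = loaded_chunk ^ (first pattern character replicated into every byte/halfword),
       //   (x - 0x0101...01) & ~x & 0x8080...80
       // (or the 0x0001.../0x8000... variants for UTF-16) is nonzero iff some
       // byte/halfword of x is zero, i.e. iff the chunk contains the first pattern
       // character at that position.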
5185   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5186     const char* stubName = str1_isL
5187         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5188         : "indexof_linear_uu";
5189     __ align(CodeEntryAlignment);
5190     StubCodeMark mark(this, "StubRoutines", stubName);
5191     address entry = __ pc();
5192 
5193     int str1_chr_size = str1_isL ? 1 : 2;
5194     int str2_chr_size = str2_isL ? 1 : 2;
5195     int str1_chr_shift = str1_isL ? 0 : 1;
5196     int str2_chr_shift = str2_isL ? 0 : 1;
5197     bool isL = str1_isL && str2_isL;
5198     // parameters
5199     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5200     // temporary registers
5201     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5202     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5203     // redefinitions
5204     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5205 
5206     __ push(spilled_regs, sp);
5207     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5208         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5209         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5210         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5211         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5212         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5213     // Read whole register from str1. It is safe, because length >=8 here
5214     __ ldr(ch1, Address(str1));
5215     // Read whole register from str2. It is safe, because length >=8 here
5216     __ ldr(ch2, Address(str2));
5217     __ sub(cnt2, cnt2, cnt1);
5218     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5219     if (str1_isL != str2_isL) {
5220       __ eor(v0, __ T16B, v0, v0);
5221     }
5222     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5223     __ mul(first, first, tmp1);
5224     // check if we have less than 1 register to check
5225     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5226     if (str1_isL != str2_isL) {
5227       __ fmovd(v1, ch1);
5228     }
5229     __ br(__ LE, L_SMALL);
5230     __ eor(ch2, first, ch2);
5231     if (str1_isL != str2_isL) {
5232       __ zip1(v1, __ T16B, v1, v0);
5233     }
5234     __ sub(tmp2, ch2, tmp1);
5235     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5236     __ bics(tmp2, tmp2, ch2);
5237     if (str1_isL != str2_isL) {
5238       __ fmovd(ch1, v1);
5239     }
5240     __ br(__ NE, L_HAS_ZERO);
5241     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5242     __ add(result, result, wordSize/str2_chr_size);
5243     __ add(str2, str2, wordSize);
5244     __ br(__ LT, L_POST_LOOP);
5245     __ BIND(L_LOOP);
5246       __ ldr(ch2, Address(str2));
5247       __ eor(ch2, first, ch2);
5248       __ sub(tmp2, ch2, tmp1);
5249       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5250       __ bics(tmp2, tmp2, ch2);
5251       __ br(__ NE, L_HAS_ZERO);
5252     __ BIND(L_LOOP_PROCEED);
5253       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5254       __ add(str2, str2, wordSize);
5255       __ add(result, result, wordSize/str2_chr_size);
5256       __ br(__ GE, L_LOOP);
5257     __ BIND(L_POST_LOOP);
5258       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5259       __ br(__ LE, NOMATCH);
5260       __ ldr(ch2, Address(str2));
5261       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5262       __ eor(ch2, first, ch2);
5263       __ sub(tmp2, ch2, tmp1);
5264       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5265       __ mov(tmp4, -1); // all bits set
5266       __ b(L_SMALL_PROCEED);
5267     __ align(OptoLoopAlignment);
5268     __ BIND(L_SMALL);
5269       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5270       __ eor(ch2, first, ch2);
5271       if (str1_isL != str2_isL) {
5272         __ zip1(v1, __ T16B, v1, v0);
5273       }
5274       __ sub(tmp2, ch2, tmp1);
5275       __ mov(tmp4, -1); // all bits set
5276       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5277       if (str1_isL != str2_isL) {
5278         __ fmovd(ch1, v1); // move converted 4 symbols
5279       }
5280     __ BIND(L_SMALL_PROCEED);
5281       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5282       __ bic(tmp2, tmp2, ch2);
5283       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5284       __ rbit(tmp2, tmp2);
5285       __ br(__ EQ, NOMATCH);
5286     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5287       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5288       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5289       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5290       if (str2_isL) { // LL
5291         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5292         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5293         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5294         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5295         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5296       } else {
5297         __ mov(ch2, 0xE); // all bits in byte set except last one
5298         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5299         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5300         __ lslv(tmp2, tmp2, tmp4);
5301         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5302         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5303         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5304         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5305       }
5306       __ cmp(ch1, ch2);
5307       __ mov(tmp4, wordSize/str2_chr_size);
5308       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5309     __ BIND(L_SMALL_CMP_LOOP);
5310       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5311                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5312       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5313                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5314       __ add(tmp4, tmp4, 1);
5315       __ cmp(tmp4, cnt1);
5316       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5317       __ cmp(first, ch2);
5318       __ br(__ EQ, L_SMALL_CMP_LOOP);
5319     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5320       __ cbz(tmp2, NOMATCH); // no more matches. exit
5321       __ clz(tmp4, tmp2);
5322       __ add(result, result, 1); // advance index
5323       __ add(str2, str2, str2_chr_size); // advance pointer
5324       __ b(L_SMALL_HAS_ZERO_LOOP);
5325     __ align(OptoLoopAlignment);
5326     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5327       __ cmp(first, ch2);
5328       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5329       __ b(DONE);
5330     __ align(OptoLoopAlignment);
5331     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5332       if (str2_isL) { // LL
5333         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5334         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5335         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5336         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5337         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5338       } else {
5339         __ mov(ch2, 0xE); // all bits in byte set except last one
5340         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5341         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5342         __ lslv(tmp2, tmp2, tmp4);
5343         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5344         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5345         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5346         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5347       }
5348       __ cmp(ch1, ch2);
5349       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5350       __ b(DONE);
5351     __ align(OptoLoopAlignment);
5352     __ BIND(L_HAS_ZERO);
5353       __ rbit(tmp2, tmp2);
5354       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5355       // Now, compress the counters (cnt2 and cnt1) into one register. This is fine
5356       // because both counters are 32-bit and are not changed in this loop; they are
5357       // restored on exit. So, cnt1 can be re-used in this loop.
5358       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5359       __ sub(result, result, 1);
5360     __ BIND(L_HAS_ZERO_LOOP);
5361       __ mov(cnt1, wordSize/str2_chr_size);
5362       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5363       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5364       if (str2_isL) {
5365         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5366         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5367         __ lslv(tmp2, tmp2, tmp4);
5368         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5369         __ add(tmp4, tmp4, 1);
5370         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5371         __ lsl(tmp2, tmp2, 1);
5372         __ mov(tmp4, wordSize/str2_chr_size);
5373       } else {
5374         __ mov(ch2, 0xE);
5375         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5376         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5377         __ lslv(tmp2, tmp2, tmp4);
5378         __ add(tmp4, tmp4, 1);
5379         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5380         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5381         __ lsl(tmp2, tmp2, 1);
5382         __ mov(tmp4, wordSize/str2_chr_size);
5383         __ sub(str2, str2, str2_chr_size);
5384       }
5385       __ cmp(ch1, ch2);
5386       __ mov(tmp4, wordSize/str2_chr_size);
5387       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5388     __ BIND(L_CMP_LOOP);
5389       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5390                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5391       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5392                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5393       __ add(tmp4, tmp4, 1);
5394       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5395       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5396       __ cmp(cnt1, ch2);
5397       __ br(__ EQ, L_CMP_LOOP);
5398     __ BIND(L_CMP_LOOP_NOMATCH);
5399       // no match at this position
5400       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5401       __ clz(tmp4, tmp2);
5402       __ add(str2, str2, str2_chr_size); // advance pointer
5403       __ b(L_HAS_ZERO_LOOP);
5404     __ align(OptoLoopAlignment);
5405     __ BIND(L_CMP_LOOP_LAST_CMP);
5406       __ cmp(cnt1, ch2);
5407       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5408       __ b(DONE);
5409     __ align(OptoLoopAlignment);
5410     __ BIND(L_CMP_LOOP_LAST_CMP2);
5411       if (str2_isL) {
5412         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5413         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5414         __ lslv(tmp2, tmp2, tmp4);
5415         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5416         __ add(tmp4, tmp4, 1);
5417         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5418         __ lsl(tmp2, tmp2, 1);
5419       } else {
5420         __ mov(ch2, 0xE);
5421         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5422         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5423         __ lslv(tmp2, tmp2, tmp4);
5424         __ add(tmp4, tmp4, 1);
5425         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5426         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5427         __ lsl(tmp2, tmp2, 1);
5428         __ sub(str2, str2, str2_chr_size);
5429       }
5430       __ cmp(ch1, ch2);
5431       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5432       __ b(DONE);
5433     __ align(OptoLoopAlignment);
5434     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5435       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until
5436       // the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
5437       // so result was increased by at most wordSize/str2_chr_size - 1 and the
5438       // respective higher bits were not changed. L_LOOP_PROCEED will increase
5439       // result by the number of analyzed characters, so we can just reset the lower
5440       // bits of result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
5441       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
5442       // 3) Advance str2 to represent the next str2 octet. result & 7 (or & 3) is the
5443       // index of the last analyzed substring inside the current octet, so str2 is at
5444       // the respective start address. We need to advance it to the next octet.
5445       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5446       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5447       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5448       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5449       __ movw(cnt2, cnt2);
5450       __ b(L_LOOP_PROCEED);
5451     __ align(OptoLoopAlignment);
5452     __ BIND(NOMATCH);
5453       __ mov(result, -1);
5454     __ BIND(DONE);
5455       __ pop(spilled_regs, sp);
5456       __ ret(lr);
5457     return entry;
5458   }
5459 
5460   void generate_string_indexof_stubs() {
5461     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5462     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5463     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5464   }
5465 
5466   void inflate_and_store_2_fp_registers(bool generatePrfm,
5467       FloatRegister src1, FloatRegister src2) {
5468     Register dst = r1;
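         // zip1/zip2 with the zero register v0 interleave a 0x00 byte after each source
         // byte, inflating 8-bit Latin1 chars to 16-bit UTF-16 chars.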
5469     __ zip1(v1, __ T16B, src1, v0);
5470     __ zip2(v2, __ T16B, src1, v0);
5471     if (generatePrfm) {
5472       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5473     }
5474     __ zip1(v3, __ T16B, src2, v0);
5475     __ zip2(v4, __ T16B, src2, v0);
5476     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5477   }
5478 
5479   // R0 = src
5480   // R1 = dst
5481   // R2 = len
5482   // R3 = len >> 3
5483   // V0 = 0
5484   // v1 = loaded 8 bytes
5485   address generate_large_byte_array_inflate() {
5486     __ align(CodeEntryAlignment);
5487     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5488     address entry = __ pc();
5489     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5490     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5491     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5492 
5493     // do one more 8-byte read so that the address is 16-byte aligned in most cases,
5494     // which also lets us use a single store instruction
5495     __ ldrd(v2, __ post(src, 8));
5496     __ sub(octetCounter, octetCounter, 2);
5497     __ zip1(v1, __ T16B, v1, v0);
5498     __ zip1(v2, __ T16B, v2, v0);
5499     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5500     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5501     __ subs(rscratch1, octetCounter, large_loop_threshold);
5502     __ br(__ LE, LOOP_START);
5503     __ b(LOOP_PRFM_START);
5504     __ bind(LOOP_PRFM);
5505       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5506     __ bind(LOOP_PRFM_START);
5507       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5508       __ sub(octetCounter, octetCounter, 8);
5509       __ subs(rscratch1, octetCounter, large_loop_threshold);
5510       inflate_and_store_2_fp_registers(true, v3, v4);
5511       inflate_and_store_2_fp_registers(true, v5, v6);
5512       __ br(__ GT, LOOP_PRFM);
5513       __ cmp(octetCounter, (u1)8);
5514       __ br(__ LT, DONE);
5515     __ bind(LOOP);
5516       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5517       __ bind(LOOP_START);
5518       __ sub(octetCounter, octetCounter, 8);
5519       __ cmp(octetCounter, (u1)8);
5520       inflate_and_store_2_fp_registers(false, v3, v4);
5521       inflate_and_store_2_fp_registers(false, v5, v6);
5522       __ br(__ GE, LOOP);
5523     __ bind(DONE);
5524       __ ret(lr);
5525     return entry;
5526   }
5527 
5528   /**
5529    *  Arguments:
5530    *
5531    *  Input:
5532    *  c_rarg0   - current state address
5533    *  c_rarg1   - H key address
5534    *  c_rarg2   - data address
5535    *  c_rarg3   - number of blocks
5536    *
5537    *  Output:
5538    *  Updated state at c_rarg0
5539    */
5540   address generate_ghash_processBlocks() {
5541     // Bafflingly, GCM uses little-endian for the byte order, but
5542     // big-endian for the bit order.  For example, the polynomial 1 is
5543     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5544     //
5545     // So, we must either reverse the bytes in each word and do
5546     // everything big-endian or reverse the bits in each byte and do
5547     // it little-endian.  On AArch64 it's more idiomatic to reverse
5548     // the bits in each byte (we have an instruction, RBIT, to do
5549     // that) and keep the data in little-endian bit order throughout the
5550     // calculation, bit-reversing the inputs and outputs.
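         // (For example, the vector RBIT instruction maps the byte 0x01 to 0x80 and 0x80 to 0x01.)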
5551 
5552     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5553     __ align(wordSize * 2);
5554     address p = __ pc();
5555     __ emit_int64(0x87);  // The low-order bits of the field
5556                           // polynomial (i.e. p = z^7+z^2+z+1)
5557                           // repeated in the low and high parts of a
5558                           // 128-bit vector
5559     __ emit_int64(0x87);
5560 
5561     __ align(CodeEntryAlignment);
5562     address start = __ pc();
5563 
5564     Register state   = c_rarg0;
5565     Register subkeyH = c_rarg1;
5566     Register data    = c_rarg2;
5567     Register blocks  = c_rarg3;
5568 
5569     FloatRegister vzr = v30;
5570     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5571 
5572     __ ldrq(v24, p);    // The field polynomial
5573 
5574     __ ldrq(v0, Address(state));
5575     __ ldrq(v1, Address(subkeyH));
5576 
5577     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5578     __ rbit(v0, __ T16B, v0);
5579     __ rev64(v1, __ T16B, v1);
5580     __ rbit(v1, __ T16B, v1);
5581 
5582     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
5583     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
5584 
5585     {
5586       Label L_ghash_loop;
5587       __ bind(L_ghash_loop);
5588 
5589       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5590                                                  // reversing each byte
5591       __ rbit(v2, __ T16B, v2);
5592       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5593 
5594       // Multiply state in v2 by subkey in v1
5595       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5596                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
5597                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
5598       // Reduce v7:v5 by the field polynomial
5599       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
5600 
5601       __ sub(blocks, blocks, 1);
5602       __ cbnz(blocks, L_ghash_loop);
5603     }
5604 
5605     // The bit-reversed result is at this point in v0
5606     __ rev64(v0, __ T16B, v0);
5607     __ rbit(v0, __ T16B, v0);
5608 
5609     __ st1(v0, __ T16B, state);
5610     __ ret(lr);
5611 
5612     return start;
5613   }
5614 
5615   address generate_ghash_processBlocks_wide() {
5616     address small = generate_ghash_processBlocks();
5617 
5618     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
5619     __ align(wordSize * 2);
5620     address p = __ pc();
5621     __ emit_int64(0x87);  // The low-order bits of the field
5622                           // polynomial (i.e. p = z^7+z^2+z+1)
5623                           // repeated in the low and high parts of a
5624                           // 128-bit vector
5625     __ emit_int64(0x87);
5626 
5627     __ align(CodeEntryAlignment);
5628     address start = __ pc();
5629 
5630     Register state   = c_rarg0;
5631     Register subkeyH = c_rarg1;
5632     Register data    = c_rarg2;
5633     Register blocks  = c_rarg3;
5634 
5635     const int unroll = 4;
5636 
5637     __ cmp(blocks, (unsigned char)(unroll * 2));
5638     __ br(__ LT, small);
5639 
    if (unroll > 1) {
      // Save the callee-saved SIMD registers (v8-v15) before they are
      // clobbered by the unrolled routine.
5642       __ sub(sp, sp, 4 * 16);
5643       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
5644       __ sub(sp, sp, 4 * 16);
5645       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
5646     }
5647 
5648     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
5649 
5650     if (unroll > 1) {
      // Restore the callee-saved SIMD registers
5652       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
5653       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
5654     }
5655 
5656     __ cmp(blocks, (unsigned char)0);
5657     __ br(__ GT, small);
5658 
5659     __ ret(lr);
5660 
5661     return start;
5662   }
5663 
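  // One SIMD round of Base64 encoding.  LD3 de-interleaves 3 * size input
  // bytes into three vectors, the shift/or sequence below splits every
  // 3-byte group into four 6-bit indices, TBL maps each index through the
  // 64-byte codec table (held in four vector registers), and ST4
  // interleaves the four output vectors.  Per 3-byte group the effect is,
  // in C, approximately (a sketch, not the vector code itself):
  //
  //   // uint32_t v = (b0 << 16) | (b1 << 8) | b2;
  //   // out[0] = table[(v >> 18) & 0x3f];
  //   // out[1] = table[(v >> 12) & 0x3f];
  //   // out[2] = table[(v >>  6) & 0x3f];
  //   // out[3] = table[ v        & 0x3f];
  //
  // where table is toBase64 or toBase64URL.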
5664   void generate_base64_encode_simdround(Register src, Register dst,
5665         FloatRegister codec, u8 size) {
5666 
5667     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
5668     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5669     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5670 
5671     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5672 
5673     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
5674 
5675     __ ushr(ind0, arrangement, in0,  2);
5676 
5677     __ ushr(ind1, arrangement, in1,  2);
5678     __ shl(in0,   arrangement, in0,  6);
5679     __ orr(ind1,  arrangement, ind1, in0);
5680     __ ushr(ind1, arrangement, ind1, 2);
5681 
5682     __ ushr(ind2, arrangement, in2,  4);
5683     __ shl(in1,   arrangement, in1,  4);
5684     __ orr(ind2,  arrangement, in1,  ind2);
5685     __ ushr(ind2, arrangement, ind2, 2);
5686 
5687     __ shl(ind3,  arrangement, in2,  2);
5688     __ ushr(ind3, arrangement, ind3, 2);
5689 
5690     __ tbl(out0,  arrangement, codec,  4, ind0);
5691     __ tbl(out1,  arrangement, codec,  4, ind1);
5692     __ tbl(out2,  arrangement, codec,  4, ind2);
5693     __ tbl(out3,  arrangement, codec,  4, ind3);
5694 
5695     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
5696   }
5697 
5698    /**
5699    *  Arguments:
5700    *
5701    *  Input:
5702    *  c_rarg0   - src_start
5703    *  c_rarg1   - src_offset
5704    *  c_rarg2   - src_length
5705    *  c_rarg3   - dest_start
5706    *  c_rarg4   - dest_offset
5707    *  c_rarg5   - isURL
5708    *
5709    */
5710   address generate_base64_encodeBlock() {
5711 
5712     static const char toBase64[64] = {
5713       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5714       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5715       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5716       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5717       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
5718     };
5719 
5720     static const char toBase64URL[64] = {
5721       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
5722       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
5723       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
5724       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
5725       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
5726     };
5727 
5728     __ align(CodeEntryAlignment);
5729     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
5730     address start = __ pc();
5731 
5732     Register src   = c_rarg0;  // source array
5733     Register soff  = c_rarg1;  // source start offset
5734     Register send  = c_rarg2;  // source end offset
5735     Register dst   = c_rarg3;  // dest array
5736     Register doff  = c_rarg4;  // position for writing to dest array
    Register isURL = c_rarg5;  // Base64 or URL character set
5738 
5739     // c_rarg6 and c_rarg7 are free to use as temps
5740     Register codec  = c_rarg6;
5741     Register length = c_rarg7;
5742 
5743     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
5744 
5745     __ add(src, src, soff);
5746     __ add(dst, dst, doff);
5747     __ sub(length, send, soff);
5748 
5749     // load the codec base address
5750     __ lea(codec, ExternalAddress((address) toBase64));
5751     __ cbz(isURL, ProcessData);
5752     __ lea(codec, ExternalAddress((address) toBase64URL));
5753 
5754     __ BIND(ProcessData);
5755 
    // too short to be worth a SIMD loop; fall back to the scalar path
5757     __ cmp(length, (u1)24);
5758     __ br(Assembler::LT, Process3B);
5759 
5760     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
5761 
5762     __ BIND(Process48B);
5763     __ cmp(length, (u1)48);
5764     __ br(Assembler::LT, Process24B);
5765     generate_base64_encode_simdround(src, dst, v0, 16);
5766     __ sub(length, length, 48);
5767     __ b(Process48B);
5768 
5769     __ BIND(Process24B);
5770     __ cmp(length, (u1)24);
5771     __ br(Assembler::LT, SIMDExit);
5772     generate_base64_encode_simdround(src, dst, v0, 8);
5773     __ sub(length, length, 24);
5774 
5775     __ BIND(SIMDExit);
5776     __ cbz(length, Exit);
5777 
5778     __ BIND(Process3B);
5779     //  3 src bytes, 24 bits
5780     __ ldrb(r10, __ post(src, 1));
5781     __ ldrb(r11, __ post(src, 1));
5782     __ ldrb(r12, __ post(src, 1));
5783     __ orrw(r11, r11, r10, Assembler::LSL, 8);
5784     __ orrw(r12, r12, r11, Assembler::LSL, 8);
5785     // codec index
5786     __ ubfmw(r15, r12, 18, 23);
5787     __ ubfmw(r14, r12, 12, 17);
5788     __ ubfmw(r13, r12, 6,  11);
5789     __ andw(r12,  r12, 63);
5790     // get the code based on the codec
5791     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
5792     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
5793     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
5794     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
5795     __ strb(r15, __ post(dst, 1));
5796     __ strb(r14, __ post(dst, 1));
5797     __ strb(r13, __ post(dst, 1));
5798     __ strb(r12, __ post(dst, 1));
5799     __ sub(length, length, 3);
5800     __ cbnz(length, Process3B);
5801 
5802     __ BIND(Exit);
5803     __ ret(lr);
5804 
5805     return start;
5806   }
5807 
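  // One SIMD round of Base64 decoding.  LD4 de-interleaves 4 * size input
  // characters into four vectors, each character is decoded with the
  // two-table tbl/tbx lookup described in generate_base64_decodeBlock()
  // below, CMHI against 63 flags any illegal character, and the decoded
  // 6-bit values are recombined into 3 * size output bytes.  Per 4-character
  // group the recombination is, in C, approximately (a sketch):
  //
  //   // uint8_t d0, d1, d2, d3;            // decoded 6-bit values
  //   // out[0] = (d0 << 2) | (d1 >> 4);
  //   // out[1] = (d1 << 4) | (d2 >> 2);
  //   // out[2] = (d2 << 6) |  d3;
  //
  // If an illegal character is found, the bytes of the groups preceding
  // the offending one are stored and control branches to Exit.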
5808   void generate_base64_decode_simdround(Register src, Register dst,
5809         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
5810 
5811     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
5812     FloatRegister out0 = v20, out1 = v21, out2 = v22;
5813 
5814     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
5815     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
5816 
5817     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
5818 
5819     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5820 
5821     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
5822 
    // We need an unsigned saturating subtract so that every input value
    // in the range [0, 63] yields index 0 for the higher-half lookup.
5825     __ uqsubv(decH0, __ T16B, in0, v27);
5826     __ uqsubv(decH1, __ T16B, in1, v27);
5827     __ uqsubv(decH2, __ T16B, in2, v27);
5828     __ uqsubv(decH3, __ T16B, in3, v27);
5829 
5830     // lower half lookup
5831     __ tbl(decL0, arrangement, codecL, 4, in0);
5832     __ tbl(decL1, arrangement, codecL, 4, in1);
5833     __ tbl(decL2, arrangement, codecL, 4, in2);
5834     __ tbl(decL3, arrangement, codecL, 4, in3);
5835 
5836     // higher half lookup
5837     __ tbx(decH0, arrangement, codecH, 4, decH0);
5838     __ tbx(decH1, arrangement, codecH, 4, decH1);
5839     __ tbx(decH2, arrangement, codecH, 4, decH2);
5840     __ tbx(decH3, arrangement, codecH, 4, decH3);
5841 
5842     // combine lower and higher
5843     __ orr(decL0, arrangement, decL0, decH0);
5844     __ orr(decL1, arrangement, decL1, decH1);
5845     __ orr(decL2, arrangement, decL2, decH2);
5846     __ orr(decL3, arrangement, decL3, decH3);
5847 
    // check for illegal inputs: values larger than 63 (the maximum of 6 bits)
5849     __ cmhi(decH0, arrangement, decL0, v27);
5850     __ cmhi(decH1, arrangement, decL1, v27);
5851     __ cmhi(decH2, arrangement, decL2, v27);
5852     __ cmhi(decH3, arrangement, decL3, v27);
5853     __ orr(in0, arrangement, decH0, decH1);
5854     __ orr(in1, arrangement, decH2, decH3);
5855     __ orr(in2, arrangement, in0,   in1);
5856     __ umaxv(in3, arrangement, in2);
5857     __ umov(rscratch2, in3, __ B, 0);
5858 
5859     // get the data to output
5860     __ shl(out0,  arrangement, decL0, 2);
5861     __ ushr(out1, arrangement, decL1, 4);
5862     __ orr(out0,  arrangement, out0,  out1);
5863     __ shl(out1,  arrangement, decL1, 4);
5864     __ ushr(out2, arrangement, decL2, 2);
5865     __ orr(out1,  arrangement, out1,  out2);
5866     __ shl(out2,  arrangement, decL2, 6);
5867     __ orr(out2,  arrangement, out2,  decL3);
5868 
5869     __ cbz(rscratch2, NoIllegalData);
5870 
5871     // handle illegal input
5872     __ umov(r10, in2, __ D, 0);
5873     if (size == 16) {
5874       __ cbnz(r10, ErrorInLowerHalf);
5875 
5876       // illegal input is in higher half, store the lower half now.
5877       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
5878 
5879       __ umov(r10, in2,  __ D, 1);
5880       __ umov(r11, out0, __ D, 1);
5881       __ umov(r12, out1, __ D, 1);
5882       __ umov(r13, out2, __ D, 1);
5883       __ b(StoreLegalData);
5884 
5885       __ BIND(ErrorInLowerHalf);
5886     }
5887     __ umov(r11, out0, __ D, 0);
5888     __ umov(r12, out1, __ D, 0);
5889     __ umov(r13, out2, __ D, 0);
5890 
5891     __ BIND(StoreLegalData);
5892     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
5893     __ strb(r11, __ post(dst, 1));
5894     __ strb(r12, __ post(dst, 1));
5895     __ strb(r13, __ post(dst, 1));
5896     __ lsr(r10, r10, 8);
5897     __ lsr(r11, r11, 8);
5898     __ lsr(r12, r12, 8);
5899     __ lsr(r13, r13, 8);
5900     __ b(StoreLegalData);
5901 
5902     __ BIND(NoIllegalData);
5903     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
5904   }
5905 
5906 
5907    /**
5908    *  Arguments:
5909    *
5910    *  Input:
5911    *  c_rarg0   - src_start
5912    *  c_rarg1   - src_offset
5913    *  c_rarg2   - src_length
5914    *  c_rarg3   - dest_start
5915    *  c_rarg4   - dest_offset
5916    *  c_rarg5   - isURL
5917    *  c_rarg6   - isMIME
5918    *
5919    */
5920   address generate_base64_decodeBlock() {
5921 
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the
    // section titled "Base64 decoding".
5925 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the padding character '=' is also treated as an
    // illegal value in this intrinsic: java.util.Base64.fromBase64['='] == -2, while
    // fromBase(URL)64ForNoSIMD['='] == 255 here.
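    //
    // For example (a spot check against the table below):
    //   fromBase64ForNoSIMD['A'] == 0, ['Z'] == 25, ['a'] == 26, ['0'] == 52,
    //   ['+'] == 62, ['/'] == 63, and ['='] == 255.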
5929     static const uint8_t fromBase64ForNoSIMD[256] = {
5930       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5931       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5932       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5933        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5934       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5935        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
5936       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5937        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5938       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5939       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5940       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5941       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5942       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5943       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5944       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5945       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5946     };
5947 
5948     static const uint8_t fromBase64URLForNoSIMD[256] = {
5949       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5950       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5951       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5952        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5953       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
5954        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
5955       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
5956        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
5957       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5958       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5959       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5960       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5961       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5962       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5963       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5964       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5965     };
5966 
    // A legal Base64 code value is in the range [0, 127].  We need two lookups
    // with tbl/tbx and combine them to get the decoded data. The 1st table vector
    // lookup uses tbl: out-of-range indices are set to 0 in the destination. The 2nd
    // table vector lookup uses tbx: out-of-range indices are left unchanged in the
    // destination. Input [64, 126] is mapped to index [65, 127] in the second lookup.
    // The value at index 64 is set to 0, so that we know the decoded data was
    // already obtained by the 1st lookup.
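    //
    // Per character c the net effect is approximately (a scalar sketch;
    // tableLo/tableHi denote the two 64-byte halves of the table below):
    //
    //   // uint8_t lo = (c < 64)              ? tableLo[c]      : 0;  // TBL: out-of-range -> 0
    //   // uint8_t hi = (c >= 64 && c <= 126) ? tableHi[c - 63] : 0;  // index from UQSUB(c, 63)
    //   // uint8_t d  = lo | hi;              // d > 63 (e.g. 255u) flags illegal input
    //
    // (c == 127 produces an out-of-range tbx index and is flagged as
    // illegal as well.)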
5974     static const uint8_t fromBase64ForSIMD[128] = {
5975       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5976       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5977       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
5978        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5979         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
5980        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
5981       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
5982        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
5983     };
5984 
5985     static const uint8_t fromBase64URLForSIMD[128] = {
5986       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5987       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
5988       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
5989        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
5990         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
5991        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
5992        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
5993        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
5994     };
5995 
5996     __ align(CodeEntryAlignment);
5997     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
5998     address start = __ pc();
5999 
6000     Register src    = c_rarg0;  // source array
6001     Register soff   = c_rarg1;  // source start offset
6002     Register send   = c_rarg2;  // source end offset
6003     Register dst    = c_rarg3;  // dest array
6004     Register doff   = c_rarg4;  // position for writing to dest array
6005     Register isURL  = c_rarg5;  // Base64 or URL character set
6006     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6007 
6008     Register length = send;    // reuse send as length of source data to process
6009 
6010     Register simd_codec   = c_rarg6;
6011     Register nosimd_codec = c_rarg7;
6012 
6013     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6014 
6015     __ enter();
6016 
6017     __ add(src, src, soff);
6018     __ add(dst, dst, doff);
6019 
6020     __ mov(doff, dst);
6021 
6022     __ sub(length, send, soff);
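    // Clear the low two bits of length, i.e. round it down to a multiple of 4.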
6023     __ bfm(length, zr, 0, 1);
6024 
6025     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6026     __ cbz(isURL, ProcessData);
6027     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6028 
6029     __ BIND(ProcessData);
6030     __ mov(rscratch1, length);
6031     __ cmp(length, (u1)144); // 144 = 80 + 64
6032     __ br(Assembler::LT, Process4B);
6033 
6034     // In the MIME case, the line length cannot be more than 76
6035     // bytes (see RFC 2045). This is too short a block for SIMD
6036     // to be worthwhile, so we use non-SIMD here.
6037     __ movw(rscratch1, 79);
6038 
6039     __ BIND(Process4B);
6040     __ ldrw(r14, __ post(src, 4));
6041     __ ubfxw(r10, r14, 0,  8);
6042     __ ubfxw(r11, r14, 8,  8);
6043     __ ubfxw(r12, r14, 16, 8);
6044     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
6046     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6047     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6048     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6049     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6050     // error detection, 255u indicates an illegal input
6051     __ orrw(r14, r10, r11);
6052     __ orrw(r15, r12, r13);
6053     __ orrw(r14, r14, r15);
6054     __ tbnz(r14, 7, Exit);
6055     // recover the data
6056     __ lslw(r14, r10, 10);
6057     __ bfiw(r14, r11, 4, 6);
6058     __ bfmw(r14, r12, 2, 5);
6059     __ rev16w(r14, r14);
6060     __ bfiw(r13, r12, 6, 2);
6061     __ strh(r14, __ post(dst, 2));
6062     __ strb(r13, __ post(dst, 1));
6063     // non-simd loop
6064     __ subsw(rscratch1, rscratch1, 4);
6065     __ br(Assembler::GT, Process4B);
6066 
    // if exiting from the 80-byte pre-processing pass above (rscratch1 was
    // seeded with 79), rscratch1 == -1; otherwise, rscratch1 == 0.
6069     __ cbzw(rscratch1, Exit);
6070     __ sub(length, length, 80);
6071 
6072     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6073     __ cbz(isURL, SIMDEnter);
6074     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6075 
6076     __ BIND(SIMDEnter);
6077     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6078     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6079     __ mov(rscratch1, 63);
6080     __ dup(v27, __ T16B, rscratch1);
6081 
6082     __ BIND(Process64B);
6083     __ cmp(length, (u1)64);
6084     __ br(Assembler::LT, Process32B);
6085     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6086     __ sub(length, length, 64);
6087     __ b(Process64B);
6088 
6089     __ BIND(Process32B);
6090     __ cmp(length, (u1)32);
6091     __ br(Assembler::LT, SIMDExit);
6092     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6093     __ sub(length, length, 32);
6094     __ b(Process32B);
6095 
6096     __ BIND(SIMDExit);
6097     __ cbz(length, Exit);
6098     __ movw(rscratch1, length);
6099     __ b(Process4B);
6100 
6101     __ BIND(Exit);
6102     __ sub(c_rarg0, dst, doff);
6103 
6104     __ leave();
6105     __ ret(lr);
6106 
6107     return start;
6108   }
6109 
6110 #ifdef LINUX
6111 
6112   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6113   //
6114   // If LSE is in use, generate LSE versions of all the stubs. The
6115   // non-LSE versions are in atomic_aarch64.S.
6116 
6117   // class AtomicStubMark records the entry point of a stub and the
6118   // stub pointer which will point to it. The stub pointer is set to
6119   // the entry point when ~AtomicStubMark() is called, which must be
6120   // after ICache::invalidate_range. This ensures safe publication of
6121   // the generated code.
6122   class AtomicStubMark {
6123     address _entry_point;
6124     aarch64_atomic_stub_t *_stub;
6125     MacroAssembler *_masm;
6126   public:
6127     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6128       _masm = masm;
6129       __ align(32);
6130       _entry_point = __ pc();
6131       _stub = stub;
6132     }
6133     ~AtomicStubMark() {
6134       *_stub = (aarch64_atomic_stub_t)_entry_point;
6135     }
6136   };
6137 
6138   // NB: For memory_order_conservative we need a trailing membar after
6139   // LSE atomic operations but not a leading membar.
6140   //
6141   // We don't need a leading membar because a clause in the Arm ARM
6142   // says:
6143   //
6144   //   Barrier-ordered-before
6145   //
6146   //   Barrier instructions order prior Memory effects before subsequent
6147   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6151   //   instruction with both Acquire and Release semantics.
6152   //
6153   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6154   // and Release semantics, therefore we don't need a leading
6155   // barrier. However, there is no corresponding Barrier-ordered-after
6156   // relationship, therefore we need a trailing membar to prevent a
6157   // later store or load from being reordered with the store in an
6158   // atomic instruction.
6159   //
6160   // This was checked by using the herd7 consistency model simulator
6161   // (http://diy.inria.fr/) with this test case:
6162   //
6163   // AArch64 LseCas
6164   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6165   // P0 | P1;
6166   // LDR W4, [X2] | MOV W3, #0;
6167   // DMB LD       | MOV W4, #1;
6168   // LDR W3, [X1] | CASAL W3, W4, [X1];
6169   //              | DMB ISH;
6170   //              | STR W4, [X2];
6171   // exists
6172   // (0:X3=0 /\ 0:X4=1)
6173   //
6174   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6175   // with the store to x in P1. Without the DMB in P1 this may happen.
6176   //
6177   // At the time of writing we don't know of any AArch64 hardware that
6178   // reorders stores in this way, but the Reference Manual permits it.
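  //
  // Schematically, the memory_order_conservative stubs below therefore emit
  //
  //   casal / ldaddal / swpal ...   ; Acquire+Release LSE atomic
  //   dmb ish                       ; trailing barrier (the membar() calls)
  //
  // while the relaxed variants use the plain cas / ldadd instructions and
  // no trailing barrier.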
6179 
6180   void gen_cas_entry(Assembler::operand_size size,
6181                      atomic_memory_order order) {
6182     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6183       exchange_val = c_rarg2;
6184     bool acquire, release;
6185     switch (order) {
6186       case memory_order_relaxed:
6187         acquire = false;
6188         release = false;
6189         break;
6190       case memory_order_release:
6191         acquire = false;
6192         release = true;
6193         break;
6194       default:
6195         acquire = true;
6196         release = true;
6197         break;
6198     }
6199     __ mov(prev, compare_val);
6200     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6201     if (order == memory_order_conservative) {
6202       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6203     }
6204     if (size == Assembler::xword) {
6205       __ mov(r0, prev);
6206     } else {
6207       __ movw(r0, prev);
6208     }
6209     __ ret(lr);
6210   }
6211 
6212   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6213     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6214     // If not relaxed, then default to conservative.  Relaxed is the only
6215     // case we use enough to be worth specializing.
6216     if (order == memory_order_relaxed) {
6217       __ ldadd(size, incr, prev, addr);
6218     } else {
6219       __ ldaddal(size, incr, prev, addr);
6220       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6221     }
6222     if (size == Assembler::xword) {
6223       __ mov(r0, prev);
6224     } else {
6225       __ movw(r0, prev);
6226     }
6227     __ ret(lr);
6228   }
6229 
6230   void gen_swpal_entry(Assembler::operand_size size) {
6231     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6232     __ swpal(size, incr, prev, addr);
6233     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6234     if (size == Assembler::xword) {
6235       __ mov(r0, prev);
6236     } else {
6237       __ movw(r0, prev);
6238     }
6239     __ ret(lr);
6240   }
6241 
6242   void generate_atomic_entry_points() {
6243     if (! UseLSE) {
6244       return;
6245     }
6246 
6247     __ align(CodeEntryAlignment);
6248     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6249     address first_entry = __ pc();
6250 
6251     // ADD, memory_order_conservative
6252     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6253     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6254     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6255     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6256 
6257     // ADD, memory_order_relaxed
6258     AtomicStubMark mark_fetch_add_4_relaxed
6259       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6260     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6261     AtomicStubMark mark_fetch_add_8_relaxed
6262       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6263     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6264 
6265     // XCHG, memory_order_conservative
6266     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6267     gen_swpal_entry(Assembler::word);
6268     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6269     gen_swpal_entry(Assembler::xword);
6270 
6271     // CAS, memory_order_conservative
6272     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6273     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6274     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6275     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6276     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6277     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6278 
6279     // CAS, memory_order_relaxed
6280     AtomicStubMark mark_cmpxchg_1_relaxed
6281       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6282     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6283     AtomicStubMark mark_cmpxchg_4_relaxed
6284       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6285     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6286     AtomicStubMark mark_cmpxchg_8_relaxed
6287       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6288     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6289 
6290     AtomicStubMark mark_cmpxchg_4_release
6291       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6292     gen_cas_entry(MacroAssembler::word, memory_order_release);
6293     AtomicStubMark mark_cmpxchg_8_release
6294       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6295     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6296 
6297     AtomicStubMark mark_cmpxchg_4_seq_cst
6298       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6299     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6300     AtomicStubMark mark_cmpxchg_8_seq_cst
6301       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6302     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6303 
6304     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6305   }
6306 #endif // LINUX
6307 
6308   // Continuation point for throwing of implicit exceptions that are
6309   // not handled in the current activation. Fabricates an exception
6310   // oop and initiates normal exception dispatching in this
6311   // frame. Since we need to preserve callee-saved values (currently
6312   // only for C2, but done for C1 as well) we need a callee-saved oop
6313   // map and therefore have to make these stubs into RuntimeStubs
6314   // rather than BufferBlobs.  If the compiler needs all registers to
6315   // be preserved between the fault point and the exception handler
6316   // then it must assume responsibility for that in
6317   // AbstractCompiler::continuation_for_implicit_null_exception or
6318   // continuation_for_implicit_division_by_zero_exception. All other
6319   // implicit exceptions (e.g., NullPointerException or
6320   // AbstractMethodError on entry) are either at call sites or
6321   // otherwise assume that stack unwinding will be initiated, so
6322   // caller saved registers were assumed volatile in the compiler.
6323 
6324 #undef __
6325 #define __ masm->
6326 
6327   address generate_throw_exception(const char* name,
6328                                    address runtime_entry,
6329                                    Register arg1 = noreg,
6330                                    Register arg2 = noreg) {
6331     // Information about frame layout at time of blocking runtime call.
6332     // Note that we only have to preserve callee-saved registers since
6333     // the compilers are responsible for supplying a continuation point
6334     // if they expect all registers to be preserved.
6335     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6336     enum layout {
6337       rfp_off = 0,
6338       rfp_off2,
6339       return_off,
6340       return_off2,
6341       framesize // inclusive of return address
6342     };
6343 
6344     int insts_size = 512;
6345     int locs_size  = 64;
6346 
6347     CodeBuffer code(name, insts_size, locs_size);
6348     OopMapSet* oop_maps  = new OopMapSet();
6349     MacroAssembler* masm = new MacroAssembler(&code);
6350 
6351     address start = __ pc();
6352 
6353     // This is an inlined and slightly modified version of call_VM
6354     // which has the ability to fetch the return PC out of
6355     // thread-local storage and also sets up last_Java_sp slightly
6356     // differently than the real call_VM
6357 
6358     __ enter(); // Save FP and LR before call
6359 
6360     assert(is_even(framesize/2), "sp not 16-byte aligned");
6361 
6362     // lr and fp are already in place
6363     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
6364 
6365     int frame_complete = __ pc() - start;
6366 
6367     // Set up last_Java_sp and last_Java_fp
6368     address the_pc = __ pc();
6369     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6370 
6371     // Call runtime
6372     if (arg1 != noreg) {
6373       assert(arg2 != c_rarg1, "clobbered");
6374       __ mov(c_rarg1, arg1);
6375     }
6376     if (arg2 != noreg) {
6377       __ mov(c_rarg2, arg2);
6378     }
6379     __ mov(c_rarg0, rthread);
6380     BLOCK_COMMENT("call runtime_entry");
6381     __ mov(rscratch1, runtime_entry);
6382     __ blr(rscratch1);
6383 
6384     // Generate oop map
6385     OopMap* map = new OopMap(framesize, 0);
6386 
6387     oop_maps->add_gc_map(the_pc - start, map);
6388 
6389     __ reset_last_Java_frame(true);
6390 
6391     // Reinitialize the ptrue predicate register, in case the external runtime
6392     // call clobbers ptrue reg, as we may return to SVE compiled code.
6393     __ reinitialize_ptrue();
6394 
6395     __ leave();
6396 
6397     // check for pending exceptions
6398 #ifdef ASSERT
6399     Label L;
6400     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6401     __ cbnz(rscratch1, L);
6402     __ should_not_reach_here();
6403     __ bind(L);
6404 #endif // ASSERT
6405     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6406 
6407 
6408     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6409     RuntimeStub* stub =
6410       RuntimeStub::new_runtime_stub(name,
6411                                     &code,
6412                                     frame_complete,
6413                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6414                                     oop_maps, false);
6415     return stub->entry_point();
6416   }
6417 
6418   class MontgomeryMultiplyGenerator : public MacroAssembler {
6419 
6420     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6421       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6422 
6423     RegSet _toSave;
6424     bool _squaring;
6425 
6426   public:
6427     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6428       : MacroAssembler(as->code()), _squaring(squaring) {
6429 
6430       // Register allocation
6431 
6432       RegSetIterator<> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6433       Pa_base = *regs;       // Argument registers
6434       if (squaring)
6435         Pb_base = Pa_base;
6436       else
6437         Pb_base = *++regs;
6438       Pn_base = *++regs;
6439       Rlen= *++regs;
6440       inv = *++regs;
6441       Pm_base = *++regs;
6442 
6443                           // Working registers:
6444       Ra =  *++regs;        // The current digit of a, b, n, and m.
6445       Rb =  *++regs;
6446       Rm =  *++regs;
6447       Rn =  *++regs;
6448 
6449       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
6450       Pb =  *++regs;
6451       Pm =  *++regs;
6452       Pn =  *++regs;
6453 
6454       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
6456       t2 =  *++regs;
6457 
6458       Ri =  *++regs;        // Inner and outer loop indexes.
6459       Rj =  *++regs;
6460 
6461       Rhi_ab = *++regs;     // Product registers: low and high parts
6462       Rlo_ab = *++regs;     // of a*b and m*n.
6463       Rhi_mn = *++regs;
6464       Rlo_mn = *++regs;
6465 
6466       // r19 and up are callee-saved.
6467       _toSave = RegSet::range(r19, *regs) + Pm_base;
6468     }
6469 
6470   private:
6471     void save_regs() {
6472       push(_toSave, sp);
6473     }
6474 
6475     void restore_regs() {
6476       pop(_toSave, sp);
6477     }
6478 
6479     template <typename T>
6480     void unroll_2(Register count, T block) {
6481       Label loop, end, odd;
6482       tbnz(count, 0, odd);
6483       cbz(count, end);
6484       align(16);
6485       bind(loop);
6486       (this->*block)();
6487       bind(odd);
6488       (this->*block)();
6489       subs(count, count, 2);
6490       br(Assembler::GT, loop);
6491       bind(end);
6492     }
6493 
6494     template <typename T>
6495     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
6496       Label loop, end, odd;
6497       tbnz(count, 0, odd);
6498       cbz(count, end);
6499       align(16);
6500       bind(loop);
6501       (this->*block)(d, s, tmp);
6502       bind(odd);
6503       (this->*block)(d, s, tmp);
6504       subs(count, count, 2);
6505       br(Assembler::GT, loop);
6506       bind(end);
6507     }
6508 
6509     void pre1(RegisterOrConstant i) {
6510       block_comment("pre1");
6511       // Pa = Pa_base;
6512       // Pb = Pb_base + i;
6513       // Pm = Pm_base;
6514       // Pn = Pn_base + i;
6515       // Ra = *Pa;
6516       // Rb = *Pb;
6517       // Rm = *Pm;
6518       // Rn = *Pn;
6519       ldr(Ra, Address(Pa_base));
6520       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6521       ldr(Rm, Address(Pm_base));
6522       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6523       lea(Pa, Address(Pa_base));
6524       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
6525       lea(Pm, Address(Pm_base));
6526       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6527 
6528       // Zero the m*n result.
6529       mov(Rhi_mn, zr);
6530       mov(Rlo_mn, zr);
6531     }
6532 
6533     // The core multiply-accumulate step of a Montgomery
6534     // multiplication.  The idea is to schedule operations as a
6535     // pipeline so that instructions with long latencies (loads and
6536     // multiplies) have time to complete before their results are
    // used.  This benefits in-order implementations of the architecture
    // the most, but out-of-order ones also benefit.
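    //
    // In the "In C, approximately" sketches below, MACC(A, B, t0, t1, t2)
    // is shorthand (not a real function) for adding the full 128-bit
    // product A*B into the triple-precision accumulator t2:t1:t0, i.e.
    // the UMULH/MUL pair here followed by acc().  Roughly:
    //
    //   // unsigned __int128 p = (unsigned __int128)A * B;
    //   // unsigned __int128 s = (((unsigned __int128)t1 << 64) | t0) + p;
    //   // t2 += (s < p);                        // carry out of the 128-bit add
    //   // t0 = (julong)s;  t1 = (julong)(s >> 64);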
6539     void step() {
6540       block_comment("step");
6541       // MACC(Ra, Rb, t0, t1, t2);
6542       // Ra = *++Pa;
6543       // Rb = *--Pb;
6544       umulh(Rhi_ab, Ra, Rb);
6545       mul(Rlo_ab, Ra, Rb);
6546       ldr(Ra, pre(Pa, wordSize));
6547       ldr(Rb, pre(Pb, -wordSize));
6548       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
6549                                        // previous iteration.
6550       // MACC(Rm, Rn, t0, t1, t2);
6551       // Rm = *++Pm;
6552       // Rn = *--Pn;
6553       umulh(Rhi_mn, Rm, Rn);
6554       mul(Rlo_mn, Rm, Rn);
6555       ldr(Rm, pre(Pm, wordSize));
6556       ldr(Rn, pre(Pn, -wordSize));
6557       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6558     }
6559 
6560     void post1() {
6561       block_comment("post1");
6562 
6563       // MACC(Ra, Rb, t0, t1, t2);
6564       // Ra = *++Pa;
6565       // Rb = *--Pb;
6566       umulh(Rhi_ab, Ra, Rb);
6567       mul(Rlo_ab, Ra, Rb);
6568       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6569       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6570 
6571       // *Pm = Rm = t0 * inv;
6572       mul(Rm, t0, inv);
6573       str(Rm, Address(Pm));
6574 
6575       // MACC(Rm, Rn, t0, t1, t2);
6576       // t0 = t1; t1 = t2; t2 = 0;
6577       umulh(Rhi_mn, Rm, Rn);
6578 
6579 #ifndef PRODUCT
6580       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6581       {
6582         mul(Rlo_mn, Rm, Rn);
6583         add(Rlo_mn, t0, Rlo_mn);
6584         Label ok;
6585         cbz(Rlo_mn, ok); {
6586           stop("broken Montgomery multiply");
6587         } bind(ok);
6588       }
6589 #endif
6590       // We have very carefully set things up so that
6591       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6592       // the lower half of Rm * Rn because we know the result already:
6593       // it must be -t0.  t0 + (-t0) must generate a carry iff
6594       // t0 != 0.  So, rather than do a mul and an adds we just set
6595       // the carry flag iff t0 is nonzero.
6596       //
6597       // mul(Rlo_mn, Rm, Rn);
6598       // adds(zr, t0, Rlo_mn);
6599       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6600       adcs(t0, t1, Rhi_mn);
6601       adc(t1, t2, zr);
6602       mov(t2, zr);
6603     }
6604 
6605     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
6606       block_comment("pre2");
6607       // Pa = Pa_base + i-len;
6608       // Pb = Pb_base + len;
6609       // Pm = Pm_base + i-len;
6610       // Pn = Pn_base + len;
6611 
6612       if (i.is_register()) {
6613         sub(Rj, i.as_register(), len);
6614       } else {
6615         mov(Rj, i.as_constant());
6616         sub(Rj, Rj, len);
6617       }
6618       // Rj == i-len
6619 
6620       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
6621       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
6622       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6623       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
6624 
6625       // Ra = *++Pa;
6626       // Rb = *--Pb;
6627       // Rm = *++Pm;
6628       // Rn = *--Pn;
6629       ldr(Ra, pre(Pa, wordSize));
6630       ldr(Rb, pre(Pb, -wordSize));
6631       ldr(Rm, pre(Pm, wordSize));
6632       ldr(Rn, pre(Pn, -wordSize));
6633 
6634       mov(Rhi_mn, zr);
6635       mov(Rlo_mn, zr);
6636     }
6637 
6638     void post2(RegisterOrConstant i, RegisterOrConstant len) {
6639       block_comment("post2");
6640       if (i.is_constant()) {
6641         mov(Rj, i.as_constant()-len.as_constant());
6642       } else {
6643         sub(Rj, i.as_register(), len);
6644       }
6645 
6646       adds(t0, t0, Rlo_mn); // The pending m*n, low part
6647 
6648       // As soon as we know the least significant digit of our result,
6649       // store it.
6650       // Pm_base[i-len] = t0;
6651       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
6652 
6653       // t0 = t1; t1 = t2; t2 = 0;
6654       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
6655       adc(t1, t2, zr);
6656       mov(t2, zr);
6657     }
6658 
6659     // A carry in t0 after Montgomery multiplication means that we
6660     // should subtract multiples of n from our result in m.  We'll
6661     // keep doing that until there is no carry.
6662     void normalize(RegisterOrConstant len) {
6663       block_comment("normalize");
6664       // while (t0)
6665       //   t0 = sub(Pm_base, Pn_base, t0, len);
6666       Label loop, post, again;
6667       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
6668       cbz(t0, post); {
6669         bind(again); {
6670           mov(i, zr);
6671           mov(cnt, len);
6672           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6673           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6674           subs(zr, zr, zr); // set carry flag, i.e. no borrow
6675           align(16);
6676           bind(loop); {
6677             sbcs(Rm, Rm, Rn);
6678             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6679             add(i, i, 1);
6680             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
6681             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
6682             sub(cnt, cnt, 1);
6683           } cbnz(cnt, loop);
6684           sbc(t0, t0, zr);
6685         } cbnz(t0, again);
6686       } bind(post);
6687     }
6688 
6689     // Move memory at s to d, reversing words.
6690     //    Increments d to end of copied memory
6691     //    Destroys tmp1, tmp2
6692     //    Preserves len
6693     //    Leaves s pointing to the address which was in d at start
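    //    In effect this reverses the order of the caller's 32-bit (jint)
    //    digits: each 64-bit word is loaded starting from the end of s,
    //    its two halves are swapped with a 32-bit rotate, and the result
    //    is stored forward into d.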
6694     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
6695       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
6696 
6697       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
6698       mov(tmp1, len);
6699       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
6700       sub(s, d, len, ext::uxtw, LogBytesPerWord);
6701     }
6702     // where
6703     void reverse1(Register d, Register s, Register tmp) {
6704       ldr(tmp, pre(s, -wordSize));
6705       ror(tmp, tmp, 32);
6706       str(tmp, post(d, wordSize));
6707     }
6708 
6709     void step_squaring() {
6710       // An extra ACC
6711       step();
6712       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6713     }
6714 
6715     void last_squaring(RegisterOrConstant i) {
6716       Label dont;
6717       // if ((i & 1) == 0) {
6718       tbnz(i.as_register(), 0, dont); {
6719         // MACC(Ra, Rb, t0, t1, t2);
6720         // Ra = *++Pa;
6721         // Rb = *--Pb;
6722         umulh(Rhi_ab, Ra, Rb);
6723         mul(Rlo_ab, Ra, Rb);
6724         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
6725       } bind(dont);
6726     }
6727 
6728     void extra_step_squaring() {
6729       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6730 
6731       // MACC(Rm, Rn, t0, t1, t2);
6732       // Rm = *++Pm;
6733       // Rn = *--Pn;
6734       umulh(Rhi_mn, Rm, Rn);
6735       mul(Rlo_mn, Rm, Rn);
6736       ldr(Rm, pre(Pm, wordSize));
6737       ldr(Rn, pre(Pn, -wordSize));
6738     }
6739 
6740     void post1_squaring() {
6741       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
6742 
6743       // *Pm = Rm = t0 * inv;
6744       mul(Rm, t0, inv);
6745       str(Rm, Address(Pm));
6746 
6747       // MACC(Rm, Rn, t0, t1, t2);
6748       // t0 = t1; t1 = t2; t2 = 0;
6749       umulh(Rhi_mn, Rm, Rn);
6750 
6751 #ifndef PRODUCT
6752       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
6753       {
6754         mul(Rlo_mn, Rm, Rn);
6755         add(Rlo_mn, t0, Rlo_mn);
6756         Label ok;
6757         cbz(Rlo_mn, ok); {
6758           stop("broken Montgomery multiply");
6759         } bind(ok);
6760       }
6761 #endif
6762       // We have very carefully set things up so that
6763       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
6764       // the lower half of Rm * Rn because we know the result already:
6765       // it must be -t0.  t0 + (-t0) must generate a carry iff
6766       // t0 != 0.  So, rather than do a mul and an adds we just set
6767       // the carry flag iff t0 is nonzero.
6768       //
6769       // mul(Rlo_mn, Rm, Rn);
6770       // adds(zr, t0, Rlo_mn);
6771       subs(zr, t0, 1); // Set carry iff t0 is nonzero
6772       adcs(t0, t1, Rhi_mn);
6773       adc(t1, t2, zr);
6774       mov(t2, zr);
6775     }
6776 
6777     void acc(Register Rhi, Register Rlo,
6778              Register t0, Register t1, Register t2) {
6779       adds(t0, t0, Rlo);
6780       adcs(t1, t1, Rhi);
6781       adc(t2, t2, zr);
6782     }
6783 
6784   public:
6785     /**
6786      * Fast Montgomery multiplication.  The derivation of the
6787      * algorithm is in A Cryptographic Library for the Motorola
6788      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
6789      *
6790      * Arguments:
6791      *
6792      * Inputs for multiplication:
6793      *   c_rarg0   - int array elements a
6794      *   c_rarg1   - int array elements b
6795      *   c_rarg2   - int array elements n (the modulus)
6796      *   c_rarg3   - int length
6797      *   c_rarg4   - int inv
6798      *   c_rarg5   - int array elements m (the result)
6799      *
6800      * Inputs for squaring:
6801      *   c_rarg0   - int array elements a
6802      *   c_rarg1   - int array elements n (the modulus)
6803      *   c_rarg2   - int length
6804      *   c_rarg3   - int inv
6805      *   c_rarg4   - int array elements m (the result)
6806      *
6807      */
6808     address generate_multiply() {
6809       Label argh, nothing;
6810       bind(argh);
6811       stop("MontgomeryMultiply total_allocation must be <= 8192");
6812 
6813       align(CodeEntryAlignment);
6814       address entry = pc();
6815 
6816       cbzw(Rlen, nothing);
6817 
6818       enter();
6819 
6820       // Make room.
6821       cmpw(Rlen, 512);
6822       br(Assembler::HI, argh);
6823       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
6824       andr(sp, Ra, -2 * wordSize);
6825 
6826       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
6827 
6828       {
6829         // Copy input args, reversing as we go.  We use Ra as a
6830         // temporary variable.
6831         reverse(Ra, Pa_base, Rlen, t0, t1);
6832         if (!_squaring)
6833           reverse(Ra, Pb_base, Rlen, t0, t1);
6834         reverse(Ra, Pn_base, Rlen, t0, t1);
6835       }
6836 
6837       // Push all call-saved registers and also Pm_base which we'll need
6838       // at the end.
6839       save_regs();
6840 
6841 #ifndef PRODUCT
6842       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
6843       {
6844         ldr(Rn, Address(Pn_base, 0));
6845         mul(Rlo_mn, Rn, inv);
6846         subs(zr, Rlo_mn, -1);
6847         Label ok;
6848         br(EQ, ok); {
6849           stop("broken inverse in Montgomery multiply");
6850         } bind(ok);
6851       }
6852 #endif
6853 
6854       mov(Pm_base, Ra);
6855 
6856       mov(t0, zr);
6857       mov(t1, zr);
6858       mov(t2, zr);
6859 
6860       block_comment("for (int i = 0; i < len; i++) {");
6861       mov(Ri, zr); {
6862         Label loop, end;
6863         cmpw(Ri, Rlen);
6864         br(Assembler::GE, end);
6865 
6866         bind(loop);
6867         pre1(Ri);
6868 
6869         block_comment("  for (j = i; j; j--) {"); {
6870           movw(Rj, Ri);
6871           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
6872         } block_comment("  } // j");
6873 
6874         post1();
6875         addw(Ri, Ri, 1);
6876         cmpw(Ri, Rlen);
6877         br(Assembler::LT, loop);
6878         bind(end);
6879         block_comment("} // i");
6880       }
6881 
6882       block_comment("for (int i = len; i < 2*len; i++) {");
6883       mov(Ri, Rlen); {
6884         Label loop, end;
6885         cmpw(Ri, Rlen, Assembler::LSL, 1);
6886         br(Assembler::GE, end);
6887 
6888         bind(loop);
6889         pre2(Ri, Rlen);
6890 
6891         block_comment("  for (j = len*2-i-1; j; j--) {"); {
6892           lslw(Rj, Rlen, 1);
6893           subw(Rj, Rj, Ri);
6894           subw(Rj, Rj, 1);
6895           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
6896         } block_comment("  } // j");
6897 
6898         post2(Ri, Rlen);
6899         addw(Ri, Ri, 1);
6900         cmpw(Ri, Rlen, Assembler::LSL, 1);
6901         br(Assembler::LT, loop);
6902         bind(end);
6903       }
6904       block_comment("} // i");
6905 
6906       normalize(Rlen);
6907 
6908       mov(Ra, Pm_base);  // Save Pm_base in Ra
6909       restore_regs();  // Restore caller's Pm_base
6910 
6911       // Copy our result into caller's Pm_base
6912       reverse(Pm_base, Ra, Rlen, t0, t1);
6913 
6914       leave();
6915       bind(nothing);
6916       ret(lr);
6917 
6918       return entry;
6919     }
6920     // In C, approximately:
6921 
6922     // void
6923     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
6924     //                     julong Pn_base[], julong Pm_base[],
6925     //                     julong inv, int len) {
6926     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
6927     //   julong *Pa, *Pb, *Pn, *Pm;
6928     //   julong Ra, Rb, Rn, Rm;
6929 
6930     //   int i;
6931 
6932     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
6933 
6934     //   for (i = 0; i < len; i++) {
6935     //     int j;
6936 
6937     //     Pa = Pa_base;
6938     //     Pb = Pb_base + i;
6939     //     Pm = Pm_base;
6940     //     Pn = Pn_base + i;
6941 
6942     //     Ra = *Pa;
6943     //     Rb = *Pb;
6944     //     Rm = *Pm;
6945     //     Rn = *Pn;
6946 
6947     //     int iters = i;
6948     //     for (j = 0; iters--; j++) {
6949     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
6950     //       MACC(Ra, Rb, t0, t1, t2);
6951     //       Ra = *++Pa;
6952     //       Rb = *--Pb;
6953     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
6954     //       MACC(Rm, Rn, t0, t1, t2);
6955     //       Rm = *++Pm;
6956     //       Rn = *--Pn;
6957     //     }
6958 
6959     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
6960     //     MACC(Ra, Rb, t0, t1, t2);
6961     //     *Pm = Rm = t0 * inv;
6962     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
6963     //     MACC(Rm, Rn, t0, t1, t2);
6964 
6965     //     assert(t0 == 0, "broken Montgomery multiply");
6966 
6967     //     t0 = t1; t1 = t2; t2 = 0;
6968     //   }
6969 
6970     //   for (i = len; i < 2*len; i++) {
6971     //     int j;
6972 
6973     //     Pa = Pa_base + i-len;
6974     //     Pb = Pb_base + len;
6975     //     Pm = Pm_base + i-len;
6976     //     Pn = Pn_base + len;
6977 
6978     //     Ra = *++Pa;
6979     //     Rb = *--Pb;
6980     //     Rm = *++Pm;
6981     //     Rn = *--Pn;
6982 
6983     //     int iters = len*2-i-1;
6984     //     for (j = i-len+1; iters--; j++) {
6985     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
6986     //       MACC(Ra, Rb, t0, t1, t2);
6987     //       Ra = *++Pa;
6988     //       Rb = *--Pb;
6989     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
6990     //       MACC(Rm, Rn, t0, t1, t2);
6991     //       Rm = *++Pm;
6992     //       Rn = *--Pn;
6993     //     }
6994 
6995     //     Pm_base[i-len] = t0;
6996     //     t0 = t1; t1 = t2; t2 = 0;
6997     //   }
6998 
6999     //   while (t0)
7000     //     t0 = sub(Pm_base, Pn_base, t0, len);
7001     // }
7002 
7003     /**
7004      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7005      * multiplies than Montgomery multiplication so it should be up to
7006      * 25% faster.  However, its loop control is more complex and it
7007      * may actually run slower on some machines.
7008      *
7009      * Arguments:
7010      *
7011      * Inputs:
7012      *   c_rarg0   - int array elements a
7013      *   c_rarg1   - int array elements n (the modulus)
7014      *   c_rarg2   - int length
7015      *   c_rarg3   - int inv
7016      *   c_rarg4   - int array elements m (the result)
7017      *
7018      */
7019     address generate_square() {
7020       Label argh;
7021       bind(argh);
7022       stop("MontgomeryMultiply total_allocation must be <= 8192");
7023 
7024       align(CodeEntryAlignment);
7025       address entry = pc();
7026 
7027       enter();
7028 
7029       // Make room.
7030       cmpw(Rlen, 512);
7031       br(Assembler::HI, argh);
7032       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7033       andr(sp, Ra, -2 * wordSize);
7034 
7035       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7036 
7037       {
7038         // Copy input args, reversing as we go.  We use Ra as a
7039         // temporary variable.
7040         reverse(Ra, Pa_base, Rlen, t0, t1);
7041         reverse(Ra, Pn_base, Rlen, t0, t1);
7042       }
7043 
7044       // Push all call-saved registers and also Pm_base which we'll need
7045       // at the end.
7046       save_regs();
7047 
7048       mov(Pm_base, Ra);
7049 
7050       mov(t0, zr);
7051       mov(t1, zr);
7052       mov(t2, zr);
7053 
7054       block_comment("for (int i = 0; i < len; i++) {");
7055       mov(Ri, zr); {
7056         Label loop, end;
7057         bind(loop);
7058         cmp(Ri, Rlen);
7059         br(Assembler::GE, end);
7060 
7061         pre1(Ri);
7062 
7063         block_comment("for (j = (i+1)/2; j; j--) {"); {
7064           add(Rj, Ri, 1);
7065           lsr(Rj, Rj, 1);
7066           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7067         } block_comment("  } // j");
7068 
7069         last_squaring(Ri);
7070 
7071         block_comment("  for (j = i/2; j; j--) {"); {
7072           lsr(Rj, Ri, 1);
7073           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7074         } block_comment("  } // j");
7075 
7076         post1_squaring();
7077         add(Ri, Ri, 1);
7078         cmp(Ri, Rlen);
7079         br(Assembler::LT, loop);
7080 
7081         bind(end);
7082         block_comment("} // i");
7083       }
7084 
7085       block_comment("for (int i = len; i < 2*len; i++) {");
7086       mov(Ri, Rlen); {
7087         Label loop, end;
7088         bind(loop);
7089         cmp(Ri, Rlen, Assembler::LSL, 1);
7090         br(Assembler::GE, end);
7091 
7092         pre2(Ri, Rlen);
7093 
7094         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7095           lsl(Rj, Rlen, 1);
7096           sub(Rj, Rj, Ri);
7097           sub(Rj, Rj, 1);
7098           lsr(Rj, Rj, 1);
7099           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7100         } block_comment("  } // j");
7101 
7102         last_squaring(Ri);
7103 
7104         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7105           lsl(Rj, Rlen, 1);
7106           sub(Rj, Rj, Ri);
7107           lsr(Rj, Rj, 1);
7108           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7109         } block_comment("  } // j");
7110 
7111         post2(Ri, Rlen);
7112         add(Ri, Ri, 1);
7113         cmp(Ri, Rlen, Assembler::LSL, 1);
7114 
7115         br(Assembler::LT, loop);
7116         bind(end);
7117         block_comment("} // i");
7118       }
7119 
7120       normalize(Rlen);
7121 
7122       mov(Ra, Pm_base);  // Save Pm_base in Ra
7123       restore_regs();  // Restore caller's Pm_base
7124 
7125       // Copy our result into caller's Pm_base
7126       reverse(Pm_base, Ra, Rlen, t0, t1);
7127 
7128       leave();
7129       ret(lr);
7130 
7131       return entry;
7132     }
7133     // In C, approximately:
7134 
7135     // void
7136     // montgomery_square(julong Pa_base[], julong Pn_base[],
7137     //                   julong Pm_base[], julong inv, int len) {
7138     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7139     //   julong *Pa, *Pb, *Pn, *Pm;
7140     //   julong Ra, Rb, Rn, Rm;
7141 
7142     //   int i;
7143 
7144     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7145 
7146     //   for (i = 0; i < len; i++) {
7147     //     int j;
7148 
7149     //     Pa = Pa_base;
7150     //     Pb = Pa_base + i;
7151     //     Pm = Pm_base;
7152     //     Pn = Pn_base + i;
7153 
7154     //     Ra = *Pa;
7155     //     Rb = *Pb;
7156     //     Rm = *Pm;
7157     //     Rn = *Pn;
7158 
7159     //     int iters = (i+1)/2;
7160     //     for (j = 0; iters--; j++) {
7161     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7162     //       MACC2(Ra, Rb, t0, t1, t2);
7163     //       Ra = *++Pa;
7164     //       Rb = *--Pb;
7165     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7166     //       MACC(Rm, Rn, t0, t1, t2);
7167     //       Rm = *++Pm;
7168     //       Rn = *--Pn;
7169     //     }
7170     //     if ((i & 1) == 0) {
7171     //       assert(Ra == Pa_base[j], "must be");
7172     //       MACC(Ra, Ra, t0, t1, t2);
7173     //     }
7174     //     iters = i/2;
7175     //     assert(iters == i-j, "must be");
7176     //     for (; iters--; j++) {
7177     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7178     //       MACC(Rm, Rn, t0, t1, t2);
7179     //       Rm = *++Pm;
7180     //       Rn = *--Pn;
7181     //     }
7182 
7183     //     *Pm = Rm = t0 * inv;
7184     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7185     //     MACC(Rm, Rn, t0, t1, t2);
7186 
7187     //     assert(t0 == 0, "broken Montgomery multiply");
7188 
7189     //     t0 = t1; t1 = t2; t2 = 0;
7190     //   }
7191 
7192     //   for (i = len; i < 2*len; i++) {
7193     //     int start = i-len+1;
7194     //     int end = start + (len - start)/2;
7195     //     int j;
7196 
7197     //     Pa = Pa_base + i-len;
7198     //     Pb = Pa_base + len;
7199     //     Pm = Pm_base + i-len;
7200     //     Pn = Pn_base + len;
7201 
7202     //     Ra = *++Pa;
7203     //     Rb = *--Pb;
7204     //     Rm = *++Pm;
7205     //     Rn = *--Pn;
7206 
7207     //     int iters = (2*len-i-1)/2;
7208     //     assert(iters == end-start, "must be");
7209     //     for (j = start; iters--; j++) {
7210     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7211     //       MACC2(Ra, Rb, t0, t1, t2);
7212     //       Ra = *++Pa;
7213     //       Rb = *--Pb;
7214     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7215     //       MACC(Rm, Rn, t0, t1, t2);
7216     //       Rm = *++Pm;
7217     //       Rn = *--Pn;
7218     //     }
7219     //     if ((i & 1) == 0) {
7220     //       assert(Ra == Pa_base[j], "must be");
7221     //       MACC(Ra, Ra, t0, t1, t2);
7222     //     }
7223     //     iters = (2*len-i)/2;
7224     //     assert(iters == len-j, "must be");
7225     //     for (; iters--; j++) {
7226     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7227     //       MACC(Rm, Rn, t0, t1, t2);
7228     //       Rm = *++Pm;
7229     //       Rn = *--Pn;
7230     //     }
7231     //     Pm_base[i-len] = t0;
7232     //     t0 = t1; t1 = t2; t2 = 0;
7233     //   }
7234 
7235     //   while (t0)
7236     //     t0 = sub(Pm_base, Pn_base, t0, len);
7237     // }
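
         // The MACC and MACC2 steps used in the listings above accumulate a
         // 64x64->128-bit product into the triple-word accumulator
         // (t0, t1, t2); MACC2 adds the product twice, which is how the
         // squaring path folds the mirrored cross terms a[i]*a[j] and
         // a[j]*a[i] into a single multiply - the source of the ~25% saving
         // noted in the comment above generate_square().  A minimal C++
         // sketch of the intended semantics (an assumption based on how they
         // are used here, not the hand-written assembly in this file):

         // static inline void MACC(julong A, julong B,
         //                         julong &t0, julong &t1, julong &t2) {
         //   unsigned __int128 p = (unsigned __int128)A * B;
         //   julong lo = (julong)p, hi = (julong)(p >> 64);
         //   t0 += lo;  hi += (t0 < lo);   // carry out of the low word
         //   t1 += hi;  t2 += (t1 < hi);   // carry out of the middle word
         // }
         //
         // static inline void MACC2(julong A, julong B,
         //                          julong &t0, julong &t1, julong &t2) {
         //   MACC(A, B, t0, t1, t2);       // adding the product twice gives
         //   MACC(A, B, t0, t1, t2);       // t += 2*A*B
         // }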
7238   };
7239 
7240 
7241   // Initialization
7242   void generate_initial() {
7243     // Generate initial stubs and initialize the entry points
7244 
7245     // Entry points that exist on all platforms. Note: This is code
7246     // that could be shared among different platforms - however the
7247     // benefit seems to be smaller than the disadvantage of having a
7248     // much more complicated generator structure. See also comment in
7249     // stubRoutines.hpp.
7250 
7251     StubRoutines::_forward_exception_entry = generate_forward_exception();
7252 
7253     StubRoutines::_call_stub_entry =
7254       generate_call_stub(StubRoutines::_call_stub_return_address);
7255 
7256     // Referenced by megamorphic calls.
7257     StubRoutines::_catch_exception_entry = generate_catch_exception();
7258 
7259     // Build this early so it's available for the interpreter.
7260     StubRoutines::_throw_StackOverflowError_entry =
7261       generate_throw_exception("StackOverflowError throw_exception",
7262                                CAST_FROM_FN_PTR(address,
7263                                                 SharedRuntime::throw_StackOverflowError));
7264     StubRoutines::_throw_delayed_StackOverflowError_entry =
7265       generate_throw_exception("delayed StackOverflowError throw_exception",
7266                                CAST_FROM_FN_PTR(address,
7267                                                 SharedRuntime::throw_delayed_StackOverflowError));
7268     if (UseCRC32Intrinsics) {
7269       // Set the table address before generating stubs that use it.
7270       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7271       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7272     }
7273 
7274     if (UseCRC32CIntrinsics) {
7275       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7276     }
7277 
7278     // Disabled until JDK-8210858 is fixed
7279     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7280     //   StubRoutines::_dlog = generate_dlog();
7281     // }
7282 
7283     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7284       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7285     }
7286 
7287     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7288       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7289     }
7290 
7291     // Safefetch stubs.
7292     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
7293                                                        &StubRoutines::_safefetch32_fault_pc,
7294                                                        &StubRoutines::_safefetch32_continuation_pc);
7295     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
7296                                                        &StubRoutines::_safefetchN_fault_pc,
7297                                                        &StubRoutines::_safefetchN_continuation_pc);
7298   }
7299 
7300   void generate_all() {
7301     // support for verify_oop (must happen after universe_init)
7302     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
7303     StubRoutines::_throw_AbstractMethodError_entry =
7304       generate_throw_exception("AbstractMethodError throw_exception",
7305                                CAST_FROM_FN_PTR(address,
7306                                                 SharedRuntime::
7307                                                 throw_AbstractMethodError));
7308 
7309     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7310       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7311                                CAST_FROM_FN_PTR(address,
7312                                                 SharedRuntime::
7313                                                 throw_IncompatibleClassChangeError));
7314 
7315     StubRoutines::_throw_NullPointerException_at_call_entry =
7316       generate_throw_exception("NullPointerException at call throw_exception",
7317                                CAST_FROM_FN_PTR(address,
7318                                                 SharedRuntime::
7319                                                 throw_NullPointerException_at_call));
7320 
7321     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7322 
7323     // arraycopy stubs used by compilers
7324     generate_arraycopy_stubs();
7325 
7326     // has negatives stub for large arrays.
7327     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
7328 
7329     // array equals stub for large arrays.
7330     if (!UseSimpleArrayEquals) {
7331       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7332     }
7333 
7334     generate_compare_long_strings();
7335 
7336     generate_string_indexof_stubs();
7337 
7338     // byte_array_inflate stub for large arrays.
7339     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7340 
7341     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7342     if (bs_nm != NULL) {
7343       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7344     }
7345 #ifdef COMPILER2
7346     if (UseMultiplyToLenIntrinsic) {
7347       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7348     }
7349 
7350     if (UseSquareToLenIntrinsic) {
7351       StubRoutines::_squareToLen = generate_squareToLen();
7352     }
7353 
7354     if (UseMulAddIntrinsic) {
7355       StubRoutines::_mulAdd = generate_mulAdd();
7356     }
7357 
7358     if (UseSIMDForBigIntegerShiftIntrinsics) {
7359       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7360       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7361     }
7362 
7363     if (UseMontgomeryMultiplyIntrinsic) {
7364       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7365       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7366       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7367     }
7368 
7369     if (UseMontgomerySquareIntrinsic) {
7370       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7371       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7372       // We use generate_multiply() rather than generate_square()
7373       // because it's faster for the sizes of modulus we care about.
7374       StubRoutines::_montgomerySquare = g.generate_multiply();
7375     }
7376 #endif // COMPILER2
7377 
7378     // generate GHASH intrinsics code
7379     if (UseGHASHIntrinsics) {
7380       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7381       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
7382     }
7383 
7384     if (UseBASE64Intrinsics) {
7385       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7386       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7387     }
7388 
7389     // data cache line writeback
7390     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7391     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7392 
7393     if (UseAESIntrinsics) {
7394       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7395       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7396       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7397       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7398       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7399       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7400     }
7401 
7402     if (UseSHA1Intrinsics) {
7403       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7404       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7405     }
7406     if (UseSHA256Intrinsics) {
7407       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7408       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7409     }
7410     if (UseSHA512Intrinsics) {
7411       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7412       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7413     }
7414     if (UseSHA3Intrinsics) {
7415       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7416       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7417     }
7418 
7419     // generate Adler32 intrinsics code
7420     if (UseAdler32Intrinsics) {
7421       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7422     }
7423 
7424 #ifdef LINUX
7425 
7426     generate_atomic_entry_points();
7427 
7428 #endif // LINUX
7429 
7430     StubRoutines::aarch64::set_completed();
7431   }
7432 
7433  public:
7434   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7435     if (all) {
7436       generate_all();
7437     } else {
7438       generate_initial();
7439     }
7440   }
7441 }; // end class declaration
7442 
7443 #define UCM_TABLE_MAX_ENTRIES 8
7444 void StubGenerator_generate(CodeBuffer* code, bool all) {
7445   if (UnsafeCopyMemory::_table == NULL) {
7446     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7447   }
7448   StubGenerator g(code, all);
7449 }
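
     // For orientation: the VM drives this generator in two phases (a sketch
     // of the call sites, which live in the shared StubRoutines setup code
     // rather than in this file) - an early call with all == false produces
     // the stubs needed to bootstrap the interpreter, and a later call with
     // all == true produces the rest:
     //
     //   StubGenerator_generate(initial_stubs_buffer, /*all*/ false);  // early
     //   ...
     //   StubGenerator_generate(final_stubs_buffer,   /*all*/ true);   // later
     //
     // The buffer names above are illustrative, not actual identifiers.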
7450 
7451 
7452 #ifdef LINUX
7453 
7454 // Define pointers to atomic stubs and initialize them to point to the
7455 // code in atomic_aarch64.S.
7456 
7457 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
7458   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
7459     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
7460   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
7461     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
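
     // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands (roughly) to:
     //
     //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
     //     = aarch64_atomic_fetch_add_4_default_impl;
     //
     // i.e. a declaration of the default (out-of-line) implementation and a
     // function pointer, initialized to it, that can later be repointed at
     // generated stub code.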
7462 
7463 DEFAULT_ATOMIC_OP(fetch_add, 4, )
7464 DEFAULT_ATOMIC_OP(fetch_add, 8, )
7465 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
7466 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
7467 DEFAULT_ATOMIC_OP(xchg, 4, )
7468 DEFAULT_ATOMIC_OP(xchg, 8, )
7469 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
7470 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
7471 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
7472 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
7473 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
7474 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
7475 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
7476 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
7477 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
7478 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
7479 
7480 #undef DEFAULT_ATOMIC_OP
7481 
7482 #endif // LINUX