1 /*
   2  * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "runtime/atomic.hpp"
  45 #include "runtime/continuation.hpp"
  46 #include "runtime/continuationEntry.inline.hpp"
  47 #include "runtime/frame.inline.hpp"
  48 #include "runtime/handles.inline.hpp"
  49 #include "runtime/javaThread.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubCodeGenerator.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "utilities/align.hpp"
  54 #include "utilities/globalDefinitions.hpp"
  55 #include "utilities/powerOfTwo.hpp"
  56 #ifdef COMPILER2
  57 #include "opto/runtime.hpp"
  58 #endif
  59 #if INCLUDE_ZGC
  60 #include "gc/z/zThreadLocalData.hpp"
  61 #endif
  62 
  63 // Declaration and definition of StubGenerator (no .hpp file).
  64 // For a more detailed description of the stub routine structure
  65 // see the comment in stubRoutines.hpp
  66 
  67 #undef __
  68 #define __ _masm->
  69 
  70 #ifdef PRODUCT
  71 #define BLOCK_COMMENT(str) /* nothing */
  72 #else
  73 #define BLOCK_COMMENT(str) __ block_comment(str)
  74 #endif
  75 
  76 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  77 
  78 // Stub Code definitions
  79 
  80 class StubGenerator: public StubCodeGenerator {
  81  private:
  82 
  83 #ifdef PRODUCT
  84 #define inc_counter_np(counter) ((void)0)
  85 #else
  86   void inc_counter_np_(int& counter) {
  87     __ lea(rscratch2, ExternalAddress((address)&counter));
  88     __ ldrw(rscratch1, Address(rscratch2));
  89     __ addw(rscratch1, rscratch1, 1);
  90     __ strw(rscratch1, Address(rscratch2));
  91   }
  92 #define inc_counter_np(counter) \
  93   BLOCK_COMMENT("inc_counter " #counter); \
  94   inc_counter_np_(counter);
  95 #endif
  96 
  97   // Call stubs are used to call Java from C
  98   //
  99   // Arguments:
 100   //    c_rarg0:   call wrapper address                   address
 101   //    c_rarg1:   result                                 address
 102   //    c_rarg2:   result type                            BasicType
 103   //    c_rarg3:   method                                 Method*
 104   //    c_rarg4:   (interpreter) entry point              address
 105   //    c_rarg5:   parameters                             intptr_t*
 106   //    c_rarg6:   parameter size (in words)              int
 107   //    c_rarg7:   thread                                 Thread*
 108   //
 109   // There is no return from the stub itself as any Java result
 110   // is written to result
 111   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, then copy sp (r31)
  // into fp.
 115   //
 116   // we save r0-r7, which accounts for all the c arguments.
 117   //
 118   // TODO: strictly do we need to save them all? they are treated as
 119   // volatile by C so could we omit saving the ones we are going to
 120   // place in global registers (thread? method?) or those we only use
 121   // during setup of the Java call?
 122   //
 123   // we don't need to save r8 which C uses as an indirect result location
 124   // return register.
 125   //
 126   // we don't need to save r9-r15 which both C and Java treat as
 127   // volatile
 128   //
 129   // we don't need to save r16-18 because Java does not use them
 130   //
 131   // we save r19-r28 which Java uses as scratch registers and C
 132   // expects to be callee-save
 133   //
 134   // we save the bottom 64 bits of each value stored in v8-v15; it is
 135   // the responsibility of the caller to preserve larger values.
 136   //
 137   // so the stub frame looks like this when we enter Java code
 138   //
 139   //     [ return_from_Java     ] <--- sp
 140   //     [ argument word n      ]
 141   //      ...
 142   // -27 [ argument word 1      ]
 143   // -26 [ saved v15            ] <--- sp_after_call
 144   // -25 [ saved v14            ]
 145   // -24 [ saved v13            ]
 146   // -23 [ saved v12            ]
 147   // -22 [ saved v11            ]
 148   // -21 [ saved v10            ]
 149   // -20 [ saved v9             ]
 150   // -19 [ saved v8             ]
 151   // -18 [ saved r28            ]
 152   // -17 [ saved r27            ]
 153   // -16 [ saved r26            ]
 154   // -15 [ saved r25            ]
 155   // -14 [ saved r24            ]
 156   // -13 [ saved r23            ]
 157   // -12 [ saved r22            ]
 158   // -11 [ saved r21            ]
 159   // -10 [ saved r20            ]
 160   //  -9 [ saved r19            ]
 161   //  -8 [ call wrapper    (r0) ]
 162   //  -7 [ result          (r1) ]
 163   //  -6 [ result type     (r2) ]
 164   //  -5 [ method          (r3) ]
 165   //  -4 [ entry point     (r4) ]
 166   //  -3 [ parameters      (r5) ]
 167   //  -2 [ parameter size  (r6) ]
 168   //  -1 [ thread (r7)          ]
 169   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 170   //   1 [ saved lr       (r30) ]
 171 
 172   // Call stub stack layout word offsets from fp
 173   enum call_stub_layout {
 174     sp_after_call_off = -26,
 175 
 176     d15_off            = -26,
 177     d13_off            = -24,
 178     d11_off            = -22,
 179     d9_off             = -20,
 180 
 181     r28_off            = -18,
 182     r26_off            = -16,
 183     r24_off            = -14,
 184     r22_off            = -12,
 185     r20_off            = -10,
 186     call_wrapper_off   =  -8,
 187     result_off         =  -7,
 188     result_type_off    =  -6,
 189     method_off         =  -5,
 190     entry_point_off    =  -4,
 191     parameter_size_off =  -2,
 192     thread_off         =  -1,
 193     fp_f               =   0,
 194     retaddr_off        =   1,
 195   };
 196 
 197   address generate_call_stub(address& return_address) {
 198     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 199            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 200            "adjust this code");
 201 
 202     StubCodeMark mark(this, "StubRoutines", "call_stub");
 203     address start = __ pc();
 204 
 205     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 206 
 207     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 208     const Address result        (rfp, result_off         * wordSize);
 209     const Address result_type   (rfp, result_type_off    * wordSize);
 210     const Address method        (rfp, method_off         * wordSize);
 211     const Address entry_point   (rfp, entry_point_off    * wordSize);
 212     const Address parameter_size(rfp, parameter_size_off * wordSize);
 213 
 214     const Address thread        (rfp, thread_off         * wordSize);
 215 
 216     const Address d15_save      (rfp, d15_off * wordSize);
 217     const Address d13_save      (rfp, d13_off * wordSize);
 218     const Address d11_save      (rfp, d11_off * wordSize);
 219     const Address d9_save       (rfp, d9_off * wordSize);
 220 
 221     const Address r28_save      (rfp, r28_off * wordSize);
 222     const Address r26_save      (rfp, r26_off * wordSize);
 223     const Address r24_save      (rfp, r24_off * wordSize);
 224     const Address r22_save      (rfp, r22_off * wordSize);
 225     const Address r20_save      (rfp, r20_off * wordSize);
 226 
 227     // stub code
 228 
 229     address aarch64_entry = __ pc();
 230 
 231     // set up frame and move sp to end of save area
 232     __ enter();
 233     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 234 
 235     // save register parameters and Java scratch/global registers
 236     // n.b. we save thread even though it gets installed in
 237     // rthread because we want to sanity check rthread later
 238     __ str(c_rarg7,  thread);
 239     __ strw(c_rarg6, parameter_size);
 240     __ stp(c_rarg4, c_rarg5,  entry_point);
 241     __ stp(c_rarg2, c_rarg3,  result_type);
 242     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 243 
 244     __ stp(r20, r19,   r20_save);
 245     __ stp(r22, r21,   r22_save);
 246     __ stp(r24, r23,   r24_save);
 247     __ stp(r26, r25,   r26_save);
 248     __ stp(r28, r27,   r28_save);
 249 
 250     __ stpd(v9,  v8,   d9_save);
 251     __ stpd(v11, v10,  d11_save);
 252     __ stpd(v13, v12,  d13_save);
 253     __ stpd(v15, v14,  d15_save);
 254 
 255     // install Java thread in global register now we have saved
 256     // whatever value it held
 257     __ mov(rthread, c_rarg7);
 258     // And method
 259     __ mov(rmethod, c_rarg3);
 260 
 261     // set up the heapbase register
 262     __ reinit_heapbase();
 263 
 264 #ifdef ASSERT
 265     // make sure we have no pending exceptions
 266     {
 267       Label L;
 268       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 269       __ cmp(rscratch1, (u1)NULL_WORD);
 270       __ br(Assembler::EQ, L);
 271       __ stop("StubRoutines::call_stub: entered with pending exception");
 272       __ BIND(L);
 273     }
 274 #endif
 275     // pass parameters if any
 276     __ mov(esp, sp);
 277     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
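    // round the new sp down to a 16-byte boundary (AArch64 requires 16-byte
    // stack alignment)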
 278     __ andr(sp, rscratch1, -2 * wordSize);
 279 
 280     BLOCK_COMMENT("pass parameters if any");
 281     Label parameters_done;
 282     // parameter count is still in c_rarg6
 283     // and parameter pointer identifying param 1 is in c_rarg5
 284     __ cbzw(c_rarg6, parameters_done);
 285 
 286     address loop = __ pc();
 287     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 288     __ subsw(c_rarg6, c_rarg6, 1);
 289     __ push(rscratch1);
 290     __ br(Assembler::GT, loop);
 291 
 292     __ BIND(parameters_done);
 293 
    // call Java entry -- passing Method* and current sp
 295     //      rmethod: Method*
 296     //      r19_sender_sp: sender sp
 297     BLOCK_COMMENT("call Java function");
 298     __ mov(r19_sender_sp, sp);
 299     __ blr(c_rarg4);
 300 
 301     // we do this here because the notify will already have been done
 302     // if we get to the next instruction via an exception
 303     //
 304     // n.b. adding this instruction here affects the calculation of
 305     // whether or not a routine returns to the call stub (used when
 306     // doing stack walks) since the normal test is to check the return
 307     // pc against the address saved below. so we may need to allow for
 308     // this extra instruction in the check.
 309 
 310     // save current address for use by exception handling code
 311 
 312     return_address = __ pc();
 313 
 314     // store result depending on type (everything that is not
 315     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 316     // n.b. this assumes Java returns an integral result in r0
 317     // and a floating result in j_farg0
 318     __ ldr(j_rarg2, result);
 319     Label is_long, is_float, is_double, exit;
 320     __ ldr(j_rarg1, result_type);
 321     __ cmp(j_rarg1, (u1)T_OBJECT);
 322     __ br(Assembler::EQ, is_long);
 323     __ cmp(j_rarg1, (u1)T_LONG);
 324     __ br(Assembler::EQ, is_long);
 325     __ cmp(j_rarg1, (u1)T_FLOAT);
 326     __ br(Assembler::EQ, is_float);
 327     __ cmp(j_rarg1, (u1)T_DOUBLE);
 328     __ br(Assembler::EQ, is_double);
 329 
 330     // handle T_INT case
 331     __ strw(r0, Address(j_rarg2));
 332 
 333     __ BIND(exit);
 334 
 335     // pop parameters
 336     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 337 
 338 #ifdef ASSERT
 339     // verify that threads correspond
 340     {
 341       Label L, S;
 342       __ ldr(rscratch1, thread);
 343       __ cmp(rthread, rscratch1);
 344       __ br(Assembler::NE, S);
 345       __ get_thread(rscratch1);
 346       __ cmp(rthread, rscratch1);
 347       __ br(Assembler::EQ, L);
 348       __ BIND(S);
 349       __ stop("StubRoutines::call_stub: threads must correspond");
 350       __ BIND(L);
 351     }
 352 #endif
 353 
 354     __ pop_cont_fastpath(rthread);
 355 
 356     // restore callee-save registers
 357     __ ldpd(v15, v14,  d15_save);
 358     __ ldpd(v13, v12,  d13_save);
 359     __ ldpd(v11, v10,  d11_save);
 360     __ ldpd(v9,  v8,   d9_save);
 361 
 362     __ ldp(r28, r27,   r28_save);
 363     __ ldp(r26, r25,   r26_save);
 364     __ ldp(r24, r23,   r24_save);
 365     __ ldp(r22, r21,   r22_save);
 366     __ ldp(r20, r19,   r20_save);
 367 
 368     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 369     __ ldrw(c_rarg2, result_type);
 370     __ ldr(c_rarg3,  method);
 371     __ ldp(c_rarg4, c_rarg5,  entry_point);
 372     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 373 
 374     // leave frame and return to caller
 375     __ leave();
 376     __ ret(lr);
 377 
 378     // handle return types different from T_INT
 379 
 380     __ BIND(is_long);
 381     __ str(r0, Address(j_rarg2, 0));
 382     __ br(Assembler::AL, exit);
 383 
 384     __ BIND(is_float);
 385     __ strs(j_farg0, Address(j_rarg2, 0));
 386     __ br(Assembler::AL, exit);
 387 
 388     __ BIND(is_double);
 389     __ strd(j_farg0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
 391 
 392     return start;
 393   }
 394 
 395   // Return point for a Java call if there's an exception thrown in
 396   // Java code.  The exception is caught and transformed into a
 397   // pending exception stored in JavaThread that can be tested from
 398   // within the VM.
 399   //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
 404   //
 405   // r0: exception oop
 406 
 407   address generate_catch_exception() {
 408     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 409     address start = __ pc();
 410 
 411     // same as in generate_call_stub():
 412     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 413     const Address thread        (rfp, thread_off         * wordSize);
 414 
 415 #ifdef ASSERT
 416     // verify that threads correspond
 417     {
 418       Label L, S;
 419       __ ldr(rscratch1, thread);
 420       __ cmp(rthread, rscratch1);
 421       __ br(Assembler::NE, S);
 422       __ get_thread(rscratch1);
 423       __ cmp(rthread, rscratch1);
 424       __ br(Assembler::EQ, L);
 425       __ bind(S);
 426       __ stop("StubRoutines::catch_exception: threads must correspond");
 427       __ bind(L);
 428     }
 429 #endif
 430 
 431     // set pending exception
 432     __ verify_oop(r0);
 433 
 434     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 435     __ mov(rscratch1, (address)__FILE__);
 436     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 437     __ movw(rscratch1, (int)__LINE__);
 438     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 439 
 440     // complete return to VM
 441     assert(StubRoutines::_call_stub_return_address != NULL,
 442            "_call_stub_return_address must have been generated before");
 443     __ b(StubRoutines::_call_stub_return_address);
 444 
 445     return start;
 446   }
 447 
 448   // Continuation point for runtime calls returning with a pending
 449   // exception.  The pending exception check happened in the runtime
 450   // or native call stub.  The pending exception in Thread is
 451   // converted into a Java-level exception.
 452   //
 453   // Contract with Java-level exception handlers:
 454   // r0: exception
 455   // r3: throwing pc
 456   //
 457   // NOTE: At entry of this stub, exception-pc must be in LR !!
 458 
  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prologue
 461 
 462   address generate_forward_exception() {
 463     StubCodeMark mark(this, "StubRoutines", "forward exception");
 464     address start = __ pc();
 465 
 466     // Upon entry, LR points to the return address returning into
 467     // Java (interpreted or compiled) code; i.e., the return address
 468     // becomes the throwing pc.
 469     //
 470     // Arguments pushed before the runtime call are still on the stack
 471     // but the exception handler will reset the stack pointer ->
 472     // ignore them.  A potential result in registers can be ignored as
 473     // well.
 474 
 475 #ifdef ASSERT
 476     // make sure this code is only executed if there is a pending exception
 477     {
 478       Label L;
 479       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 480       __ cbnz(rscratch1, L);
 481       __ stop("StubRoutines::forward exception: no pending exception (1)");
 482       __ bind(L);
 483     }
 484 #endif
 485 
 486     // compute exception handler into r19
 487 
 488     // call the VM to find the handler address associated with the
 489     // caller address. pass thread in r0 and caller pc (ret address)
 490     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 491     // the stack.
 492     __ mov(c_rarg1, lr);
 493     // lr will be trashed by the VM call so we move it to R19
 494     // (callee-saved) because we also need to pass it to the handler
 495     // returned by this call.
 496     __ mov(r19, lr);
 497     BLOCK_COMMENT("call exception_handler_for_return_address");
 498     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 499                          SharedRuntime::exception_handler_for_return_address),
 500                     rthread, c_rarg1);
 501     // Reinitialize the ptrue predicate register, in case the external runtime
 502     // call clobbers ptrue reg, as we may return to SVE compiled code.
 503     __ reinitialize_ptrue();
 504 
 505     // we should not really care that lr is no longer the callee
 506     // address. we saved the value the handler needs in r19 so we can
 507     // just copy it to r3. however, the C2 handler will push its own
 508     // frame and then calls into the VM and the VM code asserts that
 509     // the PC for the frame above the handler belongs to a compiled
 510     // Java method. So, we restore lr here to satisfy that assert.
 511     __ mov(lr, r19);
 512     // setup r0 & r3 & clear pending exception
 513     __ mov(r3, r19);
 514     __ mov(r19, r0);
 515     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 516     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 517 
 518 #ifdef ASSERT
 519     // make sure exception is set
 520     {
 521       Label L;
 522       __ cbnz(r0, L);
 523       __ stop("StubRoutines::forward exception: no pending exception (2)");
 524       __ bind(L);
 525     }
 526 #endif
 527 
 528     // continue at exception handler
 529     // r0: exception
 530     // r3: throwing pc
 531     // r19: exception handler
 532     __ verify_oop(r0);
 533     __ br(r19);
 534 
 535     return start;
 536   }
 537 
 538   // Non-destructive plausibility checks for oops
 539   //
 540   // Arguments:
 541   //    r0: oop to verify
 542   //    rscratch1: error message
 543   //
 544   // Stack after saving c_rarg3:
 545   //    [tos + 0]: saved c_rarg3
 546   //    [tos + 1]: saved c_rarg2
 547   //    [tos + 2]: saved lr
 548   //    [tos + 3]: saved rscratch2
 549   //    [tos + 4]: saved r0
 550   //    [tos + 5]: saved rscratch1
 551   address generate_verify_oop() {
 552 
 553     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 554     address start = __ pc();
 555 
 556     Label exit, error;
 557 
 558     // save c_rarg2 and c_rarg3
 559     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 560 
 561     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 562     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 563     __ ldr(c_rarg3, Address(c_rarg2));
 564     __ add(c_rarg3, c_rarg3, 1);
 565     __ str(c_rarg3, Address(c_rarg2));
 566 
 567     // object is in r0
 568     // make sure object is 'reasonable'
 569     __ cbz(r0, exit); // if obj is NULL it is OK
 570 
 571 #if INCLUDE_ZGC
 572     if (UseZGC) {
 573       // Check if mask is good.
 574       // verifies that ZAddressBadMask & r0 == 0
 575       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 576       __ andr(c_rarg2, r0, c_rarg3);
 577       __ cbnz(c_rarg2, error);
 578     }
 579 #endif
 580 
 581     // Check if the oop is in the right area of memory
 582     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 583     __ andr(c_rarg2, r0, c_rarg3);
 584     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 585 
 586     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 587     // instruction here because the flags register is live.
 588     __ eor(c_rarg2, c_rarg2, c_rarg3);
 589     __ cbnz(c_rarg2, error);
 590 
 591     // make sure klass is 'reasonable', which is not zero.
 592     __ load_klass(r0, r0);  // get klass
 593     __ cbz(r0, error);      // if klass is NULL it is broken
 594 
 595     // return if everything seems ok
 596     __ bind(exit);
 597 
 598     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 599     __ ret(lr);
 600 
 601     // handle errors
 602     __ bind(error);
 603     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 604 
 605     __ push(RegSet::range(r0, r29), sp);
 606     // debug(char* msg, int64_t pc, int64_t regs[])
 607     __ mov(c_rarg0, rscratch1);      // pass address of error message
 608     __ mov(c_rarg1, lr);             // pass return address
 609     __ mov(c_rarg2, sp);             // pass address of regs on stack
 610 #ifndef PRODUCT
 611     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 612 #endif
 613     BLOCK_COMMENT("call MacroAssembler::debug");
 614     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 615     __ blr(rscratch1);
 616     __ hlt(0);
 617 
 618     return start;
 619   }
 620 
 621   // Generate indices for iota vector.
 622   address generate_iota_indices(const char *stub_name) {
 623     __ align(CodeEntryAlignment);
 624     StubCodeMark mark(this, "StubRoutines", stub_name);
 625     address start = __ pc();
 626     // B
 627     __ emit_data64(0x0706050403020100, relocInfo::none);
 628     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 629     // H
 630     __ emit_data64(0x0003000200010000, relocInfo::none);
 631     __ emit_data64(0x0007000600050004, relocInfo::none);
 632     // S
 633     __ emit_data64(0x0000000100000000, relocInfo::none);
 634     __ emit_data64(0x0000000300000002, relocInfo::none);
 635     // D
 636     __ emit_data64(0x0000000000000000, relocInfo::none);
 637     __ emit_data64(0x0000000000000001, relocInfo::none);
 638     // S - FP
 639     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 640     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 641     // D - FP
 642     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 643     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 644     return start;
 645   }
 646 
 647   // The inner part of zero_words().  This is the bulk operation,
 648   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 649   // caller is responsible for zeroing the last few words.
 650   //
 651   // Inputs:
 652   // r10: the HeapWord-aligned base address of an array to zero.
 653   // r11: the count in HeapWords, r11 > 0.
 654   //
 655   // Returns r10 and r11, adjusted for the caller to clear.
 656   // r10: the base address of the tail of words left to clear.
 657   // r11: the number of words in the tail.
 658   //      r11 < MacroAssembler::zero_words_block_size.
 659 
 660   address generate_zero_blocks() {
 661     Label done;
 662     Label base_aligned;
 663 
 664     Register base = r10, cnt = r11;
 665 
 666     __ align(CodeEntryAlignment);
 667     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 668     address start = __ pc();
 669 
 670     if (UseBlockZeroing) {
 671       int zva_length = VM_Version::zva_length();
 672 
 673       // Ensure ZVA length can be divided by 16. This is required by
 674       // the subsequent operations.
 675       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 676 
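      // If base is only 8-byte aligned (bit 3 set), zero a single word first
      // so that base is 16-byte aligned before the block-zeroing path.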
 677       __ tbz(base, 3, base_aligned);
 678       __ str(zr, Address(__ post(base, 8)));
 679       __ sub(cnt, cnt, 1);
 680       __ bind(base_aligned);
 681 
 682       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 683       // alignment.
 684       Label small;
 685       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 686       __ subs(rscratch1, cnt, low_limit >> 3);
 687       __ br(Assembler::LT, small);
 688       __ zero_dcache_blocks(base, cnt);
 689       __ bind(small);
 690     }
 691 
 692     {
 693       // Number of stp instructions we'll unroll
 694       const int unroll =
 695         MacroAssembler::zero_words_block_size / 2;
 696       // Clear the remaining blocks.
 697       Label loop;
 698       __ subs(cnt, cnt, unroll * 2);
 699       __ br(Assembler::LT, done);
 700       __ bind(loop);
 701       for (int i = 0; i < unroll; i++)
 702         __ stp(zr, zr, __ post(base, 16));
 703       __ subs(cnt, cnt, unroll * 2);
 704       __ br(Assembler::GE, loop);
 705       __ bind(done);
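      // cnt went below zero in the final subs; add the unroll amount back so
      // it again holds the number of words left for the caller to clear
      // (guaranteed < zero_words_block_size).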
 706       __ add(cnt, cnt, unroll * 2);
 707     }
 708 
 709     __ ret(lr);
 710 
 711     return start;
 712   }
 713 
 714 
 715   typedef enum {
 716     copy_forwards = 1,
 717     copy_backwards = -1
 718   } copy_direction;
 719 
 720   // Bulk copy of blocks of 8 words.
 721   //
 722   // count is a count of words.
 723   //
 724   // Precondition: count >= 8
 725   //
 726   // Postconditions:
 727   //
 728   // The least significant bit of count contains the remaining count
 729   // of words to copy.  The rest of count is trash.
 730   //
 731   // s and d are adjusted to point to the remaining words to copy
 732   //
 733   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 734                            copy_direction direction) {
 735     int unit = wordSize * direction;
 736     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 737 
 738     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 739       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 740     const Register stride = r13;
 741 
 742     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 743     assert_different_registers(s, d, count, rscratch1);
 744 
 745     Label again, drain;
 746     const char *stub_name;
 747     if (direction == copy_forwards)
 748       stub_name = "forward_copy_longs";
 749     else
 750       stub_name = "backward_copy_longs";
 751 
 752     __ align(CodeEntryAlignment);
 753 
 754     StubCodeMark mark(this, "StubRoutines", stub_name);
 755 
 756     __ bind(start);
 757 
 758     Label unaligned_copy_long;
 759     if (AvoidUnalignedAccesses) {
 760       __ tbnz(d, 3, unaligned_copy_long);
 761     }
 762 
 763     if (direction == copy_forwards) {
 764       __ sub(s, s, bias);
 765       __ sub(d, d, bias);
 766     }
 767 
 768 #ifdef ASSERT
 769     // Make sure we are never given < 8 words
 770     {
 771       Label L;
 772       __ cmp(count, (u1)8);
 773       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 775       __ bind(L);
 776     }
 777 #endif
 778 
 779     // Fill 8 registers
 780     if (UseSIMDForMemoryOps) {
 781       __ ldpq(v0, v1, Address(s, 4 * unit));
 782       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 783     } else {
 784       __ ldp(t0, t1, Address(s, 2 * unit));
 785       __ ldp(t2, t3, Address(s, 4 * unit));
 786       __ ldp(t4, t5, Address(s, 6 * unit));
 787       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 788     }
 789 
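    // We have just loaded 8 words; if the total count is below 16 there is no
    // room for another full iteration, so go straight to the drain code that
    // stores what has already been loaded.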
 790     __ subs(count, count, 16);
 791     __ br(Assembler::LO, drain);
 792 
 793     int prefetch = PrefetchCopyIntervalInBytes;
 794     bool use_stride = false;
 795     if (direction == copy_backwards) {
 796        use_stride = prefetch > 256;
 797        prefetch = -prefetch;
 798        if (use_stride) __ mov(stride, prefetch);
 799     }
 800 
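    // Main copy loop: each iteration stores the 8 words loaded by the previous
    // iteration and loads the next 8, keeping one block of loads in flight.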
 801     __ bind(again);
 802 
 803     if (PrefetchCopyIntervalInBytes > 0)
 804       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 805 
 806     if (UseSIMDForMemoryOps) {
 807       __ stpq(v0, v1, Address(d, 4 * unit));
 808       __ ldpq(v0, v1, Address(s, 4 * unit));
 809       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 810       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 811     } else {
 812       __ stp(t0, t1, Address(d, 2 * unit));
 813       __ ldp(t0, t1, Address(s, 2 * unit));
 814       __ stp(t2, t3, Address(d, 4 * unit));
 815       __ ldp(t2, t3, Address(s, 4 * unit));
 816       __ stp(t4, t5, Address(d, 6 * unit));
 817       __ ldp(t4, t5, Address(s, 6 * unit));
 818       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 819       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 820     }
 821 
 822     __ subs(count, count, 8);
 823     __ br(Assembler::HS, again);
 824 
 825     // Drain
 826     __ bind(drain);
 827     if (UseSIMDForMemoryOps) {
 828       __ stpq(v0, v1, Address(d, 4 * unit));
 829       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 830     } else {
 831       __ stp(t0, t1, Address(d, 2 * unit));
 832       __ stp(t2, t3, Address(d, 4 * unit));
 833       __ stp(t4, t5, Address(d, 6 * unit));
 834       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 835     }
 836 
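    // Copy any remaining tail: bit 2 of count indicates a 4-word subblock and
    // bit 1 a 2-word subblock; a final odd word, if any, is left to the caller
    // as described in the postconditions above.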
 837     {
 838       Label L1, L2;
 839       __ tbz(count, exact_log2(4), L1);
 840       if (UseSIMDForMemoryOps) {
 841         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 842         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 843       } else {
 844         __ ldp(t0, t1, Address(s, 2 * unit));
 845         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 846         __ stp(t0, t1, Address(d, 2 * unit));
 847         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 848       }
 849       __ bind(L1);
 850 
 851       if (direction == copy_forwards) {
 852         __ add(s, s, bias);
 853         __ add(d, d, bias);
 854       }
 855 
 856       __ tbz(count, 1, L2);
 857       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 858       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 859       __ bind(L2);
 860     }
 861 
 862     __ ret(lr);
 863 
 864     if (AvoidUnalignedAccesses) {
 865       Label drain, again;
 866       // Register order for storing. Order is different for backward copy.
 867 
 868       __ bind(unaligned_copy_long);
 869 
 870       // source address is even aligned, target odd aligned
 871       //
 872       // when forward copying word pairs we read long pairs at offsets
 873       // {0, 2, 4, 6} (in long words). when backwards copying we read
 874       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 875       // address by -2 in the forwards case so we can compute the
 876       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 877       // or -1.
 878       //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
 887 
 888       if (direction == copy_forwards) {
 889         __ sub(s, s, 16);
 890         __ sub(d, d, 8);
 891       }
 892 
 893       // Fill 8 registers
 894       //
 895       // for forwards copy s was offset by -16 from the original input
 896       // value of s so the register contents are at these offsets
 897       // relative to the 64 bit block addressed by that original input
 898       // and so on for each successive 64 byte block when s is updated
 899       //
 900       // t0 at offset 0,  t1 at offset 8
 901       // t2 at offset 16, t3 at offset 24
 902       // t4 at offset 32, t5 at offset 40
 903       // t6 at offset 48, t7 at offset 56
 904 
 905       // for backwards copy s was not offset so the register contents
 906       // are at these offsets into the preceding 64 byte block
 907       // relative to that original input and so on for each successive
 908       // preceding 64 byte block when s is updated. this explains the
 909       // slightly counter-intuitive looking pattern of register usage
 910       // in the stp instructions for backwards copy.
 911       //
 912       // t0 at offset -16, t1 at offset -8
 913       // t2 at offset -32, t3 at offset -24
 914       // t4 at offset -48, t5 at offset -40
 915       // t6 at offset -64, t7 at offset -56
 916 
 917       __ ldp(t0, t1, Address(s, 2 * unit));
 918       __ ldp(t2, t3, Address(s, 4 * unit));
 919       __ ldp(t4, t5, Address(s, 6 * unit));
 920       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 921 
 922       __ subs(count, count, 16);
 923       __ br(Assembler::LO, drain);
 924 
 925       int prefetch = PrefetchCopyIntervalInBytes;
 926       bool use_stride = false;
 927       if (direction == copy_backwards) {
 928          use_stride = prefetch > 256;
 929          prefetch = -prefetch;
 930          if (use_stride) __ mov(stride, prefetch);
 931       }
 932 
 933       __ bind(again);
 934 
 935       if (PrefetchCopyIntervalInBytes > 0)
 936         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 937 
 938       if (direction == copy_forwards) {
 939        // allowing for the offset of -8 the store instructions place
 940        // registers into the target 64 bit block at the following
 941        // offsets
 942        //
 943        // t0 at offset 0
 944        // t1 at offset 8,  t2 at offset 16
 945        // t3 at offset 24, t4 at offset 32
 946        // t5 at offset 40, t6 at offset 48
 947        // t7 at offset 56
 948 
 949         __ str(t0, Address(d, 1 * unit));
 950         __ stp(t1, t2, Address(d, 2 * unit));
 951         __ ldp(t0, t1, Address(s, 2 * unit));
 952         __ stp(t3, t4, Address(d, 4 * unit));
 953         __ ldp(t2, t3, Address(s, 4 * unit));
 954         __ stp(t5, t6, Address(d, 6 * unit));
 955         __ ldp(t4, t5, Address(s, 6 * unit));
 956         __ str(t7, Address(__ pre(d, 8 * unit)));
 957         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 958       } else {
 959        // d was not offset when we started so the registers are
 960        // written into the 64 bit block preceding d with the following
 961        // offsets
 962        //
 963        // t1 at offset -8
 964        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 966        // t7 at offset -56, t4 at offset -48
 967        //                   t6 at offset -64
 968        //
 969        // note that this matches the offsets previously noted for the
 970        // loads
 971 
 972         __ str(t1, Address(d, 1 * unit));
 973         __ stp(t3, t0, Address(d, 3 * unit));
 974         __ ldp(t0, t1, Address(s, 2 * unit));
 975         __ stp(t5, t2, Address(d, 5 * unit));
 976         __ ldp(t2, t3, Address(s, 4 * unit));
 977         __ stp(t7, t4, Address(d, 7 * unit));
 978         __ ldp(t4, t5, Address(s, 6 * unit));
 979         __ str(t6, Address(__ pre(d, 8 * unit)));
 980         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 981       }
 982 
 983       __ subs(count, count, 8);
 984       __ br(Assembler::HS, again);
 985 
 986       // Drain
 987       //
 988       // this uses the same pattern of offsets and register arguments
 989       // as above
 990       __ bind(drain);
 991       if (direction == copy_forwards) {
 992         __ str(t0, Address(d, 1 * unit));
 993         __ stp(t1, t2, Address(d, 2 * unit));
 994         __ stp(t3, t4, Address(d, 4 * unit));
 995         __ stp(t5, t6, Address(d, 6 * unit));
 996         __ str(t7, Address(__ pre(d, 8 * unit)));
 997       } else {
 998         __ str(t1, Address(d, 1 * unit));
 999         __ stp(t3, t0, Address(d, 3 * unit));
1000         __ stp(t5, t2, Address(d, 5 * unit));
1001         __ stp(t7, t4, Address(d, 7 * unit));
1002         __ str(t6, Address(__ pre(d, 8 * unit)));
1003       }
      // now we need to copy any remaining part-block, which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tales for whether we
      // have each such subblock
1008       {
1009         Label L1, L2;
1010         __ tbz(count, exact_log2(4), L1);
1011        // this is the same as above but copying only 4 longs hence
1012        // with only one intervening stp between the str instructions
1013        // but note that the offsets and registers still follow the
1014        // same pattern
1015         __ ldp(t0, t1, Address(s, 2 * unit));
1016         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1017         if (direction == copy_forwards) {
1018           __ str(t0, Address(d, 1 * unit));
1019           __ stp(t1, t2, Address(d, 2 * unit));
1020           __ str(t3, Address(__ pre(d, 4 * unit)));
1021         } else {
1022           __ str(t1, Address(d, 1 * unit));
1023           __ stp(t3, t0, Address(d, 3 * unit));
1024           __ str(t2, Address(__ pre(d, 4 * unit)));
1025         }
1026         __ bind(L1);
1027 
1028         __ tbz(count, 1, L2);
1029        // this is the same as above but copying only 2 longs hence
1030        // there is no intervening stp between the str instructions
1031        // but note that the offset and register patterns are still
1032        // the same
1033         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1034         if (direction == copy_forwards) {
1035           __ str(t0, Address(d, 1 * unit));
1036           __ str(t1, Address(__ pre(d, 2 * unit)));
1037         } else {
1038           __ str(t1, Address(d, 1 * unit));
1039           __ str(t0, Address(__ pre(d, 2 * unit)));
1040         }
1041         __ bind(L2);
1042 
       // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1045 
1046        if (direction == copy_forwards) {
1047          __ add(s, s, 16);
1048          __ add(d, d, 8);
1049        }
1050 
1051       }
1052 
1053       __ ret(lr);
1054       }
1055   }
1056 
1057   // Small copy: less than 16 bytes.
1058   //
1059   // NB: Ignores all of the bits of count which represent more than 15
1060   // bytes, so a caller doesn't have to mask them.
1061 
1062   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1063     bool is_backwards = step < 0;
1064     size_t granularity = uabs(step);
1065     int direction = is_backwards ? -1 : 1;
1066     int unit = wordSize * direction;
1067 
1068     Label Lword, Lint, Lshort, Lbyte;
1069 
1070     assert(granularity
1071            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1072 
1073     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1074 
1075     // ??? I don't know if this bit-test-and-branch is the right thing
1076     // to do.  It does a lot of jumping, resulting in several
1077     // mispredicted branches.  It might make more sense to do this
1078     // with something like Duff's device with a single computed branch.
1079 
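    // The tbz tests below examine bits of the element count: bit
    // (3 - log2(granularity)) corresponds to 8 bytes' worth of elements,
    // bit (2 - log2(granularity)) to 4 bytes, and so on down to one byte.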
1080     __ tbz(count, 3 - exact_log2(granularity), Lword);
1081     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1082     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1083     __ bind(Lword);
1084 
1085     if (granularity <= sizeof (jint)) {
1086       __ tbz(count, 2 - exact_log2(granularity), Lint);
1087       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1088       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1089       __ bind(Lint);
1090     }
1091 
1092     if (granularity <= sizeof (jshort)) {
1093       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1094       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1095       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1096       __ bind(Lshort);
1097     }
1098 
1099     if (granularity <= sizeof (jbyte)) {
1100       __ tbz(count, 0, Lbyte);
1101       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1102       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1103       __ bind(Lbyte);
1104     }
1105   }
1106 
1107   Label copy_f, copy_b;
1108 
1109   // All-singing all-dancing memory copy.
1110   //
1111   // Copy count units of memory from s to d.  The size of a unit is
1112   // step, which can be positive or negative depending on the direction
1113   // of copy.  If is_aligned is false, we align the source address.
1114   //
1115 
1116   void copy_memory(bool is_aligned, Register s, Register d,
1117                    Register count, Register tmp, int step) {
1118     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1119     bool is_backwards = step < 0;
1120     unsigned int granularity = uabs(step);
1121     const Register t0 = r3, t1 = r4;
1122 
    // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't
    // matter because we always load all the data before writing anything.
1125     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1126     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1127     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1128     const Register send = r17, dend = r16;
1129 
1130     if (PrefetchCopyIntervalInBytes > 0)
1131       __ prfm(Address(s, 0), PLDL1KEEP);
1132     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1133     __ br(Assembler::HI, copy_big);
1134 
1135     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1136     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
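    // send/dend point just past the last source/destination byte; the small
    // cases below copy from both ends and may overlap in the middle, which is
    // safe because all loads happen before any store.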
1137 
1138     __ cmp(count, u1(16/granularity));
1139     __ br(Assembler::LS, copy16);
1140 
1141     __ cmp(count, u1(64/granularity));
1142     __ br(Assembler::HI, copy80);
1143 
1144     __ cmp(count, u1(32/granularity));
1145     __ br(Assembler::LS, copy32);
1146 
1147     // 33..64 bytes
1148     if (UseSIMDForMemoryOps) {
1149       __ ldpq(v0, v1, Address(s, 0));
1150       __ ldpq(v2, v3, Address(send, -32));
1151       __ stpq(v0, v1, Address(d, 0));
1152       __ stpq(v2, v3, Address(dend, -32));
1153     } else {
1154       __ ldp(t0, t1, Address(s, 0));
1155       __ ldp(t2, t3, Address(s, 16));
1156       __ ldp(t4, t5, Address(send, -32));
1157       __ ldp(t6, t7, Address(send, -16));
1158 
1159       __ stp(t0, t1, Address(d, 0));
1160       __ stp(t2, t3, Address(d, 16));
1161       __ stp(t4, t5, Address(dend, -32));
1162       __ stp(t6, t7, Address(dend, -16));
1163     }
1164     __ b(finish);
1165 
1166     // 17..32 bytes
1167     __ bind(copy32);
1168     __ ldp(t0, t1, Address(s, 0));
1169     __ ldp(t2, t3, Address(send, -16));
1170     __ stp(t0, t1, Address(d, 0));
1171     __ stp(t2, t3, Address(dend, -16));
1172     __ b(finish);
1173 
1174     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1176     __ bind(copy80);
1177     if (UseSIMDForMemoryOps) {
1178       __ ldpq(v0, v1, Address(s, 0));
1179       __ ldpq(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)): pointers for arrays of jint are at least
      // 4 byte aligned, and pointers for arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the range 65-80 bytes.
      // For such cases, using a pair of ldp/stp instead of a third pair of
      // ldpq/stpq fixes the performance issue.
1187       if (granularity < sizeof (jint)) {
1188         Label copy96;
1189         __ cmp(count, u1(80/granularity));
1190         __ br(Assembler::HI, copy96);
1191         __ ldp(t0, t1, Address(send, -16));
1192 
1193         __ stpq(v0, v1, Address(d, 0));
1194         __ stpq(v2, v3, Address(d, 32));
1195         __ stp(t0, t1, Address(dend, -16));
1196         __ b(finish);
1197 
1198         __ bind(copy96);
1199       }
1200       __ ldpq(v4, v5, Address(send, -32));
1201 
1202       __ stpq(v0, v1, Address(d, 0));
1203       __ stpq(v2, v3, Address(d, 32));
1204       __ stpq(v4, v5, Address(dend, -32));
1205     } else {
1206       __ ldp(t0, t1, Address(s, 0));
1207       __ ldp(t2, t3, Address(s, 16));
1208       __ ldp(t4, t5, Address(s, 32));
1209       __ ldp(t6, t7, Address(s, 48));
1210       __ ldp(t8, t9, Address(send, -16));
1211 
1212       __ stp(t0, t1, Address(d, 0));
1213       __ stp(t2, t3, Address(d, 16));
1214       __ stp(t4, t5, Address(d, 32));
1215       __ stp(t6, t7, Address(d, 48));
1216       __ stp(t8, t9, Address(dend, -16));
1217     }
1218     __ b(finish);
1219 
1220     // 0..16 bytes
1221     __ bind(copy16);
1222     __ cmp(count, u1(8/granularity));
1223     __ br(Assembler::LO, copy8);
1224 
1225     // 8..16 bytes
1226     __ ldr(t0, Address(s, 0));
1227     __ ldr(t1, Address(send, -8));
1228     __ str(t0, Address(d, 0));
1229     __ str(t1, Address(dend, -8));
1230     __ b(finish);
1231 
1232     if (granularity < 8) {
1233       // 4..7 bytes
1234       __ bind(copy8);
1235       __ tbz(count, 2 - exact_log2(granularity), copy4);
1236       __ ldrw(t0, Address(s, 0));
1237       __ ldrw(t1, Address(send, -4));
1238       __ strw(t0, Address(d, 0));
1239       __ strw(t1, Address(dend, -4));
1240       __ b(finish);
1241       if (granularity < 4) {
1242         // 0..3 bytes
1243         __ bind(copy4);
1244         __ cbz(count, finish); // get rid of 0 case
1245         if (granularity == 2) {
1246           __ ldrh(t0, Address(s, 0));
1247           __ strh(t0, Address(d, 0));
1248         } else { // granularity == 1
1249           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1250           // the first and last byte.
1251           // Handle the 3 byte case by loading and storing base + count/2
1252           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
1254           // byte 3 times.
1255           __ lsr(count, count, 1);
1256           __ ldrb(t0, Address(s, 0));
1257           __ ldrb(t1, Address(send, -1));
1258           __ ldrb(t2, Address(s, count));
1259           __ strb(t0, Address(d, 0));
1260           __ strb(t1, Address(dend, -1));
1261           __ strb(t2, Address(d, count));
1262         }
1263         __ b(finish);
1264       }
1265     }
1266 
1267     __ bind(copy_big);
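    // For a backwards copy, step s and d to the end of the arrays; the copy
    // then proceeds from high addresses to low.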
1268     if (is_backwards) {
1269       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1270       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1271     }
1272 
1273     // Now we've got the small case out of the way we can align the
1274     // source address on a 2-word boundary.
1275 
1276     Label aligned;
1277 
1278     if (is_aligned) {
1279       // We may have to adjust by 1 word to get s 2-word-aligned.
1280       __ tbz(s, exact_log2(wordSize), aligned);
1281       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1282       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1283       __ sub(count, count, wordSize/granularity);
1284     } else {
1285       if (is_backwards) {
1286         __ andr(rscratch2, s, 2 * wordSize - 1);
1287       } else {
1288         __ neg(rscratch2, s);
1289         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1290       }
1291       // rscratch2 is the byte adjustment needed to align s.
1292       __ cbz(rscratch2, aligned);
1293       int shift = exact_log2(granularity);
1294       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1295       __ sub(count, count, rscratch2);
1296 
1297 #if 0
1298       // ?? This code is only correct for a disjoint copy.  It may or
1299       // may not make sense to use it in that case.
1300 
1301       // Copy the first pair; s and d may not be aligned.
1302       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1303       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1304 
1305       // Align s and d, adjust count
1306       if (is_backwards) {
1307         __ sub(s, s, rscratch2);
1308         __ sub(d, d, rscratch2);
1309       } else {
1310         __ add(s, s, rscratch2);
1311         __ add(d, d, rscratch2);
1312       }
1313 #else
1314       copy_memory_small(s, d, rscratch2, rscratch1, step);
1315 #endif
1316     }
1317 
1318     __ bind(aligned);
1319 
1320     // s is now 2-word-aligned.
1321 
1322     // We have a count of units and some trailing bytes.  Adjust the
1323     // count and do a bulk copy of words.
1324     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1325     if (direction == copy_forwards)
1326       __ bl(copy_f);
1327     else
1328       __ bl(copy_b);
1329 
1330     // And the tail.
1331     copy_memory_small(s, d, count, tmp, step);
1332 
1333     if (granularity >= 8) __ bind(copy8);
1334     if (granularity >= 4) __ bind(copy4);
1335     __ bind(finish);
1336   }
1337 
1338 
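  // In debug builds, fill every call-clobbered general-purpose register with
  // the pattern 0xdeadbeefdeadbeef so that stale values left behind by stubs
  // are easy to spot.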
1339   void clobber_registers() {
1340 #ifdef ASSERT
1341     RegSet clobbered
1342       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1343     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1344     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1345     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1346       __ mov(*it, rscratch1);
1347     }
1348 #endif
1349 
1350   }
1351 
1352   // Scan over array at a for count oops, verifying each one.
1353   // Preserves a and count, clobbers rscratch1 and rscratch2.
1354   void verify_oop_array (int size, Register a, Register count, Register temp) {
1355     Label loop, end;
1356     __ mov(rscratch1, a);
1357     __ mov(rscratch2, zr);
1358     __ bind(loop);
1359     __ cmp(rscratch2, count);
1360     __ br(Assembler::HS, end);
1361     if (size == wordSize) {
1362       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1363       __ verify_oop(temp);
1364     } else {
1365       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1366       __ decode_heap_oop(temp); // calls verify_oop
1367     }
1368     __ add(rscratch2, rscratch2, 1);
1369     __ b(loop);
1370     __ bind(end);
1371   }
1372 
1373   // Arguments:
1374   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1375   //             ignored
1376   //   is_oop  - true => oop array, so generate store check code
1377   //   name    - stub name string
1378   //
1379   // Inputs:
1380   //   c_rarg0   - source array address
1381   //   c_rarg1   - destination array address
1382   //   c_rarg2   - element count, treated as ssize_t, can be zero
1383   //
1384   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1385   // the hardware handle it.  The two dwords within qwords that span
1386   // cache line boundaries will still be loaded and stored atomically.
1387   //
1388   // Side Effects:
1389   //   disjoint_int_copy_entry is set to the no-overlap entry point
1390   //   used by generate_conjoint_int_oop_copy().
1391   //
1392   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1393                                   const char *name, bool dest_uninitialized = false) {
1394     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1395     RegSet saved_reg = RegSet::of(s, d, count);
1396     __ align(CodeEntryAlignment);
1397     StubCodeMark mark(this, "StubRoutines", name);
1398     address start = __ pc();
1399     __ enter();
1400 
1401     if (entry != NULL) {
1402       *entry = __ pc();
1403       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1404       BLOCK_COMMENT("Entry:");
1405     }
1406 
1407     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1408     if (dest_uninitialized) {
1409       decorators |= IS_DEST_UNINITIALIZED;
1410     }
1411     if (aligned) {
1412       decorators |= ARRAYCOPY_ALIGNED;
1413     }
1414 
1415     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1416     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1417 
1418     if (is_oop) {
1419       // save regs before copy_memory
1420       __ push(RegSet::of(d, count), sp);
1421     }
1422     {
1423       // UnsafeCopyMemory page error: continue after ucm
1424       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1425       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1426       copy_memory(aligned, s, d, count, rscratch1, size);
1427     }
1428 
1429     if (is_oop) {
1430       __ pop(RegSet::of(d, count), sp);
1431       if (VerifyOops)
1432         verify_oop_array(size, d, count, r16);
1433     }
1434 
1435     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1436 
1437     __ leave();
1438     __ mov(r0, zr); // return 0
1439     __ ret(lr);
1440     return start;
1441   }
1442 
1443   // Arguments:
1444   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1445   //             ignored
1446   //   is_oop  - true => oop array, so generate store check code
1447   //   name    - stub name string
1448   //
1449   // Inputs:
1450   //   c_rarg0   - source array address
1451   //   c_rarg1   - destination array address
1452   //   c_rarg2   - element count, treated as ssize_t, can be zero
1453   //
1454   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1455   // the hardware handle it.  The two dwords within qwords that span
1456   // cache line boundaries will still be loaded and stored atomically.
1457   //
1458   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1459                                  address *entry, const char *name,
1460                                  bool dest_uninitialized = false) {
1461     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1462     RegSet saved_regs = RegSet::of(s, d, count);
1463     StubCodeMark mark(this, "StubRoutines", name);
1464     address start = __ pc();
1465     __ enter();
1466 
1467     if (entry != NULL) {
1468       *entry = __ pc();
1469       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1470       BLOCK_COMMENT("Entry:");
1471     }
1472 
1473     // use fwd copy when (d-s) above_equal (count*size)
1474     __ sub(rscratch1, d, s);
1475     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1476     __ br(Assembler::HS, nooverlap_target);
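    // The three instructions above are, in effect (unsigned compare, so a
    // destination below the source also takes the forward path):
    //
    //   if ((size_t)(d - s) >= (size_t)count << exact_log2(size)) {
    //     goto nooverlap_target;   // forward copy cannot clobber unread source
    //   }
    //   // else fall through and copy backwards (from high to low addresses)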
1477 
1478     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1479     if (dest_uninitialized) {
1480       decorators |= IS_DEST_UNINITIALIZED;
1481     }
1482     if (aligned) {
1483       decorators |= ARRAYCOPY_ALIGNED;
1484     }
1485 
1486     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1487     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1488 
1489     if (is_oop) {
1490       // save regs before copy_memory
1491       __ push(RegSet::of(d, count), sp);
1492     }
1493     {
1494       // UnsafeCopyMemory page error: continue after ucm
1495       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1496       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1497       copy_memory(aligned, s, d, count, rscratch1, -size);
1498     }
1499     if (is_oop) {
1500       __ pop(RegSet::of(d, count), sp);
1501       if (VerifyOops)
1502         verify_oop_array(size, d, count, r16);
1503     }
1504     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1505     __ leave();
1506     __ mov(r0, zr); // return 0
1507     __ ret(lr);
1508     return start;
1509   }
1510 
1511   // Arguments:
1512   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1513   //             ignored
1514   //   name    - stub name string
1515   //
1516   // Inputs:
1517   //   c_rarg0   - source array address
1518   //   c_rarg1   - destination array address
1519   //   c_rarg2   - element count, treated as ssize_t, can be zero
1520   //
1521   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1522   // we let the hardware handle it.  The one to eight bytes within words,
1523   // dwords or qwords that span cache line boundaries will still be loaded
1524   // and stored atomically.
1525   //
1533   // Side Effects:
1534   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1535   //   used by generate_conjoint_byte_copy().
1536   //
1537   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1538     const bool not_oop = false;
1539     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1540   }
1541 
1542   // Arguments:
1543   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1544   //             ignored
1545   //   name    - stub name string
1546   //
1547   // Inputs:
1548   //   c_rarg0   - source array address
1549   //   c_rarg1   - destination array address
1550   //   c_rarg2   - element count, treated as ssize_t, can be zero
1551   //
1552   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1553   // we let the hardware handle it.  The one to eight bytes within words,
1554   // dwords or qwords that span cache line boundaries will still be loaded
1555   // and stored atomically.
1556   //
1557   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1558                                       address* entry, const char *name) {
1559     const bool not_oop = false;
1560     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1561   }
1562 
1563   // Arguments:
1564   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1565   //             ignored
1566   //   name    - stub name string
1567   //
1568   // Inputs:
1569   //   c_rarg0   - source array address
1570   //   c_rarg1   - destination array address
1571   //   c_rarg2   - element count, treated as ssize_t, can be zero
1572   //
1573   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1574   // let the hardware handle it.  The two or four words within dwords
1575   // or qwords that span cache line boundaries will still be loaded
1576   // and stored atomically.
1577   //
1578   // Side Effects:
1579   //   disjoint_short_copy_entry is set to the no-overlap entry point
1580   //   used by generate_conjoint_short_copy().
1581   //
1582   address generate_disjoint_short_copy(bool aligned,
1583                                        address* entry, const char *name) {
1584     const bool not_oop = false;
1585     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1586   }
1587 
1588   // Arguments:
1589   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1590   //             ignored
1591   //   name    - stub name string
1592   //
1593   // Inputs:
1594   //   c_rarg0   - source array address
1595   //   c_rarg1   - destination array address
1596   //   c_rarg2   - element count, treated as ssize_t, can be zero
1597   //
1598   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1599   // let the hardware handle it.  The two or four words within dwords
1600   // or qwords that span cache line boundaries will still be loaded
1601   // and stored atomically.
1602   //
1603   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1604                                        address *entry, const char *name) {
1605     const bool not_oop = false;
1606     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1607   }
1608
1609   // Arguments:
1610   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1611   //             ignored
1612   //   name    - stub name string
1613   //
1614   // Inputs:
1615   //   c_rarg0   - source array address
1616   //   c_rarg1   - destination array address
1617   //   c_rarg2   - element count, treated as ssize_t, can be zero
1618   //
1619   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1620   // the hardware handle it.  The two dwords within qwords that span
1621   // cache line boundaries will still be loaded and stored atomically.
1622   //
1623   // Side Effects:
1624   //   disjoint_int_copy_entry is set to the no-overlap entry point
1625   //   used by generate_conjoint_int_copy().
1626   //
1627   address generate_disjoint_int_copy(bool aligned, address *entry,
1628                                          const char *name, bool dest_uninitialized = false) {
1629     const bool not_oop = false;
1630     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1631   }
1632 
1633   // Arguments:
1634   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1635   //             ignored
1636   //   name    - stub name string
1637   //
1638   // Inputs:
1639   //   c_rarg0   - source array address
1640   //   c_rarg1   - destination array address
1641   //   c_rarg2   - element count, treated as ssize_t, can be zero
1642   //
1643   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1644   // the hardware handle it.  The two dwords within qwords that span
1645   // cache line boundaries will still be loaded and stored atomically.
1646   //
1647   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1648                                      address *entry, const char *name,
1649                                      bool dest_uninitialized = false) {
1650     const bool not_oop = false;
1651     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1652   }
1653 
1654 
1655   // Arguments:
1656   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1657   //             ignored
1658   //   name    - stub name string
1659   //
1660   // Inputs:
1661   //   c_rarg0   - source array address
1662   //   c_rarg1   - destination array address
1663   //   c_rarg2   - element count, treated as size_t, can be zero
1664   //
1665   // Side Effects:
1666   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1667   //   no-overlap entry point used by generate_conjoint_long_copy().
1668   //
1669   address generate_disjoint_long_copy(bool aligned, address *entry,
1670                                           const char *name, bool dest_uninitialized = false) {
1671     const bool not_oop = false;
1672     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1673   }
1674 
1675   // Arguments:
1676   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1677   //             ignored
1678   //   name    - stub name string
1679   //
1680   // Inputs:
1681   //   c_rarg0   - source array address
1682   //   c_rarg1   - destination array address
1683   //   c_rarg2   - element count, treated as size_t, can be zero
1684   //
1685   address generate_conjoint_long_copy(bool aligned,
1686                                       address nooverlap_target, address *entry,
1687                                       const char *name, bool dest_uninitialized = false) {
1688     const bool not_oop = false;
1689     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1690   }
1691 
1692   // Arguments:
1693   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1694   //             ignored
1695   //   name    - stub name string
1696   //
1697   // Inputs:
1698   //   c_rarg0   - source array address
1699   //   c_rarg1   - destination array address
1700   //   c_rarg2   - element count, treated as size_t, can be zero
1701   //
1702   // Side Effects:
1703   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1704   //   no-overlap entry point used by generate_conjoint_oop_copy().
1705   //
1706   address generate_disjoint_oop_copy(bool aligned, address *entry,
1707                                      const char *name, bool dest_uninitialized) {
1708     const bool is_oop = true;
1709     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1710     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1711   }
1712 
1713   // Arguments:
1714   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1715   //             ignored
1716   //   name    - stub name string
1717   //
1718   // Inputs:
1719   //   c_rarg0   - source array address
1720   //   c_rarg1   - destination array address
1721   //   c_rarg2   - element count, treated as size_t, can be zero
1722   //
1723   address generate_conjoint_oop_copy(bool aligned,
1724                                      address nooverlap_target, address *entry,
1725                                      const char *name, bool dest_uninitialized) {
1726     const bool is_oop = true;
1727     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1728     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1729                                   name, dest_uninitialized);
1730   }
1731 
1732 
1733   // Helper for generating a dynamic type check.
1734   // Smashes rscratch1, rscratch2.
1735   void generate_type_check(Register sub_klass,
1736                            Register super_check_offset,
1737                            Register super_klass,
1738                            Label& L_success) {
1739     assert_different_registers(sub_klass, super_check_offset, super_klass);
1740 
1741     BLOCK_COMMENT("type_check:");
1742 
1743     Label L_miss;
1744 
1745     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1746                                      super_check_offset);
1747     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
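    // Roughly: the fast path checks the superclass display and the cached
    // secondary super at super_check_offset; only an inconclusive fast path
    // falls through into the slow path, which scans the secondary supers list.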
1748 
1749     // Fall through on failure!
1750     __ BIND(L_miss);
1751   }
1752 
1753   //
1754   //  Generate checkcasting array copy stub
1755   //
1756   //  Input:
1757   //    c_rarg0   - source array address
1758   //    c_rarg1   - destination array address
1759   //    c_rarg2   - element count, treated as ssize_t, can be zero
1760   //    c_rarg3   - size_t ckoff (super_check_offset)
1761   //    c_rarg4   - oop ckval (super_klass)
1762   //
1763   //  Output:
1764   //    r0 ==  0  -  success
1765   //    r0 == -1^K - failure, where K is partial transfer count
1766   //
1767   address generate_checkcast_copy(const char *name, address *entry,
1768                                   bool dest_uninitialized = false) {
1769 
1770     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1771 
1772     // Input registers (after setup_arg_regs)
1773     const Register from        = c_rarg0;   // source array address
1774     const Register to          = c_rarg1;   // destination array address
1775     const Register count       = c_rarg2;   // elements count
1776     const Register ckoff       = c_rarg3;   // super_check_offset
1777     const Register ckval       = c_rarg4;   // super_klass
1778 
1779     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1780     RegSet wb_post_saved_regs = RegSet::of(count);
1781 
1782     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1783     const Register copied_oop  = r22;       // actual oop copied
1784     const Register count_save  = r21;       // original elements count
1785     const Register start_to    = r20;       // destination array start address
1786     const Register r19_klass   = r19;       // oop._klass
1787 
1788     //---------------------------------------------------------------
1789     // Assembler stub will be used for this call to arraycopy
1790     // if the two arrays are subtypes of Object[] but the
1791     // destination array type is not equal to or a supertype
1792     // of the source type.  Each element must be separately
1793     // checked.
1794 
1795     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1796                                copied_oop, r19_klass, count_save);
1797 
1798     __ align(CodeEntryAlignment);
1799     StubCodeMark mark(this, "StubRoutines", name);
1800     address start = __ pc();
1801 
1802     __ enter(); // required for proper stackwalking of RuntimeStub frame
1803 
1804 #ifdef ASSERT
1805     // caller guarantees that the arrays really are different
1806     // otherwise, we would have to make conjoint checks
1807     { Label L;
1808       __ b(L);                  // conjoint check not yet implemented
1809       __ stop("checkcast_copy within a single array");
1810       __ bind(L);
1811     }
1812 #endif //ASSERT
1813 
1814     // Caller of this entry point must set up the argument registers.
1815     if (entry != NULL) {
1816       *entry = __ pc();
1817       BLOCK_COMMENT("Entry:");
1818     }
1819 
1820     // Empty array:  Nothing to do.
1821     __ cbz(count, L_done);
1822     __ push(RegSet::of(r19, r20, r21, r22), sp);
1823 
1824 #ifdef ASSERT
1825     BLOCK_COMMENT("assert consistent ckoff/ckval");
1826     // The ckoff and ckval must be mutually consistent,
1827     // even though caller generates both.
1828     { Label L;
1829       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1830       __ ldrw(start_to, Address(ckval, sco_offset));
1831       __ cmpw(ckoff, start_to);
1832       __ br(Assembler::EQ, L);
1833       __ stop("super_check_offset inconsistent");
1834       __ bind(L);
1835     }
1836 #endif //ASSERT
1837 
1838     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1839     bool is_oop = true;
1840     if (dest_uninitialized) {
1841       decorators |= IS_DEST_UNINITIALIZED;
1842     }
1843 
1844     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1845     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1846 
1847     // save the original count
1848     __ mov(count_save, count);
1849 
1850     // Copy from low to high addresses
1851     __ mov(start_to, to);              // Save destination array start address
1852     __ b(L_load_element);
1853 
1854     // ======== begin loop ========
1855     // (Loop is rotated; its entry is L_load_element.)
1856     // Loop control:
1857     //   for (; count != 0; count--) {
1858     //     copied_oop = load_heap_oop(from++);
1859     //     ... generate_type_check ...;
1860     //     store_heap_oop(to++, copied_oop);
1861     //   }
1862     __ align(OptoLoopAlignment);
1863 
1864     __ BIND(L_store_element);
1865     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
1866     __ sub(count, count, 1);
1867     __ cbz(count, L_do_card_marks);
1868 
1869     // ======== loop entry is here ========
1870     __ BIND(L_load_element);
1871     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1872     __ cbz(copied_oop, L_store_element);
1873 
1874     __ load_klass(r19_klass, copied_oop);// query the object klass
1875     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1876     // ======== end loop ========
1877 
1878     // It was a real error; we must depend on the caller to finish the job.
1879     // Register count = remaining oops, count_orig = total oops.
1880     // Emit GC store barriers for the oops we have copied and report
1881     // their number to the caller.
1882 
1883     __ subs(count, count_save, count);     // K = partially copied oop count
1884     __ eon(count, count, zr);                   // report (-1^K) to caller
1885     __ br(Assembler::EQ, L_done_pop);
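    // For example, if the type check failed after K == 2 oops had been
    // stored, count is now ~2 == -3; a caller can recover K as ~r0 and knows
    // that the first 2 elements were copied (and are card-marked below).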
1886 
1887     __ BIND(L_do_card_marks);
1888     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1889 
1890     __ bind(L_done_pop);
1891     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1892     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1893 
1894     __ bind(L_done);
1895     __ mov(r0, count);
1896     __ leave();
1897     __ ret(lr);
1898 
1899     return start;
1900   }
1901 
1902   // Perform range checks on the proposed arraycopy.
1903   // Kills temp, but nothing else.
1904   // Also, clean the sign bits of src_pos and dst_pos.
1905   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1906                               Register src_pos, // source position (c_rarg1)
1907                               Register dst,     // destination array oop (c_rarg2)
1908                               Register dst_pos, // destination position (c_rarg3)
1909                               Register length,
1910                               Register temp,
1911                               Label& L_failed) {
1912     BLOCK_COMMENT("arraycopy_range_checks:");
1913 
1914     assert_different_registers(rscratch1, temp);
1915 
1916     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1917     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1918     __ addw(temp, length, src_pos);
1919     __ cmpw(temp, rscratch1);
1920     __ br(Assembler::HI, L_failed);
1921 
1922     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1923     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1924     __ addw(temp, length, dst_pos);
1925     __ cmpw(temp, rscratch1);
1926     __ br(Assembler::HI, L_failed);
1927 
1928     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1929     __ movw(src_pos, src_pos);
1930     __ movw(dst_pos, dst_pos);
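    // (A 32-bit register write such as movw zero-extends into the 64-bit
    // register, so this is effectively src_pos &= 0xFFFFFFFF, and likewise
    // for dst_pos.)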
1931 
1932     BLOCK_COMMENT("arraycopy_range_checks done");
1933   }
1934 
1935   // These stubs get called from some dumb test routine.
1936   // I'll write them properly when they're called from
1937   // something that's actually doing something.
1938   static void fake_arraycopy_stub(address src, address dst, int count) {
1939     assert(count == 0, "huh?");
1940   }
1941 
1942 
1943   //
1944   //  Generate 'unsafe' array copy stub
1945   //  Though just as safe as the other stubs, it takes an unscaled
1946   //  size_t argument instead of an element count.
1947   //
1948   //  Input:
1949   //    c_rarg0   - source array address
1950   //    c_rarg1   - destination array address
1951   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1952   //
1953   // Examines the alignment of the operands and dispatches
1954   // to a long, int, short, or byte copy loop.
1955   //
1956   address generate_unsafe_copy(const char *name,
1957                                address byte_copy_entry,
1958                                address short_copy_entry,
1959                                address int_copy_entry,
1960                                address long_copy_entry) {
1961     Label L_long_aligned, L_int_aligned, L_short_aligned;
1962     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1963 
1964     __ align(CodeEntryAlignment);
1965     StubCodeMark mark(this, "StubRoutines", name);
1966     address start = __ pc();
1967     __ enter(); // required for proper stackwalking of RuntimeStub frame
1968 
1969     // bump this on entry, not on exit:
1970     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1971 
1972     __ orr(rscratch1, s, d);
1973     __ orr(rscratch1, rscratch1, count);
1974 
1975     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1976     __ cbz(rscratch1, L_long_aligned);
1977     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1978     __ cbz(rscratch1, L_int_aligned);
1979     __ tbz(rscratch1, 0, L_short_aligned);
1980     __ b(RuntimeAddress(byte_copy_entry));
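    // The dispatch above is, in effect:
    //
    //   size_t bits = ((size_t)s | (size_t)d | (size_t)count) & (BytesPerLong - 1);
    //   if (bits == 0)            goto long_copy;    // everything 8-byte aligned
    //   else if ((bits & 3) == 0) goto int_copy;     // everything 4-byte aligned
    //   else if ((bits & 1) == 0) goto short_copy;   // everything 2-byte aligned
    //   else                      goto byte_copy;
    //
    // Each target below first converts the byte count into an element count.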
1981 
1982     __ BIND(L_short_aligned);
1983     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1984     __ b(RuntimeAddress(short_copy_entry));
1985     __ BIND(L_int_aligned);
1986     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1987     __ b(RuntimeAddress(int_copy_entry));
1988     __ BIND(L_long_aligned);
1989     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1990     __ b(RuntimeAddress(long_copy_entry));
1991 
1992     return start;
1993   }
1994 
1995   //
1996   //  Generate generic array copy stubs
1997   //
1998   //  Input:
1999   //    c_rarg0    -  src oop
2000   //    c_rarg1    -  src_pos (32-bits)
2001   //    c_rarg2    -  dst oop
2002   //    c_rarg3    -  dst_pos (32-bits)
2003   //    c_rarg4    -  element count (32-bits)
2004   //
2005   //  Output:
2006   //    r0 ==  0  -  success
2007   //    r0 == -1^K - failure, where K is partial transfer count
2008   //
2009   address generate_generic_copy(const char *name,
2010                                 address byte_copy_entry, address short_copy_entry,
2011                                 address int_copy_entry, address oop_copy_entry,
2012                                 address long_copy_entry, address checkcast_copy_entry) {
2013 
2014     Label L_failed, L_objArray;
2015     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2016 
2017     // Input registers
2018     const Register src        = c_rarg0;  // source array oop
2019     const Register src_pos    = c_rarg1;  // source position
2020     const Register dst        = c_rarg2;  // destination array oop
2021     const Register dst_pos    = c_rarg3;  // destination position
2022     const Register length     = c_rarg4;
2023 
2024 
2025     // Registers used as temps
2026     const Register dst_klass  = c_rarg5;
2027 
2028     __ align(CodeEntryAlignment);
2029 
2030     StubCodeMark mark(this, "StubRoutines", name);
2031 
2032     address start = __ pc();
2033 
2034     __ enter(); // required for proper stackwalking of RuntimeStub frame
2035 
2036     // bump this on entry, not on exit:
2037     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2038 
2039     //-----------------------------------------------------------------------
2040     // Assembler stub will be used for this call to arraycopy
2041     // if the following conditions are met:
2042     //
2043     // (1) src and dst must not be null.
2044     // (2) src_pos must not be negative.
2045     // (3) dst_pos must not be negative.
2046     // (4) length  must not be negative.
2047     // (5) src klass and dst klass should be the same and not NULL.
2048     // (6) src and dst should be arrays.
2049     // (7) src_pos + length must not exceed length of src.
2050     // (8) dst_pos + length must not exceed length of dst.
2051     //
2052 
2053     //  if (src == NULL) return -1;
2054     __ cbz(src, L_failed);
2055 
2056     //  if (src_pos < 0) return -1;
2057     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2058 
2059     //  if (dst == NULL) return -1;
2060     __ cbz(dst, L_failed);
2061 
2062     //  if (dst_pos < 0) return -1;
2063     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2064 
2065     // registers used as temp
2066     const Register scratch_length    = r16; // elements count to copy
2067     const Register scratch_src_klass = r17; // array klass
2068     const Register lh                = r15; // layout helper
2069 
2070     //  if (length < 0) return -1;
2071     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2072     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2073 
2074     __ load_klass(scratch_src_klass, src);
2075 #ifdef ASSERT
2076     //  assert(src->klass() != NULL);
2077     {
2078       BLOCK_COMMENT("assert klasses not null {");
2079       Label L1, L2;
2080       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2081       __ bind(L1);
2082       __ stop("broken null klass");
2083       __ bind(L2);
2084       __ load_klass(rscratch1, dst);
2085       __ cbz(rscratch1, L1);     // this would be broken also
2086       BLOCK_COMMENT("} assert klasses not null done");
2087     }
2088 #endif
2089 
2090     // Load layout helper (32-bits)
2091     //
2092     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2093     // 32        30    24            16              8     2                 0
2094     //
2095     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2096     //
2097 
2098     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2099 
2100     // Handle objArrays completely differently...
2101     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2102     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2103     __ movw(rscratch1, objArray_lh);
2104     __ eorw(rscratch2, lh, rscratch1);
2105     __ cbzw(rscratch2, L_objArray);
2106 
2107     //  if (src->klass() != dst->klass()) return -1;
2108     __ load_klass(rscratch2, dst);
2109     __ eor(rscratch2, rscratch2, scratch_src_klass);
2110     __ cbnz(rscratch2, L_failed);
2111 
2112     //  if (!src->is_Array()) return -1;
2113     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2114 
2115     // At this point, it is known to be a typeArray (array_tag 0x3).
2116 #ifdef ASSERT
2117     {
2118       BLOCK_COMMENT("assert primitive array {");
2119       Label L;
2120       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2121       __ cmpw(lh, rscratch2);
2122       __ br(Assembler::GE, L);
2123       __ stop("must be a primitive array");
2124       __ bind(L);
2125       BLOCK_COMMENT("} assert primitive array done");
2126     }
2127 #endif
2128 
2129     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2130                            rscratch2, L_failed);
2131 
2132     // TypeArrayKlass
2133     //
2134     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2135     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2136     //
2137 
2138     const Register rscratch1_offset = rscratch1;    // array offset
2139     const Register r15_elsize = lh; // element size
2140 
2141     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2142            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2143     __ add(src, src, rscratch1_offset);           // src array offset
2144     __ add(dst, dst, rscratch1_offset);           // dst array offset
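    // In effect:
    //   header_bytes = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   src += header_bytes;  dst += header_bytes;
    // so that only the element offsets (src_pos/dst_pos scaled by the element
    // size) remain to be added below.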
2145     BLOCK_COMMENT("choose copy loop based on element size");
2146 
2147     // next registers should be set before the jump to corresponding stub
2148     const Register from     = c_rarg0;  // source array address
2149     const Register to       = c_rarg1;  // destination array address
2150     const Register count    = c_rarg2;  // elements count
2151 
2152     // 'from', 'to' and 'count' must be written in this order, since they
2153     // occupy the same registers as 'src', 'src_pos' and 'dst' respectively.
2154 
2155     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2156 
2157     // The possible values of elsize are 0-3, i.e. exact_log2(element
2158     // size in bytes).  We do a simple bitwise binary search.
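    // In effect (the element-size bits sit at the bottom of lh, so they can
    // be tested in place):
    //
    //   if (elsize & 2) { if (elsize & 1) goto L_copy_longs;  else goto L_copy_ints; }
    //   else            { if (elsize & 1) goto L_copy_shorts; else /* copy bytes */; }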
2159   __ BIND(L_copy_bytes);
2160     __ tbnz(r15_elsize, 1, L_copy_ints);
2161     __ tbnz(r15_elsize, 0, L_copy_shorts);
2162     __ lea(from, Address(src, src_pos));// src_addr
2163     __ lea(to,   Address(dst, dst_pos));// dst_addr
2164     __ movw(count, scratch_length); // length
2165     __ b(RuntimeAddress(byte_copy_entry));
2166 
2167   __ BIND(L_copy_shorts);
2168     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2169     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2170     __ movw(count, scratch_length); // length
2171     __ b(RuntimeAddress(short_copy_entry));
2172 
2173   __ BIND(L_copy_ints);
2174     __ tbnz(r15_elsize, 0, L_copy_longs);
2175     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2176     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2177     __ movw(count, scratch_length); // length
2178     __ b(RuntimeAddress(int_copy_entry));
2179 
2180   __ BIND(L_copy_longs);
2181 #ifdef ASSERT
2182     {
2183       BLOCK_COMMENT("assert long copy {");
2184       Label L;
2185       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2186       __ cmpw(r15_elsize, LogBytesPerLong);
2187       __ br(Assembler::EQ, L);
2188       __ stop("must be long copy, but elsize is wrong");
2189       __ bind(L);
2190       BLOCK_COMMENT("} assert long copy done");
2191     }
2192 #endif
2193     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2194     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2195     __ movw(count, scratch_length); // length
2196     __ b(RuntimeAddress(long_copy_entry));
2197 
2198     // ObjArrayKlass
2199   __ BIND(L_objArray);
2200     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2201 
2202     Label L_plain_copy, L_checkcast_copy;
2203     //  test array classes for subtyping
2204     __ load_klass(r15, dst);
2205     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2206     __ br(Assembler::NE, L_checkcast_copy);
2207 
2208     // Identically typed arrays can be copied without element-wise checks.
2209     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2210                            rscratch2, L_failed);
2211 
2212     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2213     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2214     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2215     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2216     __ movw(count, scratch_length); // length
2217   __ BIND(L_plain_copy);
2218     __ b(RuntimeAddress(oop_copy_entry));
2219 
2220   __ BIND(L_checkcast_copy);
2221     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2222     {
2223       // Before looking at dst.length, make sure dst is also an objArray.
2224       __ ldrw(rscratch1, Address(r15, lh_offset));
2225       __ movw(rscratch2, objArray_lh);
2226       __ eorw(rscratch1, rscratch1, rscratch2);
2227       __ cbnzw(rscratch1, L_failed);
2228 
2229       // It is safe to examine both src.length and dst.length.
2230       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2231                              r15, L_failed);
2232 
2233       __ load_klass(dst_klass, dst); // reload
2234 
2235       // Marshal the base address arguments now, freeing registers.
2236       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2237       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2238       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2239       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2240       __ movw(count, length);           // length (reloaded)
2241       Register sco_temp = c_rarg3;      // this register is free now
2242       assert_different_registers(from, to, count, sco_temp,
2243                                  dst_klass, scratch_src_klass);
2244       // assert_clean_int(count, sco_temp);
2245 
2246       // Generate the type check.
2247       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2248       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2249 
2250       // Smashes rscratch1, rscratch2
2251       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2252 
2253       // Fetch destination element klass from the ObjArrayKlass header.
2254       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2255       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2256       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2257 
2258       // the checkcast_copy loop needs two extra arguments:
2259       assert(c_rarg3 == sco_temp, "#3 already in place");
2260       // Set up arguments for checkcast_copy_entry.
2261       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2262       __ b(RuntimeAddress(checkcast_copy_entry));
2263     }
2264 
2265   __ BIND(L_failed);
2266     __ mov(r0, -1);
2267     __ leave();   // required for proper stackwalking of RuntimeStub frame
2268     __ ret(lr);
2269 
2270     return start;
2271   }
2272 
2273   //
2274   // Generate stub for array fill. If "aligned" is true, the
2275   // "to" address is assumed to be heapword aligned.
2276   //
2277   // Arguments for generated stub:
2278   //   to:    c_rarg0
2279   //   value: c_rarg1
2280   //   count: c_rarg2 treated as signed
2281   //
2282   address generate_fill(BasicType t, bool aligned, const char *name) {
2283     __ align(CodeEntryAlignment);
2284     StubCodeMark mark(this, "StubRoutines", name);
2285     address start = __ pc();
2286 
2287     BLOCK_COMMENT("Entry:");
2288 
2289     const Register to        = c_rarg0;  // destination array address
2290     const Register value     = c_rarg1;  // value
2291     const Register count     = c_rarg2;  // elements count
2292 
2293     const Register bz_base = r10;        // base for block_zero routine
2294     const Register cnt_words = r11;      // temp register
2295 
2296     __ enter();
2297 
2298     Label L_fill_elements, L_exit1;
2299 
2300     int shift = -1;
2301     switch (t) {
2302       case T_BYTE:
2303         shift = 0;
2304         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2305         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2306         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2307         __ br(Assembler::LO, L_fill_elements);
2308         break;
2309       case T_SHORT:
2310         shift = 1;
2311         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2312         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2313         __ br(Assembler::LO, L_fill_elements);
2314         break;
2315       case T_INT:
2316         shift = 2;
2317         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2318         __ br(Assembler::LO, L_fill_elements);
2319         break;
2320       default: ShouldNotReachHere();
2321     }
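    // The bfi sequences above replicate the fill value across 32 bits, e.g. a
    // byte value of 0xAB becomes 0xABABABAB; the bfi(value, value, 32, 32)
    // further down widens it to 0xABABABABABABABAB so whole doublewords can
    // be stored at once.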
2322 
2323     // Align source address at 8 bytes address boundary.
2324     Label L_skip_align1, L_skip_align2, L_skip_align4;
2325     if (!aligned) {
2326       switch (t) {
2327         case T_BYTE:
2328           // One byte misalignment happens only for byte arrays.
2329           __ tbz(to, 0, L_skip_align1);
2330           __ strb(value, Address(__ post(to, 1)));
2331           __ subw(count, count, 1);
2332           __ bind(L_skip_align1);
2333           // Fallthrough
2334         case T_SHORT:
2335           // Two bytes misalignment happens only for byte and short (char) arrays.
2336           __ tbz(to, 1, L_skip_align2);
2337           __ strh(value, Address(__ post(to, 2)));
2338           __ subw(count, count, 2 >> shift);
2339           __ bind(L_skip_align2);
2340           // Fallthrough
2341         case T_INT:
2342           // Align to 8 bytes, we know we are 4 byte aligned to start.
2343           __ tbz(to, 2, L_skip_align4);
2344           __ strw(value, Address(__ post(to, 4)));
2345           __ subw(count, count, 4 >> shift);
2346           __ bind(L_skip_align4);
2347           break;
2348         default: ShouldNotReachHere();
2349       }
2350     }
2351 
2352     //
2353     //  Fill large chunks
2354     //
2355     __ lsrw(cnt_words, count, 3 - shift); // number of words
2356     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2357     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
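    // Worked example for T_SHORT (shift == 1) with count == 13 elements:
    //   cnt_words = 13 >> 2 == 3 doublewords are filled in bulk below, and
    //   count -= 3 << 2 leaves count == 1 trailing element for the tail code.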
2358     if (UseBlockZeroing) {
2359       Label non_block_zeroing, rest;
2360       // If the fill value is zero we can use the fast zero_words().
2361       __ cbnz(value, non_block_zeroing);
2362       __ mov(bz_base, to);
2363       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2364       address tpc = __ zero_words(bz_base, cnt_words);
2365       if (tpc == nullptr) {
2366         fatal("CodeCache is full at generate_fill");
2367       }
2368       __ b(rest);
2369       __ bind(non_block_zeroing);
2370       __ fill_words(to, cnt_words, value);
2371       __ bind(rest);
2372     } else {
2373       __ fill_words(to, cnt_words, value);
2374     }
2375 
2376     // Remaining count is less than 8 bytes. Fill it by a single store.
2377     // Note that the total length is no less than 8 bytes.
2378     if (t == T_BYTE || t == T_SHORT) {
2379       Label L_exit1;
2380       __ cbzw(count, L_exit1);
2381       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2382       __ str(value, Address(to, -8));    // overwrite some elements
2383       __ bind(L_exit1);
2384       __ leave();
2385       __ ret(lr);
2386     }
2387 
2388     // Handle copies less than 8 bytes.
2389     Label L_fill_2, L_fill_4, L_exit2;
2390     __ bind(L_fill_elements);
2391     switch (t) {
2392       case T_BYTE:
2393         __ tbz(count, 0, L_fill_2);
2394         __ strb(value, Address(__ post(to, 1)));
2395         __ bind(L_fill_2);
2396         __ tbz(count, 1, L_fill_4);
2397         __ strh(value, Address(__ post(to, 2)));
2398         __ bind(L_fill_4);
2399         __ tbz(count, 2, L_exit2);
2400         __ strw(value, Address(to));
2401         break;
2402       case T_SHORT:
2403         __ tbz(count, 0, L_fill_4);
2404         __ strh(value, Address(__ post(to, 2)));
2405         __ bind(L_fill_4);
2406         __ tbz(count, 1, L_exit2);
2407         __ strw(value, Address(to));
2408         break;
2409       case T_INT:
2410         __ cbzw(count, L_exit2);
2411         __ strw(value, Address(to));
2412         break;
2413       default: ShouldNotReachHere();
2414     }
2415     __ bind(L_exit2);
2416     __ leave();
2417     __ ret(lr);
2418     return start;
2419   }
2420 
2421   address generate_data_cache_writeback() {
2422     const Register line        = c_rarg0;  // address of line to write back
2423 
2424     __ align(CodeEntryAlignment);
2425 
2426     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2427 
2428     address start = __ pc();
2429     __ enter();
2430     __ cache_wb(Address(line, 0));
2431     __ leave();
2432     __ ret(lr);
2433 
2434     return start;
2435   }
2436 
2437   address generate_data_cache_writeback_sync() {
2438     const Register is_pre     = c_rarg0;  // pre or post sync
2439 
2440     __ align(CodeEntryAlignment);
2441 
2442     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2443 
2444     // pre wbsync is a no-op
2445     // post wbsync emits a memory barrier
2446 
2447     Label skip;
2448     address start = __ pc();
2449     __ enter();
2450     __ cbnz(is_pre, skip);
2451     __ cache_wbsync(false);
2452     __ bind(skip);
2453     __ leave();
2454     __ ret(lr);
2455 
2456     return start;
2457   }
2458 
2459   void generate_arraycopy_stubs() {
2460     address entry;
2461     address entry_jbyte_arraycopy;
2462     address entry_jshort_arraycopy;
2463     address entry_jint_arraycopy;
2464     address entry_oop_arraycopy;
2465     address entry_jlong_arraycopy;
2466     address entry_checkcast_arraycopy;
2467 
2468     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2469     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2470 
2471     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2472 
2473     //*** jbyte
2474     // Always need aligned and unaligned versions
2475     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2476                                                                                   "jbyte_disjoint_arraycopy");
2477     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2478                                                                                   &entry_jbyte_arraycopy,
2479                                                                                   "jbyte_arraycopy");
2480     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2481                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2482     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2483                                                                                   "arrayof_jbyte_arraycopy");
2484 
2485     //*** jshort
2486     // Always need aligned and unaligned versions
2487     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2488                                                                                     "jshort_disjoint_arraycopy");
2489     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2490                                                                                     &entry_jshort_arraycopy,
2491                                                                                     "jshort_arraycopy");
2492     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2493                                                                                     "arrayof_jshort_disjoint_arraycopy");
2494     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2495                                                                                     "arrayof_jshort_arraycopy");
2496 
2497     //*** jint
2498     // Aligned versions
2499     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2500                                                                                 "arrayof_jint_disjoint_arraycopy");
2501     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2502                                                                                 "arrayof_jint_arraycopy");
2503     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2504     // entry_jint_arraycopy always points to the unaligned version
2505     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2506                                                                                 "jint_disjoint_arraycopy");
2507     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2508                                                                                 &entry_jint_arraycopy,
2509                                                                                 "jint_arraycopy");
2510 
2511     //*** jlong
2512     // It is always aligned
2513     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2514                                                                                   "arrayof_jlong_disjoint_arraycopy");
2515     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2516                                                                                   "arrayof_jlong_arraycopy");
2517     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2518     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2519 
2520     //*** oops
2521     {
2522       // With compressed oops we need unaligned versions; notice that
2523       // we overwrite entry_oop_arraycopy.
2524       bool aligned = !UseCompressedOops;
2525 
2526       StubRoutines::_arrayof_oop_disjoint_arraycopy
2527         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2528                                      /*dest_uninitialized*/false);
2529       StubRoutines::_arrayof_oop_arraycopy
2530         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2531                                      /*dest_uninitialized*/false);
2532       // Aligned versions without pre-barriers
2533       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2534         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2535                                      /*dest_uninitialized*/true);
2536       StubRoutines::_arrayof_oop_arraycopy_uninit
2537         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2538                                      /*dest_uninitialized*/true);
2539     }
2540 
2541     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2542     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2543     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2544     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2545 
2546     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2547     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2548                                                                         /*dest_uninitialized*/true);
2549 
2550     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2551                                                               entry_jbyte_arraycopy,
2552                                                               entry_jshort_arraycopy,
2553                                                               entry_jint_arraycopy,
2554                                                               entry_jlong_arraycopy);
2555 
2556     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2557                                                                entry_jbyte_arraycopy,
2558                                                                entry_jshort_arraycopy,
2559                                                                entry_jint_arraycopy,
2560                                                                entry_oop_arraycopy,
2561                                                                entry_jlong_arraycopy,
2562                                                                entry_checkcast_arraycopy);
2563 
2564     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2565     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2566     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2567     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2568     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2569     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2570   }
2571 
2572   void generate_math_stubs() { Unimplemented(); }
2573 
2574   // Arguments:
2575   //
2576   // Inputs:
2577   //   c_rarg0   - source byte array address
2578   //   c_rarg1   - destination byte array address
2579   //   c_rarg2   - K (key) in little endian int array
2580   //
2581   address generate_aescrypt_encryptBlock() {
2582     __ align(CodeEntryAlignment);
2583     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2584 
2585     const Register from        = c_rarg0;  // source array address
2586     const Register to          = c_rarg1;  // destination array address
2587     const Register key         = c_rarg2;  // key array address
2588     const Register keylen      = rscratch1;
2589 
2590     address start = __ pc();
2591     __ enter();
2592 
2593     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
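    // 'key' points at the first int of the key array, so the arrayOop length
    // field is read at a negative offset from it; keylen is the number of
    // round-key ints (44, 52 or 60 for AES-128/192/256).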
2594 
2595     __ aesenc_loadkeys(key, keylen);
2596     __ aesecb_encrypt(from, to, keylen);
2597 
2598     __ mov(r0, 0);
2599 
2600     __ leave();
2601     __ ret(lr);
2602 
2603     return start;
2604   }
2605 
2606   // Arguments:
2607   //
2608   // Inputs:
2609   //   c_rarg0   - source byte array address
2610   //   c_rarg1   - destination byte array address
2611   //   c_rarg2   - K (key) in little endian int array
2612   //
2613   address generate_aescrypt_decryptBlock() {
2614     assert(UseAES, "need AES cryptographic extension support");
2615     __ align(CodeEntryAlignment);
2616     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2617     Label L_doLast;
2618 
2619     const Register from        = c_rarg0;  // source array address
2620     const Register to          = c_rarg1;  // destination array address
2621     const Register key         = c_rarg2;  // key array address
2622     const Register keylen      = rscratch1;
2623 
2624     address start = __ pc();
2625     __ enter(); // required for proper stackwalking of RuntimeStub frame
2626 
2627     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2628 
2629     __ aesecb_decrypt(from, to, key, keylen);
2630 
2631     __ mov(r0, 0);
2632 
2633     __ leave();
2634     __ ret(lr);
2635 
2636     return start;
2637   }
2638 
2639   // Arguments:
2640   //
2641   // Inputs:
2642   //   c_rarg0   - source byte array address
2643   //   c_rarg1   - destination byte array address
2644   //   c_rarg2   - K (key) in little endian int array
2645   //   c_rarg3   - r vector byte array address
2646   //   c_rarg4   - input length
2647   //
2648   // Output:
2649   //   r0        - input length
2650   //
2651   address generate_cipherBlockChaining_encryptAESCrypt() {
2652     assert(UseAES, "need AES cryptographic extension support");
2653     __ align(CodeEntryAlignment);
2654     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2655 
2656     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2657 
2658     const Register from        = c_rarg0;  // source array address
2659     const Register to          = c_rarg1;  // destination array address
2660     const Register key         = c_rarg2;  // key array address
2661     const Register rvec        = c_rarg3;  // r vector byte array address; initialized from the init
2662                                            // vector and left holding the last ciphertext block
2663     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2664     const Register keylen      = rscratch1;
2665 
2666     address start = __ pc();
2667 
2668       __ enter();
2669 
2670       __ movw(rscratch2, len_reg);
2671 
2672       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2673 
2674       __ ld1(v0, __ T16B, rvec);
2675 
2676       __ cmpw(keylen, 52);
2677       __ br(Assembler::CC, L_loadkeys_44);
2678       __ br(Assembler::EQ, L_loadkeys_52);
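      // keylen is 44, 52 or 60 round-key ints (AES-128/192/256); the two
      // branches above skip loading the round keys that the shorter key
      // schedules do not have.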
2679 
2680       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2681       __ rev32(v17, __ T16B, v17);
2682       __ rev32(v18, __ T16B, v18);
2683     __ BIND(L_loadkeys_52);
2684       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2685       __ rev32(v19, __ T16B, v19);
2686       __ rev32(v20, __ T16B, v20);
2687     __ BIND(L_loadkeys_44);
2688       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2689       __ rev32(v21, __ T16B, v21);
2690       __ rev32(v22, __ T16B, v22);
2691       __ rev32(v23, __ T16B, v23);
2692       __ rev32(v24, __ T16B, v24);
2693       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2694       __ rev32(v25, __ T16B, v25);
2695       __ rev32(v26, __ T16B, v26);
2696       __ rev32(v27, __ T16B, v27);
2697       __ rev32(v28, __ T16B, v28);
2698       __ ld1(v29, v30, v31, __ T16B, key);
2699       __ rev32(v29, __ T16B, v29);
2700       __ rev32(v30, __ T16B, v30);
2701       __ rev32(v31, __ T16B, v31);
2702 
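      // CBC encryption, one 16-byte block per iteration:
      //   C[i] = E_K(P[i] ^ C[i-1]),  with C[-1] = IV (loaded into v0 from rvec above).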
2703     __ BIND(L_aes_loop);
2704       __ ld1(v1, __ T16B, __ post(from, 16));
2705       __ eor(v0, __ T16B, v0, v1);
2706 
2707       __ br(Assembler::CC, L_rounds_44);
2708       __ br(Assembler::EQ, L_rounds_52);
2709 
2710       __ aese(v0, v17); __ aesmc(v0, v0);
2711       __ aese(v0, v18); __ aesmc(v0, v0);
2712     __ BIND(L_rounds_52);
2713       __ aese(v0, v19); __ aesmc(v0, v0);
2714       __ aese(v0, v20); __ aesmc(v0, v0);
2715     __ BIND(L_rounds_44);
2716       __ aese(v0, v21); __ aesmc(v0, v0);
2717       __ aese(v0, v22); __ aesmc(v0, v0);
2718       __ aese(v0, v23); __ aesmc(v0, v0);
2719       __ aese(v0, v24); __ aesmc(v0, v0);
2720       __ aese(v0, v25); __ aesmc(v0, v0);
2721       __ aese(v0, v26); __ aesmc(v0, v0);
2722       __ aese(v0, v27); __ aesmc(v0, v0);
2723       __ aese(v0, v28); __ aesmc(v0, v0);
2724       __ aese(v0, v29); __ aesmc(v0, v0);
2725       __ aese(v0, v30);
2726       __ eor(v0, __ T16B, v0, v31);
2727 
2728       __ st1(v0, __ T16B, __ post(to, 16));
2729 
2730       __ subw(len_reg, len_reg, 16);
2731       __ cbnzw(len_reg, L_aes_loop);
2732 
2733       __ st1(v0, __ T16B, rvec);
2734 
2735       __ mov(r0, rscratch2);
2736 
2737       __ leave();
2738       __ ret(lr);
2739 
2740       return start;
2741   }
2742 
2743   // Arguments:
2744   //
2745   // Inputs:
2746   //   c_rarg0   - source byte array address
2747   //   c_rarg1   - destination byte array address
2748   //   c_rarg2   - K (key) in little endian int array
2749   //   c_rarg3   - r vector byte array address
2750   //   c_rarg4   - input length
2751   //
2752   // Output:
2753   //   r0        - input length
2754   //
2755   address generate_cipherBlockChaining_decryptAESCrypt() {
2756     assert(UseAES, "need AES cryptographic extension support");
2757     __ align(CodeEntryAlignment);
2758     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2759 
2760     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2761 
2762     const Register from        = c_rarg0;  // source array address
2763     const Register to          = c_rarg1;  // destination array address
2764     const Register key         = c_rarg2;  // key array address
2765     const Register rvec        = c_rarg3;  // r vector byte array address; initialized from the init
2766                                            // vector and left holding the last input ciphertext block
2767     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2768     const Register keylen      = rscratch1;
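
    // What the loop below implements, roughly (CBC decryption; the AES
    // rounds themselves are the aesd/aesimc pairs further down):
    //
    //    prev = rvec;                          // IV on entry
    //    for each 16-byte block C in from[] {
    //        P = AES_decrypt(C, key) ^ prev;   // P_i = D_K(C_i) ^ C_{i-1}
    //        store P to to[];
    //        prev = C;                         // kept in v1/v2 below
    //    }
    //    rvec = prev;                          // last ciphertext block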
2769 
2770     address start = __ pc();
2771 
2772       __ enter();
2773 
2774       __ movw(rscratch2, len_reg);
2775 
2776       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2777 
2778       __ ld1(v2, __ T16B, rvec);
2779 
2780       __ ld1(v31, __ T16B, __ post(key, 16));
2781       __ rev32(v31, __ T16B, v31);
2782 
2783       __ cmpw(keylen, 52);
2784       __ br(Assembler::CC, L_loadkeys_44);
2785       __ br(Assembler::EQ, L_loadkeys_52);
2786 
2787       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2788       __ rev32(v17, __ T16B, v17);
2789       __ rev32(v18, __ T16B, v18);
2790     __ BIND(L_loadkeys_52);
2791       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2792       __ rev32(v19, __ T16B, v19);
2793       __ rev32(v20, __ T16B, v20);
2794     __ BIND(L_loadkeys_44);
2795       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2796       __ rev32(v21, __ T16B, v21);
2797       __ rev32(v22, __ T16B, v22);
2798       __ rev32(v23, __ T16B, v23);
2799       __ rev32(v24, __ T16B, v24);
2800       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2801       __ rev32(v25, __ T16B, v25);
2802       __ rev32(v26, __ T16B, v26);
2803       __ rev32(v27, __ T16B, v27);
2804       __ rev32(v28, __ T16B, v28);
2805       __ ld1(v29, v30, __ T16B, key);
2806       __ rev32(v29, __ T16B, v29);
2807       __ rev32(v30, __ T16B, v30);
2808 
2809     __ BIND(L_aes_loop);
2810       __ ld1(v0, __ T16B, __ post(from, 16));
2811       __ orr(v1, __ T16B, v0, v0);
2812 
2813       __ br(Assembler::CC, L_rounds_44);
2814       __ br(Assembler::EQ, L_rounds_52);
2815 
2816       __ aesd(v0, v17); __ aesimc(v0, v0);
2817       __ aesd(v0, v18); __ aesimc(v0, v0);
2818     __ BIND(L_rounds_52);
2819       __ aesd(v0, v19); __ aesimc(v0, v0);
2820       __ aesd(v0, v20); __ aesimc(v0, v0);
2821     __ BIND(L_rounds_44);
2822       __ aesd(v0, v21); __ aesimc(v0, v0);
2823       __ aesd(v0, v22); __ aesimc(v0, v0);
2824       __ aesd(v0, v23); __ aesimc(v0, v0);
2825       __ aesd(v0, v24); __ aesimc(v0, v0);
2826       __ aesd(v0, v25); __ aesimc(v0, v0);
2827       __ aesd(v0, v26); __ aesimc(v0, v0);
2828       __ aesd(v0, v27); __ aesimc(v0, v0);
2829       __ aesd(v0, v28); __ aesimc(v0, v0);
2830       __ aesd(v0, v29); __ aesimc(v0, v0);
2831       __ aesd(v0, v30);
2832       __ eor(v0, __ T16B, v0, v31);
2833       __ eor(v0, __ T16B, v0, v2);
2834 
2835       __ st1(v0, __ T16B, __ post(to, 16));
2836       __ orr(v2, __ T16B, v1, v1);
2837 
2838       __ subw(len_reg, len_reg, 16);
2839       __ cbnzw(len_reg, L_aes_loop);
2840 
2841       __ st1(v2, __ T16B, rvec);
2842 
2843       __ mov(r0, rscratch2);
2844 
2845       __ leave();
2846       __ ret(lr);
2847 
2848     return start;
2849   }
2850 
2851   // CTR AES crypt.
2852   // Arguments:
2853   //
2854   // Inputs:
2855   //   c_rarg0   - source byte array address
2856   //   c_rarg1   - destination byte array address
2857   //   c_rarg2   - K (key) in little endian int array
2858   //   c_rarg3   - counter vector byte array address
2859   //   c_rarg4   - input length
2860   //   c_rarg5   - saved encryptedCounter start
2861   //   c_rarg6   - saved used length
2862   //
2863   // Output:
2864   //   r0       - input length
2865   //
2866   address generate_counterMode_AESCrypt() {
2867     const Register in = c_rarg0;
2868     const Register out = c_rarg1;
2869     const Register key = c_rarg2;
2870     const Register counter = c_rarg3;
2871     const Register saved_len = c_rarg4, len = r10;
2872     const Register saved_encrypted_ctr = c_rarg5;
2873     const Register used_ptr = c_rarg6, used = r12;
2874 
2875     const Register offset = r7;
2876     const Register keylen = r11;
2877 
2878     const unsigned char block_size = 16;
2879     const int bulk_width = 4;
2880     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
2881     // performance with larger data sizes, but it also means that the
2882     // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be processed on the slow path. For
2884     // that reason, and also so as not to blow away too much icache, 4
2885     // blocks seems like a sensible compromise.
2886 
2887     // Algorithm:
2888     //
2889     //    if (len == 0) {
2890     //        goto DONE;
2891     //    }
2892     //    int result = len;
2893     //    do {
2894     //        if (used >= blockSize) {
2895     //            if (len >= bulk_width * blockSize) {
2896     //                CTR_large_block();
2897     //                if (len == 0)
2898     //                    goto DONE;
2899     //            }
2900     //            for (;;) {
2901     //                16ByteVector v0 = counter;
2902     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2903     //                used = 0;
2904     //                if (len < blockSize)
2905     //                    break;    /* goto NEXT */
2906     //                16ByteVector v1 = load16Bytes(in, offset);
2907     //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(v1, out, offset);
2909     //                used = blockSize;
2910     //                offset += blockSize;
2911     //                len -= blockSize;
2912     //                if (len == 0)
2913     //                    goto DONE;
2914     //            }
2915     //        }
2916     //      NEXT:
2917     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2918     //        len--;
2919     //    } while (len != 0);
2920     //  DONE:
2921     //    return result;
2922     //
2923     // CTR_large_block()
2924     //    Wide bulk encryption of whole blocks.
2925 
2926     __ align(CodeEntryAlignment);
2927     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2928     const address start = __ pc();
2929     __ enter();
2930 
2931     Label DONE, CTR_large_block, large_block_return;
2932     __ ldrw(used, Address(used_ptr));
2933     __ cbzw(saved_len, DONE);
2934 
2935     __ mov(len, saved_len);
2936     __ mov(offset, 0);
2937 
2938     // Compute #rounds for AES based on the length of the key array
2939     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2940 
2941     __ aesenc_loadkeys(key, keylen);
2942 
2943     {
2944       Label L_CTR_loop, NEXT;
2945 
2946       __ bind(L_CTR_loop);
2947 
2948       __ cmp(used, block_size);
2949       __ br(__ LO, NEXT);
2950 
2951       // Maybe we have a lot of data
2952       __ subsw(rscratch1, len, bulk_width * block_size);
2953       __ br(__ HS, CTR_large_block);
2954       __ BIND(large_block_return);
2955       __ cbzw(len, DONE);
2956 
2957       // Setup the counter
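      // The 16-byte counter is handled as big-endian words here: rev32
      // byte-swaps each 32-bit word, the {0, 0, 0, 1} overlay bumps the
      // low-order word, and a second rev32 restores the byte order before
      // the incremented value is stored back.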
2958       __ movi(v4, __ T4S, 0);
2959       __ movi(v5, __ T4S, 1);
2960       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
2961 
2962       __ ld1(v0, __ T16B, counter); // Load the counter into v0
2963       __ rev32(v16, __ T16B, v0);
2964       __ addv(v16, __ T4S, v16, v4);
2965       __ rev32(v16, __ T16B, v16);
2966       __ st1(v16, __ T16B, counter); // Save the incremented counter back
2967 
2968       {
2969         // We have fewer than bulk_width blocks of data left. Encrypt
2970         // them one by one until there is less than a full block
2971         // remaining, being careful to save both the encrypted counter
2972         // and the counter.
2973 
2974         Label inner_loop;
2975         __ bind(inner_loop);
2976         // Counter to encrypt is in v0
2977         __ aesecb_encrypt(noreg, noreg, keylen);
2978         __ st1(v0, __ T16B, saved_encrypted_ctr);
2979 
2980         // Do we have a remaining full block?
2981 
2982         __ mov(used, 0);
2983         __ cmp(len, block_size);
2984         __ br(__ LO, NEXT);
2985 
2986         // Yes, we have a full block
2987         __ ldrq(v1, Address(in, offset));
2988         __ eor(v1, __ T16B, v1, v0);
2989         __ strq(v1, Address(out, offset));
2990         __ mov(used, block_size);
2991         __ add(offset, offset, block_size);
2992 
2993         __ subw(len, len, block_size);
2994         __ cbzw(len, DONE);
2995 
2996         // Increment the counter, store it back
2997         __ orr(v0, __ T16B, v16, v16);
2998         __ rev32(v16, __ T16B, v16);
2999         __ addv(v16, __ T4S, v16, v4);
3000         __ rev32(v16, __ T16B, v16);
3001         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3002 
3003         __ b(inner_loop);
3004       }
3005 
3006       __ BIND(NEXT);
3007 
3008       // Encrypt a single byte, and loop.
3009       // We expect this to be a rare event.
3010       __ ldrb(rscratch1, Address(in, offset));
3011       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3012       __ eor(rscratch1, rscratch1, rscratch2);
3013       __ strb(rscratch1, Address(out, offset));
3014       __ add(offset, offset, 1);
3015       __ add(used, used, 1);
      __ subw(len, len, 1);
3017       __ cbnzw(len, L_CTR_loop);
3018     }
3019 
3020     __ bind(DONE);
3021     __ strw(used, Address(used_ptr));
3022     __ mov(r0, saved_len);
3023 
3024     __ leave(); // required for proper stackwalking of RuntimeStub frame
3025     __ ret(lr);
3026 
3027     // Bulk encryption
3028 
    __ BIND(CTR_large_block);
3030     assert(bulk_width == 4 || bulk_width == 8, "must be");
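
    // v8..v15 have callee-saved low halves under AAPCS64 and are used as
    // scratch by the bulk path below, so their full 128-bit contents are
    // saved here and restored before returning to large_block_return.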
3031 
3032     if (bulk_width == 8) {
3033       __ sub(sp, sp, 4 * 16);
3034       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3035     }
3036     __ sub(sp, sp, 4 * 16);
3037     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3038     RegSet saved_regs = (RegSet::of(in, out, offset)
3039                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3040     __ push(saved_regs, sp);
3041     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3042     __ add(in, in, offset);
3043     __ add(out, out, offset);
3044 
3045     // Keys should already be loaded into the correct registers
3046 
3047     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3048     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3049 
3050     // AES/CTR loop
3051     {
3052       Label L_CTR_loop;
3053       __ BIND(L_CTR_loop);
3054 
3055       // Setup the counters
3056       __ movi(v8, __ T4S, 0);
3057       __ movi(v9, __ T4S, 1);
3058       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3059 
3060       for (int i = 0; i < bulk_width; i++) {
3061         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3062         __ rev32(v0_ofs, __ T16B, v16);
3063         __ addv(v16, __ T4S, v16, v8);
3064       }
3065 
3066       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3067 
3068       // Encrypt the counters
3069       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3070 
3071       if (bulk_width == 8) {
3072         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3073       }
3074 
3075       // XOR the encrypted counters with the inputs
3076       for (int i = 0; i < bulk_width; i++) {
3077         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3078         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3079         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3080       }
3081 
3082       // Write the encrypted data
3083       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3084       if (bulk_width == 8) {
3085         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3086       }
3087 
3088       __ subw(len, len, 16 * bulk_width);
3089       __ cbnzw(len, L_CTR_loop);
3090     }
3091 
3092     // Save the counter back where it goes
3093     __ rev32(v16, __ T16B, v16);
3094     __ st1(v16, __ T16B, counter);
3095 
3096     __ pop(saved_regs, sp);
3097 
3098     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3099     if (bulk_width == 8) {
3100       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3101     }
3102 
3103     __ andr(rscratch1, len, -16 * bulk_width);
3104     __ sub(len, len, rscratch1);
3105     __ add(offset, offset, rscratch1);
3106     __ mov(used, 16);
3107     __ strw(used, Address(used_ptr));
3108     __ b(large_block_return);
3109 
3110     return start;
3111   }
3112 
3113   // Vector AES Galois Counter Mode implementation. Parameters:
3114   //
3115   // in = c_rarg0
3116   // len = c_rarg1
3117   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3118   // out = c_rarg3
3119   // key = c_rarg4
3120   // state = c_rarg5 - GHASH.state
3121   // subkeyHtbl = c_rarg6 - powers of H
3122   // counter = c_rarg7 - 16 bytes of CTR
3123   // return - number of processed bytes
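  //
  // Note: only a multiple of 8 full blocks (128 bytes) is processed here;
  // len is masked down to a multiple of 128 below and the byte count that
  // was actually consumed is returned, leaving any tail to the caller.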
3124   address generate_galoisCounterMode_AESCrypt() {
3125     address ghash_polynomial = __ pc();
3126     __ emit_int64(0x87);  // The low-order bits of the field
3127                           // polynomial (i.e. p = z^7+z^2+z+1)
3128                           // repeated in the low and high parts of a
3129                           // 128-bit vector
3130     __ emit_int64(0x87);
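    // (0x87 encodes x^7 + x^2 + x + 1, i.e. the low-order terms of the
    // GHASH reduction polynomial x^128 + x^7 + x^2 + x + 1 over GF(2).)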
3131 
3132     __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3134     address start = __ pc();
3135     __ enter();
3136 
3137     const Register in = c_rarg0;
3138     const Register len = c_rarg1;
3139     const Register ct = c_rarg2;
3140     const Register out = c_rarg3;
    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    const Register counter = c_rarg7;   // 16 bytes of CTR; updated with the
                                        // incremented counter on return
3149 
3150     const Register keylen = r10;
3151     // Save state before entering routine
3152     __ sub(sp, sp, 4 * 16);
3153     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3154     __ sub(sp, sp, 4 * 16);
3155     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3156 
3157     // __ andr(len, len, -512);
3158     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3159     __ str(len, __ pre(sp, -2 * wordSize));
3160 
3161     Label DONE;
3162     __ cbz(len, DONE);
3163 
3164     // Compute #rounds for AES based on the length of the key array
3165     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3166 
3167     __ aesenc_loadkeys(key, keylen);
3168     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3169     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3170 
3171     // AES/CTR loop
3172     {
3173       Label L_CTR_loop;
3174       __ BIND(L_CTR_loop);
3175 
3176       // Setup the counters
3177       __ movi(v8, __ T4S, 0);
3178       __ movi(v9, __ T4S, 1);
3179       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3180 
3181       assert(v0->encoding() < v8->encoding(), "");
3182       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3183         FloatRegister f = as_FloatRegister(i);
3184         __ rev32(f, __ T16B, v16);
3185         __ addv(v16, __ T4S, v16, v8);
3186       }
3187 
3188       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3189 
3190       // Encrypt the counters
3191       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3192 
3193       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3194 
3195       // XOR the encrypted counters with the inputs
3196       for (int i = 0; i < 8; i++) {
3197         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3198         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3199         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3200       }
3201       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3202       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3203 
3204       __ subw(len, len, 16 * 8);
3205       __ cbnzw(len, L_CTR_loop);
3206     }
3207 
3208     __ rev32(v16, __ T16B, v16);
3209     __ st1(v16, __ T16B, counter);
3210 
3211     __ ldr(len, Address(sp));
3212     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3213 
3214     // GHASH/CTR loop
3215     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3216                                 len, /*unrolls*/4);
3217 
3218 #ifdef ASSERT
3219     { Label L;
3220       __ cmp(len, (unsigned char)0);
3221       __ br(Assembler::EQ, L);
3222       __ stop("stubGenerator: abort");
3223       __ bind(L);
3224   }
3225 #endif
3226 
3227   __ bind(DONE);
3228     // Return the number of bytes processed
3229     __ ldr(r0, __ post(sp, 2 * wordSize));
3230 
3231     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3232     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3233 
3234     __ leave(); // required for proper stackwalking of RuntimeStub frame
3235     __ ret(lr);
    return start;
3237   }
3238 
  // Utility routines for MD5.
  // Clobbers r10 and r11.
3241   void md5_FF(Register buf, Register r1, Register r2, Register r3, Register r4,
3242               int k, int s, int t) {
3243     Register rscratch3 = r10;
3244     Register rscratch4 = r11;
3245 
3246     __ eorw(rscratch3, r3, r4);
3247     __ movw(rscratch2, t);
3248     __ andw(rscratch3, rscratch3, r2);
3249     __ addw(rscratch4, r1, rscratch2);
3250     __ ldrw(rscratch1, Address(buf, k*4));
3251     __ eorw(rscratch3, rscratch3, r4);
3252     __ addw(rscratch4, rscratch4, rscratch1);
3253     __ addw(rscratch3, rscratch3, rscratch4);
3254     __ rorw(rscratch2, rscratch3, 32 - s);
3255     __ addw(r1, rscratch2, r2);
3256   }
3257 
3258   void md5_GG(Register buf, Register r1, Register r2, Register r3, Register r4,
3259               int k, int s, int t) {
3260     Register rscratch3 = r10;
3261     Register rscratch4 = r11;
3262 
3263     __ andw(rscratch3, r2, r4);
3264     __ bicw(rscratch4, r3, r4);
3265     __ ldrw(rscratch1, Address(buf, k*4));
3266     __ movw(rscratch2, t);
3267     __ orrw(rscratch3, rscratch3, rscratch4);
3268     __ addw(rscratch4, r1, rscratch2);
3269     __ addw(rscratch4, rscratch4, rscratch1);
3270     __ addw(rscratch3, rscratch3, rscratch4);
3271     __ rorw(rscratch2, rscratch3, 32 - s);
3272     __ addw(r1, rscratch2, r2);
3273   }
3274 
3275   void md5_HH(Register buf, Register r1, Register r2, Register r3, Register r4,
3276               int k, int s, int t) {
3277     Register rscratch3 = r10;
3278     Register rscratch4 = r11;
3279 
3280     __ eorw(rscratch3, r3, r4);
3281     __ movw(rscratch2, t);
3282     __ addw(rscratch4, r1, rscratch2);
3283     __ ldrw(rscratch1, Address(buf, k*4));
3284     __ eorw(rscratch3, rscratch3, r2);
3285     __ addw(rscratch4, rscratch4, rscratch1);
3286     __ addw(rscratch3, rscratch3, rscratch4);
3287     __ rorw(rscratch2, rscratch3, 32 - s);
3288     __ addw(r1, rscratch2, r2);
3289   }
3290 
3291   void md5_II(Register buf, Register r1, Register r2, Register r3, Register r4,
3292               int k, int s, int t) {
3293     Register rscratch3 = r10;
3294     Register rscratch4 = r11;
3295 
3296     __ movw(rscratch3, t);
3297     __ ornw(rscratch2, r2, r4);
3298     __ addw(rscratch4, r1, rscratch3);
3299     __ ldrw(rscratch1, Address(buf, k*4));
3300     __ eorw(rscratch3, rscratch2, r3);
3301     __ addw(rscratch4, rscratch4, rscratch1);
3302     __ addw(rscratch3, rscratch3, rscratch4);
3303     __ rorw(rscratch2, rscratch3, 32 - s);
3304     __ addw(r1, rscratch2, r2);
3305   }
3306 
3307   // Arguments:
3308   //
3309   // Inputs:
3310   //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   MD5.state
3312   //   c_rarg2   - int     offset
3313   //   c_rarg3   - int     limit
3314   //
3315   address generate_md5_implCompress(bool multi_block, const char *name) {
3316     __ align(CodeEntryAlignment);
3317     StubCodeMark mark(this, "StubRoutines", name);
3318     address start = __ pc();
3319 
3320     Register buf       = c_rarg0;
3321     Register state     = c_rarg1;
3322     Register ofs       = c_rarg2;
3323     Register limit     = c_rarg3;
3324     Register a         = r4;
3325     Register b         = r5;
3326     Register c         = r6;
3327     Register d         = r7;
3328     Register rscratch3 = r10;
3329     Register rscratch4 = r11;
3330 
3331     Label md5_loop;
3332     __ BIND(md5_loop);
3333 
3334     // Save hash values for addition after rounds
3335     __ ldrw(a, Address(state,  0));
3336     __ ldrw(b, Address(state,  4));
3337     __ ldrw(c, Address(state,  8));
3338     __ ldrw(d, Address(state, 12));
3339 
3340     // Round 1
3341     md5_FF(buf, a, b, c, d,  0,  7, 0xd76aa478);
3342     md5_FF(buf, d, a, b, c,  1, 12, 0xe8c7b756);
3343     md5_FF(buf, c, d, a, b,  2, 17, 0x242070db);
3344     md5_FF(buf, b, c, d, a,  3, 22, 0xc1bdceee);
3345     md5_FF(buf, a, b, c, d,  4,  7, 0xf57c0faf);
3346     md5_FF(buf, d, a, b, c,  5, 12, 0x4787c62a);
3347     md5_FF(buf, c, d, a, b,  6, 17, 0xa8304613);
3348     md5_FF(buf, b, c, d, a,  7, 22, 0xfd469501);
3349     md5_FF(buf, a, b, c, d,  8,  7, 0x698098d8);
3350     md5_FF(buf, d, a, b, c,  9, 12, 0x8b44f7af);
3351     md5_FF(buf, c, d, a, b, 10, 17, 0xffff5bb1);
3352     md5_FF(buf, b, c, d, a, 11, 22, 0x895cd7be);
3353     md5_FF(buf, a, b, c, d, 12,  7, 0x6b901122);
3354     md5_FF(buf, d, a, b, c, 13, 12, 0xfd987193);
3355     md5_FF(buf, c, d, a, b, 14, 17, 0xa679438e);
3356     md5_FF(buf, b, c, d, a, 15, 22, 0x49b40821);
3357 
3358     // Round 2
3359     md5_GG(buf, a, b, c, d,  1,  5, 0xf61e2562);
3360     md5_GG(buf, d, a, b, c,  6,  9, 0xc040b340);
3361     md5_GG(buf, c, d, a, b, 11, 14, 0x265e5a51);
3362     md5_GG(buf, b, c, d, a,  0, 20, 0xe9b6c7aa);
3363     md5_GG(buf, a, b, c, d,  5,  5, 0xd62f105d);
3364     md5_GG(buf, d, a, b, c, 10,  9, 0x02441453);
3365     md5_GG(buf, c, d, a, b, 15, 14, 0xd8a1e681);
3366     md5_GG(buf, b, c, d, a,  4, 20, 0xe7d3fbc8);
3367     md5_GG(buf, a, b, c, d,  9,  5, 0x21e1cde6);
3368     md5_GG(buf, d, a, b, c, 14,  9, 0xc33707d6);
3369     md5_GG(buf, c, d, a, b,  3, 14, 0xf4d50d87);
3370     md5_GG(buf, b, c, d, a,  8, 20, 0x455a14ed);
3371     md5_GG(buf, a, b, c, d, 13,  5, 0xa9e3e905);
3372     md5_GG(buf, d, a, b, c,  2,  9, 0xfcefa3f8);
3373     md5_GG(buf, c, d, a, b,  7, 14, 0x676f02d9);
3374     md5_GG(buf, b, c, d, a, 12, 20, 0x8d2a4c8a);
3375 
3376     // Round 3
3377     md5_HH(buf, a, b, c, d,  5,  4, 0xfffa3942);
3378     md5_HH(buf, d, a, b, c,  8, 11, 0x8771f681);
3379     md5_HH(buf, c, d, a, b, 11, 16, 0x6d9d6122);
3380     md5_HH(buf, b, c, d, a, 14, 23, 0xfde5380c);
3381     md5_HH(buf, a, b, c, d,  1,  4, 0xa4beea44);
3382     md5_HH(buf, d, a, b, c,  4, 11, 0x4bdecfa9);
3383     md5_HH(buf, c, d, a, b,  7, 16, 0xf6bb4b60);
3384     md5_HH(buf, b, c, d, a, 10, 23, 0xbebfbc70);
3385     md5_HH(buf, a, b, c, d, 13,  4, 0x289b7ec6);
3386     md5_HH(buf, d, a, b, c,  0, 11, 0xeaa127fa);
3387     md5_HH(buf, c, d, a, b,  3, 16, 0xd4ef3085);
3388     md5_HH(buf, b, c, d, a,  6, 23, 0x04881d05);
3389     md5_HH(buf, a, b, c, d,  9,  4, 0xd9d4d039);
3390     md5_HH(buf, d, a, b, c, 12, 11, 0xe6db99e5);
3391     md5_HH(buf, c, d, a, b, 15, 16, 0x1fa27cf8);
3392     md5_HH(buf, b, c, d, a,  2, 23, 0xc4ac5665);
3393 
3394     // Round 4
3395     md5_II(buf, a, b, c, d,  0,  6, 0xf4292244);
3396     md5_II(buf, d, a, b, c,  7, 10, 0x432aff97);
3397     md5_II(buf, c, d, a, b, 14, 15, 0xab9423a7);
3398     md5_II(buf, b, c, d, a,  5, 21, 0xfc93a039);
3399     md5_II(buf, a, b, c, d, 12,  6, 0x655b59c3);
3400     md5_II(buf, d, a, b, c,  3, 10, 0x8f0ccc92);
3401     md5_II(buf, c, d, a, b, 10, 15, 0xffeff47d);
3402     md5_II(buf, b, c, d, a,  1, 21, 0x85845dd1);
3403     md5_II(buf, a, b, c, d,  8,  6, 0x6fa87e4f);
3404     md5_II(buf, d, a, b, c, 15, 10, 0xfe2ce6e0);
3405     md5_II(buf, c, d, a, b,  6, 15, 0xa3014314);
3406     md5_II(buf, b, c, d, a, 13, 21, 0x4e0811a1);
3407     md5_II(buf, a, b, c, d,  4,  6, 0xf7537e82);
3408     md5_II(buf, d, a, b, c, 11, 10, 0xbd3af235);
3409     md5_II(buf, c, d, a, b,  2, 15, 0x2ad7d2bb);
3410     md5_II(buf, b, c, d, a,  9, 21, 0xeb86d391);
3411 
3412     // write hash values back in the correct order
3413     __ ldrw(rscratch1, Address(state,  0));
3414     __ addw(rscratch1, rscratch1, a);
3415     __ strw(rscratch1, Address(state,  0));
3416 
3417     __ ldrw(rscratch2, Address(state,  4));
3418     __ addw(rscratch2, rscratch2, b);
3419     __ strw(rscratch2, Address(state,  4));
3420 
3421     __ ldrw(rscratch3, Address(state,  8));
3422     __ addw(rscratch3, rscratch3, c);
3423     __ strw(rscratch3, Address(state,  8));
3424 
3425     __ ldrw(rscratch4, Address(state, 12));
3426     __ addw(rscratch4, rscratch4, d);
3427     __ strw(rscratch4, Address(state, 12));
3428 
3429     if (multi_block) {
3430       __ add(buf, buf, 64);
3431       __ add(ofs, ofs, 64);
3432       __ cmp(ofs, limit);
3433       __ br(Assembler::LE, md5_loop);
3434       __ mov(c_rarg0, ofs); // return ofs
3435     }
3436 
3437     __ ret(lr);
3438 
3439     return start;
3440   }
3441 
3442   // Arguments:
3443   //
3444   // Inputs:
3445   //   c_rarg0   - byte[]  source+offset
3446   //   c_rarg1   - int[]   SHA.state
3447   //   c_rarg2   - int     offset
3448   //   c_rarg3   - int     limit
3449   //
3450   address generate_sha1_implCompress(bool multi_block, const char *name) {
3451     __ align(CodeEntryAlignment);
3452     StubCodeMark mark(this, "StubRoutines", name);
3453     address start = __ pc();
3454 
3455     Register buf   = c_rarg0;
3456     Register state = c_rarg1;
3457     Register ofs   = c_rarg2;
3458     Register limit = c_rarg3;
3459 
3460     Label keys;
3461     Label sha1_loop;
3462 
3463     // load the keys into v0..v3
3464     __ adr(rscratch1, keys);
3465     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word (160-bit) state into v6, v7
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));

3471     __ BIND(sha1_loop);
3472     // load 64 bytes of data into v16..v19
3473     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3474     __ rev32(v16, __ T16B, v16);
3475     __ rev32(v17, __ T16B, v17);
3476     __ rev32(v18, __ T16B, v18);
3477     __ rev32(v19, __ T16B, v19);
3478 
3479     // do the sha1
3480     __ addv(v4, __ T4S, v16, v0);
3481     __ orr(v20, __ T16B, v6, v6);
3482 
3483     FloatRegister d0 = v16;
3484     FloatRegister d1 = v17;
3485     FloatRegister d2 = v18;
3486     FloatRegister d3 = v19;
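
    // Each pass of the loop below covers four of the 80 SHA-1 rounds: sha1c,
    // sha1p and sha1m apply the Ch, Parity and Maj functions (rounds 0..19,
    // 20..39/60..79 and 40..59 respectively), sha1su0/sha1su1 extend the
    // message schedule, and the addv pre-adds the round constant to the next
    // group of message words one iteration ahead.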
3487 
3488     for (int round = 0; round < 20; round++) {
3489       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3490       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3491       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3492       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3493       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3494 
3495       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3496       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3497       __ sha1h(tmp2, __ T4S, v20);
3498       if (round < 5)
3499         __ sha1c(v20, __ T4S, tmp3, tmp4);
3500       else if (round < 10 || round >= 15)
3501         __ sha1p(v20, __ T4S, tmp3, tmp4);
3502       else
3503         __ sha1m(v20, __ T4S, tmp3, tmp4);
3504       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3505 
3506       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3507     }
3508 
3509     __ addv(v7, __ T2S, v7, v21);
3510     __ addv(v6, __ T4S, v6, v20);
3511 
3512     if (multi_block) {
3513       __ add(ofs, ofs, 64);
3514       __ cmp(ofs, limit);
3515       __ br(Assembler::LE, sha1_loop);
3516       __ mov(c_rarg0, ofs); // return ofs
3517     }
3518 
3519     __ strq(v6, Address(state, 0));
3520     __ strs(v7, Address(state, 16));
3521 
3522     __ ret(lr);
3523 
3524     __ bind(keys);
3525     __ emit_int32(0x5a827999);
3526     __ emit_int32(0x6ed9eba1);
3527     __ emit_int32(0x8f1bbcdc);
3528     __ emit_int32(0xca62c1d6);
3529 
3530     return start;
3531   }
3532 
3533 
3534   // Arguments:
3535   //
3536   // Inputs:
3537   //   c_rarg0   - byte[]  source+offset
3538   //   c_rarg1   - int[]   SHA.state
3539   //   c_rarg2   - int     offset
3540   //   c_rarg3   - int     limit
3541   //
3542   address generate_sha256_implCompress(bool multi_block, const char *name) {
3543     static const uint32_t round_consts[64] = {
3544       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3545       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3546       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3547       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3548       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3549       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3550       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3551       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3552       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3553       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3554       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3555       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3556       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3557       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3558       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3559       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3560     };
3561     __ align(CodeEntryAlignment);
3562     StubCodeMark mark(this, "StubRoutines", name);
3563     address start = __ pc();
3564 
3565     Register buf   = c_rarg0;
3566     Register state = c_rarg1;
3567     Register ofs   = c_rarg2;
3568     Register limit = c_rarg3;
3569 
    Label sha256_loop;
3571 
3572     __ stpd(v8, v9, __ pre(sp, -32));
3573     __ stpd(v10, v11, Address(sp, 16));
3574 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0  == v6
    // t1  == v7
3582 
3583     // load 16 keys to v16..v31
3584     __ lea(rscratch1, ExternalAddress((address)round_consts));
3585     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3586     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3587     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3588     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3589 
    // load 8 words (256 bits) of state
3591     __ ldpq(v0, v1, state);
3592 
    __ BIND(sha256_loop);
3594     // load 64 bytes of data into v8..v11
3595     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3596     __ rev32(v8, __ T16B, v8);
3597     __ rev32(v9, __ T16B, v9);
3598     __ rev32(v10, __ T16B, v10);
3599     __ rev32(v11, __ T16B, v11);
3600 
3601     __ addv(v6, __ T4S, v8, v16);
3602     __ orr(v2, __ T16B, v0, v0);
3603     __ orr(v3, __ T16B, v1, v1);
3604 
3605     FloatRegister d0 = v8;
3606     FloatRegister d1 = v9;
3607     FloatRegister d2 = v10;
3608     FloatRegister d3 = v11;
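
    // Each pass of the loop below retires four of the 64 SHA-256 rounds with
    // one sha256h/sha256h2 pair; sha256su0/sha256su1 extend the message
    // schedule in place, and the addv pre-adds the next group's round
    // constants (held in v16..v31, hence as_FloatRegister(round + 17)).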
3609 
3610 
3611     for (int round = 0; round < 16; round++) {
3612       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3613       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3614       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3615       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3616 
3617       if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
3619       if (round < 15)
3620         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3621       __ sha256h(v2, __ T4S, v3, tmp2);
3622       __ sha256h2(v3, __ T4S, v4, tmp2);
3623       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3624 
3625       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3626     }
3627 
3628     __ addv(v0, __ T4S, v0, v2);
3629     __ addv(v1, __ T4S, v1, v3);
3630 
3631     if (multi_block) {
3632       __ add(ofs, ofs, 64);
3633       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
3635       __ mov(c_rarg0, ofs); // return ofs
3636     }
3637 
3638     __ ldpd(v10, v11, Address(sp, 16));
3639     __ ldpd(v8, v9, __ post(sp, 32));
3640 
3641     __ stpq(v0, v1, state);
3642 
3643     __ ret(lr);
3644 
3645     return start;
3646   }
3647 
3648   // Double rounds for sha512.
3649   void sha512_dround(int dr,
3650                      FloatRegister vi0, FloatRegister vi1,
3651                      FloatRegister vi2, FloatRegister vi3,
3652                      FloatRegister vi4, FloatRegister vrc0,
3653                      FloatRegister vrc1, FloatRegister vin0,
3654                      FloatRegister vin1, FloatRegister vin2,
3655                      FloatRegister vin3, FloatRegister vin4) {
3656       if (dr < 36) {
3657         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3658       }
3659       __ addv(v5, __ T2D, vrc0, vin0);
3660       __ ext(v6, __ T16B, vi2, vi3, 8);
3661       __ ext(v5, __ T16B, v5, v5, 8);
3662       __ ext(v7, __ T16B, vi1, vi2, 8);
3663       __ addv(vi3, __ T2D, vi3, v5);
3664       if (dr < 32) {
3665         __ ext(v5, __ T16B, vin3, vin4, 8);
3666         __ sha512su0(vin0, __ T2D, vin1);
3667       }
3668       __ sha512h(vi3, __ T2D, v6, v7);
3669       if (dr < 32) {
3670         __ sha512su1(vin0, __ T2D, vin2, v5);
3671       }
3672       __ addv(vi4, __ T2D, vi1, vi3);
3673       __ sha512h2(vi3, __ T2D, vi1, vi0);
3674   }
3675 
3676   // Arguments:
3677   //
3678   // Inputs:
3679   //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - long[]  SHA.state
3681   //   c_rarg2   - int     offset
3682   //   c_rarg3   - int     limit
3683   //
3684   address generate_sha512_implCompress(bool multi_block, const char *name) {
3685     static const uint64_t round_consts[80] = {
3686       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3687       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3688       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3689       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3690       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3691       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3692       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3693       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3694       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3695       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3696       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3697       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3698       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3699       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3700       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3701       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3702       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3703       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3704       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3705       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3706       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3707       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3708       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3709       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3710       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3711       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3712       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3713     };
3714 
3715     __ align(CodeEntryAlignment);
3716     StubCodeMark mark(this, "StubRoutines", name);
3717     address start = __ pc();
3718 
3719     Register buf   = c_rarg0;
3720     Register state = c_rarg1;
3721     Register ofs   = c_rarg2;
3722     Register limit = c_rarg3;
3723 
3724     __ stpd(v8, v9, __ pre(sp, -64));
3725     __ stpd(v10, v11, Address(sp, 16));
3726     __ stpd(v12, v13, Address(sp, 32));
3727     __ stpd(v14, v15, Address(sp, 48));
3728 
3729     Label sha512_loop;
3730 
3731     // load state
3732     __ ld1(v8, v9, v10, v11, __ T2D, state);
3733 
3734     // load first 4 round constants
3735     __ lea(rscratch1, ExternalAddress((address)round_consts));
3736     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3737 
3738     __ BIND(sha512_loop);
3739     // load 128B of data into v12..v19
3740     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3741     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3742     __ rev64(v12, __ T16B, v12);
3743     __ rev64(v13, __ T16B, v13);
3744     __ rev64(v14, __ T16B, v14);
3745     __ rev64(v15, __ T16B, v15);
3746     __ rev64(v16, __ T16B, v16);
3747     __ rev64(v17, __ T16B, v17);
3748     __ rev64(v18, __ T16B, v18);
3749     __ rev64(v19, __ T16B, v19);
3750 
3751     __ mov(rscratch2, rscratch1);
3752 
3753     __ mov(v0, __ T16B, v8);
3754     __ mov(v1, __ T16B, v9);
3755     __ mov(v2, __ T16B, v10);
3756     __ mov(v3, __ T16B, v11);
3757 
3758     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3759     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3760     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3761     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3762     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3763     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3764     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3765     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3766     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3767     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3768     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3769     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3770     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3771     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3772     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3773     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3774     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3775     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3776     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3777     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3778     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3779     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3780     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3781     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3782     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3783     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3784     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3785     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3786     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3787     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3788     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3789     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3790     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3791     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3792     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3793     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3794     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3795     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3796     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3797     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3798 
3799     __ addv(v8, __ T2D, v8, v0);
3800     __ addv(v9, __ T2D, v9, v1);
3801     __ addv(v10, __ T2D, v10, v2);
3802     __ addv(v11, __ T2D, v11, v3);
3803 
3804     if (multi_block) {
3805       __ add(ofs, ofs, 128);
3806       __ cmp(ofs, limit);
3807       __ br(Assembler::LE, sha512_loop);
3808       __ mov(c_rarg0, ofs); // return ofs
3809     }
3810 
3811     __ st1(v8, v9, v10, v11, __ T2D, state);
3812 
3813     __ ldpd(v14, v15, Address(sp, 48));
3814     __ ldpd(v12, v13, Address(sp, 32));
3815     __ ldpd(v10, v11, Address(sp, 16));
3816     __ ldpd(v8, v9, __ post(sp, 64));
3817 
3818     __ ret(lr);
3819 
3820     return start;
3821   }
3822 
3823   // Arguments:
3824   //
3825   // Inputs:
3826   //   c_rarg0   - byte[]  source+offset
3827   //   c_rarg1   - byte[]  SHA.state
3828   //   c_rarg2   - int     block_size
3829   //   c_rarg3   - int     offset
3830   //   c_rarg4   - int     limit
3831   //
3832   address generate_sha3_implCompress(bool multi_block, const char *name) {
3833     static const uint64_t round_consts[24] = {
3834       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3835       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3836       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3837       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3838       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3839       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3840       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3841       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3842     };
3843 
3844     __ align(CodeEntryAlignment);
3845     StubCodeMark mark(this, "StubRoutines", name);
3846     address start = __ pc();
3847 
3848     Register buf           = c_rarg0;
3849     Register state         = c_rarg1;
3850     Register block_size    = c_rarg2;
3851     Register ofs           = c_rarg3;
3852     Register limit         = c_rarg4;
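
    // block_size is the sponge rate in bytes and identifies the variant:
    //    72 -> SHA3-512    104 -> SHA3-384    136 -> SHA3-256 / SHAKE256
    //   144 -> SHA3-224    168 -> SHAKE128
    // The bit tests below (bits 4, 5 and 7 of block_size) dispatch on these
    // values while absorbing the input block into the state.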
3853 
3854     Label sha3_loop, rounds24_loop;
3855     Label sha3_512_or_sha3_384, shake128;
3856 
3857     __ stpd(v8, v9, __ pre(sp, -64));
3858     __ stpd(v10, v11, Address(sp, 16));
3859     __ stpd(v12, v13, Address(sp, 32));
3860     __ stpd(v14, v15, Address(sp, 48));
3861 
3862     // load state
3863     __ add(rscratch1, state, 32);
3864     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3865     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3866     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3867     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3868     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3869     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3870     __ ld1(v24, __ T1D, rscratch1);
3871 
3872     __ BIND(sha3_loop);
3873 
3874     // 24 keccak rounds
3875     __ movw(rscratch2, 24);
3876 
3877     // load round_constants base
3878     __ lea(rscratch1, ExternalAddress((address) round_consts));
3879 
3880     // load input
3881     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3882     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3883     __ eor(v0, __ T8B, v0, v25);
3884     __ eor(v1, __ T8B, v1, v26);
3885     __ eor(v2, __ T8B, v2, v27);
3886     __ eor(v3, __ T8B, v3, v28);
3887     __ eor(v4, __ T8B, v4, v29);
3888     __ eor(v5, __ T8B, v5, v30);
3889     __ eor(v6, __ T8B, v6, v31);
3890 
3891     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
3892     __ tbz(block_size, 7, sha3_512_or_sha3_384);
3893 
3894     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3895     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3896     __ eor(v7, __ T8B, v7, v25);
3897     __ eor(v8, __ T8B, v8, v26);
3898     __ eor(v9, __ T8B, v9, v27);
3899     __ eor(v10, __ T8B, v10, v28);
3900     __ eor(v11, __ T8B, v11, v29);
3901     __ eor(v12, __ T8B, v12, v30);
3902     __ eor(v13, __ T8B, v13, v31);
3903 
3904     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
3905     __ eor(v14, __ T8B, v14, v25);
3906     __ eor(v15, __ T8B, v15, v26);
3907     __ eor(v16, __ T8B, v16, v27);
3908 
3909     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
3910     __ andw(c_rarg5, block_size, 48);
3911     __ cbzw(c_rarg5, rounds24_loop);
3912 
3913     __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
3915     __ ldrd(v28, __ post(buf, 8));
3916     __ eor(v17, __ T8B, v17, v28);
3917     __ b(rounds24_loop);
3918 
3919     __ BIND(shake128);
3920     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
3921     __ eor(v17, __ T8B, v17, v28);
3922     __ eor(v18, __ T8B, v18, v29);
3923     __ eor(v19, __ T8B, v19, v30);
3924     __ eor(v20, __ T8B, v20, v31);
3925     __ b(rounds24_loop); // block_size == 168, SHAKE128
3926 
3927     __ BIND(sha3_512_or_sha3_384);
3928     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3929     __ eor(v7, __ T8B, v7, v25);
3930     __ eor(v8, __ T8B, v8, v26);
3931     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
3932 
3933     // SHA3-384
3934     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
3935     __ eor(v9,  __ T8B, v9,  v27);
3936     __ eor(v10, __ T8B, v10, v28);
3937     __ eor(v11, __ T8B, v11, v29);
3938     __ eor(v12, __ T8B, v12, v30);
3939 
3940     __ BIND(rounds24_loop);
3941     __ subw(rscratch2, rscratch2, 1);
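
    // One Keccak-f[1600] round follows, sketched in terms of the SHA3
    // extension instructions used below (v0..v24 hold the 25 state lanes):
    //   theta:  eor3 forms the column parities, rax1 combines them into the
    //           per-column D values
    //   rho/pi: xar xors each lane with its D value, rotates it, and leaves
    //           it in its permuted target register
    //   chi:    bcax computes lane ^ (~lane1 & lane2) across each plane
    //   iota:   eor of the round constant (ld1r from rscratch1) into lane 0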
3942 
3943     __ eor3(v29, __ T16B, v4, v9, v14);
3944     __ eor3(v26, __ T16B, v1, v6, v11);
3945     __ eor3(v28, __ T16B, v3, v8, v13);
3946     __ eor3(v25, __ T16B, v0, v5, v10);
3947     __ eor3(v27, __ T16B, v2, v7, v12);
3948     __ eor3(v29, __ T16B, v29, v19, v24);
3949     __ eor3(v26, __ T16B, v26, v16, v21);
3950     __ eor3(v28, __ T16B, v28, v18, v23);
3951     __ eor3(v25, __ T16B, v25, v15, v20);
3952     __ eor3(v27, __ T16B, v27, v17, v22);
3953 
3954     __ rax1(v30, __ T2D, v29, v26);
3955     __ rax1(v26, __ T2D, v26, v28);
3956     __ rax1(v28, __ T2D, v28, v25);
3957     __ rax1(v25, __ T2D, v25, v27);
3958     __ rax1(v27, __ T2D, v27, v29);
3959 
3960     __ eor(v0, __ T16B, v0, v30);
3961     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3962     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3963     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3964     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3965     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3966     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3967     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3968     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3969     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3970     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3971     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3972     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3973     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3974     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3975     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3976     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3977     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3978     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3979     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3980     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3981     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3982     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3983     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3984     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3985 
3986     __ bcax(v20, __ T16B, v31, v22, v8);
3987     __ bcax(v21, __ T16B, v8,  v23, v22);
3988     __ bcax(v22, __ T16B, v22, v24, v23);
3989     __ bcax(v23, __ T16B, v23, v31, v24);
3990     __ bcax(v24, __ T16B, v24, v8,  v31);
3991 
3992     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3993 
3994     __ bcax(v17, __ T16B, v25, v19, v3);
3995     __ bcax(v18, __ T16B, v3,  v15, v19);
3996     __ bcax(v19, __ T16B, v19, v16, v15);
3997     __ bcax(v15, __ T16B, v15, v25, v16);
3998     __ bcax(v16, __ T16B, v16, v3,  v25);
3999 
4000     __ bcax(v10, __ T16B, v29, v12, v26);
4001     __ bcax(v11, __ T16B, v26, v13, v12);
4002     __ bcax(v12, __ T16B, v12, v14, v13);
4003     __ bcax(v13, __ T16B, v13, v29, v14);
4004     __ bcax(v14, __ T16B, v14, v26, v29);
4005 
4006     __ bcax(v7, __ T16B, v30, v9,  v4);
4007     __ bcax(v8, __ T16B, v4,  v5,  v9);
4008     __ bcax(v9, __ T16B, v9,  v6,  v5);
4009     __ bcax(v5, __ T16B, v5,  v30, v6);
4010     __ bcax(v6, __ T16B, v6,  v4,  v30);
4011 
4012     __ bcax(v3, __ T16B, v27, v0,  v28);
4013     __ bcax(v4, __ T16B, v28, v1,  v0);
4014     __ bcax(v0, __ T16B, v0,  v2,  v1);
4015     __ bcax(v1, __ T16B, v1,  v27, v2);
4016     __ bcax(v2, __ T16B, v2,  v28, v27);
4017 
4018     __ eor(v0, __ T16B, v0, v31);
4019 
4020     __ cbnzw(rscratch2, rounds24_loop);
4021 
4022     if (multi_block) {
4023       __ add(ofs, ofs, block_size);
4024       __ cmp(ofs, limit);
4025       __ br(Assembler::LE, sha3_loop);
4026       __ mov(c_rarg0, ofs); // return ofs
4027     }
4028 
4029     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4030     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4031     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4032     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4033     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4034     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4035     __ st1(v24, __ T1D, state);
4036 
4037     __ ldpd(v14, v15, Address(sp, 48));
4038     __ ldpd(v12, v13, Address(sp, 32));
4039     __ ldpd(v10, v11, Address(sp, 16));
4040     __ ldpd(v8, v9, __ post(sp, 64));
4041 
4042     __ ret(lr);
4043 
4044     return start;
4045   }
4046 
4047   /**
4048    *  Arguments:
4049    *
4050    * Inputs:
4051    *   c_rarg0   - int crc
4052    *   c_rarg1   - byte* buf
4053    *   c_rarg2   - int length
4054    *
4055    * Output:
   *       r0   - int crc result
4057    */
4058   address generate_updateBytesCRC32() {
4059     assert(UseCRC32Intrinsics, "what are we doing here?");
4060 
4061     __ align(CodeEntryAlignment);
4062     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4063 
4064     address start = __ pc();
4065 
4066     const Register crc   = c_rarg0;  // crc
4067     const Register buf   = c_rarg1;  // source java byte array address
4068     const Register len   = c_rarg2;  // length
4069     const Register table0 = c_rarg3; // crc_table address
4070     const Register table1 = c_rarg4;
4071     const Register table2 = c_rarg5;
4072     const Register table3 = c_rarg6;
4073     const Register tmp3 = c_rarg7;
4074 
4075     BLOCK_COMMENT("Entry:");
4076     __ enter(); // required for proper stackwalking of RuntimeStub frame
4077 
4078     __ kernel_crc32(crc, buf, len,
4079               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4080 
4081     __ leave(); // required for proper stackwalking of RuntimeStub frame
4082     __ ret(lr);
4083 
4084     return start;
4085   }
4086 
4087   // ChaCha20 block function.  This version parallelizes by loading
4088   // individual 32-bit state elements into vectors for four blocks
4089   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4090   //
4091   // state (int[16]) = c_rarg0
4092   // keystream (byte[1024]) = c_rarg1
4093   // return - number of bytes of keystream (always 256)
4094   address generate_chacha20Block_blockpar() {
4095     Label L_twoRounds, L_cc20_const;
4096     // The constant data is broken into two 128-bit segments to be loaded
4097     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4098     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4099     // The second 128-bits is a table constant used for 8-bit left rotations.
4100     __ BIND(L_cc20_const);
4101     __ emit_int64(0x0000000100000000UL);
4102     __ emit_int64(0x0000000300000002UL);
4103     __ emit_int64(0x0605040702010003UL);
4104     __ emit_int64(0x0E0D0C0F0A09080BUL);
4105 
4106     __ align(CodeEntryAlignment);
4107     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4108     address start = __ pc();
4109     __ enter();
4110 
4111     int i, j;
4112     const Register state = c_rarg0;
4113     const Register keystream = c_rarg1;
4114     const Register loopCtr = r10;
4115     const Register tmpAddr = r11;
4116 
4117     const FloatRegister stateFirst = v0;
4118     const FloatRegister stateSecond = v1;
4119     const FloatRegister stateThird = v2;
4120     const FloatRegister stateFourth = v3;
4121     const FloatRegister origCtrState = v28;
4122     const FloatRegister scratch = v29;
4123     const FloatRegister lrot8Tbl = v30;
4124 
4125     // Organize SIMD registers in an array that facilitates
4126     // putting repetitive opcodes into loop structures.  It is
4127     // important that each grouping of 4 registers is monotonically
4128     // increasing to support the requirements of multi-register
4129     // instructions (e.g. ld4r, st4, etc.)
4130     const FloatRegister workSt[16] = {
4131          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4132         v20, v21, v22, v23, v24, v25, v26, v27
4133     };
4134 
    // Load from memory and interlace across 16 SIMD registers,
    // with each word from memory being broadcast to all lanes of
    // each successive SIMD register.
    //      Addr(0) -> All lanes in workSt[i]
    //      Addr(4) -> All lanes in workSt[i + 1], etc.
4140     __ mov(tmpAddr, state);
4141     for (i = 0; i < 16; i += 4) {
4142       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4143           __ post(tmpAddr, 16));
4144     }
4145 
4146     // Pull in constant data.  The first 16 bytes are the add overlay
4147     // which is applied to the vector holding the counter (state[12]).
4148     // The second 16 bytes is the index register for the 8-bit left
4149     // rotation tbl instruction.
4150     __ adr(tmpAddr, L_cc20_const);
4151     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4152     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4153 
4154     // Set up the 10 iteration loop and perform all 8 quarter round ops
4155     __ mov(loopCtr, 10);
4156     __ BIND(L_twoRounds);
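    // cc20_quarter_round is expected to implement the standard ChaCha20
    // quarter round on the four vectors it is given:
    //   a += b; d ^= a; d <<<= 16;   c += d; b ^= c; b <<<= 12;
    //   a += b; d ^= a; d <<<= 8;    c += d; b ^= c; b <<<= 7
    // so each loop iteration performs the four "column" rounds followed by
    // the four "diagonal" rounds, i.e. one ChaCha20 double round.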
4157 
4158     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4159         scratch, lrot8Tbl);
4160     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4161         scratch, lrot8Tbl);
4162     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4163         scratch, lrot8Tbl);
4164     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4165         scratch, lrot8Tbl);
4166 
4167     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4168         scratch, lrot8Tbl);
4169     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4170         scratch, lrot8Tbl);
4171     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4172         scratch, lrot8Tbl);
4173     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4174         scratch, lrot8Tbl);
4175 
4176     // Decrement and iterate
4177     __ sub(loopCtr, loopCtr, 1);
4178     __ cbnz(loopCtr, L_twoRounds);
4179 
4180     __ mov(tmpAddr, state);
4181 
4182     // Add the starting state back to the post-loop keystream
4183     // state.  We read/interlace the state array from memory into
4184     // 4 registers similar to what we did in the beginning.  Then
4185     // add the counter overlay onto workSt[12] at the end.
4186     for (i = 0; i < 16; i += 4) {
4187       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4188           __ post(tmpAddr, 16));
4189       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4190       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4191       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4192       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4193     }
4194     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4195 
4196     // Write to key stream, storing the same element out of workSt[0..15]
4197     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4198     // for the next element position.
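         // For example, the first 64 bytes written are lane 0 of
         // workSt[0..15] (one complete 64-byte keystream block); the next 64
         // bytes are lane 1 of the same registers, and so on, so the 256-byte
         // keystream corresponds to four blocks (the counter overlay presumably
         // supplying four consecutive counter values).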
4199     for (i = 0; i < 4; i++) {
4200       for (j = 0; j < 16; j += 4) {
4201         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4202             __ post(keystream, 16));
4203       }
4204     }
4205 
4206     __ mov(r0, 256);             // Return length of output keystream
4207     __ leave();
4208     __ ret(lr);
4209 
4210     return start;
4211   }
4212 
4213   /**
4214    *  Arguments:
4215    *
4216    * Inputs:
4217    *   c_rarg0   - int crc
4218    *   c_rarg1   - byte* buf
4219    *   c_rarg2   - int length
4220    *   c_rarg3   - int* table
4221    *
4222    * Output:
4223    *       r0   - int crc result
4224    */
4225   address generate_updateBytesCRC32C() {
4226     assert(UseCRC32CIntrinsics, "what are we doing here?");
4227 
4228     __ align(CodeEntryAlignment);
4229     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4230 
4231     address start = __ pc();
4232 
4233     const Register crc   = c_rarg0;  // crc
4234     const Register buf   = c_rarg1;  // source java byte array address
4235     const Register len   = c_rarg2;  // length
4236     const Register table0 = c_rarg3; // crc_table address
4237     const Register table1 = c_rarg4;
4238     const Register table2 = c_rarg5;
4239     const Register table3 = c_rarg6;
4240     const Register tmp3 = c_rarg7;
4241 
4242     BLOCK_COMMENT("Entry:");
4243     __ enter(); // required for proper stackwalking of RuntimeStub frame
4244 
4245     __ kernel_crc32c(crc, buf, len,
4246               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4247 
4248     __ leave(); // required for proper stackwalking of RuntimeStub frame
4249     __ ret(lr);
4250 
4251     return start;
4252   }
4253 
4254   /***
4255    *  Arguments:
4256    *
4257    *  Inputs:
4258    *   c_rarg0   - int   adler
4259    *   c_rarg1   - byte* buff
4260    *   c_rarg2   - int   len
4261    *
4262    * Output:
4263    *   c_rarg0   - int adler result
4264    */
4265   address generate_updateBytesAdler32() {
4266     __ align(CodeEntryAlignment);
4267     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4268     address start = __ pc();
4269 
4270     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4271 
4272     // Aliases
4273     Register adler  = c_rarg0;
4274     Register s1     = c_rarg0;
4275     Register s2     = c_rarg3;
4276     Register buff   = c_rarg1;
4277     Register len    = c_rarg2;
4278     Register nmax  = r4;
4279     Register base  = r5;
4280     Register count = r6;
4281     Register temp0 = rscratch1;
4282     Register temp1 = rscratch2;
4283     FloatRegister vbytes = v0;
4284     FloatRegister vs1acc = v1;
4285     FloatRegister vs2acc = v2;
4286     FloatRegister vtable = v3;
4287 
4288     // Max number of bytes we can process before having to take the mod
4289     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4290     uint64_t BASE = 0xfff1;
4291     uint64_t NMAX = 0x15B0;
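         // For reference, Adler-32 maintains two running sums over the input
         // bytes b[0..n-1]:
         //   s1 = 1 + b[0] + b[1] + ... + b[n-1]                        (mod BASE)
         //   s2 = s1_after_b[0] + s1_after_b[1] + ... + s1_after_b[n-1] (mod BASE)
         // and the 32-bit checksum is (s2 << 16) | s1, which is assembled at
         // L_combine at the end of this stub.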
4292 
4293     __ mov(base, BASE);
4294     __ mov(nmax, NMAX);
4295 
4296     // Load accumulation coefficients for the upper 16 bits
4297     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4298     __ ld1(vtable, __ T16B, Address(temp0));
4299 
4300     // s1 is initialized to the lower 16 bits of adler
4301     // s2 is initialized to the upper 16 bits of adler
4302     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4303     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4304 
4305     // The pipelined loop needs at least 16 elements for 1 iteration
4306     // It does check this, but it is more effective to skip to the cleanup loop
4307     __ cmp(len, (u1)16);
4308     __ br(Assembler::HS, L_nmax);
4309     __ cbz(len, L_combine);
4310 
4311     __ bind(L_simple_by1_loop);
4312     __ ldrb(temp0, Address(__ post(buff, 1)));
4313     __ add(s1, s1, temp0);
4314     __ add(s2, s2, s1);
4315     __ subs(len, len, 1);
4316     __ br(Assembler::HI, L_simple_by1_loop);
4317 
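         // The "% BASE" sequences below rely on 2^16 mod 65521 == 15, so a
         // value x can be reduced via x -> (x >> 16) * 15 + (x & 0xffff)
         // (applied once or twice depending on the magnitude of x), followed
         // by a single conditional subtraction of BASE.  The multiply by 15
         // is done as (t << 4) - t.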
4318     // s1 = s1 % BASE
4319     __ subs(temp0, s1, base);
4320     __ csel(s1, temp0, s1, Assembler::HS);
4321 
4322     // s2 = s2 % BASE
4323     __ lsr(temp0, s2, 16);
4324     __ lsl(temp1, temp0, 4);
4325     __ sub(temp1, temp1, temp0);
4326     __ add(s2, temp1, s2, ext::uxth);
4327 
4328     __ subs(temp0, s2, base);
4329     __ csel(s2, temp0, s2, Assembler::HS);
4330 
4331     __ b(L_combine);
4332 
4333     __ bind(L_nmax);
4334     __ subs(len, len, nmax);
4335     __ sub(count, nmax, 16);
4336     __ br(Assembler::LO, L_by16);
4337 
4338     __ bind(L_nmax_loop);
4339 
4340     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4341                                       vbytes, vs1acc, vs2acc, vtable);
4342 
4343     __ subs(count, count, 16);
4344     __ br(Assembler::HS, L_nmax_loop);
4345 
4346     // s1 = s1 % BASE
4347     __ lsr(temp0, s1, 16);
4348     __ lsl(temp1, temp0, 4);
4349     __ sub(temp1, temp1, temp0);
4350     __ add(temp1, temp1, s1, ext::uxth);
4351 
4352     __ lsr(temp0, temp1, 16);
4353     __ lsl(s1, temp0, 4);
4354     __ sub(s1, s1, temp0);
4355     __ add(s1, s1, temp1, ext::uxth);
4356 
4357     __ subs(temp0, s1, base);
4358     __ csel(s1, temp0, s1, Assembler::HS);
4359 
4360     // s2 = s2 % BASE
4361     __ lsr(temp0, s2, 16);
4362     __ lsl(temp1, temp0, 4);
4363     __ sub(temp1, temp1, temp0);
4364     __ add(temp1, temp1, s2, ext::uxth);
4365 
4366     __ lsr(temp0, temp1, 16);
4367     __ lsl(s2, temp0, 4);
4368     __ sub(s2, s2, temp0);
4369     __ add(s2, s2, temp1, ext::uxth);
4370 
4371     __ subs(temp0, s2, base);
4372     __ csel(s2, temp0, s2, Assembler::HS);
4373 
4374     __ subs(len, len, nmax);
4375     __ sub(count, nmax, 16);
4376     __ br(Assembler::HS, L_nmax_loop);
4377 
4378     __ bind(L_by16);
4379     __ adds(len, len, count);
4380     __ br(Assembler::LO, L_by1);
4381 
4382     __ bind(L_by16_loop);
4383 
4384     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4385                                       vbytes, vs1acc, vs2acc, vtable);
4386 
4387     __ subs(len, len, 16);
4388     __ br(Assembler::HS, L_by16_loop);
4389 
4390     __ bind(L_by1);
4391     __ adds(len, len, 15);
4392     __ br(Assembler::LO, L_do_mod);
4393 
4394     __ bind(L_by1_loop);
4395     __ ldrb(temp0, Address(__ post(buff, 1)));
4396     __ add(s1, temp0, s1);
4397     __ add(s2, s2, s1);
4398     __ subs(len, len, 1);
4399     __ br(Assembler::HS, L_by1_loop);
4400 
4401     __ bind(L_do_mod);
4402     // s1 = s1 % BASE
4403     __ lsr(temp0, s1, 16);
4404     __ lsl(temp1, temp0, 4);
4405     __ sub(temp1, temp1, temp0);
4406     __ add(temp1, temp1, s1, ext::uxth);
4407 
4408     __ lsr(temp0, temp1, 16);
4409     __ lsl(s1, temp0, 4);
4410     __ sub(s1, s1, temp0);
4411     __ add(s1, s1, temp1, ext::uxth);
4412 
4413     __ subs(temp0, s1, base);
4414     __ csel(s1, temp0, s1, Assembler::HS);
4415 
4416     // s2 = s2 % BASE
4417     __ lsr(temp0, s2, 16);
4418     __ lsl(temp1, temp0, 4);
4419     __ sub(temp1, temp1, temp0);
4420     __ add(temp1, temp1, s2, ext::uxth);
4421 
4422     __ lsr(temp0, temp1, 16);
4423     __ lsl(s2, temp0, 4);
4424     __ sub(s2, s2, temp0);
4425     __ add(s2, s2, temp1, ext::uxth);
4426 
4427     __ subs(temp0, s2, base);
4428     __ csel(s2, temp0, s2, Assembler::HS);
4429 
4430     // Combine lower bits and higher bits
4431     __ bind(L_combine);
4432     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4433 
4434     __ ret(lr);
4435 
4436     return start;
4437   }
4438 
4439   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4440           Register temp0, Register temp1, FloatRegister vbytes,
4441           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4442     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4443     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4444     // In non-vectorized code, we update s1 and s2 as:
4445     //   s1 <- s1 + b1
4446     //   s2 <- s2 + s1
4447     //   s1 <- s1 + b2
4448     //   s2 <- s2 + s1
4449     //   ...
4450     //   s1 <- s1 + b16
4451     //   s2 <- s2 + s1
4452     // Putting the above assignments together, we have:
4453     //   s1_new = s1 + b1 + b2 + ... + b16
4454     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4455     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4456     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
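         // A scalar sketch of the 16-byte step implemented below, assuming
         // the table loaded into vtable holds the weights 16, 15, ..., 1:
         //   s2 += 16 * s1;
         //   for (int i = 0; i < 16; i++) {
         //     s1 += b[i];              // accumulated by uaddlv
         //     s2 += (16 - i) * b[i];   // accumulated by umullv/umlalv + uaddlv
         //   }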
4457     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4458 
4459     // s2 = s2 + s1 * 16
4460     __ add(s2, s2, s1, Assembler::LSL, 4);
4461 
4462     // vs1acc = b1 + b2 + b3 + ... + b16
4463     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4464     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4465     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4466     __ uaddlv(vs1acc, __ T16B, vbytes);
4467     __ uaddlv(vs2acc, __ T8H, vs2acc);
4468 
4469     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4470     __ fmovd(temp0, vs1acc);
4471     __ fmovd(temp1, vs2acc);
4472     __ add(s1, s1, temp0);
4473     __ add(s2, s2, temp1);
4474   }
4475 
4476   /**
4477    *  Arguments:
4478    *
4479    *  Input:
4480    *    c_rarg0   - x address
4481    *    c_rarg1   - x length
4482    *    c_rarg2   - y address
4483    *    c_rarg3   - y length
4484    *    c_rarg4   - z address
4485    *    c_rarg5   - z length
4486    */
4487   address generate_multiplyToLen() {
4488     __ align(CodeEntryAlignment);
4489     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4490 
4491     address start = __ pc();
4492     const Register x     = r0;
4493     const Register xlen  = r1;
4494     const Register y     = r2;
4495     const Register ylen  = r3;
4496     const Register z     = r4;
4497     const Register zlen  = r5;
4498 
4499     const Register tmp1  = r10;
4500     const Register tmp2  = r11;
4501     const Register tmp3  = r12;
4502     const Register tmp4  = r13;
4503     const Register tmp5  = r14;
4504     const Register tmp6  = r15;
4505     const Register tmp7  = r16;
4506 
4507     BLOCK_COMMENT("Entry:");
4508     __ enter(); // required for proper stackwalking of RuntimeStub frame
4509     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4510     __ leave(); // required for proper stackwalking of RuntimeStub frame
4511     __ ret(lr);
4512 
4513     return start;
4514   }
4515 
4516   address generate_squareToLen() {
4517     // The squareToLen algorithm for sizes 1..127, described in the Java code,
4518     // works faster than multiply_to_len on some CPUs and slower on others, but
4519     // multiply_to_len shows slightly better overall results.
4520     __ align(CodeEntryAlignment);
4521     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4522     address start = __ pc();
4523 
4524     const Register x     = r0;
4525     const Register xlen  = r1;
4526     const Register z     = r2;
4527     const Register zlen  = r3;
4528     const Register y     = r4; // == x
4529     const Register ylen  = r5; // == xlen
4530 
4531     const Register tmp1  = r10;
4532     const Register tmp2  = r11;
4533     const Register tmp3  = r12;
4534     const Register tmp4  = r13;
4535     const Register tmp5  = r14;
4536     const Register tmp6  = r15;
4537     const Register tmp7  = r16;
4538 
4539     RegSet spilled_regs = RegSet::of(y, ylen);
4540     BLOCK_COMMENT("Entry:");
4541     __ enter();
4542     __ push(spilled_regs, sp);
4543     __ mov(y, x);
4544     __ mov(ylen, xlen);
4545     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4546     __ pop(spilled_regs, sp);
4547     __ leave();
4548     __ ret(lr);
4549     return start;
4550   }
4551 
4552   address generate_mulAdd() {
4553     __ align(CodeEntryAlignment);
4554     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4555 
4556     address start = __ pc();
4557 
4558     const Register out     = r0;
4559     const Register in      = r1;
4560     const Register offset  = r2;
4561     const Register len     = r3;
4562     const Register k       = r4;
4563 
4564     BLOCK_COMMENT("Entry:");
4565     __ enter();
4566     __ mul_add(out, in, offset, len, k);
4567     __ leave();
4568     __ ret(lr);
4569 
4570     return start;
4571   }
4572 
4573   // Arguments:
4574   //
4575   // Input:
4576   //   c_rarg0   - newArr address
4577   //   c_rarg1   - oldArr address
4578   //   c_rarg2   - newIdx
4579   //   c_rarg3   - shiftCount
4580   //   c_rarg4   - numIter
4581   //
4582   address generate_bigIntegerRightShift() {
4583     __ align(CodeEntryAlignment);
4584     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4585     address start = __ pc();
4586 
4587     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4588 
4589     Register newArr        = c_rarg0;
4590     Register oldArr        = c_rarg1;
4591     Register newIdx        = c_rarg2;
4592     Register shiftCount    = c_rarg3;
4593     Register numIter       = c_rarg4;
4594     Register idx           = numIter;
4595 
4596     Register newArrCur     = rscratch1;
4597     Register shiftRevCount = rscratch2;
4598     Register oldArrCur     = r13;
4599     Register oldArrNext    = r14;
4600 
4601     FloatRegister oldElem0        = v0;
4602     FloatRegister oldElem1        = v1;
4603     FloatRegister newElem         = v2;
4604     FloatRegister shiftVCount     = v3;
4605     FloatRegister shiftVRevCount  = v4;
4606 
4607     __ cbz(idx, Exit);
4608 
4609     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4610 
4611     // left shift count
4612     __ movw(shiftRevCount, 32);
4613     __ subw(shiftRevCount, shiftRevCount, shiftCount);
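         // In effect, each output word merges two adjacent input words:
         //   newArr[i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << shiftRevCount)
         // The SIMD loop below computes this four words at a time; ushl with
         // the negated count in shiftVCount performs the logical right shift.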
4614 
4615     // numIter is too small for a 4-word SIMD loop; fall back to the scalar code
4616     __ cmp(numIter, (u1)4);
4617     __ br(Assembler::LT, ShiftThree);
4618 
4619     __ dup(shiftVCount,    __ T4S, shiftCount);
4620     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4621     __ negr(shiftVCount,   __ T4S, shiftVCount);
4622 
4623     __ BIND(ShiftSIMDLoop);
4624 
4625     // Calculate the load addresses
4626     __ sub(idx, idx, 4);
4627     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4628     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4629     __ add(oldArrCur,  oldArrNext, 4);
4630 
4631     // Load 4 words and process
4632     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4633     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4634     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4635     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4636     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4637     __ st1(newElem,   __ T4S,  Address(newArrCur));
4638 
4639     __ cmp(idx, (u1)4);
4640     __ br(Assembler::LT, ShiftTwoLoop);
4641     __ b(ShiftSIMDLoop);
4642 
4643     __ BIND(ShiftTwoLoop);
4644     __ cbz(idx, Exit);
4645     __ cmp(idx, (u1)1);
4646     __ br(Assembler::EQ, ShiftOne);
4647 
4648     // Calculate the load addresses
4649     __ sub(idx, idx, 2);
4650     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4651     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4652     __ add(oldArrCur,  oldArrNext, 4);
4653 
4654     // Load 2 words and process
4655     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4656     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4657     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4658     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4659     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4660     __ st1(newElem,   __ T2S, Address(newArrCur));
4661     __ b(ShiftTwoLoop);
4662 
4663     __ BIND(ShiftThree);
4664     __ tbz(idx, 1, ShiftOne);
4665     __ tbz(idx, 0, ShiftTwo);
4666     __ ldrw(r10,  Address(oldArr, 12));
4667     __ ldrw(r11,  Address(oldArr, 8));
4668     __ lsrvw(r10, r10, shiftCount);
4669     __ lslvw(r11, r11, shiftRevCount);
4670     __ orrw(r12,  r10, r11);
4671     __ strw(r12,  Address(newArr, 8));
4672 
4673     __ BIND(ShiftTwo);
4674     __ ldrw(r10,  Address(oldArr, 8));
4675     __ ldrw(r11,  Address(oldArr, 4));
4676     __ lsrvw(r10, r10, shiftCount);
4677     __ lslvw(r11, r11, shiftRevCount);
4678     __ orrw(r12,  r10, r11);
4679     __ strw(r12,  Address(newArr, 4));
4680 
4681     __ BIND(ShiftOne);
4682     __ ldrw(r10,  Address(oldArr, 4));
4683     __ ldrw(r11,  Address(oldArr));
4684     __ lsrvw(r10, r10, shiftCount);
4685     __ lslvw(r11, r11, shiftRevCount);
4686     __ orrw(r12,  r10, r11);
4687     __ strw(r12,  Address(newArr));
4688 
4689     __ BIND(Exit);
4690     __ ret(lr);
4691 
4692     return start;
4693   }
4694 
4695   // Arguments:
4696   //
4697   // Input:
4698   //   c_rarg0   - newArr address
4699   //   c_rarg1   - oldArr address
4700   //   c_rarg2   - newIdx
4701   //   c_rarg3   - shiftCount
4702   //   c_rarg4   - numIter
4703   //
4704   address generate_bigIntegerLeftShift() {
4705     __ align(CodeEntryAlignment);
4706     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4707     address start = __ pc();
4708 
4709     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4710 
4711     Register newArr        = c_rarg0;
4712     Register oldArr        = c_rarg1;
4713     Register newIdx        = c_rarg2;
4714     Register shiftCount    = c_rarg3;
4715     Register numIter       = c_rarg4;
4716 
4717     Register shiftRevCount = rscratch1;
4718     Register oldArrNext    = rscratch2;
4719 
4720     FloatRegister oldElem0        = v0;
4721     FloatRegister oldElem1        = v1;
4722     FloatRegister newElem         = v2;
4723     FloatRegister shiftVCount     = v3;
4724     FloatRegister shiftVRevCount  = v4;
4725 
4726     __ cbz(numIter, Exit);
4727 
4728     __ add(oldArrNext, oldArr, 4);
4729     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4730 
4731     // right shift count
4732     __ movw(shiftRevCount, 32);
4733     __ subw(shiftRevCount, shiftRevCount, shiftCount);
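         // In effect, each output word merges two adjacent input words:
         //   newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> shiftRevCount)
         // The SIMD loop below computes this four words at a time, reading
         // the "next" words through oldArrNext (oldArr + 4).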
4734 
4735     // numIter is too small for a 4-word SIMD loop; fall back to the scalar code
4736     __ cmp(numIter, (u1)4);
4737     __ br(Assembler::LT, ShiftThree);
4738 
4739     __ dup(shiftVCount,     __ T4S, shiftCount);
4740     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4741     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4742 
4743     __ BIND(ShiftSIMDLoop);
4744 
4745     // load 4 words and process
4746     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4747     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4748     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4749     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4750     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4751     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4752     __ sub(numIter,   numIter, 4);
4753 
4754     __ cmp(numIter, (u1)4);
4755     __ br(Assembler::LT, ShiftTwoLoop);
4756     __ b(ShiftSIMDLoop);
4757 
4758     __ BIND(ShiftTwoLoop);
4759     __ cbz(numIter, Exit);
4760     __ cmp(numIter, (u1)1);
4761     __ br(Assembler::EQ, ShiftOne);
4762 
4763     // load 2 words and process
4764     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4765     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4766     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4767     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4768     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4769     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4770     __ sub(numIter,   numIter, 2);
4771     __ b(ShiftTwoLoop);
4772 
4773     __ BIND(ShiftThree);
4774     __ ldrw(r10,  __ post(oldArr, 4));
4775     __ ldrw(r11,  __ post(oldArrNext, 4));
4776     __ lslvw(r10, r10, shiftCount);
4777     __ lsrvw(r11, r11, shiftRevCount);
4778     __ orrw(r12,  r10, r11);
4779     __ strw(r12,  __ post(newArr, 4));
4780     __ tbz(numIter, 1, Exit);
4781     __ tbz(numIter, 0, ShiftOne);
4782 
4783     __ BIND(ShiftTwo);
4784     __ ldrw(r10,  __ post(oldArr, 4));
4785     __ ldrw(r11,  __ post(oldArrNext, 4));
4786     __ lslvw(r10, r10, shiftCount);
4787     __ lsrvw(r11, r11, shiftRevCount);
4788     __ orrw(r12,  r10, r11);
4789     __ strw(r12,  __ post(newArr, 4));
4790 
4791     __ BIND(ShiftOne);
4792     __ ldrw(r10,  Address(oldArr));
4793     __ ldrw(r11,  Address(oldArrNext));
4794     __ lslvw(r10, r10, shiftCount);
4795     __ lsrvw(r11, r11, shiftRevCount);
4796     __ orrw(r12,  r10, r11);
4797     __ strw(r12,  Address(newArr));
4798 
4799     __ BIND(Exit);
4800     __ ret(lr);
4801 
4802     return start;
4803   }
4804 
4805   address generate_count_positives(address &count_positives_long) {
4806     const u1 large_loop_size = 64;
4807     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4808     int dcache_line = VM_Version::dcache_line_size();
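         // A byte is negative iff its most significant bit is set, so testing
         // a loaded word (or the OR of several loaded words) against
         // UPPER_BIT_MASK detects whether any byte seen so far is negative.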
4809 
4810     Register ary1 = r1, len = r2, result = r0;
4811 
4812     __ align(CodeEntryAlignment);
4813 
4814     StubCodeMark mark(this, "StubRoutines", "count_positives");
4815 
4816     address entry = __ pc();
4817 
4818     __ enter();
4819     // precondition: a copy of len is already in result
4820     // __ mov(result, len);
4821 
4822   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
4823         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4824 
4825   __ cmp(len, (u1)15);
4826   __ br(Assembler::GT, LEN_OVER_15);
4827   // The only case when execution falls into this code is when the pointer is
4828   // near the end of a memory page and we have to avoid reading the next page.
4829   __ add(ary1, ary1, len);
4830   __ subs(len, len, 8);
4831   __ br(Assembler::GT, LEN_OVER_8);
4832   __ ldr(rscratch2, Address(ary1, -8));
4833   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4834   __ lsrv(rscratch2, rscratch2, rscratch1);
4835   __ tst(rscratch2, UPPER_BIT_MASK);
4836   __ csel(result, zr, result, Assembler::NE);
4837   __ leave();
4838   __ ret(lr);
4839   __ bind(LEN_OVER_8);
4840   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4841   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load completes
4842   __ tst(rscratch2, UPPER_BIT_MASK);
4843   __ br(Assembler::NE, RET_NO_POP);
4844   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4845   __ lsrv(rscratch1, rscratch1, rscratch2);
4846   __ tst(rscratch1, UPPER_BIT_MASK);
4847   __ bind(RET_NO_POP);
4848   __ csel(result, zr, result, Assembler::NE);
4849   __ leave();
4850   __ ret(lr);
4851 
4852   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4853   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4854 
4855   count_positives_long = __ pc(); // 2nd entry point
4856 
4857   __ enter();
4858 
4859   __ bind(LEN_OVER_15);
4860     __ push(spilled_regs, sp);
4861     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4862     __ cbz(rscratch2, ALIGNED);
4863     __ ldp(tmp6, tmp1, Address(ary1));
4864     __ mov(tmp5, 16);
4865     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4866     __ add(ary1, ary1, rscratch1);
4867     __ orr(tmp6, tmp6, tmp1);
4868     __ tst(tmp6, UPPER_BIT_MASK);
4869     __ br(Assembler::NE, RET_ADJUST);
4870     __ sub(len, len, rscratch1);
4871 
4872   __ bind(ALIGNED);
4873     __ cmp(len, large_loop_size);
4874     __ br(Assembler::LT, CHECK_16);
4875     // Perform a 16-byte load as an early return in the pre-loop to handle the
4876     // situation where an initially aligned large array has negative values in its
4877     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 (in the
4878     // worst case), which is slower. Cases with negative bytes further ahead won't
4879     // be affected that much; in fact, they will be faster due to the early loads,
4880     // fewer instructions and fewer branches in LARGE_LOOP.
4881     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4882     __ sub(len, len, 16);
4883     __ orr(tmp6, tmp6, tmp1);
4884     __ tst(tmp6, UPPER_BIT_MASK);
4885     __ br(Assembler::NE, RET_ADJUST_16);
4886     __ cmp(len, large_loop_size);
4887     __ br(Assembler::LT, CHECK_16);
4888 
4889     if (SoftwarePrefetchHintDistance >= 0
4890         && SoftwarePrefetchHintDistance >= dcache_line) {
4891       // initial prefetch
4892       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4893     }
4894   __ bind(LARGE_LOOP);
4895     if (SoftwarePrefetchHintDistance >= 0) {
4896       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4897     }
4898     // Issue the load instructions first, since this can save a few CPU/MEM cycles.
4899     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp),
4900     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
4901     // 3 instructions and has fewer branches; the downside is that early return is
4902     // disabled, so all 64 bytes are loaded and checked every time.
4903     __ ldp(tmp2, tmp3, Address(ary1));
4904     __ ldp(tmp4, tmp5, Address(ary1, 16));
4905     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4906     __ ldp(tmp6, tmp1, Address(ary1, 48));
4907     __ add(ary1, ary1, large_loop_size);
4908     __ sub(len, len, large_loop_size);
4909     __ orr(tmp2, tmp2, tmp3);
4910     __ orr(tmp4, tmp4, tmp5);
4911     __ orr(rscratch1, rscratch1, rscratch2);
4912     __ orr(tmp6, tmp6, tmp1);
4913     __ orr(tmp2, tmp2, tmp4);
4914     __ orr(rscratch1, rscratch1, tmp6);
4915     __ orr(tmp2, tmp2, rscratch1);
4916     __ tst(tmp2, UPPER_BIT_MASK);
4917     __ br(Assembler::NE, RET_ADJUST_LONG);
4918     __ cmp(len, large_loop_size);
4919     __ br(Assembler::GE, LARGE_LOOP);
4920 
4921   __ bind(CHECK_16); // small 16-byte load pre-loop
4922     __ cmp(len, (u1)16);
4923     __ br(Assembler::LT, POST_LOOP16);
4924 
4925   __ bind(LOOP16); // small 16-byte load loop
4926     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4927     __ sub(len, len, 16);
4928     __ orr(tmp2, tmp2, tmp3);
4929     __ tst(tmp2, UPPER_BIT_MASK);
4930     __ br(Assembler::NE, RET_ADJUST_16);
4931     __ cmp(len, (u1)16);
4932     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4933 
4934   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4935     __ cmp(len, (u1)8);
4936     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4937     __ ldr(tmp3, Address(__ post(ary1, 8)));
4938     __ tst(tmp3, UPPER_BIT_MASK);
4939     __ br(Assembler::NE, RET_ADJUST);
4940     __ sub(len, len, 8);
4941 
4942   __ bind(POST_LOOP16_LOAD_TAIL);
4943     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
4944     __ ldr(tmp1, Address(ary1));
4945     __ mov(tmp2, 64);
4946     __ sub(tmp4, tmp2, len, __ LSL, 3);
4947     __ lslv(tmp1, tmp1, tmp4);
4948     __ tst(tmp1, UPPER_BIT_MASK);
4949     __ br(Assembler::NE, RET_ADJUST);
4950     // Fallthrough
4951 
4952   __ bind(RET_LEN);
4953     __ pop(spilled_regs, sp);
4954     __ leave();
4955     __ ret(lr);
4956 
4957     // The difference (result - len) is the number of bytes that are
4958     // guaranteed to be positive.
4959 
4960   __ bind(RET_ADJUST_LONG);
4961     __ add(len, len, (u1)(large_loop_size - 16));
4962   __ bind(RET_ADJUST_16);
4963     __ add(len, len, 16);
4964   __ bind(RET_ADJUST);
4965     __ pop(spilled_regs, sp);
4966     __ leave();
4967     __ sub(result, result, len);
4968     __ ret(lr);
4969 
4970     return entry;
4971   }
4972 
4973   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4974         bool usePrefetch, Label &NOT_EQUAL) {
4975     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4976         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4977         tmp7 = r12, tmp8 = r13;
4978     Label LOOP;
4979 
4980     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4981     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4982     __ bind(LOOP);
4983     if (usePrefetch) {
4984       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4985       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4986     }
4987     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4988     __ eor(tmp1, tmp1, tmp2);
4989     __ eor(tmp3, tmp3, tmp4);
4990     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4991     __ orr(tmp1, tmp1, tmp3);
4992     __ cbnz(tmp1, NOT_EQUAL);
4993     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4994     __ eor(tmp5, tmp5, tmp6);
4995     __ eor(tmp7, tmp7, tmp8);
4996     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4997     __ orr(tmp5, tmp5, tmp7);
4998     __ cbnz(tmp5, NOT_EQUAL);
4999     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5000     __ eor(tmp1, tmp1, tmp2);
5001     __ eor(tmp3, tmp3, tmp4);
5002     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5003     __ orr(tmp1, tmp1, tmp3);
5004     __ cbnz(tmp1, NOT_EQUAL);
5005     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5006     __ eor(tmp5, tmp5, tmp6);
5007     __ sub(cnt1, cnt1, 8 * wordSize);
5008     __ eor(tmp7, tmp7, tmp8);
5009     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5010     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5011     // cmp) because subs allows an unlimited range of immediate operands.
5012     __ subs(tmp6, cnt1, loopThreshold);
5013     __ orr(tmp5, tmp5, tmp7);
5014     __ cbnz(tmp5, NOT_EQUAL);
5015     __ br(__ GE, LOOP);
5016     // post-loop
5017     __ eor(tmp1, tmp1, tmp2);
5018     __ eor(tmp3, tmp3, tmp4);
5019     __ orr(tmp1, tmp1, tmp3);
5020     __ sub(cnt1, cnt1, 2 * wordSize);
5021     __ cbnz(tmp1, NOT_EQUAL);
5022   }
5023 
5024   void generate_large_array_equals_loop_simd(int loopThreshold,
5025         bool usePrefetch, Label &NOT_EQUAL) {
5026     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5027         tmp2 = rscratch2;
5028     Label LOOP;
5029 
5030     __ bind(LOOP);
5031     if (usePrefetch) {
5032       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5033       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5034     }
5035     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5036     __ sub(cnt1, cnt1, 8 * wordSize);
5037     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5038     __ subs(tmp1, cnt1, loopThreshold);
5039     __ eor(v0, __ T16B, v0, v4);
5040     __ eor(v1, __ T16B, v1, v5);
5041     __ eor(v2, __ T16B, v2, v6);
5042     __ eor(v3, __ T16B, v3, v7);
5043     __ orr(v0, __ T16B, v0, v1);
5044     __ orr(v1, __ T16B, v2, v3);
5045     __ orr(v0, __ T16B, v0, v1);
5046     __ umov(tmp1, v0, __ D, 0);
5047     __ umov(tmp2, v0, __ D, 1);
5048     __ orr(tmp1, tmp1, tmp2);
5049     __ cbnz(tmp1, NOT_EQUAL);
5050     __ br(__ GE, LOOP);
5051   }
5052 
5053   // a1 = r1 - array1 address
5054   // a2 = r2 - array2 address
5055   // result = r0 - return value. Already contains "false"
5056   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
5057   // r3-r5 are reserved temporary registers
5058   address generate_large_array_equals() {
5059     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5060         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5061         tmp7 = r12, tmp8 = r13;
5062     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5063         SMALL_LOOP, POST_LOOP;
5064     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5065     // calculate if at least 32 prefetched bytes are used
5066     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5067     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5068     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5069     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5070         tmp5, tmp6, tmp7, tmp8);
5071 
5072     __ align(CodeEntryAlignment);
5073 
5074     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5075 
5076     address entry = __ pc();
5077     __ enter();
5078     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5079     // also advance pointers to use post-increment instead of pre-increment
5080     __ add(a1, a1, wordSize);
5081     __ add(a2, a2, wordSize);
5082     if (AvoidUnalignedAccesses) {
5083       // Both implementations (SIMD/non-SIMD) use relatively large load
5084       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
5085       // time) on some CPUs when the address is not at least 16-byte aligned.
5086       // Arrays are currently 8-byte aligned, so if needed we can do an additional
5087       // 8-byte load for the first address to make it 16-byte aligned.
5088       Label ALIGNED16;
5089       __ tbz(a1, 3, ALIGNED16);
5090       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5091       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5092       __ sub(cnt1, cnt1, wordSize);
5093       __ eor(tmp1, tmp1, tmp2);
5094       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5095       __ bind(ALIGNED16);
5096     }
5097     if (UseSIMDForArrayEquals) {
5098       if (SoftwarePrefetchHintDistance >= 0) {
5099         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5100         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5101         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5102             /* prfm = */ true, NOT_EQUAL);
5103         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5104         __ br(__ LT, TAIL);
5105       }
5106       __ bind(NO_PREFETCH_LARGE_LOOP);
5107       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5108           /* prfm = */ false, NOT_EQUAL);
5109     } else {
5110       __ push(spilled_regs, sp);
5111       if (SoftwarePrefetchHintDistance >= 0) {
5112         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5113         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5114         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5115             /* prfm = */ true, NOT_EQUAL);
5116         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5117         __ br(__ LT, TAIL);
5118       }
5119       __ bind(NO_PREFETCH_LARGE_LOOP);
5120       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5121           /* prfm = */ false, NOT_EQUAL);
5122     }
5123     __ bind(TAIL);
5124       __ cbz(cnt1, EQUAL);
5125       __ subs(cnt1, cnt1, wordSize);
5126       __ br(__ LE, POST_LOOP);
5127     __ bind(SMALL_LOOP);
5128       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5129       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5130       __ subs(cnt1, cnt1, wordSize);
5131       __ eor(tmp1, tmp1, tmp2);
5132       __ cbnz(tmp1, NOT_EQUAL);
5133       __ br(__ GT, SMALL_LOOP);
5134     __ bind(POST_LOOP);
5135       __ ldr(tmp1, Address(a1, cnt1));
5136       __ ldr(tmp2, Address(a2, cnt1));
5137       __ eor(tmp1, tmp1, tmp2);
5138       __ cbnz(tmp1, NOT_EQUAL);
5139     __ bind(EQUAL);
5140       __ mov(result, true);
5141     __ bind(NOT_EQUAL);
5142       if (!UseSIMDForArrayEquals) {
5143         __ pop(spilled_regs, sp);
5144       }
5145     __ bind(NOT_EQUAL_NO_POP);
5146     __ leave();
5147     __ ret(lr);
5148     return entry;
5149   }
5150 
5151   address generate_dsin_dcos(bool isCos) {
5152     __ align(CodeEntryAlignment);
5153     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5154     address start = __ pc();
5155     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5156         (address)StubRoutines::aarch64::_two_over_pi,
5157         (address)StubRoutines::aarch64::_pio2,
5158         (address)StubRoutines::aarch64::_dsin_coef,
5159         (address)StubRoutines::aarch64::_dcos_coef);
5160     return start;
5161   }
5162 
5163   address generate_dlog() {
5164     __ align(CodeEntryAlignment);
5165     StubCodeMark mark(this, "StubRoutines", "dlog");
5166     address entry = __ pc();
5167     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5168         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5169     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5170     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5171         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5172     return entry;
5173   }
5174 
5175 
5176   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
5177   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5178       Label &DIFF2) {
5179     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5180     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5181 
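         // The 16 Latin1 bytes loaded through tmp2 are widened to UTF-16 by
         // interleaving them with the zero vector (zip1/zip2 against vtmpZ),
         // then compared 8 bytes (4 characters) at a time against the UTF-16
         // data loaded through cnt1.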
5182     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5183     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5184     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5185     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5186 
5187     __ fmovd(tmpL, vtmp3);
5188     __ eor(rscratch2, tmp3, tmpL);
5189     __ cbnz(rscratch2, DIFF2);
5190 
5191     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5192     __ umov(tmpL, vtmp3, __ D, 1);
5193     __ eor(rscratch2, tmpU, tmpL);
5194     __ cbnz(rscratch2, DIFF1);
5195 
5196     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5197     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5198     __ fmovd(tmpL, vtmp);
5199     __ eor(rscratch2, tmp3, tmpL);
5200     __ cbnz(rscratch2, DIFF2);
5201 
5202     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5203     __ umov(tmpL, vtmp, __ D, 1);
5204     __ eor(rscratch2, tmpU, tmpL);
5205     __ cbnz(rscratch2, DIFF1);
5206   }
5207 
5208   // r0  = result
5209   // r1  = str1
5210   // r2  = cnt1
5211   // r3  = str2
5212   // r4  = cnt2
5213   // r10 = tmp1
5214   // r11 = tmp2
5215   address generate_compare_long_string_different_encoding(bool isLU) {
5216     __ align(CodeEntryAlignment);
5217     StubCodeMark mark(this, "StubRoutines", isLU
5218         ? "compare_long_string_different_encoding LU"
5219         : "compare_long_string_different_encoding UL");
5220     address entry = __ pc();
5221     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5222         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5223         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5224     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5225         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5226     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5227     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5228 
5229     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5230 
5231     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5232     // cnt2 == number of characters left to compare
5233     // Check the first 4 characters, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
5234     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5235     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5236     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5237     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5238     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5239     __ eor(rscratch2, tmp1, tmp2);
5240     __ mov(rscratch1, tmp2);
5241     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5242     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5243              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5244     __ push(spilled_regs, sp);
5245     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5246     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5247 
5248     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5249 
5250     if (SoftwarePrefetchHintDistance >= 0) {
5251       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5252       __ br(__ LT, NO_PREFETCH);
5253       __ bind(LARGE_LOOP_PREFETCH);
5254         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5255         __ mov(tmp4, 2);
5256         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5257         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5258           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5259           __ subs(tmp4, tmp4, 1);
5260           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5261           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5262           __ mov(tmp4, 2);
5263         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5264           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5265           __ subs(tmp4, tmp4, 1);
5266           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5267           __ sub(cnt2, cnt2, 64);
5268           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5269           __ br(__ GE, LARGE_LOOP_PREFETCH);
5270     }
5271     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5272     __ bind(NO_PREFETCH);
5273     __ subs(cnt2, cnt2, 16);
5274     __ br(__ LT, TAIL);
5275     __ align(OptoLoopAlignment);
5276     __ bind(SMALL_LOOP); // smaller loop
5277       __ subs(cnt2, cnt2, 16);
5278       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5279       __ br(__ GE, SMALL_LOOP);
5280       __ cmn(cnt2, (u1)16);
5281       __ br(__ EQ, LOAD_LAST);
5282     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5283       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5284       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5285       __ ldr(tmp3, Address(cnt1, -8));
5286       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5287       __ b(LOAD_LAST);
5288     __ bind(DIFF2);
5289       __ mov(tmpU, tmp3);
5290     __ bind(DIFF1);
5291       __ pop(spilled_regs, sp);
5292       __ b(CALCULATE_DIFFERENCE);
5293     __ bind(LOAD_LAST);
5294       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
5295       // compare_string_16_x_LU, so there is no need to load them again.
5296       __ mov(tmpU, tmp3);
5297       __ pop(spilled_regs, sp);
5298 
5299       // tmp2 points to the address of the last 4 Latin1 characters right now
5300       __ ldrs(vtmp, Address(tmp2));
5301       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5302       __ fmovd(tmpL, vtmp);
5303 
5304       __ eor(rscratch2, tmpU, tmpL);
5305       __ cbz(rscratch2, DONE);
5306 
5307     // Find the first different characters in the longwords and
5308     // compute their difference.
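         // rscratch2 holds the XOR of the two mismatching longwords: rev +
         // clz locate the first differing bit in memory order, andr rounds it
         // down to a 16-bit character boundary, and the differing characters
         // are then shifted down, zero-extended and subtracted.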
5309     __ bind(CALCULATE_DIFFERENCE);
5310       __ rev(rscratch2, rscratch2);
5311       __ clz(rscratch2, rscratch2);
5312       __ andr(rscratch2, rscratch2, -16);
5313       __ lsrv(tmp1, tmp1, rscratch2);
5314       __ uxthw(tmp1, tmp1);
5315       __ lsrv(rscratch1, rscratch1, rscratch2);
5316       __ uxthw(rscratch1, rscratch1);
5317       __ subw(result, tmp1, rscratch1);
5318     __ bind(DONE);
5319       __ ret(lr);
5320     return entry;
5321   }
5322 
5323   address generate_method_entry_barrier() {
5324     __ align(CodeEntryAlignment);
5325     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5326 
5327     Label deoptimize_label;
5328 
5329     address start = __ pc();
5330 
5331     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5332 
5333     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5334       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5335       // We can get here despite the nmethod being good, if we have not
5336       // yet applied our cross modification fence (or data fence).
5337       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5338       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5339       __ ldrw(rscratch2, rscratch2);
5340       __ strw(rscratch2, thread_epoch_addr);
5341       __ isb();
5342       __ membar(__ LoadLoad);
5343     }
5344 
5345     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5346 
5347     __ enter();
5348     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5349 
5350     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5351 
5352     __ push_call_clobbered_registers();
5353 
5354     __ mov(c_rarg0, rscratch2);
5355     __ call_VM_leaf
5356          (CAST_FROM_FN_PTR
5357           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5358 
5359     __ reset_last_Java_frame(true);
5360 
5361     __ mov(rscratch1, r0);
5362 
5363     __ pop_call_clobbered_registers();
5364 
5365     __ cbnz(rscratch1, deoptimize_label);
5366 
5367     __ leave();
5368     __ ret(lr);
5369 
5370     __ BIND(deoptimize_label);
5371 
5372     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5373     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5374 
5375     __ mov(sp, rscratch1);
5376     __ br(rscratch2);
5377 
5378     return start;
5379   }
5380 
5381   // r0  = result
5382   // r1  = str1
5383   // r2  = cnt1
5384   // r3  = str2
5385   // r4  = cnt2
5386   // r10 = tmp1
5387   // r11 = tmp2
5388   address generate_compare_long_string_same_encoding(bool isLL) {
5389     __ align(CodeEntryAlignment);
5390     StubCodeMark mark(this, "StubRoutines", isLL
5391         ? "compare_long_string_same_encoding LL"
5392         : "compare_long_string_same_encoding UU");
5393     address entry = __ pc();
5394     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5395         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5396 
5397     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5398 
5399     // exit from the large loop when fewer than 64 bytes are left to read or we're
5400     // about to prefetch memory beyond the array bounds
5401     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5402 
5403     // The caller has already pre-loaded 8 bytes before jumping to this stub, so compare them directly
5404     __ eor(rscratch2, tmp1, tmp2);
5405     __ cbnz(rscratch2, CAL_DIFFERENCE);
5406 
5407     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5408     // update pointers, because of previous read
5409     __ add(str1, str1, wordSize);
5410     __ add(str2, str2, wordSize);
5411     if (SoftwarePrefetchHintDistance >= 0) {
5412       __ align(OptoLoopAlignment);
5413       __ bind(LARGE_LOOP_PREFETCH);
5414         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5415         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5416 
5417         for (int i = 0; i < 4; i++) {
5418           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5419           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5420           __ cmp(tmp1, tmp2);
5421           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5422           __ br(Assembler::NE, DIFF);
5423         }
5424         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5425         __ add(str1, str1, 64);
5426         __ add(str2, str2, 64);
5427         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5428         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5429         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5430     }
5431 
5432     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5433     __ br(Assembler::LE, LESS16);
5434     __ align(OptoLoopAlignment);
5435     __ bind(LOOP_COMPARE16);
5436       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5437       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5438       __ cmp(tmp1, tmp2);
5439       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5440       __ br(Assembler::NE, DIFF);
5441       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5442       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5443       __ br(Assembler::LT, LESS16);
5444 
5445       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5446       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5447       __ cmp(tmp1, tmp2);
5448       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5449       __ br(Assembler::NE, DIFF);
5450       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5451       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5452       __ br(Assembler::GE, LOOP_COMPARE16);
5453       __ cbz(cnt2, LENGTH_DIFF);
5454 
5455     __ bind(LESS16);
5456       // compare 8 bytes (one register) at a time
5457       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5458       __ br(Assembler::LE, LESS8);
5459       __ ldr(tmp1, Address(__ post(str1, 8)));
5460       __ ldr(tmp2, Address(__ post(str2, 8)));
5461       __ eor(rscratch2, tmp1, tmp2);
5462       __ cbnz(rscratch2, CAL_DIFFERENCE);
5463       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5464 
5465     __ bind(LESS8); // directly load last 8 bytes
5466       if (!isLL) {
5467         __ add(cnt2, cnt2, cnt2);
5468       }
5469       __ ldr(tmp1, Address(str1, cnt2));
5470       __ ldr(tmp2, Address(str2, cnt2));
5471       __ eor(rscratch2, tmp1, tmp2);
5472       __ cbz(rscratch2, LENGTH_DIFF);
5473       __ b(CAL_DIFFERENCE);
5474 
5475     __ bind(DIFF);
5476       __ cmp(tmp1, tmp2);
5477       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5478       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5479       // reuse rscratch2 register for the result of eor instruction
5480       __ eor(rscratch2, tmp1, tmp2);
5481 
5482     __ bind(CAL_DIFFERENCE);
5483       __ rev(rscratch2, rscratch2);
5484       __ clz(rscratch2, rscratch2);
5485       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5486       __ lsrv(tmp1, tmp1, rscratch2);
5487       __ lsrv(tmp2, tmp2, rscratch2);
5488       if (isLL) {
5489         __ uxtbw(tmp1, tmp1);
5490         __ uxtbw(tmp2, tmp2);
5491       } else {
5492         __ uxthw(tmp1, tmp1);
5493         __ uxthw(tmp2, tmp2);
5494       }
5495       __ subw(result, tmp1, tmp2);
5496 
5497     __ bind(LENGTH_DIFF);
5498       __ ret(lr);
5499     return entry;
5500   }
5501 
5502   enum string_compare_mode {
5503     LL,
5504     LU,
5505     UL,
5506     UU,
5507   };
5508 
5509   // The following registers are declared in aarch64.ad
5510   // r0  = result
5511   // r1  = str1
5512   // r2  = cnt1
5513   // r3  = str2
5514   // r4  = cnt2
5515   // r10 = tmp1
5516   // r11 = tmp2
5517   // z0  = ztmp1
5518   // z1  = ztmp2
5519   // p0  = pgtmp1
5520   // p1  = pgtmp2
5521   address generate_compare_long_string_sve(string_compare_mode mode) {
5522     __ align(CodeEntryAlignment);
5523     address entry = __ pc();
5524     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5525              tmp1 = r10, tmp2 = r11;
5526 
5527     Label LOOP, DONE, MISMATCH;
5528     Register vec_len = tmp1;
5529     Register idx = tmp2;
5530     // The minimum of the string lengths has been stored in cnt2.
5531     Register cnt = cnt2;
5532     FloatRegister ztmp1 = z0, ztmp2 = z1;
5533     PRegister pgtmp1 = p0, pgtmp2 = p1;
5534 
5535 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5536     switch (mode) {                                                            \
5537       case LL:                                                                 \
5538         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5539         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5540         break;                                                                 \
5541       case LU:                                                                 \
5542         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5543         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5544         break;                                                                 \
5545       case UL:                                                                 \
5546         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5547         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5548         break;                                                                 \
5549       case UU:                                                                 \
5550         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5551         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5552         break;                                                                 \
5553       default:                                                                 \
5554         ShouldNotReachHere();                                                  \
5555     }
5556 
5557     const char* stubname;
5558     switch (mode) {
5559       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5560       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5561       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5562       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5563       default: ShouldNotReachHere();
5564     }
5565 
5566     StubCodeMark mark(this, "StubRoutines", stubname);
5567 
5568     __ mov(idx, 0);
5569     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5570 
5571     if (mode == LL) {
5572       __ sve_cntb(vec_len);
5573     } else {
5574       __ sve_cnth(vec_len);
5575     }
5576 
5577     __ sub(rscratch1, cnt, vec_len);
5578 
5579     __ bind(LOOP);
5580 
5581       // main loop
5582       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5583       __ add(idx, idx, vec_len);
5584       // Compare strings.
5585       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5586       __ br(__ NE, MISMATCH);
5587       __ cmp(idx, rscratch1);
5588       __ br(__ LT, LOOP);
5589 
5590     // post loop, last iteration
5591     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5592 
5593     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5594     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5595     __ br(__ EQ, DONE);
5596 
5597     __ bind(MISMATCH);
5598 
5599     // Crop the predicate at the first mismatching lane to locate it.
5600     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5601     // Extract the first different characters of each string.
5602     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5603     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5604 
5605     // Compute the difference of the first different characters.
5606     __ sub(result, rscratch1, rscratch2);
5607 
5608     __ bind(DONE);
5609     __ ret(lr);
5610 #undef LOAD_PAIR
5611     return entry;
5612   }
5613 
5614   void generate_compare_long_strings() {
5615     if (UseSVE == 0) {
5616       StubRoutines::aarch64::_compare_long_string_LL
5617           = generate_compare_long_string_same_encoding(true);
5618       StubRoutines::aarch64::_compare_long_string_UU
5619           = generate_compare_long_string_same_encoding(false);
5620       StubRoutines::aarch64::_compare_long_string_LU
5621           = generate_compare_long_string_different_encoding(true);
5622       StubRoutines::aarch64::_compare_long_string_UL
5623           = generate_compare_long_string_different_encoding(false);
5624     } else {
5625       StubRoutines::aarch64::_compare_long_string_LL
5626           = generate_compare_long_string_sve(LL);
5627       StubRoutines::aarch64::_compare_long_string_UU
5628           = generate_compare_long_string_sve(UU);
5629       StubRoutines::aarch64::_compare_long_string_LU
5630           = generate_compare_long_string_sve(LU);
5631       StubRoutines::aarch64::_compare_long_string_UL
5632           = generate_compare_long_string_sve(UL);
5633     }
5634   }
5635 
5636   // R0 = result
5637   // R1 = str2
5638   // R2 = cnt1
5639   // R3 = str1
5640   // R4 = cnt2
5641   // This generic linear code uses a few additional ideas that make it faster:
5642   // 1) we can safely keep at least the 1st register of the pattern (since
5643   // length >= 8) and skip the initial load (helps on systems with one load pipeline)
5644   // 2) we can use the "fast" single-character search algorithm to find the
5645   // first symbol with fewer branches (one branch per loaded register instead
5646   // of one branch per symbol); this is where constants like 0x0101...01,
5647   // 0x00010001...0001, 0x7f7f...7f and 0x7fff7fff...7fff come from (sketched below)
5648   // 3) after the 1st register of the source string is loaded and analyzed, it
5649   // can be reused to search for every occurrence of the 1st character, saving
5650   // a few loads compared to a simpler-but-slower implementation
5651   // 4) to avoid lots of push/pop operations, the code below heavily reuses,
5652   // re-initializes and compresses register values, which makes the code larger
5653   // and a bit less readable; however, most of the extra operations are issued
5654   // during loads or branches, so the penalty is minimal
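       // In scalar form, the character-match trick used below is (8-bit case; a
       // sketch, with 'word' and 'hit' as illustrative names only):
       //   word ^= first;                          // matching bytes become 0x00
       //   hit = (word - 0x0101010101010101) & ~(word | 0x7f7f7f7f7f7f7f7f);
       // hit is non-zero iff the loaded register contains a match, and its least
       // significant set bit lies in the first matching character, which the code
       // below locates with rbit + clz. The 16-bit case uses the 0x0001...0001 and
       // 0x7fff...7fff constants in the same way.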
5655   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5656     const char* stubName = str1_isL
5657         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5658         : "indexof_linear_uu";
5659     __ align(CodeEntryAlignment);
5660     StubCodeMark mark(this, "StubRoutines", stubName);
5661     address entry = __ pc();
5662 
5663     int str1_chr_size = str1_isL ? 1 : 2;
5664     int str2_chr_size = str2_isL ? 1 : 2;
5665     int str1_chr_shift = str1_isL ? 0 : 1;
5666     int str2_chr_shift = str2_isL ? 0 : 1;
5667     bool isL = str1_isL && str2_isL;
5668     // parameters
5669     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5670     // temporary registers
5671     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5672     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5673     // redefinitions
5674     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5675 
5676     __ push(spilled_regs, sp);
5677     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5678         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5679         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5680         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5681         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5682         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5683     // Read a whole register from str1. This is safe because length >= 8 here.
5684     __ ldr(ch1, Address(str1));
5685     // Read a whole register from str2. This is safe because length >= 8 here.
5686     __ ldr(ch2, Address(str2));
5687     __ sub(cnt2, cnt2, cnt1);
5688     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5689     if (str1_isL != str2_isL) {
5690       __ eor(v0, __ T16B, v0, v0);
5691     }
5692     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5693     __ mul(first, first, tmp1);
5694     // check whether less than one full register of characters remains to check
5695     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5696     if (str1_isL != str2_isL) {
5697       __ fmovd(v1, ch1);
5698     }
5699     __ br(__ LE, L_SMALL);
5700     __ eor(ch2, first, ch2);
5701     if (str1_isL != str2_isL) {
5702       __ zip1(v1, __ T16B, v1, v0);
5703     }
5704     __ sub(tmp2, ch2, tmp1);
5705     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5706     __ bics(tmp2, tmp2, ch2);
5707     if (str1_isL != str2_isL) {
5708       __ fmovd(ch1, v1);
5709     }
5710     __ br(__ NE, L_HAS_ZERO);
5711     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5712     __ add(result, result, wordSize/str2_chr_size);
5713     __ add(str2, str2, wordSize);
5714     __ br(__ LT, L_POST_LOOP);
5715     __ BIND(L_LOOP);
5716       __ ldr(ch2, Address(str2));
5717       __ eor(ch2, first, ch2);
5718       __ sub(tmp2, ch2, tmp1);
5719       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5720       __ bics(tmp2, tmp2, ch2);
5721       __ br(__ NE, L_HAS_ZERO);
5722     __ BIND(L_LOOP_PROCEED);
5723       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5724       __ add(str2, str2, wordSize);
5725       __ add(result, result, wordSize/str2_chr_size);
5726       __ br(__ GE, L_LOOP);
5727     __ BIND(L_POST_LOOP);
5728       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5729       __ br(__ LE, NOMATCH);
5730       __ ldr(ch2, Address(str2));
5731       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5732       __ eor(ch2, first, ch2);
5733       __ sub(tmp2, ch2, tmp1);
5734       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5735       __ mov(tmp4, -1); // all bits set
5736       __ b(L_SMALL_PROCEED);
5737     __ align(OptoLoopAlignment);
5738     __ BIND(L_SMALL);
5739       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5740       __ eor(ch2, first, ch2);
5741       if (str1_isL != str2_isL) {
5742         __ zip1(v1, __ T16B, v1, v0);
5743       }
5744       __ sub(tmp2, ch2, tmp1);
5745       __ mov(tmp4, -1); // all bits set
5746       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5747       if (str1_isL != str2_isL) {
5748         __ fmovd(ch1, v1); // move the 4 converted symbols
5749       }
5750     __ BIND(L_SMALL_PROCEED);
5751       __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the unused bit positions
5752       __ bic(tmp2, tmp2, ch2);
5753       __ ands(tmp2, tmp2, tmp4); // clear the unused bits and check
5754       __ rbit(tmp2, tmp2);
5755       __ br(__ EQ, NOMATCH);
5756     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5757       __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
5758       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5759       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5760       if (str2_isL) { // LL
5761         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5762         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5763         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5764         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5765         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5766       } else {
5767         __ mov(ch2, 0xE); // all bits in byte set except last one
5768         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5769         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5770         __ lslv(tmp2, tmp2, tmp4);
5771         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5772         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5773         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5774         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5775       }
5776       __ cmp(ch1, ch2);
5777       __ mov(tmp4, wordSize/str2_chr_size);
5778       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5779     __ BIND(L_SMALL_CMP_LOOP);
5780       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5781                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5782       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5783                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5784       __ add(tmp4, tmp4, 1);
5785       __ cmp(tmp4, cnt1);
5786       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5787       __ cmp(first, ch2);
5788       __ br(__ EQ, L_SMALL_CMP_LOOP);
5789     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5790       __ cbz(tmp2, NOMATCH); // no more matches. exit
5791       __ clz(tmp4, tmp2);
5792       __ add(result, result, 1); // advance index
5793       __ add(str2, str2, str2_chr_size); // advance pointer
5794       __ b(L_SMALL_HAS_ZERO_LOOP);
5795     __ align(OptoLoopAlignment);
5796     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5797       __ cmp(first, ch2);
5798       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5799       __ b(DONE);
5800     __ align(OptoLoopAlignment);
5801     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5802       if (str2_isL) { // LL
5803         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5804         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5805         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5806         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5807         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5808       } else {
5809         __ mov(ch2, 0xE); // all bits in byte set except last one
5810         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5811         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5812         __ lslv(tmp2, tmp2, tmp4);
5813         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5814         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5815         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5816         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5817       }
5818       __ cmp(ch1, ch2);
5819       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5820       __ b(DONE);
5821     __ align(OptoLoopAlignment);
5822     __ BIND(L_HAS_ZERO);
5823       __ rbit(tmp2, tmp2);
5824       __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
5825       // Now compress both counters (cnt2 and cnt1) into one register. This is
5826       // fine because both counters are 32-bit and are not changed in this loop;
5827       // they are restored on exit, so cnt1 can be re-used inside the loop.
5828       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5829       __ sub(result, result, 1);
5830     __ BIND(L_HAS_ZERO_LOOP);
5831       __ mov(cnt1, wordSize/str2_chr_size);
5832       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5833       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5834       if (str2_isL) {
5835         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5836         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5837         __ lslv(tmp2, tmp2, tmp4);
5838         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5839         __ add(tmp4, tmp4, 1);
5840         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5841         __ lsl(tmp2, tmp2, 1);
5842         __ mov(tmp4, wordSize/str2_chr_size);
5843       } else {
5844         __ mov(ch2, 0xE);
5845         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5846         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5847         __ lslv(tmp2, tmp2, tmp4);
5848         __ add(tmp4, tmp4, 1);
5849         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5850         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5851         __ lsl(tmp2, tmp2, 1);
5852         __ mov(tmp4, wordSize/str2_chr_size);
5853         __ sub(str2, str2, str2_chr_size);
5854       }
5855       __ cmp(ch1, ch2);
5856       __ mov(tmp4, wordSize/str2_chr_size);
5857       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5858     __ BIND(L_CMP_LOOP);
5859       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5860                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5861       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5862                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5863       __ add(tmp4, tmp4, 1);
5864       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5865       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5866       __ cmp(cnt1, ch2);
5867       __ br(__ EQ, L_CMP_LOOP);
5868     __ BIND(L_CMP_LOOP_NOMATCH);
5869       // no match at this position
5870       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5871       __ clz(tmp4, tmp2);
5872       __ add(str2, str2, str2_chr_size); // advance pointer
5873       __ b(L_HAS_ZERO_LOOP);
5874     __ align(OptoLoopAlignment);
5875     __ BIND(L_CMP_LOOP_LAST_CMP);
5876       __ cmp(cnt1, ch2);
5877       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5878       __ b(DONE);
5879     __ align(OptoLoopAlignment);
5880     __ BIND(L_CMP_LOOP_LAST_CMP2);
5881       if (str2_isL) {
5882         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5883         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5884         __ lslv(tmp2, tmp2, tmp4);
5885         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5886         __ add(tmp4, tmp4, 1);
5887         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5888         __ lsl(tmp2, tmp2, 1);
5889       } else {
5890         __ mov(ch2, 0xE);
5891         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5892         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5893         __ lslv(tmp2, tmp2, tmp4);
5894         __ add(tmp4, tmp4, 1);
5895         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5896         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5897         __ lsl(tmp2, tmp2, 1);
5898         __ sub(str2, str2, str2_chr_size);
5899       }
5900       __ cmp(ch1, ch2);
5901       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5902       __ b(DONE);
5903     __ align(OptoLoopAlignment);
5904     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
5905       // 1) Restore the "result" index. It was a multiple of wordSize/str2_chr_size
5906       // until the L_HAS_ZERO block. One byte octet was analyzed in L_HAS_ZERO_LOOP,
5907       // so result has grown by at most wordSize/str2_chr_size - 1 and the
5908       // corresponding higher bits are unchanged. L_LOOP_PROCEED will add the number
5909       // of analyzed characters to result, so we can simply clear the lower bits of
5910       // result here: 2 lower bits for UU/UL and 3 bits for LL.
5911       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
5912       // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the index of
5913       // the last analyzed substring inside the current octet, so str2 points at its
5914       // start address and needs to be advanced to the next octet.
5915       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5916       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5917       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5918       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5919       __ movw(cnt2, cnt2);
5920       __ b(L_LOOP_PROCEED);
5921     __ align(OptoLoopAlignment);
5922     __ BIND(NOMATCH);
5923       __ mov(result, -1);
5924     __ BIND(DONE);
5925       __ pop(spilled_regs, sp);
5926       __ ret(lr);
5927     return entry;
5928   }
5929 
5930   void generate_string_indexof_stubs() {
5931     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5932     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5933     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5934   }
5935 
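       // Widen two SIMD registers of Latin-1 bytes to UTF-16 chars and store the
       // 64 resulting bytes. v0 is expected to hold zero (see the register notes
       // on generate_large_byte_array_inflate below), so zip1/zip2 interleave each
       // source byte with a zero byte, forming the low and high halves of the
       // inflated data.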
5936   void inflate_and_store_2_fp_registers(bool generatePrfm,
5937       FloatRegister src1, FloatRegister src2) {
5938     Register dst = r1;
5939     __ zip1(v1, __ T16B, src1, v0);
5940     __ zip2(v2, __ T16B, src1, v0);
5941     if (generatePrfm) {
5942       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5943     }
5944     __ zip1(v3, __ T16B, src2, v0);
5945     __ zip2(v4, __ T16B, src2, v0);
5946     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5947   }
5948 
5949   // R0 = src
5950   // R1 = dst
5951   // R2 = len
5952   // R3 = len >> 3
5953   // V0 = 0
5954   // V1 = loaded 8 bytes
5955   address generate_large_byte_array_inflate() {
5956     __ align(CodeEntryAlignment);
5957     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5958     address entry = __ pc();
5959     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5960     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5961     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5962 
5963     // do one more 8-byte read so that the address is 16-byte aligned in most
5964     // cases, and use a single store instruction for the two inflated halves
5965     __ ldrd(v2, __ post(src, 8));
5966     __ sub(octetCounter, octetCounter, 2);
5967     __ zip1(v1, __ T16B, v1, v0);
5968     __ zip1(v2, __ T16B, v2, v0);
5969     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5970     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5971     __ subs(rscratch1, octetCounter, large_loop_threshold);
5972     __ br(__ LE, LOOP_START);
5973     __ b(LOOP_PRFM_START);
5974     __ bind(LOOP_PRFM);
5975       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5976     __ bind(LOOP_PRFM_START);
5977       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5978       __ sub(octetCounter, octetCounter, 8);
5979       __ subs(rscratch1, octetCounter, large_loop_threshold);
5980       inflate_and_store_2_fp_registers(true, v3, v4);
5981       inflate_and_store_2_fp_registers(true, v5, v6);
5982       __ br(__ GT, LOOP_PRFM);
5983       __ cmp(octetCounter, (u1)8);
5984       __ br(__ LT, DONE);
5985     __ bind(LOOP);
5986       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5987       __ bind(LOOP_START);
5988       __ sub(octetCounter, octetCounter, 8);
5989       __ cmp(octetCounter, (u1)8);
5990       inflate_and_store_2_fp_registers(false, v3, v4);
5991       inflate_and_store_2_fp_registers(false, v5, v6);
5992       __ br(__ GE, LOOP);
5993     __ bind(DONE);
5994       __ ret(lr);
5995     return entry;
5996   }
5997 
5998   /**
5999    *  Arguments:
6000    *
6001    *  Input:
6002    *  c_rarg0   - current state address
6003    *  c_rarg1   - H key address
6004    *  c_rarg2   - data address
6005    *  c_rarg3   - number of blocks
6006    *
6007    *  Output:
6008    *  Updated state at c_rarg0
6009    */
6010   address generate_ghash_processBlocks() {
6011     // Bafflingly, GCM uses little-endian for the byte order, but
6012     // big-endian for the bit order.  For example, the polynomial 1 is
6013     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6014     //
6015     // So, we must either reverse the bytes in each word and do
6016     // everything big-endian or reverse the bits in each byte and do
6017     // it little-endian.  On AArch64 it's more idiomatic to reverse
6018     // the bits in each byte (we have an instruction, RBIT, to do
6019     // that) and keep the data in little-endian bit order through the
6020     // calculation, bit-reversing the inputs and outputs.
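         // For example, the data blocks (a plain byte array) need only RBIT: the
         // GCM encoding of the polynomial 1, a leading 0x80 byte, becomes 0x01, so
         // the x^0 coefficient lands in bit 0 where PMULL expects it. The state and
         // subkeyH additionally get a REV64 below, which accounts for the 64-bit
         // word representation in which they are stored.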
6021 
6022     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6023     __ align(wordSize * 2);
6024     address p = __ pc();
6025     __ emit_int64(0x87);  // The low-order bits of the field
6026                           // polynomial (i.e. p = z^7+z^2+z+1)
6027                           // repeated in the low and high parts of a
6028                           // 128-bit vector
6029     __ emit_int64(0x87);
6030 
6031     __ align(CodeEntryAlignment);
6032     address start = __ pc();
6033 
6034     Register state   = c_rarg0;
6035     Register subkeyH = c_rarg1;
6036     Register data    = c_rarg2;
6037     Register blocks  = c_rarg3;
6038 
6039     FloatRegister vzr = v30;
6040     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6041 
6042     __ ldrq(v24, p);    // The field polynomial
6043 
6044     __ ldrq(v0, Address(state));
6045     __ ldrq(v1, Address(subkeyH));
6046 
6047     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6048     __ rbit(v0, __ T16B, v0);
6049     __ rev64(v1, __ T16B, v1);
6050     __ rbit(v1, __ T16B, v1);
6051 
6052     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
6053     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
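         // Karatsuba, as a quick reminder: with A = A1:A0 and B = B1:B0 split into
         // 64-bit halves, A*B over GF(2) is
         //   A1*B1*x^128 + ((A1+A0)*(B1+B0) + A1*B1 + A0*B0)*x^64 + A0*B0,
         // so only three 64x64 carry-less multiplies are needed; v4 holds the
         // precomputed A1+A0 term for the subkey.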
6054 
6055     {
6056       Label L_ghash_loop;
6057       __ bind(L_ghash_loop);
6058 
6059       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6060                                                  // reversing each byte
6061       __ rbit(v2, __ T16B, v2);
6062       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6063 
6064       // Multiply state in v2 by subkey in v1
6065       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6066                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6067                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6068       // Reduce v7:v5 by the field polynomial
6069       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6070 
6071       __ sub(blocks, blocks, 1);
6072       __ cbnz(blocks, L_ghash_loop);
6073     }
6074 
6075     // The bit-reversed result is at this point in v0
6076     __ rev64(v0, __ T16B, v0);
6077     __ rbit(v0, __ T16B, v0);
6078 
6079     __ st1(v0, __ T16B, state);
6080     __ ret(lr);
6081 
6082     return start;
6083   }
6084 
6085   address generate_ghash_processBlocks_wide() {
6086     address small = generate_ghash_processBlocks();
6087 
6088     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6089     __ align(wordSize * 2);
6090     address p = __ pc();
6091     __ emit_int64(0x87);  // The low-order bits of the field
6092                           // polynomial (i.e. p = z^7+z^2+z+1)
6093                           // repeated in the low and high parts of a
6094                           // 128-bit vector
6095     __ emit_int64(0x87);
6096 
6097     __ align(CodeEntryAlignment);
6098     address start = __ pc();
6099 
6100     Register state   = c_rarg0;
6101     Register subkeyH = c_rarg1;
6102     Register data    = c_rarg2;
6103     Register blocks  = c_rarg3;
6104 
6105     const int unroll = 4;
6106 
6107     __ cmp(blocks, (unsigned char)(unroll * 2));
6108     __ br(__ LT, small);
6109 
6110     if (unroll > 1) {
6111       // Save the SIMD state (v8..v15) before entering the routine
6112       __ sub(sp, sp, 4 * 16);
6113       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6114       __ sub(sp, sp, 4 * 16);
6115       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6116     }
6117 
6118     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6119 
6120     if (unroll > 1) {
6121       // And restore state
6122       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6123       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6124     }
6125 
6126     __ cmp(blocks, (unsigned char)0);
6127     __ br(__ GT, small);
6128 
6129     __ ret(lr);
6130 
6131     return start;
6132   }
6133 
6134   void generate_base64_encode_simdround(Register src, Register dst,
6135         FloatRegister codec, u8 size) {
6136 
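         // Each round loads 3 * size input bytes (de-interleaved by ld3 into
         // in0..in2) and emits 4 * size output bytes. For an input triple
         // (b0, b1, b2) the four 6-bit codec indices computed below are
         //   b0 >> 2, ((b0 & 3) << 4) | (b1 >> 4), ((b1 & 0xF) << 2) | (b2 >> 6), b2 & 0x3F
         // which tbl then maps through the 64-byte codec table.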
6137     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6138     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6139     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6140 
6141     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6142 
6143     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6144 
6145     __ ushr(ind0, arrangement, in0,  2);
6146 
6147     __ ushr(ind1, arrangement, in1,  2);
6148     __ shl(in0,   arrangement, in0,  6);
6149     __ orr(ind1,  arrangement, ind1, in0);
6150     __ ushr(ind1, arrangement, ind1, 2);
6151 
6152     __ ushr(ind2, arrangement, in2,  4);
6153     __ shl(in1,   arrangement, in1,  4);
6154     __ orr(ind2,  arrangement, in1,  ind2);
6155     __ ushr(ind2, arrangement, ind2, 2);
6156 
6157     __ shl(ind3,  arrangement, in2,  2);
6158     __ ushr(ind3, arrangement, ind3, 2);
6159 
6160     __ tbl(out0,  arrangement, codec,  4, ind0);
6161     __ tbl(out1,  arrangement, codec,  4, ind1);
6162     __ tbl(out2,  arrangement, codec,  4, ind2);
6163     __ tbl(out3,  arrangement, codec,  4, ind3);
6164 
6165     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6166   }
6167 
6168   /**
6169    *  Arguments:
6170    *
6171    *  Input:
6172    *  c_rarg0   - src_start
6173    *  c_rarg1   - src_offset
6174    *  c_rarg2   - src_length
6175    *  c_rarg3   - dest_start
6176    *  c_rarg4   - dest_offset
6177    *  c_rarg5   - isURL
6178    *
6179    */
6180   address generate_base64_encodeBlock() {
6181 
6182     static const char toBase64[64] = {
6183       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6184       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6185       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6186       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6187       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6188     };
6189 
6190     static const char toBase64URL[64] = {
6191       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6192       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6193       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6194       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6195       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6196     };
6197 
6198     __ align(CodeEntryAlignment);
6199     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6200     address start = __ pc();
6201 
6202     Register src   = c_rarg0;  // source array
6203     Register soff  = c_rarg1;  // source start offset
6204     Register send  = c_rarg2;  // source end offset
6205     Register dst   = c_rarg3;  // dest array
6206     Register doff  = c_rarg4;  // position for writing to dest array
6207     Register isURL = c_rarg5;  // Base64 or URL character set
6208 
6209     // c_rarg6 and c_rarg7 are free to use as temps
6210     Register codec  = c_rarg6;
6211     Register length = c_rarg7;
6212 
6213     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6214 
6215     __ add(src, src, soff);
6216     __ add(dst, dst, doff);
6217     __ sub(length, send, soff);
6218 
6219     // load the codec base address
6220     __ lea(codec, ExternalAddress((address) toBase64));
6221     __ cbz(isURL, ProcessData);
6222     __ lea(codec, ExternalAddress((address) toBase64URL));
6223 
6224     __ BIND(ProcessData);
6225 
6226     // too short to form a SIMD loop; fall back to 3-byte processing
6227     __ cmp(length, (u1)24);
6228     __ br(Assembler::LT, Process3B);
6229 
6230     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6231 
6232     __ BIND(Process48B);
6233     __ cmp(length, (u1)48);
6234     __ br(Assembler::LT, Process24B);
6235     generate_base64_encode_simdround(src, dst, v0, 16);
6236     __ sub(length, length, 48);
6237     __ b(Process48B);
6238 
6239     __ BIND(Process24B);
6240     __ cmp(length, (u1)24);
6241     __ br(Assembler::LT, SIMDExit);
6242     generate_base64_encode_simdround(src, dst, v0, 8);
6243     __ sub(length, length, 24);
6244 
6245     __ BIND(SIMDExit);
6246     __ cbz(length, Exit);
6247 
6248     __ BIND(Process3B);
6249     //  3 src bytes, 24 bits
6250     __ ldrb(r10, __ post(src, 1));
6251     __ ldrb(r11, __ post(src, 1));
6252     __ ldrb(r12, __ post(src, 1));
6253     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6254     __ orrw(r12, r12, r11, Assembler::LSL, 8);
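         // r12 now holds (b0 << 16) | (b1 << 8) | b2; the four codec indices below
         // are the 6-bit groups [23:18], [17:12], [11:6] and [5:0] of that value.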
6255     // codec index
6256     __ ubfmw(r15, r12, 18, 23);
6257     __ ubfmw(r14, r12, 12, 17);
6258     __ ubfmw(r13, r12, 6,  11);
6259     __ andw(r12,  r12, 63);
6260     // get the code based on the codec
6261     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6262     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6263     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6264     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6265     __ strb(r15, __ post(dst, 1));
6266     __ strb(r14, __ post(dst, 1));
6267     __ strb(r13, __ post(dst, 1));
6268     __ strb(r12, __ post(dst, 1));
6269     __ sub(length, length, 3);
6270     __ cbnz(length, Process3B);
6271 
6272     __ BIND(Exit);
6273     __ ret(lr);
6274 
6275     return start;
6276   }
6277 
6278   void generate_base64_decode_simdround(Register src, Register dst,
6279         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6280 
6281     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6282     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6283 
6284     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6285     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6286 
6287     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6288 
6289     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6290 
6291     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6292 
6293     // use an unsigned saturating subtract so that every input value in the
6294     // range [0, 63] becomes index 0 for the higher-half lookup
6295     __ uqsubv(decH0, __ T16B, in0, v27);
6296     __ uqsubv(decH1, __ T16B, in1, v27);
6297     __ uqsubv(decH2, __ T16B, in2, v27);
6298     __ uqsubv(decH3, __ T16B, in3, v27);
6299 
6300     // lower half lookup
6301     __ tbl(decL0, arrangement, codecL, 4, in0);
6302     __ tbl(decL1, arrangement, codecL, 4, in1);
6303     __ tbl(decL2, arrangement, codecL, 4, in2);
6304     __ tbl(decL3, arrangement, codecL, 4, in3);
6305 
6306     // higher half lookup
6307     __ tbx(decH0, arrangement, codecH, 4, decH0);
6308     __ tbx(decH1, arrangement, codecH, 4, decH1);
6309     __ tbx(decH2, arrangement, codecH, 4, decH2);
6310     __ tbx(decH3, arrangement, codecH, 4, decH3);
6311 
6312     // combine lower and higher
6313     __ orr(decL0, arrangement, decL0, decH0);
6314     __ orr(decL1, arrangement, decL1, decH1);
6315     __ orr(decL2, arrangement, decL2, decH2);
6316     __ orr(decL3, arrangement, decL3, decH3);
6317 
6318     // check illegal inputs, value larger than 63 (maximum of 6 bits)
6319     __ cmhi(decH0, arrangement, decL0, v27);
6320     __ cmhi(decH1, arrangement, decL1, v27);
6321     __ cmhi(decH2, arrangement, decL2, v27);
6322     __ cmhi(decH3, arrangement, decL3, v27);
6323     __ orr(in0, arrangement, decH0, decH1);
6324     __ orr(in1, arrangement, decH2, decH3);
6325     __ orr(in2, arrangement, in0,   in1);
6326     __ umaxv(in3, arrangement, in2);
6327     __ umov(rscratch2, in3, __ B, 0);
6328 
6329     // get the data to output
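         // The three output bytes are rebuilt from the four 6-bit values d0..d3 as
         //   out0 = (d0 << 2) | (d1 >> 4)
         //   out1 = ((d1 & 0xF) << 4) | (d2 >> 2)
         //   out2 = ((d2 & 3) << 6) | d3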
6330     __ shl(out0,  arrangement, decL0, 2);
6331     __ ushr(out1, arrangement, decL1, 4);
6332     __ orr(out0,  arrangement, out0,  out1);
6333     __ shl(out1,  arrangement, decL1, 4);
6334     __ ushr(out2, arrangement, decL2, 2);
6335     __ orr(out1,  arrangement, out1,  out2);
6336     __ shl(out2,  arrangement, decL2, 6);
6337     __ orr(out2,  arrangement, out2,  decL3);
6338 
6339     __ cbz(rscratch2, NoIllegalData);
6340 
6341     // handle illegal input
6342     __ umov(r10, in2, __ D, 0);
6343     if (size == 16) {
6344       __ cbnz(r10, ErrorInLowerHalf);
6345 
6346       // illegal input is in higher half, store the lower half now.
6347       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6348 
6349       __ umov(r10, in2,  __ D, 1);
6350       __ umov(r11, out0, __ D, 1);
6351       __ umov(r12, out1, __ D, 1);
6352       __ umov(r13, out2, __ D, 1);
6353       __ b(StoreLegalData);
6354 
6355       __ BIND(ErrorInLowerHalf);
6356     }
6357     __ umov(r11, out0, __ D, 0);
6358     __ umov(r12, out1, __ D, 0);
6359     __ umov(r13, out2, __ D, 0);
6360 
6361     __ BIND(StoreLegalData);
6362     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6363     __ strb(r11, __ post(dst, 1));
6364     __ strb(r12, __ post(dst, 1));
6365     __ strb(r13, __ post(dst, 1));
6366     __ lsr(r10, r10, 8);
6367     __ lsr(r11, r11, 8);
6368     __ lsr(r12, r12, 8);
6369     __ lsr(r13, r13, 8);
6370     __ b(StoreLegalData);
6371 
6372     __ BIND(NoIllegalData);
6373     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6374   }
6375 
6376 
6377   /**
6378    *  Arguments:
6379    *
6380    *  Input:
6381    *  c_rarg0   - src_start
6382    *  c_rarg1   - src_offset
6383    *  c_rarg2   - src_length
6384    *  c_rarg3   - dest_start
6385    *  c_rarg4   - dest_offset
6386    *  c_rarg5   - isURL
6387    *  c_rarg6   - isMIME
6388    *
6389    */
6390   address generate_base64_decodeBlock() {
6391 
6392     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6393     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6394     // titled "Base64 decoding".
6395 
6396     // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
6397     // except that the padding character '=' is also treated as an illegal value in this intrinsic.
6398     // That is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
6399     static const uint8_t fromBase64ForNoSIMD[256] = {
6400       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6401       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6402       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6403        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6404       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6405        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6406       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6407        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6408       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6409       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6410       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6411       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6412       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6413       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6414       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6415       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6416     };
6417 
6418     static const uint8_t fromBase64URLForNoSIMD[256] = {
6419       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6420       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6421       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6422        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6423       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6424        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6425       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6426        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6427       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6428       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6429       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6430       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6431       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6432       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6433       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6434       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6435     };
6436 
6437     // A legal Base64 code value is in the range [0, 127]. We need two lookups
6438     // with tbl/tbx and combine them to get the decoded data. The 1st table vector
6439     // lookup uses tbl; out-of-range indices are set to 0 in the destination. The
6440     // 2nd table vector lookup uses tbx; out-of-range indices leave the destination
6441     // unchanged. Inputs [64..126] are mapped to indices [65, 127] in the second
6442     // lookup. The value at index 64 is set to 0, so that we know the decoded data
6443     // was already obtained by the 1st lookup.
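         // For example: for the input '0' (0x30 = 48) the 1st lookup returns 52 and
         // the saturating subtract gives 0, so the 2nd lookup reads the 0 stored at
         // table position 64 and the OR keeps 52. For 'a' (0x61 = 97) the 1st lookup
         // is out of range and returns 0, while 97 - 63 = 34 selects table position
         // 98, whose value 26 is the decoded 'a'.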
6444     static const uint8_t fromBase64ForSIMD[128] = {
6445       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6446       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6447       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6448        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6449         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6450        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6451       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6452        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6453     };
6454 
6455     static const uint8_t fromBase64URLForSIMD[128] = {
6456       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6457       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6458       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6459        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6460         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6461        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6462        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6463        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6464     };
6465 
6466     __ align(CodeEntryAlignment);
6467     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6468     address start = __ pc();
6469 
6470     Register src    = c_rarg0;  // source array
6471     Register soff   = c_rarg1;  // source start offset
6472     Register send   = c_rarg2;  // source end offset
6473     Register dst    = c_rarg3;  // dest array
6474     Register doff   = c_rarg4;  // position for writing to dest array
6475     Register isURL  = c_rarg5;  // Base64 or URL character set
6476     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6477 
6478     Register length = send;    // reuse send as length of source data to process
6479 
6480     Register simd_codec   = c_rarg6;
6481     Register nosimd_codec = c_rarg7;
6482 
6483     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6484 
6485     __ enter();
6486 
6487     __ add(src, src, soff);
6488     __ add(dst, dst, doff);
6489 
6490     __ mov(doff, dst);
6491 
6492     __ sub(length, send, soff);
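         // round length down to a multiple of 4 (the bfm below clears bits 0 and 1)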
6493     __ bfm(length, zr, 0, 1);
6494 
6495     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6496     __ cbz(isURL, ProcessData);
6497     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6498 
6499     __ BIND(ProcessData);
6500     __ mov(rscratch1, length);
6501     __ cmp(length, (u1)144); // 144 = 80 + 64
6502     __ br(Assembler::LT, Process4B);
6503 
6504     // In the MIME case, the line length cannot be more than 76
6505     // bytes (see RFC 2045). This is too short a block for SIMD
6506     // to be worthwhile, so we use non-SIMD here.
6507     __ movw(rscratch1, 79);
6508 
6509     __ BIND(Process4B);
6510     __ ldrw(r14, __ post(src, 4));
6511     __ ubfxw(r10, r14, 0,  8);
6512     __ ubfxw(r11, r14, 8,  8);
6513     __ ubfxw(r12, r14, 16, 8);
6514     __ ubfxw(r13, r14, 24, 8);
6515     // get the decoded values
6516     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6517     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6518     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6519     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6520     // error detection, 255u indicates an illegal input
6521     __ orrw(r14, r10, r11);
6522     __ orrw(r15, r12, r13);
6523     __ orrw(r14, r14, r15);
6524     __ tbnz(r14, 7, Exit);
6525     // recover the data
6526     __ lslw(r14, r10, 10);
6527     __ bfiw(r14, r11, 4, 6);
6528     __ bfmw(r14, r12, 2, 5);
6529     __ rev16w(r14, r14);
6530     __ bfiw(r13, r12, 6, 2);
6531     __ strh(r14, __ post(dst, 2));
6532     __ strb(r13, __ post(dst, 1));
6533     // non-simd loop
6534     __ subsw(rscratch1, rscratch1, 4);
6535     __ br(Assembler::GT, Process4B);
6536 
6537     // if we came through the 80-byte pre-processing path (rscratch1 was set
6538     // to 79 above), rscratch1 == -1 here; otherwise, rscratch1 == 0.
6539     __ cbzw(rscratch1, Exit);
6540     __ sub(length, length, 80);
6541 
6542     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6543     __ cbz(isURL, SIMDEnter);
6544     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6545 
6546     __ BIND(SIMDEnter);
6547     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6548     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6549     __ mov(rscratch1, 63);
6550     __ dup(v27, __ T16B, rscratch1);
6551 
6552     __ BIND(Process64B);
6553     __ cmp(length, (u1)64);
6554     __ br(Assembler::LT, Process32B);
6555     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6556     __ sub(length, length, 64);
6557     __ b(Process64B);
6558 
6559     __ BIND(Process32B);
6560     __ cmp(length, (u1)32);
6561     __ br(Assembler::LT, SIMDExit);
6562     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6563     __ sub(length, length, 32);
6564     __ b(Process32B);
6565 
6566     __ BIND(SIMDExit);
6567     __ cbz(length, Exit);
6568     __ movw(rscratch1, length);
6569     __ b(Process4B);
6570 
6571     __ BIND(Exit);
6572     __ sub(c_rarg0, dst, doff);
6573 
6574     __ leave();
6575     __ ret(lr);
6576 
6577     return start;
6578   }
6579 
6580   // Support for spin waits.
6581   address generate_spin_wait() {
6582     __ align(CodeEntryAlignment);
6583     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6584     address start = __ pc();
6585 
6586     __ spin_wait();
6587     __ ret(lr);
6588 
6589     return start;
6590   }
6591 
6592 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6593 
6594   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6595   //
6596   // If LSE is in use, generate LSE versions of all the stubs. The
6597   // non-LSE versions are in atomic_aarch64.S.
6598 
6599   // class AtomicStubMark records the entry point of a stub and the
6600   // stub pointer which will point to it. The stub pointer is set to
6601   // the entry point when ~AtomicStubMark() is called, which must be
6602   // after ICache::invalidate_range. This ensures safe publication of
6603   // the generated code.
6604   class AtomicStubMark {
6605     address _entry_point;
6606     aarch64_atomic_stub_t *_stub;
6607     MacroAssembler *_masm;
6608   public:
6609     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6610       _masm = masm;
6611       __ align(32);
6612       _entry_point = __ pc();
6613       _stub = stub;
6614     }
6615     ~AtomicStubMark() {
6616       *_stub = (aarch64_atomic_stub_t)_entry_point;
6617     }
6618   };
6619 
6620   // NB: For memory_order_conservative we need a trailing membar after
6621   // LSE atomic operations but not a leading membar.
6622   //
6623   // We don't need a leading membar because a clause in the Arm ARM
6624   // says:
6625   //
6626   //   Barrier-ordered-before
6627   //
6628   //   Barrier instructions order prior Memory effects before subsequent
6629   //   Memory effects generated by the same Observer. A read or a write
6630   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
6631   //   Observer if and only if RW1 appears in program order before RW2
6632   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6633   //   instruction with both Acquire and Release semantics.
6634   //
6635   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6636   // and Release semantics, therefore we don't need a leading
6637   // barrier. However, there is no corresponding Barrier-ordered-after
6638   // relationship, therefore we need a trailing membar to prevent a
6639   // later store or load from being reordered with the store in an
6640   // atomic instruction.
6641   //
6642   // This was checked by using the herd7 consistency model simulator
6643   // (http://diy.inria.fr/) with this test case:
6644   //
6645   // AArch64 LseCas
6646   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6647   // P0 | P1;
6648   // LDR W4, [X2] | MOV W3, #0;
6649   // DMB LD       | MOV W4, #1;
6650   // LDR W3, [X1] | CASAL W3, W4, [X1];
6651   //              | DMB ISH;
6652   //              | STR W4, [X2];
6653   // exists
6654   // (0:X3=0 /\ 0:X4=1)
6655   //
6656   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6657   // with the store to x in P1. Without the DMB in P1 this may happen.
6658   //
6659   // At the time of writing we don't know of any AArch64 hardware that
6660   // reorders stores in this way, but the Reference Manual permits it.
6661 
6662   void gen_cas_entry(Assembler::operand_size size,
6663                      atomic_memory_order order) {
6664     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6665       exchange_val = c_rarg2;
6666     bool acquire, release;
6667     switch (order) {
6668       case memory_order_relaxed:
6669         acquire = false;
6670         release = false;
6671         break;
6672       case memory_order_release:
6673         acquire = false;
6674         release = true;
6675         break;
6676       default:
6677         acquire = true;
6678         release = true;
6679         break;
6680     }
6681     __ mov(prev, compare_val);
6682     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6683     if (order == memory_order_conservative) {
6684       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6685     }
6686     if (size == Assembler::xword) {
6687       __ mov(r0, prev);
6688     } else {
6689       __ movw(r0, prev);
6690     }
6691     __ ret(lr);
6692   }
6693 
6694   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6695     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6696     // If not relaxed, then default to conservative.  Relaxed is the only
6697     // case we use enough to be worth specializing.
6698     if (order == memory_order_relaxed) {
6699       __ ldadd(size, incr, prev, addr);
6700     } else {
6701       __ ldaddal(size, incr, prev, addr);
6702       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6703     }
6704     if (size == Assembler::xword) {
6705       __ mov(r0, prev);
6706     } else {
6707       __ movw(r0, prev);
6708     }
6709     __ ret(lr);
6710   }
6711 
6712   void gen_swpal_entry(Assembler::operand_size size) {
6713     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6714     __ swpal(size, incr, prev, addr);
6715     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6716     if (size == Assembler::xword) {
6717       __ mov(r0, prev);
6718     } else {
6719       __ movw(r0, prev);
6720     }
6721     __ ret(lr);
6722   }
6723 
6724   void generate_atomic_entry_points() {
6725     if (! UseLSE) {
6726       return;
6727     }
6728 
6729     __ align(CodeEntryAlignment);
6730     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6731     address first_entry = __ pc();
6732 
6733     // ADD, memory_order_conservative
6734     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6735     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6736     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6737     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6738 
6739     // ADD, memory_order_relaxed
6740     AtomicStubMark mark_fetch_add_4_relaxed
6741       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6742     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6743     AtomicStubMark mark_fetch_add_8_relaxed
6744       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6745     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6746 
6747     // XCHG, memory_order_conservative
6748     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6749     gen_swpal_entry(Assembler::word);
6750     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6751     gen_swpal_entry(Assembler::xword);
6752 
6753     // CAS, memory_order_conservative
6754     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6755     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6756     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6757     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6758     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6759     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6760 
6761     // CAS, memory_order_relaxed
6762     AtomicStubMark mark_cmpxchg_1_relaxed
6763       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6764     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6765     AtomicStubMark mark_cmpxchg_4_relaxed
6766       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6767     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6768     AtomicStubMark mark_cmpxchg_8_relaxed
6769       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6770     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6771 
6772     AtomicStubMark mark_cmpxchg_4_release
6773       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6774     gen_cas_entry(MacroAssembler::word, memory_order_release);
6775     AtomicStubMark mark_cmpxchg_8_release
6776       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6777     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6778 
6779     AtomicStubMark mark_cmpxchg_4_seq_cst
6780       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6781     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6782     AtomicStubMark mark_cmpxchg_8_seq_cst
6783       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6784     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6785 
6786     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6787   }
6788 #endif // LINUX
6789 
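       // Generates the "thaw" stub for the given kind: it asks the runtime how much
       // stack the frozen frames need (Continuation::prepare_thaw), dispatches to the
       // StackOverflowError throw stub if the reported size is zero, makes room on
       // the stack, lets Continuation::thaw_entry() copy the frames back, and finally
       // returns into the topmost thawed frame (or jumps to its exception handler for
       // the return-barrier-exception kind). For return barriers the possible return
       // value in r0/v0 is preserved across both runtime calls.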
6790   address generate_cont_thaw(Continuation::thaw_kind kind) {
6791     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
6792     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
6793 
6794     address start = __ pc();
6795 
6796     if (return_barrier) {
6797       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6798       __ mov(sp, rscratch1);
6799     }
6800     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6801 
6802     if (return_barrier) {
6803       // preserve possible return value from a method returning to the return barrier
6804       __ fmovd(rscratch1, v0);
6805       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6806     }
6807 
6808     __ movw(c_rarg1, (return_barrier ? 1 : 0));
6809     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
6810     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
6811 
6812     if (return_barrier) {
6813       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6814       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6815       __ fmovd(v0, rscratch1);
6816     }
6817     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6818 
6819 
6820     Label thaw_success;
6821     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
6822     __ cbnz(rscratch2, thaw_success);
6823     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
6824     __ br(rscratch1);
6825     __ bind(thaw_success);
6826 
6827     // make room for the thawed frames
6828     __ sub(rscratch1, sp, rscratch2);
6829     __ andr(rscratch1, rscratch1, -16); // align
6830     __ mov(sp, rscratch1);
6831 
6832     if (return_barrier) {
6833       // save original return value -- again
6834       __ fmovd(rscratch1, v0);
6835       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6836     }
6837 
6838     // If we want, we can templatize thaw by kind, and have three different entries
6839     __ movw(c_rarg1, (uint32_t)kind);
6840 
6841     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
6842     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
6843 
6844     if (return_barrier) {
6845       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6846       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6847       __ fmovd(v0, rscratch1);
6848     } else {
6849       __ mov(r0, zr); // return 0 (success) from doYield
6850     }
6851 
6852     // we're now on the yield frame (which is at a higher address than us because sp has been pushed down)
6853     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
6854     __ mov(rfp, sp);
6855 
6856     if (return_barrier_exception) {
6857       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
6858       __ verify_oop(r0);
6859       __ mov(r19, r0); // save return value containing the exception oop in callee-saved R19
6860 
6861       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
6862 
6863       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
6864       // __ reinitialize_ptrue();
6865 
6866       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
6867 
6868       __ mov(r1, r0); // the exception handler
6869       __ mov(r0, r19); // restore return value containing the exception oop
6870       __ verify_oop(r0);
6871 
6872       __ leave();
6873       __ mov(r3, lr);
6874       __ br(r1); // the exception handler
6875     } else {
6876       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
6877       __ leave();
6878       __ ret(lr);
6879     }
6880 
6881     return start;
6882   }
6883 
6884   address generate_cont_thaw() {
6885     if (!Continuations::enabled()) return nullptr;
6886 
6887     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
6888     address start = __ pc();
6889     generate_cont_thaw(Continuation::thaw_top);
6890     return start;
6891   }
6892 
6893   address generate_cont_returnBarrier() {
6894     if (!Continuations::enabled()) return nullptr;
6895 
6896     // TODO: will probably need multiple return barriers depending on return type
6897     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
6898     address start = __ pc();
6899 
6900     generate_cont_thaw(Continuation::thaw_return_barrier);
6901 
6902     return start;
6903   }
6904 
6905   address generate_cont_returnBarrier_exception() {
6906     if (!Continuations::enabled()) return nullptr;
6907 
6908     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
6909     address start = __ pc();
6910 
6911     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
6912 
6913     return start;
6914   }
6915 
6916 #if INCLUDE_JFR
6917 
6918   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
6919     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6920     __ mov(c_rarg0, thread);
6921   }
6922 
6923   // The handle is dereferenced through a load barrier.
6924   static void jfr_epilogue(MacroAssembler* _masm) {
6925     __ reset_last_Java_frame(true);
6926     Label null_jobject;
6927     __ cbz(r0, null_jobject);
6928     DecoratorSet decorators = ACCESS_READ | IN_NATIVE;
6929     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
6930     bs->load_at(_masm, decorators, T_OBJECT, r0, Address(r0, 0), rscratch1, rscratch2);
6931     __ bind(null_jobject);
6932   }
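
       // In rough C terms the epilogue above resolves the returned jobject
       // handle (a sketch only; the real load goes through the GC's
       // BarrierSetAssembler with ACCESS_READ | IN_NATIVE decorators and may
       // include additional barrier work):
       //
       //   if (handle != nullptr) {
       //     result = *(oop*)handle;   // load-barriered dereference of the handle
       //   } else {
       //     result = nullptr;         // keep the null result
       //   }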
6933 
6934   // For c2: c_rarg0 is junk; call into the runtime to write a checkpoint.
6935   // It returns a jobject handle to the event writer.
6936   // The handle is dereferenced and the return value is the event writer oop.
6937   static RuntimeStub* generate_jfr_write_checkpoint() {
6938     enum layout {
6939       rfp_off,
6940       rfp_off2,
6941       return_off,
6942       return_off2,
6943       framesize // inclusive of return address
6944     };
6945 
6946     int insts_size = 512;
6947     int locs_size = 64;
6948     CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
6949     OopMapSet* oop_maps = new OopMapSet();
6950     MacroAssembler* masm = new MacroAssembler(&code);
6951     MacroAssembler* _masm = masm;
6952 
6953     address start = __ pc();
6954     __ enter();
6955     int frame_complete = __ pc() - start;
6956     address the_pc = __ pc();
6957     jfr_prologue(the_pc, _masm, rthread);
6958     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
6959     jfr_epilogue(_masm);
6960     __ leave();
6961     __ ret(lr);
6962 
6963     OopMap* map = new OopMap(framesize, 1); // rfp
6964     oop_maps->add_gc_map(the_pc - start, map);
6965 
6966     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
6967       RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
6968                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6969                                     oop_maps, false);
6970     return stub;
6971   }
6972 
6973 #endif // INCLUDE_JFR
6974 
6975   // Continuation point for throwing of implicit exceptions that are
6976   // not handled in the current activation. Fabricates an exception
6977   // oop and initiates normal exception dispatching in this
6978   // frame. Since we need to preserve callee-saved values (currently
6979   // only for C2, but done for C1 as well) we need a callee-saved oop
6980   // map and therefore have to make these stubs into RuntimeStubs
6981   // rather than BufferBlobs.  If the compiler needs all registers to
6982   // be preserved between the fault point and the exception handler
6983   // then it must assume responsibility for that in
6984   // AbstractCompiler::continuation_for_implicit_null_exception or
6985   // continuation_for_implicit_division_by_zero_exception. All other
6986   // implicit exceptions (e.g., NullPointerException or
6987   // AbstractMethodError on entry) are either at call sites or
6988   // otherwise assume that stack unwinding will be initiated, so
6989   // caller saved registers were assumed volatile in the compiler.
6990 
6991 #undef __
6992 #define __ masm->
6993 
6994   address generate_throw_exception(const char* name,
6995                                    address runtime_entry,
6996                                    Register arg1 = noreg,
6997                                    Register arg2 = noreg) {
6998     // Information about frame layout at time of blocking runtime call.
6999     // Note that we only have to preserve callee-saved registers since
7000     // the compilers are responsible for supplying a continuation point
7001     // if they expect all registers to be preserved.
7002     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
7003     enum layout {
7004       rfp_off = 0,
7005       rfp_off2,
7006       return_off,
7007       return_off2,
7008       framesize // inclusive of return address
7009     };
7010 
7011     int insts_size = 512;
7012     int locs_size  = 64;
7013 
7014     CodeBuffer code(name, insts_size, locs_size);
7015     OopMapSet* oop_maps  = new OopMapSet();
7016     MacroAssembler* masm = new MacroAssembler(&code);
7017 
7018     address start = __ pc();
7019 
7020     // This is an inlined and slightly modified version of call_VM
7021     // which has the ability to fetch the return PC out of
7022     // thread-local storage and also sets up last_Java_sp slightly
7023     // differently from the real call_VM.
7024 
7025     __ enter(); // Save FP and LR before call
7026 
7027     assert(is_even(framesize/2), "sp not 16-byte aligned");
7028 
7029     // lr and fp are already in place
7030     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
7031 
7032     int frame_complete = __ pc() - start;
7033 
7034     // Set up last_Java_sp and last_Java_fp
7035     address the_pc = __ pc();
7036     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7037 
7038     // Call runtime
7039     if (arg1 != noreg) {
7040       assert(arg2 != c_rarg1, "clobbered");
7041       __ mov(c_rarg1, arg1);
7042     }
7043     if (arg2 != noreg) {
7044       __ mov(c_rarg2, arg2);
7045     }
7046     __ mov(c_rarg0, rthread);
7047     BLOCK_COMMENT("call runtime_entry");
7048     __ mov(rscratch1, runtime_entry);
7049     __ blr(rscratch1);
7050 
7051     // Generate oop map
7052     OopMap* map = new OopMap(framesize, 0);
7053 
7054     oop_maps->add_gc_map(the_pc - start, map);
7055 
7056     __ reset_last_Java_frame(true);
7057 
7058     // Reinitialize the ptrue predicate register, in case the external runtime
7059     // call clobbers ptrue reg, as we may return to SVE compiled code.
7060     __ reinitialize_ptrue();
7061 
7062     __ leave();
7063 
7064     // check for pending exceptions
7065 #ifdef ASSERT
7066     Label L;
7067     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
7068     __ cbnz(rscratch1, L);
7069     __ should_not_reach_here();
7070     __ bind(L);
7071 #endif // ASSERT
7072     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7073 
7074     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7075     RuntimeStub* stub =
7076       RuntimeStub::new_runtime_stub(name,
7077                                     &code,
7078                                     frame_complete,
7079                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7080                                     oop_maps, false);
7081     return stub->entry_point();
7082   }
7083 
7084   class MontgomeryMultiplyGenerator : public MacroAssembler {
7085 
7086     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7087       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7088 
7089     RegSet _toSave;
7090     bool _squaring;
7091 
7092   public:
7093     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7094       : MacroAssembler(as->code()), _squaring(squaring) {
7095 
7096       // Register allocation
7097 
7098       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7099       Pa_base = *regs;       // Argument registers
7100       if (squaring)
7101         Pb_base = Pa_base;
7102       else
7103         Pb_base = *++regs;
7104       Pn_base = *++regs;
7105       Rlen = *++regs;
7106       inv = *++regs;
7107       Pm_base = *++regs;
7108 
7109                           // Working registers:
7110       Ra =  *++regs;        // The current digit of a, b, n, and m.
7111       Rb =  *++regs;
7112       Rm =  *++regs;
7113       Rn =  *++regs;
7114 
7115       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7116       Pb =  *++regs;
7117       Pm =  *++regs;
7118       Pn =  *++regs;
7119 
7120       t0 =  *++regs;        // Three registers which form a
7121       t1 =  *++regs;        // triple-precision accumulator.
7122       t2 =  *++regs;
7123 
7124       Ri =  *++regs;        // Inner and outer loop indexes.
7125       Rj =  *++regs;
7126 
7127       Rhi_ab = *++regs;     // Product registers: low and high parts
7128       Rlo_ab = *++regs;     // of a*b and m*n.
7129       Rhi_mn = *++regs;
7130       Rlo_mn = *++regs;
7131 
7132       // r19 and up are callee-saved.
7133       _toSave = RegSet::range(r19, *regs) + Pm_base;
7134     }
7135 
7136   private:
7137     void save_regs() {
7138       push(_toSave, sp);
7139     }
7140 
7141     void restore_regs() {
7142       pop(_toSave, sp);
7143     }
7144 
7145     template <typename T>
7146     void unroll_2(Register count, T block) {
7147       Label loop, end, odd;
7148       tbnz(count, 0, odd);
7149       cbz(count, end);
7150       align(16);
7151       bind(loop);
7152       (this->*block)();
7153       bind(odd);
7154       (this->*block)();
7155       subs(count, count, 2);
7156       br(Assembler::GT, loop);
7157       bind(end);
7158     }
7159 
7160     template <typename T>
7161     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7162       Label loop, end, odd;
7163       tbnz(count, 0, odd);
7164       cbz(count, end);
7165       align(16);
7166       bind(loop);
7167       (this->*block)(d, s, tmp);
7168       bind(odd);
7169       (this->*block)(d, s, tmp);
7170       subs(count, count, 2);
7171       br(Assembler::GT, loop);
7172       bind(end);
7173     }
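
    // The control flow of unroll_2 is, in C-like pseudocode (a sketch;
    // 'block' stands for the member function pointer passed in):
    //
    //   if (count & 1) goto odd;
    //   if (count == 0) goto end;
    //  loop:
    //   block();
    //  odd:
    //   block();
    //   count -= 2;
    //   if (count > 0) goto loop;   // signed compare, as br(GT) above
    //  end:
    //
    // so the block runs exactly 'count' times, with an odd count entering
    // the unrolled body at its second copy.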
7174 
7175     void pre1(RegisterOrConstant i) {
7176       block_comment("pre1");
7177       // Pa = Pa_base;
7178       // Pb = Pb_base + i;
7179       // Pm = Pm_base;
7180       // Pn = Pn_base + i;
7181       // Ra = *Pa;
7182       // Rb = *Pb;
7183       // Rm = *Pm;
7184       // Rn = *Pn;
7185       ldr(Ra, Address(Pa_base));
7186       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7187       ldr(Rm, Address(Pm_base));
7188       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7189       lea(Pa, Address(Pa_base));
7190       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7191       lea(Pm, Address(Pm_base));
7192       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7193 
7194       // Zero the m*n result.
7195       mov(Rhi_mn, zr);
7196       mov(Rlo_mn, zr);
7197     }
7198 
7199     // The core multiply-accumulate step of a Montgomery
7200     // multiplication.  The idea is to schedule operations as a
7201     // pipeline so that instructions with long latencies (loads and
7202     // multiplies) have time to complete before their results are
7203     // used.  This benefits in-order implementations of the
7204     // architecture most, but out-of-order ones also benefit.
7205     void step() {
7206       block_comment("step");
7207       // MACC(Ra, Rb, t0, t1, t2);
7208       // Ra = *++Pa;
7209       // Rb = *--Pb;
7210       umulh(Rhi_ab, Ra, Rb);
7211       mul(Rlo_ab, Ra, Rb);
7212       ldr(Ra, pre(Pa, wordSize));
7213       ldr(Rb, pre(Pb, -wordSize));
7214       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7215                                        // previous iteration.
7216       // MACC(Rm, Rn, t0, t1, t2);
7217       // Rm = *++Pm;
7218       // Rn = *--Pn;
7219       umulh(Rhi_mn, Rm, Rn);
7220       mul(Rlo_mn, Rm, Rn);
7221       ldr(Rm, pre(Pm, wordSize));
7222       ldr(Rn, pre(Pn, -wordSize));
7223       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7224     }
7225 
7226     void post1() {
7227       block_comment("post1");
7228 
7229       // MACC(Ra, Rb, t0, t1, t2);
7230       // Ra = *++Pa;
7231       // Rb = *--Pb;
7232       umulh(Rhi_ab, Ra, Rb);
7233       mul(Rlo_ab, Ra, Rb);
7234       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7235       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7236 
7237       // *Pm = Rm = t0 * inv;
7238       mul(Rm, t0, inv);
7239       str(Rm, Address(Pm));
7240 
7241       // MACC(Rm, Rn, t0, t1, t2);
7242       // t0 = t1; t1 = t2; t2 = 0;
7243       umulh(Rhi_mn, Rm, Rn);
7244 
7245 #ifndef PRODUCT
7246       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7247       {
7248         mul(Rlo_mn, Rm, Rn);
7249         add(Rlo_mn, t0, Rlo_mn);
7250         Label ok;
7251         cbz(Rlo_mn, ok); {
7252           stop("broken Montgomery multiply");
7253         } bind(ok);
7254       }
7255 #endif
7256       // We have very carefully set things up so that
7257       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7258       // the lower half of Rm * Rn because we know the result already:
7259       // it must be -t0.  t0 + (-t0) must generate a carry iff
7260       // t0 != 0.  So, rather than do a mul and an adds we just set
7261       // the carry flag iff t0 is nonzero (subs sets carry iff no borrow, i.e. iff t0 >= 1).
7262       //
7263       // mul(Rlo_mn, Rm, Rn);
7264       // adds(zr, t0, Rlo_mn);
7265       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7266       adcs(t0, t1, Rhi_mn);
7267       adc(t1, t2, zr);
7268       mov(t2, zr);
7269     }
7270 
7271     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7272       block_comment("pre2");
7273       // Pa = Pa_base + i-len;
7274       // Pb = Pb_base + len;
7275       // Pm = Pm_base + i-len;
7276       // Pn = Pn_base + len;
7277 
7278       if (i.is_register()) {
7279         sub(Rj, i.as_register(), len);
7280       } else {
7281         mov(Rj, i.as_constant());
7282         sub(Rj, Rj, len);
7283       }
7284       // Rj == i-len
7285 
7286       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7287       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7288       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7289       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7290 
7291       // Ra = *++Pa;
7292       // Rb = *--Pb;
7293       // Rm = *++Pm;
7294       // Rn = *--Pn;
7295       ldr(Ra, pre(Pa, wordSize));
7296       ldr(Rb, pre(Pb, -wordSize));
7297       ldr(Rm, pre(Pm, wordSize));
7298       ldr(Rn, pre(Pn, -wordSize));
7299 
7300       mov(Rhi_mn, zr);
7301       mov(Rlo_mn, zr);
7302     }
7303 
7304     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7305       block_comment("post2");
7306       if (i.is_constant()) {
7307         mov(Rj, i.as_constant()-len.as_constant());
7308       } else {
7309         sub(Rj, i.as_register(), len);
7310       }
7311 
7312       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7313 
7314       // As soon as we know the least significant digit of our result,
7315       // store it.
7316       // Pm_base[i-len] = t0;
7317       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7318 
7319       // t0 = t1; t1 = t2; t2 = 0;
7320       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7321       adc(t1, t2, zr);
7322       mov(t2, zr);
7323     }
7324 
7325     // A carry in t0 after Montgomery multiplication means that we
7326     // should subtract multiples of n from our result in m.  We'll
7327     // keep doing that until there is no carry.
7328     void normalize(RegisterOrConstant len) {
7329       block_comment("normalize");
7330       // while (t0)
7331       //   t0 = sub(Pm_base, Pn_base, t0, len);
7332       Label loop, post, again;
7333       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7334       cbz(t0, post); {
7335         bind(again); {
7336           mov(i, zr);
7337           mov(cnt, len);
7338           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7339           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7340           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7341           align(16);
7342           bind(loop); {
7343             sbcs(Rm, Rm, Rn);
7344             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7345             add(i, i, 1);
7346             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7347             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7348             sub(cnt, cnt, 1);
7349           } cbnz(cnt, loop);
7350           sbc(t0, t0, zr);
7351         } cbnz(t0, again);
7352       } bind(post);
7353     }
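
    // In C, the sub() referenced in the pseudocode comments in this file
    // ("while (t0) t0 = sub(Pm_base, Pn_base, t0, len)") is approximately
    // the following multi-word subtract (a sketch, not shared-runtime code):
    //
    //   static julong sub(julong Pm[], julong Pn[], julong t0, int len) {
    //     julong borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       julong x = Pm[i], y = Pn[i];
    //       julong d = x - y;            // first subtraction
    //       julong b = (x < y);          // its borrow
    //       Pm[i]  = d - borrow;         // then subtract the incoming borrow
    //       borrow = b + (d < borrow);   // at most one of the two can borrow
    //     }
    //     return t0 - borrow;
    //   }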
7354 
7355     // Move memory at s to d, reversing words.
7356     //    Increments d to end of copied memory
7357     //    Destroys tmp1, tmp2
7358     //    Preserves len
7359     //    Leaves s pointing to the address which was in d at start
7360     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7361       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7362       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7363 
7364       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7365       mov(tmp1, len);
7366       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7367       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7368     }
7369     // where
7370     void reverse1(Register d, Register s, Register tmp) {
7371       ldr(tmp, pre(s, -wordSize));
7372       ror(tmp, tmp, 32);
7373       str(tmp, post(d, wordSize));
7374     }
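
    // In C, the copy performed by reverse()/reverse1() is approximately
    // (a sketch; the pointer side effects described above are omitted):
    //
    //   void reverse(julong d[], const julong s[], int len) {
    //     for (int i = 0; i < len; i++) {
    //       julong x = s[len - 1 - i];
    //       d[i] = (x << 32) | (x >> 32);   // swap the 32-bit halves (ror #32)
    //     }
    //   }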
7375 
7376     void step_squaring() {
7377       // An extra ACC
7378       step();
7379       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7380     }
7381 
7382     void last_squaring(RegisterOrConstant i) {
7383       Label dont;
7384       // if ((i & 1) == 0) {
7385       tbnz(i.as_register(), 0, dont); {
7386         // MACC(Ra, Rb, t0, t1, t2);
7387         // Ra = *++Pa;
7388         // Rb = *--Pb;
7389         umulh(Rhi_ab, Ra, Rb);
7390         mul(Rlo_ab, Ra, Rb);
7391         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7392       } bind(dont);
7393     }
7394 
7395     void extra_step_squaring() {
7396       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7397 
7398       // MACC(Rm, Rn, t0, t1, t2);
7399       // Rm = *++Pm;
7400       // Rn = *--Pn;
7401       umulh(Rhi_mn, Rm, Rn);
7402       mul(Rlo_mn, Rm, Rn);
7403       ldr(Rm, pre(Pm, wordSize));
7404       ldr(Rn, pre(Pn, -wordSize));
7405     }
7406 
7407     void post1_squaring() {
7408       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7409 
7410       // *Pm = Rm = t0 * inv;
7411       mul(Rm, t0, inv);
7412       str(Rm, Address(Pm));
7413 
7414       // MACC(Rm, Rn, t0, t1, t2);
7415       // t0 = t1; t1 = t2; t2 = 0;
7416       umulh(Rhi_mn, Rm, Rn);
7417 
7418 #ifndef PRODUCT
7419       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7420       {
7421         mul(Rlo_mn, Rm, Rn);
7422         add(Rlo_mn, t0, Rlo_mn);
7423         Label ok;
7424         cbz(Rlo_mn, ok); {
7425           stop("broken Montgomery multiply");
7426         } bind(ok);
7427       }
7428 #endif
7429       // We have very carefully set things up so that
7430       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7431       // the lower half of Rm * Rn because we know the result already:
7432       // it must be -t0.  t0 + (-t0) must generate a carry iff
7433       // t0 != 0.  So, rather than do a mul and an adds we just set
7434       // the carry flag iff t0 is nonzero (subs sets carry iff no borrow, i.e. iff t0 >= 1).
7435       //
7436       // mul(Rlo_mn, Rm, Rn);
7437       // adds(zr, t0, Rlo_mn);
7438       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7439       adcs(t0, t1, Rhi_mn);
7440       adc(t1, t2, zr);
7441       mov(t2, zr);
7442     }
7443 
7444     void acc(Register Rhi, Register Rlo,
7445              Register t0, Register t1, Register t2) {
7446       adds(t0, t0, Rlo);
7447       adcs(t1, t1, Rhi);
7448       adc(t2, t2, zr);
7449     }
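
    // For reference, the MACC()/MACC2() helpers used in the "In C,
    // approximately" comments below can be sketched as follows (an
    // illustration only, assuming a compiler-provided unsigned __int128;
    // acc() above is the accumulate half of MACC):
    //
    //   static void MACC(julong a, julong b, julong &t0, julong &t1, julong &t2) {
    //     unsigned __int128 prod = (unsigned __int128)a * b;
    //     unsigned __int128 sum  = (unsigned __int128)t0 + (julong)prod;
    //     t0 = (julong)sum;
    //     sum = (sum >> 64) + t1 + (julong)(prod >> 64);
    //     t1 = (julong)sum;
    //     t2 += (julong)(sum >> 64);
    //   }
    //
    //   // MACC2 accumulates 2*a*b, e.g. by applying MACC twice.
    //   static void MACC2(julong a, julong b, julong &t0, julong &t1, julong &t2) {
    //     MACC(a, b, t0, t1, t2);
    //     MACC(a, b, t0, t1, t2);
    //   }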
7450 
7451   public:
7452     /**
7453      * Fast Montgomery multiplication.  The derivation of the
7454      * algorithm is in A Cryptographic Library for the Motorola
7455      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7456      *
7457      * Arguments:
7458      *
7459      * Inputs for multiplication:
7460      *   c_rarg0   - int array elements a
7461      *   c_rarg1   - int array elements b
7462      *   c_rarg2   - int array elements n (the modulus)
7463      *   c_rarg3   - int length
7464      *   c_rarg4   - int inv
7465      *   c_rarg5   - int array elements m (the result)
7466      *
7467      * Inputs for squaring:
7468      *   c_rarg0   - int array elements a
7469      *   c_rarg1   - int array elements n (the modulus)
7470      *   c_rarg2   - int length
7471      *   c_rarg3   - int inv
7472      *   c_rarg4   - int array elements m (the result)
7473      *
7474      */
7475     address generate_multiply() {
7476       Label argh, nothing;
7477       bind(argh);
7478       stop("MontgomeryMultiply total_allocation must be <= 8192");
7479 
7480       align(CodeEntryAlignment);
7481       address entry = pc();
7482 
7483       cbzw(Rlen, nothing);
7484 
7485       enter();
7486 
7487       // Make room.
7488       cmpw(Rlen, 512);
7489       br(Assembler::HI, argh);
7490       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7491       andr(sp, Ra, -2 * wordSize);
7492 
7493       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7494 
7495       {
7496         // Copy input args, reversing as we go.  We use Ra as a
7497         // temporary variable.
7498         reverse(Ra, Pa_base, Rlen, t0, t1);
7499         if (!_squaring)
7500           reverse(Ra, Pb_base, Rlen, t0, t1);
7501         reverse(Ra, Pn_base, Rlen, t0, t1);
7502       }
7503 
7504       // Push all call-saved registers and also Pm_base which we'll need
7505       // at the end.
7506       save_regs();
7507 
7508 #ifndef PRODUCT
7509       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7510       {
7511         ldr(Rn, Address(Pn_base, 0));
7512         mul(Rlo_mn, Rn, inv);
7513         subs(zr, Rlo_mn, -1);
7514         Label ok;
7515         br(EQ, ok); {
7516           stop("broken inverse in Montgomery multiply");
7517         } bind(ok);
7518       }
7519 #endif
7520 
7521       mov(Pm_base, Ra);
7522 
7523       mov(t0, zr);
7524       mov(t1, zr);
7525       mov(t2, zr);
7526 
7527       block_comment("for (int i = 0; i < len; i++) {");
7528       mov(Ri, zr); {
7529         Label loop, end;
7530         cmpw(Ri, Rlen);
7531         br(Assembler::GE, end);
7532 
7533         bind(loop);
7534         pre1(Ri);
7535 
7536         block_comment("  for (j = i; j; j--) {"); {
7537           movw(Rj, Ri);
7538           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7539         } block_comment("  } // j");
7540 
7541         post1();
7542         addw(Ri, Ri, 1);
7543         cmpw(Ri, Rlen);
7544         br(Assembler::LT, loop);
7545         bind(end);
7546         block_comment("} // i");
7547       }
7548 
7549       block_comment("for (int i = len; i < 2*len; i++) {");
7550       mov(Ri, Rlen); {
7551         Label loop, end;
7552         cmpw(Ri, Rlen, Assembler::LSL, 1);
7553         br(Assembler::GE, end);
7554 
7555         bind(loop);
7556         pre2(Ri, Rlen);
7557 
7558         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7559           lslw(Rj, Rlen, 1);
7560           subw(Rj, Rj, Ri);
7561           subw(Rj, Rj, 1);
7562           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7563         } block_comment("  } // j");
7564 
7565         post2(Ri, Rlen);
7566         addw(Ri, Ri, 1);
7567         cmpw(Ri, Rlen, Assembler::LSL, 1);
7568         br(Assembler::LT, loop);
7569         bind(end);
7570       }
7571       block_comment("} // i");
7572 
7573       normalize(Rlen);
7574 
7575       mov(Ra, Pm_base);  // Save Pm_base in Ra
7576       restore_regs();  // Restore caller's Pm_base
7577 
7578       // Copy our result into caller's Pm_base
7579       reverse(Pm_base, Ra, Rlen, t0, t1);
7580 
7581       leave();
7582       bind(nothing);
7583       ret(lr);
7584 
7585       return entry;
7586     }
7587     // In C, approximately:
7588 
7589     // void
7590     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7591     //                     julong Pn_base[], julong Pm_base[],
7592     //                     julong inv, int len) {
7593     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7594     //   julong *Pa, *Pb, *Pn, *Pm;
7595     //   julong Ra, Rb, Rn, Rm;
7596 
7597     //   int i;
7598 
7599     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7600 
7601     //   for (i = 0; i < len; i++) {
7602     //     int j;
7603 
7604     //     Pa = Pa_base;
7605     //     Pb = Pb_base + i;
7606     //     Pm = Pm_base;
7607     //     Pn = Pn_base + i;
7608 
7609     //     Ra = *Pa;
7610     //     Rb = *Pb;
7611     //     Rm = *Pm;
7612     //     Rn = *Pn;
7613 
7614     //     int iters = i;
7615     //     for (j = 0; iters--; j++) {
7616     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7617     //       MACC(Ra, Rb, t0, t1, t2);
7618     //       Ra = *++Pa;
7619     //       Rb = *--Pb;
7620     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7621     //       MACC(Rm, Rn, t0, t1, t2);
7622     //       Rm = *++Pm;
7623     //       Rn = *--Pn;
7624     //     }
7625 
7626     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7627     //     MACC(Ra, Rb, t0, t1, t2);
7628     //     *Pm = Rm = t0 * inv;
7629     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7630     //     MACC(Rm, Rn, t0, t1, t2);
7631 
7632     //     assert(t0 == 0, "broken Montgomery multiply");
7633 
7634     //     t0 = t1; t1 = t2; t2 = 0;
7635     //   }
7636 
7637     //   for (i = len; i < 2*len; i++) {
7638     //     int j;
7639 
7640     //     Pa = Pa_base + i-len;
7641     //     Pb = Pb_base + len;
7642     //     Pm = Pm_base + i-len;
7643     //     Pn = Pn_base + len;
7644 
7645     //     Ra = *++Pa;
7646     //     Rb = *--Pb;
7647     //     Rm = *++Pm;
7648     //     Rn = *--Pn;
7649 
7650     //     int iters = len*2-i-1;
7651     //     for (j = i-len+1; iters--; j++) {
7652     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7653     //       MACC(Ra, Rb, t0, t1, t2);
7654     //       Ra = *++Pa;
7655     //       Rb = *--Pb;
7656     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7657     //       MACC(Rm, Rn, t0, t1, t2);
7658     //       Rm = *++Pm;
7659     //       Rn = *--Pn;
7660     //     }
7661 
7662     //     Pm_base[i-len] = t0;
7663     //     t0 = t1; t1 = t2; t2 = 0;
7664     //   }
7665 
7666     //   while (t0)
7667     //     t0 = sub(Pm_base, Pn_base, t0, len);
7668     // }
7669 
7670     /**
7671      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7672      * multiplies than Montgomery multiplication so it should be up to
7673      * 25% faster.  However, its loop control is more complex and it
7674      * may actually run slower on some machines.
7675      *
7676      * Arguments:
7677      *
7678      * Inputs:
7679      *   c_rarg0   - int array elements a
7680      *   c_rarg1   - int array elements n (the modulus)
7681      *   c_rarg2   - int length
7682      *   c_rarg3   - int inv
7683      *   c_rarg4   - int array elements m (the result)
7684      *
7685      */
7686     address generate_square() {
7687       Label argh;
7688       bind(argh);
7689       stop("MontgomeryMultiply total_allocation must be <= 8192");
7690 
7691       align(CodeEntryAlignment);
7692       address entry = pc();
7693 
7694       enter();
7695 
7696       // Make room.
7697       cmpw(Rlen, 512);
7698       br(Assembler::HI, argh);
7699       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7700       andr(sp, Ra, -2 * wordSize);
7701 
7702       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7703 
7704       {
7705         // Copy input args, reversing as we go.  We use Ra as a
7706         // temporary variable.
7707         reverse(Ra, Pa_base, Rlen, t0, t1);
7708         reverse(Ra, Pn_base, Rlen, t0, t1);
7709       }
7710 
7711       // Push all call-saved registers and also Pm_base which we'll need
7712       // at the end.
7713       save_regs();
7714 
7715       mov(Pm_base, Ra);
7716 
7717       mov(t0, zr);
7718       mov(t1, zr);
7719       mov(t2, zr);
7720 
7721       block_comment("for (int i = 0; i < len; i++) {");
7722       mov(Ri, zr); {
7723         Label loop, end;
7724         bind(loop);
7725         cmp(Ri, Rlen);
7726         br(Assembler::GE, end);
7727 
7728         pre1(Ri);
7729 
7730         block_comment("for (j = (i+1)/2; j; j--) {"); {
7731           add(Rj, Ri, 1);
7732           lsr(Rj, Rj, 1);
7733           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7734         } block_comment("  } // j");
7735 
7736         last_squaring(Ri);
7737 
7738         block_comment("  for (j = i/2; j; j--) {"); {
7739           lsr(Rj, Ri, 1);
7740           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7741         } block_comment("  } // j");
7742 
7743         post1_squaring();
7744         add(Ri, Ri, 1);
7745         cmp(Ri, Rlen);
7746         br(Assembler::LT, loop);
7747 
7748         bind(end);
7749         block_comment("} // i");
7750       }
7751 
7752       block_comment("for (int i = len; i < 2*len; i++) {");
7753       mov(Ri, Rlen); {
7754         Label loop, end;
7755         bind(loop);
7756         cmp(Ri, Rlen, Assembler::LSL, 1);
7757         br(Assembler::GE, end);
7758 
7759         pre2(Ri, Rlen);
7760 
7761         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7762           lsl(Rj, Rlen, 1);
7763           sub(Rj, Rj, Ri);
7764           sub(Rj, Rj, 1);
7765           lsr(Rj, Rj, 1);
7766           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7767         } block_comment("  } // j");
7768 
7769         last_squaring(Ri);
7770 
7771         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7772           lsl(Rj, Rlen, 1);
7773           sub(Rj, Rj, Ri);
7774           lsr(Rj, Rj, 1);
7775           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7776         } block_comment("  } // j");
7777 
7778         post2(Ri, Rlen);
7779         add(Ri, Ri, 1);
7780         cmp(Ri, Rlen, Assembler::LSL, 1);
7781 
7782         br(Assembler::LT, loop);
7783         bind(end);
7784         block_comment("} // i");
7785       }
7786 
7787       normalize(Rlen);
7788 
7789       mov(Ra, Pm_base);  // Save Pm_base in Ra
7790       restore_regs();  // Restore caller's Pm_base
7791 
7792       // Copy our result into caller's Pm_base
7793       reverse(Pm_base, Ra, Rlen, t0, t1);
7794 
7795       leave();
7796       ret(lr);
7797 
7798       return entry;
7799     }
7800     // In C, approximately:
7801 
7802     // void
7803     // montgomery_square(julong Pa_base[], julong Pn_base[],
7804     //                   julong Pm_base[], julong inv, int len) {
7805     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7806     //   julong *Pa, *Pb, *Pn, *Pm;
7807     //   julong Ra, Rb, Rn, Rm;
7808 
7809     //   int i;
7810 
7811     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7812 
7813     //   for (i = 0; i < len; i++) {
7814     //     int j;
7815 
7816     //     Pa = Pa_base;
7817     //     Pb = Pa_base + i;
7818     //     Pm = Pm_base;
7819     //     Pn = Pn_base + i;
7820 
7821     //     Ra = *Pa;
7822     //     Rb = *Pb;
7823     //     Rm = *Pm;
7824     //     Rn = *Pn;
7825 
7826     //     int iters = (i+1)/2;
7827     //     for (j = 0; iters--; j++) {
7828     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7829     //       MACC2(Ra, Rb, t0, t1, t2);
7830     //       Ra = *++Pa;
7831     //       Rb = *--Pb;
7832     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7833     //       MACC(Rm, Rn, t0, t1, t2);
7834     //       Rm = *++Pm;
7835     //       Rn = *--Pn;
7836     //     }
7837     //     if ((i & 1) == 0) {
7838     //       assert(Ra == Pa_base[j], "must be");
7839     //       MACC(Ra, Ra, t0, t1, t2);
7840     //     }
7841     //     iters = i/2;
7842     //     assert(iters == i-j, "must be");
7843     //     for (; iters--; j++) {
7844     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7845     //       MACC(Rm, Rn, t0, t1, t2);
7846     //       Rm = *++Pm;
7847     //       Rn = *--Pn;
7848     //     }
7849 
7850     //     *Pm = Rm = t0 * inv;
7851     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7852     //     MACC(Rm, Rn, t0, t1, t2);
7853 
7854     //     assert(t0 == 0, "broken Montgomery multiply");
7855 
7856     //     t0 = t1; t1 = t2; t2 = 0;
7857     //   }
7858 
7859     //   for (i = len; i < 2*len; i++) {
7860     //     int start = i-len+1;
7861     //     int end = start + (len - start)/2;
7862     //     int j;
7863 
7864     //     Pa = Pa_base + i-len;
7865     //     Pb = Pa_base + len;
7866     //     Pm = Pm_base + i-len;
7867     //     Pn = Pn_base + len;
7868 
7869     //     Ra = *++Pa;
7870     //     Rb = *--Pb;
7871     //     Rm = *++Pm;
7872     //     Rn = *--Pn;
7873 
7874     //     int iters = (2*len-i-1)/2;
7875     //     assert(iters == end-start, "must be");
7876     //     for (j = start; iters--; j++) {
7877     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7878     //       MACC2(Ra, Rb, t0, t1, t2);
7879     //       Ra = *++Pa;
7880     //       Rb = *--Pb;
7881     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7882     //       MACC(Rm, Rn, t0, t1, t2);
7883     //       Rm = *++Pm;
7884     //       Rn = *--Pn;
7885     //     }
7886     //     if ((i & 1) == 0) {
7887     //       assert(Ra == Pa_base[j], "must be");
7888     //       MACC(Ra, Ra, t0, t1, t2);
7889     //     }
7890     //     iters =  (2*len-i)/2;
7891     //     assert(iters == len-j, "must be");
7892     //     for (; iters--; j++) {
7893     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7894     //       MACC(Rm, Rn, t0, t1, t2);
7895     //       Rm = *++Pm;
7896     //       Rn = *--Pn;
7897     //     }
7898     //     Pm_base[i-len] = t0;
7899     //     t0 = t1; t1 = t2; t2 = 0;
7900     //   }
7901 
7902     //   while (t0)
7903     //     t0 = sub(Pm_base, Pn_base, t0, len);
7904     // }
7905   };
7906 
7907 
7908   // Initialization
7909   void generate_initial() {
7910     // Generate the initial stubs and initialize their entry points
7911 
7912     // Entry points that exist on all platforms.  Note: This is code
7913     // that could be shared among different platforms - however the
7914     // benefit seems to be smaller than the disadvantage of having a
7915     // much more complicated generator structure. See also comment in
7916     // stubRoutines.hpp.
7917 
7918     StubRoutines::_forward_exception_entry = generate_forward_exception();
7919 
7920     StubRoutines::_call_stub_entry =
7921       generate_call_stub(StubRoutines::_call_stub_return_address);
7922 
7923     // is referenced by megamorphic calls
7924     StubRoutines::_catch_exception_entry = generate_catch_exception();
7925 
7926     // Build this early so it's available for the interpreter.
7927     StubRoutines::_throw_StackOverflowError_entry =
7928       generate_throw_exception("StackOverflowError throw_exception",
7929                                CAST_FROM_FN_PTR(address,
7930                                                 SharedRuntime::throw_StackOverflowError));
7931     StubRoutines::_throw_delayed_StackOverflowError_entry =
7932       generate_throw_exception("delayed StackOverflowError throw_exception",
7933                                CAST_FROM_FN_PTR(address,
7934                                                 SharedRuntime::throw_delayed_StackOverflowError));
7935     if (UseCRC32Intrinsics) {
7936       // Set the table address before generating the stubs that use it
7937       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7938       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7939     }
7940 
7941     if (UseCRC32CIntrinsics) {
7942       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7943     }
7944 
7945     // Disabled until JDK-8210858 is fixed
7946     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7947     //   StubRoutines::_dlog = generate_dlog();
7948     // }
7949 
7950     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7951       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7952     }
7953 
7954     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7955       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7956     }
7957   }
7958 
7959   void generate_phase1() {
7960     // Continuation stubs:
7961     StubRoutines::_cont_thaw          = generate_cont_thaw();
7962     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
7963     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
7964 
7965     JFR_ONLY(StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();)
7966     JFR_ONLY(StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();)
7967   }
7968 
7969   void generate_all() {
7970     // support for verify_oop (must happen after universe_init)
7971     if (VerifyOops) {
7972       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
7973     }
7974     StubRoutines::_throw_AbstractMethodError_entry =
7975       generate_throw_exception("AbstractMethodError throw_exception",
7976                                CAST_FROM_FN_PTR(address,
7977                                                 SharedRuntime::
7978                                                 throw_AbstractMethodError));
7979 
7980     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7981       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7982                                CAST_FROM_FN_PTR(address,
7983                                                 SharedRuntime::
7984                                                 throw_IncompatibleClassChangeError));
7985 
7986     StubRoutines::_throw_NullPointerException_at_call_entry =
7987       generate_throw_exception("NullPointerException at call throw_exception",
7988                                CAST_FROM_FN_PTR(address,
7989                                                 SharedRuntime::
7990                                                 throw_NullPointerException_at_call));
7991 
7992     if (UseSVE == 0) {
7993       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
7994     }
7995 
7996     // arraycopy stubs used by compilers
7997     generate_arraycopy_stubs();
7998 
7999     // countPositives stub for large arrays.
8000     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8001 
8002     // array equals stub for large arrays.
8003     if (!UseSimpleArrayEquals) {
8004       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8005     }
8006 
8007     generate_compare_long_strings();
8008 
8009     generate_string_indexof_stubs();
8010 
8011     // byte_array_inflate stub for large arrays.
8012     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8013 
8014     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8015     if (bs_nm != nullptr) {
8016       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
8017     }
8018 #ifdef COMPILER2
8019     if (UseMultiplyToLenIntrinsic) {
8020       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8021     }
8022 
8023     if (UseSquareToLenIntrinsic) {
8024       StubRoutines::_squareToLen = generate_squareToLen();
8025     }
8026 
8027     if (UseMulAddIntrinsic) {
8028       StubRoutines::_mulAdd = generate_mulAdd();
8029     }
8030 
8031     if (UseSIMDForBigIntegerShiftIntrinsics) {
8032       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8033       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8034     }
8035 
8036     if (UseMontgomeryMultiplyIntrinsic) {
8037       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
8038       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8039       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8040     }
8041 
8042     if (UseMontgomerySquareIntrinsic) {
8043       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
8044       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8045       // We use generate_multiply() rather than generate_square()
8046       // because it's faster for the sizes of modulus we care about.
8047       StubRoutines::_montgomerySquare = g.generate_multiply();
8048     }
8049 #endif // COMPILER2
8050 
8051     if (UseChaCha20Intrinsics) {
8052       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
8053     }
8054 
8055     if (UseBASE64Intrinsics) {
8056       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8057       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8058     }
8059 
8060     // data cache line writeback
8061     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8062     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8063 
8064     if (UseAESIntrinsics) {
8065       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8066       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8067       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8068       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8069       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8070     }
8071     if (UseGHASHIntrinsics) {
8072       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8073       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8074     }
8075     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8076       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8077     }
8078 
8079     if (UseMD5Intrinsics) {
8080       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
8081       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
8082     }
8083     if (UseSHA1Intrinsics) {
8084       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
8085       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
8086     }
8087     if (UseSHA256Intrinsics) {
8088       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
8089       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
8090     }
8091     if (UseSHA512Intrinsics) {
8092       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
8093       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
8094     }
8095     if (UseSHA3Intrinsics) {
8096       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
8097       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
8098     }
8099 
8100     // generate Adler32 intrinsics code
8101     if (UseAdler32Intrinsics) {
8102       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8103     }
8104 
8105     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8106 
8107 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8108 
8109     generate_atomic_entry_points();
8110 
8111 #endif // LINUX
8112 
8113     StubRoutines::aarch64::set_completed();
8114   }
8115 
8116  public:
8117   StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
8118     if (phase == 0) {
8119       generate_initial();
8120     } else if (phase == 1) {
8121       generate_phase1(); // stubs that must be available for the interpreter
8122     } else {
8123       generate_all();
8124     }
8125   }
8126 }; // end class declaration
8127 
8128 #define UCM_TABLE_MAX_ENTRIES 8
8129 void StubGenerator_generate(CodeBuffer* code, int phase) {
8130   if (UnsafeCopyMemory::_table == nullptr) {
8131     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
8132   }
8133   StubGenerator g(code, phase);
8134 }
8135 
8136 
8137 #if defined (LINUX)
8138 
8139 // Define pointers to atomic stubs and initialize them to point to the
8140 // code in atomic_aarch64.S.
8141 
8142 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8143   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8144     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8145   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8146     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
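
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) below expands (modulo
// whitespace) to roughly:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;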
8147 
8148 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8149 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8150 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8151 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8152 DEFAULT_ATOMIC_OP(xchg, 4, )
8153 DEFAULT_ATOMIC_OP(xchg, 8, )
8154 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8155 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8156 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8157 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8158 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8159 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8160 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8161 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8162 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8163 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8164 
8165 #undef DEFAULT_ATOMIC_OP
8166 
8167 #endif // LINUX