1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "asm/register.hpp"
  29 #include "atomic_aarch64.hpp"
  30 #include "compiler/oopMap.hpp"
  31 #include "gc/shared/barrierSet.hpp"
  32 #include "gc/shared/barrierSetAssembler.hpp"
  33 #include "gc/shared/gc_globals.hpp"
  34 #include "gc/shared/tlab_globals.hpp"
  35 #include "interpreter/interpreter.hpp"
  36 #include "memory/universe.hpp"
  37 #include "nativeInst_aarch64.hpp"
  38 #include "oops/instanceOop.hpp"
  39 #include "oops/method.hpp"
  40 #include "oops/objArrayKlass.hpp"
  41 #include "oops/oop.inline.hpp"
  42 #include "prims/methodHandles.hpp"
  43 #include "prims/upcallLinker.hpp"
  44 #include "runtime/arguments.hpp"
  45 #include "runtime/atomic.hpp"
  46 #include "runtime/continuation.hpp"
  47 #include "runtime/continuationEntry.inline.hpp"
  48 #include "runtime/frame.inline.hpp"
  49 #include "runtime/handles.inline.hpp"
  50 #include "runtime/javaThread.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/stubCodeGenerator.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "utilities/align.hpp"
  55 #include "utilities/checkedCast.hpp"
  56 #include "utilities/debug.hpp"
  57 #include "utilities/globalDefinitions.hpp"
  58 #include "utilities/intpow.hpp"
  59 #include "utilities/powerOfTwo.hpp"
  60 #ifdef COMPILER2
  61 #include "opto/runtime.hpp"
  62 #endif
  63 #if INCLUDE_ZGC
  64 #include "gc/z/zThreadLocalData.hpp"
  65 #endif
  66 
  67 // Declaration and definition of StubGenerator (no .hpp file).
  68 // For a more detailed description of the stub routine structure
  69 // see the comment in stubRoutines.hpp
  70 
  71 #undef __
  72 #define __ _masm->
  73 
  74 #ifdef PRODUCT
  75 #define BLOCK_COMMENT(str) /* nothing */
  76 #else
  77 #define BLOCK_COMMENT(str) __ block_comment(str)
  78 #endif
  79 
  80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  81 
  82 // Stub Code definitions
  83 
  84 class StubGenerator: public StubCodeGenerator {
  85  private:
  86 
  87 #ifdef PRODUCT
  88 #define inc_counter_np(counter) ((void)0)
  89 #else
  90   void inc_counter_np_(uint& counter) {
  91     __ incrementw(ExternalAddress((address)&counter));
  92   }
  93 #define inc_counter_np(counter) \
  94   BLOCK_COMMENT("inc_counter " #counter); \
  95   inc_counter_np_(counter);
  96 #endif
  97 
  98   // Call stubs are used to call Java from C
  99   //
 100   // Arguments:
 101   //    c_rarg0:   call wrapper address                   address
 102   //    c_rarg1:   result                                 address
 103   //    c_rarg2:   result type                            BasicType
 104   //    c_rarg3:   method                                 Method*
 105   //    c_rarg4:   (interpreter) entry point              address
 106   //    c_rarg5:   parameters                             intptr_t*
 107   //    c_rarg6:   parameter size (in words)              int
 108   //    c_rarg7:   thread                                 Thread*
 109   //
 110   // There is no return from the stub itself as any Java result
 111   // is written to result
 112   //
 113   // we save r30 (lr) as the return PC at the base of the frame and
 114   // link r29 (fp) below it as the frame pointer installing sp (r31)
 115   // into fp.
 116   //
 117   // we save r0-r7, which accounts for all the c arguments.
 118   //
 119   // TODO: strictly do we need to save them all? they are treated as
 120   // volatile by C so could we omit saving the ones we are going to
 121   // place in global registers (thread? method?) or those we only use
 122   // during setup of the Java call?
 123   //
 124   // we don't need to save r8 which C uses as an indirect result location
 125   // return register.
 126   //
 127   // we don't need to save r9-r15 which both C and Java treat as
 128   // volatile
 129   //
 130   // we don't need to save r16-18 because Java does not use them
 131   //
 132   // we save r19-r28 which Java uses as scratch registers and C
 133   // expects to be callee-save
 134   //
 135   // we save the bottom 64 bits of each value stored in v8-v15; it is
 136   // the responsibility of the caller to preserve larger values.
 137   //
 138   // so the stub frame looks like this when we enter Java code
 139   //
 140   //     [ return_from_Java     ] <--- sp
 141   //     [ argument word n      ]
 142   //      ...
 143   // -29 [ argument word 1      ]
 144   // -28 [ saved Floating-point Control Register ] <--- sp_after_call
 145   // -26 [ saved v15            ]
 146   // -25 [ saved v14            ]
 147   // -24 [ saved v13            ]
 148   // -23 [ saved v12            ]
 149   // -22 [ saved v11            ]
 150   // -21 [ saved v10            ]
 151   // -20 [ saved v9             ]
 152   // -19 [ saved v8             ]
 153   // -18 [ saved r28            ]
 154   // -17 [ saved r27            ]
 155   // -16 [ saved r26            ]
 156   // -15 [ saved r25            ]
 157   // -14 [ saved r24            ]
 158   // -13 [ saved r23            ]
 159   // -12 [ saved r22            ]
 160   // -11 [ saved r21            ]
 161   // -10 [ saved r20            ]
 162   //  -9 [ saved r19            ]
 163   //  -8 [ call wrapper    (r0) ]
 164   //  -7 [ result          (r1) ]
 165   //  -6 [ result type     (r2) ]
 166   //  -5 [ method          (r3) ]
 167   //  -4 [ entry point     (r4) ]
 168   //  -3 [ parameters      (r5) ]
 169   //  -2 [ parameter size  (r6) ]
 170   //  -1 [ thread (r7)          ]
 171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 172   //   1 [ saved lr       (r30) ]
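       //
       // For reference, this stub is reached from C++ through the CallStub
       // function pointer returned by StubRoutines::call_stub(). The
       // invocation in JavaCalls::call_helper looks roughly like the sketch
       // below (illustrative only -- argument names are approximate):
       //
       //   StubRoutines::call_stub()(
       //     (address)&link,              // c_rarg0: call wrapper
       //     result_val_address,          // c_rarg1: where to store the result
       //     result_type,                 // c_rarg2: BasicType of the result
       //     method(),                    // c_rarg3: Method*
       //     entry_point,                 // c_rarg4: (interpreter) entry point
       //     args->parameters(),          // c_rarg5: parameter words
       //     args->size_of_parameters(),  // c_rarg6: parameter count in words
       //     thread);                     // c_rarg7: current thread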
 173 
 174   // Call stub stack layout word offsets from fp
 175   enum call_stub_layout {
 176     sp_after_call_off  = -28,
 177 
 178     fpcr_off           = sp_after_call_off,
 179     d15_off            = -26,
 180     d13_off            = -24,
 181     d11_off            = -22,
 182     d9_off             = -20,
 183 
 184     r28_off            = -18,
 185     r26_off            = -16,
 186     r24_off            = -14,
 187     r22_off            = -12,
 188     r20_off            = -10,
 189     call_wrapper_off   =  -8,
 190     result_off         =  -7,
 191     result_type_off    =  -6,
 192     method_off         =  -5,
 193     entry_point_off    =  -4,
 194     parameter_size_off =  -2,
 195     thread_off         =  -1,
 196     fp_f               =   0,
 197     retaddr_off        =   1,
 198   };
 199 
 200   address generate_call_stub(address& return_address) {
 201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 203            "adjust this code");
 204 
 205     StubGenStubId stub_id = StubGenStubId::call_stub_id;
 206     StubCodeMark mark(this, stub_id);
 207     address start = __ pc();
 208 
 209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
 210 
 211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
 212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 213     const Address result        (rfp, result_off         * wordSize);
 214     const Address result_type   (rfp, result_type_off    * wordSize);
 215     const Address method        (rfp, method_off         * wordSize);
 216     const Address entry_point   (rfp, entry_point_off    * wordSize);
 217     const Address parameter_size(rfp, parameter_size_off * wordSize);
 218 
 219     const Address thread        (rfp, thread_off         * wordSize);
 220 
 221     const Address d15_save      (rfp, d15_off * wordSize);
 222     const Address d13_save      (rfp, d13_off * wordSize);
 223     const Address d11_save      (rfp, d11_off * wordSize);
 224     const Address d9_save       (rfp, d9_off * wordSize);
 225 
 226     const Address r28_save      (rfp, r28_off * wordSize);
 227     const Address r26_save      (rfp, r26_off * wordSize);
 228     const Address r24_save      (rfp, r24_off * wordSize);
 229     const Address r22_save      (rfp, r22_off * wordSize);
 230     const Address r20_save      (rfp, r20_off * wordSize);
 231 
 232     // stub code
 233 
 234     address aarch64_entry = __ pc();
 235 
 236     // set up frame and move sp to end of save area
 237     __ enter();
 238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 239 
 240     // save register parameters and Java scratch/global registers
 241     // n.b. we save thread even though it gets installed in
 242     // rthread because we want to sanity check rthread later
 243     __ str(c_rarg7,  thread);
 244     __ strw(c_rarg6, parameter_size);
 245     __ stp(c_rarg4, c_rarg5,  entry_point);
 246     __ stp(c_rarg2, c_rarg3,  result_type);
 247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 248 
 249     __ stp(r20, r19,   r20_save);
 250     __ stp(r22, r21,   r22_save);
 251     __ stp(r24, r23,   r24_save);
 252     __ stp(r26, r25,   r26_save);
 253     __ stp(r28, r27,   r28_save);
 254 
 255     __ stpd(v9,  v8,   d9_save);
 256     __ stpd(v11, v10,  d11_save);
 257     __ stpd(v13, v12,  d13_save);
 258     __ stpd(v15, v14,  d15_save);
 259 
 260     __ get_fpcr(rscratch1);
 261     __ str(rscratch1, fpcr_save);
 262     // Set FPCR to the state we need. We do want Round to Nearest. We
 263     // don't want non-IEEE rounding modes or floating-point traps.
 264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
 265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
 266     __ set_fpcr(rscratch1);
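         // For reference (ARM architecture): the two bfi instructions above
         // clear FPCR.RMode (bits 23:22), FPCR.FZ (bit 24) and FPCR.DN
         // (bit 25), i.e. round-to-nearest with no flush-to-zero or default
         // NaN, and clear the trap-enable bits IOE/DZE/OFE/UFE/IXE
         // (bits 8..12), so no floating-point exceptions can trap.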
 267 
 268     // install Java thread in global register now we have saved
 269     // whatever value it held
 270     __ mov(rthread, c_rarg7);
 271     // And method
 272     __ mov(rmethod, c_rarg3);
 273 
 274     // set up the heapbase register
 275     __ reinit_heapbase();
 276 
 277 #ifdef ASSERT
 278     // make sure we have no pending exceptions
 279     {
 280       Label L;
 281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 282       __ cmp(rscratch1, (u1)NULL_WORD);
 283       __ br(Assembler::EQ, L);
 284       __ stop("StubRoutines::call_stub: entered with pending exception");
 285       __ BIND(L);
 286     }
 287 #endif
 288     // pass parameters if any
 289     __ mov(esp, sp);
 290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 291     __ andr(sp, rscratch1, -2 * wordSize);
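         // n.b. masking with -2 * wordSize rounds sp down to a 16-byte
         // boundary, as required of the stack pointer by the AArch64 ABI.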
 292 
 293     BLOCK_COMMENT("pass parameters if any");
 294     Label parameters_done;
 295     // parameter count is still in c_rarg6
 296     // and parameter pointer identifying param 1 is in c_rarg5
 297     __ cbzw(c_rarg6, parameters_done);
 298 
 299     address loop = __ pc();
 300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 301     __ subsw(c_rarg6, c_rarg6, 1);
 302     __ push(rscratch1);
 303     __ br(Assembler::GT, loop);
 304 
 305     __ BIND(parameters_done);
 306 
 307     // call Java entry -- passing Method* and current sp
 308     //      rmethod: Method*
 309     //      r19_sender_sp: sender sp
 310     BLOCK_COMMENT("call Java function");
 311     __ mov(r19_sender_sp, sp);
 312     __ blr(c_rarg4);
 313 
 314     // we do this here because the notify will already have been done
 315     // if we get to the next instruction via an exception
 316     //
 317     // n.b. adding this instruction here affects the calculation of
 318     // whether or not a routine returns to the call stub (used when
 319     // doing stack walks) since the normal test is to check the return
 320     // pc against the address saved below. so we may need to allow for
 321     // this extra instruction in the check.
 322 
 323     // save current address for use by exception handling code
 324 
 325     return_address = __ pc();
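         // (stack walking code recognises a return into the call stub by
         // comparing a frame's return pc with this saved address; see
         // StubRoutines::returns_to_call_stub)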
 326 
 327     // store result depending on type (everything that is not
 328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 329     // n.b. this assumes Java returns an integral result in r0
 330     // and a floating result in j_farg0
 331     __ ldr(j_rarg2, result);
 332     Label is_long, is_float, is_double, exit;
 333     __ ldr(j_rarg1, result_type);
 334     __ cmp(j_rarg1, (u1)T_OBJECT);
 335     __ br(Assembler::EQ, is_long);
 336     __ cmp(j_rarg1, (u1)T_LONG);
 337     __ br(Assembler::EQ, is_long);
 338     __ cmp(j_rarg1, (u1)T_FLOAT);
 339     __ br(Assembler::EQ, is_float);
 340     __ cmp(j_rarg1, (u1)T_DOUBLE);
 341     __ br(Assembler::EQ, is_double);
 342 
 343     // handle T_INT case
 344     __ strw(r0, Address(j_rarg2));
 345 
 346     __ BIND(exit);
 347 
 348     // pop parameters
 349     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 350 
 351 #ifdef ASSERT
 352     // verify that threads correspond
 353     {
 354       Label L, S;
 355       __ ldr(rscratch1, thread);
 356       __ cmp(rthread, rscratch1);
 357       __ br(Assembler::NE, S);
 358       __ get_thread(rscratch1);
 359       __ cmp(rthread, rscratch1);
 360       __ br(Assembler::EQ, L);
 361       __ BIND(S);
 362       __ stop("StubRoutines::call_stub: threads must correspond");
 363       __ BIND(L);
 364     }
 365 #endif
 366 
 367     __ pop_cont_fastpath(rthread);
 368 
 369     // restore callee-save registers
 370     __ ldpd(v15, v14,  d15_save);
 371     __ ldpd(v13, v12,  d13_save);
 372     __ ldpd(v11, v10,  d11_save);
 373     __ ldpd(v9,  v8,   d9_save);
 374 
 375     __ ldp(r28, r27,   r28_save);
 376     __ ldp(r26, r25,   r26_save);
 377     __ ldp(r24, r23,   r24_save);
 378     __ ldp(r22, r21,   r22_save);
 379     __ ldp(r20, r19,   r20_save);
 380 
 381     // restore fpcr
 382     __ ldr(rscratch1,  fpcr_save);
 383     __ set_fpcr(rscratch1);
 384 
 385     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 386     __ ldrw(c_rarg2, result_type);
 387     __ ldr(c_rarg3,  method);
 388     __ ldp(c_rarg4, c_rarg5,  entry_point);
 389     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 390 
 391     // leave frame and return to caller
 392     __ leave();
 393     __ ret(lr);
 394 
 395     // handle return types different from T_INT
 396 
 397     __ BIND(is_long);
 398     __ str(r0, Address(j_rarg2, 0));
 399     __ br(Assembler::AL, exit);
 400 
 401     __ BIND(is_float);
 402     __ strs(j_farg0, Address(j_rarg2, 0));
 403     __ br(Assembler::AL, exit);
 404 
 405     __ BIND(is_double);
 406     __ strd(j_farg0, Address(j_rarg2, 0));
 407     __ br(Assembler::AL, exit);
 408 
 409     return start;
 410   }
 411 
 412   // Return point for a Java call if there's an exception thrown in
 413   // Java code.  The exception is caught and transformed into a
 414   // pending exception stored in JavaThread that can be tested from
 415   // within the VM.
 416   //
 417   // Note: Usually the parameters are removed by the callee. In case
 418   // of an exception crossing an activation frame boundary, that is
 419   // not the case if the callee is compiled code => need to set up the
 420   // sp.
 421   //
 422   // r0: exception oop
 423 
 424   address generate_catch_exception() {
 425     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
 426     StubCodeMark mark(this, stub_id);
 427     address start = __ pc();
 428 
 429     // same as in generate_call_stub():
 430     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 431     const Address thread        (rfp, thread_off         * wordSize);
 432 
 433 #ifdef ASSERT
 434     // verify that threads correspond
 435     {
 436       Label L, S;
 437       __ ldr(rscratch1, thread);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::NE, S);
 440       __ get_thread(rscratch1);
 441       __ cmp(rthread, rscratch1);
 442       __ br(Assembler::EQ, L);
 443       __ bind(S);
 444       __ stop("StubRoutines::catch_exception: threads must correspond");
 445       __ bind(L);
 446     }
 447 #endif
 448 
 449     // set pending exception
 450     __ verify_oop(r0);
 451 
 452     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 453     __ mov(rscratch1, (address)__FILE__);
 454     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 455     __ movw(rscratch1, (int)__LINE__);
 456     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 457 
 458     // complete return to VM
 459     assert(StubRoutines::_call_stub_return_address != nullptr,
 460            "_call_stub_return_address must have been generated before");
 461     __ b(StubRoutines::_call_stub_return_address);
 462 
 463     return start;
 464   }
 465 
 466   // Continuation point for runtime calls returning with a pending
 467   // exception.  The pending exception check happened in the runtime
 468   // or native call stub.  The pending exception in Thread is
 469   // converted into a Java-level exception.
 470   //
 471   // Contract with Java-level exception handlers:
 472   // r0: exception
 473   // r3: throwing pc
 474   //
 475   // NOTE: At entry of this stub, exception-pc must be in LR !!
 476 
 477   // NOTE: this is always used as a jump target within generated code
 478   // so it just needs to be generated code with no prolog
 479 
 480   address generate_forward_exception() {
 481     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
 482     StubCodeMark mark(this, stub_id);
 483     address start = __ pc();
 484 
 485     // Upon entry, LR points to the return address returning into
 486     // Java (interpreted or compiled) code; i.e., the return address
 487     // becomes the throwing pc.
 488     //
 489     // Arguments pushed before the runtime call are still on the stack
 490     // but the exception handler will reset the stack pointer ->
 491     // ignore them.  A potential result in registers can be ignored as
 492     // well.
 493 
 494 #ifdef ASSERT
 495     // make sure this code is only executed if there is a pending exception
 496     {
 497       Label L;
 498       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 499       __ cbnz(rscratch1, L);
 500       __ stop("StubRoutines::forward exception: no pending exception (1)");
 501       __ bind(L);
 502     }
 503 #endif
 504 
 505     // compute exception handler into r19
 506 
 507     // call the VM to find the handler address associated with the
 508     // caller address. pass thread in r0 and caller pc (ret address)
 509     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 510     // the stack.
 511     __ mov(c_rarg1, lr);
 512     // lr will be trashed by the VM call so we move it to R19
 513     // (callee-saved) because we also need to pass it to the handler
 514     // returned by this call.
 515     __ mov(r19, lr);
 516     BLOCK_COMMENT("call exception_handler_for_return_address");
 517     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 518                          SharedRuntime::exception_handler_for_return_address),
 519                     rthread, c_rarg1);
 520     // Reinitialize the ptrue predicate register, in case the external runtime
 521     // call clobbers ptrue reg, as we may return to SVE compiled code.
 522     __ reinitialize_ptrue();
 523 
 524     // we should not really care that lr is no longer the callee
 525     // address. we saved the value the handler needs in r19 so we can
 526     // just copy it to r3. however, the C2 handler will push its own
 527     // frame and then call into the VM, and the VM code asserts that
 528     // the PC for the frame above the handler belongs to a compiled
 529     // Java method. So, we restore lr here to satisfy that assert.
 530     __ mov(lr, r19);
 531     // setup r0 & r3 & clear pending exception
 532     __ mov(r3, r19);
 533     __ mov(r19, r0);
 534     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 535     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 536 
 537 #ifdef ASSERT
 538     // make sure exception is set
 539     {
 540       Label L;
 541       __ cbnz(r0, L);
 542       __ stop("StubRoutines::forward exception: no pending exception (2)");
 543       __ bind(L);
 544     }
 545 #endif
 546 
 547     // continue at exception handler
 548     // r0: exception
 549     // r3: throwing pc
 550     // r19: exception handler
 551     __ verify_oop(r0);
 552     __ br(r19);
 553 
 554     return start;
 555   }
 556 
 557   // Non-destructive plausibility checks for oops
 558   //
 559   // Arguments:
 560   //    r0: oop to verify
 561   //    rscratch1: error message
 562   //
 563   // Stack after saving c_rarg3:
 564   //    [tos + 0]: saved c_rarg3
 565   //    [tos + 1]: saved c_rarg2
 566   //    [tos + 2]: saved lr
 567   //    [tos + 3]: saved rscratch2
 568   //    [tos + 4]: saved r0
 569   //    [tos + 5]: saved rscratch1
 570   address generate_verify_oop() {
 571     StubGenStubId stub_id = StubGenStubId::verify_oop_id;
 572     StubCodeMark mark(this, stub_id);
 573     address start = __ pc();
 574 
 575     Label exit, error;
 576 
 577     // save c_rarg2 and c_rarg3
 578     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 579 
 580     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 581     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 582     __ ldr(c_rarg3, Address(c_rarg2));
 583     __ add(c_rarg3, c_rarg3, 1);
 584     __ str(c_rarg3, Address(c_rarg2));
 585 
 586     // object is in r0
 587     // make sure object is 'reasonable'
 588     __ cbz(r0, exit); // if obj is null it is OK
 589 
 590     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 591     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 592 
 593     // return if everything seems ok
 594     __ bind(exit);
 595 
 596     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 597     __ ret(lr);
 598 
 599     // handle errors
 600     __ bind(error);
 601     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 602 
 603     __ push(RegSet::range(r0, r29), sp);
 604     // debug(char* msg, int64_t pc, int64_t regs[])
 605     __ mov(c_rarg0, rscratch1);      // pass address of error message
 606     __ mov(c_rarg1, lr);             // pass return address
 607     __ mov(c_rarg2, sp);             // pass address of regs on stack
 608 #ifndef PRODUCT
 609     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 610 #endif
 611     BLOCK_COMMENT("call MacroAssembler::debug");
 612     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 613     __ blr(rscratch1);
 614     __ hlt(0);
 615 
 616     return start;
 617   }
 618 
 619   // Generate indices for iota vector.
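       // The table emitted below holds consecutive lane indices
       // (0, 1, 2, ...) for each element size -- bytes, halfwords, words and
       // doublewords -- followed by single- and double-precision FP
       // variants. Vector code loads the slice matching its element type,
       // e.g. when an index vector needs to be materialized.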
 620   address generate_iota_indices(StubGenStubId stub_id) {
 621     __ align(CodeEntryAlignment);
 622     StubCodeMark mark(this, stub_id);
 623     address start = __ pc();
 624     // B
 625     __ emit_data64(0x0706050403020100, relocInfo::none);
 626     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 627     // H
 628     __ emit_data64(0x0003000200010000, relocInfo::none);
 629     __ emit_data64(0x0007000600050004, relocInfo::none);
 630     // S
 631     __ emit_data64(0x0000000100000000, relocInfo::none);
 632     __ emit_data64(0x0000000300000002, relocInfo::none);
 633     // D
 634     __ emit_data64(0x0000000000000000, relocInfo::none);
 635     __ emit_data64(0x0000000000000001, relocInfo::none);
 636     // S - FP
 637     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 638     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 639     // D - FP
 640     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 641     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 642     return start;
 643   }
 644 
 645   // The inner part of zero_words().  This is the bulk operation,
 646   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 647   // caller is responsible for zeroing the last few words.
 648   //
 649   // Inputs:
 650   // r10: the HeapWord-aligned base address of an array to zero.
 651   // r11: the count in HeapWords, r11 > 0.
 652   //
 653   // Returns r10 and r11, adjusted for the caller to clear.
 654   // r10: the base address of the tail of words left to clear.
 655   // r11: the number of words in the tail.
 656   //      r11 < MacroAssembler::zero_words_block_size.
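       //
       // This stub is called from MacroAssembler::zero_words(); the caller
       // zeroes the remaining tail words itself, which is why the stub ends
       // with ret(lr) rather than falling through.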
 657 
 658   address generate_zero_blocks() {
 659     Label done;
 660     Label base_aligned;
 661 
 662     Register base = r10, cnt = r11;
 663 
 664     __ align(CodeEntryAlignment);
 665     StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
 666     StubCodeMark mark(this, stub_id);
 667     address start = __ pc();
 668 
 669     if (UseBlockZeroing) {
 670       int zva_length = VM_Version::zva_length();
 671 
 672       // Ensure ZVA length can be divided by 16. This is required by
 673       // the subsequent operations.
 674       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 675 
 676       __ tbz(base, 3, base_aligned);
 677       __ str(zr, Address(__ post(base, 8)));
 678       __ sub(cnt, cnt, 1);
 679       __ bind(base_aligned);
 680 
 681       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 682       // alignment.
 683       Label small;
 684       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 685       __ subs(rscratch1, cnt, low_limit >> 3);
 686       __ br(Assembler::LT, small);
 687       __ zero_dcache_blocks(base, cnt);
 688       __ bind(small);
 689     }
 690 
 691     {
 692       // Number of stp instructions we'll unroll
 693       const int unroll =
 694         MacroAssembler::zero_words_block_size / 2;
 695       // Clear the remaining blocks.
 696       Label loop;
 697       __ subs(cnt, cnt, unroll * 2);
 698       __ br(Assembler::LT, done);
 699       __ bind(loop);
 700       for (int i = 0; i < unroll; i++)
 701         __ stp(zr, zr, __ post(base, 16));
 702       __ subs(cnt, cnt, unroll * 2);
 703       __ br(Assembler::GE, loop);
 704       __ bind(done);
 705       __ add(cnt, cnt, unroll * 2);
 706     }
 707 
 708     __ ret(lr);
 709 
 710     return start;
 711   }
 712 
 713 
 714   typedef enum {
 715     copy_forwards = 1,
 716     copy_backwards = -1
 717   } copy_direction;
 718 
 719   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 720   // for arraycopy stubs.
 721   class ArrayCopyBarrierSetHelper : StackObj {
 722     BarrierSetAssembler* _bs_asm;
 723     MacroAssembler* _masm;
 724     DecoratorSet _decorators;
 725     BasicType _type;
 726     Register _gct1;
 727     Register _gct2;
 728     Register _gct3;
 729     FloatRegister _gcvt1;
 730     FloatRegister _gcvt2;
 731     FloatRegister _gcvt3;
 732 
 733   public:
 734     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 735                               DecoratorSet decorators,
 736                               BasicType type,
 737                               Register gct1,
 738                               Register gct2,
 739                               Register gct3,
 740                               FloatRegister gcvt1,
 741                               FloatRegister gcvt2,
 742                               FloatRegister gcvt3)
 743       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 744         _masm(masm),
 745         _decorators(decorators),
 746         _type(type),
 747         _gct1(gct1),
 748         _gct2(gct2),
 749         _gct3(gct3),
 750         _gcvt1(gcvt1),
 751         _gcvt2(gcvt2),
 752         _gcvt3(gcvt3) {
 753     }
 754 
 755     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 756       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 757                             dst1, dst2, src,
 758                             _gct1, _gct2, _gcvt1);
 759     }
 760 
 761     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 762       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 763                              dst, src1, src2,
 764                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 765     }
 766 
 767     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 768       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 769                             dst1, dst2, src,
 770                             _gct1);
 771     }
 772 
 773     void copy_store_at_16(Address dst, Register src1, Register src2) {
 774       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 775                              dst, src1, src2,
 776                              _gct1, _gct2, _gct3);
 777     }
 778 
 779     void copy_load_at_8(Register dst, Address src) {
 780       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 781                             dst, noreg, src,
 782                             _gct1);
 783     }
 784 
 785     void copy_store_at_8(Address dst, Register src) {
 786       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 787                              dst, src, noreg,
 788                              _gct1, _gct2, _gct3);
 789     }
 790   };
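       // When no GC barrier needs to see the copy (e.g. for primitive
       // element types) these helpers are expected to reduce to plain
       // ldp/stp and ldpq/stpq sequences; for oop copies a collector such
       // as ZGC may instead route the accesses through its barrier code,
       // using the spare gct*/gcvt* registers as temporaries.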
 791 
 792   // Bulk copy of blocks of 8 words.
 793   //
 794   // count is a count of words.
 795   //
 796   // Precondition: count >= 8
 797   //
 798   // Postconditions:
 799   //
 800   // The least significant bit of count contains the remaining count
 801   // of words to copy.  The rest of count is trash.
 802   //
 803   // s and d are adjusted to point to the remaining words to copy
 804   //
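       // Note that the code emitted here is not a separately published stub:
       // copy_memory() below reaches it with a bl to the label passed in via
       // 'start', so each variant finishes with ret(lr).
       //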
 805   void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
 806     BasicType type;
 807     copy_direction direction;
 808 
 809     switch (stub_id) {
 810     case copy_byte_f_id:
 811       direction = copy_forwards;
 812       type = T_BYTE;
 813       break;
 814     case copy_byte_b_id:
 815       direction = copy_backwards;
 816       type = T_BYTE;
 817       break;
 818     case copy_oop_f_id:
 819       direction = copy_forwards;
 820       type = T_OBJECT;
 821       break;
 822     case copy_oop_b_id:
 823       direction = copy_backwards;
 824       type = T_OBJECT;
 825       break;
 826     case copy_oop_uninit_f_id:
 827       direction = copy_forwards;
 828       type = T_OBJECT;
 829       break;
 830     case copy_oop_uninit_b_id:
 831       direction = copy_backwards;
 832       type = T_OBJECT;
 833       break;
 834     default:
 835       ShouldNotReachHere();
 836     }
 837 
 838     int unit = wordSize * direction;
 839     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 840 
 841     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 842       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 843     const Register stride = r14;
 844     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 845     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 846     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 847 
 848     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 849     assert_different_registers(s, d, count, rscratch1, rscratch2);
 850 
 851     Label again, drain;
 852 
 853     __ align(CodeEntryAlignment);
 854 
 855     StubCodeMark mark(this, stub_id);
 856 
 857     __ bind(start);
 858 
 859     Label unaligned_copy_long;
 860     if (AvoidUnalignedAccesses) {
 861       __ tbnz(d, 3, unaligned_copy_long);
 862     }
 863 
 864     if (direction == copy_forwards) {
 865       __ sub(s, s, bias);
 866       __ sub(d, d, bias);
 867     }
 868 
 869 #ifdef ASSERT
 870     // Make sure we are never given < 8 words
 871     {
 872       Label L;
 873       __ cmp(count, (u1)8);
 874       __ br(Assembler::GE, L);
 875       __ stop("generate_copy_longs called with < 8 words");
 876       __ bind(L);
 877     }
 878 #endif
 879 
 880     // Fill 8 registers
 881     if (UseSIMDForMemoryOps) {
 882       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 883       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 884     } else {
 885       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 886       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 887       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 888       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 889     }
 890 
 891     __ subs(count, count, 16);
 892     __ br(Assembler::LO, drain);
 893 
 894     int prefetch = PrefetchCopyIntervalInBytes;
 895     bool use_stride = false;
 896     if (direction == copy_backwards) {
 897        use_stride = prefetch > 256;
 898        prefetch = -prefetch;
 899        if (use_stride) __ mov(stride, prefetch);
 900     }
 901 
 902     __ bind(again);
 903 
 904     if (PrefetchCopyIntervalInBytes > 0)
 905       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 906 
 907     if (UseSIMDForMemoryOps) {
 908       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 909       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 910       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 911       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 912     } else {
 913       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 914       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 915       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 916       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 917       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 918       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 919       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 920       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 921     }
 922 
 923     __ subs(count, count, 8);
 924     __ br(Assembler::HS, again);
 925 
 926     // Drain
 927     __ bind(drain);
 928     if (UseSIMDForMemoryOps) {
 929       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 930       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 931     } else {
 932       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 933       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 934       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 936     }
 937 
 938     {
 939       Label L1, L2;
 940       __ tbz(count, exact_log2(4), L1);
 941       if (UseSIMDForMemoryOps) {
 942         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 943         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 944       } else {
 945         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 946         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 947         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 948         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 949       }
 950       __ bind(L1);
 951 
 952       if (direction == copy_forwards) {
 953         __ add(s, s, bias);
 954         __ add(d, d, bias);
 955       }
 956 
 957       __ tbz(count, 1, L2);
 958       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 959       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 960       __ bind(L2);
 961     }
 962 
 963     __ ret(lr);
 964 
 965     if (AvoidUnalignedAccesses) {
 966       Label drain, again;
 967       // Register order for storing. Order is different for backward copy.
 968 
 969       __ bind(unaligned_copy_long);
 970 
 971       // source address is even word (16-byte) aligned, target is odd word aligned
 972       //
 973       // when forward copying word pairs we read long pairs at offsets
 974       // {0, 2, 4, 6} (in long words). when backwards copying we read
 975       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 976       // address by -2 in the forwards case so we can compute the
 977       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 978       // or -1.
 979       //
 980       // when forward copying we need to store 1 word, 3 pairs and
 981       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 982       // zero offset we adjust the destination by -1, which means we
 983       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 984       //
 985       // When backwards copying we need to store 1 word, 3 pairs and
 986       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 987       // offsets {1, 3, 5, 7, 8} * unit.
 988 
 989       if (direction == copy_forwards) {
 990         __ sub(s, s, 16);
 991         __ sub(d, d, 8);
 992       }
 993 
 994       // Fill 8 registers
 995       //
 996       // for forwards copy s was offset by -16 from the original input
 997       // value of s so the register contents are at these offsets
 998       // relative to the 64 byte block addressed by that original input
 999       // and so on for each successive 64 byte block when s is updated
1000       //
1001       // t0 at offset 0,  t1 at offset 8
1002       // t2 at offset 16, t3 at offset 24
1003       // t4 at offset 32, t5 at offset 40
1004       // t6 at offset 48, t7 at offset 56
1005 
1006       // for backwards copy s was not offset so the register contents
1007       // are at these offsets into the preceding 64 byte block
1008       // relative to that original input and so on for each successive
1009       // preceding 64 byte block when s is updated. this explains the
1010       // slightly counter-intuitive looking pattern of register usage
1011       // in the stp instructions for backwards copy.
1012       //
1013       // t0 at offset -16, t1 at offset -8
1014       // t2 at offset -32, t3 at offset -24
1015       // t4 at offset -48, t5 at offset -40
1016       // t6 at offset -64, t7 at offset -56
1017 
1018       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1019       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1020       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1021       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1022 
1023       __ subs(count, count, 16);
1024       __ br(Assembler::LO, drain);
1025 
1026       int prefetch = PrefetchCopyIntervalInBytes;
1027       bool use_stride = false;
1028       if (direction == copy_backwards) {
1029          use_stride = prefetch > 256;
1030          prefetch = -prefetch;
1031          if (use_stride) __ mov(stride, prefetch);
1032       }
1033 
1034       __ bind(again);
1035 
1036       if (PrefetchCopyIntervalInBytes > 0)
1037         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1038 
1039       if (direction == copy_forwards) {
1040        // allowing for the offset of -8 the store instructions place
1041        // registers into the target 64 byte block at the following
1042        // offsets
1043        //
1044        // t0 at offset 0
1045        // t1 at offset 8,  t2 at offset 16
1046        // t3 at offset 24, t4 at offset 32
1047        // t5 at offset 40, t6 at offset 48
1048        // t7 at offset 56
1049 
1050         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1051         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1052         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1053         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1054         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1055         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1056         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1057         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1058         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1059       } else {
1060        // d was not offset when we started so the registers are
1061        // written into the 64 byte block preceding d with the following
1062        // offsets
1063        //
1064        // t1 at offset -8
1065        // t3 at offset -24, t0 at offset -16
1066        // t5 at offset -40, t2 at offset -32
1067        // t7 at offset -56, t4 at offset -48
1068        //                   t6 at offset -64
1069        //
1070        // note that this matches the offsets previously noted for the
1071        // loads
1072 
1073         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1074         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1075         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1076         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1077         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1078         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1079         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1080         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1081         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1082       }
1083 
1084       __ subs(count, count, 8);
1085       __ br(Assembler::HS, again);
1086 
1087       // Drain
1088       //
1089       // this uses the same pattern of offsets and register arguments
1090       // as above
1091       __ bind(drain);
1092       if (direction == copy_forwards) {
1093         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1094         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1095         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1096         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1097         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1098       } else {
1099         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1100         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1101         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1102         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1103         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1104       }
1105       // now we need to copy any remaining part block which may
1106       // include a 4 word subblock and/or a 2 word subblock.
1107       // bits 2 and 1 in the count are the tell-tale for whether we
1108       // have each such subblock
1109       {
1110         Label L1, L2;
1111         __ tbz(count, exact_log2(4), L1);
1112        // this is the same as above but copying only 4 longs hence
1113        // with only one intervening stp between the str instructions
1114        // but note that the offsets and registers still follow the
1115        // same pattern
1116         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1117         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1118         if (direction == copy_forwards) {
1119           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1120           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1121           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1122         } else {
1123           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1124           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1125           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1126         }
1127         __ bind(L1);
1128 
1129         __ tbz(count, 1, L2);
1130        // this is the same as above but copying only 2 longs hence
1131        // there is no intervening stp between the str instructions
1132        // but note that the offset and register patterns are still
1133        // the same
1134         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1135         if (direction == copy_forwards) {
1136           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1137           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1138         } else {
1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1140           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1141         }
1142         __ bind(L2);
1143 
1144        // for forwards copy we need to re-adjust the offsets we
1145        // applied so that s and d follow the last words written
1146 
1147        if (direction == copy_forwards) {
1148          __ add(s, s, 16);
1149          __ add(d, d, 8);
1150        }
1151 
1152       }
1153 
1154       __ ret(lr);
1155       }
1156   }
1157 
1158   // Small copy: less than 16 bytes.
1159   //
1160   // NB: Ignores all of the bits of count which represent more than 15
1161   // bytes, so a caller doesn't have to mask them.
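       //
       // The copy tests successively smaller power-of-two bits of count,
       // moving at most one word, one int, one short and one byte, so no
       // more than 15 bytes are moved in total.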
1162 
1163   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1164     bool is_backwards = step < 0;
1165     size_t granularity = uabs(step);
1166     int direction = is_backwards ? -1 : 1;
1167 
1168     Label Lword, Lint, Lshort, Lbyte;
1169 
1170     assert(granularity
1171            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1172 
1173     const Register t0 = r3;
1174     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1175     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1176 
1177     // ??? I don't know if this bit-test-and-branch is the right thing
1178     // to do.  It does a lot of jumping, resulting in several
1179     // mispredicted branches.  It might make more sense to do this
1180     // with something like Duff's device with a single computed branch.
1181 
1182     __ tbz(count, 3 - exact_log2(granularity), Lword);
1183     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1184     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1185     __ bind(Lword);
1186 
1187     if (granularity <= sizeof (jint)) {
1188       __ tbz(count, 2 - exact_log2(granularity), Lint);
1189       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1190       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1191       __ bind(Lint);
1192     }
1193 
1194     if (granularity <= sizeof (jshort)) {
1195       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1196       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1197       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1198       __ bind(Lshort);
1199     }
1200 
1201     if (granularity <= sizeof (jbyte)) {
1202       __ tbz(count, 0, Lbyte);
1203       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1204       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1205       __ bind(Lbyte);
1206     }
1207   }
1208 
1209   Label copy_f, copy_b;
1210   Label copy_obj_f, copy_obj_b;
1211   Label copy_obj_uninit_f, copy_obj_uninit_b;
1212 
1213   // All-singing all-dancing memory copy.
1214   //
1215   // Copy count units of memory from s to d.  The size of a unit is
1216   // step, which can be positive or negative depending on the direction
1217   // of copy.  If is_aligned is false, we align the source address.
1218   //
1219 
1220   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1221                    Register s, Register d, Register count, int step) {
1222     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1223     bool is_backwards = step < 0;
1224     unsigned int granularity = uabs(step);
1225     const Register t0 = r3, t1 = r4;
1226 
1227     // <= 80 (or 96 for SIMD) bytes are copied inline. Direction doesn't matter because we always
1228     // load all the data before writing anything
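         // The small cases below dispatch on the byte count roughly as
         // follows:
         //   0..16            -> copy16 (then copy8/copy4 for sub-word tails)
         //   17..32           -> copy32
         //   33..64           -> the fall-through four-pair copy
         //   65..80 (96 SIMD) -> copy80
         //   anything larger  -> copy_big, which aligns s and branches to
         //                       the bulk copy_f/copy_b style stubs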
1229     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1230     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1231     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1232     const Register send = r17, dend = r16;
1233     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1234     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1235     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1236 
1237     if (PrefetchCopyIntervalInBytes > 0)
1238       __ prfm(Address(s, 0), PLDL1KEEP);
1239     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1240     __ br(Assembler::HI, copy_big);
1241 
1242     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1243     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1244 
1245     __ cmp(count, u1(16/granularity));
1246     __ br(Assembler::LS, copy16);
1247 
1248     __ cmp(count, u1(64/granularity));
1249     __ br(Assembler::HI, copy80);
1250 
1251     __ cmp(count, u1(32/granularity));
1252     __ br(Assembler::LS, copy32);
1253 
1254     // 33..64 bytes
1255     if (UseSIMDForMemoryOps) {
1256       bs.copy_load_at_32(v0, v1, Address(s, 0));
1257       bs.copy_load_at_32(v2, v3, Address(send, -32));
1258       bs.copy_store_at_32(Address(d, 0), v0, v1);
1259       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1260     } else {
1261       bs.copy_load_at_16(t0, t1, Address(s, 0));
1262       bs.copy_load_at_16(t2, t3, Address(s, 16));
1263       bs.copy_load_at_16(t4, t5, Address(send, -32));
1264       bs.copy_load_at_16(t6, t7, Address(send, -16));
1265 
1266       bs.copy_store_at_16(Address(d, 0), t0, t1);
1267       bs.copy_store_at_16(Address(d, 16), t2, t3);
1268       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1269       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1270     }
1271     __ b(finish);
1272 
1273     // 17..32 bytes
1274     __ bind(copy32);
1275     bs.copy_load_at_16(t0, t1, Address(s, 0));
1276     bs.copy_load_at_16(t6, t7, Address(send, -16));
1277 
1278     bs.copy_store_at_16(Address(d, 0), t0, t1);
1279     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1280     __ b(finish);
1281 
1282     // 65..80/96 bytes
1283     // (96 bytes if SIMD because we do 32 bytes per instruction)
1284     __ bind(copy80);
1285     if (UseSIMDForMemoryOps) {
1286       bs.copy_load_at_32(v0, v1, Address(s, 0));
1287       bs.copy_load_at_32(v2, v3, Address(s, 32));
1288       // Unaligned pointers can be an issue for copying.
1289       // The issue is more likely when the granularity of the data is
1290       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1291       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1292       // The largest performance drop has been seen for the range 65-80 bytes.
1293       // For such cases using a pair of ldp/stp instead of the third pair of
1294       // ldpq/stpq fixes the performance issue.
1295       if (granularity < sizeof (jint)) {
1296         Label copy96;
1297         __ cmp(count, u1(80/granularity));
1298         __ br(Assembler::HI, copy96);
1299         bs.copy_load_at_16(t0, t1, Address(send, -16));
1300 
1301         bs.copy_store_at_32(Address(d, 0), v0, v1);
1302         bs.copy_store_at_32(Address(d, 32), v2, v3);
1303 
1304         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1305         __ b(finish);
1306 
1307         __ bind(copy96);
1308       }
1309       bs.copy_load_at_32(v4, v5, Address(send, -32));
1310 
1311       bs.copy_store_at_32(Address(d, 0), v0, v1);
1312       bs.copy_store_at_32(Address(d, 32), v2, v3);
1313 
1314       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1315     } else {
1316       bs.copy_load_at_16(t0, t1, Address(s, 0));
1317       bs.copy_load_at_16(t2, t3, Address(s, 16));
1318       bs.copy_load_at_16(t4, t5, Address(s, 32));
1319       bs.copy_load_at_16(t6, t7, Address(s, 48));
1320       bs.copy_load_at_16(t8, t9, Address(send, -16));
1321 
1322       bs.copy_store_at_16(Address(d, 0), t0, t1);
1323       bs.copy_store_at_16(Address(d, 16), t2, t3);
1324       bs.copy_store_at_16(Address(d, 32), t4, t5);
1325       bs.copy_store_at_16(Address(d, 48), t6, t7);
1326       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1327     }
1328     __ b(finish);
1329 
1330     // 0..16 bytes
1331     __ bind(copy16);
1332     __ cmp(count, u1(8/granularity));
1333     __ br(Assembler::LO, copy8);
1334 
1335     // 8..16 bytes
1336     bs.copy_load_at_8(t0, Address(s, 0));
1337     bs.copy_load_at_8(t1, Address(send, -8));
1338     bs.copy_store_at_8(Address(d, 0), t0);
1339     bs.copy_store_at_8(Address(dend, -8), t1);
1340     __ b(finish);
1341 
1342     if (granularity < 8) {
1343       // 4..7 bytes
1344       __ bind(copy8);
1345       __ tbz(count, 2 - exact_log2(granularity), copy4);
1346       __ ldrw(t0, Address(s, 0));
1347       __ ldrw(t1, Address(send, -4));
1348       __ strw(t0, Address(d, 0));
1349       __ strw(t1, Address(dend, -4));
1350       __ b(finish);
1351       if (granularity < 4) {
1352         // 0..3 bytes
1353         __ bind(copy4);
1354         __ cbz(count, finish); // get rid of 0 case
1355         if (granularity == 2) {
1356           __ ldrh(t0, Address(s, 0));
1357           __ strh(t0, Address(d, 0));
1358         } else { // granularity == 1
1359           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1360           // the first and last byte.
1361           // Handle the 3 byte case by loading and storing base + count/2
1362           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1363           // This does mean that in the 1 byte case we load/store the same
1364           // byte 3 times.
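               // e.g. count == 3: the first and last loads cover bytes 0
               // and 2, and count >> 1 == 1 picks up the middle byte;
               // count == 1 simply copies byte 0 three times.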
1365           __ lsr(count, count, 1);
1366           __ ldrb(t0, Address(s, 0));
1367           __ ldrb(t1, Address(send, -1));
1368           __ ldrb(t2, Address(s, count));
1369           __ strb(t0, Address(d, 0));
1370           __ strb(t1, Address(dend, -1));
1371           __ strb(t2, Address(d, count));
1372         }
1373         __ b(finish);
1374       }
1375     }
1376 
1377     __ bind(copy_big);
1378     if (is_backwards) {
1379       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1380       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1381     }
1382 
1383     // Now that we've got the small case out of the way, we can align the
1384     // source address on a 2-word boundary.
1385 
1386     // Here we will materialize a count in r15, which is used by copy_memory_small
1387     // and the various generate_copy_longs stubs that we use for 2-word-aligned copies.
1388     // Up until here, we have used t9, which aliases r15, but from here on, that register
1389     // cannot be used as a temp register, as it contains the count.
1390 
1391     Label aligned;
1392 
1393     if (is_aligned) {
1394       // We may have to adjust by 1 word to get s 2-word-aligned.
1395       __ tbz(s, exact_log2(wordSize), aligned);
1396       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1397       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1398       __ sub(count, count, wordSize/granularity);
1399     } else {
1400       if (is_backwards) {
1401         __ andr(r15, s, 2 * wordSize - 1);
1402       } else {
1403         __ neg(r15, s);
1404         __ andr(r15, r15, 2 * wordSize - 1);
1405       }
1406       // r15 is the byte adjustment needed to align s.
1407       __ cbz(r15, aligned);
1408       int shift = exact_log2(granularity);
1409       if (shift > 0) {
1410         __ lsr(r15, r15, shift);
1411       }
1412       __ sub(count, count, r15);
1413 
1414 #if 0
1415       // ?? This code is only correct for a disjoint copy.  It may or
1416       // may not make sense to use it in that case.
1417 
1418       // Copy the first pair; s and d may not be aligned.
1419       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1420       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1421 
1422       // Align s and d, adjust count
1423       if (is_backwards) {
1424         __ sub(s, s, r15);
1425         __ sub(d, d, r15);
1426       } else {
1427         __ add(s, s, r15);
1428         __ add(d, d, r15);
1429       }
1430 #else
1431       copy_memory_small(decorators, type, s, d, r15, step);
1432 #endif
1433     }
1434 
1435     __ bind(aligned);
1436 
1437     // s is now 2-word-aligned.
1438 
1439     // We have a count of units and some trailing bytes. Adjust the
1440     // count and do a bulk copy of words. If the shift is zero
1441     // perform a move instead to benefit from zero latency moves.
1442     int shift = exact_log2(wordSize/granularity);
1443     if (shift > 0) {
1444       __ lsr(r15, count, shift);
1445     } else {
1446       __ mov(r15, count);
1447     }
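         // r15 now holds the number of 8-byte words to copy in bulk; the
         // sub-word tail (the low bits of count) is copied by copy_memory_small
         // after the call below.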
1448     if (direction == copy_forwards) {
1449       if (type != T_OBJECT) {
1450         __ bl(copy_f);
1451       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1452         __ bl(copy_obj_uninit_f);
1453       } else {
1454         __ bl(copy_obj_f);
1455       }
1456     } else {
1457       if (type != T_OBJECT) {
1458         __ bl(copy_b);
1459       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1460         __ bl(copy_obj_uninit_b);
1461       } else {
1462         __ bl(copy_obj_b);
1463       }
1464     }
1465 
1466     // And the tail.
1467     copy_memory_small(decorators, type, s, d, count, step);
1468 
1469     if (granularity >= 8) __ bind(copy8);
1470     if (granularity >= 4) __ bind(copy4);
1471     __ bind(finish);
1472   }
1473 
1474 
1475   void clobber_registers() {
1476 #ifdef ASSERT
1477     RegSet clobbered
1478       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1479     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1480     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1481     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1482       __ mov(*it, rscratch1);
1483     }
1484 #endif
1485 
1486   }
1487 
1488   // Scan over array at a for count oops, verifying each one.
1489   // Preserves a and count, clobbers rscratch1 and rscratch2.
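       // size is the element size in bytes: wordSize for uncompressed oops,
       // 4 (narrow oop) when compressed oops are in use.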
1490   void verify_oop_array (int size, Register a, Register count, Register temp) {
1491     Label loop, end;
1492     __ mov(rscratch1, a);
1493     __ mov(rscratch2, zr);
1494     __ bind(loop);
1495     __ cmp(rscratch2, count);
1496     __ br(Assembler::HS, end);
1497     if (size == wordSize) {
1498       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1499       __ verify_oop(temp);
1500     } else {
1501       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1502       __ decode_heap_oop(temp); // calls verify_oop
1503     }
1504     __ add(rscratch2, rscratch2, 1);
1505     __ b(loop);
1506     __ bind(end);
1507   }
1508 
1509   // Arguments:
1510   //   stub_id - is used to name the stub and identify all details of
1511   //             how to perform the copy.
1512   //
1513   //   entry - is assigned to the stub's post push entry point unless
1514   //           it is null
1515   //
1516   // Inputs:
1517   //   c_rarg0   - source array address
1518   //   c_rarg1   - destination array address
1519   //   c_rarg2   - element count, treated as ssize_t, can be zero
1520   //
1521   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1522   // the hardware handle it.  The two dwords within qwords that span
1523   // cache line boundaries will still be loaded and stored atomically.
1524   //
1525   // Side Effects: entry is set to the (post push) entry point so it
1526   //               can be used by the corresponding conjoint copy
1527   //               method
1528   //
1529   address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
1530     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1531     RegSet saved_reg = RegSet::of(s, d, count);
1532     int size;
1533     bool aligned;
1534     bool is_oop;
1535     bool dest_uninitialized;
1536     switch (stub_id) {
1537     case jbyte_disjoint_arraycopy_id:
1538       size = sizeof(jbyte);
1539       aligned = false;
1540       is_oop = false;
1541       dest_uninitialized = false;
1542       break;
1543     case arrayof_jbyte_disjoint_arraycopy_id:
1544       size = sizeof(jbyte);
1545       aligned = true;
1546       is_oop = false;
1547       dest_uninitialized = false;
1548       break;
1549     case jshort_disjoint_arraycopy_id:
1550       size = sizeof(jshort);
1551       aligned = false;
1552       is_oop = false;
1553       dest_uninitialized = false;
1554       break;
1555     case arrayof_jshort_disjoint_arraycopy_id:
1556       size = sizeof(jshort);
1557       aligned = true;
1558       is_oop = false;
1559       dest_uninitialized = false;
1560       break;
1561     case jint_disjoint_arraycopy_id:
1562       size = sizeof(jint);
1563       aligned = false;
1564       is_oop = false;
1565       dest_uninitialized = false;
1566       break;
1567     case arrayof_jint_disjoint_arraycopy_id:
1568       size = sizeof(jint);
1569       aligned = true;
1570       is_oop = false;
1571       dest_uninitialized = false;
1572       break;
1573     case jlong_disjoint_arraycopy_id:
1574       // since this is always aligned we can (should!) use the same
1575       // stub as for case arrayof_jlong_disjoint_arraycopy
1576       ShouldNotReachHere();
1577       break;
1578     case arrayof_jlong_disjoint_arraycopy_id:
1579       size = sizeof(jlong);
1580       aligned = true;
1581       is_oop = false;
1582       dest_uninitialized = false;
1583       break;
1584     case oop_disjoint_arraycopy_id:
1585       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1586       aligned = !UseCompressedOops;
1587       is_oop = true;
1588       dest_uninitialized = false;
1589       break;
1590     case arrayof_oop_disjoint_arraycopy_id:
1591       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1592       aligned = !UseCompressedOops;
1593       is_oop = true;
1594       dest_uninitialized = false;
1595       break;
1596     case oop_disjoint_arraycopy_uninit_id:
1597       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1598       aligned = !UseCompressedOops;
1599       is_oop = true;
1600       dest_uninitialized = true;
1601       break;
1602     case arrayof_oop_disjoint_arraycopy_uninit_id:
1603       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1604       aligned = !UseCompressedOops;
1605       is_oop = true;
1606       dest_uninitialized = true;
1607       break;
1608     default:
1609       ShouldNotReachHere();
1610       break;
1611     }
1612 
1613     __ align(CodeEntryAlignment);
1614     StubCodeMark mark(this, stub_id);
1615     address start = __ pc();
1616     __ enter();
1617 
1618     if (entry != nullptr) {
1619       *entry = __ pc();
1620       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1621       BLOCK_COMMENT("Entry:");
1622     }
1623 
1624     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1625     if (dest_uninitialized) {
1626       decorators |= IS_DEST_UNINITIALIZED;
1627     }
1628     if (aligned) {
1629       decorators |= ARRAYCOPY_ALIGNED;
1630     }
1631 
1632     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1633     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1634 
1635     if (is_oop) {
1636       // save regs before copy_memory
1637       __ push(RegSet::of(d, count), sp);
1638     }
1639     {
1640       // UnsafeMemoryAccess page error: continue after unsafe access
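           // Note: only the primitive copies reachable from Unsafe (the unaligned
           // variants and the 8-byte jlong copy) can fault on unmapped memory,
           // so only those register an UnsafeMemoryAccess entry.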
1641       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1642       UnsafeMemoryAccessMark umam(this, add_entry, true);
1643       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1644     }
1645 
1646     if (is_oop) {
1647       __ pop(RegSet::of(d, count), sp);
1648       if (VerifyOops)
1649         verify_oop_array(size, d, count, r16);
1650     }
1651 
1652     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1653 
1654     __ leave();
1655     __ mov(r0, zr); // return 0
1656     __ ret(lr);
1657     return start;
1658   }
1659 
1660   // Arguments:
1661   //   stub_id - is used to name the stub and identify all details of
1662   //             how to perform the copy.
1663   //
1664   //   nooverlap_target - identifies the (post push) entry for the
1665   //             corresponding disjoint copy routine which can be
1666   //             jumped to if the ranges do not actually overlap
1667   //
1668   //   entry - is assigned to the stub's post push entry point unless
1669   //           it is null
1670   //
1671   //
1672   // Inputs:
1673   //   c_rarg0   - source array address
1674   //   c_rarg1   - destination array address
1675   //   c_rarg2   - element count, treated as ssize_t, can be zero
1676   //
1677   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1678   // the hardware handle it.  The two dwords within qwords that span
1679   // cache line boundaries will still be loaded and stored atomically.
1680   //
1681   // Side Effects:
1682   //   entry is set to the no-overlap entry point so it can be used by
1683   //   some other conjoint copy method
1684   //
1685   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
1686     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1687     RegSet saved_regs = RegSet::of(s, d, count);
1688     int size;
1689     bool aligned;
1690     bool is_oop;
1691     bool dest_uninitialized;
1692     switch (stub_id) {
1693     case jbyte_arraycopy_id:
1694       size = sizeof(jbyte);
1695       aligned = false;
1696       is_oop = false;
1697       dest_uninitialized = false;
1698       break;
1699     case arrayof_jbyte_arraycopy_id:
1700       size = sizeof(jbyte);
1701       aligned = true;
1702       is_oop = false;
1703       dest_uninitialized = false;
1704       break;
1705     case jshort_arraycopy_id:
1706       size = sizeof(jshort);
1707       aligned = false;
1708       is_oop = false;
1709       dest_uninitialized = false;
1710       break;
1711     case arrayof_jshort_arraycopy_id:
1712       size = sizeof(jshort);
1713       aligned = true;
1714       is_oop = false;
1715       dest_uninitialized = false;
1716       break;
1717     case jint_arraycopy_id:
1718       size = sizeof(jint);
1719       aligned = false;
1720       is_oop = false;
1721       dest_uninitialized = false;
1722       break;
1723     case arrayof_jint_arraycopy_id:
1724       size = sizeof(jint);
1725       aligned = true;
1726       is_oop = false;
1727       dest_uninitialized = false;
1728       break;
1729     case jlong_arraycopy_id:
1730       // since this is always aligned we can (should!) use the same
1731       // stub as for case arrayof_jlong_arraycopy
1732       ShouldNotReachHere();
1733       break;
1734     case arrayof_jlong_arraycopy_id:
1735       size = sizeof(jlong);
1736       aligned = true;
1737       is_oop = false;
1738       dest_uninitialized = false;
1739       break;
1740     case oop_arraycopy_id:
1741       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1742       aligned = !UseCompressedOops;
1743       is_oop = true;
1744       dest_uninitialized = false;
1745       break;
1746     case arrayof_oop_arraycopy_id:
1747       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1748       aligned = !UseCompressedOops;
1749       is_oop = true;
1750       dest_uninitialized = false;
1751       break;
1752     case oop_arraycopy_uninit_id:
1753       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1754       aligned = !UseCompressedOops;
1755       is_oop = true;
1756       dest_uninitialized = true;
1757       break;
1758     case arrayof_oop_arraycopy_uninit_id:
1759       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1760       aligned = !UseCompressedOops;
1761       is_oop = true;
1762       dest_uninitialized = true;
1763       break;
1764     default:
1765       ShouldNotReachHere();
1766     }
1767 
1768     StubCodeMark mark(this, stub_id);
1769     address start = __ pc();
1770     __ enter();
1771 
1772     if (entry != nullptr) {
1773       *entry = __ pc();
1774       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1775       BLOCK_COMMENT("Entry:");
1776     }
1777 
1778     // use fwd copy when (d-s) above_equal (count*size)
1779     __ sub(rscratch1, d, s);
1780     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1781     __ br(Assembler::HS, nooverlap_target);
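         // The unsigned comparison also covers d < s: the subtraction wraps to a
         // large unsigned value, so any non-destructive overlap is routed to the
         // (forward) disjoint stub.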
1782 
1783     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1784     if (dest_uninitialized) {
1785       decorators |= IS_DEST_UNINITIALIZED;
1786     }
1787     if (aligned) {
1788       decorators |= ARRAYCOPY_ALIGNED;
1789     }
1790 
1791     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1792     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1793 
1794     if (is_oop) {
1795       // save regs before copy_memory
1796       __ push(RegSet::of(d, count), sp);
1797     }
1798     {
1799       // UnsafeMemoryAccess page error: continue after unsafe access
1800       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1801       UnsafeMemoryAccessMark umam(this, add_entry, true);
1802       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1803     }
1804     if (is_oop) {
1805       __ pop(RegSet::of(d, count), sp);
1806       if (VerifyOops)
1807         verify_oop_array(size, d, count, r16);
1808     }
1809     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1810     __ leave();
1811     __ mov(r0, zr); // return 0
1812     __ ret(lr);
1813     return start;
1814   }
1815 
1816   // Helper for generating a dynamic type check.
1817   // Smashes rscratch1, rscratch2.
1818   void generate_type_check(Register sub_klass,
1819                            Register super_check_offset,
1820                            Register super_klass,
1821                            Register temp1,
1822                            Register temp2,
1823                            Register result,
1824                            Label& L_success) {
1825     assert_different_registers(sub_klass, super_check_offset, super_klass);
1826 
1827     BLOCK_COMMENT("type_check:");
1828 
1829     Label L_miss;
1830 
1831     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1832                                      super_check_offset);
1833     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
1834 
1835     // Fall through on failure!
1836     __ BIND(L_miss);
1837   }
1838 
1839   //
1840   //  Generate checkcasting array copy stub
1841   //
1842   //  Input:
1843   //    c_rarg0   - source array address
1844   //    c_rarg1   - destination array address
1845   //    c_rarg2   - element count, treated as ssize_t, can be zero
1846   //    c_rarg3   - size_t ckoff (super_check_offset)
1847   //    c_rarg4   - oop ckval (super_klass)
1848   //
1849   //  Output:
1850   //    r0 ==  0  -  success
1851   //    r0 == -1^K - failure, where K is partial transfer count
1852   //
1853   address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
1854     bool dest_uninitialized;
1855     switch (stub_id) {
1856     case checkcast_arraycopy_id:
1857       dest_uninitialized = false;
1858       break;
1859     case checkcast_arraycopy_uninit_id:
1860       dest_uninitialized = true;
1861       break;
1862     default:
1863       ShouldNotReachHere();
1864     }
1865 
1866     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1867 
1868     // Input registers (after setup_arg_regs)
1869     const Register from        = c_rarg0;   // source array address
1870     const Register to          = c_rarg1;   // destination array address
1871     const Register count       = c_rarg2;   // elements count
1872     const Register ckoff       = c_rarg3;   // super_check_offset
1873     const Register ckval       = c_rarg4;   // super_klass
1874 
1875     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1876     RegSet wb_post_saved_regs = RegSet::of(count);
1877 
1878     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1879     const Register copied_oop  = r22;       // actual oop copied
1880     const Register count_save  = r21;       // orig elements count
1881     const Register start_to    = r20;       // destination array start address
1882     const Register r19_klass   = r19;       // oop._klass
1883 
1884     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1885     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1886 
1887     //---------------------------------------------------------------
1888     // Assembler stub will be used for this call to arraycopy
1889     // if the two arrays are subtypes of Object[] but the
1890     // destination array type is not equal to or a supertype
1891     // of the source type.  Each element must be separately
1892     // checked.
1893 
1894     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1895                                copied_oop, r19_klass, count_save);
1896 
1897     __ align(CodeEntryAlignment);
1898     StubCodeMark mark(this, stub_id);
1899     address start = __ pc();
1900 
1901     __ enter(); // required for proper stackwalking of RuntimeStub frame
1902 
1903 #ifdef ASSERT
1904     // caller guarantees that the arrays really are different
1905     // otherwise, we would have to make conjoint checks
1906     { Label L;
1907       __ b(L);                  // conjoint check not yet implemented
1908       __ stop("checkcast_copy within a single array");
1909       __ bind(L);
1910     }
1911 #endif //ASSERT
1912 
1913     // Caller of this entry point must set up the argument registers.
1914     if (entry != nullptr) {
1915       *entry = __ pc();
1916       BLOCK_COMMENT("Entry:");
1917     }
1918 
1919      // Empty array:  Nothing to do.
1920     __ cbz(count, L_done);
1921     __ push(RegSet::of(r19, r20, r21, r22), sp);
1922 
1923 #ifdef ASSERT
1924     BLOCK_COMMENT("assert consistent ckoff/ckval");
1925     // The ckoff and ckval must be mutually consistent,
1926     // even though caller generates both.
1927     { Label L;
1928       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1929       __ ldrw(start_to, Address(ckval, sco_offset));
1930       __ cmpw(ckoff, start_to);
1931       __ br(Assembler::EQ, L);
1932       __ stop("super_check_offset inconsistent");
1933       __ bind(L);
1934     }
1935 #endif //ASSERT
1936 
1937     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1938     bool is_oop = true;
1939     int element_size = UseCompressedOops ? 4 : 8;
1940     if (dest_uninitialized) {
1941       decorators |= IS_DEST_UNINITIALIZED;
1942     }
1943 
1944     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1945     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1946 
1947     // save the original count
1948     __ mov(count_save, count);
1949 
1950     // Copy from low to high addresses
1951     __ mov(start_to, to);              // Save destination array start address
1952     __ b(L_load_element);
1953 
1954     // ======== begin loop ========
1955     // (Loop is rotated; its entry is L_load_element.)
1956     // Loop control:
1957     //   for (; count != 0; count--) {
1958     //     copied_oop = load_heap_oop(from++);
1959     //     ... generate_type_check ...;
1960     //     store_heap_oop(to++, copied_oop);
1961     //   }
1962     __ align(OptoLoopAlignment);
1963 
1964     __ BIND(L_store_element);
1965     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1966                       __ post(to, element_size), copied_oop, noreg,
1967                       gct1, gct2, gct3);
1968     __ sub(count, count, 1);
1969     __ cbz(count, L_do_card_marks);
1970 
1971     // ======== loop entry is here ========
1972     __ BIND(L_load_element);
1973     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1974                      copied_oop, noreg, __ post(from, element_size),
1975                      gct1);
1976     __ cbz(copied_oop, L_store_element);
1977 
1978     __ load_klass(r19_klass, copied_oop);// query the object klass
1979 
1980     BLOCK_COMMENT("type_check:");
1981     generate_type_check(/*sub_klass*/r19_klass,
1982                         /*super_check_offset*/ckoff,
1983                         /*super_klass*/ckval,
1984                         /*r_array_base*/gct1,
1985                         /*temp2*/gct2,
1986                         /*result*/r10, L_store_element);
1987 
1988     // Fall through on failure!
1989 
1990     // ======== end loop ========
1991 
1992     // It was a real error; we must depend on the caller to finish the job.
1993     // Register count = remaining oops, count_orig = total oops.
1994     // Emit GC store barriers for the oops we have copied and report
1995     // their number to the caller.
1996 
1997     __ subs(count, count_save, count);     // K = partially copied oop count
1998     __ eon(count, count, zr);              // report (-1^K) to caller
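         // The flags are still those of the subs above: EQ means K == 0, i.e. no
         // oops were copied, so the card marks can be skipped.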
1999     __ br(Assembler::EQ, L_done_pop);
2000 
2001     __ BIND(L_do_card_marks);
2002     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
2003 
2004     __ bind(L_done_pop);
2005     __ pop(RegSet::of(r19, r20, r21, r22), sp);
2006     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2007 
2008     __ bind(L_done);
2009     __ mov(r0, count);
2010     __ leave();
2011     __ ret(lr);
2012 
2013     return start;
2014   }
2015 
2016   // Perform range checks on the proposed arraycopy.
2017   // Kills temp and rscratch1, but nothing else.
2018   // Also, clears the upper 32 bits of src_pos and dst_pos.
2019   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2020                               Register src_pos, // source position (c_rarg1)
2021                               Register dst,     // destination array oop (c_rarg2)
2022                               Register dst_pos, // destination position (c_rarg3)
2023                               Register length,
2024                               Register temp,
2025                               Label& L_failed) {
2026     BLOCK_COMMENT("arraycopy_range_checks:");
2027 
2028     assert_different_registers(rscratch1, temp);
2029 
2030     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2031     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2032     __ addw(temp, length, src_pos);
2033     __ cmpw(temp, rscratch1);
2034     __ br(Assembler::HI, L_failed);
2035 
2036     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2037     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2038     __ addw(temp, length, dst_pos);
2039     __ cmpw(temp, rscratch1);
2040     __ br(Assembler::HI, L_failed);
2041 
2042     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2043     __ movw(src_pos, src_pos);
2044     __ movw(dst_pos, dst_pos);
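         // (a 32-bit register-to-register move zero-extends, clearing bits 63:32)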
2045 
2046     BLOCK_COMMENT("arraycopy_range_checks done");
2047   }
2048 
2049   // These stubs get called from some dumb test routine.
2050   // I'll write them properly when they're called from
2051   // something that's actually doing something.
2052   static void fake_arraycopy_stub(address src, address dst, int count) {
2053     assert(count == 0, "huh?");
2054   }
2055 
2056 
2057   //
2058   //  Generate 'unsafe' array copy stub
2059   //  Though just as safe as the other stubs, it takes an unscaled
2060   //  size_t argument instead of an element count.
2061   //
2062   //  Input:
2063   //    c_rarg0   - source array address
2064   //    c_rarg1   - destination array address
2065   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2066   //
2067   // Examines the alignment of the operands and dispatches
2068   // to a long, int, short, or byte copy loop.
2069   //
2070   address generate_unsafe_copy(address byte_copy_entry,
2071                                address short_copy_entry,
2072                                address int_copy_entry,
2073                                address long_copy_entry) {
2074     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
2075 
2076     Label L_long_aligned, L_int_aligned, L_short_aligned;
2077     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2078 
2079     __ align(CodeEntryAlignment);
2080     StubCodeMark mark(this, stub_id);
2081     address start = __ pc();
2082     __ enter(); // required for proper stackwalking of RuntimeStub frame
2083 
2084     // bump this on entry, not on exit:
2085     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2086 
2087     __ orr(rscratch1, s, d);
2088     __ orr(rscratch1, rscratch1, count);
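         // rscratch1 is the OR of both addresses and the byte count; its low bits
         // are clear only if all three share that alignment, which selects the
         // widest usable element size below.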
2089 
2090     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2091     __ cbz(rscratch1, L_long_aligned);
2092     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2093     __ cbz(rscratch1, L_int_aligned);
2094     __ tbz(rscratch1, 0, L_short_aligned);
2095     __ b(RuntimeAddress(byte_copy_entry));
2096 
2097     __ BIND(L_short_aligned);
2098     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2099     __ b(RuntimeAddress(short_copy_entry));
2100     __ BIND(L_int_aligned);
2101     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2102     __ b(RuntimeAddress(int_copy_entry));
2103     __ BIND(L_long_aligned);
2104     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2105     __ b(RuntimeAddress(long_copy_entry));
2106 
2107     return start;
2108   }
2109 
2110   //
2111   //  Generate generic array copy stubs
2112   //
2113   //  Input:
2114   //    c_rarg0    -  src oop
2115   //    c_rarg1    -  src_pos (32-bits)
2116   //    c_rarg2    -  dst oop
2117   //    c_rarg3    -  dst_pos (32-bits)
2118   //    c_rarg4    -  element count (32-bits)
2119   //
2120   //  Output:
2121   //    r0 ==  0  -  success
2122   //    r0 == -1^K - failure, where K is partial transfer count
2123   //
2124   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2125                                 address int_copy_entry, address oop_copy_entry,
2126                                 address long_copy_entry, address checkcast_copy_entry) {
2127     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
2128 
2129     Label L_failed, L_objArray;
2130     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2131 
2132     // Input registers
2133     const Register src        = c_rarg0;  // source array oop
2134     const Register src_pos    = c_rarg1;  // source position
2135     const Register dst        = c_rarg2;  // destination array oop
2136     const Register dst_pos    = c_rarg3;  // destination position
2137     const Register length     = c_rarg4;
2138 
2139 
2140     // Registers used as temps
2141     const Register dst_klass  = c_rarg5;
2142 
2143     __ align(CodeEntryAlignment);
2144 
2145     StubCodeMark mark(this, stub_id);
2146 
2147     address start = __ pc();
2148 
2149     __ enter(); // required for proper stackwalking of RuntimeStub frame
2150 
2151     // bump this on entry, not on exit:
2152     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2153 
2154     //-----------------------------------------------------------------------
2155     // Assembler stub will be used for this call to arraycopy
2156     // if the following conditions are met:
2157     //
2158     // (1) src and dst must not be null.
2159     // (2) src_pos must not be negative.
2160     // (3) dst_pos must not be negative.
2161     // (4) length  must not be negative.
2162     // (5) src klass and dst klass should be the same and not null.
2163     // (6) src and dst should be arrays.
2164     // (7) src_pos + length must not exceed length of src.
2165     // (8) dst_pos + length must not exceed length of dst.
2166     //
2167 
2168     //  if (src == nullptr) return -1;
2169     __ cbz(src, L_failed);
2170 
2171     //  if (src_pos < 0) return -1;
2172     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2173 
2174     //  if (dst == nullptr) return -1;
2175     __ cbz(dst, L_failed);
2176 
2177     //  if (dst_pos < 0) return -1;
2178     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2179 
2180     // registers used as temp
2181     const Register scratch_length    = r16; // elements count to copy
2182     const Register scratch_src_klass = r17; // array klass
2183     const Register lh                = r15; // layout helper
2184 
2185     //  if (length < 0) return -1;
2186     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2187     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2188 
2189     __ load_klass(scratch_src_klass, src);
2190 #ifdef ASSERT
2191     //  assert(src->klass() != nullptr);
2192     {
2193       BLOCK_COMMENT("assert klasses not null {");
2194       Label L1, L2;
2195       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2196       __ bind(L1);
2197       __ stop("broken null klass");
2198       __ bind(L2);
2199       __ load_klass(rscratch1, dst);
2200       __ cbz(rscratch1, L1);     // this would be broken also
2201       BLOCK_COMMENT("} assert klasses not null done");
2202     }
2203 #endif
2204 
2205     // Load layout helper (32-bits)
2206     //
2207     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2208     // 32        30    24            16              8     2                 0
2209     //
2210     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2211     //
2212 
2213     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2214 
2215     // Handle objArrays completely differently...
2216     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2217     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2218     __ movw(rscratch1, objArray_lh);
2219     __ eorw(rscratch2, lh, rscratch1);
2220     __ cbzw(rscratch2, L_objArray);
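         // (the XOR is zero iff lh == objArray_lh, i.e. src is an objArray)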
2221 
2222     //  if (src->klass() != dst->klass()) return -1;
2223     __ load_klass(rscratch2, dst);
2224     __ eor(rscratch2, rscratch2, scratch_src_klass);
2225     __ cbnz(rscratch2, L_failed);
2226 
2227     //  if (!src->is_Array()) return -1;
2228     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
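         // Array layout helpers keep the array tag in the top bits, making them
         // negative; a non-negative lh means src is not an array at all.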
2229 
2230     // At this point, it is known to be a typeArray (array_tag 0x3).
2231 #ifdef ASSERT
2232     {
2233       BLOCK_COMMENT("assert primitive array {");
2234       Label L;
2235       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2236       __ cmpw(lh, rscratch2);
2237       __ br(Assembler::GE, L);
2238       __ stop("must be a primitive array");
2239       __ bind(L);
2240       BLOCK_COMMENT("} assert primitive array done");
2241     }
2242 #endif
2243 
2244     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2245                            rscratch2, L_failed);
2246 
2247     // TypeArrayKlass
2248     //
2249     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2250     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2251     //
2252 
2253     const Register rscratch1_offset = rscratch1;    // array offset
2254     const Register r15_elsize = lh; // element size
2255 
2256     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2257            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2258     __ add(src, src, rscratch1_offset);           // src array offset
2259     __ add(dst, dst, rscratch1_offset);           // dst array offset
2260     BLOCK_COMMENT("choose copy loop based on element size");
2261 
2262     // next registers should be set before the jump to corresponding stub
2263     const Register from     = c_rarg0;  // source array address
2264     const Register to       = c_rarg1;  // destination array address
2265     const Register count    = c_rarg2;  // elements count
2266 
2267     // 'from', 'to' and 'count' must be set in exactly this order, since they
2268     // alias 'src', 'src_pos' and 'dst'; any other order clobbers a live input.
2269 
2270     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2271 
2272     // The possible values of elsize are 0-3, i.e. exact_log2(element
2273     // size in bytes).  We do a simple bitwise binary search.
2274   __ BIND(L_copy_bytes);
2275     __ tbnz(r15_elsize, 1, L_copy_ints);
2276     __ tbnz(r15_elsize, 0, L_copy_shorts);
2277     __ lea(from, Address(src, src_pos));// src_addr
2278     __ lea(to,   Address(dst, dst_pos));// dst_addr
2279     __ movw(count, scratch_length); // length
2280     __ b(RuntimeAddress(byte_copy_entry));
2281 
2282   __ BIND(L_copy_shorts);
2283     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2284     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2285     __ movw(count, scratch_length); // length
2286     __ b(RuntimeAddress(short_copy_entry));
2287 
2288   __ BIND(L_copy_ints);
2289     __ tbnz(r15_elsize, 0, L_copy_longs);
2290     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2291     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2292     __ movw(count, scratch_length); // length
2293     __ b(RuntimeAddress(int_copy_entry));
2294 
2295   __ BIND(L_copy_longs);
2296 #ifdef ASSERT
2297     {
2298       BLOCK_COMMENT("assert long copy {");
2299       Label L;
2300       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2301       __ cmpw(r15_elsize, LogBytesPerLong);
2302       __ br(Assembler::EQ, L);
2303       __ stop("must be long copy, but elsize is wrong");
2304       __ bind(L);
2305       BLOCK_COMMENT("} assert long copy done");
2306     }
2307 #endif
2308     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2309     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2310     __ movw(count, scratch_length); // length
2311     __ b(RuntimeAddress(long_copy_entry));
2312 
2313     // ObjArrayKlass
2314   __ BIND(L_objArray);
2315     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2316 
2317     Label L_plain_copy, L_checkcast_copy;
2318     //  test array classes for subtyping
2319     __ load_klass(r15, dst);
2320     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2321     __ br(Assembler::NE, L_checkcast_copy);
2322 
2323     // Identically typed arrays can be copied without element-wise checks.
2324     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2325                            rscratch2, L_failed);
2326 
2327     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2328     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2329     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2330     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2331     __ movw(count, scratch_length); // length
2332   __ BIND(L_plain_copy);
2333     __ b(RuntimeAddress(oop_copy_entry));
2334 
2335   __ BIND(L_checkcast_copy);
2336     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2337     {
2338       // Before looking at dst.length, make sure dst is also an objArray.
2339       __ ldrw(rscratch1, Address(r15, lh_offset));
2340       __ movw(rscratch2, objArray_lh);
2341       __ eorw(rscratch1, rscratch1, rscratch2);
2342       __ cbnzw(rscratch1, L_failed);
2343 
2344       // It is safe to examine both src.length and dst.length.
2345       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2346                              r15, L_failed);
2347 
2348       __ load_klass(dst_klass, dst); // reload
2349 
2350       // Marshal the base address arguments now, freeing registers.
2351       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2352       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2353       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2354       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2355       __ movw(count, length);           // length (reloaded)
2356       Register sco_temp = c_rarg3;      // this register is free now
2357       assert_different_registers(from, to, count, sco_temp,
2358                                  dst_klass, scratch_src_klass);
2359       // assert_clean_int(count, sco_temp);
2360 
2361       // Generate the type check.
2362       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2363       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2364 
2365       // Smashes rscratch1, rscratch2
2366       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2367                           L_plain_copy);
2368 
2369       // Fetch destination element klass from the ObjArrayKlass header.
2370       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2371       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2372       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2373 
2374       // the checkcast_copy loop needs two extra arguments:
2375       assert(c_rarg3 == sco_temp, "#3 already in place");
2376       // Set up arguments for checkcast_copy_entry.
2377       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2378       __ b(RuntimeAddress(checkcast_copy_entry));
2379     }
2380 
2381   __ BIND(L_failed);
2382     __ mov(r0, -1);
2383     __ leave();   // required for proper stackwalking of RuntimeStub frame
2384     __ ret(lr);
2385 
2386     return start;
2387   }
2388 
2389   //
2390   // Generate stub for array fill. If "aligned" is true, the
2391   // "to" address is assumed to be heapword aligned.
2392   //
2393   // Arguments for generated stub:
2394   //   to:    c_rarg0
2395   //   value: c_rarg1
2396   //   count: c_rarg2 treated as signed
2397   //
2398   address generate_fill(StubGenStubId stub_id) {
2399     BasicType t;
2400     bool aligned;
2401 
2402     switch (stub_id) {
2403     case jbyte_fill_id:
2404       t = T_BYTE;
2405       aligned = false;
2406       break;
2407     case jshort_fill_id:
2408       t = T_SHORT;
2409       aligned = false;
2410       break;
2411     case jint_fill_id:
2412       t = T_INT;
2413       aligned = false;
2414       break;
2415     case arrayof_jbyte_fill_id:
2416       t = T_BYTE;
2417       aligned = true;
2418       break;
2419     case arrayof_jshort_fill_id:
2420       t = T_SHORT;
2421       aligned = true;
2422       break;
2423     case arrayof_jint_fill_id:
2424       t = T_INT;
2425       aligned = true;
2426       break;
2427     default:
2428       ShouldNotReachHere();
2429     };
2430 
2431     __ align(CodeEntryAlignment);
2432     StubCodeMark mark(this, stub_id);
2433     address start = __ pc();
2434 
2435     BLOCK_COMMENT("Entry:");
2436 
2437     const Register to        = c_rarg0;  // destination array address
2438     const Register value     = c_rarg1;  // value
2439     const Register count     = c_rarg2;  // elements count
2440 
2441     const Register bz_base = r10;        // base for block_zero routine
2442     const Register cnt_words = r11;      // temp register
2443 
2444     __ enter();
2445 
2446     Label L_fill_elements, L_exit1;
2447 
2448     int shift = -1;
2449     switch (t) {
2450       case T_BYTE:
2451         shift = 0;
2452         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2453         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2454         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2455         __ br(Assembler::LO, L_fill_elements);
2456         break;
2457       case T_SHORT:
2458         shift = 1;
2459         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2460         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2461         __ br(Assembler::LO, L_fill_elements);
2462         break;
2463       case T_INT:
2464         shift = 2;
2465         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2466         __ br(Assembler::LO, L_fill_elements);
2467         break;
2468       default: ShouldNotReachHere();
2469     }
2470 
2471     // Align source address at 8 bytes address boundary.
2472     Label L_skip_align1, L_skip_align2, L_skip_align4;
2473     if (!aligned) {
2474       switch (t) {
2475         case T_BYTE:
2476           // One byte misalignment happens only for byte arrays.
2477           __ tbz(to, 0, L_skip_align1);
2478           __ strb(value, Address(__ post(to, 1)));
2479           __ subw(count, count, 1);
2480           __ bind(L_skip_align1);
2481           // Fallthrough
2482         case T_SHORT:
2483           // Two bytes misalignment happens only for byte and short (char) arrays.
2484           __ tbz(to, 1, L_skip_align2);
2485           __ strh(value, Address(__ post(to, 2)));
2486           __ subw(count, count, 2 >> shift);
2487           __ bind(L_skip_align2);
2488           // Fallthrough
2489         case T_INT:
2490           // Align to 8 bytes, we know we are 4 byte aligned to start.
2491           __ tbz(to, 2, L_skip_align4);
2492           __ strw(value, Address(__ post(to, 4)));
2493           __ subw(count, count, 4 >> shift);
2494           __ bind(L_skip_align4);
2495           break;
2496         default: ShouldNotReachHere();
2497       }
2498     }
2499 
2500     //
2501     //  Fill large chunks
2502     //
2503     __ lsrw(cnt_words, count, 3 - shift); // number of words
2504     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2505     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
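         // count now holds the leftover elements, i.e. strictly less than 8 bytes' worth.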
2506     if (UseBlockZeroing) {
2507       Label non_block_zeroing, rest;
2508       // If the fill value is zero we can use the fast zero_words().
2509       __ cbnz(value, non_block_zeroing);
2510       __ mov(bz_base, to);
2511       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2512       address tpc = __ zero_words(bz_base, cnt_words);
2513       if (tpc == nullptr) {
2514         fatal("CodeCache is full at generate_fill");
2515       }
2516       __ b(rest);
2517       __ bind(non_block_zeroing);
2518       __ fill_words(to, cnt_words, value);
2519       __ bind(rest);
2520     } else {
2521       __ fill_words(to, cnt_words, value);
2522     }
2523 
2524     // Remaining count is less than 8 bytes. Fill it by a single store.
2525     // Note that the total length is no less than 8 bytes.
2526     if (t == T_BYTE || t == T_SHORT) {
2527       Label L_exit1;
2528       __ cbzw(count, L_exit1);
2529       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2530       __ str(value, Address(to, -8));    // overwrite some elements
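           // value has been replicated into all 8 bytes, so a single 8-byte store
           // ending at 'to' covers the tail; re-writing bytes that were already
           // filled is harmless.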
2531       __ bind(L_exit1);
2532       __ leave();
2533       __ ret(lr);
2534     }
2535 
2536     // Handle fills of less than 8 bytes.
2537     Label L_fill_2, L_fill_4, L_exit2;
2538     __ bind(L_fill_elements);
2539     switch (t) {
2540       case T_BYTE:
2541         __ tbz(count, 0, L_fill_2);
2542         __ strb(value, Address(__ post(to, 1)));
2543         __ bind(L_fill_2);
2544         __ tbz(count, 1, L_fill_4);
2545         __ strh(value, Address(__ post(to, 2)));
2546         __ bind(L_fill_4);
2547         __ tbz(count, 2, L_exit2);
2548         __ strw(value, Address(to));
2549         break;
2550       case T_SHORT:
2551         __ tbz(count, 0, L_fill_4);
2552         __ strh(value, Address(__ post(to, 2)));
2553         __ bind(L_fill_4);
2554         __ tbz(count, 1, L_exit2);
2555         __ strw(value, Address(to));
2556         break;
2557       case T_INT:
2558         __ cbzw(count, L_exit2);
2559         __ strw(value, Address(to));
2560         break;
2561       default: ShouldNotReachHere();
2562     }
2563     __ bind(L_exit2);
2564     __ leave();
2565     __ ret(lr);
2566     return start;
2567   }
2568 
2569   address generate_data_cache_writeback() {
2570     const Register line        = c_rarg0;  // address of line to write back
2571 
2572     __ align(CodeEntryAlignment);
2573 
2574     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
2575     StubCodeMark mark(this, stub_id);
2576 
2577     address start = __ pc();
2578     __ enter();
2579     __ cache_wb(Address(line, 0));
2580     __ leave();
2581     __ ret(lr);
2582 
2583     return start;
2584   }
2585 
2586   address generate_data_cache_writeback_sync() {
2587     const Register is_pre     = c_rarg0;  // pre or post sync
2588 
2589     __ align(CodeEntryAlignment);
2590 
2591     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
2592     StubCodeMark mark(this, stub_id);
2593 
2594     // pre wbsync is a no-op
2595     // post wbsync emits a memory barrier
2596 
2597     Label skip;
2598     address start = __ pc();
2599     __ enter();
2600     __ cbnz(is_pre, skip);
2601     __ cache_wbsync(false);
2602     __ bind(skip);
2603     __ leave();
2604     __ ret(lr);
2605 
2606     return start;
2607   }
2608 
2609   void generate_arraycopy_stubs() {
2610     address entry;
2611     address entry_jbyte_arraycopy;
2612     address entry_jshort_arraycopy;
2613     address entry_jint_arraycopy;
2614     address entry_oop_arraycopy;
2615     address entry_jlong_arraycopy;
2616     address entry_checkcast_arraycopy;
2617 
2618     generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
2619     generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
2620 
2621     generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
2622     generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
2623 
2624     generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
2625     generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
2626 
2627     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2628 
2629     //*** jbyte
2630     // Always need aligned and unaligned versions
2631     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
2632     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
2633     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
2634     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
2635 
2636     //*** jshort
2637     // Always need aligned and unaligned versions
2638     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
2639     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
2640     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
2641     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
2642 
2643     //*** jint
2644     // Aligned versions
2645     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
2646     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
2647     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2648     // entry_jint_arraycopy always points to the unaligned version
2649     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
2650     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
2651 
2652     //*** jlong
2653     // It is always aligned
2654     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
2655     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
2656     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2657     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2658 
2659     //*** oops
2660     {
2661       // With compressed oops we need unaligned versions; notice that
2662       // we overwrite entry_oop_arraycopy.
2663       bool aligned = !UseCompressedOops;
2664 
2665       StubRoutines::_arrayof_oop_disjoint_arraycopy
2666         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
2667       StubRoutines::_arrayof_oop_arraycopy
2668         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
2669       // Aligned versions without pre-barriers
2670       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2671         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
2672       StubRoutines::_arrayof_oop_arraycopy_uninit
2673         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
2674     }
2675 
2676     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2677     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2678     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2679     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2680 
2681     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
2682     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
2683 
2684     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
2685                                                               entry_jshort_arraycopy,
2686                                                               entry_jint_arraycopy,
2687                                                               entry_jlong_arraycopy);
2688 
2689     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
2690                                                                entry_jshort_arraycopy,
2691                                                                entry_jint_arraycopy,
2692                                                                entry_oop_arraycopy,
2693                                                                entry_jlong_arraycopy,
2694                                                                entry_checkcast_arraycopy);
2695 
2696     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
2697     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
2698     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
2699     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
2700     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
2701     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
2702   }
2703 
2704   void generate_math_stubs() { Unimplemented(); }
2705 
2706   // Arguments:
2707   //
2708   // Inputs:
2709   //   c_rarg0   - source byte array address
2710   //   c_rarg1   - destination byte array address
2711   //   c_rarg2   - K (key) in little endian int array
2712   //
2713   address generate_aescrypt_encryptBlock() {
2714     __ align(CodeEntryAlignment);
2715     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
2716     StubCodeMark mark(this, stub_id);
2717 
2718     const Register from        = c_rarg0;  // source array address
2719     const Register to          = c_rarg1;  // destination array address
2720     const Register key         = c_rarg2;  // key array address
2721     const Register keylen      = rscratch1;
2722 
2723     address start = __ pc();
2724     __ enter();
2725 
2726     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2727 
2728     __ aesenc_loadkeys(key, keylen);
2729     __ aesecb_encrypt(from, to, keylen);
2730 
2731     __ mov(r0, 0);
2732 
2733     __ leave();
2734     __ ret(lr);
2735 
2736     return start;
2737   }
2738 
2739   // Arguments:
2740   //
2741   // Inputs:
2742   //   c_rarg0   - source byte array address
2743   //   c_rarg1   - destination byte array address
2744   //   c_rarg2   - K (key) in little endian int array
2745   //
2746   address generate_aescrypt_decryptBlock() {
2747     assert(UseAES, "need AES cryptographic extension support");
2748     __ align(CodeEntryAlignment);
2749     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
2750     StubCodeMark mark(this, stub_id);
2751     Label L_doLast;
2752 
2753     const Register from        = c_rarg0;  // source array address
2754     const Register to          = c_rarg1;  // destination array address
2755     const Register key         = c_rarg2;  // key array address
2756     const Register keylen      = rscratch1;
2757 
2758     address start = __ pc();
2759     __ enter(); // required for proper stackwalking of RuntimeStub frame
2760 
2761     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2762 
2763     __ aesecb_decrypt(from, to, key, keylen);
2764 
2765     __ mov(r0, 0);
2766 
2767     __ leave();
2768     __ ret(lr);
2769 
2770     return start;
2771   }
2772 
2773   // Arguments:
2774   //
2775   // Inputs:
2776   //   c_rarg0   - source byte array address
2777   //   c_rarg1   - destination byte array address
2778   //   c_rarg2   - K (key) in little endian int array
2779   //   c_rarg3   - r vector byte array address
2780   //   c_rarg4   - input length
2781   //
2782   // Output:
2783   //   r0        - input length
2784   //
2785   address generate_cipherBlockChaining_encryptAESCrypt() {
2786     assert(UseAES, "need AES cryptographic extension support");
2787     __ align(CodeEntryAlignment);
2788     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
2789     StubCodeMark mark(this, stub_id);
2790 
2791     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2792 
2793     const Register from        = c_rarg0;  // source array address
2794     const Register to          = c_rarg1;  // destination array address
2795     const Register key         = c_rarg2;  // key array address
2796     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV)
2797                                            // and left holding the last ciphertext block
2798     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2799     const Register keylen      = rscratch1;
2800 
2801     address start = __ pc();
2802 
2803       __ enter();
2804 
2805       __ movw(rscratch2, len_reg);
2806 
2807       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
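           // keylen is the expanded key length in 32-bit words:
           // 44 (AES-128), 52 (AES-192) or 60 (AES-256).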
2808 
2809       __ ld1(v0, __ T16B, rvec);
2810 
2811       __ cmpw(keylen, 52);
2812       __ br(Assembler::CC, L_loadkeys_44);
2813       __ br(Assembler::EQ, L_loadkeys_52);
2814 
2815       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2816       __ rev32(v17, __ T16B, v17);
2817       __ rev32(v18, __ T16B, v18);
2818     __ BIND(L_loadkeys_52);
2819       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2820       __ rev32(v19, __ T16B, v19);
2821       __ rev32(v20, __ T16B, v20);
2822     __ BIND(L_loadkeys_44);
2823       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2824       __ rev32(v21, __ T16B, v21);
2825       __ rev32(v22, __ T16B, v22);
2826       __ rev32(v23, __ T16B, v23);
2827       __ rev32(v24, __ T16B, v24);
2828       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2829       __ rev32(v25, __ T16B, v25);
2830       __ rev32(v26, __ T16B, v26);
2831       __ rev32(v27, __ T16B, v27);
2832       __ rev32(v28, __ T16B, v28);
2833       __ ld1(v29, v30, v31, __ T16B, key);
2834       __ rev32(v29, __ T16B, v29);
2835       __ rev32(v30, __ T16B, v30);
2836       __ rev32(v31, __ T16B, v31);
2837 
2838     __ BIND(L_aes_loop);
2839       __ ld1(v1, __ T16B, __ post(from, 16));
2840       __ eor(v0, __ T16B, v0, v1);
2841 
2842       __ br(Assembler::CC, L_rounds_44);
2843       __ br(Assembler::EQ, L_rounds_52);
2844 
2845       __ aese(v0, v17); __ aesmc(v0, v0);
2846       __ aese(v0, v18); __ aesmc(v0, v0);
2847     __ BIND(L_rounds_52);
2848       __ aese(v0, v19); __ aesmc(v0, v0);
2849       __ aese(v0, v20); __ aesmc(v0, v0);
2850     __ BIND(L_rounds_44);
2851       __ aese(v0, v21); __ aesmc(v0, v0);
2852       __ aese(v0, v22); __ aesmc(v0, v0);
2853       __ aese(v0, v23); __ aesmc(v0, v0);
2854       __ aese(v0, v24); __ aesmc(v0, v0);
2855       __ aese(v0, v25); __ aesmc(v0, v0);
2856       __ aese(v0, v26); __ aesmc(v0, v0);
2857       __ aese(v0, v27); __ aesmc(v0, v0);
2858       __ aese(v0, v28); __ aesmc(v0, v0);
2859       __ aese(v0, v29); __ aesmc(v0, v0);
2860       __ aese(v0, v30);
2861       __ eor(v0, __ T16B, v0, v31);
2862 
2863       __ st1(v0, __ T16B, __ post(to, 16));
2864 
2865       __ subw(len_reg, len_reg, 16);
2866       __ cbnzw(len_reg, L_aes_loop);
2867 
2868       __ st1(v0, __ T16B, rvec);
2869 
2870       __ mov(r0, rscratch2);
2871 
2872       __ leave();
2873       __ ret(lr);
2874 
2875     return start;
2876   }
2877 
2878   // Arguments:
2879   //
2880   // Inputs:
2881   //   c_rarg0   - source byte array address
2882   //   c_rarg1   - destination byte array address
2883   //   c_rarg2   - K (key) in little endian int array
2884   //   c_rarg3   - r vector byte array address
2885   //   c_rarg4   - input length
2886   //
2887   // Output:
2888   //   r0        - input length
2889   //
2890   address generate_cipherBlockChaining_decryptAESCrypt() {
2891     assert(UseAES, "need AES cryptographic extension support");
2892     __ align(CodeEntryAlignment);
2893     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
2894     StubCodeMark mark(this, stub_id);
2895 
2896     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2897 
2898     const Register from        = c_rarg0;  // source array address
2899     const Register to          = c_rarg1;  // destination array address
2900     const Register key         = c_rarg2;  // key array address
2901     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2902                                            // and left holding the last input (ciphertext) block on exit
2903     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2904     const Register keylen      = rscratch1;
2905 
2906     address start = __ pc();
2907 
2908       __ enter();
2909 
2910       __ movw(rscratch2, len_reg);
2911 
2912       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2913 
2914       __ ld1(v2, __ T16B, rvec);
2915 
2916       __ ld1(v31, __ T16B, __ post(key, 16));
2917       __ rev32(v31, __ T16B, v31);
2918 
2919       __ cmpw(keylen, 52);
2920       __ br(Assembler::CC, L_loadkeys_44);
2921       __ br(Assembler::EQ, L_loadkeys_52);
2922 
2923       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2924       __ rev32(v17, __ T16B, v17);
2925       __ rev32(v18, __ T16B, v18);
2926     __ BIND(L_loadkeys_52);
2927       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2928       __ rev32(v19, __ T16B, v19);
2929       __ rev32(v20, __ T16B, v20);
2930     __ BIND(L_loadkeys_44);
2931       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2932       __ rev32(v21, __ T16B, v21);
2933       __ rev32(v22, __ T16B, v22);
2934       __ rev32(v23, __ T16B, v23);
2935       __ rev32(v24, __ T16B, v24);
2936       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2937       __ rev32(v25, __ T16B, v25);
2938       __ rev32(v26, __ T16B, v26);
2939       __ rev32(v27, __ T16B, v27);
2940       __ rev32(v28, __ T16B, v28);
2941       __ ld1(v29, v30, __ T16B, key);
2942       __ rev32(v29, __ T16B, v29);
2943       __ rev32(v30, __ T16B, v30);
2944 
2945     __ BIND(L_aes_loop);
2946       __ ld1(v0, __ T16B, __ post(from, 16));
2947       __ orr(v1, __ T16B, v0, v0);
2948 
2949       __ br(Assembler::CC, L_rounds_44);
2950       __ br(Assembler::EQ, L_rounds_52);
2951 
2952       __ aesd(v0, v17); __ aesimc(v0, v0);
2953       __ aesd(v0, v18); __ aesimc(v0, v0);
2954     __ BIND(L_rounds_52);
2955       __ aesd(v0, v19); __ aesimc(v0, v0);
2956       __ aesd(v0, v20); __ aesimc(v0, v0);
2957     __ BIND(L_rounds_44);
2958       __ aesd(v0, v21); __ aesimc(v0, v0);
2959       __ aesd(v0, v22); __ aesimc(v0, v0);
2960       __ aesd(v0, v23); __ aesimc(v0, v0);
2961       __ aesd(v0, v24); __ aesimc(v0, v0);
2962       __ aesd(v0, v25); __ aesimc(v0, v0);
2963       __ aesd(v0, v26); __ aesimc(v0, v0);
2964       __ aesd(v0, v27); __ aesimc(v0, v0);
2965       __ aesd(v0, v28); __ aesimc(v0, v0);
2966       __ aesd(v0, v29); __ aesimc(v0, v0);
2967       __ aesd(v0, v30);
2968       __ eor(v0, __ T16B, v0, v31);
2969       __ eor(v0, __ T16B, v0, v2);
2970 
2971       __ st1(v0, __ T16B, __ post(to, 16));
2972       __ orr(v2, __ T16B, v1, v1);
2973 
2974       __ subw(len_reg, len_reg, 16);
2975       __ cbnzw(len_reg, L_aes_loop);
2976 
2977       __ st1(v2, __ T16B, rvec);
2978 
2979       __ mov(r0, rscratch2);
2980 
2981       __ leave();
2982       __ ret(lr);
2983 
2984     return start;
2985   }
2986 
2987   // Big-endian 128-bit + 64-bit -> 128-bit addition.
2988   // Inputs: in (the 128-bit value) and inc (the 64-bit increment); both are preserved.
2989   // The least-significant 64-bit word is in the upper dword of each vector.
2990   // The lower dword of inc must be zero.
2991   // Output: result
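       //
       // A rough scalar equivalent of the trick used below (variable names are
       // illustrative only, not part of this file): with the counter split into
       // 64-bit halves hi:lo,
       //   new_lo = lo + inc;
       //   new_hi = hi + (new_lo < inc ? 1 : 0);   // propagate the carry
       // The vector code derives the carry with an unsigned compare (cm HI) and
       // then subtracts the resulting all-ones mask from the high dword.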
2992   void be_add_128_64(FloatRegister result, FloatRegister in,
2993                      FloatRegister inc, FloatRegister tmp) {
2994     assert_different_registers(result, tmp, inc);
2995 
2996     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
2997                                            // input
2998     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
2999     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
3000                                            // MSD == 0 (must be!) to LSD
3001     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
3002   }
3003 
3004   // CTR AES crypt.
3005   // Arguments:
3006   //
3007   // Inputs:
3008   //   c_rarg0   - source byte array address
3009   //   c_rarg1   - destination byte array address
3010   //   c_rarg2   - K (key) in little endian int array
3011   //   c_rarg3   - counter vector byte array address
3012   //   c_rarg4   - input length
3013   //   c_rarg5   - saved encryptedCounter start
3014   //   c_rarg6   - saved used length
3015   //
3016   // Output:
3017   //   r0       - input length
3018   //
3019   address generate_counterMode_AESCrypt() {
3020     const Register in = c_rarg0;
3021     const Register out = c_rarg1;
3022     const Register key = c_rarg2;
3023     const Register counter = c_rarg3;
3024     const Register saved_len = c_rarg4, len = r10;
3025     const Register saved_encrypted_ctr = c_rarg5;
3026     const Register used_ptr = c_rarg6, used = r12;
3027 
3028     const Register offset = r7;
3029     const Register keylen = r11;
3030 
3031     const unsigned char block_size = 16;
3032     const int bulk_width = 4;
3033     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3034     // performance with larger data sizes, but it also means that the
3035     // fast path isn't used until you have at least 8 blocks, and up
3036     // to 127 bytes of data will be processed on the slow path. For
3037     // that reason, and also so as not to blow away too much icache, 4
3038     // blocks seems like a sensible compromise.
3039 
3040     // Algorithm:
3041     //
3042     //    if (len == 0) {
3043     //        goto DONE;
3044     //    }
3045     //    int result = len;
3046     //    do {
3047     //        if (used >= blockSize) {
3048     //            if (len >= bulk_width * blockSize) {
3049     //                CTR_large_block();
3050     //                if (len == 0)
3051     //                    goto DONE;
3052     //            }
3053     //            for (;;) {
3054     //                16ByteVector v0 = counter;
3055     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3056     //                used = 0;
3057     //                if (len < blockSize)
3058     //                    break;    /* goto NEXT */
3059     //                16ByteVector v1 = load16Bytes(in, offset);
3060     //                v1 = v1 ^ encryptedCounter;
3061     //                store16Bytes(v1, out, offset);
3062     //                used = blockSize;
3063     //                offset += blockSize;
3064     //                len -= blockSize;
3065     //                if (len == 0)
3066     //                    goto DONE;
3067     //            }
3068     //        }
3069     //      NEXT:
3070     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3071     //        len--;
3072     //    } while (len != 0);
3073     //  DONE:
3074     //    return result;
3075     //
3076     // CTR_large_block()
3077     //    Wide bulk encryption of whole blocks.
3078 
3079     __ align(CodeEntryAlignment);
3080     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
3081     StubCodeMark mark(this, stub_id);
3082     const address start = __ pc();
3083     __ enter();
3084 
3085     Label DONE, CTR_large_block, large_block_return;
3086     __ ldrw(used, Address(used_ptr));
3087     __ cbzw(saved_len, DONE);
3088 
3089     __ mov(len, saved_len);
3090     __ mov(offset, 0);
3091 
3092     // Compute #rounds for AES based on the length of the key array
3093     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3094 
3095     __ aesenc_loadkeys(key, keylen);
3096 
3097     {
3098       Label L_CTR_loop, NEXT;
3099 
3100       __ bind(L_CTR_loop);
3101 
3102       __ cmp(used, block_size);
3103       __ br(__ LO, NEXT);
3104 
3105       // Maybe we have a lot of data
3106       __ subsw(rscratch1, len, bulk_width * block_size);
3107       __ br(__ HS, CTR_large_block);
3108       __ BIND(large_block_return);
3109       __ cbzw(len, DONE);
3110 
3111       // Setup the counter
3112       __ movi(v4, __ T4S, 0);
3113       __ movi(v5, __ T4S, 1);
3114       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3115 
3116       // 128-bit big-endian increment
3117       __ ld1(v0, __ T16B, counter);
3118       __ rev64(v16, __ T16B, v0);
3119       be_add_128_64(v16, v16, v4, /*tmp*/v5);
3120       __ rev64(v16, __ T16B, v16);
3121       __ st1(v16, __ T16B, counter);
3122       // Previous counter value is in v0
3123       // v4 contains { 0, 1 }
3124 
3125       {
3126         // We have fewer than bulk_width blocks of data left. Encrypt
3127         // them one by one until there is less than a full block
3128         // remaining, being careful to save both the encrypted counter
3129         // and the counter.
3130 
3131         Label inner_loop;
3132         __ bind(inner_loop);
3133         // Counter to encrypt is in v0
3134         __ aesecb_encrypt(noreg, noreg, keylen);
3135         __ st1(v0, __ T16B, saved_encrypted_ctr);
3136 
3137         // Do we have a remaining full block?
3138 
3139         __ mov(used, 0);
3140         __ cmp(len, block_size);
3141         __ br(__ LO, NEXT);
3142 
3143         // Yes, we have a full block
3144         __ ldrq(v1, Address(in, offset));
3145         __ eor(v1, __ T16B, v1, v0);
3146         __ strq(v1, Address(out, offset));
3147         __ mov(used, block_size);
3148         __ add(offset, offset, block_size);
3149 
3150         __ subw(len, len, block_size);
3151         __ cbzw(len, DONE);
3152 
3153         // Increment the counter, store it back
3154         __ orr(v0, __ T16B, v16, v16);
3155         __ rev64(v16, __ T16B, v16);
3156         be_add_128_64(v16, v16, v4, /*tmp*/v5);
3157         __ rev64(v16, __ T16B, v16);
3158         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3159 
3160         __ b(inner_loop);
3161       }
3162 
3163       __ BIND(NEXT);
3164 
3165       // Encrypt a single byte, and loop.
3166       // We expect this to be a rare event.
3167       __ ldrb(rscratch1, Address(in, offset));
3168       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3169       __ eor(rscratch1, rscratch1, rscratch2);
3170       __ strb(rscratch1, Address(out, offset));
3171       __ add(offset, offset, 1);
3172       __ add(used, used, 1);
3173       __ subw(len, len, 1);
3174       __ cbnzw(len, L_CTR_loop);
3175     }
3176 
3177     __ bind(DONE);
3178     __ strw(used, Address(used_ptr));
3179     __ mov(r0, saved_len);
3180 
3181     __ leave(); // required for proper stackwalking of RuntimeStub frame
3182     __ ret(lr);
3183 
3184     // Bulk encryption
3185 
3186     __ BIND(CTR_large_block);
3187     assert(bulk_width == 4 || bulk_width == 8, "must be");
3188 
3189     if (bulk_width == 8) {
3190       __ sub(sp, sp, 4 * 16);
3191       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3192     }
3193     __ sub(sp, sp, 4 * 16);
3194     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3195     RegSet saved_regs = (RegSet::of(in, out, offset)
3196                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3197     __ push(saved_regs, sp);
3198     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3199     __ add(in, in, offset);
3200     __ add(out, out, offset);
3201 
3202     // Keys should already be loaded into the correct registers
3203 
3204     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3205     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3206 
3207     // AES/CTR loop
3208     {
3209       Label L_CTR_loop;
3210       __ BIND(L_CTR_loop);
3211 
3212       // Setup the counters
3213       __ movi(v8, __ T4S, 0);
3214       __ movi(v9, __ T4S, 1);
3215       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3216 
3217       for (int i = 0; i < bulk_width; i++) {
3218         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3219         __ rev64(v0_ofs, __ T16B, v16);
3220         be_add_128_64(v16, v16, v8, /*tmp*/v9);
3221       }
3222 
3223       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3224 
3225       // Encrypt the counters
3226       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3227 
3228       if (bulk_width == 8) {
3229         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3230       }
3231 
3232       // XOR the encrypted counters with the inputs
3233       for (int i = 0; i < bulk_width; i++) {
3234         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3235         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3236         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3237       }
3238 
3239       // Write the encrypted data
3240       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3241       if (bulk_width == 8) {
3242         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3243       }
3244 
3245       __ subw(len, len, 16 * bulk_width);
3246       __ cbnzw(len, L_CTR_loop);
3247     }
3248 
3249     // Save the counter back where it goes
3250     __ rev64(v16, __ T16B, v16);
3251     __ st1(v16, __ T16B, counter);
3252 
3253     __ pop(saved_regs, sp);
3254 
3255     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3256     if (bulk_width == 8) {
3257       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3258     }
3259 
3260     __ andr(rscratch1, len, -16 * bulk_width);
3261     __ sub(len, len, rscratch1);
3262     __ add(offset, offset, rscratch1);
3263     __ mov(used, 16);
3264     __ strw(used, Address(used_ptr));
3265     __ b(large_block_return);
3266 
3267     return start;
3268   }
3269 
3270   // Vector AES Galois Counter Mode implementation. Parameters:
3271   //
3272   // in = c_rarg0
3273   // len = c_rarg1
3274   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3275   // out = c_rarg3
3276   // key = c_rarg4
3277   // state = c_rarg5 - GHASH.state
3278   // subkeyHtbl = c_rarg6 - powers of H
3279   // counter = c_rarg7 - 16 bytes of CTR
3280   // return - number of processed bytes
3281   address generate_galoisCounterMode_AESCrypt() {
3282     address ghash_polynomial = __ pc();
3283     __ emit_int64(0x87);  // The low-order bits of the field
3284                           // polynomial (i.e. p = z^7+z^2+z+1)
3285                           // repeated in the low and high parts of a
3286                           // 128-bit vector
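                               // (the full GHASH reduction polynomial is
                               // z^128 + z^7 + z^2 + z + 1; 0x87 encodes
                               // its low-order terms)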
3287     __ emit_int64(0x87);
3288 
3289     __ align(CodeEntryAlignment);
3290     StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id;
3291     StubCodeMark mark(this, stub_id);
3292     address start = __ pc();
3293     __ enter();
3294 
3295     const Register in = c_rarg0;
3296     const Register len = c_rarg1;
3297     const Register ct = c_rarg2;
3298     const Register out = c_rarg3;
3300 
3301     const Register key = c_rarg4;
3302     const Register state = c_rarg5;
3303 
3304     const Register subkeyHtbl = c_rarg6;
3305 
3306     const Register counter = c_rarg7; // 16 bytes of CTR; updated with the incremented counter on exit
3307 
3308     const Register keylen = r10;
3309     // Save state before entering routine
3310     __ sub(sp, sp, 4 * 16);
3311     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3312     __ sub(sp, sp, 4 * 16);
3313     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3314 
3315     // __ andr(len, len, -512);
3316     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3317     __ str(len, __ pre(sp, -2 * wordSize));
3318 
3319     Label DONE;
3320     __ cbz(len, DONE);
3321 
3322     // Compute #rounds for AES based on the length of the key array
3323     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3324 
3325     __ aesenc_loadkeys(key, keylen);
3326     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3327     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3328 
3329     // AES/CTR loop
3330     {
3331       Label L_CTR_loop;
3332       __ BIND(L_CTR_loop);
3333 
3334       // Setup the counters
3335       __ movi(v8, __ T4S, 0);
3336       __ movi(v9, __ T4S, 1);
3337       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3338 
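           // GCM increments only the low-order (big-endian) 32 bits of the
           // counter block (inc32), hence the rev32/addv(T4S) pair here rather
           // than the 128-bit rev64/be_add_128_64 increment used by
           // generate_counterMode_AESCrypt above.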
3339       assert(v0->encoding() < v8->encoding(), "");
3340       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3341         FloatRegister f = as_FloatRegister(i);
3342         __ rev32(f, __ T16B, v16);
3343         __ addv(v16, __ T4S, v16, v8);
3344       }
3345 
3346       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3347 
3348       // Encrypt the counters
3349       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3350 
3351       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3352 
3353       // XOR the encrypted counters with the inputs
3354       for (int i = 0; i < 8; i++) {
3355         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3356         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3357         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3358       }
3359       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3360       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3361 
3362       __ subw(len, len, 16 * 8);
3363       __ cbnzw(len, L_CTR_loop);
3364     }
3365 
3366     __ rev32(v16, __ T16B, v16);
3367     __ st1(v16, __ T16B, counter);
3368 
3369     __ ldr(len, Address(sp));
3370     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3371 
3372     // GHASH/CTR loop
3373     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3374                                 len, /*unrolls*/4);
3375 
3376 #ifdef ASSERT
3377     { Label L;
3378       __ cmp(len, (unsigned char)0);
3379       __ br(Assembler::EQ, L);
3380       __ stop("stubGenerator: abort");
3381       __ bind(L);
3382     }
3383 #endif
3384 
3385     __ bind(DONE);
3386     // Return the number of bytes processed
3387     __ ldr(r0, __ post(sp, 2 * wordSize));
3388 
3389     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3390     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3391 
3392     __ leave(); // required for proper stackwalking of RuntimeStub frame
3393     __ ret(lr);
3394     return start;
3395   }
3396 
3397   class Cached64Bytes {
3398   private:
3399     MacroAssembler *_masm;
3400     Register _regs[8];
3401 
3402   public:
3403     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3404       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3405       auto it = rs.begin();
3406       for (auto &r: _regs) {
3407         r = *it;
3408         ++it;
3409       }
3410     }
3411 
3412     void gen_loads(Register base) {
3413       for (int i = 0; i < 8; i += 2) {
3414         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3415       }
3416     }
3417 
3418     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
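         // For example, extract_u32(dest, 5) copies bits 32..63 of _regs[2],
         // i.e. the sixth little-endian 32-bit word of the cached block.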
3419     void extract_u32(Register dest, int i) {
3420       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3421     }
3422   };
3423 
3424   // Utility routines for md5.
3425   // Clobbers r10 and r11.
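       // The four helpers implement the standard MD5 round functions
       // (RFC 1321):
       //   FF: F(x,y,z) = (x & y) | (~x & z), computed as ((y ^ z) & x) ^ z
       //   GG: G(x,y,z) = (x & z) | (y & ~z), whose two terms have no bits in
       //       common and can therefore be combined with an add
       //   HH: H(x,y,z) = x ^ y ^ z
       //   II: I(x,y,z) = y ^ (x | ~z)
       // Each step computes a = b + rotl32(a + fn(b,c,d) + X[k] + t, s).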
3426   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3427               int k, int s, int t) {
3428     Register rscratch3 = r10;
3429     Register rscratch4 = r11;
3430 
3431     __ eorw(rscratch3, r3, r4);
3432     __ movw(rscratch2, t);
3433     __ andw(rscratch3, rscratch3, r2);
3434     __ addw(rscratch4, r1, rscratch2);
3435     reg_cache.extract_u32(rscratch1, k);
3436     __ eorw(rscratch3, rscratch3, r4);
3437     __ addw(rscratch4, rscratch4, rscratch1);
3438     __ addw(rscratch3, rscratch3, rscratch4);
3439     __ rorw(rscratch2, rscratch3, 32 - s);
3440     __ addw(r1, rscratch2, r2);
3441   }
3442 
3443   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3444               int k, int s, int t) {
3445     Register rscratch3 = r10;
3446     Register rscratch4 = r11;
3447 
3448     reg_cache.extract_u32(rscratch1, k);
3449     __ movw(rscratch2, t);
3450     __ addw(rscratch4, r1, rscratch2);
3451     __ addw(rscratch4, rscratch4, rscratch1);
3452     __ bicw(rscratch2, r3, r4);
3453     __ andw(rscratch3, r2, r4);
3454     __ addw(rscratch2, rscratch2, rscratch4);
3455     __ addw(rscratch2, rscratch2, rscratch3);
3456     __ rorw(rscratch2, rscratch2, 32 - s);
3457     __ addw(r1, rscratch2, r2);
3458   }
3459 
3460   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3461               int k, int s, int t) {
3462     Register rscratch3 = r10;
3463     Register rscratch4 = r11;
3464 
3465     __ eorw(rscratch3, r3, r4);
3466     __ movw(rscratch2, t);
3467     __ addw(rscratch4, r1, rscratch2);
3468     reg_cache.extract_u32(rscratch1, k);
3469     __ eorw(rscratch3, rscratch3, r2);
3470     __ addw(rscratch4, rscratch4, rscratch1);
3471     __ addw(rscratch3, rscratch3, rscratch4);
3472     __ rorw(rscratch2, rscratch3, 32 - s);
3473     __ addw(r1, rscratch2, r2);
3474   }
3475 
3476   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3477               int k, int s, int t) {
3478     Register rscratch3 = r10;
3479     Register rscratch4 = r11;
3480 
3481     __ movw(rscratch3, t);
3482     __ ornw(rscratch2, r2, r4);
3483     __ addw(rscratch4, r1, rscratch3);
3484     reg_cache.extract_u32(rscratch1, k);
3485     __ eorw(rscratch3, rscratch2, r3);
3486     __ addw(rscratch4, rscratch4, rscratch1);
3487     __ addw(rscratch3, rscratch3, rscratch4);
3488     __ rorw(rscratch2, rscratch3, 32 - s);
3489     __ addw(r1, rscratch2, r2);
3490   }
3491 
3492   // Arguments:
3493   //
3494   // Inputs:
3495   //   c_rarg0   - byte[]  source+offset
3496   //   c_rarg1   - int[]   MD5.state
3497   //   c_rarg2   - int     offset
3498   //   c_rarg3   - int     limit
3499   //
3500   address generate_md5_implCompress(StubGenStubId stub_id) {
3501     bool multi_block;
3502     switch (stub_id) {
3503     case md5_implCompress_id:
3504       multi_block = false;
3505       break;
3506     case md5_implCompressMB_id:
3507       multi_block = true;
3508       break;
3509     default:
3510       ShouldNotReachHere();
3511     }
3512     __ align(CodeEntryAlignment);
3513 
3514     StubCodeMark mark(this, stub_id);
3515     address start = __ pc();
3516 
3517     Register buf       = c_rarg0;
3518     Register state     = c_rarg1;
3519     Register ofs       = c_rarg2;
3520     Register limit     = c_rarg3;
3521     Register a         = r4;
3522     Register b         = r5;
3523     Register c         = r6;
3524     Register d         = r7;
3525     Register rscratch3 = r10;
3526     Register rscratch4 = r11;
3527 
3528     Register state_regs[2] = { r12, r13 };
3529     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3530     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3531 
3532     __ push(saved_regs, sp);
3533 
3534     __ ldp(state_regs[0], state_regs[1], Address(state));
3535     __ ubfx(a, state_regs[0],  0, 32);
3536     __ ubfx(b, state_regs[0], 32, 32);
3537     __ ubfx(c, state_regs[1],  0, 32);
3538     __ ubfx(d, state_regs[1], 32, 32);
3539 
3540     Label md5_loop;
3541     __ BIND(md5_loop);
3542 
3543     reg_cache.gen_loads(buf);
3544 
3545     // Round 1
3546     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3547     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3548     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3549     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3550     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3551     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3552     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3553     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3554     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3555     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3556     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3557     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3558     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3559     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3560     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3561     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3562 
3563     // Round 2
3564     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3565     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3566     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3567     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3568     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3569     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3570     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3571     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3572     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3573     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3574     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3575     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3576     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3577     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3578     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3579     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3580 
3581     // Round 3
3582     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3583     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3584     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3585     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3586     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3587     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3588     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3589     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3590     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3591     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3592     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3593     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3594     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3595     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3596     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3597     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3598 
3599     // Round 4
3600     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3601     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3602     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3603     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3604     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3605     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3606     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3607     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3608     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3609     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3610     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3611     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3612     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3613     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3614     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3615     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3616 
3617     __ addw(a, state_regs[0], a);
3618     __ ubfx(rscratch2, state_regs[0], 32, 32);
3619     __ addw(b, rscratch2, b);
3620     __ addw(c, state_regs[1], c);
3621     __ ubfx(rscratch4, state_regs[1], 32, 32);
3622     __ addw(d, rscratch4, d);
3623 
3624     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3625     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3626 
3627     if (multi_block) {
3628       __ add(buf, buf, 64);
3629       __ add(ofs, ofs, 64);
3630       __ cmp(ofs, limit);
3631       __ br(Assembler::LE, md5_loop);
3632       __ mov(c_rarg0, ofs); // return ofs
3633     }
3634 
3635     // write hash values back in the correct order
3636     __ stp(state_regs[0], state_regs[1], Address(state));
3637 
3638     __ pop(saved_regs, sp);
3639 
3640     __ ret(lr);
3641 
3642     return start;
3643   }
3644 
3645   // Arguments:
3646   //
3647   // Inputs:
3648   //   c_rarg0   - byte[]  source+offset
3649   //   c_rarg1   - int[]   SHA.state
3650   //   c_rarg2   - int     offset
3651   //   c_rarg3   - int     limit
3652   //
3653   address generate_sha1_implCompress(StubGenStubId stub_id) {
3654     bool multi_block;
3655     switch (stub_id) {
3656     case sha1_implCompress_id:
3657       multi_block = false;
3658       break;
3659     case sha1_implCompressMB_id:
3660       multi_block = true;
3661       break;
3662     default:
3663       ShouldNotReachHere();
3664     }
3665 
3666     __ align(CodeEntryAlignment);
3667 
3668     StubCodeMark mark(this, stub_id);
3669     address start = __ pc();
3670 
3671     Register buf   = c_rarg0;
3672     Register state = c_rarg1;
3673     Register ofs   = c_rarg2;
3674     Register limit = c_rarg3;
3675 
3676     Label keys;
3677     Label sha1_loop;
3678 
3679     // load the keys into v0..v3
3680     __ adr(rscratch1, keys);
3681     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3682     // load 5 words state into v6, v7
3683     __ ldrq(v6, Address(state, 0));
3684     __ ldrs(v7, Address(state, 16));
3685 
3686 
3687     __ BIND(sha1_loop);
3688     // load 64 bytes of data into v16..v19
3689     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3690     __ rev32(v16, __ T16B, v16);
3691     __ rev32(v17, __ T16B, v17);
3692     __ rev32(v18, __ T16B, v18);
3693     __ rev32(v19, __ T16B, v19);
3694 
3695     // do the sha1
3696     __ addv(v4, __ T4S, v16, v0);
3697     __ orr(v20, __ T16B, v6, v6);
3698 
3699     FloatRegister d0 = v16;
3700     FloatRegister d1 = v17;
3701     FloatRegister d2 = v18;
3702     FloatRegister d3 = v19;
3703 
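         // 80 rounds, four per loop iteration. Per FIPS 180-4 the f-function is
         // Ch for rounds 0..19 (sha1c), Parity for rounds 20..39 and 60..79
         // (sha1p) and Maj for rounds 40..59 (sha1m), hence the < 5 / < 10 /
         // >= 15 tests on the loop index below.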
3704     for (int round = 0; round < 20; round++) {
3705       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3706       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3707       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3708       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3709       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3710 
3711       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3712       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3713       __ sha1h(tmp2, __ T4S, v20);
3714       if (round < 5)
3715         __ sha1c(v20, __ T4S, tmp3, tmp4);
3716       else if (round < 10 || round >= 15)
3717         __ sha1p(v20, __ T4S, tmp3, tmp4);
3718       else
3719         __ sha1m(v20, __ T4S, tmp3, tmp4);
3720       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3721 
3722       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3723     }
3724 
3725     __ addv(v7, __ T2S, v7, v21);
3726     __ addv(v6, __ T4S, v6, v20);
3727 
3728     if (multi_block) {
3729       __ add(ofs, ofs, 64);
3730       __ cmp(ofs, limit);
3731       __ br(Assembler::LE, sha1_loop);
3732       __ mov(c_rarg0, ofs); // return ofs
3733     }
3734 
3735     __ strq(v6, Address(state, 0));
3736     __ strs(v7, Address(state, 16));
3737 
3738     __ ret(lr);
3739 
3740     __ bind(keys);
3741     __ emit_int32(0x5a827999);
3742     __ emit_int32(0x6ed9eba1);
3743     __ emit_int32(0x8f1bbcdc);
3744     __ emit_int32(0xca62c1d6);
3745 
3746     return start;
3747   }
3748 
3749 
3750   // Arguments:
3751   //
3752   // Inputs:
3753   //   c_rarg0   - byte[]  source+offset
3754   //   c_rarg1   - int[]   SHA.state
3755   //   c_rarg2   - int     offset
3756   //   c_rarg3   - int     limit
3757   //
3758   address generate_sha256_implCompress(StubGenStubId stub_id) {
3759     bool multi_block;
3760     switch (stub_id) {
3761     case sha256_implCompress_id:
3762       multi_block = false;
3763       break;
3764     case sha256_implCompressMB_id:
3765       multi_block = true;
3766       break;
3767     default:
3768       ShouldNotReachHere();
3769     }
3770 
3771     static const uint32_t round_consts[64] = {
3772       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3773       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3774       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3775       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3776       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3777       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3778       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3779       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3780       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3781       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3782       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3783       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3784       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3785       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3786       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3787       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3788     };
3789 
3790     __ align(CodeEntryAlignment);
3791 
3792     StubCodeMark mark(this, stub_id);
3793     address start = __ pc();
3794 
3795     Register buf   = c_rarg0;
3796     Register state = c_rarg1;
3797     Register ofs   = c_rarg2;
3798     Register limit = c_rarg3;
3799 
3800     Label sha1_loop;
3801 
3802     __ stpd(v8, v9, __ pre(sp, -32));
3803     __ stpd(v10, v11, Address(sp, 16));
3804 
3805     // dga == v0
3806     // dgb == v1
3807     // dg0 == v2
3808     // dg1 == v3
3809     // dg2 == v4
3810     // t0 == v6
3811     // t1 == v7
3812 
3813     // load 16 keys to v16..v31
3814     __ lea(rscratch1, ExternalAddress((address)round_consts));
3815     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3816     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3817     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3818     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3819 
3820     // load 8 words (256 bits) state
3821     __ ldpq(v0, v1, state);
3822 
3823     __ BIND(sha1_loop);
3824     // load 64 bytes of data into v8..v11
3825     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3826     __ rev32(v8, __ T16B, v8);
3827     __ rev32(v9, __ T16B, v9);
3828     __ rev32(v10, __ T16B, v10);
3829     __ rev32(v11, __ T16B, v11);
3830 
3831     __ addv(v6, __ T4S, v8, v16);
3832     __ orr(v2, __ T16B, v0, v0);
3833     __ orr(v3, __ T16B, v1, v1);
3834 
3835     FloatRegister d0 = v8;
3836     FloatRegister d1 = v9;
3837     FloatRegister d2 = v10;
3838     FloatRegister d3 = v11;
3839 
3840 
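         // 64 rounds, four per loop iteration: each sha256h/sha256h2 pair
         // advances the hash state by four rounds, and for the first 12
         // iterations sha256su0/sha256su1 extend the message schedule.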
3841     for (int round = 0; round < 16; round++) {
3842       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3843       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3844       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3845       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3846 
3847       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3848       __ orr(v4, __ T16B, v2, v2);
3849       if (round < 15)
3850         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3851       __ sha256h(v2, __ T4S, v3, tmp2);
3852       __ sha256h2(v3, __ T4S, v4, tmp2);
3853       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3854 
3855       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3856     }
3857 
3858     __ addv(v0, __ T4S, v0, v2);
3859     __ addv(v1, __ T4S, v1, v3);
3860 
3861     if (multi_block) {
3862       __ add(ofs, ofs, 64);
3863       __ cmp(ofs, limit);
3864       __ br(Assembler::LE, sha1_loop);
3865       __ mov(c_rarg0, ofs); // return ofs
3866     }
3867 
3868     __ ldpd(v10, v11, Address(sp, 16));
3869     __ ldpd(v8, v9, __ post(sp, 32));
3870 
3871     __ stpq(v0, v1, state);
3872 
3873     __ ret(lr);
3874 
3875     return start;
3876   }
3877 
3878   // Double rounds for sha512.
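       // Each call performs two of the 80 SHA-512 rounds, so 40 calls process
       // one block. Message-schedule updates are only needed for drounds 0..31
       // and new round constants are only loaded for drounds 0..35.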
3879   void sha512_dround(int dr,
3880                      FloatRegister vi0, FloatRegister vi1,
3881                      FloatRegister vi2, FloatRegister vi3,
3882                      FloatRegister vi4, FloatRegister vrc0,
3883                      FloatRegister vrc1, FloatRegister vin0,
3884                      FloatRegister vin1, FloatRegister vin2,
3885                      FloatRegister vin3, FloatRegister vin4) {
3886       if (dr < 36) {
3887         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3888       }
3889       __ addv(v5, __ T2D, vrc0, vin0);
3890       __ ext(v6, __ T16B, vi2, vi3, 8);
3891       __ ext(v5, __ T16B, v5, v5, 8);
3892       __ ext(v7, __ T16B, vi1, vi2, 8);
3893       __ addv(vi3, __ T2D, vi3, v5);
3894       if (dr < 32) {
3895         __ ext(v5, __ T16B, vin3, vin4, 8);
3896         __ sha512su0(vin0, __ T2D, vin1);
3897       }
3898       __ sha512h(vi3, __ T2D, v6, v7);
3899       if (dr < 32) {
3900         __ sha512su1(vin0, __ T2D, vin2, v5);
3901       }
3902       __ addv(vi4, __ T2D, vi1, vi3);
3903       __ sha512h2(vi3, __ T2D, vi1, vi0);
3904   }
3905 
3906   // Arguments:
3907   //
3908   // Inputs:
3909   //   c_rarg0   - byte[]  source+offset
3910   //   c_rarg1   - int[]   SHA.state
3911   //   c_rarg2   - int     offset
3912   //   c_rarg3   - int     limit
3913   //
3914   address generate_sha512_implCompress(StubGenStubId stub_id) {
3915     bool multi_block;
3916     switch (stub_id) {
3917     case sha512_implCompress_id:
3918       multi_block = false;
3919       break;
3920     case sha512_implCompressMB_id:
3921       multi_block = true;
3922       break;
3923     default:
3924       ShouldNotReachHere();
3925     }
3926 
3927     static const uint64_t round_consts[80] = {
3928       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3929       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3930       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3931       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3932       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3933       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3934       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3935       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3936       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3937       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3938       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3939       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3940       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3941       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3942       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3943       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3944       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3945       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3946       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3947       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3948       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3949       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3950       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3951       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3952       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3953       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3954       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3955     };
3956 
3957     __ align(CodeEntryAlignment);
3958 
3959     StubCodeMark mark(this, stub_id);
3960     address start = __ pc();
3961 
3962     Register buf   = c_rarg0;
3963     Register state = c_rarg1;
3964     Register ofs   = c_rarg2;
3965     Register limit = c_rarg3;
3966 
3967     __ stpd(v8, v9, __ pre(sp, -64));
3968     __ stpd(v10, v11, Address(sp, 16));
3969     __ stpd(v12, v13, Address(sp, 32));
3970     __ stpd(v14, v15, Address(sp, 48));
3971 
3972     Label sha512_loop;
3973 
3974     // load state
3975     __ ld1(v8, v9, v10, v11, __ T2D, state);
3976 
3977     // load first 4 round constants
3978     __ lea(rscratch1, ExternalAddress((address)round_consts));
3979     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3980 
3981     __ BIND(sha512_loop);
3982     // load 128B of data into v12..v19
3983     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3984     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3985     __ rev64(v12, __ T16B, v12);
3986     __ rev64(v13, __ T16B, v13);
3987     __ rev64(v14, __ T16B, v14);
3988     __ rev64(v15, __ T16B, v15);
3989     __ rev64(v16, __ T16B, v16);
3990     __ rev64(v17, __ T16B, v17);
3991     __ rev64(v18, __ T16B, v18);
3992     __ rev64(v19, __ T16B, v19);
3993 
3994     __ mov(rscratch2, rscratch1);
3995 
3996     __ mov(v0, __ T16B, v8);
3997     __ mov(v1, __ T16B, v9);
3998     __ mov(v2, __ T16B, v10);
3999     __ mov(v3, __ T16B, v11);
4000 
4001     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4002     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4003     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4004     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4005     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4006     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4007     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4008     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4009     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4010     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4011     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4012     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4013     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4014     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4015     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4016     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4017     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4018     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4019     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4020     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4021     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4022     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4023     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4024     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4025     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4026     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4027     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4028     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4029     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4030     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4031     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4032     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4033     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
4034     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
4035     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
4036     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
4037     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
4038     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
4039     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
4040     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
4041 
4042     __ addv(v8, __ T2D, v8, v0);
4043     __ addv(v9, __ T2D, v9, v1);
4044     __ addv(v10, __ T2D, v10, v2);
4045     __ addv(v11, __ T2D, v11, v3);
4046 
4047     if (multi_block) {
4048       __ add(ofs, ofs, 128);
4049       __ cmp(ofs, limit);
4050       __ br(Assembler::LE, sha512_loop);
4051       __ mov(c_rarg0, ofs); // return ofs
4052     }
4053 
4054     __ st1(v8, v9, v10, v11, __ T2D, state);
4055 
4056     __ ldpd(v14, v15, Address(sp, 48));
4057     __ ldpd(v12, v13, Address(sp, 32));
4058     __ ldpd(v10, v11, Address(sp, 16));
4059     __ ldpd(v8, v9, __ post(sp, 64));
4060 
4061     __ ret(lr);
4062 
4063     return start;
4064   }
4065 
4066   // Arguments:
4067   //
4068   // Inputs:
4069   //   c_rarg0   - byte[]  source+offset
4070   //   c_rarg1   - byte[]  SHA.state
4071   //   c_rarg2   - int     block_size
4072   //   c_rarg3   - int     offset
4073   //   c_rarg4   - int     limit
4074   //
4075   address generate_sha3_implCompress(StubGenStubId stub_id) {
4076     bool multi_block;
4077     switch (stub_id) {
4078     case sha3_implCompress_id:
4079       multi_block = false;
4080       break;
4081     case sha3_implCompressMB_id:
4082       multi_block = true;
4083       break;
4084     default:
4085       ShouldNotReachHere();
4086     }
4087 
4088     static const uint64_t round_consts[24] = {
4089       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4090       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4091       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4092       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4093       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4094       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4095       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4096       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4097     };
4098 
4099     __ align(CodeEntryAlignment);
4100 
4101     StubCodeMark mark(this, stub_id);
4102     address start = __ pc();
4103 
4104     Register buf           = c_rarg0;
4105     Register state         = c_rarg1;
4106     Register block_size    = c_rarg2;
4107     Register ofs           = c_rarg3;
4108     Register limit         = c_rarg4;
4109 
4110     Label sha3_loop, rounds24_loop;
4111     Label sha3_512_or_sha3_384, shake128;
4112 
4113     __ stpd(v8, v9, __ pre(sp, -64));
4114     __ stpd(v10, v11, Address(sp, 16));
4115     __ stpd(v12, v13, Address(sp, 32));
4116     __ stpd(v14, v15, Address(sp, 48));
4117 
4118     // load state
4119     __ add(rscratch1, state, 32);
4120     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
4121     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
4122     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4123     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4124     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4125     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4126     __ ld1(v24, __ T1D, rscratch1);
4127 
4128     __ BIND(sha3_loop);
4129 
4130     // 24 keccak rounds
4131     __ movw(rscratch2, 24);
4132 
4133     // load round_constants base
4134     __ lea(rscratch1, ExternalAddress((address) round_consts));
4135 
4136     // load input
4137     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4138     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4139     __ eor(v0, __ T8B, v0, v25);
4140     __ eor(v1, __ T8B, v1, v26);
4141     __ eor(v2, __ T8B, v2, v27);
4142     __ eor(v3, __ T8B, v3, v28);
4143     __ eor(v4, __ T8B, v4, v29);
4144     __ eor(v5, __ T8B, v5, v30);
4145     __ eor(v6, __ T8B, v6, v31);
4146 
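         // Absorb the next block. The rate (block_size, in bytes) identifies
         // the variant: 72 for SHA3-512, 104 for SHA3-384, 136 for
         // SHA3-256/SHAKE256, 144 for SHA3-224 and 168 for SHAKE128.
         // Individual bits of block_size are tested below to decide how much
         // input to xor into the state.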
4147     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4148     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4149 
4150     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4151     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4152     __ eor(v7, __ T8B, v7, v25);
4153     __ eor(v8, __ T8B, v8, v26);
4154     __ eor(v9, __ T8B, v9, v27);
4155     __ eor(v10, __ T8B, v10, v28);
4156     __ eor(v11, __ T8B, v11, v29);
4157     __ eor(v12, __ T8B, v12, v30);
4158     __ eor(v13, __ T8B, v13, v31);
4159 
4160     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4161     __ eor(v14, __ T8B, v14, v25);
4162     __ eor(v15, __ T8B, v15, v26);
4163     __ eor(v16, __ T8B, v16, v27);
4164 
4165     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4166     __ andw(c_rarg5, block_size, 48);
4167     __ cbzw(c_rarg5, rounds24_loop);
4168 
4169     __ tbnz(block_size, 5, shake128);
4170     // block_size == 144, bit5 == 0, SHA3-224
4171     __ ldrd(v28, __ post(buf, 8));
4172     __ eor(v17, __ T8B, v17, v28);
4173     __ b(rounds24_loop);
4174 
4175     __ BIND(shake128);
4176     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4177     __ eor(v17, __ T8B, v17, v28);
4178     __ eor(v18, __ T8B, v18, v29);
4179     __ eor(v19, __ T8B, v19, v30);
4180     __ eor(v20, __ T8B, v20, v31);
4181     __ b(rounds24_loop); // block_size == 168, SHAKE128
4182 
4183     __ BIND(sha3_512_or_sha3_384);
4184     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4185     __ eor(v7, __ T8B, v7, v25);
4186     __ eor(v8, __ T8B, v8, v26);
4187     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4188 
4189     // SHA3-384
4190     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4191     __ eor(v9,  __ T8B, v9,  v27);
4192     __ eor(v10, __ T8B, v10, v28);
4193     __ eor(v11, __ T8B, v11, v29);
4194     __ eor(v12, __ T8B, v12, v30);
4195 
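         // Each iteration is one Keccak-f[1600] round: the eor3/rax1 sequence
         // computes the theta column parities and D values, the xar
         // instructions apply theta and the rho rotations (with pi folded into
         // the register assignment), the bcax groups implement chi, and the
         // final eor with the round constant loaded into v31 is iota.
         // rscratch2 counts the 24 rounds.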
4196     __ BIND(rounds24_loop);
4197     __ subw(rscratch2, rscratch2, 1);
4198 
4199     __ eor3(v29, __ T16B, v4, v9, v14);
4200     __ eor3(v26, __ T16B, v1, v6, v11);
4201     __ eor3(v28, __ T16B, v3, v8, v13);
4202     __ eor3(v25, __ T16B, v0, v5, v10);
4203     __ eor3(v27, __ T16B, v2, v7, v12);
4204     __ eor3(v29, __ T16B, v29, v19, v24);
4205     __ eor3(v26, __ T16B, v26, v16, v21);
4206     __ eor3(v28, __ T16B, v28, v18, v23);
4207     __ eor3(v25, __ T16B, v25, v15, v20);
4208     __ eor3(v27, __ T16B, v27, v17, v22);
4209 
4210     __ rax1(v30, __ T2D, v29, v26);
4211     __ rax1(v26, __ T2D, v26, v28);
4212     __ rax1(v28, __ T2D, v28, v25);
4213     __ rax1(v25, __ T2D, v25, v27);
4214     __ rax1(v27, __ T2D, v27, v29);
4215 
4216     __ eor(v0, __ T16B, v0, v30);
4217     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4218     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4219     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4220     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4221     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4222     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4223     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4224     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4225     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4226     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4227     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4228     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4229     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4230     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4231     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4232     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4233     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4234     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4235     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4236     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4237     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4238     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4239     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4240     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4241 
4242     __ bcax(v20, __ T16B, v31, v22, v8);
4243     __ bcax(v21, __ T16B, v8,  v23, v22);
4244     __ bcax(v22, __ T16B, v22, v24, v23);
4245     __ bcax(v23, __ T16B, v23, v31, v24);
4246     __ bcax(v24, __ T16B, v24, v8,  v31);
4247 
4248     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4249 
4250     __ bcax(v17, __ T16B, v25, v19, v3);
4251     __ bcax(v18, __ T16B, v3,  v15, v19);
4252     __ bcax(v19, __ T16B, v19, v16, v15);
4253     __ bcax(v15, __ T16B, v15, v25, v16);
4254     __ bcax(v16, __ T16B, v16, v3,  v25);
4255 
4256     __ bcax(v10, __ T16B, v29, v12, v26);
4257     __ bcax(v11, __ T16B, v26, v13, v12);
4258     __ bcax(v12, __ T16B, v12, v14, v13);
4259     __ bcax(v13, __ T16B, v13, v29, v14);
4260     __ bcax(v14, __ T16B, v14, v26, v29);
4261 
4262     __ bcax(v7, __ T16B, v30, v9,  v4);
4263     __ bcax(v8, __ T16B, v4,  v5,  v9);
4264     __ bcax(v9, __ T16B, v9,  v6,  v5);
4265     __ bcax(v5, __ T16B, v5,  v30, v6);
4266     __ bcax(v6, __ T16B, v6,  v4,  v30);
4267 
4268     __ bcax(v3, __ T16B, v27, v0,  v28);
4269     __ bcax(v4, __ T16B, v28, v1,  v0);
4270     __ bcax(v0, __ T16B, v0,  v2,  v1);
4271     __ bcax(v1, __ T16B, v1,  v27, v2);
4272     __ bcax(v2, __ T16B, v2,  v28, v27);
4273 
4274     __ eor(v0, __ T16B, v0, v31);
4275 
4276     __ cbnzw(rscratch2, rounds24_loop);
4277 
4278     if (multi_block) {
4279       __ add(ofs, ofs, block_size);
4280       __ cmp(ofs, limit);
4281       __ br(Assembler::LE, sha3_loop);
4282       __ mov(c_rarg0, ofs); // return ofs
4283     }
4284 
4285     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4286     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4287     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4288     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4289     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4290     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4291     __ st1(v24, __ T1D, state);
4292 
4293     __ ldpd(v14, v15, Address(sp, 48));
4294     __ ldpd(v12, v13, Address(sp, 32));
4295     __ ldpd(v10, v11, Address(sp, 16));
4296     __ ldpd(v8, v9, __ post(sp, 64));
4297 
4298     __ ret(lr);
4299 
4300     return start;
4301   }
4302 
4303   /**
4304    *  Arguments:
4305    *
4306    * Inputs:
4307    *   c_rarg0   - int crc
4308    *   c_rarg1   - byte* buf
4309    *   c_rarg2   - int length
4310    *
4311    * Output:
4312    *       r0   - int crc result
4313    */
4314   address generate_updateBytesCRC32() {
4315     assert(UseCRC32Intrinsics, "what are we doing here?");
4316 
4317     __ align(CodeEntryAlignment);
4318     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
4319     StubCodeMark mark(this, stub_id);
4320 
4321     address start = __ pc();
4322 
4323     const Register crc   = c_rarg0;  // crc
4324     const Register buf   = c_rarg1;  // source java byte array address
4325     const Register len   = c_rarg2;  // length
4326     const Register table0 = c_rarg3; // crc_table address
4327     const Register table1 = c_rarg4;
4328     const Register table2 = c_rarg5;
4329     const Register table3 = c_rarg6;
4330     const Register tmp3 = c_rarg7;
4331 
4332     BLOCK_COMMENT("Entry:");
4333     __ enter(); // required for proper stackwalking of RuntimeStub frame
4334 
4335     __ kernel_crc32(crc, buf, len,
4336               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4337 
4338     __ leave(); // required for proper stackwalking of RuntimeStub frame
4339     __ ret(lr);
4340 
4341     return start;
4342   }
4343 
4344   // ChaCha20 block function.  This version parallelizes 4 quarter
4345   // round operations at a time.  It uses 16 SIMD registers to
4346   // produce 4 blocks of key stream.
4347   //
4348   // state (int[16]) = c_rarg0
4349   // keystream (byte[256]) = c_rarg1
4350   // return - number of bytes of keystream (always 256)
4351   //
4352   // In this approach, we load the 512-bit start state sequentially into
4353   // 4 128-bit vectors.  We then make 4 4-vector copies of that starting
4354   // state, with each successive set of 4 vectors having a +1 added into
4355   // the first 32-bit lane of the 4th vector in that group (the counter).
4356   // By doing this, we can perform the block function on 4 512-bit blocks
4357   // within one run of this intrinsic.
4358   // The alignment of the data across the 4-vector group is such that at
4359   // the start it is already aligned for the first round of each two-round
4360   // loop iteration.  In other words, the corresponding lanes of each vector
4361   // will contain the values needed for that quarter round operation (e.g.
4362   // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.).
4363   // In between each full round, a lane shift must occur.  Within a loop
4364   // iteration, between the first and second rounds, the 2nd, 3rd, and 4th
4365   // vectors are rotated left 32, 64 and 96 bits, respectively.  The result
4366   // is effectively a diagonal orientation in columnar form.  After the
4367   // second full round, those registers are left-rotated again, this time
4368   // 96, 64, and 32 bits - returning the vectors to their columnar organization.
4369   // After all 10 iterations, the original state is added to each 4-vector
4370   // working state along with the add mask, and the 4 vector groups are
4371   // sequentially written to the memory dedicated for the output key stream.
4372   //
4373   // For a more detailed explanation, see Goll and Gueron, "Vectorization of
4374   // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology:
4375   // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33
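       //
       // For reference, the scalar ChaCha20 quarter round that each
       // cc20_quarter_round call below performs on four columns (or diagonals)
       // of a block in parallel, one per 32-bit lane, is (shown only as an
       // illustration of what the SIMD code computes):
       //
       //   QR(a, b, c, d):
       //     a += b; d ^= a; d = rotl32(d, 16);
       //     c += d; b ^= c; b = rotl32(b, 12);
       //     a += b; d ^= a; d = rotl32(d,  8);
       //     c += d; b ^= c; b = rotl32(b,  7);
       //
       // where rotl32 is a 32-bit left rotation; the 8-bit rotation is done
       // with the lrot8Tbl shuffle constant described below.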
4376   address generate_chacha20Block_qrpar() {
4377     Label L_Q_twoRounds, L_Q_cc20_const;
4378     // The constant data is broken into two 128-bit segments to be loaded
4379     // onto SIMD registers.  The first 128 bits are a counter add overlay
4380     // that adds +1/+0/+0/+0 to the vectors holding replicated state[12].
4381     // The second 128 bits are a table constant used for 8-bit left rotations
4382     // on 32-bit lanes within a SIMD register.
4383     __ BIND(L_Q_cc20_const);
4384     __ emit_int64(0x0000000000000001UL);
4385     __ emit_int64(0x0000000000000000UL);
4386     __ emit_int64(0x0605040702010003UL);
4387     __ emit_int64(0x0E0D0C0F0A09080BUL);
4388 
4389     __ align(CodeEntryAlignment);
4390     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
4391     StubCodeMark mark(this, stub_id);
4392     address start = __ pc();
4393     __ enter();
4394 
4395     const Register state = c_rarg0;
4396     const Register keystream = c_rarg1;
4397     const Register loopCtr = r10;
4398     const Register tmpAddr = r11;
4399 
4400     const FloatRegister aState = v0;
4401     const FloatRegister bState = v1;
4402     const FloatRegister cState = v2;
4403     const FloatRegister dState = v3;
4404     const FloatRegister a1Vec = v4;
4405     const FloatRegister b1Vec = v5;
4406     const FloatRegister c1Vec = v6;
4407     const FloatRegister d1Vec = v7;
4408     // Skip the callee-saved registers v8 - v15
4409     const FloatRegister a2Vec = v16;
4410     const FloatRegister b2Vec = v17;
4411     const FloatRegister c2Vec = v18;
4412     const FloatRegister d2Vec = v19;
4413     const FloatRegister a3Vec = v20;
4414     const FloatRegister b3Vec = v21;
4415     const FloatRegister c3Vec = v22;
4416     const FloatRegister d3Vec = v23;
4417     const FloatRegister a4Vec = v24;
4418     const FloatRegister b4Vec = v25;
4419     const FloatRegister c4Vec = v26;
4420     const FloatRegister d4Vec = v27;
4421     const FloatRegister scratch = v28;
4422     const FloatRegister addMask = v29;
4423     const FloatRegister lrot8Tbl = v30;
4424 
4425     // Load the initial state in the first 4 quadword registers,
4426     // then copy the initial state into the next 4 quadword registers
4427     // that will be used for the working state.
4428     __ ld1(aState, bState, cState, dState, __ T16B, Address(state));
4429 
4430     // Load tmpAddr with the address of the 2 constant 128-bit data fields.
4431     // The first represents the +1/+0/+0/+0 add mask.  The second is
4432     // the 8-bit left-rotation table.
4433     __ adr(tmpAddr, L_Q_cc20_const);
4434     __ ldpq(addMask, lrot8Tbl, Address(tmpAddr));
4435 
4436     __ mov(a1Vec, __ T16B, aState);
4437     __ mov(b1Vec, __ T16B, bState);
4438     __ mov(c1Vec, __ T16B, cState);
4439     __ mov(d1Vec, __ T16B, dState);
4440 
4441     __ mov(a2Vec, __ T16B, aState);
4442     __ mov(b2Vec, __ T16B, bState);
4443     __ mov(c2Vec, __ T16B, cState);
4444     __ addv(d2Vec, __ T4S, d1Vec, addMask);
4445 
4446     __ mov(a3Vec, __ T16B, aState);
4447     __ mov(b3Vec, __ T16B, bState);
4448     __ mov(c3Vec, __ T16B, cState);
4449     __ addv(d3Vec, __ T4S, d2Vec, addMask);
4450 
4451     __ mov(a4Vec, __ T16B, aState);
4452     __ mov(b4Vec, __ T16B, bState);
4453     __ mov(c4Vec, __ T16B, cState);
4454     __ addv(d4Vec, __ T4S, d3Vec, addMask);
4455 
4456     // Set up the 10 iteration loop
4457     __ mov(loopCtr, 10);
4458     __ BIND(L_Q_twoRounds);
4459 
4460     // The first set of operations on the vectors covers the first 4 quarter
4461     // round operations:
4462     //  Qround(state, 0, 4, 8,12)
4463     //  Qround(state, 1, 5, 9,13)
4464     //  Qround(state, 2, 6,10,14)
4465     //  Qround(state, 3, 7,11,15)
4466     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
4467     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
4468     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
4469     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
4470 
4471     // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to
4472     // diagonals. The a1Vec does not need to change orientation.
4473     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true);
4474     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true);
4475     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true);
4476     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true);
4477 
4478     // The second set of operations on the vectors covers the second 4 quarter
4479     // round operations, now acting on the diagonals:
4480     //  Qround(state, 0, 5,10,15)
4481     //  Qround(state, 1, 6,11,12)
4482     //  Qround(state, 2, 7, 8,13)
4483     //  Qround(state, 3, 4, 9,14)
4484     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
4485     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
4486     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
4487     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
4488 
4489     // Before we start the next iteration, we need to perform shuffles
4490     // on the b/c/d vectors to move them back to columnar organizations
4491     // from their current diagonal orientation.
4492     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false);
4493     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false);
4494     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false);
4495     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false);
4496 
4497     // Decrement and iterate
4498     __ sub(loopCtr, loopCtr, 1);
4499     __ cbnz(loopCtr, L_Q_twoRounds);
4500 
4501     // Once the counter reaches zero, we fall out of the loop
4502     // and need to add the initial state back into each of the four
4503     // working-state vector groups.  The counter adjustments below are
4504     // destructive on the dState register, but we will no longer need it.
4505     __ addv(a1Vec, __ T4S, a1Vec, aState);
4506     __ addv(b1Vec, __ T4S, b1Vec, bState);
4507     __ addv(c1Vec, __ T4S, c1Vec, cState);
4508     __ addv(d1Vec, __ T4S, d1Vec, dState);
4509 
4510     __ addv(a2Vec, __ T4S, a2Vec, aState);
4511     __ addv(b2Vec, __ T4S, b2Vec, bState);
4512     __ addv(c2Vec, __ T4S, c2Vec, cState);
4513     __ addv(dState, __ T4S, dState, addMask);
4514     __ addv(d2Vec, __ T4S, d2Vec, dState);
4515 
4516     __ addv(a3Vec, __ T4S, a3Vec, aState);
4517     __ addv(b3Vec, __ T4S, b3Vec, bState);
4518     __ addv(c3Vec, __ T4S, c3Vec, cState);
4519     __ addv(dState, __ T4S, dState, addMask);
4520     __ addv(d3Vec, __ T4S, d3Vec, dState);
4521 
4522     __ addv(a4Vec, __ T4S, a4Vec, aState);
4523     __ addv(b4Vec, __ T4S, b4Vec, bState);
4524     __ addv(c4Vec, __ T4S, c4Vec, cState);
4525     __ addv(dState, __ T4S, dState, addMask);
4526     __ addv(d4Vec, __ T4S, d4Vec, dState);
4527 
4528     // Write the final state back to the result buffer
4529     __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64));
4530     __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64));
4531     __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64));
4532     __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64));
4533 
4534     __ mov(r0, 256);             // Return length of output keystream
4535     __ leave();
4536     __ ret(lr);
4537 
4538     return start;
4539   }
4540 
4541   /**
4542    *  Arguments:
4543    *
4544    * Inputs:
4545    *   c_rarg0   - int crc
4546    *   c_rarg1   - byte* buf
4547    *   c_rarg2   - int length
4548    *   c_rarg3   - int* table
4549    *
4550    * Output:
4551    *       r0   - int crc result
4552    */
4553   address generate_updateBytesCRC32C() {
4554     assert(UseCRC32CIntrinsics, "what are we doing here?");
4555 
4556     __ align(CodeEntryAlignment);
4557     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
4558     StubCodeMark mark(this, stub_id);
4559 
4560     address start = __ pc();
4561 
4562     const Register crc   = c_rarg0;  // crc
4563     const Register buf   = c_rarg1;  // source java byte array address
4564     const Register len   = c_rarg2;  // length
4565     const Register table0 = c_rarg3; // crc_table address
4566     const Register table1 = c_rarg4;
4567     const Register table2 = c_rarg5;
4568     const Register table3 = c_rarg6;
4569     const Register tmp3 = c_rarg7;
4570 
4571     BLOCK_COMMENT("Entry:");
4572     __ enter(); // required for proper stackwalking of RuntimeStub frame
4573 
4574     __ kernel_crc32c(crc, buf, len,
4575               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4576 
4577     __ leave(); // required for proper stackwalking of RuntimeStub frame
4578     __ ret(lr);
4579 
4580     return start;
4581   }
4582 
4583   /**
4584    *  Arguments:
4585    *
4586    *  Inputs:
4587    *   c_rarg0   - int   adler
4588    *   c_rarg1   - byte* buff
4589    *   c_rarg2   - int   len
4590    *
4591    * Output:
4592    *   c_rarg0   - int adler result
4593    */
4594   address generate_updateBytesAdler32() {
4595     __ align(CodeEntryAlignment);
4596     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
4597     StubCodeMark mark(this, stub_id);
4598     address start = __ pc();
4599 
4600     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4601 
4602     // Aliases
4603     Register adler  = c_rarg0;
4604     Register s1     = c_rarg0;
4605     Register s2     = c_rarg3;
4606     Register buff   = c_rarg1;
4607     Register len    = c_rarg2;
4608     Register nmax  = r4;
4609     Register base  = r5;
4610     Register count = r6;
4611     Register temp0 = rscratch1;
4612     Register temp1 = rscratch2;
4613     FloatRegister vbytes = v0;
4614     FloatRegister vs1acc = v1;
4615     FloatRegister vs2acc = v2;
4616     FloatRegister vtable = v3;
4617 
4618     // Max number of bytes we can process before having to take the mod
4619     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4620     uint64_t BASE = 0xfff1;
4621     uint64_t NMAX = 0x15B0;
4622 
4623     __ mov(base, BASE);
4624     __ mov(nmax, NMAX);
4625 
4626     // Load accumulation coefficients for the upper 16 bits
4627     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4628     __ ld1(vtable, __ T16B, Address(temp0));
4629 
4630     // s1 is initialized to the lower 16 bits of adler
4631     // s2 is initialized to the upper 16 bits of adler
4632     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4633     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4634 
4635     // The pipelined loop needs at least 16 elements for one iteration.
4636     // It does check this itself, but it is more efficient to skip to the cleanup loop here.
4637     __ cmp(len, (u1)16);
4638     __ br(Assembler::HS, L_nmax);
4639     __ cbz(len, L_combine);
4640 
4641     __ bind(L_simple_by1_loop);
4642     __ ldrb(temp0, Address(__ post(buff, 1)));
4643     __ add(s1, s1, temp0);
4644     __ add(s2, s2, s1);
4645     __ subs(len, len, 1);
4646     __ br(Assembler::HI, L_simple_by1_loop);
4647 
4648     // s1 = s1 % BASE
4649     __ subs(temp0, s1, base);
4650     __ csel(s1, temp0, s1, Assembler::HS);
4651 
4652     // s2 = s2 % BASE
4653     __ lsr(temp0, s2, 16);
4654     __ lsl(temp1, temp0, 4);
4655     __ sub(temp1, temp1, temp0);
4656     __ add(s2, temp1, s2, ext::uxth);
4657 
4658     __ subs(temp0, s2, base);
4659     __ csel(s2, temp0, s2, Assembler::HS);
4660 
4661     __ b(L_combine);
4662 
4663     __ bind(L_nmax);
4664     __ subs(len, len, nmax);
4665     __ sub(count, nmax, 16);
4666     __ br(Assembler::LO, L_by16);
4667 
4668     __ bind(L_nmax_loop);
4669 
4670     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4671                                       vbytes, vs1acc, vs2acc, vtable);
4672 
4673     __ subs(count, count, 16);
4674     __ br(Assembler::HS, L_nmax_loop);
4675 
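         // The reductions below rely on 2^16 mod BASE == 15 (65536 - 65521):
         //   s mod BASE == ((s >> 16) * 15 + (s & 0xffff)) mod BASE
         // Folding the high half in twice (the lsr/lsl/sub/add sequences) leaves
         // a value below 2 * BASE, so one conditional subtract completes the
         // reduction.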
4676     // s1 = s1 % BASE
4677     __ lsr(temp0, s1, 16);
4678     __ lsl(temp1, temp0, 4);
4679     __ sub(temp1, temp1, temp0);
4680     __ add(temp1, temp1, s1, ext::uxth);
4681 
4682     __ lsr(temp0, temp1, 16);
4683     __ lsl(s1, temp0, 4);
4684     __ sub(s1, s1, temp0);
4685     __ add(s1, s1, temp1, ext::uxth);
4686 
4687     __ subs(temp0, s1, base);
4688     __ csel(s1, temp0, s1, Assembler::HS);
4689 
4690     // s2 = s2 % BASE
4691     __ lsr(temp0, s2, 16);
4692     __ lsl(temp1, temp0, 4);
4693     __ sub(temp1, temp1, temp0);
4694     __ add(temp1, temp1, s2, ext::uxth);
4695 
4696     __ lsr(temp0, temp1, 16);
4697     __ lsl(s2, temp0, 4);
4698     __ sub(s2, s2, temp0);
4699     __ add(s2, s2, temp1, ext::uxth);
4700 
4701     __ subs(temp0, s2, base);
4702     __ csel(s2, temp0, s2, Assembler::HS);
4703 
4704     __ subs(len, len, nmax);
4705     __ sub(count, nmax, 16);
4706     __ br(Assembler::HS, L_nmax_loop);
4707 
4708     __ bind(L_by16);
4709     __ adds(len, len, count);
4710     __ br(Assembler::LO, L_by1);
4711 
4712     __ bind(L_by16_loop);
4713 
4714     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4715                                       vbytes, vs1acc, vs2acc, vtable);
4716 
4717     __ subs(len, len, 16);
4718     __ br(Assembler::HS, L_by16_loop);
4719 
4720     __ bind(L_by1);
4721     __ adds(len, len, 15);
4722     __ br(Assembler::LO, L_do_mod);
4723 
4724     __ bind(L_by1_loop);
4725     __ ldrb(temp0, Address(__ post(buff, 1)));
4726     __ add(s1, temp0, s1);
4727     __ add(s2, s2, s1);
4728     __ subs(len, len, 1);
4729     __ br(Assembler::HS, L_by1_loop);
4730 
4731     __ bind(L_do_mod);
4732     // s1 = s1 % BASE
4733     __ lsr(temp0, s1, 16);
4734     __ lsl(temp1, temp0, 4);
4735     __ sub(temp1, temp1, temp0);
4736     __ add(temp1, temp1, s1, ext::uxth);
4737 
4738     __ lsr(temp0, temp1, 16);
4739     __ lsl(s1, temp0, 4);
4740     __ sub(s1, s1, temp0);
4741     __ add(s1, s1, temp1, ext::uxth);
4742 
4743     __ subs(temp0, s1, base);
4744     __ csel(s1, temp0, s1, Assembler::HS);
4745 
4746     // s2 = s2 % BASE
4747     __ lsr(temp0, s2, 16);
4748     __ lsl(temp1, temp0, 4);
4749     __ sub(temp1, temp1, temp0);
4750     __ add(temp1, temp1, s2, ext::uxth);
4751 
4752     __ lsr(temp0, temp1, 16);
4753     __ lsl(s2, temp0, 4);
4754     __ sub(s2, s2, temp0);
4755     __ add(s2, s2, temp1, ext::uxth);
4756 
4757     __ subs(temp0, s2, base);
4758     __ csel(s2, temp0, s2, Assembler::HS);
4759 
4760     // Combine lower bits and higher bits
4761     __ bind(L_combine);
4762     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4763 
4764     __ ret(lr);
4765 
4766     return start;
4767   }
4768 
4769   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4770           Register temp0, Register temp1, FloatRegister vbytes,
4771           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4772     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4773     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4774     // In non-vectorized code, we update s1 and s2 as:
4775     //   s1 <- s1 + b1
4776     //   s2 <- s2 + s1
4777     //   s1 <- s1 + b2
4778     //   s2 <- s2 + s1
4779     //   ...
4780     //   s1 <- s1 + b16
4781     //   s2 <- s2 + s1
4782     // Putting the above assignments together, we have:
4783     //   s1_new = s1 + b1 + b2 + ... + b16
4784     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4785     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4786     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4787     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4788 
4789     // s2 = s2 + s1 * 16
4790     __ add(s2, s2, s1, Assembler::LSL, 4);
4791 
4792     // vs1acc = b1 + b2 + b3 + ... + b16
4793     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4794     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4795     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4796     __ uaddlv(vs1acc, __ T16B, vbytes);
4797     __ uaddlv(vs2acc, __ T8H, vs2acc);
4798 
4799     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4800     __ fmovd(temp0, vs1acc);
4801     __ fmovd(temp1, vs2acc);
4802     __ add(s1, s1, temp0);
4803     __ add(s2, s2, temp1);
4804   }
4805 
4806   /**
4807    *  Arguments:
4808    *
4809    *  Input:
4810    *    c_rarg0   - x address
4811    *    c_rarg1   - x length
4812    *    c_rarg2   - y address
4813    *    c_rarg3   - y length
4814    *    c_rarg4   - z address
4815    */
4816   address generate_multiplyToLen() {
4817     __ align(CodeEntryAlignment);
4818     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
4819     StubCodeMark mark(this, stub_id);
4820 
4821     address start = __ pc();
4822     const Register x     = r0;
4823     const Register xlen  = r1;
4824     const Register y     = r2;
4825     const Register ylen  = r3;
4826     const Register z     = r4;
4827 
4828     const Register tmp0  = r5;
4829     const Register tmp1  = r10;
4830     const Register tmp2  = r11;
4831     const Register tmp3  = r12;
4832     const Register tmp4  = r13;
4833     const Register tmp5  = r14;
4834     const Register tmp6  = r15;
4835     const Register tmp7  = r16;
4836 
4837     BLOCK_COMMENT("Entry:");
4838     __ enter(); // required for proper stackwalking of RuntimeStub frame
4839     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4840     __ leave(); // required for proper stackwalking of RuntimeStub frame
4841     __ ret(lr);
4842 
4843     return start;
4844   }
4845 
4846   address generate_squareToLen() {
4847     // The squareToLen algorithm for sizes 1..127, described in Java code,
4848     // works faster than multiply_to_len on some CPUs and slower on others,
4849     // but multiply_to_len shows slightly better overall results.
4850     __ align(CodeEntryAlignment);
4851     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
4852     StubCodeMark mark(this, stub_id);
4853     address start = __ pc();
4854 
4855     const Register x     = r0;
4856     const Register xlen  = r1;
4857     const Register z     = r2;
4858     const Register y     = r4; // == x
4859     const Register ylen  = r5; // == xlen
4860 
4861     const Register tmp0  = r3;
4862     const Register tmp1  = r10;
4863     const Register tmp2  = r11;
4864     const Register tmp3  = r12;
4865     const Register tmp4  = r13;
4866     const Register tmp5  = r14;
4867     const Register tmp6  = r15;
4868     const Register tmp7  = r16;
4869 
4870     RegSet spilled_regs = RegSet::of(y, ylen);
4871     BLOCK_COMMENT("Entry:");
4872     __ enter();
4873     __ push(spilled_regs, sp);
4874     __ mov(y, x);
4875     __ mov(ylen, xlen);
4876     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4877     __ pop(spilled_regs, sp);
4878     __ leave();
4879     __ ret(lr);
4880     return start;
4881   }
4882 
4883   address generate_mulAdd() {
4884     __ align(CodeEntryAlignment);
4885     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
4886     StubCodeMark mark(this, stub_id);
4887 
4888     address start = __ pc();
4889 
4890     const Register out     = r0;
4891     const Register in      = r1;
4892     const Register offset  = r2;
4893     const Register len     = r3;
4894     const Register k       = r4;
4895 
4896     BLOCK_COMMENT("Entry:");
4897     __ enter();
4898     __ mul_add(out, in, offset, len, k);
4899     __ leave();
4900     __ ret(lr);
4901 
4902     return start;
4903   }
4904 
4905   // Arguments:
4906   //
4907   // Input:
4908   //   c_rarg0   - newArr address
4909   //   c_rarg1   - oldArr address
4910   //   c_rarg2   - newIdx
4911   //   c_rarg3   - shiftCount
4912   //   c_rarg4   - numIter
4913   //
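       // As generated below, each 32-bit output word combines two adjacent
       // input words (assuming 0 < shiftCount < 32):
       //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
       //                      | (oldArr[i] << (32 - shiftCount))
       // processed from the highest index downwards, four words at a time in
       // the SIMD loop with two-word and one-word tails.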
4914   address generate_bigIntegerRightShift() {
4915     __ align(CodeEntryAlignment);
4916     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
4917     StubCodeMark mark(this, stub_id);
4918     address start = __ pc();
4919 
4920     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4921 
4922     Register newArr        = c_rarg0;
4923     Register oldArr        = c_rarg1;
4924     Register newIdx        = c_rarg2;
4925     Register shiftCount    = c_rarg3;
4926     Register numIter       = c_rarg4;
4927     Register idx           = numIter;
4928 
4929     Register newArrCur     = rscratch1;
4930     Register shiftRevCount = rscratch2;
4931     Register oldArrCur     = r13;
4932     Register oldArrNext    = r14;
4933 
4934     FloatRegister oldElem0        = v0;
4935     FloatRegister oldElem1        = v1;
4936     FloatRegister newElem         = v2;
4937     FloatRegister shiftVCount     = v3;
4938     FloatRegister shiftVRevCount  = v4;
4939 
4940     __ cbz(idx, Exit);
4941 
4942     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4943 
4944     // left shift count
4945     __ movw(shiftRevCount, 32);
4946     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4947 
4948     // numIter is too small for a 4-word SIMD loop; fall back to the scalar code
4949     __ cmp(numIter, (u1)4);
4950     __ br(Assembler::LT, ShiftThree);
4951 
4952     __ dup(shiftVCount,    __ T4S, shiftCount);
4953     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4954     __ negr(shiftVCount,   __ T4S, shiftVCount);
4955 
4956     __ BIND(ShiftSIMDLoop);
4957 
4958     // Calculate the load addresses
4959     __ sub(idx, idx, 4);
4960     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4961     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4962     __ add(oldArrCur,  oldArrNext, 4);
4963 
4964     // Load 4 words and process
4965     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4966     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4967     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4968     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4969     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4970     __ st1(newElem,   __ T4S,  Address(newArrCur));
4971 
4972     __ cmp(idx, (u1)4);
4973     __ br(Assembler::LT, ShiftTwoLoop);
4974     __ b(ShiftSIMDLoop);
4975 
4976     __ BIND(ShiftTwoLoop);
4977     __ cbz(idx, Exit);
4978     __ cmp(idx, (u1)1);
4979     __ br(Assembler::EQ, ShiftOne);
4980 
4981     // Calculate the load addresses
4982     __ sub(idx, idx, 2);
4983     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4984     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4985     __ add(oldArrCur,  oldArrNext, 4);
4986 
4987     // Load 2 words and process
4988     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4989     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4990     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4991     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4992     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4993     __ st1(newElem,   __ T2S, Address(newArrCur));
4994     __ b(ShiftTwoLoop);
4995 
4996     __ BIND(ShiftThree);
4997     __ tbz(idx, 1, ShiftOne);
4998     __ tbz(idx, 0, ShiftTwo);
4999     __ ldrw(r10,  Address(oldArr, 12));
5000     __ ldrw(r11,  Address(oldArr, 8));
5001     __ lsrvw(r10, r10, shiftCount);
5002     __ lslvw(r11, r11, shiftRevCount);
5003     __ orrw(r12,  r10, r11);
5004     __ strw(r12,  Address(newArr, 8));
5005 
5006     __ BIND(ShiftTwo);
5007     __ ldrw(r10,  Address(oldArr, 8));
5008     __ ldrw(r11,  Address(oldArr, 4));
5009     __ lsrvw(r10, r10, shiftCount);
5010     __ lslvw(r11, r11, shiftRevCount);
5011     __ orrw(r12,  r10, r11);
5012     __ strw(r12,  Address(newArr, 4));
5013 
5014     __ BIND(ShiftOne);
5015     __ ldrw(r10,  Address(oldArr, 4));
5016     __ ldrw(r11,  Address(oldArr));
5017     __ lsrvw(r10, r10, shiftCount);
5018     __ lslvw(r11, r11, shiftRevCount);
5019     __ orrw(r12,  r10, r11);
5020     __ strw(r12,  Address(newArr));
5021 
5022     __ BIND(Exit);
5023     __ ret(lr);
5024 
5025     return start;
5026   }
5027 
5028   // Arguments:
5029   //
5030   // Input:
5031   //   c_rarg0   - newArr address
5032   //   c_rarg1   - oldArr address
5033   //   c_rarg2   - newIdx
5034   //   c_rarg3   - shiftCount
5035   //   c_rarg4   - numIter
5036   //
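       // As generated below, each 32-bit output word combines two adjacent
       // input words (assuming 0 < shiftCount < 32):
       //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
       //                      | (oldArr[i + 1] >>> (32 - shiftCount))
       // processed from the lowest index upwards, four words at a time in the
       // SIMD loop with two-word and one-word tails.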
5037   address generate_bigIntegerLeftShift() {
5038     __ align(CodeEntryAlignment);
5039     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
5040     StubCodeMark mark(this, stub_id);
5041     address start = __ pc();
5042 
5043     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
5044 
5045     Register newArr        = c_rarg0;
5046     Register oldArr        = c_rarg1;
5047     Register newIdx        = c_rarg2;
5048     Register shiftCount    = c_rarg3;
5049     Register numIter       = c_rarg4;
5050 
5051     Register shiftRevCount = rscratch1;
5052     Register oldArrNext    = rscratch2;
5053 
5054     FloatRegister oldElem0        = v0;
5055     FloatRegister oldElem1        = v1;
5056     FloatRegister newElem         = v2;
5057     FloatRegister shiftVCount     = v3;
5058     FloatRegister shiftVRevCount  = v4;
5059 
5060     __ cbz(numIter, Exit);
5061 
5062     __ add(oldArrNext, oldArr, 4);
5063     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
5064 
5065     // right shift count
5066     __ movw(shiftRevCount, 32);
5067     __ subw(shiftRevCount, shiftRevCount, shiftCount);
5068 
5069     // numIter is too small for a 4-word SIMD loop; fall back to the scalar code
5070     __ cmp(numIter, (u1)4);
5071     __ br(Assembler::LT, ShiftThree);
5072 
5073     __ dup(shiftVCount,     __ T4S, shiftCount);
5074     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
5075     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
5076 
5077     __ BIND(ShiftSIMDLoop);
5078 
5079     // load 4 words and process
5080     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
5081     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
5082     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
5083     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
5084     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
5085     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
5086     __ sub(numIter,   numIter, 4);
5087 
5088     __ cmp(numIter, (u1)4);
5089     __ br(Assembler::LT, ShiftTwoLoop);
5090     __ b(ShiftSIMDLoop);
5091 
5092     __ BIND(ShiftTwoLoop);
5093     __ cbz(numIter, Exit);
5094     __ cmp(numIter, (u1)1);
5095     __ br(Assembler::EQ, ShiftOne);
5096 
5097     // load 2 words and process
5098     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
5099     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
5100     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
5101     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
5102     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
5103     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
5104     __ sub(numIter,   numIter, 2);
5105     __ b(ShiftTwoLoop);
5106 
5107     __ BIND(ShiftThree);
5108     __ ldrw(r10,  __ post(oldArr, 4));
5109     __ ldrw(r11,  __ post(oldArrNext, 4));
5110     __ lslvw(r10, r10, shiftCount);
5111     __ lsrvw(r11, r11, shiftRevCount);
5112     __ orrw(r12,  r10, r11);
5113     __ strw(r12,  __ post(newArr, 4));
5114     __ tbz(numIter, 1, Exit);
5115     __ tbz(numIter, 0, ShiftOne);
5116 
5117     __ BIND(ShiftTwo);
5118     __ ldrw(r10,  __ post(oldArr, 4));
5119     __ ldrw(r11,  __ post(oldArrNext, 4));
5120     __ lslvw(r10, r10, shiftCount);
5121     __ lsrvw(r11, r11, shiftRevCount);
5122     __ orrw(r12,  r10, r11);
5123     __ strw(r12,  __ post(newArr, 4));
5124 
5125     __ BIND(ShiftOne);
5126     __ ldrw(r10,  Address(oldArr));
5127     __ ldrw(r11,  Address(oldArrNext));
5128     __ lslvw(r10, r10, shiftCount);
5129     __ lsrvw(r11, r11, shiftRevCount);
5130     __ orrw(r12,  r10, r11);
5131     __ strw(r12,  Address(newArr));
5132 
5133     __ BIND(Exit);
5134     __ ret(lr);
5135 
5136     return start;
5137   }
5138 
5139   address generate_count_positives(address &count_positives_long) {
5140     const u1 large_loop_size = 64;
5141     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5142     int dcache_line = VM_Version::dcache_line_size();
5143 
5144     Register ary1 = r1, len = r2, result = r0;
5145 
5146     __ align(CodeEntryAlignment);
5147 
5148     StubGenStubId stub_id = StubGenStubId::count_positives_id;
5149     StubCodeMark mark(this, stub_id);
5150 
5151     address entry = __ pc();
5152 
5153     __ enter();
5154     // precondition: a copy of len is already in result
5155     // __ mov(result, len);
5156 
5157   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
5158         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
5159 
5160   __ cmp(len, (u1)15);
5161   __ br(Assembler::GT, LEN_OVER_15);
5162   // The only case in which execution falls into this code is when the pointer
5163   // is near the end of a memory page and we must avoid reading past it
5164   __ add(ary1, ary1, len);
5165   __ subs(len, len, 8);
5166   __ br(Assembler::GT, LEN_OVER_8);
5167   __ ldr(rscratch2, Address(ary1, -8));
5168   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
5169   __ lsrv(rscratch2, rscratch2, rscratch1);
5170   __ tst(rscratch2, UPPER_BIT_MASK);
5171   __ csel(result, zr, result, Assembler::NE);
5172   __ leave();
5173   __ ret(lr);
5174   __ bind(LEN_OVER_8);
5175   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
5176   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
5177   __ tst(rscratch2, UPPER_BIT_MASK);
5178   __ br(Assembler::NE, RET_NO_POP);
5179   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
5180   __ lsrv(rscratch1, rscratch1, rscratch2);
5181   __ tst(rscratch1, UPPER_BIT_MASK);
5182   __ bind(RET_NO_POP);
5183   __ csel(result, zr, result, Assembler::NE);
5184   __ leave();
5185   __ ret(lr);
5186 
5187   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
5188   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
5189 
5190   count_positives_long = __ pc(); // 2nd entry point
5191 
5192   __ enter();
5193 
5194   __ bind(LEN_OVER_15);
5195     __ push(spilled_regs, sp);
5196     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
5197     __ cbz(rscratch2, ALIGNED);
5198     __ ldp(tmp6, tmp1, Address(ary1));
5199     __ mov(tmp5, 16);
5200     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
5201     __ add(ary1, ary1, rscratch1);
5202     __ orr(tmp6, tmp6, tmp1);
5203     __ tst(tmp6, UPPER_BIT_MASK);
5204     __ br(Assembler::NE, RET_ADJUST);
5205     __ sub(len, len, rscratch1);
5206 
5207   __ bind(ALIGNED);
5208     __ cmp(len, large_loop_size);
5209     __ br(Assembler::LT, CHECK_16);
5210     // Perform a 16-byte load as an early-return check in the pre-loop to handle
5211     // the case where an initially aligned large array has negative values in its
5212     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
5213     // worst case, which is slower. Cases with negative bytes further ahead are
5214     // not affected much; in fact they become faster due to the early loads, fewer
5215     // instructions and fewer branches in LARGE_LOOP.
5216     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5217     __ sub(len, len, 16);
5218     __ orr(tmp6, tmp6, tmp1);
5219     __ tst(tmp6, UPPER_BIT_MASK);
5220     __ br(Assembler::NE, RET_ADJUST_16);
5221     __ cmp(len, large_loop_size);
5222     __ br(Assembler::LT, CHECK_16);
5223 
5224     if (SoftwarePrefetchHintDistance >= 0
5225         && SoftwarePrefetchHintDistance >= dcache_line) {
5226       // initial prefetch
5227       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5228     }
5229   __ bind(LARGE_LOOP);
5230     if (SoftwarePrefetchHintDistance >= 0) {
5231       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5232     }
5233     // Issue the load instructions first, since this can save a few CPU/MEM
5234     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
5235     // (one per ldp), it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...),
5236     // which saves 3 instructions per iteration and has fewer branches. The downside
5237     // is that early return is disabled, so all 64 bytes are loaded and checked every time.
5238     __ ldp(tmp2, tmp3, Address(ary1));
5239     __ ldp(tmp4, tmp5, Address(ary1, 16));
5240     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5241     __ ldp(tmp6, tmp1, Address(ary1, 48));
5242     __ add(ary1, ary1, large_loop_size);
5243     __ sub(len, len, large_loop_size);
5244     __ orr(tmp2, tmp2, tmp3);
5245     __ orr(tmp4, tmp4, tmp5);
5246     __ orr(rscratch1, rscratch1, rscratch2);
5247     __ orr(tmp6, tmp6, tmp1);
5248     __ orr(tmp2, tmp2, tmp4);
5249     __ orr(rscratch1, rscratch1, tmp6);
5250     __ orr(tmp2, tmp2, rscratch1);
5251     __ tst(tmp2, UPPER_BIT_MASK);
5252     __ br(Assembler::NE, RET_ADJUST_LONG);
5253     __ cmp(len, large_loop_size);
5254     __ br(Assembler::GE, LARGE_LOOP);
5255 
5256   __ bind(CHECK_16); // small 16-byte load pre-loop
5257     __ cmp(len, (u1)16);
5258     __ br(Assembler::LT, POST_LOOP16);
5259 
5260   __ bind(LOOP16); // small 16-byte load loop
5261     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5262     __ sub(len, len, 16);
5263     __ orr(tmp2, tmp2, tmp3);
5264     __ tst(tmp2, UPPER_BIT_MASK);
5265     __ br(Assembler::NE, RET_ADJUST_16);
5266     __ cmp(len, (u1)16);
5267     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5268 
5269   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5270     __ cmp(len, (u1)8);
5271     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5272     __ ldr(tmp3, Address(__ post(ary1, 8)));
5273     __ tst(tmp3, UPPER_BIT_MASK);
5274     __ br(Assembler::NE, RET_ADJUST);
5275     __ sub(len, len, 8);
5276 
5277   __ bind(POST_LOOP16_LOAD_TAIL);
5278     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5279     __ ldr(tmp1, Address(ary1));
5280     __ mov(tmp2, 64);
5281     __ sub(tmp4, tmp2, len, __ LSL, 3);
5282     __ lslv(tmp1, tmp1, tmp4);
5283     __ tst(tmp1, UPPER_BIT_MASK);
5284     __ br(Assembler::NE, RET_ADJUST);
5285     // Fallthrough
5286 
5287   __ bind(RET_LEN);
5288     __ pop(spilled_regs, sp);
5289     __ leave();
5290     __ ret(lr);
5291 
5292     // The difference result - len is the count of bytes guaranteed
5293     // to be positive
5294 
5295   __ bind(RET_ADJUST_LONG);
5296     __ add(len, len, (u1)(large_loop_size - 16));
5297   __ bind(RET_ADJUST_16);
5298     __ add(len, len, 16);
5299   __ bind(RET_ADJUST);
5300     __ pop(spilled_regs, sp);
5301     __ leave();
5302     __ sub(result, result, len);
5303     __ ret(lr);
5304 
5305     return entry;
5306   }
5307 
5308   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5309         bool usePrefetch, Label &NOT_EQUAL) {
5310     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5311         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5312         tmp7 = r12, tmp8 = r13;
5313     Label LOOP;
5314 
5315     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5316     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5317     __ bind(LOOP);
5318     if (usePrefetch) {
5319       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5320       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5321     }
5322     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5323     __ eor(tmp1, tmp1, tmp2);
5324     __ eor(tmp3, tmp3, tmp4);
5325     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5326     __ orr(tmp1, tmp1, tmp3);
5327     __ cbnz(tmp1, NOT_EQUAL);
5328     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5329     __ eor(tmp5, tmp5, tmp6);
5330     __ eor(tmp7, tmp7, tmp8);
5331     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5332     __ orr(tmp5, tmp5, tmp7);
5333     __ cbnz(tmp5, NOT_EQUAL);
5334     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5335     __ eor(tmp1, tmp1, tmp2);
5336     __ eor(tmp3, tmp3, tmp4);
5337     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5338     __ orr(tmp1, tmp1, tmp3);
5339     __ cbnz(tmp1, NOT_EQUAL);
5340     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5341     __ eor(tmp5, tmp5, tmp6);
5342     __ sub(cnt1, cnt1, 8 * wordSize);
5343     __ eor(tmp7, tmp7, tmp8);
5344     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5345     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5346     // cmp) because subs allows an unlimited range of immediate operands.
5347     __ subs(tmp6, cnt1, loopThreshold);
5348     __ orr(tmp5, tmp5, tmp7);
5349     __ cbnz(tmp5, NOT_EQUAL);
5350     __ br(__ GE, LOOP);
5351     // post-loop
5352     __ eor(tmp1, tmp1, tmp2);
5353     __ eor(tmp3, tmp3, tmp4);
5354     __ orr(tmp1, tmp1, tmp3);
5355     __ sub(cnt1, cnt1, 2 * wordSize);
5356     __ cbnz(tmp1, NOT_EQUAL);
5357   }
5358 
5359   void generate_large_array_equals_loop_simd(int loopThreshold,
5360         bool usePrefetch, Label &NOT_EQUAL) {
5361     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5362         tmp2 = rscratch2;
5363     Label LOOP;
5364 
5365     __ bind(LOOP);
5366     if (usePrefetch) {
5367       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5368       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5369     }
5370     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5371     __ sub(cnt1, cnt1, 8 * wordSize);
5372     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5373     __ subs(tmp1, cnt1, loopThreshold);
5374     __ eor(v0, __ T16B, v0, v4);
5375     __ eor(v1, __ T16B, v1, v5);
5376     __ eor(v2, __ T16B, v2, v6);
5377     __ eor(v3, __ T16B, v3, v7);
5378     __ orr(v0, __ T16B, v0, v1);
5379     __ orr(v1, __ T16B, v2, v3);
5380     __ orr(v0, __ T16B, v0, v1);
5381     __ umov(tmp1, v0, __ D, 0);
5382     __ umov(tmp2, v0, __ D, 1);
5383     __ orr(tmp1, tmp1, tmp2);
5384     __ cbnz(tmp1, NOT_EQUAL);
5385     __ br(__ GE, LOOP);
5386   }
5387 
5388   // a1 = r1 - array1 address
5389   // a2 = r2 - array2 address
5390   // result = r0 - return value. Already contains "false"
5391   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
5392   // r3-r5 are reserved temporary registers
5393   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
5394   address generate_large_array_equals() {
5395     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5396         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5397         tmp7 = r12, tmp8 = r13;
5398     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5399         SMALL_LOOP, POST_LOOP;
5400     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5401     // calculate if at least 32 prefetched bytes are used
5402     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5403     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5404     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5405     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5406         tmp5, tmp6, tmp7, tmp8);
5407 
5408     __ align(CodeEntryAlignment);
5409 
5410     StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
5411     StubCodeMark mark(this, stub_id);
5412 
5413     address entry = __ pc();
5414     __ enter();
5415     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5416     // also advance pointers to use post-increment instead of pre-increment
5417     __ add(a1, a1, wordSize);
5418     __ add(a2, a2, wordSize);
5419     if (AvoidUnalignedAccesses) {
5420       // Both implementations (SIMD/non-SIMD) use relatively large load
5421       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
5422       // time) on some CPUs when the address is not at least 16-byte aligned.
5423       // Arrays are currently only 8-byte aligned, so, if needed, we do an extra
5424       // 8-byte load for the first address to make it 16-byte aligned.
5425       Label ALIGNED16;
5426       __ tbz(a1, 3, ALIGNED16);
5427       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5428       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5429       __ sub(cnt1, cnt1, wordSize);
5430       __ eor(tmp1, tmp1, tmp2);
5431       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5432       __ bind(ALIGNED16);
5433     }
5434     if (UseSIMDForArrayEquals) {
5435       if (SoftwarePrefetchHintDistance >= 0) {
5436         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5437         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5438         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5439             /* prfm = */ true, NOT_EQUAL);
5440         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5441         __ br(__ LT, TAIL);
5442       }
5443       __ bind(NO_PREFETCH_LARGE_LOOP);
5444       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5445           /* prfm = */ false, NOT_EQUAL);
5446     } else {
5447       __ push(spilled_regs, sp);
5448       if (SoftwarePrefetchHintDistance >= 0) {
5449         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5450         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5451         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5452             /* prfm = */ true, NOT_EQUAL);
5453         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5454         __ br(__ LT, TAIL);
5455       }
5456       __ bind(NO_PREFETCH_LARGE_LOOP);
5457       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5458           /* prfm = */ false, NOT_EQUAL);
5459     }
5460     __ bind(TAIL);
5461       __ cbz(cnt1, EQUAL);
5462       __ subs(cnt1, cnt1, wordSize);
5463       __ br(__ LE, POST_LOOP);
5464     __ bind(SMALL_LOOP);
5465       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5466       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5467       __ subs(cnt1, cnt1, wordSize);
5468       __ eor(tmp1, tmp1, tmp2);
5469       __ cbnz(tmp1, NOT_EQUAL);
5470       __ br(__ GT, SMALL_LOOP);
5471     __ bind(POST_LOOP);
5472       __ ldr(tmp1, Address(a1, cnt1));
5473       __ ldr(tmp2, Address(a2, cnt1));
5474       __ eor(tmp1, tmp1, tmp2);
5475       __ cbnz(tmp1, NOT_EQUAL);
5476     __ bind(EQUAL);
5477       __ mov(result, true);
5478     __ bind(NOT_EQUAL);
5479       if (!UseSIMDForArrayEquals) {
5480         __ pop(spilled_regs, sp);
5481       }
5482     __ bind(NOT_EQUAL_NO_POP);
5483     __ leave();
5484     __ ret(lr);
5485     return entry;
5486   }
5487 
5488   // result = r0 - return value. Contains initial hashcode value on entry.
5489   // ary = r1 - array address
5490   // cnt = r2 - elements count
5491   // Clobbers: v0-v13, rscratch1, rscratch2
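       //
       // The stub computes the usual polynomial array hash,
       //   result = result * 31^cnt + ary[0] * 31^(cnt-1) + ... + ary[cnt-1],
       // i.e. the scalar loop
       //   for (int i = 0; i < cnt; i++) result = 31 * result + ary[i];
       // vectorized by keeping partial sums in SIMD lanes (vmul0 in the small
       // loop, vmul0-vmul3 in the large loop), multiplying them by the
       // appropriate power of 31 each iteration, and recombining them with
       // vpow in the loop epilogues.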
5492   address generate_large_arrays_hashcode(BasicType eltype) {
5493     const Register result = r0, ary = r1, cnt = r2;
5494     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
5495     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
5496     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
5497     const FloatRegister vpowm = v13;
5498 
5499     ARRAYS_HASHCODE_REGISTERS;
5500 
5501     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
5502 
5503     unsigned int vf; // vectorization factor
5504     bool multiply_by_halves;
5505     Assembler::SIMD_Arrangement load_arrangement;
5506     switch (eltype) {
5507     case T_BOOLEAN:
5508     case T_BYTE:
5509       load_arrangement = Assembler::T8B;
5510       multiply_by_halves = true;
5511       vf = 8;
5512       break;
5513     case T_CHAR:
5514     case T_SHORT:
5515       load_arrangement = Assembler::T8H;
5516       multiply_by_halves = true;
5517       vf = 8;
5518       break;
5519     case T_INT:
5520       load_arrangement = Assembler::T4S;
5521       multiply_by_halves = false;
5522       vf = 4;
5523       break;
5524     default:
5525       ShouldNotReachHere();
5526     }
5527 
5528     // Unroll factor
5529     const unsigned uf = 4;
5530 
5531     // Effective vectorization factor
5532     const unsigned evf = vf * uf;
5533 
5534     __ align(CodeEntryAlignment);
5535 
5536     StubGenStubId stub_id;
5537     switch (eltype) {
5538     case T_BOOLEAN:
5539       stub_id = StubGenStubId::large_arrays_hashcode_boolean_id;
5540       break;
5541     case T_BYTE:
5542       stub_id = StubGenStubId::large_arrays_hashcode_byte_id;
5543       break;
5544     case T_CHAR:
5545       stub_id = StubGenStubId::large_arrays_hashcode_char_id;
5546       break;
5547     case T_SHORT:
5548       stub_id = StubGenStubId::large_arrays_hashcode_short_id;
5549       break;
5550     case T_INT:
5551       stub_id = StubGenStubId::large_arrays_hashcode_int_id;
5552       break;
5553     default:
5554       stub_id = StubGenStubId::NO_STUBID;
5555       ShouldNotReachHere();
5556     };
5557 
5558     StubCodeMark mark(this, stub_id);
5559 
5560     address entry = __ pc();
5561     __ enter();
5562 
5563     // Put the 0th-3rd powers of 31 together into a single SIMD register (vpow = <31^3, ..., 31^0>).
5564     // The register will be used in the SMALL and LARGE loops' epilogues. The initialization is
5565     // hoisted here and the register's value must not change throughout either loop.
5566     __ movw(rscratch1, intpow(31U, 3));
5567     __ mov(vpow, Assembler::S, 0, rscratch1);
5568     __ movw(rscratch1, intpow(31U, 2));
5569     __ mov(vpow, Assembler::S, 1, rscratch1);
5570     __ movw(rscratch1, intpow(31U, 1));
5571     __ mov(vpow, Assembler::S, 2, rscratch1);
5572     __ movw(rscratch1, intpow(31U, 0));
5573     __ mov(vpow, Assembler::S, 3, rscratch1);
5574 
5575     __ mov(vmul0, Assembler::T16B, 0);
5576     __ mov(vmul0, Assembler::S, 3, result);
5577 
5578     __ andr(rscratch2, cnt, (uf - 1) * vf);
5579     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
5580 
5581     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
5582     __ mov(vpowm, Assembler::S, 0, rscratch1);
5583 
5584     // SMALL LOOP
5585     __ bind(SMALL_LOOP);
5586 
5587     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
5588     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5589     __ subsw(rscratch2, rscratch2, vf);
5590 
5591     if (load_arrangement == Assembler::T8B) {
5592       // Extend 8B to 8H to be able to use vector multiply
5593       // instructions
5594       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
5595       if (is_signed_subword_type(eltype)) {
5596         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5597       } else {
5598         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5599       }
5600     }
5601 
5602     switch (load_arrangement) {
5603     case Assembler::T4S:
5604       __ addv(vmul0, load_arrangement, vmul0, vdata0);
5605       break;
5606     case Assembler::T8B:
5607     case Assembler::T8H:
5608       assert(is_subword_type(eltype), "subword type expected");
5609       if (is_signed_subword_type(eltype)) {
5610         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5611       } else {
5612         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5613       }
5614       break;
5615     default:
5616       __ should_not_reach_here();
5617     }
5618 
5619     // Process the upper half of a vector
5620     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
5621       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5622       if (is_signed_subword_type(eltype)) {
5623         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5624       } else {
5625         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5626       }
5627     }
5628 
5629     __ br(Assembler::HI, SMALL_LOOP);
5630 
5631     // SMALL LOOP'S EPILOGUE
5632     __ lsr(rscratch2, cnt, exact_log2(evf));
5633     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
5634 
5635     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
5636     __ addv(vmul0, Assembler::T4S, vmul0);
5637     __ umov(result, vmul0, Assembler::S, 0);
5638 
5639     // TAIL
5640     __ bind(TAIL);
5641 
5642     // The andr computes cnt % vf. The subtract, shifted left by 3, branches past the first
5643     // vf - 1 - (cnt % vf) load + madd pairs (8 bytes of code each), so that exactly cnt % vf pairs execute.
5644     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
5645     __ andr(rscratch2, cnt, vf - 1);
5646     __ bind(TAIL_SHORTCUT);
5647     __ adr(rscratch1, BR_BASE);
5648     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
5649     __ movw(rscratch2, 0x1f); // 31, the multiplier for the maddw ladder below
5650     __ br(rscratch1);
5651 
5652     for (size_t i = 0; i < vf - 1; ++i) {
5653       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
5654                                    eltype);
5655       __ maddw(result, result, rscratch2, rscratch1);
5656     }
5657     __ bind(BR_BASE);
5658 
5659     __ leave();
5660     __ ret(lr);
5661 
5662     // LARGE LOOP
5663     __ bind(LARGE_LOOP_PREHEADER);
5664 
5665     __ lsr(rscratch2, cnt, exact_log2(evf));
5666 
5667     if (multiply_by_halves) {
5668       // 31^4 - multiplier between lower and upper parts of a register
5669       __ movw(rscratch1, intpow(31U, vf / 2));
5670       __ mov(vpowm, Assembler::S, 1, rscratch1);
5671       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
5672       __ movw(rscratch1, intpow(31U, evf - vf / 2));
5673       __ mov(vpowm, Assembler::S, 0, rscratch1);
5674     } else {
5675       // 31^16
5676       __ movw(rscratch1, intpow(31U, evf));
5677       __ mov(vpowm, Assembler::S, 0, rscratch1);
5678     }
5679 
5680     __ mov(vmul3, Assembler::T16B, 0);
5681     __ mov(vmul2, Assembler::T16B, 0);
5682     __ mov(vmul1, Assembler::T16B, 0);
5683 
5684     __ bind(LARGE_LOOP);
5685 
5686     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
5687     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
5688     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
5689     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5690 
5691     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
5692            Address(__ post(ary, evf * type2aelembytes(eltype))));
5693 
5694     if (load_arrangement == Assembler::T8B) {
5695       // Extend 8B to 8H to be able to use vector multiply
5696       // instructions
5697       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
5698       if (is_signed_subword_type(eltype)) {
5699         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
5700         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
5701         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
5702         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5703       } else {
5704         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
5705         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
5706         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
5707         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5708       }
5709     }
5710 
5711     switch (load_arrangement) {
5712     case Assembler::T4S:
5713       __ addv(vmul3, load_arrangement, vmul3, vdata3);
5714       __ addv(vmul2, load_arrangement, vmul2, vdata2);
5715       __ addv(vmul1, load_arrangement, vmul1, vdata1);
5716       __ addv(vmul0, load_arrangement, vmul0, vdata0);
5717       break;
5718     case Assembler::T8B:
5719     case Assembler::T8H:
5720       assert(is_subword_type(eltype), "subword type expected");
5721       if (is_signed_subword_type(eltype)) {
5722         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
5723         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
5724         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
5725         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5726       } else {
5727         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
5728         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
5729         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
5730         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5731       }
5732       break;
5733     default:
5734       __ should_not_reach_here();
5735     }
5736 
5737     // Process the upper half of a vector
5738     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
5739       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
5740       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
5741       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
5742       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
5743       if (is_signed_subword_type(eltype)) {
5744         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
5745         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
5746         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
5747         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5748       } else {
5749         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
5750         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
5751         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
5752         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5753       }
5754     }
5755 
5756     __ subsw(rscratch2, rscratch2, 1);
5757     __ br(Assembler::HI, LARGE_LOOP);
5758 
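         // Reduce the four vector accumulators to a scalar. vpow (set up outside this
         // excerpt, presumably holding the per-lane powers of 31) weights each lane
         // before the horizontal add; with Si denoting the weighted sum of vmul<i>,
         // the maddw chain below computes
         //   result = ((S3 * 31^vf + S2) * 31^vf + S1) * 31^vf + S0
         // using rscratch2 = 31^vf as the combining multiplier.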
5759     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
5760     __ addv(vmul3, Assembler::T4S, vmul3);
5761     __ umov(result, vmul3, Assembler::S, 0);
5762 
5763     __ mov(rscratch2, intpow(31U, vf));
5764 
5765     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
5766     __ addv(vmul2, Assembler::T4S, vmul2);
5767     __ umov(rscratch1, vmul2, Assembler::S, 0);
5768     __ maddw(result, result, rscratch2, rscratch1);
5769 
5770     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
5771     __ addv(vmul1, Assembler::T4S, vmul1);
5772     __ umov(rscratch1, vmul1, Assembler::S, 0);
5773     __ maddw(result, result, rscratch2, rscratch1);
5774 
5775     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
5776     __ addv(vmul0, Assembler::T4S, vmul0);
5777     __ umov(rscratch1, vmul0, Assembler::S, 0);
5778     __ maddw(result, result, rscratch2, rscratch1);
5779 
5780     __ andr(rscratch2, cnt, vf - 1);
5781     __ cbnz(rscratch2, TAIL_SHORTCUT);
5782 
5783     __ leave();
5784     __ ret(lr);
5785 
5786     return entry;
5787   }
5788 
5789   address generate_dsin_dcos(bool isCos) {
5790     __ align(CodeEntryAlignment);
5791     StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id);
5792     StubCodeMark mark(this, stub_id);
5793     address start = __ pc();
5794     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5795         (address)StubRoutines::aarch64::_two_over_pi,
5796         (address)StubRoutines::aarch64::_pio2,
5797         (address)StubRoutines::aarch64::_dsin_coef,
5798         (address)StubRoutines::aarch64::_dcos_coef);
5799     return start;
5800   }
5801 
5802   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings.
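       // The Latin1 side is widened on the fly: zip1/zip2 with vtmpZ (zeroed by the
       // caller) interleave each Latin1 byte with a zero byte, turning 16 Latin1
       // characters into 32 bytes of UTF-16 that can be compared 8 bytes at a time
       // against the UTF-16 side.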
5803   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5804       Label &DIFF2) {
5805     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5806     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5807 
5808     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5809     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5810     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5811     // vtmp3 now holds the first 8 Latin1 characters widened to UTF-16; zip2 below widens the rest
5812 
5813     __ fmovd(tmpL, vtmp3);
5814     __ eor(rscratch2, tmp3, tmpL);
5815     __ cbnz(rscratch2, DIFF2);
5816 
5817     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5818     __ umov(tmpL, vtmp3, __ D, 1);
5819     __ eor(rscratch2, tmpU, tmpL);
5820     __ cbnz(rscratch2, DIFF1);
5821 
5822     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5823     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5824     __ fmovd(tmpL, vtmp);
5825     __ eor(rscratch2, tmp3, tmpL);
5826     __ cbnz(rscratch2, DIFF2);
5827 
5828     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5829     __ umov(tmpL, vtmp, __ D, 1);
5830     __ eor(rscratch2, tmpU, tmpL);
5831     __ cbnz(rscratch2, DIFF1);
5832   }
5833 
5834   // r0  = result
5835   // r1  = str1
5836   // r2  = cnt1
5837   // r3  = str2
5838   // r4  = cnt2
5839   // r10 = tmp1
5840   // r11 = tmp2
5841   address generate_compare_long_string_different_encoding(bool isLU) {
5842     __ align(CodeEntryAlignment);
5843     StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id);
5844     StubCodeMark mark(this, stub_id);
5845     address entry = __ pc();
5846     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5847         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5848         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5849     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5850         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5851     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5852     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5853 
5854     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5855 
5856     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5857     // cnt2 == number of characters left to compare
5858     // Check the first 4 symbols, which are already loaded (vtmp and tmp2(LU)/tmp1(UL))
5859     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5860     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5861     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5862     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5863     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols; the last 4 are a special case.
5864     __ eor(rscratch2, tmp1, tmp2);
5865     __ mov(rscratch1, tmp2);
5866     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5867     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5868              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5869     __ push(spilled_regs, sp);
5870     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5871     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5872 
5873     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5874 
5875     if (SoftwarePrefetchHintDistance >= 0) {
5876       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5877       __ br(__ LT, NO_PREFETCH);
5878       __ bind(LARGE_LOOP_PREFETCH);
5879         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5880         __ mov(tmp4, 2);
5881         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5882         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5883           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5884           __ subs(tmp4, tmp4, 1);
5885           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5886           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5887           __ mov(tmp4, 2);
5888         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5889           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5890           __ subs(tmp4, tmp4, 1);
5891           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5892           __ sub(cnt2, cnt2, 64);
5893           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5894           __ br(__ GE, LARGE_LOOP_PREFETCH);
5895     }
5896     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5897     __ bind(NO_PREFETCH);
5898     __ subs(cnt2, cnt2, 16);
5899     __ br(__ LT, TAIL);
5900     __ align(OptoLoopAlignment);
5901     __ bind(SMALL_LOOP); // smaller loop
5902       __ subs(cnt2, cnt2, 16);
5903       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5904       __ br(__ GE, SMALL_LOOP);
5905       __ cmn(cnt2, (u1)16);
5906       __ br(__ EQ, LOAD_LAST);
5907     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5908       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5909       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5910       __ ldr(tmp3, Address(cnt1, -8));
5911       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5912       __ b(LOAD_LAST);
5913     __ bind(DIFF2);
5914       __ mov(tmpU, tmp3);
5915     __ bind(DIFF1);
5916       __ pop(spilled_regs, sp);
5917       __ b(CALCULATE_DIFFERENCE);
5918     __ bind(LOAD_LAST);
5919       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5920       // No need to load them again
5921       __ mov(tmpU, tmp3);
5922       __ pop(spilled_regs, sp);
5923 
5924       // tmp2 points to the address of the last 4 Latin1 characters right now
5925       __ ldrs(vtmp, Address(tmp2));
5926       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5927       __ fmovd(tmpL, vtmp);
5928 
5929       __ eor(rscratch2, tmpU, tmpL);
5930       __ cbz(rscratch2, DONE);
5931 
5932     // Find the first different characters in the longwords and
5933     // compute their difference.
5934     __ bind(CALCULATE_DIFFERENCE);
5935       __ rev(rscratch2, rscratch2);
5936       __ clz(rscratch2, rscratch2);
5937       __ andr(rscratch2, rscratch2, -16);
5938       __ lsrv(tmp1, tmp1, rscratch2);
5939       __ uxthw(tmp1, tmp1);
5940       __ lsrv(rscratch1, rscratch1, rscratch2);
5941       __ uxthw(rscratch1, rscratch1);
5942       __ subw(result, tmp1, rscratch1);
5943     __ bind(DONE);
5944       __ ret(lr);
5945     return entry;
5946   }
5947 
5948   // r0 = input (float16)
5949   // v0 = result (float)
5950   // v1 = temporary float register
5951   address generate_float16ToFloat() {
5952     __ align(CodeEntryAlignment);
5953     StubGenStubId stub_id = StubGenStubId::hf2f_id;
5954     StubCodeMark mark(this, stub_id);
5955     address entry = __ pc();
5956     BLOCK_COMMENT("Entry:");
5957     __ flt16_to_flt(v0, r0, v1);
5958     __ ret(lr);
5959     return entry;
5960   }
5961 
5962   // v0 = input (float)
5963   // r0 = result (float16)
5964   // v1 = temporary float register
5965   address generate_floatToFloat16() {
5966     __ align(CodeEntryAlignment);
5967     StubGenStubId stub_id = StubGenStubId::f2hf_id;
5968     StubCodeMark mark(this, stub_id);
5969     address entry = __ pc();
5970     BLOCK_COMMENT("Entry:");
5971     __ flt_to_flt16(r0, v0, v1);
5972     __ ret(lr);
5973     return entry;
5974   }
5975 
5976   address generate_method_entry_barrier() {
5977     __ align(CodeEntryAlignment);
5978     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
5979     StubCodeMark mark(this, stub_id);
5980 
5981     Label deoptimize_label;
5982 
5983     address start = __ pc();
5984 
5985     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5986 
5987     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5988       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5989       // We can get here despite the nmethod being good, if we have not
5990       // yet applied our cross modification fence (or data fence).
5991       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5992       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5993       __ ldrw(rscratch2, rscratch2);
5994       __ strw(rscratch2, thread_epoch_addr);
5995       __ isb();
5996       __ membar(__ LoadLoad);
5997     }
5998 
5999     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
6000 
6001     __ enter();
6002     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
6003 
6004     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
6005 
6006     __ push_call_clobbered_registers();
6007 
6008     __ mov(c_rarg0, rscratch2);
6009     __ call_VM_leaf
6010          (CAST_FROM_FN_PTR
6011           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
6012 
6013     __ reset_last_Java_frame(true);
6014 
6015     __ mov(rscratch1, r0);
6016 
6017     __ pop_call_clobbered_registers();
6018 
6019     __ cbnz(rscratch1, deoptimize_label);
6020 
6021     __ leave();
6022     __ ret(lr);
6023 
6024     __ BIND(deoptimize_label);
6025 
6026     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
6027     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
6028 
6029     __ mov(sp, rscratch1);
6030     __ br(rscratch2);
6031 
6032     return start;
6033   }
6034 
6035   // r0  = result
6036   // r1  = str1
6037   // r2  = cnt1
6038   // r3  = str2
6039   // r4  = cnt2
6040   // r10 = tmp1
6041   // r11 = tmp2
6042   address generate_compare_long_string_same_encoding(bool isLL) {
6043     __ align(CodeEntryAlignment);
6044     StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id);
6045     StubCodeMark mark(this, stub_id);
6046     address entry = __ pc();
6047     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
6048         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
6049 
6050     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
6051 
6052     // Exit the large loop when fewer than 64 bytes are left to read or we are about to
6053     // prefetch memory beyond the array border (the threshold is in chars, hence /2 for UU)
6054     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
6055 
6056     // 8 bytes from each string are already pre-loaded before jumping to the stub, so compare them directly
6057     __ eor(rscratch2, tmp1, tmp2);
6058     __ cbnz(rscratch2, CAL_DIFFERENCE);
6059 
6060     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
6061     // update pointers, because of previous read
6062     __ add(str1, str1, wordSize);
6063     __ add(str2, str2, wordSize);
6064     if (SoftwarePrefetchHintDistance >= 0) {
6065       __ align(OptoLoopAlignment);
6066       __ bind(LARGE_LOOP_PREFETCH);
6067         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
6068         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
6069 
6070         for (int i = 0; i < 4; i++) {
6071           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
6072           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
6073           __ cmp(tmp1, tmp2);
6074           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
6075           __ br(Assembler::NE, DIFF);
6076         }
6077         __ sub(cnt2, cnt2, isLL ? 64 : 32);
6078         __ add(str1, str1, 64);
6079         __ add(str2, str2, 64);
6080         __ subs(rscratch2, cnt2, largeLoopExitCondition);
6081         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
6082         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
6083     }
6084 
6085     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
6086     __ br(Assembler::LE, LESS16);
6087     __ align(OptoLoopAlignment);
6088     __ bind(LOOP_COMPARE16);
6089       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
6090       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
6091       __ cmp(tmp1, tmp2);
6092       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
6093       __ br(Assembler::NE, DIFF);
6094       __ sub(cnt2, cnt2, isLL ? 16 : 8);
6095       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
6096       __ br(Assembler::LT, LESS16);
6097 
6098       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
6099       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
6100       __ cmp(tmp1, tmp2);
6101       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
6102       __ br(Assembler::NE, DIFF);
6103       __ sub(cnt2, cnt2, isLL ? 16 : 8);
6104       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
6105       __ br(Assembler::GE, LOOP_COMPARE16);
6106       __ cbz(cnt2, LENGTH_DIFF);
6107 
6108     __ bind(LESS16);
6109       // if more than 8 bytes remain, compare the next 8 bytes
6110       __ subs(cnt2, cnt2, isLL ? 8 : 4);
6111       __ br(Assembler::LE, LESS8);
6112       __ ldr(tmp1, Address(__ post(str1, 8)));
6113       __ ldr(tmp2, Address(__ post(str2, 8)));
6114       __ eor(rscratch2, tmp1, tmp2);
6115       __ cbnz(rscratch2, CAL_DIFFERENCE);
6116       __ sub(cnt2, cnt2, isLL ? 8 : 4);
6117 
6118     __ bind(LESS8); // directly load last 8 bytes
6119       if (!isLL) {
6120         __ add(cnt2, cnt2, cnt2);
6121       }
6122       __ ldr(tmp1, Address(str1, cnt2));
6123       __ ldr(tmp2, Address(str2, cnt2));
6124       __ eor(rscratch2, tmp1, tmp2);
6125       __ cbz(rscratch2, LENGTH_DIFF);
6126       __ b(CAL_DIFFERENCE);
6127 
6128     __ bind(DIFF);
6129       __ cmp(tmp1, tmp2);
6130       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
6131       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
6132       // reuse rscratch2 register for the result of eor instruction
6133       __ eor(rscratch2, tmp1, tmp2);
6134 
6135     __ bind(CAL_DIFFERENCE);
6136       __ rev(rscratch2, rscratch2);
6137       __ clz(rscratch2, rscratch2);
6138       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
6139       __ lsrv(tmp1, tmp1, rscratch2);
6140       __ lsrv(tmp2, tmp2, rscratch2);
6141       if (isLL) {
6142         __ uxtbw(tmp1, tmp1);
6143         __ uxtbw(tmp2, tmp2);
6144       } else {
6145         __ uxthw(tmp1, tmp1);
6146         __ uxthw(tmp2, tmp2);
6147       }
6148       __ subw(result, tmp1, tmp2);
6149 
6150     __ bind(LENGTH_DIFF);
6151       __ ret(lr);
6152     return entry;
6153   }
6154 
6155   enum string_compare_mode {
6156     LL,
6157     LU,
6158     UL,
6159     UU,
6160   };
6161 
6162   // The following registers are declared in aarch64.ad
6163   // r0  = result
6164   // r1  = str1
6165   // r2  = cnt1
6166   // r3  = str2
6167   // r4  = cnt2
6168   // r10 = tmp1
6169   // r11 = tmp2
6170   // z0  = ztmp1
6171   // z1  = ztmp2
6172   // p0  = pgtmp1
6173   // p1  = pgtmp2
6174   address generate_compare_long_string_sve(string_compare_mode mode) {
6175     StubGenStubId stub_id;
6176     switch (mode) {
6177       case LL: stub_id = StubGenStubId::compare_long_string_LL_id;  break;
6178       case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break;
6179       case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break;
6180       case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break;
6181       default: ShouldNotReachHere();
6182     }
6183 
6184     __ align(CodeEntryAlignment);
6185     address entry = __ pc();
6186     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
6187              tmp1 = r10, tmp2 = r11;
6188 
6189     Label LOOP, DONE, MISMATCH;
6190     Register vec_len = tmp1;
6191     Register idx = tmp2;
6192     // The minimum of the string lengths has been stored in cnt2.
6193     Register cnt = cnt2;
6194     FloatRegister ztmp1 = z0, ztmp2 = z1;
6195     PRegister pgtmp1 = p0, pgtmp2 = p1;
6196 
6197 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
6198     switch (mode) {                                                            \
6199       case LL:                                                                 \
6200         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
6201         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
6202         break;                                                                 \
6203       case LU:                                                                 \
6204         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
6205         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
6206         break;                                                                 \
6207       case UL:                                                                 \
6208         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
6209         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
6210         break;                                                                 \
6211       case UU:                                                                 \
6212         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
6213         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
6214         break;                                                                 \
6215       default:                                                                 \
6216         ShouldNotReachHere();                                                  \
6217     }
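         // Note on the mixed-encoding cases above: sve_ld1b with an H-sized element
         // specifier loads one byte per halfword lane and zero-extends it, so the
         // Latin1 operand is widened to match the UTF-16 operand at load time.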
6218 
6219     StubCodeMark mark(this, stub_id);
6220 
6221     __ mov(idx, 0);
6222     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
6223 
6224     if (mode == LL) {
6225       __ sve_cntb(vec_len);
6226     } else {
6227       __ sve_cnth(vec_len);
6228     }
6229 
6230     __ sub(rscratch1, cnt, vec_len);
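         // rscratch1 = cnt - vec_len bounds the main loop: it keeps iterating while a
         // further full vector of characters still fits, and the final, possibly
         // partial, chunk is handled after the loop by re-generating the governing
         // predicate with sve_whilelt.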
6231 
6232     __ bind(LOOP);
6233 
6234       // main loop
6235       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
6236       __ add(idx, idx, vec_len);
6237       // Compare strings.
6238       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
6239       __ br(__ NE, MISMATCH);
6240       __ cmp(idx, rscratch1);
6241       __ br(__ LT, LOOP);
6242 
6243     // post loop, last iteration
6244     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
6245 
6246     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
6247     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
6248     __ br(__ EQ, DONE);
6249 
6250     __ bind(MISMATCH);
6251 
6252     // Crop the predicate to the lanes before the first mismatch.
6253     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
6254     // Extract the first different characters of each string.
6255     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
6256     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
6257 
6258     // Compute the difference of the first different characters.
6259     __ sub(result, rscratch1, rscratch2);
6260 
6261     __ bind(DONE);
6262     __ ret(lr);
6263 #undef LOAD_PAIR
6264     return entry;
6265   }
6266 
6267   void generate_compare_long_strings() {
6268     if (UseSVE == 0) {
6269       StubRoutines::aarch64::_compare_long_string_LL
6270           = generate_compare_long_string_same_encoding(true);
6271       StubRoutines::aarch64::_compare_long_string_UU
6272           = generate_compare_long_string_same_encoding(false);
6273       StubRoutines::aarch64::_compare_long_string_LU
6274           = generate_compare_long_string_different_encoding(true);
6275       StubRoutines::aarch64::_compare_long_string_UL
6276           = generate_compare_long_string_different_encoding(false);
6277     } else {
6278       StubRoutines::aarch64::_compare_long_string_LL
6279           = generate_compare_long_string_sve(LL);
6280       StubRoutines::aarch64::_compare_long_string_UU
6281           = generate_compare_long_string_sve(UU);
6282       StubRoutines::aarch64::_compare_long_string_LU
6283           = generate_compare_long_string_sve(LU);
6284       StubRoutines::aarch64::_compare_long_string_UL
6285           = generate_compare_long_string_sve(UL);
6286     }
6287   }
6288 
6289   // R0 = result
6290   // R1 = str2
6291   // R2 = cnt1
6292   // R3 = str1
6293   // R4 = cnt2
6294   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
6295   //
6296   // This generic linear code uses a few additional ideas which make it faster:
6297   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
6298   // in order to skip the initial load (helps on systems with 1 load pipeline)
6299   // 2) we can use a "fast" algorithm for finding the first occurrence of a single
6300   // character with fewer branches (1 branch per loaded register instead of a
6301   // branch per symbol), which is where constants like
6302   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
6303   // 3) after loading and analyzing the 1st register of the source string, it can be
6304   // used to search for every occurrence of the 1st character, saving a few loads
6305   // compared with a "simpler-but-slower" implementation
6306   // 4) in order to avoid lots of push/pop operations, the code below heavily
6307   // re-uses/re-initializes/compresses register values, which makes the code
6308   // larger and a bit less readable; however, most of the extra operations are
6309   // issued during loads or branches, so the penalty is minimal
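       //
       // The "fast" single-character search in 2) is the classic zero-byte test:
       // after XOR-ing a loaded word with the first pattern character replicated
       // into every byte (or halfword), a matching position becomes a zero byte,
       // and
       //   (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f)
       // is non-zero exactly when x contains a zero byte: a zero byte turns into
       // 0xff after the subtraction while (x | 0x7f...7f) leaves its top bit
       // clear, so the BIC keeps that 0x80 bit. The 16-bit constants play the
       // same role for UTF-16 strings.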
6310   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
6311     StubGenStubId stub_id;
6312     if (str1_isL) {
6313       if (str2_isL) {
6314         stub_id = StubGenStubId::string_indexof_linear_ll_id;
6315       } else {
6316         stub_id = StubGenStubId::string_indexof_linear_ul_id;
6317       }
6318     } else {
6319       if (str2_isL) {
6320         ShouldNotReachHere();
6321       } else {
6322         stub_id = StubGenStubId::string_indexof_linear_uu_id;
6323       }
6324     }
6325     __ align(CodeEntryAlignment);
6326     StubCodeMark mark(this, stub_id);
6327     address entry = __ pc();
6328 
6329     int str1_chr_size = str1_isL ? 1 : 2;
6330     int str2_chr_size = str2_isL ? 1 : 2;
6331     int str1_chr_shift = str1_isL ? 0 : 1;
6332     int str2_chr_shift = str2_isL ? 0 : 1;
6333     bool isL = str1_isL && str2_isL;
6334     // parameters
6335     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
6336     // temporary registers
6337     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
6338     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
6339     // redefinitions
6340     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
6341 
6342     __ push(spilled_regs, sp);
6343     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
6344         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
6345         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
6346         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
6347         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
6348         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
6349     // Read whole register from str1. It is safe, because length >=8 here
6350     __ ldr(ch1, Address(str1));
6351     // Read whole register from str2. It is safe, because length >=8 here
6352     __ ldr(ch2, Address(str2));
6353     __ sub(cnt2, cnt2, cnt1);
6354     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
6355     if (str1_isL != str2_isL) {
6356       __ eor(v0, __ T16B, v0, v0);
6357     }
6358     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
6359     __ mul(first, first, tmp1);
6360     // check if we have less than 1 register to check
6361     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
6362     if (str1_isL != str2_isL) {
6363       __ fmovd(v1, ch1);
6364     }
6365     __ br(__ LE, L_SMALL);
6366     __ eor(ch2, first, ch2);
6367     if (str1_isL != str2_isL) {
6368       __ zip1(v1, __ T16B, v1, v0);
6369     }
6370     __ sub(tmp2, ch2, tmp1);
6371     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6372     __ bics(tmp2, tmp2, ch2);
6373     if (str1_isL != str2_isL) {
6374       __ fmovd(ch1, v1);
6375     }
6376     __ br(__ NE, L_HAS_ZERO);
6377     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
6378     __ add(result, result, wordSize/str2_chr_size);
6379     __ add(str2, str2, wordSize);
6380     __ br(__ LT, L_POST_LOOP);
6381     __ BIND(L_LOOP);
6382       __ ldr(ch2, Address(str2));
6383       __ eor(ch2, first, ch2);
6384       __ sub(tmp2, ch2, tmp1);
6385       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6386       __ bics(tmp2, tmp2, ch2);
6387       __ br(__ NE, L_HAS_ZERO);
6388     __ BIND(L_LOOP_PROCEED);
6389       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
6390       __ add(str2, str2, wordSize);
6391       __ add(result, result, wordSize/str2_chr_size);
6392       __ br(__ GE, L_LOOP);
6393     __ BIND(L_POST_LOOP);
6394       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
6395       __ br(__ LE, NOMATCH);
6396       __ ldr(ch2, Address(str2));
6397       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
6398       __ eor(ch2, first, ch2);
6399       __ sub(tmp2, ch2, tmp1);
6400       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6401       __ mov(tmp4, -1); // all bits set
6402       __ b(L_SMALL_PROCEED);
6403     __ align(OptoLoopAlignment);
6404     __ BIND(L_SMALL);
6405       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
6406       __ eor(ch2, first, ch2);
6407       if (str1_isL != str2_isL) {
6408         __ zip1(v1, __ T16B, v1, v0);
6409       }
6410       __ sub(tmp2, ch2, tmp1);
6411       __ mov(tmp4, -1); // all bits set
6412       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6413       if (str1_isL != str2_isL) {
6414         __ fmovd(ch1, v1); // move converted 4 symbols
6415       }
6416     __ BIND(L_SMALL_PROCEED);
6417       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
6418       __ bic(tmp2, tmp2, ch2);
6419       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
6420       __ rbit(tmp2, tmp2);
6421       __ br(__ EQ, NOMATCH);
6422     __ BIND(L_SMALL_HAS_ZERO_LOOP);
6423       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
6424       __ cmp(cnt1, u1(wordSize/str2_chr_size));
6425       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
6426       if (str2_isL) { // LL
6427         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
6428         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
6429         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
6430         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6431         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6432       } else {
6433         __ mov(ch2, 0xE); // all bits in byte set except last one
6434         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6435         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6436         __ lslv(tmp2, tmp2, tmp4);
6437         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6438         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6439         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6440         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6441       }
6442       __ cmp(ch1, ch2);
6443       __ mov(tmp4, wordSize/str2_chr_size);
6444       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6445     __ BIND(L_SMALL_CMP_LOOP);
6446       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6447                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6448       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6449                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6450       __ add(tmp4, tmp4, 1);
6451       __ cmp(tmp4, cnt1);
6452       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
6453       __ cmp(first, ch2);
6454       __ br(__ EQ, L_SMALL_CMP_LOOP);
6455     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
6456       __ cbz(tmp2, NOMATCH); // no more matches. exit
6457       __ clz(tmp4, tmp2);
6458       __ add(result, result, 1); // advance index
6459       __ add(str2, str2, str2_chr_size); // advance pointer
6460       __ b(L_SMALL_HAS_ZERO_LOOP);
6461     __ align(OptoLoopAlignment);
6462     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
6463       __ cmp(first, ch2);
6464       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6465       __ b(DONE);
6466     __ align(OptoLoopAlignment);
6467     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
6468       if (str2_isL) { // LL
6469         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
6470         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
6471         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
6472         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6473         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6474       } else {
6475         __ mov(ch2, 0xE); // all bits in byte set except last one
6476         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6477         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6478         __ lslv(tmp2, tmp2, tmp4);
6479         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6480         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6481         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6482         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6483       }
6484       __ cmp(ch1, ch2);
6485       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6486       __ b(DONE);
6487     __ align(OptoLoopAlignment);
6488     __ BIND(L_HAS_ZERO);
6489       __ rbit(tmp2, tmp2);
6490       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
6491       // Now compress the counters (cnt2 and cnt1) into one register.
6492       // This is fine because both counters are 32-bit and are not changed in this
6493       // loop; they are restored on exit, so cnt1 can be re-used within the loop.
6494       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
6495       __ sub(result, result, 1);
6496     __ BIND(L_HAS_ZERO_LOOP);
6497       __ mov(cnt1, wordSize/str2_chr_size);
6498       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6499       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
6500       if (str2_isL) {
6501         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6502         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6503         __ lslv(tmp2, tmp2, tmp4);
6504         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6505         __ add(tmp4, tmp4, 1);
6506         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6507         __ lsl(tmp2, tmp2, 1);
6508         __ mov(tmp4, wordSize/str2_chr_size);
6509       } else {
6510         __ mov(ch2, 0xE);
6511         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6512         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6513         __ lslv(tmp2, tmp2, tmp4);
6514         __ add(tmp4, tmp4, 1);
6515         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6516         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6517         __ lsl(tmp2, tmp2, 1);
6518         __ mov(tmp4, wordSize/str2_chr_size);
6519         __ sub(str2, str2, str2_chr_size);
6520       }
6521       __ cmp(ch1, ch2);
6522       __ mov(tmp4, wordSize/str2_chr_size);
6523       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6524     __ BIND(L_CMP_LOOP);
6525       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6526                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6527       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6528                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6529       __ add(tmp4, tmp4, 1);
6530       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6531       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
6532       __ cmp(cnt1, ch2);
6533       __ br(__ EQ, L_CMP_LOOP);
6534     __ BIND(L_CMP_LOOP_NOMATCH);
6535       // no match at this position
6536       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6537       __ clz(tmp4, tmp2);
6538       __ add(str2, str2, str2_chr_size); // advance pointer
6539       __ b(L_HAS_ZERO_LOOP);
6540     __ align(OptoLoopAlignment);
6541     __ BIND(L_CMP_LOOP_LAST_CMP);
6542       __ cmp(cnt1, ch2);
6543       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6544       __ b(DONE);
6545     __ align(OptoLoopAlignment);
6546     __ BIND(L_CMP_LOOP_LAST_CMP2);
6547       if (str2_isL) {
6548         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6549         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6550         __ lslv(tmp2, tmp2, tmp4);
6551         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6552         __ add(tmp4, tmp4, 1);
6553         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6554         __ lsl(tmp2, tmp2, 1);
6555       } else {
6556         __ mov(ch2, 0xE);
6557         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6558         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6559         __ lslv(tmp2, tmp2, tmp4);
6560         __ add(tmp4, tmp4, 1);
6561         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6562         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6563         __ lsl(tmp2, tmp2, 1);
6564         __ sub(str2, str2, str2_chr_size);
6565       }
6566       __ cmp(ch1, ch2);
6567       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6568       __ b(DONE);
6569     __ align(OptoLoopAlignment);
6570     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
6571       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until
6572       // the L_HAS_ZERO block. One byte octet was analyzed in L_HAS_ZERO_LOOP, so
6573       // result was increased by at most wordSize/str2_chr_size - 1 and the
6574       // respective higher bits were not changed. L_LOOP_PROCEED will increase
6575       // result by the number of analyzed characters, so we can just reset the lower
6576       // bits of result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
6577       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
6578       // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3 (UU/UL)
6579       // is the index of the last analyzed substring inside the current octet, so str2
6580       // is at the respective start address and needs to be advanced to the next octet.
6581       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6582       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6583       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6584       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
6585       __ movw(cnt2, cnt2);
6586       __ b(L_LOOP_PROCEED);
6587     __ align(OptoLoopAlignment);
6588     __ BIND(NOMATCH);
6589       __ mov(result, -1);
6590     __ BIND(DONE);
6591       __ pop(spilled_regs, sp);
6592       __ ret(lr);
6593     return entry;
6594   }
6595 
6596   void generate_string_indexof_stubs() {
6597     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6598     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6599     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6600   }
6601 
6602   void inflate_and_store_2_fp_registers(bool generatePrfm,
6603       FloatRegister src1, FloatRegister src2) {
6604     Register dst = r1;
6605     __ zip1(v1, __ T16B, src1, v0);
6606     __ zip2(v2, __ T16B, src1, v0);
6607     if (generatePrfm) {
6608       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6609     }
6610     __ zip1(v3, __ T16B, src2, v0);
6611     __ zip2(v4, __ T16B, src2, v0);
6612     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6613   }
6614 
6615   // R0 = src
6616   // R1 = dst
6617   // R2 = len
6618   // R3 = len >> 3
6619   // V0 = 0
6620   // v1 = loaded 8 bytes
6621   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6622   address generate_large_byte_array_inflate() {
6623     __ align(CodeEntryAlignment);
6624     StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
6625     StubCodeMark mark(this, stub_id);
6626     address entry = __ pc();
6627     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6628     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6629     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6630 
6631     // do one more 8-byte read so that the address is 16-byte aligned in most cases;
6632     // this also lets us use a single store instruction
6633     __ ldrd(v2, __ post(src, 8));
6634     __ sub(octetCounter, octetCounter, 2);
6635     __ zip1(v1, __ T16B, v1, v0);
6636     __ zip1(v2, __ T16B, v2, v0);
6637     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6638     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6639     __ subs(rscratch1, octetCounter, large_loop_threshold);
6640     __ br(__ LE, LOOP_START);
6641     __ b(LOOP_PRFM_START);
6642     __ bind(LOOP_PRFM);
6643       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6644     __ bind(LOOP_PRFM_START);
6645       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6646       __ sub(octetCounter, octetCounter, 8);
6647       __ subs(rscratch1, octetCounter, large_loop_threshold);
6648       inflate_and_store_2_fp_registers(true, v3, v4);
6649       inflate_and_store_2_fp_registers(true, v5, v6);
6650       __ br(__ GT, LOOP_PRFM);
6651       __ cmp(octetCounter, (u1)8);
6652       __ br(__ LT, DONE);
6653     __ bind(LOOP);
6654       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6655       __ bind(LOOP_START);
6656       __ sub(octetCounter, octetCounter, 8);
6657       __ cmp(octetCounter, (u1)8);
6658       inflate_and_store_2_fp_registers(false, v3, v4);
6659       inflate_and_store_2_fp_registers(false, v5, v6);
6660       __ br(__ GE, LOOP);
6661     __ bind(DONE);
6662       __ ret(lr);
6663     return entry;
6664   }
6665 
6666   /**
6667    *  Arguments:
6668    *
6669    *  Input:
6670    *  c_rarg0   - current state address
6671    *  c_rarg1   - H key address
6672    *  c_rarg2   - data address
6673    *  c_rarg3   - number of blocks
6674    *
6675    *  Output:
6676    *  Updated state at c_rarg0
6677    */
6678   address generate_ghash_processBlocks() {
6679     // Bafflingly, GCM uses little-endian for the byte order, but
6680     // big-endian for the bit order.  For example, the polynomial 1 is
6681     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6682     //
6683     // So, we must either reverse the bytes in each word and do
6684     // everything big-endian or reverse the bits in each byte and do
6685     // it little-endian.  On AArch64 it's more idiomatic to reverse
6686     // the bits in each byte (we have an instruction, RBIT, to do
6687     // that) and keep the data in little-endian bit order through the
6688     // calculation, bit-reversing the inputs and outputs.
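         //
         // For example, under GCM's convention the polynomial 1 is the byte string
         // 80 00 ... 00; RBIT turns its first byte into 01, so in the bit-reversed
         // representation the same value is simply the little-endian integer 1.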
6689 
6690     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
6691     StubCodeMark mark(this, stub_id);
6692     __ align(wordSize * 2);
6693     address p = __ pc();
6694     __ emit_int64(0x87);  // The low-order bits of the field
6695                           // polynomial (i.e. p = z^7+z^2+z+1)
6696                           // repeated in the low and high parts of a
6697                           // 128-bit vector
6698     __ emit_int64(0x87);
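         // (0x87 = 0b10000111 encodes the coefficients of z^7+z^2+z+1; the full GCM
         // reduction polynomial is z^128 + z^7 + z^2 + z + 1.)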
6699 
6700     __ align(CodeEntryAlignment);
6701     address start = __ pc();
6702 
6703     Register state   = c_rarg0;
6704     Register subkeyH = c_rarg1;
6705     Register data    = c_rarg2;
6706     Register blocks  = c_rarg3;
6707 
6708     FloatRegister vzr = v30;
6709     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6710 
6711     __ ldrq(v24, p);    // The field polynomial
6712 
6713     __ ldrq(v0, Address(state));
6714     __ ldrq(v1, Address(subkeyH));
6715 
6716     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6717     __ rbit(v0, __ T16B, v0);
6718     __ rev64(v1, __ T16B, v1);
6719     __ rbit(v1, __ T16B, v1);
6720 
6721     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6722     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
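         // Karatsuba background: with A = A1:A0 and B = B1:B0 split into 64-bit
         // halves, the 128x128-bit carry-less product needs only three 64x64-bit
         // multiplies, because (A1+A0)*(B1+B0) + A1*B1 + A0*B0 equals the middle
         // term A1*B0 + A0*B1 (addition is XOR in GF(2)). v4 pre-computes (A1+A0)
         // for the subkey so ghash_multiply can use it each iteration.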
6723 
6724     {
6725       Label L_ghash_loop;
6726       __ bind(L_ghash_loop);
6727 
6728       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6729                                                  // reversing each byte
6730       __ rbit(v2, __ T16B, v2);
6731       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6732 
6733       // Multiply state in v2 by subkey in v1
6734       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6735                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6736                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6737       // Reduce v7:v5 by the field polynomial
6738       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6739 
6740       __ sub(blocks, blocks, 1);
6741       __ cbnz(blocks, L_ghash_loop);
6742     }
6743 
6744     // The bit-reversed result is at this point in v0
6745     __ rev64(v0, __ T16B, v0);
6746     __ rbit(v0, __ T16B, v0);
6747 
6748     __ st1(v0, __ T16B, state);
6749     __ ret(lr);
6750 
6751     return start;
6752   }
6753 
6754   address generate_ghash_processBlocks_wide() {
6755     address small = generate_ghash_processBlocks();
6756 
6757     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id;
6758     StubCodeMark mark(this, stub_id);
6759     __ align(wordSize * 2);
6760     address p = __ pc();
6761     __ emit_int64(0x87);  // The low-order bits of the field
6762                           // polynomial (i.e. p = z^7+z^2+z+1)
6763                           // repeated in the low and high parts of a
6764                           // 128-bit vector
6765     __ emit_int64(0x87);
6766 
6767     __ align(CodeEntryAlignment);
6768     address start = __ pc();
6769 
6770     Register state   = c_rarg0;
6771     Register subkeyH = c_rarg1;
6772     Register data    = c_rarg2;
6773     Register blocks  = c_rarg3;
6774 
6775     const int unroll = 4;
6776 
6777     __ cmp(blocks, (unsigned char)(unroll * 2));
6778     __ br(__ LT, small);
6779 
6780     if (unroll > 1) {
6781       // Save state before entering routine
6782       __ sub(sp, sp, 4 * 16);
6783       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6784       __ sub(sp, sp, 4 * 16);
6785       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6786     }
6787 
6788     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6789 
6790     if (unroll > 1) {
6791       // And restore state
6792       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6793       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6794     }
6795 
6796     __ cmp(blocks, (unsigned char)0);
6797     __ br(__ GT, small);
6798 
6799     __ ret(lr);
6800 
6801     return start;
6802   }
6803 
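       // One SIMD round of Base64 encoding: ld3 de-interleaves 3*size input bytes
       // into three registers, each 3-byte group is split into four 6-bit indices,
       // tbl translates the indices through the 64-entry codec table (held in four
       // consecutive vector registers starting at `codec`), and st4 interleaves and
       // stores the 4*size output characters.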
6804   void generate_base64_encode_simdround(Register src, Register dst,
6805         FloatRegister codec, u8 size) {
6806 
6807     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6808     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6809     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6810 
6811     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6812 
6813     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6814 
6815     __ ushr(ind0, arrangement, in0,  2);
6816 
6817     __ ushr(ind1, arrangement, in1,  2);
6818     __ shl(in0,   arrangement, in0,  6);
6819     __ orr(ind1,  arrangement, ind1, in0);
6820     __ ushr(ind1, arrangement, ind1, 2);
6821 
6822     __ ushr(ind2, arrangement, in2,  4);
6823     __ shl(in1,   arrangement, in1,  4);
6824     __ orr(ind2,  arrangement, in1,  ind2);
6825     __ ushr(ind2, arrangement, ind2, 2);
6826 
6827     __ shl(ind3,  arrangement, in2,  2);
6828     __ ushr(ind3, arrangement, ind3, 2);
6829 
6830     __ tbl(out0,  arrangement, codec,  4, ind0);
6831     __ tbl(out1,  arrangement, codec,  4, ind1);
6832     __ tbl(out2,  arrangement, codec,  4, ind2);
6833     __ tbl(out3,  arrangement, codec,  4, ind3);
6834 
6835     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6836   }
6837 
6838   /**
6839    *  Arguments:
6840    *
6841    *  Input:
6842    *  c_rarg0   - src_start
6843    *  c_rarg1   - src_offset
6844    *  c_rarg2   - src_length
6845    *  c_rarg3   - dest_start
6846    *  c_rarg4   - dest_offset
6847    *  c_rarg5   - isURL
6848    *
6849    */
6850   address generate_base64_encodeBlock() {
6851 
6852     static const char toBase64[64] = {
6853       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6854       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6855       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6856       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6857       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6858     };
6859 
6860     static const char toBase64URL[64] = {
6861       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6862       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6863       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6864       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6865       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6866     };
6867 
6868     __ align(CodeEntryAlignment);
6869     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
6870     StubCodeMark mark(this, stub_id);
6871     address start = __ pc();
6872 
6873     Register src   = c_rarg0;  // source array
6874     Register soff  = c_rarg1;  // source start offset
6875     Register send  = c_rarg2;  // source end offset
6876     Register dst   = c_rarg3;  // dest array
6877     Register doff  = c_rarg4;  // position for writing to dest array
6878     Register isURL = c_rarg5;  // Base64 or URL character set
6879 
6880     // c_rarg6 and c_rarg7 are free to use as temps
6881     Register codec  = c_rarg6;
6882     Register length = c_rarg7;
6883 
6884     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6885 
6886     __ add(src, src, soff);
6887     __ add(dst, dst, doff);
6888     __ sub(length, send, soff);
6889 
6890     // load the codec base address
6891     __ lea(codec, ExternalAddress((address) toBase64));
6892     __ cbz(isURL, ProcessData);
6893     __ lea(codec, ExternalAddress((address) toBase64URL));
6894 
6895     __ BIND(ProcessData);
6896 
6897     // too short to form a SIMD loop; fall back to the byte-at-a-time loop
6898     __ cmp(length, (u1)24);
6899     __ br(Assembler::LT, Process3B);
6900 
6901     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6902 
6903     __ BIND(Process48B);
6904     __ cmp(length, (u1)48);
6905     __ br(Assembler::LT, Process24B);
6906     generate_base64_encode_simdround(src, dst, v0, 16);
6907     __ sub(length, length, 48);
6908     __ b(Process48B);
6909 
6910     __ BIND(Process24B);
6911     __ cmp(length, (u1)24);
6912     __ br(Assembler::LT, SIMDExit);
6913     generate_base64_encode_simdround(src, dst, v0, 8);
6914     __ sub(length, length, 24);
6915 
6916     __ BIND(SIMDExit);
6917     __ cbz(length, Exit);
6918 
6919     __ BIND(Process3B);
6920     //  3 src bytes, 24 bits
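         // For example, the bytes "Man" (0x4D 0x61 0x6E) pack into 0x4D616E, whose
         // four 6-bit groups 19, 22, 5 and 46 select "TWFu" from the codec table.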
6921     __ ldrb(r10, __ post(src, 1));
6922     __ ldrb(r11, __ post(src, 1));
6923     __ ldrb(r12, __ post(src, 1));
6924     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6925     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6926     // codec index
6927     __ ubfmw(r15, r12, 18, 23);
6928     __ ubfmw(r14, r12, 12, 17);
6929     __ ubfmw(r13, r12, 6,  11);
6930     __ andw(r12,  r12, 63);
6931     // get the code based on the codec
6932     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6933     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6934     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6935     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6936     __ strb(r15, __ post(dst, 1));
6937     __ strb(r14, __ post(dst, 1));
6938     __ strb(r13, __ post(dst, 1));
6939     __ strb(r12, __ post(dst, 1));
6940     __ sub(length, length, 3);
6941     __ cbnz(length, Process3B);
6942 
6943     __ BIND(Exit);
6944     __ ret(lr);
6945 
6946     return start;
6947   }
6948 
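       // One SIMD round of Base64 decoding: ld4 de-interleaves 4*size input
       // characters, a split table lookup (tbl into the low table, plus tbx into the
       // high table after a saturating subtract against v27, which the caller is
       // expected to have loaded with the split constant) maps each character to its
       // 6-bit value, illegal characters are flagged with unsigned compares, and the
       // 6-bit values are repacked into 3*size output bytes.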
6949   void generate_base64_decode_simdround(Register src, Register dst,
6950         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6951 
6952     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6953     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6954 
6955     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6956     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6957 
6958     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6959 
6960     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6961 
6962     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6963 
6964     // We need an unsigned saturating subtract to make sure all input values
6965     // in the range [0, 63] produce a 0 index for the higher-half lookup
6966     __ uqsubv(decH0, __ T16B, in0, v27);
6967     __ uqsubv(decH1, __ T16B, in1, v27);
6968     __ uqsubv(decH2, __ T16B, in2, v27);
6969     __ uqsubv(decH3, __ T16B, in3, v27);
6970 
6971     // lower half lookup
6972     __ tbl(decL0, arrangement, codecL, 4, in0);
6973     __ tbl(decL1, arrangement, codecL, 4, in1);
6974     __ tbl(decL2, arrangement, codecL, 4, in2);
6975     __ tbl(decL3, arrangement, codecL, 4, in3);
6976 
6977     // higher half lookup
6978     __ tbx(decH0, arrangement, codecH, 4, decH0);
6979     __ tbx(decH1, arrangement, codecH, 4, decH1);
6980     __ tbx(decH2, arrangement, codecH, 4, decH2);
6981     __ tbx(decH3, arrangement, codecH, 4, decH3);
6982 
6983     // combine lower and higher
6984     __ orr(decL0, arrangement, decL0, decH0);
6985     __ orr(decL1, arrangement, decL1, decH1);
6986     __ orr(decL2, arrangement, decL2, decH2);
6987     __ orr(decL3, arrangement, decL3, decH3);
6988 
6989     // check for illegal inputs, i.e. values larger than 63 (the maximum for 6 bits)
6990     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
6991     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
6992     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
6993     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
6994     __ orr(in0, arrangement, decH0, decH1);
6995     __ orr(in1, arrangement, decH2, decH3);
6996     __ orr(in2, arrangement, in0,   in1);
6997     __ umaxv(in3, arrangement, in2);
6998     __ umov(rscratch2, in3, __ B, 0);
6999 
7000     // get the data to output
7001     __ shl(out0,  arrangement, decL0, 2);
7002     __ ushr(out1, arrangement, decL1, 4);
7003     __ orr(out0,  arrangement, out0,  out1);
7004     __ shl(out1,  arrangement, decL1, 4);
7005     __ ushr(out2, arrangement, decL2, 2);
7006     __ orr(out1,  arrangement, out1,  out2);
7007     __ shl(out2,  arrangement, decL2, 6);
7008     __ orr(out2,  arrangement, out2,  decL3);
7009 
7010     __ cbz(rscratch2, NoIllegalData);
7011 
7012     // handle illegal input
7013     __ umov(r10, in2, __ D, 0);
7014     if (size == 16) {
7015       __ cbnz(r10, ErrorInLowerHalf);
7016 
7017       // illegal input is in higher half, store the lower half now.
7018       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
7019 
7020       __ umov(r10, in2,  __ D, 1);
7021       __ umov(r11, out0, __ D, 1);
7022       __ umov(r12, out1, __ D, 1);
7023       __ umov(r13, out2, __ D, 1);
7024       __ b(StoreLegalData);
7025 
7026       __ BIND(ErrorInLowerHalf);
7027     }
7028     __ umov(r11, out0, __ D, 0);
7029     __ umov(r12, out1, __ D, 0);
7030     __ umov(r13, out2, __ D, 0);
7031 
7032     __ BIND(StoreLegalData);
7033     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
7034     __ strb(r11, __ post(dst, 1));
7035     __ strb(r12, __ post(dst, 1));
7036     __ strb(r13, __ post(dst, 1));
7037     __ lsr(r10, r10, 8);
7038     __ lsr(r11, r11, 8);
7039     __ lsr(r12, r12, 8);
7040     __ lsr(r13, r13, 8);
7041     __ b(StoreLegalData);
7042 
7043     __ BIND(NoIllegalData);
7044     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
7045   }
7046 
7047 
7048   /**
7049    *  Arguments:
7050    *
7051    *  Input:
7052    *  c_rarg0   - src_start
7053    *  c_rarg1   - src_offset
7054    *  c_rarg2   - src_length
7055    *  c_rarg3   - dest_start
7056    *  c_rarg4   - dest_offset
7057    *  c_rarg5   - isURL
7058    *  c_rarg6   - isMIME
7059    *
7060    */
7061   address generate_base64_decodeBlock() {
7062 
7063     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
7064     // at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the section
7065     // titled "Base64 decoding".
7066 
7067     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
7068     // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That
7069     // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
7070     static const uint8_t fromBase64ForNoSIMD[256] = {
7071       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7072       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7073       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
7074        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
7075       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
7076        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
7077       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
7078        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
7079       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7080       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7081       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7082       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7083       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7084       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7085       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7086       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7087     };
7088 
7089     static const uint8_t fromBase64URLForNoSIMD[256] = {
7090       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7091       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7092       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
7093        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
7094       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
7095        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
7096       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
7097        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
7098       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7099       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7100       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7101       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7102       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7103       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7104       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7105       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7106     };
7107 
7108     // A legal Base64 code value is in the range [0, 127].  We need two table
7109     // lookups with tbl/tbx and combine the results to get the decoded data. The
7110     // 1st table vector lookup uses tbl: out-of-range indices are set to 0 in the
7111     // destination. The 2nd table vector lookup uses tbx: out-of-range indices leave
7112     // the destination unchanged. Input [64..126] is mapped to index [65, 127] in the
7113     // second lookup. The value at index 64 is set to 0, so that we know the decoded
7114     // data was already obtained by the 1st lookup.
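    //
    // As a rough C sketch (illustrative only, not the generated code), decoding one
    // input byte against the 128-entry SIMD table below works like this; the helper
    // name decode_one and the scalar flow are assumptions for exposition:
    //
    //   uint8_t decode_one(uint8_t in, const uint8_t tab[128]) {
    //     uint8_t lo  = (in < 64) ? tab[in] : 0;             // tbl over tab[0..63]
    //     uint8_t idx = (in > 63) ? (uint8_t)(in - 63) : 0;  // uqsub in, #63
    //     uint8_t hi  = (idx < 64) ? tab[64 + idx] : idx;    // tbx over tab[64..127]
    //     uint8_t d   = lo | hi;                             // combine both halves
    //     return d;  // any d > 63 marks the input byte as an illegal character
    //   }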
7115     static const uint8_t fromBase64ForSIMD[128] = {
7116       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7117       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7118       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
7119        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
7120         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
7121        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
7122       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
7123        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
7124     };
7125 
7126     static const uint8_t fromBase64URLForSIMD[128] = {
7127       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7128       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7129       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
7130        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
7131         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
7132        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
7133        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
7134        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
7135     };
7136 
7137     __ align(CodeEntryAlignment);
7138     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
7139     StubCodeMark mark(this, stub_id);
7140     address start = __ pc();
7141 
7142     Register src    = c_rarg0;  // source array
7143     Register soff   = c_rarg1;  // source start offset
7144     Register send   = c_rarg2;  // source end offset
7145     Register dst    = c_rarg3;  // dest array
7146     Register doff   = c_rarg4;  // position for writing to dest array
7147     Register isURL  = c_rarg5;  // Base64 or URL character set
7148     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
7149 
7150     Register length = send;    // reuse send as length of source data to process
7151 
7152     Register simd_codec   = c_rarg6;
7153     Register nosimd_codec = c_rarg7;
7154 
7155     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
7156 
7157     __ enter();
7158 
7159     __ add(src, src, soff);
7160     __ add(dst, dst, doff);
7161 
7162     __ mov(doff, dst);
7163 
7164     __ sub(length, send, soff);
7165     __ bfm(length, zr, 0, 1);   // clear the low two bits: only whole 4-character groups are decoded
7166 
7167     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
7168     __ cbz(isURL, ProcessData);
7169     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
7170 
7171     __ BIND(ProcessData);
7172     __ mov(rscratch1, length);
7173     __ cmp(length, (u1)144); // 144 = 80 + 64
7174     __ br(Assembler::LT, Process4B);
7175 
7176     // In the MIME case, the line length cannot be more than 76
7177     // bytes (see RFC 2045). This is too short a block for SIMD
7178     // to be worthwhile, so we use non-SIMD here.
7179     __ movw(rscratch1, 79);
7180 
7181     __ BIND(Process4B);
7182     __ ldrw(r14, __ post(src, 4));
7183     __ ubfxw(r10, r14, 0,  8);
7184     __ ubfxw(r11, r14, 8,  8);
7185     __ ubfxw(r12, r14, 16, 8);
7186     __ ubfxw(r13, r14, 24, 8);
7187     // look up the decoded value for each of the four characters
7188     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
7189     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
7190     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
7191     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
7192     // error detection, 255u indicates an illegal input
7193     __ orrw(r14, r10, r11);
7194     __ orrw(r15, r12, r13);
7195     __ orrw(r14, r14, r15);
7196     __ tbnz(r14, 7, Exit);
7197     // recover the data
7198     __ lslw(r14, r10, 10);
7199     __ bfiw(r14, r11, 4, 6);
7200     __ bfmw(r14, r12, 2, 5);
7201     __ rev16w(r14, r14);
7202     __ bfiw(r13, r12, 6, 2);
7203     __ strh(r14, __ post(dst, 2));
7204     __ strb(r13, __ post(dst, 1));
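    // In C terms, the block above is the standard 4-character -> 3-byte Base64
    // recombination (a sketch only; the stub uses bitfield moves plus a rev16
    // so the first two output bytes can be stored with a single strh):
    //
    //   b0 = (uint8_t)((c0 << 2) | (c1 >> 4));
    //   b1 = (uint8_t)((c1 << 4) | (c2 >> 2));
    //   b2 = (uint8_t)((c2 << 6) |  c3);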
7205     // non-simd loop
7206     __ subsw(rscratch1, rscratch1, 4);
7207     __ br(Assembler::GT, Process4B);
7208 
7209     // If we arrived here via the MIME pre-processing path above (rscratch1 was
7210     // set to 79), rscratch1 == -1 at this point; otherwise rscratch1 == 0.
7211     __ cbzw(rscratch1, Exit);
7212     __ sub(length, length, 80);
7213 
7214     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
7215     __ cbz(isURL, SIMDEnter);
7216     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
7217 
7218     __ BIND(SIMDEnter);
7219     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
7220     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
7221     __ mov(rscratch1, 63);
7222     __ dup(v27, __ T16B, rscratch1);
7223 
7224     __ BIND(Process64B);
7225     __ cmp(length, (u1)64);
7226     __ br(Assembler::LT, Process32B);
7227     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
7228     __ sub(length, length, 64);
7229     __ b(Process64B);
7230 
7231     __ BIND(Process32B);
7232     __ cmp(length, (u1)32);
7233     __ br(Assembler::LT, SIMDExit);
7234     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
7235     __ sub(length, length, 32);
7236     __ b(Process32B);
7237 
7238     __ BIND(SIMDExit);
7239     __ cbz(length, Exit);
7240     __ movw(rscratch1, length);
7241     __ b(Process4B);
7242 
7243     __ BIND(Exit);
7244     __ sub(c_rarg0, dst, doff);
7245 
7246     __ leave();
7247     __ ret(lr);
7248 
7249     return start;
7250   }
7251 
7252   // Support for spin waits.
7253   address generate_spin_wait() {
7254     __ align(CodeEntryAlignment);
7255     StubGenStubId stub_id = StubGenStubId::spin_wait_id;
7256     StubCodeMark mark(this, stub_id);
7257     address start = __ pc();
7258 
7259     __ spin_wait();
7260     __ ret(lr);
7261 
7262     return start;
7263   }
7264 
7265   void generate_lookup_secondary_supers_table_stub() {
7266     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
7267     StubCodeMark mark(this, stub_id);
7268 
7269     const Register
7270       r_super_klass  = r0,
7271       r_array_base   = r1,
7272       r_array_length = r2,
7273       r_array_index  = r3,
7274       r_sub_klass    = r4,
7275       r_bitmap       = rscratch2,
7276       result         = r5;
7277     const FloatRegister
7278       vtemp          = v0;
7279 
7280     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
7281       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
7282       Label L_success;
7283       __ enter();
7284       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
7285                                              r_array_base, r_array_length, r_array_index,
7286                                              vtemp, result, slot,
7287                                              /*stub_is_near*/true);
7288       __ leave();
7289       __ ret(lr);
7290     }
7291   }
7292 
7293   // Slow path implementation for UseSecondarySupersTable.
7294   address generate_lookup_secondary_supers_table_slow_path_stub() {
7295     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
7296     StubCodeMark mark(this, stub_id);
7297 
7298     address start = __ pc();
7299     const Register
7300       r_super_klass  = r0,        // argument
7301       r_array_base   = r1,        // argument
7302       temp1          = r2,        // temp
7303       r_array_index  = r3,        // argument
7304       r_bitmap       = rscratch2, // argument
7305       result         = r5;        // argument
7306 
7307     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
7308     __ ret(lr);
7309 
7310     return start;
7311   }
7312 
7313 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
7314 
7315   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
7316   //
7317   // If LSE is in use, generate LSE versions of all the stubs. The
7318   // non-LSE versions are in atomic_aarch64.S.
7319 
7320   // class AtomicStubMark records the entry point of a stub and the
7321   // stub pointer which will point to it. The stub pointer is set to
7322   // the entry point when ~AtomicStubMark() is called, which must be
7323   // after ICache::invalidate_range. This ensures safe publication of
7324   // the generated code.
7325   class AtomicStubMark {
7326     address _entry_point;
7327     aarch64_atomic_stub_t *_stub;
7328     MacroAssembler *_masm;
7329   public:
7330     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
7331       _masm = masm;
7332       __ align(32);
7333       _entry_point = __ pc();
7334       _stub = stub;
7335     }
7336     ~AtomicStubMark() {
7337       *_stub = (aarch64_atomic_stub_t)_entry_point;
7338     }
7339   };
7340 
7341   // NB: For memory_order_conservative we need a trailing membar after
7342   // LSE atomic operations but not a leading membar.
7343   //
7344   // We don't need a leading membar because a clause in the Arm ARM
7345   // says:
7346   //
7347   //   Barrier-ordered-before
7348   //
7349   //   Barrier instructions order prior Memory effects before subsequent
7350   //   Memory effects generated by the same Observer. A read or a write
7351   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
7352   //   Observer if and only if RW1 appears in program order before RW2
7353   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
7354   //   instruction with both Acquire and Release semantics.
7355   //
7356   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
7357   // and Release semantics, therefore we don't need a leading
7358   // barrier. However, there is no corresponding Barrier-ordered-after
7359   // relationship, therefore we need a trailing membar to prevent a
7360   // later store or load from being reordered with the store in an
7361   // atomic instruction.
7362   //
7363   // This was checked by using the herd7 consistency model simulator
7364   // (http://diy.inria.fr/) with this test case:
7365   //
7366   // AArch64 LseCas
7367   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
7368   // P0 | P1;
7369   // LDR W4, [X2] | MOV W3, #0;
7370   // DMB LD       | MOV W4, #1;
7371   // LDR W3, [X1] | CASAL W3, W4, [X1];
7372   //              | DMB ISH;
7373   //              | STR W4, [X2];
7374   // exists
7375   // (0:X3=0 /\ 0:X4=1)
7376   //
7377   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
7378   // with the store to x in P1. Without the DMB in P1 this may happen.
7379   //
7380   // At the time of writing we don't know of any AArch64 hardware that
7381   // reorders stores in this way, but the Reference Manual permits it.
7382 
7383   void gen_cas_entry(Assembler::operand_size size,
7384                      atomic_memory_order order) {
7385     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
7386       exchange_val = c_rarg2;
7387     bool acquire, release;
7388     switch (order) {
7389       case memory_order_relaxed:
7390         acquire = false;
7391         release = false;
7392         break;
7393       case memory_order_release:
7394         acquire = false;
7395         release = true;
7396         break;
7397       default:
7398         acquire = true;
7399         release = true;
7400         break;
7401     }
7402     __ mov(prev, compare_val);
7403     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
7404     if (order == memory_order_conservative) {
7405       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7406     }
7407     if (size == Assembler::xword) {
7408       __ mov(r0, prev);
7409     } else {
7410       __ movw(r0, prev);
7411     }
7412     __ ret(lr);
7413   }
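
  // Semantically, each generated CAS stub behaves roughly like the C sketch
  // below (illustrative only; mapping memory_order_conservative onto
  // __ATOMIC_SEQ_CST plus the trailing barrier is an approximation, not a
  // claim of exact equivalence):
  //
  //   uint64_t cmpxchg(uint64_t* ptr, uint64_t compare_val, uint64_t exchange_val) {
  //     uint64_t prev = compare_val;
  //     __atomic_compare_exchange_n(ptr, &prev, exchange_val, /*weak*/false,
  //                                 __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  //     return prev;  // old value at *ptr, whether or not the exchange happened
  //   }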
7414 
7415   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
7416     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
7417     // If not relaxed, then default to conservative.  Relaxed is the only
7418     // case we use enough to be worth specializing.
7419     if (order == memory_order_relaxed) {
7420       __ ldadd(size, incr, prev, addr);
7421     } else {
7422       __ ldaddal(size, incr, prev, addr);
7423       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7424     }
7425     if (size == Assembler::xword) {
7426       __ mov(r0, prev);
7427     } else {
7428       __ movw(r0, prev);
7429     }
7430     __ ret(lr);
7431   }
7432 
7433   void gen_swpal_entry(Assembler::operand_size size) {
7434     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
7435     __ swpal(size, incr, prev, addr);
7436     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7437     if (size == Assembler::xword) {
7438       __ mov(r0, prev);
7439     } else {
7440       __ movw(r0, prev);
7441     }
7442     __ ret(lr);
7443   }
7444 
7445   void generate_atomic_entry_points() {
7446     if (! UseLSE) {
7447       return;
7448     }
7449     __ align(CodeEntryAlignment);
7450     StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id;
7451     StubCodeMark mark(this, stub_id);
7452     address first_entry = __ pc();
7453 
7454     // ADD, memory_order_conservative
7455     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
7456     gen_ldadd_entry(Assembler::word, memory_order_conservative);
7457     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
7458     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
7459 
7460     // ADD, memory_order_relaxed
7461     AtomicStubMark mark_fetch_add_4_relaxed
7462       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
7463     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
7464     AtomicStubMark mark_fetch_add_8_relaxed
7465       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
7466     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
7467 
7468     // XCHG, memory_order_conservative
7469     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
7470     gen_swpal_entry(Assembler::word);
7471     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
7472     gen_swpal_entry(Assembler::xword);
7473 
7474     // CAS, memory_order_conservative
7475     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
7476     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
7477     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
7478     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
7479     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
7480     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
7481 
7482     // CAS, memory_order_relaxed
7483     AtomicStubMark mark_cmpxchg_1_relaxed
7484       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
7485     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
7486     AtomicStubMark mark_cmpxchg_4_relaxed
7487       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
7488     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
7489     AtomicStubMark mark_cmpxchg_8_relaxed
7490       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
7491     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
7492 
7493     AtomicStubMark mark_cmpxchg_4_release
7494       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
7495     gen_cas_entry(MacroAssembler::word, memory_order_release);
7496     AtomicStubMark mark_cmpxchg_8_release
7497       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
7498     gen_cas_entry(MacroAssembler::xword, memory_order_release);
7499 
7500     AtomicStubMark mark_cmpxchg_4_seq_cst
7501       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
7502     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
7503     AtomicStubMark mark_cmpxchg_8_seq_cst
7504       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
7505     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
7506 
7507     ICache::invalidate_range(first_entry, __ pc() - first_entry);
7508   }
7509 #endif // LINUX
7510 
7511   address generate_cont_thaw(Continuation::thaw_kind kind) {
7512     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
7513     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
7514 
7515     address start = __ pc();
7516 
7517     if (return_barrier) {
7518       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7519       __ mov(sp, rscratch1);
7520     }
7521     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7522 
7523     if (return_barrier) {
7524       // preserve possible return value from a method returning to the return barrier
7525       __ fmovd(rscratch1, v0);
7526       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7527     }
7528 
7529     __ movw(c_rarg1, (return_barrier ? 1 : 0));
7530     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
7531     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
7532 
7533     if (return_barrier) {
7534       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7535       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7536       __ fmovd(v0, rscratch1);
7537     }
7538     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7539 
7540 
7541     Label thaw_success;
7542     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
7543     __ cbnz(rscratch2, thaw_success);
7544     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
7545     __ br(rscratch1);
7546     __ bind(thaw_success);
7547 
7548     // make room for the thawed frames
7549     __ sub(rscratch1, sp, rscratch2);
7550     __ andr(rscratch1, rscratch1, -16); // align
7551     __ mov(sp, rscratch1);
7552 
7553     if (return_barrier) {
7554       // save original return value -- again
7555       __ fmovd(rscratch1, v0);
7556       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7557     }
7558 
7559     // If we want, we can templatize thaw by kind, and have three different entries
7560     __ movw(c_rarg1, (uint32_t)kind);
7561 
7562     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
7563     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
7564 
7565     if (return_barrier) {
7566       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7567       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7568       __ fmovd(v0, rscratch1);
7569     } else {
7570       __ mov(r0, zr); // return 0 (success) from doYield
7571     }
7572 
7573     // we're now on the yield frame (which is at a higher address, because sp has been moved down)
7574     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
7575     __ mov(rfp, sp);
7576 
7577     if (return_barrier_exception) {
7578       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
7579       __ authenticate_return_address(c_rarg1);
7580       __ verify_oop(r0);
7581       // save return value containing the exception oop in callee-saved R19
7582       __ mov(r19, r0);
7583 
7584       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
7585 
7586       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
7587       // __ reinitialize_ptrue();
7588 
7589       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7590 
7591       __ mov(r1, r0); // the exception handler
7592       __ mov(r0, r19); // restore return value containing the exception oop
7593       __ verify_oop(r0);
7594 
7595       __ leave();
7596       __ mov(r3, lr);
7597       __ br(r1); // the exception handler
7598     } else {
7599       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7600       __ leave();
7601       __ ret(lr);
7602     }
7603 
7604     return start;
7605   }
7606 
7607   address generate_cont_thaw() {
7608     if (!Continuations::enabled()) return nullptr;
7609 
7610     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
7611     StubCodeMark mark(this, stub_id);
7612     address start = __ pc();
7613     generate_cont_thaw(Continuation::thaw_top);
7614     return start;
7615   }
7616 
7617   address generate_cont_returnBarrier() {
7618     if (!Continuations::enabled()) return nullptr;
7619 
7620     // TODO: will probably need multiple return barriers depending on return type
7621     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
7622     StubCodeMark mark(this, stub_id);
7623     address start = __ pc();
7624 
7625     generate_cont_thaw(Continuation::thaw_return_barrier);
7626 
7627     return start;
7628   }
7629 
7630   address generate_cont_returnBarrier_exception() {
7631     if (!Continuations::enabled()) return nullptr;
7632 
7633     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
7634     StubCodeMark mark(this, stub_id);
7635     address start = __ pc();
7636 
7637     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7638 
7639     return start;
7640   }
7641 
7642   address generate_cont_preempt_stub() {
7643     if (!Continuations::enabled()) return nullptr;
7644     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
7645     StubCodeMark mark(this, stub_id);
7646     address start = __ pc();
7647 
7648     __ reset_last_Java_frame(true);
7649 
7650     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
7651     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
7652     __ mov(sp, rscratch2);
7653 
7654     Label preemption_cancelled;
7655     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
7656     __ cbnz(rscratch1, preemption_cancelled);
7657 
7658     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
7659     SharedRuntime::continuation_enter_cleanup(_masm);
7660     __ leave();
7661     __ ret(lr);
7662 
7663     // We acquired the monitor after freezing the frames so call thaw to continue execution.
7664     __ bind(preemption_cancelled);
7665     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
7666     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
7667     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
7668     __ ldr(rscratch1, Address(rscratch1));
7669     __ br(rscratch1);
7670 
7671     return start;
7672   }
7673 
7674   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
7675   // are represented as long[5], with BITS_PER_LIMB = 26.
7676   // Pack five 26-bit limbs into three 64-bit registers.
7677   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
7678     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
7679     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
7680     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
7681     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
7682 
7683     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
7684     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
7685     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
7686     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
7687 
7688     if (dest2->is_valid()) {
7689       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7690     } else {
7691 #ifdef ASSERT
7692       Label OK;
7693       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7694       __ br(__ EQ, OK);
7695       __ stop("high bits of Poly1305 integer should be zero");
7696       __ should_not_reach_here();
7697       __ bind(OK);
7698 #endif
7699     }
7700   }
7701 
7702   // As above, but return only a 128-bit integer, packed into two
7703   // 64-bit registers.
7704   void pack_26(Register dest0, Register dest1, Register src) {
7705     pack_26(dest0, dest1, noreg, src);
7706   }
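
  // In C, approximately (a sketch of the packing only, not the generated code):
  // given the five 26-bit limbs src[0..4] of a 130-bit value,
  //
  //   dest0 = src[0] | (src[1] << 26) | (src[2] << 52);          // bits   0..63
  //   dest1 = (src[2] >> 12) | (src[3] << 14) | (src[4] << 40);  // bits  64..127
  //   dest2 = src[4] >> 24;                                      // bits 128..129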
7707 
7708   // Multiply and multiply-accumulate unsigned 64-bit registers.
7709   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
7710     __ mul(prod_lo, n, m);
7711     __ umulh(prod_hi, n, m);
7712   }
7713   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
7714     wide_mul(rscratch1, rscratch2, n, m);
7715     __ adds(sum_lo, sum_lo, rscratch1);
7716     __ adc(sum_hi, sum_hi, rscratch2);
7717   }
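
  // In C, approximately (a sketch using unsigned __int128; illustrative only):
  //
  //   void wide_mul(uint64_t& lo, uint64_t& hi, uint64_t n, uint64_t m) {
  //     unsigned __int128 p = (unsigned __int128)n * m;
  //     lo = (uint64_t)p;
  //     hi = (uint64_t)(p >> 64);
  //   }
  //   void wide_madd(uint64_t& sum_lo, uint64_t& sum_hi, uint64_t n, uint64_t m) {
  //     unsigned __int128 s = ((unsigned __int128)sum_hi << 64) | sum_lo;
  //     s += (unsigned __int128)n * m;   // wraps mod 2^128, just like adds/adc
  //     sum_lo = (uint64_t)s;
  //     sum_hi = (uint64_t)(s >> 64);
  //   }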
7718 
7719   // Poly1305, RFC 7539
7720 
7721   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
7722   // description of the tricks used to simplify and accelerate this
7723   // computation.
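  //
  // In outline (a sketch of the reference algorithm, not of the register-level
  // code below): with p = 2^130 - 5, r the clamped key half and acc the running
  // accumulator,
  //
  //   for (each 16-byte block m of the message) {
  //     acc = acc + (m + 2^128);   // the padding bit appended to a full block
  //     acc = (acc * r) mod p;     // multiply and (partially) reduce
  //   }
  //
  // The code below keeps acc and r packed in 64-bit registers, performs only a
  // partial reduction inside the loop, and writes acc back as five 26-bit limbs;
  // the final addition of the key's s half is not done in this stub.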
7724 
7725   address generate_poly1305_processBlocks() {
7726     __ align(CodeEntryAlignment);
7727     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
7728     StubCodeMark mark(this, stub_id);
7729     address start = __ pc();
7730     Label here;
7731     __ enter();
7732     RegSet callee_saved = RegSet::range(r19, r28);
7733     __ push(callee_saved, sp);
7734 
7735     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
7736 
7737     // Arguments
7738     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
7739 
7740     // R_n is the 128-bit randomly-generated key, packed into two
7741     // registers.  The caller passes this key to us as long[5], with
7742     // BITS_PER_LIMB = 26.
7743     const Register R_0 = *++regs, R_1 = *++regs;
7744     pack_26(R_0, R_1, r_start);
7745 
7746     // RR_n is (R_n >> 2) * 5
7747     const Register RR_0 = *++regs, RR_1 = *++regs;
7748     __ lsr(RR_0, R_0, 2);
7749     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
7750     __ lsr(RR_1, R_1, 2);
7751     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
7752 
7753     // U_n is the current checksum
7754     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
7755     pack_26(U_0, U_1, U_2, acc_start);
7756 
7757     static constexpr int BLOCK_LENGTH = 16;
7758     Label DONE, LOOP;
7759 
7760     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7761     __ br(Assembler::LT, DONE); {
7762       __ bind(LOOP);
7763 
7764       // S_n is to be the sum of U_n and the next block of data
7765       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7766       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
7767       __ adds(S_0, U_0, S_0);
7768       __ adcs(S_1, U_1, S_1);
7769       __ adc(S_2, U_2, zr);
7770       __ add(S_2, S_2, 1);    // add 2^128: the padding bit appended to each full 16-byte block
7771 
7772       const Register U_0HI = *++regs, U_1HI = *++regs;
7773 
7774       // NB: this logic depends on some of the special properties of
7775       // Poly1305 keys. In particular, because we know that the top
7776       // four bits of R_0 and R_1 are zero, we can add together
7777       // partial products without any risk of needing to propagate a
7778       // carry out.
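      // A sketch of the bound (for exposition only, not checked here): clamping
      // gives R_0, R_1 < 2^60, hence RR_0, RR_1 = (R_n >> 2) * 5 < 2^61, while
      // S_0, S_1 < 2^64 and S_2 is tiny, so each sum of three partial products
      // below stays well under 2^128 and the 128-bit accumulation in wide_madd
      // cannot overflow.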
7779       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
7780       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
7781       __ andr(U_2, R_0, 3);
7782       __ mul(U_2, S_2, U_2);
7783 
7784       // Recycle registers S_0, S_1, S_2
7785       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
7786 
7787       // Partial reduction mod 2**130 - 5
7788       __ adds(U_1, U_0HI, U_1);
7789       __ adc(U_2, U_1HI, U_2);
7790       // Sum now in U_2:U_1:U_0.
7791       // Dead: U_0HI, U_1HI.
7792       regs = (regs.remaining() + U_0HI + U_1HI).begin();
7793 
7794       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
7795 
7796       // First, U_2:U_1:U_0 += (U_2 >> 2)
7797       __ lsr(rscratch1, U_2, 2);
7798       __ andr(U_2, U_2, (u8)3);
7799       __ adds(U_0, U_0, rscratch1);
7800       __ adcs(U_1, U_1, zr);
7801       __ adc(U_2, U_2, zr);
7802       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
7803       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
7804       __ adcs(U_1, U_1, zr);
7805       __ adc(U_2, U_2, zr);
7806 
7807       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
7808       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7809       __ br(~ Assembler::LT, LOOP);
7810     }
7811 
7812     // Further reduce modulo 2^130 - 5
7813     __ lsr(rscratch1, U_2, 2);
7814     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
7815     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
7816     __ adcs(U_1, U_1, zr);
7817     __ andr(U_2, U_2, (u1)3);
7818     __ adc(U_2, U_2, zr);
7819 
7820     // Unpack the sum into five 26-bit limbs and write to memory.
7821     __ ubfiz(rscratch1, U_0, 0, 26);
7822     __ ubfx(rscratch2, U_0, 26, 26);
7823     __ stp(rscratch1, rscratch2, Address(acc_start));
7824     __ ubfx(rscratch1, U_0, 52, 12);
7825     __ bfi(rscratch1, U_1, 12, 14);
7826     __ ubfx(rscratch2, U_1, 14, 26);
7827     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
7828     __ ubfx(rscratch1, U_1, 40, 24);
7829     __ bfi(rscratch1, U_2, 24, 3);
7830     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
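    // i.e., the inverse of pack_26 (a sketch):
    //   acc[0] =  U_0        & ((1 << 26) - 1);
    //   acc[1] = (U_0 >> 26) & ((1 << 26) - 1);
    //   acc[2] = (U_0 >> 52) | ((U_1 & 0x3fff) << 12);
    //   acc[3] = (U_1 >> 14) & ((1 << 26) - 1);
    //   acc[4] = (U_1 >> 40) | ((U_2 & 0x7) << 24);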
7831 
7832     __ bind(DONE);
7833     __ pop(callee_saved, sp);
7834     __ leave();
7835     __ ret(lr);
7836 
7837     return start;
7838   }
7839 
7840   // exception handler for upcall stubs
7841   address generate_upcall_stub_exception_handler() {
7842     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
7843     StubCodeMark mark(this, stub_id);
7844     address start = __ pc();
7845 
7846     // Native caller has no idea how to handle exceptions,
7847     // so we just crash here. Up to callee to catch exceptions.
7848     __ verify_oop(r0);
7849     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
7850     __ blr(rscratch1);
7851     __ should_not_reach_here();
7852 
7853     return start;
7854   }
7855 
7856   // load Method* target of MethodHandle
7857   // j_rarg0 = jobject receiver
7858   // rmethod = result
7859   address generate_upcall_stub_load_target() {
7860     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
7861     StubCodeMark mark(this, stub_id);
7862     address start = __ pc();
7863 
7864     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
7865     // Load target method from receiver
7866     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
7867     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
7868     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
7869     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
7870                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
7871                       noreg, noreg);
7872     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
7873 
7874     __ ret(lr);
7875 
7876     return start;
7877   }
7878 
7879 #undef __
7880 #define __ masm->
7881 
7882   class MontgomeryMultiplyGenerator : public MacroAssembler {
7883 
7884     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7885       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7886 
7887     RegSet _toSave;
7888     bool _squaring;
7889 
7890   public:
7891     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7892       : MacroAssembler(as->code()), _squaring(squaring) {
7893 
7894       // Register allocation
7895 
7896       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7897       Pa_base = *regs;       // Argument registers
7898       if (squaring)
7899         Pb_base = Pa_base;
7900       else
7901         Pb_base = *++regs;
7902       Pn_base = *++regs;
7903       Rlen = *++regs;
7904       inv = *++regs;
7905       Pm_base = *++regs;
7906 
7907                           // Working registers:
7908       Ra =  *++regs;        // The current digit of a, b, n, and m.
7909       Rb =  *++regs;
7910       Rm =  *++regs;
7911       Rn =  *++regs;
7912 
7913       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7914       Pb =  *++regs;
7915       Pm =  *++regs;
7916       Pn =  *++regs;
7917 
7918       t0 =  *++regs;        // Three registers which form a
7919       t1 =  *++regs;        // triple-precision accumulator.
7920       t2 =  *++regs;
7921 
7922       Ri =  *++regs;        // Inner and outer loop indexes.
7923       Rj =  *++regs;
7924 
7925       Rhi_ab = *++regs;     // Product registers: low and high parts
7926       Rlo_ab = *++regs;     // of a*b and m*n.
7927       Rhi_mn = *++regs;
7928       Rlo_mn = *++regs;
7929 
7930       // r19 and up are callee-saved.
7931       _toSave = RegSet::range(r19, *regs) + Pm_base;
7932     }
7933 
7934   private:
7935     void save_regs() {
7936       push(_toSave, sp);
7937     }
7938 
7939     void restore_regs() {
7940       pop(_toSave, sp);
7941     }
7942 
7943     template <typename T>
7944     void unroll_2(Register count, T block) {
7945       Label loop, end, odd;
7946       tbnz(count, 0, odd);
7947       cbz(count, end);
7948       align(16);
7949       bind(loop);
7950       (this->*block)();
7951       bind(odd);
7952       (this->*block)();
7953       subs(count, count, 2);
7954       br(Assembler::GT, loop);
7955       bind(end);
7956     }
7957 
7958     template <typename T>
7959     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7960       Label loop, end, odd;
7961       tbnz(count, 0, odd);
7962       cbz(count, end);
7963       align(16);
7964       bind(loop);
7965       (this->*block)(d, s, tmp);
7966       bind(odd);
7967       (this->*block)(d, s, tmp);
7968       subs(count, count, 2);
7969       br(Assembler::GT, loop);
7970       bind(end);
7971     }
7972 
7973     void pre1(RegisterOrConstant i) {
7974       block_comment("pre1");
7975       // Pa = Pa_base;
7976       // Pb = Pb_base + i;
7977       // Pm = Pm_base;
7978       // Pn = Pn_base + i;
7979       // Ra = *Pa;
7980       // Rb = *Pb;
7981       // Rm = *Pm;
7982       // Rn = *Pn;
7983       ldr(Ra, Address(Pa_base));
7984       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7985       ldr(Rm, Address(Pm_base));
7986       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7987       lea(Pa, Address(Pa_base));
7988       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7989       lea(Pm, Address(Pm_base));
7990       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7991 
7992       // Zero the m*n result.
7993       mov(Rhi_mn, zr);
7994       mov(Rlo_mn, zr);
7995     }
7996 
7997     // The core multiply-accumulate step of a Montgomery
7998     // multiplication.  The idea is to schedule operations as a
7999     // pipeline so that instructions with long latencies (loads and
8000     // multiplies) have time to complete before their results are
8001     // used.  This most benefits in-order implementations of the
8002     // architecture but out-of-order ones also benefit.
8003     void step() {
8004       block_comment("step");
8005       // MACC(Ra, Rb, t0, t1, t2);
8006       // Ra = *++Pa;
8007       // Rb = *--Pb;
8008       umulh(Rhi_ab, Ra, Rb);
8009       mul(Rlo_ab, Ra, Rb);
8010       ldr(Ra, pre(Pa, wordSize));
8011       ldr(Rb, pre(Pb, -wordSize));
8012       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
8013                                        // previous iteration.
8014       // MACC(Rm, Rn, t0, t1, t2);
8015       // Rm = *++Pm;
8016       // Rn = *--Pn;
8017       umulh(Rhi_mn, Rm, Rn);
8018       mul(Rlo_mn, Rm, Rn);
8019       ldr(Rm, pre(Pm, wordSize));
8020       ldr(Rn, pre(Pn, -wordSize));
8021       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
8022     }
8023 
8024     void post1() {
8025       block_comment("post1");
8026 
8027       // MACC(Ra, Rb, t0, t1, t2);
8028       // Ra = *++Pa;
8029       // Rb = *--Pb;
8030       umulh(Rhi_ab, Ra, Rb);
8031       mul(Rlo_ab, Ra, Rb);
8032       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
8033       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
8034 
8035       // *Pm = Rm = t0 * inv;
8036       mul(Rm, t0, inv);
8037       str(Rm, Address(Pm));
8038 
8039       // MACC(Rm, Rn, t0, t1, t2);
8040       // t0 = t1; t1 = t2; t2 = 0;
8041       umulh(Rhi_mn, Rm, Rn);
8042 
8043 #ifndef PRODUCT
8044       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
8045       {
8046         mul(Rlo_mn, Rm, Rn);
8047         add(Rlo_mn, t0, Rlo_mn);
8048         Label ok;
8049         cbz(Rlo_mn, ok); {
8050           stop("broken Montgomery multiply");
8051         } bind(ok);
8052       }
8053 #endif
8054       // We have very carefully set things up so that
8055       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
8056       // the lower half of Rm * Rn because we know the result already:
8057       // it must be -t0.  t0 + (-t0) must generate a carry iff
8058       // t0 != 0.  So, rather than do a mul and an adds we just set
8059       // the carry flag iff t0 is nonzero.
8060       //
8061       // mul(Rlo_mn, Rm, Rn);
8062       // adds(zr, t0, Rlo_mn);
8063       subs(zr, t0, 1); // Set carry iff t0 is nonzero
8064       adcs(t0, t1, Rhi_mn);
8065       adc(t1, t2, zr);
8066       mov(t2, zr);
8067     }
8068 
8069     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
8070       block_comment("pre2");
8071       // Pa = Pa_base + i-len;
8072       // Pb = Pb_base + len;
8073       // Pm = Pm_base + i-len;
8074       // Pn = Pn_base + len;
8075 
8076       if (i.is_register()) {
8077         sub(Rj, i.as_register(), len);
8078       } else {
8079         mov(Rj, i.as_constant());
8080         sub(Rj, Rj, len);
8081       }
8082       // Rj == i-len
8083 
8084       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
8085       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
8086       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
8087       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
8088 
8089       // Ra = *++Pa;
8090       // Rb = *--Pb;
8091       // Rm = *++Pm;
8092       // Rn = *--Pn;
8093       ldr(Ra, pre(Pa, wordSize));
8094       ldr(Rb, pre(Pb, -wordSize));
8095       ldr(Rm, pre(Pm, wordSize));
8096       ldr(Rn, pre(Pn, -wordSize));
8097 
8098       mov(Rhi_mn, zr);
8099       mov(Rlo_mn, zr);
8100     }
8101 
8102     void post2(RegisterOrConstant i, RegisterOrConstant len) {
8103       block_comment("post2");
8104       if (i.is_constant()) {
8105         mov(Rj, i.as_constant()-len.as_constant());
8106       } else {
8107         sub(Rj, i.as_register(), len);
8108       }
8109 
8110       adds(t0, t0, Rlo_mn); // The pending m*n, low part
8111 
8112       // As soon as we know the least significant digit of our result,
8113       // store it.
8114       // Pm_base[i-len] = t0;
8115       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
8116 
8117       // t0 = t1; t1 = t2; t2 = 0;
8118       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
8119       adc(t1, t2, zr);
8120       mov(t2, zr);
8121     }
8122 
8123     // A carry in t0 after Montgomery multiplication means that we
8124     // should subtract multiples of n from our result in m.  We'll
8125     // keep doing that until there is no carry.
8126     void normalize(RegisterOrConstant len) {
8127       block_comment("normalize");
8128       // while (t0)
8129       //   t0 = sub(Pm_base, Pn_base, t0, len);
8130       Label loop, post, again;
8131       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
8132       cbz(t0, post); {
8133         bind(again); {
8134           mov(i, zr);
8135           mov(cnt, len);
8136           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
8137           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
8138           subs(zr, zr, zr); // set carry flag, i.e. no borrow
8139           align(16);
8140           bind(loop); {
8141             sbcs(Rm, Rm, Rn);
8142             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
8143             add(i, i, 1);
8144             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
8145             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
8146             sub(cnt, cnt, 1);
8147           } cbnz(cnt, loop);
8148           sbc(t0, t0, zr);
8149         } cbnz(t0, again);
8150       } bind(post);
8151     }
8152 
8153     // Move memory at s to d, reversing words.
8154     //    Increments d to end of copied memory
8155     //    Destroys tmp1, tmp2
8156     //    Preserves len
8157     //    Leaves s pointing to the address which was in d at start
8158     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
8159       assert(tmp1->encoding() < r19->encoding(), "register corruption");
8160       assert(tmp2->encoding() < r19->encoding(), "register corruption");
8161 
8162       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
8163       mov(tmp1, len);
8164       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
8165       sub(s, d, len, ext::uxtw, LogBytesPerWord);
8166     }
8167     // where
8168     void reverse1(Register d, Register s, Register tmp) {
8169       ldr(tmp, pre(s, -wordSize));
8170       ror(tmp, tmp, 32);
8171       str(tmp, post(d, wordSize));
8172     }
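
    // In C, approximately (a sketch of the data movement only; it ignores the
    // register side effects described above):
    //
    //   static void reverse(julong *d, const julong *s, int len) {
    //     for (int i = 0; i < len; i++) {
    //       julong w = s[len - 1 - i];
    //       d[i] = (w << 32) | (w >> 32);   // swap the two 32-bit halves
    //     }
    //   }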
8173 
8174     void step_squaring() {
8175       // An extra ACC
8176       step();
8177       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
8178     }
8179 
8180     void last_squaring(RegisterOrConstant i) {
8181       Label dont;
8182       // if ((i & 1) == 0) {
8183       tbnz(i.as_register(), 0, dont); {
8184         // MACC(Ra, Rb, t0, t1, t2);
8185         // Ra = *++Pa;
8186         // Rb = *--Pb;
8187         umulh(Rhi_ab, Ra, Rb);
8188         mul(Rlo_ab, Ra, Rb);
8189         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
8190       } bind(dont);
8191     }
8192 
8193     void extra_step_squaring() {
8194       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
8195 
8196       // MACC(Rm, Rn, t0, t1, t2);
8197       // Rm = *++Pm;
8198       // Rn = *--Pn;
8199       umulh(Rhi_mn, Rm, Rn);
8200       mul(Rlo_mn, Rm, Rn);
8201       ldr(Rm, pre(Pm, wordSize));
8202       ldr(Rn, pre(Pn, -wordSize));
8203     }
8204 
8205     void post1_squaring() {
8206       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
8207 
8208       // *Pm = Rm = t0 * inv;
8209       mul(Rm, t0, inv);
8210       str(Rm, Address(Pm));
8211 
8212       // MACC(Rm, Rn, t0, t1, t2);
8213       // t0 = t1; t1 = t2; t2 = 0;
8214       umulh(Rhi_mn, Rm, Rn);
8215 
8216 #ifndef PRODUCT
8217       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
8218       {
8219         mul(Rlo_mn, Rm, Rn);
8220         add(Rlo_mn, t0, Rlo_mn);
8221         Label ok;
8222         cbz(Rlo_mn, ok); {
8223           stop("broken Montgomery multiply");
8224         } bind(ok);
8225       }
8226 #endif
8227       // We have very carefully set things up so that
8228       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
8229       // the lower half of Rm * Rn because we know the result already:
8230       // it must be -t0.  t0 + (-t0) must generate a carry iff
8231       // t0 != 0.  So, rather than do a mul and an adds we just set
8232       // the carry flag iff t0 is nonzero.
8233       //
8234       // mul(Rlo_mn, Rm, Rn);
8235       // adds(zr, t0, Rlo_mn);
8236       subs(zr, t0, 1); // Set carry iff t0 is nonzero
8237       adcs(t0, t1, Rhi_mn);
8238       adc(t1, t2, zr);
8239       mov(t2, zr);
8240     }
8241 
8242     void acc(Register Rhi, Register Rlo,
8243              Register t0, Register t1, Register t2) {
8244       adds(t0, t0, Rlo);
8245       adcs(t1, t1, Rhi);
8246       adc(t2, t2, zr);
8247     }
8248 
8249   public:
8250     /**
8251      * Fast Montgomery multiplication.  The derivation of the
8252      * algorithm is in A Cryptographic Library for the Motorola
8253      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
8254      *
8255      * Arguments:
8256      *
8257      * Inputs for multiplication:
8258      *   c_rarg0   - int array elements a
8259      *   c_rarg1   - int array elements b
8260      *   c_rarg2   - int array elements n (the modulus)
8261      *   c_rarg3   - int length
8262      *   c_rarg4   - int inv
8263      *   c_rarg5   - int array elements m (the result)
8264      *
8265      * Inputs for squaring:
8266      *   c_rarg0   - int array elements a
8267      *   c_rarg1   - int array elements n (the modulus)
8268      *   c_rarg2   - int length
8269      *   c_rarg3   - int inv
8270      *   c_rarg4   - int array elements m (the result)
8271      *
8272      */
8273     address generate_multiply() {
8274       Label argh, nothing;
8275       bind(argh);
8276       stop("MontgomeryMultiply total_allocation must be <= 8192");
8277 
8278       align(CodeEntryAlignment);
8279       address entry = pc();
8280 
8281       cbzw(Rlen, nothing);
8282 
8283       enter();
8284 
8285       // Make room.
8286       cmpw(Rlen, 512);
8287       br(Assembler::HI, argh);
8288       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8289       andr(sp, Ra, -2 * wordSize);
8290 
8291       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8292 
8293       {
8294         // Copy input args, reversing as we go.  We use Ra as a
8295         // temporary variable.
8296         reverse(Ra, Pa_base, Rlen, t0, t1);
8297         if (!_squaring)
8298           reverse(Ra, Pb_base, Rlen, t0, t1);
8299         reverse(Ra, Pn_base, Rlen, t0, t1);
8300       }
8301 
8302       // Push all call-saved registers and also Pm_base which we'll need
8303       // at the end.
8304       save_regs();
8305 
8306 #ifndef PRODUCT
8307       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
8308       {
8309         ldr(Rn, Address(Pn_base, 0));
8310         mul(Rlo_mn, Rn, inv);
8311         subs(zr, Rlo_mn, -1);
8312         Label ok;
8313         br(EQ, ok); {
8314           stop("broken inverse in Montgomery multiply");
8315         } bind(ok);
8316       }
8317 #endif
8318 
8319       mov(Pm_base, Ra);
8320 
8321       mov(t0, zr);
8322       mov(t1, zr);
8323       mov(t2, zr);
8324 
8325       block_comment("for (int i = 0; i < len; i++) {");
8326       mov(Ri, zr); {
8327         Label loop, end;
8328         cmpw(Ri, Rlen);
8329         br(Assembler::GE, end);
8330 
8331         bind(loop);
8332         pre1(Ri);
8333 
8334         block_comment("  for (j = i; j; j--) {"); {
8335           movw(Rj, Ri);
8336           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
8337         } block_comment("  } // j");
8338 
8339         post1();
8340         addw(Ri, Ri, 1);
8341         cmpw(Ri, Rlen);
8342         br(Assembler::LT, loop);
8343         bind(end);
8344         block_comment("} // i");
8345       }
8346 
8347       block_comment("for (int i = len; i < 2*len; i++) {");
8348       mov(Ri, Rlen); {
8349         Label loop, end;
8350         cmpw(Ri, Rlen, Assembler::LSL, 1);
8351         br(Assembler::GE, end);
8352 
8353         bind(loop);
8354         pre2(Ri, Rlen);
8355 
8356         block_comment("  for (j = len*2-i-1; j; j--) {"); {
8357           lslw(Rj, Rlen, 1);
8358           subw(Rj, Rj, Ri);
8359           subw(Rj, Rj, 1);
8360           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
8361         } block_comment("  } // j");
8362 
8363         post2(Ri, Rlen);
8364         addw(Ri, Ri, 1);
8365         cmpw(Ri, Rlen, Assembler::LSL, 1);
8366         br(Assembler::LT, loop);
8367         bind(end);
8368       }
8369       block_comment("} // i");
8370 
8371       normalize(Rlen);
8372 
8373       mov(Ra, Pm_base);  // Save Pm_base in Ra
8374       restore_regs();  // Restore caller's Pm_base
8375 
8376       // Copy our result into caller's Pm_base
8377       reverse(Pm_base, Ra, Rlen, t0, t1);
8378 
8379       leave();
8380       bind(nothing);
8381       ret(lr);
8382 
8383       return entry;
8384     }
8385     // In C, approximately:
8386 
8387     // void
8388     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
8389     //                     julong Pn_base[], julong Pm_base[],
8390     //                     julong inv, int len) {
8391     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8392     //   julong *Pa, *Pb, *Pn, *Pm;
8393     //   julong Ra, Rb, Rn, Rm;
8394 
8395     //   int i;
8396 
8397     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8398 
8399     //   for (i = 0; i < len; i++) {
8400     //     int j;
8401 
8402     //     Pa = Pa_base;
8403     //     Pb = Pb_base + i;
8404     //     Pm = Pm_base;
8405     //     Pn = Pn_base + i;
8406 
8407     //     Ra = *Pa;
8408     //     Rb = *Pb;
8409     //     Rm = *Pm;
8410     //     Rn = *Pn;
8411 
8412     //     int iters = i;
8413     //     for (j = 0; iters--; j++) {
8414     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8415     //       MACC(Ra, Rb, t0, t1, t2);
8416     //       Ra = *++Pa;
8417     //       Rb = *--Pb;
8418     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8419     //       MACC(Rm, Rn, t0, t1, t2);
8420     //       Rm = *++Pm;
8421     //       Rn = *--Pn;
8422     //     }
8423 
8424     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
8425     //     MACC(Ra, Rb, t0, t1, t2);
8426     //     *Pm = Rm = t0 * inv;
8427     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8428     //     MACC(Rm, Rn, t0, t1, t2);
8429 
8430     //     assert(t0 == 0, "broken Montgomery multiply");
8431 
8432     //     t0 = t1; t1 = t2; t2 = 0;
8433     //   }
8434 
8435     //   for (i = len; i < 2*len; i++) {
8436     //     int j;
8437 
8438     //     Pa = Pa_base + i-len;
8439     //     Pb = Pb_base + len;
8440     //     Pm = Pm_base + i-len;
8441     //     Pn = Pn_base + len;
8442 
8443     //     Ra = *++Pa;
8444     //     Rb = *--Pb;
8445     //     Rm = *++Pm;
8446     //     Rn = *--Pn;
8447 
8448     //     int iters = len*2-i-1;
8449     //     for (j = i-len+1; iters--; j++) {
8450     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8451     //       MACC(Ra, Rb, t0, t1, t2);
8452     //       Ra = *++Pa;
8453     //       Rb = *--Pb;
8454     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8455     //       MACC(Rm, Rn, t0, t1, t2);
8456     //       Rm = *++Pm;
8457     //       Rn = *--Pn;
8458     //     }
8459 
8460     //     Pm_base[i-len] = t0;
8461     //     t0 = t1; t1 = t2; t2 = 0;
8462     //   }
8463 
8464     //   while (t0)
8465     //     t0 = sub(Pm_base, Pn_base, t0, len);
8466     // }
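    //
    // where MACC multiplies two 64-bit digits and accumulates the 128-bit
    // product into the triple-precision accumulator t2:t1:t0.  A C++ sketch
    // (illustrative only; the stub emits umulh/mul plus adds/adcs/adc):
    //
    //   static void MACC(julong a, julong b, julong &t0, julong &t1, julong &t2) {
    //     unsigned __int128 p  = (unsigned __int128)a * b;
    //     unsigned __int128 lo = (unsigned __int128)t0 + (julong)p;
    //     unsigned __int128 hi = (unsigned __int128)t1 + (julong)(p >> 64) + (julong)(lo >> 64);
    //     t0 = (julong)lo;
    //     t1 = (julong)hi;
    //     t2 += (julong)(hi >> 64);
    //   }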
8467 
8468     /**
8469      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
8470      * multiplies than Montgomery multiplication so it should be up to
8471      * 25% faster.  However, its loop control is more complex and it
8472      * may actually run slower on some machines.
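     *
     * A rough accounting of that claim: an n-word Montgomery multiply
     * performs about n^2 a*b word products plus n^2 reduction (m*n)
     * products, roughly 2*n^2 in total.  When squaring, each cross
     * product a[i]*a[j] with i != j is computed once and doubled
     * (MACC2), which roughly halves the first n^2, so the total falls
     * to about 1.5*n^2, i.e. asymptotically 25% fewer multiplies.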
8473      *
8474      * Arguments:
8475      *
8476      * Inputs:
8477      *   c_rarg0   - int array elements a
8478      *   c_rarg1   - int array elements n (the modulus)
8479      *   c_rarg2   - int length
8480      *   c_rarg3   - int inv
8481      *   c_rarg4   - int array elements m (the result)
8482      *
8483      */
8484     address generate_square() {
8485       Label argh;
8486       bind(argh);
8487       stop("MontgomeryMultiply total_allocation must be <= 8192");
8488 
8489       align(CodeEntryAlignment);
8490       address entry = pc();
8491 
8492       enter();
8493 
8494       // Make room.
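      // Rlen is still the int-array length here; the sub() below reserves
      // Rlen * 4 * sizeof(jint) == Rlen * 16 bytes of stack, so the
      // Rlen <= 512 check bounds the allocation at 8192 bytes, matching
      // the stop() message at "argh" above.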
8495       cmpw(Rlen, 512);
8496       br(Assembler::HI, argh);
8497       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8498       andr(sp, Ra, -2 * wordSize);
8499 
8500       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8501 
8502       {
8503         // Copy input args, reversing as we go.  We use Ra as a
8504         // temporary variable.
8505         reverse(Ra, Pa_base, Rlen, t0, t1);
8506         reverse(Ra, Pn_base, Rlen, t0, t1);
8507       }
8508 
8509       // Push all call-saved registers and also Pm_base which we'll need
8510       // at the end.
8511       save_regs();
8512 
8513       mov(Pm_base, Ra);
8514 
8515       mov(t0, zr);
8516       mov(t1, zr);
8517       mov(t2, zr);
8518 
8519       block_comment("for (int i = 0; i < len; i++) {");
8520       mov(Ri, zr); {
8521         Label loop, end;
8522         bind(loop);
8523         cmp(Ri, Rlen);
8524         br(Assembler::GE, end);
8525 
8526         pre1(Ri);
8527 
8528         block_comment("for (j = (i+1)/2; j; j--) {"); {
8529           add(Rj, Ri, 1);
8530           lsr(Rj, Rj, 1);
8531           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8532         } block_comment("  } // j");
8533 
8534         last_squaring(Ri);
8535 
8536         block_comment("  for (j = i/2; j; j--) {"); {
8537           lsr(Rj, Ri, 1);
8538           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8539         } block_comment("  } // j");
8540 
8541         post1_squaring();
8542         add(Ri, Ri, 1);
8543         cmp(Ri, Rlen);
8544         br(Assembler::LT, loop);
8545 
8546         bind(end);
8547         block_comment("} // i");
8548       }
8549 
8550       block_comment("for (int i = len; i < 2*len; i++) {");
8551       mov(Ri, Rlen); {
8552         Label loop, end;
8553         bind(loop);
8554         cmp(Ri, Rlen, Assembler::LSL, 1);
8555         br(Assembler::GE, end);
8556 
8557         pre2(Ri, Rlen);
8558 
8559         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
8560           lsl(Rj, Rlen, 1);
8561           sub(Rj, Rj, Ri);
8562           sub(Rj, Rj, 1);
8563           lsr(Rj, Rj, 1);
8564           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8565         } block_comment("  } // j");
8566 
8567         last_squaring(Ri);
8568 
8569         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
8570           lsl(Rj, Rlen, 1);
8571           sub(Rj, Rj, Ri);
8572           lsr(Rj, Rj, 1);
8573           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8574         } block_comment("  } // j");
8575 
8576         post2(Ri, Rlen);
8577         add(Ri, Ri, 1);
8578         cmp(Ri, Rlen, Assembler::LSL, 1);
8579 
8580         br(Assembler::LT, loop);
8581         bind(end);
8582         block_comment("} // i");
8583       }
8584 
8585       normalize(Rlen);
8586 
8587       mov(Ra, Pm_base);  // Save Pm_base in Ra
8588       restore_regs();  // Restore caller's Pm_base
8589 
8590       // Copy our result into caller's Pm_base
8591       reverse(Pm_base, Ra, Rlen, t0, t1);
8592 
8593       leave();
8594       ret(lr);
8595 
8596       return entry;
8597     }
8598     // In C, approximately:
8599 
8600     // void
8601     // montgomery_square(julong Pa_base[], julong Pn_base[],
8602     //                   julong Pm_base[], julong inv, int len) {
8603     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8604     //   julong *Pa, *Pb, *Pn, *Pm;
8605     //   julong Ra, Rb, Rn, Rm;
8606 
8607     //   int i;
8608 
8609     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8610 
8611     //   for (i = 0; i < len; i++) {
8612     //     int j;
8613 
8614     //     Pa = Pa_base;
8615     //     Pb = Pa_base + i;
8616     //     Pm = Pm_base;
8617     //     Pn = Pn_base + i;
8618 
8619     //     Ra = *Pa;
8620     //     Rb = *Pb;
8621     //     Rm = *Pm;
8622     //     Rn = *Pn;
8623 
8624     //     int iters = (i+1)/2;
8625     //     for (j = 0; iters--; j++) {
8626     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8627     //       MACC2(Ra, Rb, t0, t1, t2);
8628     //       Ra = *++Pa;
8629     //       Rb = *--Pb;
8630     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8631     //       MACC(Rm, Rn, t0, t1, t2);
8632     //       Rm = *++Pm;
8633     //       Rn = *--Pn;
8634     //     }
8635     //     if ((i & 1) == 0) {
8636     //       assert(Ra == Pa_base[j], "must be");
8637     //       MACC(Ra, Ra, t0, t1, t2);
8638     //     }
8639     //     iters = i/2;
8640     //     assert(iters == i-j, "must be");
8641     //     for (; iters--; j++) {
8642     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8643     //       MACC(Rm, Rn, t0, t1, t2);
8644     //       Rm = *++Pm;
8645     //       Rn = *--Pn;
8646     //     }
8647 
8648     //     *Pm = Rm = t0 * inv;
8649     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8650     //     MACC(Rm, Rn, t0, t1, t2);
8651 
8652     //     assert(t0 == 0, "broken Montgomery multiply");
8653 
8654     //     t0 = t1; t1 = t2; t2 = 0;
8655     //   }
8656 
8657     //   for (i = len; i < 2*len; i++) {
8658     //     int start = i-len+1;
8659     //     int end = start + (len - start)/2;
8660     //     int j;
8661 
8662     //     Pa = Pa_base + i-len;
8663     //     Pb = Pa_base + len;
8664     //     Pm = Pm_base + i-len;
8665     //     Pn = Pn_base + len;
8666 
8667     //     Ra = *++Pa;
8668     //     Rb = *--Pb;
8669     //     Rm = *++Pm;
8670     //     Rn = *--Pn;
8671 
8672     //     int iters = (2*len-i-1)/2;
8673     //     assert(iters == end-start, "must be");
8674     //     for (j = start; iters--; j++) {
8675     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8676     //       MACC2(Ra, Rb, t0, t1, t2);
8677     //       Ra = *++Pa;
8678     //       Rb = *--Pb;
8679     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8680     //       MACC(Rm, Rn, t0, t1, t2);
8681     //       Rm = *++Pm;
8682     //       Rn = *--Pn;
8683     //     }
8684     //     if ((i & 1) == 0) {
8685     //       assert(Ra == Pa_base[j], "must be");
8686     //       MACC(Ra, Ra, t0, t1, t2);
8687     //     }
8688     //     iters =  (2*len-i)/2;
8689     //     assert(iters == len-j, "must be");
8690     //     for (; iters--; j++) {
8691     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8692     //       MACC(Rm, Rn, t0, t1, t2);
8693     //       Rm = *++Pm;
8694     //       Rn = *--Pn;
8695     //     }
8696     //     Pm_base[i-len] = t0;
8697     //     t0 = t1; t1 = t2; t2 = 0;
8698     //   }
8699 
8700     //   while (t0)
8701     //     t0 = sub(Pm_base, Pn_base, t0, len);
8702     // }
8703   };
8704 
8705   void generate_vector_math_stubs() {
8706     // Get native vector math stub routine addresses
8707     void* libsleef = nullptr;
8708     char ebuf[1024];
8709     char dll_name[JVM_MAXPATHLEN];
8710     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
8711       libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
8712     }
8713     if (libsleef == nullptr) {
8714       log_info(library)("Failed to load native vector math library, %s!", ebuf);
8715       return;
8716     }
8717     // Method naming convention
8718     //   All the methods are named as <OP><T><N>_<U><suffix>
8719     //   Where:
8720     //     <OP>     is the operation name, e.g. sin
8721     //     <T>      is optional to indicate float/double
8722     //              "f/d" for vector float/double operation
8723     //     <N>      is the number of elements in the vector
8724     //              "2/4" for neon, and "x" for sve
8725     //     <U>      is the precision level
8726     //              "u10/u05" represents 1.0/0.5 ULP error bounds
8727     //               We use "u10" for all operations by default
8728     //               But for those functions do not have u10 support, we use "u05" instead
8729     //     <suffix> indicates neon/sve
8730     //              "sve/advsimd" for sve/neon implementations
8731     //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions
8732     //          cosd2_u10advsimd is the method for computing 2 elements vector double cos using NEON instructions
8733     //
8734     log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
8735 
8736     // Math vector stubs implemented with SVE for scalable vector size.
8737     if (UseSVE > 0) {
8738       for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
8739         int vop = VectorSupport::VECTOR_OP_MATH_START + op;
8740         // Skip "tanh" because there is performance regression
8741         if (vop == VectorSupport::VECTOR_OP_TANH) {
8742           continue;
8743         }
8744 
        // The native library does not provide a u10 variant of "hypot".
8746         const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
8747 
8748         snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
8749         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
8750 
8751         snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
8752         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
8753       }
8754     }
8755 
8756     // Math vector stubs implemented with NEON for 64/128 bits vector size.
8757     for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
8758       int vop = VectorSupport::VECTOR_OP_MATH_START + op;
8759       // Skip "tanh" because there is performance regression
8760       if (vop == VectorSupport::VECTOR_OP_TANH) {
8761         continue;
8762       }
8763 
      // The native library does not provide a u10 variant of "hypot".
8765       const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
8766 
8767       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
8768       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
8769 
8770       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
8771       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
8772 
8773       snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
8774       StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
8775     }
8776   }
8777 
8778   // Initialization
8779   void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points
8781 
    // Entry points that exist on all platforms. Note: This is code
8783     // that could be shared among different platforms - however the
8784     // benefit seems to be smaller than the disadvantage of having a
8785     // much more complicated generator structure. See also comment in
8786     // stubRoutines.hpp.
8787 
8788     StubRoutines::_forward_exception_entry = generate_forward_exception();
8789 
8790     StubRoutines::_call_stub_entry =
8791       generate_call_stub(StubRoutines::_call_stub_return_address);
8792 
8793     // is referenced by megamorphic call
8794     StubRoutines::_catch_exception_entry = generate_catch_exception();
8795 
8796     // Initialize table for copy memory (arraycopy) check.
8797     if (UnsafeMemoryAccess::_table == nullptr) {
8798       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
8799     }
8800 
8801     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it
8803       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8804       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8805     }
8806 
8807     if (UseCRC32CIntrinsics) {
8808       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8809     }
8810 
8811     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8812       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8813     }
8814 
8815     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8816       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8817     }
8818 
8819     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
8820         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
8821       StubRoutines::_hf2f = generate_float16ToFloat();
8822       StubRoutines::_f2hf = generate_floatToFloat16();
8823     }
8824   }
8825 
8826   void generate_continuation_stubs() {
8827     // Continuation stubs:
8828     StubRoutines::_cont_thaw          = generate_cont_thaw();
8829     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8830     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8831     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
8832   }
8833 
8834   void generate_final_stubs() {
8835     // support for verify_oop (must happen after universe_init)
8836     if (VerifyOops) {
8837       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8838     }
8839 
8840     // arraycopy stubs used by compilers
8841     generate_arraycopy_stubs();
8842 
8843     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8844     if (bs_nm != nullptr) {
8845       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
8846     }
8847 
8848     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8849 
8850     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
8851     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
8852 
8853 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8854 
8855     generate_atomic_entry_points();
8856 
8857 #endif // LINUX
8858 
8859 #ifdef COMPILER2
8860     if (UseSecondarySupersTable) {
8861       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
8863         generate_lookup_secondary_supers_table_stub();
8864       }
8865     }
8866 #endif
8867 
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
8869   }
8870 
8871   void generate_compiler_stubs() {
8872 #if COMPILER2_OR_JVMCI
8873 
8874     if (UseSVE == 0) {
8875       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
8876     }
8877 
8878     // array equals stub for large arrays.
8879     if (!UseSimpleArrayEquals) {
8880       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8881     }
8882 
    // arrays_hashcode stub for large arrays.
8884     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
8885     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
8886     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
8887     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
8888     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
8889 
8890     // byte_array_inflate stub for large arrays.
8891     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8892 
8893     // countPositives stub for large arrays.
8894     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8895 
8896     generate_compare_long_strings();
8897 
8898     generate_string_indexof_stubs();
8899 
8900 #ifdef COMPILER2
8901     if (UseMultiplyToLenIntrinsic) {
8902       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8903     }
8904 
8905     if (UseSquareToLenIntrinsic) {
8906       StubRoutines::_squareToLen = generate_squareToLen();
8907     }
8908 
8909     if (UseMulAddIntrinsic) {
8910       StubRoutines::_mulAdd = generate_mulAdd();
8911     }
8912 
8913     if (UseSIMDForBigIntegerShiftIntrinsics) {
8914       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8915       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8916     }
8917 
8918     if (UseMontgomeryMultiplyIntrinsic) {
8919       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
8920       StubCodeMark mark(this, stub_id);
8921       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8922       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8923     }
8924 
8925     if (UseMontgomerySquareIntrinsic) {
8926       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
8927       StubCodeMark mark(this, stub_id);
8928       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8929       // We use generate_multiply() rather than generate_square()
8930       // because it's faster for the sizes of modulus we care about.
8931       StubRoutines::_montgomerySquare = g.generate_multiply();
8932     }
8933 
8934     generate_vector_math_stubs();
8935 
8936 #endif // COMPILER2
8937 
8938     if (UseChaCha20Intrinsics) {
8939       StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
8940     }
8941 
8942     if (UseBASE64Intrinsics) {
8943         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8944         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8945     }
8946 
8947     // data cache line writeback
8948     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8949     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8950 
8951     if (UseAESIntrinsics) {
8952       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8953       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8954       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8955       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8956       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8957     }
8958     if (UseGHASHIntrinsics) {
8959       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8960       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8961     }
8962     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8963       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8964     }
8965 
8966     if (UseMD5Intrinsics) {
8967       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
8968       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
8969     }
8970     if (UseSHA1Intrinsics) {
8971       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
8972       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
8973     }
8974     if (UseSHA256Intrinsics) {
8975       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
8976       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
8977     }
8978     if (UseSHA512Intrinsics) {
8979       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
8980       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
8981     }
8982     if (UseSHA3Intrinsics) {
8983       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
8984       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
8985     }
8986 
8987     if (UsePoly1305Intrinsics) {
8988       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
8989     }
8990 
8991     // generate Adler32 intrinsics code
8992     if (UseAdler32Intrinsics) {
8993       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8994     }
8995 
8996 #endif // COMPILER2_OR_JVMCI
8997   }
8998 
8999  public:
9000   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
9001     switch(blob_id) {
9002     case initial_id:
9003       generate_initial_stubs();
9004       break;
    case continuation_id:
9006       generate_continuation_stubs();
9007       break;
9008     case compiler_id:
9009       generate_compiler_stubs();
9010       break;
9011     case final_id:
9012       generate_final_stubs();
9013       break;
9014     default:
9015       fatal("unexpected blob id: %d", blob_id);
9016       break;
9017     };
9018   }
9019 }; // end class declaration
9020 
9021 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
9022   StubGenerator g(code, blob_id);
9023 }
9024 
9025 
9026 #if defined (LINUX)
9027 
9028 // Define pointers to atomic stubs and initialize them to point to the
9029 // code in atomic_aarch64.S.
9030 
9031 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
9032   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
9033     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
9034   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
9035     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
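
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to approximately:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// The _default_impl symbols live in atomic_aarch64.S; the _impl pointers
// may later be redirected to the stubs emitted by generate_atomic_entry_points().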
9036 
9037 DEFAULT_ATOMIC_OP(fetch_add, 4, )
9038 DEFAULT_ATOMIC_OP(fetch_add, 8, )
9039 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
9040 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
9041 DEFAULT_ATOMIC_OP(xchg, 4, )
9042 DEFAULT_ATOMIC_OP(xchg, 8, )
9043 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
9044 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
9045 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
9046 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
9047 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
9048 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
9049 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
9050 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
9051 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
9052 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
9053 
9054 #undef DEFAULT_ATOMIC_OP
9055 
9056 #endif // LINUX