1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "prims/upcallLinker.hpp"
  45 #include "runtime/atomic.hpp"
  46 #include "runtime/continuation.hpp"
  47 #include "runtime/continuationEntry.inline.hpp"
  48 #include "runtime/frame.inline.hpp"
  49 #include "runtime/handles.inline.hpp"
  50 #include "runtime/javaThread.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/stubCodeGenerator.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "utilities/align.hpp"
  55 #include "utilities/checkedCast.hpp"
  56 #include "utilities/globalDefinitions.hpp"
  57 #include "utilities/powerOfTwo.hpp"
  58 #ifdef COMPILER2
  59 #include "opto/runtime.hpp"
  60 #endif
  61 #if INCLUDE_ZGC
  62 #include "gc/z/zThreadLocalData.hpp"
  63 #endif
  64 
  65 // Declaration and definition of StubGenerator (no .hpp file).
  66 // For a more detailed description of the stub routine structure
  67 // see the comment in stubRoutines.hpp
  68 
  69 #undef __
  70 #define __ _masm->
  71 
  72 #ifdef PRODUCT
  73 #define BLOCK_COMMENT(str) /* nothing */
  74 #else
  75 #define BLOCK_COMMENT(str) __ block_comment(str)
  76 #endif
  77 
  78 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  79 
  80 // Stub Code definitions
  81 
  82 class StubGenerator: public StubCodeGenerator {
  83  private:
  84 
  85 #ifdef PRODUCT
  86 #define inc_counter_np(counter) ((void)0)
  87 #else
  88   void inc_counter_np_(uint& counter) {
  89     __ lea(rscratch2, ExternalAddress((address)&counter));
  90     __ ldrw(rscratch1, Address(rscratch2));
  91     __ addw(rscratch1, rscratch1, 1);
  92     __ strw(rscratch1, Address(rscratch2));
  93   }
  94 #define inc_counter_np(counter) \
  95   BLOCK_COMMENT("inc_counter " #counter); \
  96   inc_counter_np_(counter);
  97 #endif
  98 
  99   // Call stubs are used to call Java from C
 100   //
 101   // Arguments:
 102   //    c_rarg0:   call wrapper address                   address
 103   //    c_rarg1:   result                                 address
 104   //    c_rarg2:   result type                            BasicType
 105   //    c_rarg3:   method                                 Method*
 106   //    c_rarg4:   (interpreter) entry point              address
 107   //    c_rarg5:   parameters                             intptr_t*
 108   //    c_rarg6:   parameter size (in words)              int
 109   //    c_rarg7:   thread                                 Thread*
 110   //
 111   // There is no return from the stub itself as any Java result
 112   // is written to result
 113   //
 114   // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, installing sp (r31)
  // into fp.
 117   //
 118   // we save r0-r7, which accounts for all the c arguments.
 119   //
 120   // TODO: strictly do we need to save them all? they are treated as
 121   // volatile by C so could we omit saving the ones we are going to
 122   // place in global registers (thread? method?) or those we only use
 123   // during setup of the Java call?
 124   //
 125   // we don't need to save r8 which C uses as an indirect result location
 126   // return register.
 127   //
 128   // we don't need to save r9-r15 which both C and Java treat as
 129   // volatile
 130   //
 131   // we don't need to save r16-18 because Java does not use them
 132   //
 133   // we save r19-r28 which Java uses as scratch registers and C
 134   // expects to be callee-save
 135   //
 136   // we save the bottom 64 bits of each value stored in v8-v15; it is
 137   // the responsibility of the caller to preserve larger values.
 138   //
 139   // so the stub frame looks like this when we enter Java code
 140   //
 141   //     [ return_from_Java     ] <--- sp
 142   //     [ argument word n      ]
 143   //      ...
 144   // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ] <--- sp_after_call
  // -26 [ saved v15            ]
 147   // -25 [ saved v14            ]
 148   // -24 [ saved v13            ]
 149   // -23 [ saved v12            ]
 150   // -22 [ saved v11            ]
 151   // -21 [ saved v10            ]
 152   // -20 [ saved v9             ]
 153   // -19 [ saved v8             ]
 154   // -18 [ saved r28            ]
 155   // -17 [ saved r27            ]
 156   // -16 [ saved r26            ]
 157   // -15 [ saved r25            ]
 158   // -14 [ saved r24            ]
 159   // -13 [ saved r23            ]
 160   // -12 [ saved r22            ]
 161   // -11 [ saved r21            ]
 162   // -10 [ saved r20            ]
 163   //  -9 [ saved r19            ]
 164   //  -8 [ call wrapper    (r0) ]
 165   //  -7 [ result          (r1) ]
 166   //  -6 [ result type     (r2) ]
 167   //  -5 [ method          (r3) ]
 168   //  -4 [ entry point     (r4) ]
 169   //  -3 [ parameters      (r5) ]
 170   //  -2 [ parameter size  (r6) ]
 171   //  -1 [ thread (r7)          ]
 172   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 173   //   1 [ saved lr       (r30) ]
 174 
 175   // Call stub stack layout word offsets from fp
 176   enum call_stub_layout {
 177     sp_after_call_off  = -28,
 178 
 179     fpcr_off           = sp_after_call_off,
 180     d15_off            = -26,
 181     d13_off            = -24,
 182     d11_off            = -22,
 183     d9_off             = -20,
 184 
 185     r28_off            = -18,
 186     r26_off            = -16,
 187     r24_off            = -14,
 188     r22_off            = -12,
 189     r20_off            = -10,
 190     call_wrapper_off   =  -8,
 191     result_off         =  -7,
 192     result_type_off    =  -6,
 193     method_off         =  -5,
 194     entry_point_off    =  -4,
 195     parameter_size_off =  -2,
 196     thread_off         =  -1,
 197     fp_f               =   0,
 198     retaddr_off        =   1,
 199   };
 200 
 201   address generate_call_stub(address& return_address) {
 202     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 203            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 204            "adjust this code");
 205 
 206     StubCodeMark mark(this, "StubRoutines", "call_stub");
 207     address start = __ pc();
 208 
 209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
 210 
 211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
 212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 213     const Address result        (rfp, result_off         * wordSize);
 214     const Address result_type   (rfp, result_type_off    * wordSize);
 215     const Address method        (rfp, method_off         * wordSize);
 216     const Address entry_point   (rfp, entry_point_off    * wordSize);
 217     const Address parameter_size(rfp, parameter_size_off * wordSize);
 218 
 219     const Address thread        (rfp, thread_off         * wordSize);
 220 
 221     const Address d15_save      (rfp, d15_off * wordSize);
 222     const Address d13_save      (rfp, d13_off * wordSize);
 223     const Address d11_save      (rfp, d11_off * wordSize);
 224     const Address d9_save       (rfp, d9_off * wordSize);
 225 
 226     const Address r28_save      (rfp, r28_off * wordSize);
 227     const Address r26_save      (rfp, r26_off * wordSize);
 228     const Address r24_save      (rfp, r24_off * wordSize);
 229     const Address r22_save      (rfp, r22_off * wordSize);
 230     const Address r20_save      (rfp, r20_off * wordSize);
 231 
 232     // stub code
 233 
 234     address aarch64_entry = __ pc();
 235 
 236     // set up frame and move sp to end of save area
 237     __ enter();
 238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
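    // n.b. sp now equals sp_after_call: it points at the fpcr save slot,
    // 28 words (-sp_after_call_off) below the new fp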
 239 
 240     // save register parameters and Java scratch/global registers
 241     // n.b. we save thread even though it gets installed in
 242     // rthread because we want to sanity check rthread later
 243     __ str(c_rarg7,  thread);
 244     __ strw(c_rarg6, parameter_size);
 245     __ stp(c_rarg4, c_rarg5,  entry_point);
 246     __ stp(c_rarg2, c_rarg3,  result_type);
 247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 248 
 249     __ stp(r20, r19,   r20_save);
 250     __ stp(r22, r21,   r22_save);
 251     __ stp(r24, r23,   r24_save);
 252     __ stp(r26, r25,   r26_save);
 253     __ stp(r28, r27,   r28_save);
 254 
 255     __ stpd(v9,  v8,   d9_save);
 256     __ stpd(v11, v10,  d11_save);
 257     __ stpd(v13, v12,  d13_save);
 258     __ stpd(v15, v14,  d15_save);
 259 
 260     __ get_fpcr(rscratch1);
 261     __ str(rscratch1, fpcr_save);
 262     // Set FPCR to the state we need. We do want Round to Nearest. We
 263     // don't want non-IEEE rounding modes or floating-point traps.
 264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
 265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
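    // (in FPCR, bits 22-25 hold RMode, FZ and DN; bits 8-12 are the
    // IOE/DZE/OFE/UFE/IXE exception trap enables)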
 266     __ set_fpcr(rscratch1);
 267 
 268     // install Java thread in global register now we have saved
 269     // whatever value it held
 270     __ mov(rthread, c_rarg7);
 271     // And method
 272     __ mov(rmethod, c_rarg3);
 273 
 274     // set up the heapbase register
 275     __ reinit_heapbase();
 276 
 277 #ifdef ASSERT
 278     // make sure we have no pending exceptions
 279     {
 280       Label L;
 281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 282       __ cmp(rscratch1, (u1)NULL_WORD);
 283       __ br(Assembler::EQ, L);
 284       __ stop("StubRoutines::call_stub: entered with pending exception");
 285       __ BIND(L);
 286     }
 287 #endif
 288     // pass parameters if any
 289     __ mov(esp, sp);
 290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 291     __ andr(sp, rscratch1, -2 * wordSize);
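    // n.b. anding with -16 rounds the new value down to a 16 byte
    // boundary; sp must stay 16 byte aligned when used for memory
    // accesses on AArch64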
 292 
 293     BLOCK_COMMENT("pass parameters if any");
 294     Label parameters_done;
 295     // parameter count is still in c_rarg6
 296     // and parameter pointer identifying param 1 is in c_rarg5
 297     __ cbzw(c_rarg6, parameters_done);
 298 
 299     address loop = __ pc();
 300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 301     __ subsw(c_rarg6, c_rarg6, 1);
 302     __ push(rscratch1);
 303     __ br(Assembler::GT, loop);
 304 
 305     __ BIND(parameters_done);
 306 
    // call Java entry -- passing Method* and current sp
 308     //      rmethod: Method*
 309     //      r19_sender_sp: sender sp
 310     BLOCK_COMMENT("call Java function");
 311     __ mov(r19_sender_sp, sp);
 312     __ blr(c_rarg4);
 313 
 314     // we do this here because the notify will already have been done
 315     // if we get to the next instruction via an exception
 316     //
 317     // n.b. adding this instruction here affects the calculation of
 318     // whether or not a routine returns to the call stub (used when
 319     // doing stack walks) since the normal test is to check the return
 320     // pc against the address saved below. so we may need to allow for
 321     // this extra instruction in the check.
 322 
 323     // save current address for use by exception handling code
 324 
 325     return_address = __ pc();
 326 
 327     // store result depending on type (everything that is not
 328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 329     // n.b. this assumes Java returns an integral result in r0
 330     // and a floating result in j_farg0
 331     // All of j_rargN may be used to return inline type fields so be careful
 332     // not to clobber those.
 333     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
 334     // assignment of Rresult below.
 335     Register Rresult = r14, Rresult_type = r15;
 336     __ ldr(Rresult, result);
 337     Label is_long, is_float, is_double, check_prim, exit;
 338     __ ldr(Rresult_type, result_type);
 339     __ cmp(Rresult_type, (u1)T_OBJECT);
 340     __ br(Assembler::EQ, check_prim);
 341     __ cmp(Rresult_type, (u1)T_LONG);
 342     __ br(Assembler::EQ, is_long);
 343     __ cmp(Rresult_type, (u1)T_FLOAT);
 344     __ br(Assembler::EQ, is_float);
 345     __ cmp(Rresult_type, (u1)T_DOUBLE);
 346     __ br(Assembler::EQ, is_double);
 347 
 348     // handle T_INT case
 349     __ strw(r0, Address(Rresult));
 350 
 351     __ BIND(exit);
 352 
 353     // pop parameters
 354     __ sub(esp, rfp, -sp_after_call_off * wordSize);
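    // n.b. this is the same expression as in the frame setup above, so
    // esp points back at the bottom of the register save area and any
    // parameters pushed for the Java call have effectively been popped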
 355 
 356 #ifdef ASSERT
 357     // verify that threads correspond
 358     {
 359       Label L, S;
 360       __ ldr(rscratch1, thread);
 361       __ cmp(rthread, rscratch1);
 362       __ br(Assembler::NE, S);
 363       __ get_thread(rscratch1);
 364       __ cmp(rthread, rscratch1);
 365       __ br(Assembler::EQ, L);
 366       __ BIND(S);
 367       __ stop("StubRoutines::call_stub: threads must correspond");
 368       __ BIND(L);
 369     }
 370 #endif
 371 
 372     __ pop_cont_fastpath(rthread);
 373 
 374     // restore callee-save registers
 375     __ ldpd(v15, v14,  d15_save);
 376     __ ldpd(v13, v12,  d13_save);
 377     __ ldpd(v11, v10,  d11_save);
 378     __ ldpd(v9,  v8,   d9_save);
 379 
 380     __ ldp(r28, r27,   r28_save);
 381     __ ldp(r26, r25,   r26_save);
 382     __ ldp(r24, r23,   r24_save);
 383     __ ldp(r22, r21,   r22_save);
 384     __ ldp(r20, r19,   r20_save);
 385 
 386     // restore fpcr
 387     __ ldr(rscratch1,  fpcr_save);
 388     __ set_fpcr(rscratch1);
 389 
 390     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 391     __ ldrw(c_rarg2, result_type);
 392     __ ldr(c_rarg3,  method);
 393     __ ldp(c_rarg4, c_rarg5,  entry_point);
 394     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 395 
 396     // leave frame and return to caller
 397     __ leave();
 398     __ ret(lr);
 399 
 400     // handle return types different from T_INT
 401     __ BIND(check_prim);
 402     if (InlineTypeReturnedAsFields) {
 403       // Check for scalarized return value
 404       __ tbz(r0, 0, is_long);
 405       // Load pack handler address
 406       __ andr(rscratch1, r0, -2);
 407       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
 408       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
 409       __ blr(rscratch1);
 410       __ b(exit);
 411     }
 412 
 413     __ BIND(is_long);
 414     __ str(r0, Address(Rresult, 0));
 415     __ br(Assembler::AL, exit);
 416 
 417     __ BIND(is_float);
 418     __ strs(j_farg0, Address(Rresult, 0));
 419     __ br(Assembler::AL, exit);
 420 
 421     __ BIND(is_double);
 422     __ strd(j_farg0, Address(Rresult, 0));
 423     __ br(Assembler::AL, exit);
 424 
 425     return start;
 426   }
 427 
 428   // Return point for a Java call if there's an exception thrown in
 429   // Java code.  The exception is caught and transformed into a
 430   // pending exception stored in JavaThread that can be tested from
 431   // within the VM.
 432   //
 433   // Note: Usually the parameters are removed by the callee. In case
 434   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up
  // sp.
 437   //
 438   // r0: exception oop
 439 
 440   address generate_catch_exception() {
 441     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 442     address start = __ pc();
 443 
 444     // same as in generate_call_stub():
 445     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 446     const Address thread        (rfp, thread_off         * wordSize);
 447 
 448 #ifdef ASSERT
 449     // verify that threads correspond
 450     {
 451       Label L, S;
 452       __ ldr(rscratch1, thread);
 453       __ cmp(rthread, rscratch1);
 454       __ br(Assembler::NE, S);
 455       __ get_thread(rscratch1);
 456       __ cmp(rthread, rscratch1);
 457       __ br(Assembler::EQ, L);
 458       __ bind(S);
 459       __ stop("StubRoutines::catch_exception: threads must correspond");
 460       __ bind(L);
 461     }
 462 #endif
 463 
 464     // set pending exception
 465     __ verify_oop(r0);
 466 
 467     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 468     __ mov(rscratch1, (address)__FILE__);
 469     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 470     __ movw(rscratch1, (int)__LINE__);
 471     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 472 
 473     // complete return to VM
 474     assert(StubRoutines::_call_stub_return_address != nullptr,
 475            "_call_stub_return_address must have been generated before");
 476     __ b(StubRoutines::_call_stub_return_address);
 477 
 478     return start;
 479   }
 480 
 481   // Continuation point for runtime calls returning with a pending
 482   // exception.  The pending exception check happened in the runtime
 483   // or native call stub.  The pending exception in Thread is
 484   // converted into a Java-level exception.
 485   //
 486   // Contract with Java-level exception handlers:
 487   // r0: exception
 488   // r3: throwing pc
 489   //
 490   // NOTE: At entry of this stub, exception-pc must be in LR !!
 491 
 492   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 494 
 495   address generate_forward_exception() {
 496     StubCodeMark mark(this, "StubRoutines", "forward exception");
 497     address start = __ pc();
 498 
 499     // Upon entry, LR points to the return address returning into
 500     // Java (interpreted or compiled) code; i.e., the return address
 501     // becomes the throwing pc.
 502     //
 503     // Arguments pushed before the runtime call are still on the stack
 504     // but the exception handler will reset the stack pointer ->
 505     // ignore them.  A potential result in registers can be ignored as
 506     // well.
 507 
 508 #ifdef ASSERT
 509     // make sure this code is only executed if there is a pending exception
 510     {
 511       Label L;
 512       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 513       __ cbnz(rscratch1, L);
 514       __ stop("StubRoutines::forward exception: no pending exception (1)");
 515       __ bind(L);
 516     }
 517 #endif
 518 
 519     // compute exception handler into r19
 520 
 521     // call the VM to find the handler address associated with the
 522     // caller address. pass thread in r0 and caller pc (ret address)
 523     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 524     // the stack.
 525     __ mov(c_rarg1, lr);
 526     // lr will be trashed by the VM call so we move it to R19
 527     // (callee-saved) because we also need to pass it to the handler
 528     // returned by this call.
 529     __ mov(r19, lr);
 530     BLOCK_COMMENT("call exception_handler_for_return_address");
 531     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 532                          SharedRuntime::exception_handler_for_return_address),
 533                     rthread, c_rarg1);
 534     // Reinitialize the ptrue predicate register, in case the external runtime
 535     // call clobbers ptrue reg, as we may return to SVE compiled code.
 536     __ reinitialize_ptrue();
 537 
 538     // we should not really care that lr is no longer the callee
 539     // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 542     // the PC for the frame above the handler belongs to a compiled
 543     // Java method. So, we restore lr here to satisfy that assert.
 544     __ mov(lr, r19);
 545     // setup r0 & r3 & clear pending exception
 546     __ mov(r3, r19);
 547     __ mov(r19, r0);
 548     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 549     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 550 
 551 #ifdef ASSERT
 552     // make sure exception is set
 553     {
 554       Label L;
 555       __ cbnz(r0, L);
 556       __ stop("StubRoutines::forward exception: no pending exception (2)");
 557       __ bind(L);
 558     }
 559 #endif
 560 
 561     // continue at exception handler
 562     // r0: exception
 563     // r3: throwing pc
 564     // r19: exception handler
 565     __ verify_oop(r0);
 566     __ br(r19);
 567 
 568     return start;
 569   }
 570 
 571   // Non-destructive plausibility checks for oops
 572   //
 573   // Arguments:
 574   //    r0: oop to verify
 575   //    rscratch1: error message
 576   //
 577   // Stack after saving c_rarg3:
 578   //    [tos + 0]: saved c_rarg3
 579   //    [tos + 1]: saved c_rarg2
 580   //    [tos + 2]: saved lr
 581   //    [tos + 3]: saved rscratch2
 582   //    [tos + 4]: saved r0
 583   //    [tos + 5]: saved rscratch1
 584   address generate_verify_oop() {
 585 
 586     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 587     address start = __ pc();
 588 
 589     Label exit, error;
 590 
 591     // save c_rarg2 and c_rarg3
 592     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 593 
 594     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 595     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 596     __ ldr(c_rarg3, Address(c_rarg2));
 597     __ add(c_rarg3, c_rarg3, 1);
 598     __ str(c_rarg3, Address(c_rarg2));
 599 
 600     // object is in r0
 601     // make sure object is 'reasonable'
 602     __ cbz(r0, exit); // if obj is null it is OK
 603 
 604     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 605     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 606 
 607     // return if everything seems ok
 608     __ bind(exit);
 609 
 610     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 611     __ ret(lr);
 612 
 613     // handle errors
 614     __ bind(error);
 615     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 616 
 617     __ push(RegSet::range(r0, r29), sp);
 618     // debug(char* msg, int64_t pc, int64_t regs[])
 619     __ mov(c_rarg0, rscratch1);      // pass address of error message
 620     __ mov(c_rarg1, lr);             // pass return address
 621     __ mov(c_rarg2, sp);             // pass address of regs on stack
 622 #ifndef PRODUCT
 623     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 624 #endif
 625     BLOCK_COMMENT("call MacroAssembler::debug");
 626     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 627     __ blr(rscratch1);
 628     __ hlt(0);
 629 
 630     return start;
 631   }
 632 
 633   // Generate indices for iota vector.
 634   address generate_iota_indices(const char *stub_name) {
 635     __ align(CodeEntryAlignment);
 636     StubCodeMark mark(this, "StubRoutines", stub_name);
 637     address start = __ pc();
 638     // B
 639     __ emit_data64(0x0706050403020100, relocInfo::none);
 640     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 641     // H
 642     __ emit_data64(0x0003000200010000, relocInfo::none);
 643     __ emit_data64(0x0007000600050004, relocInfo::none);
 644     // S
 645     __ emit_data64(0x0000000100000000, relocInfo::none);
 646     __ emit_data64(0x0000000300000002, relocInfo::none);
 647     // D
 648     __ emit_data64(0x0000000000000000, relocInfo::none);
 649     __ emit_data64(0x0000000000000001, relocInfo::none);
 650     // S - FP
 651     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 652     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 653     // D - FP
 654     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 655     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 656     return start;
 657   }
 658 
 659   // The inner part of zero_words().  This is the bulk operation,
 660   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 661   // caller is responsible for zeroing the last few words.
 662   //
 663   // Inputs:
 664   // r10: the HeapWord-aligned base address of an array to zero.
 665   // r11: the count in HeapWords, r11 > 0.
 666   //
 667   // Returns r10 and r11, adjusted for the caller to clear.
 668   // r10: the base address of the tail of words left to clear.
 669   // r11: the number of words in the tail.
 670   //      r11 < MacroAssembler::zero_words_block_size.
 671 
 672   address generate_zero_blocks() {
 673     Label done;
 674     Label base_aligned;
 675 
 676     Register base = r10, cnt = r11;
 677 
 678     __ align(CodeEntryAlignment);
 679     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 680     address start = __ pc();
 681 
 682     if (UseBlockZeroing) {
 683       int zva_length = VM_Version::zva_length();
 684 
      // Ensure the ZVA length is divisible by 16. This is required by
      // the subsequent operations.
 687       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 688 
 689       __ tbz(base, 3, base_aligned);
 690       __ str(zr, Address(__ post(base, 8)));
 691       __ sub(cnt, cnt, 1);
 692       __ bind(base_aligned);
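      // base is now 16 byte aligned, which the DC ZVA based zeroing
      // below relies on (hence the assert that zva_length is a
      // multiple of 16)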
 693 
 694       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 695       // alignment.
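      // n.b. low_limit is in bytes while cnt is in (8 byte) words,
      // hence the shift right by 3 below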
 696       Label small;
 697       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 698       __ subs(rscratch1, cnt, low_limit >> 3);
 699       __ br(Assembler::LT, small);
 700       __ zero_dcache_blocks(base, cnt);
 701       __ bind(small);
 702     }
 703 
 704     {
 705       // Number of stp instructions we'll unroll
 706       const int unroll =
 707         MacroAssembler::zero_words_block_size / 2;
 708       // Clear the remaining blocks.
 709       Label loop;
 710       __ subs(cnt, cnt, unroll * 2);
 711       __ br(Assembler::LT, done);
 712       __ bind(loop);
 713       for (int i = 0; i < unroll; i++)
 714         __ stp(zr, zr, __ post(base, 16));
 715       __ subs(cnt, cnt, unroll * 2);
 716       __ br(Assembler::GE, loop);
 717       __ bind(done);
 718       __ add(cnt, cnt, unroll * 2);
 719     }
 720 
 721     __ ret(lr);
 722 
 723     return start;
 724   }
 725 
 726 
 727   typedef enum {
 728     copy_forwards = 1,
 729     copy_backwards = -1
 730   } copy_direction;
 731 
 732   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 733   // for arraycopy stubs.
 734   class ArrayCopyBarrierSetHelper : StackObj {
 735     BarrierSetAssembler* _bs_asm;
 736     MacroAssembler* _masm;
 737     DecoratorSet _decorators;
 738     BasicType _type;
 739     Register _gct1;
 740     Register _gct2;
 741     Register _gct3;
 742     FloatRegister _gcvt1;
 743     FloatRegister _gcvt2;
 744     FloatRegister _gcvt3;
 745 
 746   public:
 747     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 748                               DecoratorSet decorators,
 749                               BasicType type,
 750                               Register gct1,
 751                               Register gct2,
 752                               Register gct3,
 753                               FloatRegister gcvt1,
 754                               FloatRegister gcvt2,
 755                               FloatRegister gcvt3)
 756       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 757         _masm(masm),
 758         _decorators(decorators),
 759         _type(type),
 760         _gct1(gct1),
 761         _gct2(gct2),
 762         _gct3(gct3),
 763         _gcvt1(gcvt1),
 764         _gcvt2(gcvt2),
 765         _gcvt3(gcvt3) {
 766     }
 767 
 768     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 769       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 770                             dst1, dst2, src,
 771                             _gct1, _gct2, _gcvt1);
 772     }
 773 
 774     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 775       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 776                              dst, src1, src2,
 777                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 778     }
 779 
 780     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 781       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 782                             dst1, dst2, src,
 783                             _gct1);
 784     }
 785 
 786     void copy_store_at_16(Address dst, Register src1, Register src2) {
 787       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 788                              dst, src1, src2,
 789                              _gct1, _gct2, _gct3);
 790     }
 791 
 792     void copy_load_at_8(Register dst, Address src) {
 793       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 794                             dst, noreg, src,
 795                             _gct1);
 796     }
 797 
 798     void copy_store_at_8(Address dst, Register src) {
 799       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 800                              dst, src, noreg,
 801                              _gct1, _gct2, _gct3);
 802     }
 803   };
 804 
 805   // Bulk copy of blocks of 8 words.
 806   //
 807   // count is a count of words.
 808   //
 809   // Precondition: count >= 8
 810   //
 811   // Postconditions:
 812   //
 813   // The least significant bit of count contains the remaining count
 814   // of words to copy.  The rest of count is trash.
 815   //
 816   // s and d are adjusted to point to the remaining words to copy
 817   //
 818   void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
 819                            copy_direction direction) {
 820     int unit = wordSize * direction;
 821     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 822 
 823     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 824       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 825     const Register stride = r14;
 826     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 827     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 828     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 829 
 830     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 831     assert_different_registers(s, d, count, rscratch1, rscratch2);
 832 
 833     Label again, drain;
 834     const char *stub_name;
 835     if (direction == copy_forwards)
 836       stub_name = "forward_copy_longs";
 837     else
 838       stub_name = "backward_copy_longs";
 839 
 840     __ align(CodeEntryAlignment);
 841 
 842     StubCodeMark mark(this, "StubRoutines", stub_name);
 843 
 844     __ bind(start);
 845 
 846     Label unaligned_copy_long;
 847     if (AvoidUnalignedAccesses) {
 848       __ tbnz(d, 3, unaligned_copy_long);
 849     }
 850 
 851     if (direction == copy_forwards) {
 852       __ sub(s, s, bias);
 853       __ sub(d, d, bias);
 854     }
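    // n.b. the bias of 2 (or 4 with SIMD) words means that the first
    // loads below, which use offsets of 2 * unit (or 4 * unit), read
    // from the original value of s; likewise for the stores through d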
 855 
 856 #ifdef ASSERT
 857     // Make sure we are never given < 8 words
 858     {
 859       Label L;
 860       __ cmp(count, (u1)8);
 861       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 863       __ bind(L);
 864     }
 865 #endif
 866 
 867     // Fill 8 registers
 868     if (UseSIMDForMemoryOps) {
 869       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 870       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 871     } else {
 872       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 873       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 874       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 875       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 876     }
 877 
 878     __ subs(count, count, 16);
 879     __ br(Assembler::LO, drain);
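    // at this point 8 words are already held in registers and count has
    // been decremented by 16, so the main loop is only entered if at
    // least another 8 words remain; each iteration can then overlap the
    // stores of one block with the loads of the next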
 880 
 881     int prefetch = PrefetchCopyIntervalInBytes;
 882     bool use_stride = false;
 883     if (direction == copy_backwards) {
 884        use_stride = prefetch > 256;
 885        prefetch = -prefetch;
 886        if (use_stride) __ mov(stride, prefetch);
 887     }
 888 
 889     __ bind(again);
 890 
 891     if (PrefetchCopyIntervalInBytes > 0)
 892       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 893 
 894     if (UseSIMDForMemoryOps) {
 895       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 896       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 897       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 898       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 899     } else {
 900       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 901       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 902       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 903       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 904       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 905       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 906       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 907       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 908     }
 909 
 910     __ subs(count, count, 8);
 911     __ br(Assembler::HS, again);
 912 
 913     // Drain
 914     __ bind(drain);
 915     if (UseSIMDForMemoryOps) {
 916       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 917       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 918     } else {
 919       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 920       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 921       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 922       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 923     }
 924 
 925     {
 926       Label L1, L2;
 927       __ tbz(count, exact_log2(4), L1);
 928       if (UseSIMDForMemoryOps) {
 929         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 930         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 931       } else {
 932         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 933         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 934         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 935         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 936       }
 937       __ bind(L1);
 938 
 939       if (direction == copy_forwards) {
 940         __ add(s, s, bias);
 941         __ add(d, d, bias);
 942       }
 943 
 944       __ tbz(count, 1, L2);
 945       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 946       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 947       __ bind(L2);
 948     }
 949 
 950     __ ret(lr);
 951 
 952     if (AvoidUnalignedAccesses) {
 953       Label drain, again;
 954       // Register order for storing. Order is different for backward copy.
 955 
 956       __ bind(unaligned_copy_long);
 957 
      // source address is aligned on an even word (16 byte) boundary,
      // target only on an odd word (8 byte) boundary
 959       //
 960       // when forward copying word pairs we read long pairs at offsets
 961       // {0, 2, 4, 6} (in long words). when backwards copying we read
 962       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 963       // address by -2 in the forwards case so we can compute the
 964       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 965       // or -1.
 966       //
 967       // when forward copying we need to store 1 word, 3 pairs and
 968       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
 973       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 974       // offsets {1, 3, 5, 7, 8} * unit.
 975 
 976       if (direction == copy_forwards) {
 977         __ sub(s, s, 16);
 978         __ sub(d, d, 8);
 979       }
 980 
 981       // Fill 8 registers
 982       //
 983       // for forwards copy s was offset by -16 from the original input
 984       // value of s so the register contents are at these offsets
 985       // relative to the 64 bit block addressed by that original input
 986       // and so on for each successive 64 byte block when s is updated
 987       //
 988       // t0 at offset 0,  t1 at offset 8
 989       // t2 at offset 16, t3 at offset 24
 990       // t4 at offset 32, t5 at offset 40
 991       // t6 at offset 48, t7 at offset 56
 992 
 993       // for backwards copy s was not offset so the register contents
 994       // are at these offsets into the preceding 64 byte block
 995       // relative to that original input and so on for each successive
 996       // preceding 64 byte block when s is updated. this explains the
 997       // slightly counter-intuitive looking pattern of register usage
 998       // in the stp instructions for backwards copy.
 999       //
1000       // t0 at offset -16, t1 at offset -8
1001       // t2 at offset -32, t3 at offset -24
1002       // t4 at offset -48, t5 at offset -40
1003       // t6 at offset -64, t7 at offset -56
1004 
1005       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1006       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1007       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1008       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1009 
1010       __ subs(count, count, 16);
1011       __ br(Assembler::LO, drain);
1012 
1013       int prefetch = PrefetchCopyIntervalInBytes;
1014       bool use_stride = false;
1015       if (direction == copy_backwards) {
1016          use_stride = prefetch > 256;
1017          prefetch = -prefetch;
1018          if (use_stride) __ mov(stride, prefetch);
1019       }
1020 
1021       __ bind(again);
1022 
1023       if (PrefetchCopyIntervalInBytes > 0)
1024         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1025 
1026       if (direction == copy_forwards) {
1027        // allowing for the offset of -8 the store instructions place
1028        // registers into the target 64 bit block at the following
1029        // offsets
1030        //
1031        // t0 at offset 0
1032        // t1 at offset 8,  t2 at offset 16
1033        // t3 at offset 24, t4 at offset 32
1034        // t5 at offset 40, t6 at offset 48
1035        // t7 at offset 56
1036 
1037         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1038         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1039         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1040         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1041         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1042         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1043         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1044         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1045         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1046       } else {
1047        // d was not offset when we started so the registers are
1048        // written into the 64 bit block preceding d with the following
1049        // offsets
1050        //
1051        // t1 at offset -8
1052        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
1054        // t7 at offset -56, t4 at offset -48
1055        //                   t6 at offset -64
1056        //
1057        // note that this matches the offsets previously noted for the
1058        // loads
1059 
1060         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1061         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1062         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1063         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1064         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1065         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1066         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1067         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1068         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1069       }
1070 
1071       __ subs(count, count, 8);
1072       __ br(Assembler::HS, again);
1073 
1074       // Drain
1075       //
1076       // this uses the same pattern of offsets and register arguments
1077       // as above
1078       __ bind(drain);
1079       if (direction == copy_forwards) {
1080         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1081         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1082         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1083         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1084         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1085       } else {
1086         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1087         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1088         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1089         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1090         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1091       }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count tell us whether we have each such
      // subblock
1096       {
1097         Label L1, L2;
1098         __ tbz(count, exact_log2(4), L1);
1099        // this is the same as above but copying only 4 longs hence
1100        // with only one intervening stp between the str instructions
1101        // but note that the offsets and registers still follow the
1102        // same pattern
1103         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1104         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1105         if (direction == copy_forwards) {
1106           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1107           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1108           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1109         } else {
1110           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1111           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1112           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1113         }
1114         __ bind(L1);
1115 
1116         __ tbz(count, 1, L2);
1117        // this is the same as above but copying only 2 longs hence
1118        // there is no intervening stp between the str instructions
1119        // but note that the offset and register patterns are still
1120        // the same
1121         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1122         if (direction == copy_forwards) {
1123           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1124           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1125         } else {
1126           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1127           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1128         }
1129         __ bind(L2);
1130 
1131        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1133 
1134        if (direction == copy_forwards) {
1135          __ add(s, s, 16);
1136          __ add(d, d, 8);
1137        }
1138 
1139       }
1140 
1141       __ ret(lr);
1142       }
1143   }
1144 
1145   // Small copy: less than 16 bytes.
1146   //
1147   // NB: Ignores all of the bits of count which represent more than 15
1148   // bytes, so a caller doesn't have to mask them.
1149 
1150   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1151     bool is_backwards = step < 0;
1152     size_t granularity = uabs(step);
1153     int direction = is_backwards ? -1 : 1;
1154 
1155     Label Lword, Lint, Lshort, Lbyte;
1156 
1157     assert(granularity
1158            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1159 
1160     const Register t0 = r3;
1161     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1162     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1163 
1164     // ??? I don't know if this bit-test-and-branch is the right thing
1165     // to do.  It does a lot of jumping, resulting in several
1166     // mispredicted branches.  It might make more sense to do this
1167     // with something like Duff's device with a single computed branch.
1168 
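    // The tests below check the count bits from the largest chunk down
    // to the smallest. e.g. for a byte copy (granularity == 1) a count
    // of 13 has bits 3, 2 and 0 set, so we copy one 8 byte word, then 4
    // bytes, then a single byte, skipping the 2 byte step.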
1169     __ tbz(count, 3 - exact_log2(granularity), Lword);
1170     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1171     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1172     __ bind(Lword);
1173 
1174     if (granularity <= sizeof (jint)) {
1175       __ tbz(count, 2 - exact_log2(granularity), Lint);
1176       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1177       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1178       __ bind(Lint);
1179     }
1180 
1181     if (granularity <= sizeof (jshort)) {
1182       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1183       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1184       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1185       __ bind(Lshort);
1186     }
1187 
1188     if (granularity <= sizeof (jbyte)) {
1189       __ tbz(count, 0, Lbyte);
1190       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1191       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1192       __ bind(Lbyte);
1193     }
1194   }
1195 
1196   Label copy_f, copy_b;
1197   Label copy_obj_f, copy_obj_b;
1198   Label copy_obj_uninit_f, copy_obj_uninit_b;
1199 
1200   // All-singing all-dancing memory copy.
1201   //
1202   // Copy count units of memory from s to d.  The size of a unit is
1203   // step, which can be positive or negative depending on the direction
1204   // of copy.  If is_aligned is false, we align the source address.
1205   //
1206 
1207   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1208                    Register s, Register d, Register count, int step) {
1209     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1210     bool is_backwards = step < 0;
1211     unsigned int granularity = uabs(step);
1212     const Register t0 = r3, t1 = r4;
1213 
    // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction
    // doesn't matter because we always load all the data before writing
    // anything.
1216     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1217     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1218     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1219     const Register send = r17, dend = r16;
1220     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1221     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1222     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1223 
1224     if (PrefetchCopyIntervalInBytes > 0)
1225       __ prfm(Address(s, 0), PLDL1KEEP);
1226     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1227     __ br(Assembler::HI, copy_big);
1228 
1229     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1230     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
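    // send and dend point just past the last source and destination
    // elements. The fixed size cases below copy one block from the front
    // and one block ending at send/dend; for in-between counts the two
    // blocks simply overlap, which is safe because all of the loads are
    // issued before any of the stores.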
1231 
1232     __ cmp(count, u1(16/granularity));
1233     __ br(Assembler::LS, copy16);
1234 
1235     __ cmp(count, u1(64/granularity));
1236     __ br(Assembler::HI, copy80);
1237 
1238     __ cmp(count, u1(32/granularity));
1239     __ br(Assembler::LS, copy32);
1240 
1241     // 33..64 bytes
1242     if (UseSIMDForMemoryOps) {
1243       bs.copy_load_at_32(v0, v1, Address(s, 0));
1244       bs.copy_load_at_32(v2, v3, Address(send, -32));
1245       bs.copy_store_at_32(Address(d, 0), v0, v1);
1246       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1247     } else {
1248       bs.copy_load_at_16(t0, t1, Address(s, 0));
1249       bs.copy_load_at_16(t2, t3, Address(s, 16));
1250       bs.copy_load_at_16(t4, t5, Address(send, -32));
1251       bs.copy_load_at_16(t6, t7, Address(send, -16));
1252 
1253       bs.copy_store_at_16(Address(d, 0), t0, t1);
1254       bs.copy_store_at_16(Address(d, 16), t2, t3);
1255       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1256       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1257     }
1258     __ b(finish);
1259 
1260     // 17..32 bytes
1261     __ bind(copy32);
1262     bs.copy_load_at_16(t0, t1, Address(s, 0));
1263     bs.copy_load_at_16(t6, t7, Address(send, -16));
1264 
1265     bs.copy_store_at_16(Address(d, 0), t0, t1);
1266     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1267     __ b(finish);
1268 
1269     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1271     __ bind(copy80);
1272     if (UseSIMDForMemoryOps) {
1273       bs.copy_load_at_32(v0, v1, Address(s, 0));
1274       bs.copy_load_at_32(v2, v3, Address(s, 32));
1275       // Unaligned pointers can be an issue for copying.
      // The issue is more likely to occur when the granularity of the data
      // is less than 4 (sizeof(jint)). Pointers for arrays of jint are at
      // least 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the range 65-80 bytes.
1280       // For such cases using the pair of ldp/stp instead of the third pair of
1281       // ldpq/stpq fixes the performance issue.
1282       if (granularity < sizeof (jint)) {
1283         Label copy96;
1284         __ cmp(count, u1(80/granularity));
1285         __ br(Assembler::HI, copy96);
1286         bs.copy_load_at_16(t0, t1, Address(send, -16));
1287 
1288         bs.copy_store_at_32(Address(d, 0), v0, v1);
1289         bs.copy_store_at_32(Address(d, 32), v2, v3);
1290 
1291         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1292         __ b(finish);
1293 
1294         __ bind(copy96);
1295       }
1296       bs.copy_load_at_32(v4, v5, Address(send, -32));
1297 
1298       bs.copy_store_at_32(Address(d, 0), v0, v1);
1299       bs.copy_store_at_32(Address(d, 32), v2, v3);
1300 
1301       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1302     } else {
1303       bs.copy_load_at_16(t0, t1, Address(s, 0));
1304       bs.copy_load_at_16(t2, t3, Address(s, 16));
1305       bs.copy_load_at_16(t4, t5, Address(s, 32));
1306       bs.copy_load_at_16(t6, t7, Address(s, 48));
1307       bs.copy_load_at_16(t8, t9, Address(send, -16));
1308 
1309       bs.copy_store_at_16(Address(d, 0), t0, t1);
1310       bs.copy_store_at_16(Address(d, 16), t2, t3);
1311       bs.copy_store_at_16(Address(d, 32), t4, t5);
1312       bs.copy_store_at_16(Address(d, 48), t6, t7);
1313       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1314     }
1315     __ b(finish);
1316 
1317     // 0..16 bytes
1318     __ bind(copy16);
1319     __ cmp(count, u1(8/granularity));
1320     __ br(Assembler::LO, copy8);
1321 
1322     // 8..16 bytes
1323     bs.copy_load_at_8(t0, Address(s, 0));
1324     bs.copy_load_at_8(t1, Address(send, -8));
1325     bs.copy_store_at_8(Address(d, 0), t0);
1326     bs.copy_store_at_8(Address(dend, -8), t1);
1327     __ b(finish);
1328 
1329     if (granularity < 8) {
1330       // 4..7 bytes
1331       __ bind(copy8);
1332       __ tbz(count, 2 - exact_log2(granularity), copy4);
1333       __ ldrw(t0, Address(s, 0));
1334       __ ldrw(t1, Address(send, -4));
1335       __ strw(t0, Address(d, 0));
1336       __ strw(t1, Address(dend, -4));
1337       __ b(finish);
1338       if (granularity < 4) {
1339         // 0..3 bytes
1340         __ bind(copy4);
1341         __ cbz(count, finish); // get rid of 0 case
1342         if (granularity == 2) {
1343           __ ldrh(t0, Address(s, 0));
1344           __ strh(t0, Address(d, 0));
1345         } else { // granularity == 1
1346           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1347           // the first and last byte.
1348           // Handle the 3 byte case by loading and storing base + count/2
1349           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the same
1351           // byte 3 times.
1352           __ lsr(count, count, 1);
1353           __ ldrb(t0, Address(s, 0));
1354           __ ldrb(t1, Address(send, -1));
1355           __ ldrb(t2, Address(s, count));
1356           __ strb(t0, Address(d, 0));
1357           __ strb(t1, Address(dend, -1));
1358           __ strb(t2, Address(d, count));
1359         }
1360         __ b(finish);
1361       }
1362     }
1363 
1364     __ bind(copy_big);
1365     if (is_backwards) {
1366       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1367       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1368     }
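    // for a backwards copy s and d now point just past the last source
    // and destination elements, so the copy can work down from there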
1369 
1370     // Now we've got the small case out of the way we can align the
1371     // source address on a 2-word boundary.
1372 
    // Here we will materialize a count in r15, which is used by
    // copy_memory_small and the various generate_copy_longs stubs that
    // handle the 2-word-aligned bulk copy. Up until here we have used t9,
    // which aliases r15, but from here on that register cannot be used as
    // a temp register, as it contains the count.
1377 
1378     Label aligned;
1379 
1380     if (is_aligned) {
1381       // We may have to adjust by 1 word to get s 2-word-aligned.
1382       __ tbz(s, exact_log2(wordSize), aligned);
1383       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1384       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1385       __ sub(count, count, wordSize/granularity);
1386     } else {
1387       if (is_backwards) {
1388         __ andr(r15, s, 2 * wordSize - 1);
1389       } else {
1390         __ neg(r15, s);
1391         __ andr(r15, r15, 2 * wordSize - 1);
1392       }
1393       // r15 is the byte adjustment needed to align s.
1394       __ cbz(r15, aligned);
1395       int shift = exact_log2(granularity);
1396       if (shift)  __ lsr(r15, r15, shift);
1397       __ sub(count, count, r15);
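      // after the shift r15 holds the alignment adjustment as an element
      // count; it is removed from count here and copied by
      // copy_memory_small below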
1398 
1399 #if 0
1400       // ?? This code is only correct for a disjoint copy.  It may or
1401       // may not make sense to use it in that case.
1402 
1403       // Copy the first pair; s and d may not be aligned.
1404       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1405       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1406 
1407       // Align s and d, adjust count
1408       if (is_backwards) {
1409         __ sub(s, s, r15);
1410         __ sub(d, d, r15);
1411       } else {
1412         __ add(s, s, r15);
1413         __ add(d, d, r15);
1414       }
1415 #else
1416       copy_memory_small(decorators, type, s, d, r15, step);
1417 #endif
1418     }
1419 
1420     __ bind(aligned);
1421 
1422     // s is now 2-word-aligned.
1423 
1424     // We have a count of units and some trailing bytes.  Adjust the
1425     // count and do a bulk copy of words.
1426     __ lsr(r15, count, exact_log2(wordSize/granularity));
1427     if (direction == copy_forwards) {
1428       if (type != T_OBJECT) {
1429         __ bl(copy_f);
1430       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1431         __ bl(copy_obj_uninit_f);
1432       } else {
1433         __ bl(copy_obj_f);
1434       }
1435     } else {
1436       if (type != T_OBJECT) {
1437         __ bl(copy_b);
1438       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1439         __ bl(copy_obj_uninit_b);
1440       } else {
1441         __ bl(copy_obj_b);
1442       }
1443     }
1444 
1445     // And the tail.
1446     copy_memory_small(decorators, type, s, d, count, step);
1447 
1448     if (granularity >= 8) __ bind(copy8);
1449     if (granularity >= 4) __ bind(copy4);
1450     __ bind(finish);
1451   }
1452 
1453 
1454   void clobber_registers() {
1455 #ifdef ASSERT
1456     RegSet clobbered
1457       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1458     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1459     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
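    // rscratch1 now holds the recognizable junk pattern 0xdeadbeefdeadbeef.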
1460     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1461       __ mov(*it, rscratch1);
1462     }
#endif
  }
1466 
1467   // Scan over array at a for count oops, verifying each one.
1468   // Preserves a and count, clobbers rscratch1 and rscratch2.
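  // Roughly equivalent C sketch (explanatory comment only):
  //   for (size_t i = 0; i < count; i++) {
  //     oop o = (size == wordSize) ? ((oop*)a)[i]
  //                                : decode_heap_oop(((narrowOop*)a)[i]);
  //     verify_oop(o);
  //   }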
1469   void verify_oop_array (int size, Register a, Register count, Register temp) {
1470     Label loop, end;
1471     __ mov(rscratch1, a);
1472     __ mov(rscratch2, zr);
1473     __ bind(loop);
1474     __ cmp(rscratch2, count);
1475     __ br(Assembler::HS, end);
1476     if (size == wordSize) {
1477       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1478       __ verify_oop(temp);
1479     } else {
1480       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1481       __ decode_heap_oop(temp); // calls verify_oop
1482     }
1483     __ add(rscratch2, rscratch2, 1);
1484     __ b(loop);
1485     __ bind(end);
1486   }
1487 
1488   // Arguments:
1489   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1490   //             ignored
1491   //   is_oop  - true => oop array, so generate store check code
1492   //   name    - stub name string
1493   //
1494   // Inputs:
1495   //   c_rarg0   - source array address
1496   //   c_rarg1   - destination array address
1497   //   c_rarg2   - element count, treated as ssize_t, can be zero
1498   //
1499   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1500   // the hardware handle it.  The two dwords within qwords that span
1501   // cache line boundaries will still be loaded and stored atomically.
1502   //
  // Side Effects:
  //   *entry is set to the no-overlap entry point, which the corresponding
  //   conjoint copy stub uses as its forward-copy target.
1506   //
1507   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1508                                   const char *name, bool dest_uninitialized = false) {
1509     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1510     RegSet saved_reg = RegSet::of(s, d, count);
1511     __ align(CodeEntryAlignment);
1512     StubCodeMark mark(this, "StubRoutines", name);
1513     address start = __ pc();
1514     __ enter();
1515 
1516     if (entry != nullptr) {
1517       *entry = __ pc();
1518       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1519       BLOCK_COMMENT("Entry:");
1520     }
1521 
1522     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1523     if (dest_uninitialized) {
1524       decorators |= IS_DEST_UNINITIALIZED;
1525     }
1526     if (aligned) {
1527       decorators |= ARRAYCOPY_ALIGNED;
1528     }
1529 
1530     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1531     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1532 
1533     if (is_oop) {
1534       // save regs before copy_memory
1535       __ push(RegSet::of(d, count), sp);
1536     }
1537     {
1538       // UnsafeCopyMemory page error: continue after ucm
1539       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1540       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1541       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1542     }
1543 
1544     if (is_oop) {
1545       __ pop(RegSet::of(d, count), sp);
1546       if (VerifyOops)
1547         verify_oop_array(size, d, count, r16);
1548     }
1549 
1550     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1551 
1552     __ leave();
1553     __ mov(r0, zr); // return 0
1554     __ ret(lr);
1555     return start;
1556   }
1557 
1558   // Arguments:
1559   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1560   //             ignored
1561   //   is_oop  - true => oop array, so generate store check code
1562   //   name    - stub name string
1563   //
1564   // Inputs:
1565   //   c_rarg0   - source array address
1566   //   c_rarg1   - destination array address
1567   //   c_rarg2   - element count, treated as ssize_t, can be zero
1568   //
1569   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1570   // the hardware handle it.  The two dwords within qwords that span
1571   // cache line boundaries will still be loaded and stored atomically.
1572   //
1573   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1574                                  address *entry, const char *name,
1575                                  bool dest_uninitialized = false) {
1576     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1577     RegSet saved_regs = RegSet::of(s, d, count);
1578     StubCodeMark mark(this, "StubRoutines", name);
1579     address start = __ pc();
1580     __ enter();
1581 
1582     if (entry != nullptr) {
1583       *entry = __ pc();
1584       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1585       BLOCK_COMMENT("Entry:");
1586     }
1587 
1588     // use fwd copy when (d-s) above_equal (count*size)
1589     __ sub(rscratch1, d, s);
1590     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1591     __ br(Assembler::HS, nooverlap_target);
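    // Equivalent test (explanatory comment only): the forward copy is taken when
    //   (uintptr_t)(d - s) >= (uintptr_t)count << exact_log2(size)
    // which holds both when the regions are disjoint and when d precedes s
    // (the unsigned subtraction then wraps to a very large value).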
1592 
1593     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1594     if (dest_uninitialized) {
1595       decorators |= IS_DEST_UNINITIALIZED;
1596     }
1597     if (aligned) {
1598       decorators |= ARRAYCOPY_ALIGNED;
1599     }
1600 
1601     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1602     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1603 
1604     if (is_oop) {
1605       // save regs before copy_memory
1606       __ push(RegSet::of(d, count), sp);
1607     }
1608     {
1609       // UnsafeCopyMemory page error: continue after ucm
1610       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1611       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1612       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1613     }
1614     if (is_oop) {
1615       __ pop(RegSet::of(d, count), sp);
1616       if (VerifyOops)
1617         verify_oop_array(size, d, count, r16);
1618     }
1619     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1620     __ leave();
1621     __ mov(r0, zr); // return 0
1622     __ ret(lr);
1623     return start;
1624 }
1625 
1626   // Arguments:
1627   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1628   //             ignored
1629   //   name    - stub name string
1630   //
1631   // Inputs:
1632   //   c_rarg0   - source array address
1633   //   c_rarg1   - destination array address
1634   //   c_rarg2   - element count, treated as ssize_t, can be zero
1635   //
1636   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1637   // we let the hardware handle it.  The one to eight bytes within words,
1638   // dwords or qwords that span cache line boundaries will still be loaded
1639   // and stored atomically.
1640   //
1641   // Side Effects:
1649   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1650   //   used by generate_conjoint_byte_copy().
1651   //
1652   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1653     const bool not_oop = false;
1654     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1655   }
1656 
1657   // Arguments:
1658   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1659   //             ignored
1660   //   name    - stub name string
1661   //
1662   // Inputs:
1663   //   c_rarg0   - source array address
1664   //   c_rarg1   - destination array address
1665   //   c_rarg2   - element count, treated as ssize_t, can be zero
1666   //
1667   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1668   // we let the hardware handle it.  The one to eight bytes within words,
1669   // dwords or qwords that span cache line boundaries will still be loaded
1670   // and stored atomically.
1671   //
1672   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1673                                       address* entry, const char *name) {
1674     const bool not_oop = false;
1675     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1676   }
1677 
1678   // Arguments:
1679   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1680   //             ignored
1681   //   name    - stub name string
1682   //
1683   // Inputs:
1684   //   c_rarg0   - source array address
1685   //   c_rarg1   - destination array address
1686   //   c_rarg2   - element count, treated as ssize_t, can be zero
1687   //
1688   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1689   // let the hardware handle it.  The two or four words within dwords
1690   // or qwords that span cache line boundaries will still be loaded
1691   // and stored atomically.
1692   //
1693   // Side Effects:
1694   //   disjoint_short_copy_entry is set to the no-overlap entry point
1695   //   used by generate_conjoint_short_copy().
1696   //
1697   address generate_disjoint_short_copy(bool aligned,
1698                                        address* entry, const char *name) {
1699     const bool not_oop = false;
1700     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1701   }
1702 
1703   // Arguments:
1704   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1705   //             ignored
1706   //   name    - stub name string
1707   //
1708   // Inputs:
1709   //   c_rarg0   - source array address
1710   //   c_rarg1   - destination array address
1711   //   c_rarg2   - element count, treated as ssize_t, can be zero
1712   //
1713   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1714   // let the hardware handle it.  The two or four words within dwords
1715   // or qwords that span cache line boundaries will still be loaded
1716   // and stored atomically.
1717   //
1718   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1719                                        address *entry, const char *name) {
1720     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1725   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1726   //             ignored
1727   //   name    - stub name string
1728   //
1729   // Inputs:
1730   //   c_rarg0   - source array address
1731   //   c_rarg1   - destination array address
1732   //   c_rarg2   - element count, treated as ssize_t, can be zero
1733   //
1734   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1735   // the hardware handle it.  The two dwords within qwords that span
1736   // cache line boundaries will still be loaded and stored atomically.
1737   //
1738   // Side Effects:
1739   //   disjoint_int_copy_entry is set to the no-overlap entry point
1740   //   used by generate_conjoint_int_oop_copy().
1741   //
1742   address generate_disjoint_int_copy(bool aligned, address *entry,
1743                                          const char *name, bool dest_uninitialized = false) {
1744     const bool not_oop = false;
1745     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1746   }
1747 
1748   // Arguments:
1749   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1750   //             ignored
1751   //   name    - stub name string
1752   //
1753   // Inputs:
1754   //   c_rarg0   - source array address
1755   //   c_rarg1   - destination array address
1756   //   c_rarg2   - element count, treated as ssize_t, can be zero
1757   //
1758   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1759   // the hardware handle it.  The two dwords within qwords that span
1760   // cache line boundaries will still be loaded and stored atomically.
1761   //
1762   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1763                                      address *entry, const char *name,
1764                                      bool dest_uninitialized = false) {
1765     const bool not_oop = false;
1766     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1767   }
1768 
1769 
1770   // Arguments:
1771   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1772   //             ignored
1773   //   name    - stub name string
1774   //
1775   // Inputs:
1776   //   c_rarg0   - source array address
1777   //   c_rarg1   - destination array address
1778   //   c_rarg2   - element count, treated as size_t, can be zero
1779   //
1780   // Side Effects:
1781   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1782   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1783   //
1784   address generate_disjoint_long_copy(bool aligned, address *entry,
1785                                           const char *name, bool dest_uninitialized = false) {
1786     const bool not_oop = false;
1787     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1788   }
1789 
1790   // Arguments:
1791   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1792   //             ignored
1793   //   name    - stub name string
1794   //
1795   // Inputs:
1796   //   c_rarg0   - source array address
1797   //   c_rarg1   - destination array address
1798   //   c_rarg2   - element count, treated as size_t, can be zero
1799   //
1800   address generate_conjoint_long_copy(bool aligned,
1801                                       address nooverlap_target, address *entry,
1802                                       const char *name, bool dest_uninitialized = false) {
1803     const bool not_oop = false;
1804     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1805   }
1806 
1807   // Arguments:
1808   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1809   //             ignored
1810   //   name    - stub name string
1811   //
1812   // Inputs:
1813   //   c_rarg0   - source array address
1814   //   c_rarg1   - destination array address
1815   //   c_rarg2   - element count, treated as size_t, can be zero
1816   //
1817   // Side Effects:
1818   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1819   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1820   //
1821   address generate_disjoint_oop_copy(bool aligned, address *entry,
1822                                      const char *name, bool dest_uninitialized) {
1823     const bool is_oop = true;
1824     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1825     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1826   }
1827 
1828   // Arguments:
1829   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1830   //             ignored
1831   //   name    - stub name string
1832   //
1833   // Inputs:
1834   //   c_rarg0   - source array address
1835   //   c_rarg1   - destination array address
1836   //   c_rarg2   - element count, treated as size_t, can be zero
1837   //
1838   address generate_conjoint_oop_copy(bool aligned,
1839                                      address nooverlap_target, address *entry,
1840                                      const char *name, bool dest_uninitialized) {
1841     const bool is_oop = true;
1842     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1843     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1844                                   name, dest_uninitialized);
1845   }
1846 
1847 
1848   // Helper for generating a dynamic type check.
1849   // Smashes rscratch1, rscratch2.
1850   void generate_type_check(Register sub_klass,
1851                            Register super_check_offset,
1852                            Register super_klass,
1853                            Label& L_success) {
1854     assert_different_registers(sub_klass, super_check_offset, super_klass);
1855 
1856     BLOCK_COMMENT("type_check:");
1857 
1858     Label L_miss;
1859 
1860     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1861                                      super_check_offset);
1862     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
1863 
1864     // Fall through on failure!
1865     __ BIND(L_miss);
1866   }
1867 
1868   //
1869   //  Generate checkcasting array copy stub
1870   //
1871   //  Input:
1872   //    c_rarg0   - source array address
1873   //    c_rarg1   - destination array address
1874   //    c_rarg2   - element count, treated as ssize_t, can be zero
1875   //    c_rarg3   - size_t ckoff (super_check_offset)
1876   //    c_rarg4   - oop ckval (super_klass)
1877   //
1878   //  Output:
1879   //    r0 ==  0  -  success
1880   //    r0 == -1^K - failure, where K is partial transfer count
1881   //
1882   address generate_checkcast_copy(const char *name, address *entry,
1883                                   bool dest_uninitialized = false) {
1884 
1885     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1886 
1887     // Input registers (after setup_arg_regs)
1888     const Register from        = c_rarg0;   // source array address
1889     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1891     const Register ckoff       = c_rarg3;   // super_check_offset
1892     const Register ckval       = c_rarg4;   // super_klass
1893 
1894     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1895     RegSet wb_post_saved_regs = RegSet::of(count);
1896 
1897     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1898     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
1900     const Register start_to    = r20;       // destination array start address
1901     const Register r19_klass   = r19;       // oop._klass
1902 
1903     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1904     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1905 
1906     //---------------------------------------------------------------
1907     // Assembler stub will be used for this call to arraycopy
1908     // if the two arrays are subtypes of Object[] but the
1909     // destination array type is not equal to or a supertype
1910     // of the source type.  Each element must be separately
1911     // checked.
1912 
1913     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1914                                copied_oop, r19_klass, count_save);
1915 
1916     __ align(CodeEntryAlignment);
1917     StubCodeMark mark(this, "StubRoutines", name);
1918     address start = __ pc();
1919 
1920     __ enter(); // required for proper stackwalking of RuntimeStub frame
1921 
1922 #ifdef ASSERT
1923     // caller guarantees that the arrays really are different
1924     // otherwise, we would have to make conjoint checks
1925     { Label L;
1926       __ b(L);                  // conjoint check not yet implemented
1927       __ stop("checkcast_copy within a single array");
1928       __ bind(L);
1929     }
1930 #endif //ASSERT
1931 
1932     // Caller of this entry point must set up the argument registers.
1933     if (entry != nullptr) {
1934       *entry = __ pc();
1935       BLOCK_COMMENT("Entry:");
1936     }
1937 
1938      // Empty array:  Nothing to do.
1939     __ cbz(count, L_done);
1940     __ push(RegSet::of(r19, r20, r21, r22), sp);
1941 
1942 #ifdef ASSERT
1943     BLOCK_COMMENT("assert consistent ckoff/ckval");
1944     // The ckoff and ckval must be mutually consistent,
1945     // even though caller generates both.
1946     { Label L;
1947       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1948       __ ldrw(start_to, Address(ckval, sco_offset));
1949       __ cmpw(ckoff, start_to);
1950       __ br(Assembler::EQ, L);
1951       __ stop("super_check_offset inconsistent");
1952       __ bind(L);
1953     }
1954 #endif //ASSERT
1955 
1956     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1957     bool is_oop = true;
1958     int element_size = UseCompressedOops ? 4 : 8;
1959     if (dest_uninitialized) {
1960       decorators |= IS_DEST_UNINITIALIZED;
1961     }
1962 
1963     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1964     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1965 
1966     // save the original count
1967     __ mov(count_save, count);
1968 
1969     // Copy from low to high addresses
1970     __ mov(start_to, to);              // Save destination array start address
1971     __ b(L_load_element);
1972 
1973     // ======== begin loop ========
1974     // (Loop is rotated; its entry is L_load_element.)
1975     // Loop control:
1976     //   for (; count != 0; count--) {
1977     //     copied_oop = load_heap_oop(from++);
1978     //     ... generate_type_check ...;
1979     //     store_heap_oop(to++, copied_oop);
1980     //   }
1981     __ align(OptoLoopAlignment);
1982 
1983     __ BIND(L_store_element);
1984     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1985                       __ post(to, element_size), copied_oop, noreg,
1986                       gct1, gct2, gct3);
1987     __ sub(count, count, 1);
1988     __ cbz(count, L_do_card_marks);
1989 
1990     // ======== loop entry is here ========
1991     __ BIND(L_load_element);
1992     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1993                      copied_oop, noreg, __ post(from, element_size),
1994                      gct1);
1995     __ cbz(copied_oop, L_store_element);
1996 
1997     __ load_klass(r19_klass, copied_oop);// query the object klass
1998     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1999     // ======== end loop ========
2000 
2001     // It was a real error; we must depend on the caller to finish the job.
2002     // Register count = remaining oops, count_orig = total oops.
2003     // Emit GC store barriers for the oops we have copied and report
2004     // their number to the caller.
2005 
2006     __ subs(count, count_save, count);     // K = partially copied oop count
2007     __ eon(count, count, zr);                   // report (-1^K) to caller
2008     __ br(Assembler::EQ, L_done_pop);
2009 
2010     __ BIND(L_do_card_marks);
2011     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
2012 
2013     __ bind(L_done_pop);
2014     __ pop(RegSet::of(r19, r20, r21, r22), sp);
2015     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2016 
2017     __ bind(L_done);
2018     __ mov(r0, count);
2019     __ leave();
2020     __ ret(lr);
2021 
2022     return start;
2023   }
2024 
2025   // Perform range checks on the proposed arraycopy.
2026   // Kills temp, but nothing else.
2027   // Also, clean the sign bits of src_pos and dst_pos.
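  // Roughly equivalent C sketch of the checks (explanatory comment only):
  //   if ((juint)(src_pos + length) > (juint)arrayOop(src)->length()) goto L_failed;
  //   if ((juint)(dst_pos + length) > (juint)arrayOop(dst)->length()) goto L_failed;
  //   src_pos &= 0xffffffff;  dst_pos &= 0xffffffff;   // movw zero-extends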
2028   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2029                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
2031                               Register dst_pos, // destination position (c_rarg3)
2032                               Register length,
2033                               Register temp,
2034                               Label& L_failed) {
2035     BLOCK_COMMENT("arraycopy_range_checks:");
2036 
2037     assert_different_registers(rscratch1, temp);
2038 
2039     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2040     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2041     __ addw(temp, length, src_pos);
2042     __ cmpw(temp, rscratch1);
2043     __ br(Assembler::HI, L_failed);
2044 
2045     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2046     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2047     __ addw(temp, length, dst_pos);
2048     __ cmpw(temp, rscratch1);
2049     __ br(Assembler::HI, L_failed);
2050 
2051     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2052     __ movw(src_pos, src_pos);
2053     __ movw(dst_pos, dst_pos);
2054 
2055     BLOCK_COMMENT("arraycopy_range_checks done");
2056   }
2057 
2058   // These stubs get called from some dumb test routine.
2059   // I'll write them properly when they're called from
2060   // something that's actually doing something.
2061   static void fake_arraycopy_stub(address src, address dst, int count) {
2062     assert(count == 0, "huh?");
2063   }
2064 
2065 
2066   //
2067   //  Generate 'unsafe' array copy stub
2068   //  Though just as safe as the other stubs, it takes an unscaled
2069   //  size_t argument instead of an element count.
2070   //
2071   //  Input:
2072   //    c_rarg0   - source array address
2073   //    c_rarg1   - destination array address
2074   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2075   //
2076   // Examines the alignment of the operands and dispatches
2077   // to a long, int, short, or byte copy loop.
2078   //
2079   address generate_unsafe_copy(const char *name,
2080                                address byte_copy_entry,
2081                                address short_copy_entry,
2082                                address int_copy_entry,
2083                                address long_copy_entry) {
2084     Label L_long_aligned, L_int_aligned, L_short_aligned;
2085     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2086 
2087     __ align(CodeEntryAlignment);
2088     StubCodeMark mark(this, "StubRoutines", name);
2089     address start = __ pc();
2090     __ enter(); // required for proper stackwalking of RuntimeStub frame
2091 
2092     // bump this on entry, not on exit:
2093     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2094 
2095     __ orr(rscratch1, s, d);
2096     __ orr(rscratch1, rscratch1, count);
2097 
2098     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2099     __ cbz(rscratch1, L_long_aligned);
2100     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2101     __ cbz(rscratch1, L_int_aligned);
2102     __ tbz(rscratch1, 0, L_short_aligned);
2103     __ b(RuntimeAddress(byte_copy_entry));
2104 
2105     __ BIND(L_short_aligned);
2106     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2107     __ b(RuntimeAddress(short_copy_entry));
2108     __ BIND(L_int_aligned);
2109     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2110     __ b(RuntimeAddress(int_copy_entry));
2111     __ BIND(L_long_aligned);
2112     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2113     __ b(RuntimeAddress(long_copy_entry));
2114 
2115     return start;
2116   }
2117 
2118   //
2119   //  Generate generic array copy stubs
2120   //
2121   //  Input:
2122   //    c_rarg0    -  src oop
2123   //    c_rarg1    -  src_pos (32-bits)
2124   //    c_rarg2    -  dst oop
2125   //    c_rarg3    -  dst_pos (32-bits)
2126   //    c_rarg4    -  element count (32-bits)
2127   //
2128   //  Output:
2129   //    r0 ==  0  -  success
2130   //    r0 == -1^K - failure, where K is partial transfer count
2131   //
2132   address generate_generic_copy(const char *name,
2133                                 address byte_copy_entry, address short_copy_entry,
2134                                 address int_copy_entry, address oop_copy_entry,
2135                                 address long_copy_entry, address checkcast_copy_entry) {
2136 
2137     Label L_failed, L_objArray;
2138     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2139 
2140     // Input registers
2141     const Register src        = c_rarg0;  // source array oop
2142     const Register src_pos    = c_rarg1;  // source position
2143     const Register dst        = c_rarg2;  // destination array oop
2144     const Register dst_pos    = c_rarg3;  // destination position
2145     const Register length     = c_rarg4;
2146 
2147 
2148     // Registers used as temps
2149     const Register dst_klass  = c_rarg5;
2150 
2151     __ align(CodeEntryAlignment);
2152 
2153     StubCodeMark mark(this, "StubRoutines", name);
2154 
2155     address start = __ pc();
2156 
2157     __ enter(); // required for proper stackwalking of RuntimeStub frame
2158 
2159     // bump this on entry, not on exit:
2160     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2161 
2162     //-----------------------------------------------------------------------
2163     // Assembler stub will be used for this call to arraycopy
2164     // if the following conditions are met:
2165     //
2166     // (1) src and dst must not be null.
2167     // (2) src_pos must not be negative.
2168     // (3) dst_pos must not be negative.
2169     // (4) length  must not be negative.
2170     // (5) src klass and dst klass should be the same and not null.
2171     // (6) src and dst should be arrays.
2172     // (7) src_pos + length must not exceed length of src.
2173     // (8) dst_pos + length must not exceed length of dst.
2174     //
2175 
2176     //  if (src == nullptr) return -1;
2177     __ cbz(src, L_failed);
2178 
2179     //  if (src_pos < 0) return -1;
2180     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2181 
2182     //  if (dst == nullptr) return -1;
2183     __ cbz(dst, L_failed);
2184 
2185     //  if (dst_pos < 0) return -1;
2186     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2187 
2188     // registers used as temp
2189     const Register scratch_length    = r16; // elements count to copy
2190     const Register scratch_src_klass = r17; // array klass
2191     const Register lh                = r15; // layout helper
2192 
2193     //  if (length < 0) return -1;
    __ movw(scratch_length, length);        // length (elements count, 32-bit value)
2195     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2196 
2197     __ load_klass(scratch_src_klass, src);
2198 #ifdef ASSERT
2199     //  assert(src->klass() != nullptr);
2200     {
2201       BLOCK_COMMENT("assert klasses not null {");
2202       Label L1, L2;
2203       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2204       __ bind(L1);
2205       __ stop("broken null klass");
2206       __ bind(L2);
2207       __ load_klass(rscratch1, dst);
2208       __ cbz(rscratch1, L1);     // this would be broken also
2209       BLOCK_COMMENT("} assert klasses not null done");
2210     }
2211 #endif
2212 
2213     // Load layout helper (32-bits)
2214     //
2215     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2216     // 32        30    24            16              8     2                 0
2217     //
2218     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2219     //
2220 
2221     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2222 
2223     // Handle objArrays completely differently...
2224     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2225     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2226     __ movw(rscratch1, objArray_lh);
2227     __ eorw(rscratch2, lh, rscratch1);
2228     __ cbzw(rscratch2, L_objArray);
2229 
2230     //  if (src->klass() != dst->klass()) return -1;
2231     __ load_klass(rscratch2, dst);
2232     __ eor(rscratch2, rscratch2, scratch_src_klass);
2233     __ cbnz(rscratch2, L_failed);
2234 
2235     // Check for flat inline type array -> return -1
2236     __ test_flat_array_oop(src, rscratch2, L_failed);
2237 
2238     // Check for null-free (non-flat) inline type array -> handle as object array
2239     __ test_null_free_array_oop(src, rscratch2, L_objArray);
2240 
2241     //  if (!src->is_Array()) return -1;
2242     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2243 
2244     // At this point, it is known to be a typeArray (array_tag 0x3).
2245 #ifdef ASSERT
2246     {
2247       BLOCK_COMMENT("assert primitive array {");
2248       Label L;
2249       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2250       __ cmpw(lh, rscratch2);
2251       __ br(Assembler::GE, L);
2252       __ stop("must be a primitive array");
2253       __ bind(L);
2254       BLOCK_COMMENT("} assert primitive array done");
2255     }
2256 #endif
2257 
2258     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2259                            rscratch2, L_failed);
2260 
2261     // TypeArrayKlass
2262     //
2263     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2264     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2265     //
2266 
2267     const Register rscratch1_offset = rscratch1;    // array offset
2268     const Register r15_elsize = lh; // element size
2269 
2270     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2271            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2272     __ add(src, src, rscratch1_offset);           // src array offset
2273     __ add(dst, dst, rscratch1_offset);           // dst array offset
2274     BLOCK_COMMENT("choose copy loop based on element size");
2275 
2276     // next registers should be set before the jump to corresponding stub
2277     const Register from     = c_rarg0;  // source array address
2278     const Register to       = c_rarg1;  // destination array address
2279     const Register count    = c_rarg2;  // elements count
2280 
2281     // 'from', 'to', 'count' registers should be set in such order
2282     // since they are the same as 'src', 'src_pos', 'dst'.
2283 
2284     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2285 
2286     // The possible values of elsize are 0-3, i.e. exact_log2(element
2287     // size in bytes).  We do a simple bitwise binary search.
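    // Illustrative C sketch of the dispatch (explanatory comment only):
    //   switch (elsize) {   // bit 1 is tested first, then bit 0
    //     case 0: /* 0b00 */ copy_entry = byte_copy_entry;  break;
    //     case 1: /* 0b01 */ copy_entry = short_copy_entry; break;
    //     case 2: /* 0b10 */ copy_entry = int_copy_entry;   break;
    //     case 3: /* 0b11 */ copy_entry = long_copy_entry;  break;
    //   }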
2288   __ BIND(L_copy_bytes);
2289     __ tbnz(r15_elsize, 1, L_copy_ints);
2290     __ tbnz(r15_elsize, 0, L_copy_shorts);
2291     __ lea(from, Address(src, src_pos));// src_addr
2292     __ lea(to,   Address(dst, dst_pos));// dst_addr
2293     __ movw(count, scratch_length); // length
2294     __ b(RuntimeAddress(byte_copy_entry));
2295 
2296   __ BIND(L_copy_shorts);
2297     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2298     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2299     __ movw(count, scratch_length); // length
2300     __ b(RuntimeAddress(short_copy_entry));
2301 
2302   __ BIND(L_copy_ints);
2303     __ tbnz(r15_elsize, 0, L_copy_longs);
2304     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2305     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2306     __ movw(count, scratch_length); // length
2307     __ b(RuntimeAddress(int_copy_entry));
2308 
2309   __ BIND(L_copy_longs);
2310 #ifdef ASSERT
2311     {
2312       BLOCK_COMMENT("assert long copy {");
2313       Label L;
2314       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2315       __ cmpw(r15_elsize, LogBytesPerLong);
2316       __ br(Assembler::EQ, L);
2317       __ stop("must be long copy, but elsize is wrong");
2318       __ bind(L);
2319       BLOCK_COMMENT("} assert long copy done");
2320     }
2321 #endif
2322     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2323     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2324     __ movw(count, scratch_length); // length
2325     __ b(RuntimeAddress(long_copy_entry));
2326 
2327     // ObjArrayKlass
2328   __ BIND(L_objArray);
2329     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2330 
2331     Label L_plain_copy, L_checkcast_copy;
2332     //  test array classes for subtyping
2333     __ load_klass(r15, dst);
2334     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2335     __ br(Assembler::NE, L_checkcast_copy);
2336 
2337     // Identically typed arrays can be copied without element-wise checks.
2338     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2339                            rscratch2, L_failed);
2340 
2341     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2342     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2343     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2344     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2345     __ movw(count, scratch_length); // length
2346   __ BIND(L_plain_copy);
2347     __ b(RuntimeAddress(oop_copy_entry));
2348 
2349   __ BIND(L_checkcast_copy);
2350     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2351     {
2352       // Before looking at dst.length, make sure dst is also an objArray.
2353       __ ldrw(rscratch1, Address(r15, lh_offset));
2354       __ movw(rscratch2, objArray_lh);
2355       __ eorw(rscratch1, rscratch1, rscratch2);
2356       __ cbnzw(rscratch1, L_failed);
2357 
2358       // It is safe to examine both src.length and dst.length.
2359       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2360                              r15, L_failed);
2361 
2362       __ load_klass(dst_klass, dst); // reload
2363 
2364       // Marshal the base address arguments now, freeing registers.
2365       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2366       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2367       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2368       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2369       __ movw(count, length);           // length (reloaded)
2370       Register sco_temp = c_rarg3;      // this register is free now
2371       assert_different_registers(from, to, count, sco_temp,
2372                                  dst_klass, scratch_src_klass);
2373       // assert_clean_int(count, sco_temp);
2374 
2375       // Generate the type check.
2376       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2377       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2378 
2379       // Smashes rscratch1, rscratch2
2380       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2381 
2382       // Fetch destination element klass from the ObjArrayKlass header.
2383       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2384       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2385       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2386 
2387       // the checkcast_copy loop needs two extra arguments:
2388       assert(c_rarg3 == sco_temp, "#3 already in place");
2389       // Set up arguments for checkcast_copy_entry.
2390       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2391       __ b(RuntimeAddress(checkcast_copy_entry));
2392     }
2393 
2394   __ BIND(L_failed);
2395     __ mov(r0, -1);
2396     __ leave();   // required for proper stackwalking of RuntimeStub frame
2397     __ ret(lr);
2398 
2399     return start;
2400   }
2401 
2402   //
2403   // Generate stub for array fill. If "aligned" is true, the
2404   // "to" address is assumed to be heapword aligned.
2405   //
2406   // Arguments for generated stub:
2407   //   to:    c_rarg0
2408   //   value: c_rarg1
2409   //   count: c_rarg2 treated as signed
2410   //
2411   address generate_fill(BasicType t, bool aligned, const char *name) {
2412     __ align(CodeEntryAlignment);
2413     StubCodeMark mark(this, "StubRoutines", name);
2414     address start = __ pc();
2415 
2416     BLOCK_COMMENT("Entry:");
2417 
    const Register to        = c_rarg0;  // destination array address
2419     const Register value     = c_rarg1;  // value
2420     const Register count     = c_rarg2;  // elements count
2421 
2422     const Register bz_base = r10;        // base for block_zero routine
2423     const Register cnt_words = r11;      // temp register
2424 
2425     __ enter();
2426 
2427     Label L_fill_elements, L_exit1;
2428 
2429     int shift = -1;
2430     switch (t) {
2431       case T_BYTE:
2432         shift = 0;
2433         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2434         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2435         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2436         __ br(Assembler::LO, L_fill_elements);
2437         break;
2438       case T_SHORT:
2439         shift = 1;
2440         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2441         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2442         __ br(Assembler::LO, L_fill_elements);
2443         break;
2444       case T_INT:
2445         shift = 2;
2446         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2447         __ br(Assembler::LO, L_fill_elements);
2448         break;
2449       default: ShouldNotReachHere();
2450     }
2451 
    // Align the destination address to an 8-byte boundary.
2453     Label L_skip_align1, L_skip_align2, L_skip_align4;
2454     if (!aligned) {
2455       switch (t) {
2456         case T_BYTE:
2457           // One byte misalignment happens only for byte arrays.
2458           __ tbz(to, 0, L_skip_align1);
2459           __ strb(value, Address(__ post(to, 1)));
2460           __ subw(count, count, 1);
2461           __ bind(L_skip_align1);
2462           // Fallthrough
2463         case T_SHORT:
2464           // Two bytes misalignment happens only for byte and short (char) arrays.
2465           __ tbz(to, 1, L_skip_align2);
2466           __ strh(value, Address(__ post(to, 2)));
2467           __ subw(count, count, 2 >> shift);
2468           __ bind(L_skip_align2);
2469           // Fallthrough
2470         case T_INT:
          // Align to 8 bytes; we know we are 4-byte aligned to start.
2472           __ tbz(to, 2, L_skip_align4);
2473           __ strw(value, Address(__ post(to, 4)));
2474           __ subw(count, count, 4 >> shift);
2475           __ bind(L_skip_align4);
2476           break;
2477         default: ShouldNotReachHere();
2478       }
2479     }
2480 
2481     //
2482     //  Fill large chunks
2483     //
2484     __ lsrw(cnt_words, count, 3 - shift); // number of words
2485     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2486     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2487     if (UseBlockZeroing) {
2488       Label non_block_zeroing, rest;
2489       // If the fill value is zero we can use the fast zero_words().
2490       __ cbnz(value, non_block_zeroing);
2491       __ mov(bz_base, to);
2492       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2493       address tpc = __ zero_words(bz_base, cnt_words);
2494       if (tpc == nullptr) {
2495         fatal("CodeCache is full at generate_fill");
2496       }
2497       __ b(rest);
2498       __ bind(non_block_zeroing);
2499       __ fill_words(to, cnt_words, value);
2500       __ bind(rest);
2501     } else {
2502       __ fill_words(to, cnt_words, value);
2503     }
2504 
2505     // Remaining count is less than 8 bytes. Fill it by a single store.
2506     // Note that the total length is no less than 8 bytes.
2507     if (t == T_BYTE || t == T_SHORT) {
2508       Label L_exit1;
2509       __ cbzw(count, L_exit1);
2510       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2511       __ str(value, Address(to, -8));    // overwrite some elements
2512       __ bind(L_exit1);
2513       __ leave();
2514       __ ret(lr);
2515     }
2516 
    // Handle fills of less than 8 bytes.
2518     Label L_fill_2, L_fill_4, L_exit2;
2519     __ bind(L_fill_elements);
2520     switch (t) {
2521       case T_BYTE:
2522         __ tbz(count, 0, L_fill_2);
2523         __ strb(value, Address(__ post(to, 1)));
2524         __ bind(L_fill_2);
2525         __ tbz(count, 1, L_fill_4);
2526         __ strh(value, Address(__ post(to, 2)));
2527         __ bind(L_fill_4);
2528         __ tbz(count, 2, L_exit2);
2529         __ strw(value, Address(to));
2530         break;
2531       case T_SHORT:
2532         __ tbz(count, 0, L_fill_4);
2533         __ strh(value, Address(__ post(to, 2)));
2534         __ bind(L_fill_4);
2535         __ tbz(count, 1, L_exit2);
2536         __ strw(value, Address(to));
2537         break;
2538       case T_INT:
2539         __ cbzw(count, L_exit2);
2540         __ strw(value, Address(to));
2541         break;
2542       default: ShouldNotReachHere();
2543     }
2544     __ bind(L_exit2);
2545     __ leave();
2546     __ ret(lr);
2547     return start;
2548   }
2549 
2550   address generate_data_cache_writeback() {
2551     const Register line        = c_rarg0;  // address of line to write back
2552 
2553     __ align(CodeEntryAlignment);
2554 
2555     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2556 
2557     address start = __ pc();
2558     __ enter();
2559     __ cache_wb(Address(line, 0));
2560     __ leave();
2561     __ ret(lr);
2562 
2563     return start;
2564   }
2565 
2566   address generate_data_cache_writeback_sync() {
2567     const Register is_pre     = c_rarg0;  // pre or post sync
2568 
2569     __ align(CodeEntryAlignment);
2570 
2571     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2572 
2573     // pre wbsync is a no-op
    // post wbsync emits a memory barrier
2575 
2576     Label skip;
2577     address start = __ pc();
2578     __ enter();
2579     __ cbnz(is_pre, skip);
2580     __ cache_wbsync(false);
2581     __ bind(skip);
2582     __ leave();
2583     __ ret(lr);
2584 
2585     return start;
2586   }
2587 
2588   void generate_arraycopy_stubs() {
2589     address entry;
2590     address entry_jbyte_arraycopy;
2591     address entry_jshort_arraycopy;
2592     address entry_jint_arraycopy;
2593     address entry_oop_arraycopy;
2594     address entry_jlong_arraycopy;
2595     address entry_checkcast_arraycopy;
2596 
2597     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards);
2598     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards);
2599 
2600     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards);
2601     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards);
2602 
2603     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards);
2604     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards);
2605 
2606     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2607 
2608     //*** jbyte
2609     // Always need aligned and unaligned versions
2610     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2611                                                                                   "jbyte_disjoint_arraycopy");
2612     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2613                                                                                   &entry_jbyte_arraycopy,
2614                                                                                   "jbyte_arraycopy");
2615     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2616                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2617     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, nullptr,
2618                                                                                   "arrayof_jbyte_arraycopy");
2619 
2620     //*** jshort
2621     // Always need aligned and unaligned versions
2622     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2623                                                                                     "jshort_disjoint_arraycopy");
2624     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2625                                                                                     &entry_jshort_arraycopy,
2626                                                                                     "jshort_arraycopy");
2627     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2628                                                                                     "arrayof_jshort_disjoint_arraycopy");
2629     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2630                                                                                     "arrayof_jshort_arraycopy");
2631 
2632     //*** jint
2633     // Aligned versions
2634     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2635                                                                                 "arrayof_jint_disjoint_arraycopy");
2636     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2637                                                                                 "arrayof_jint_arraycopy");
2638     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2639     // entry_jint_arraycopy always points to the unaligned version
2640     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2641                                                                                 "jint_disjoint_arraycopy");
2642     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2643                                                                                 &entry_jint_arraycopy,
2644                                                                                 "jint_arraycopy");
2645 
2646     //*** jlong
2647     // It is always aligned
2648     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2649                                                                                   "arrayof_jlong_disjoint_arraycopy");
2650     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2651                                                                                   "arrayof_jlong_arraycopy");
2652     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2653     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2654 
2655     //*** oops
2656     {
2657       // With compressed oops we need unaligned versions; notice that
2658       // we overwrite entry_oop_arraycopy.
2659       bool aligned = !UseCompressedOops;
2660 
2661       StubRoutines::_arrayof_oop_disjoint_arraycopy
2662         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2663                                      /*dest_uninitialized*/false);
2664       StubRoutines::_arrayof_oop_arraycopy
2665         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2666                                      /*dest_uninitialized*/false);
2667       // Aligned versions without pre-barriers
2668       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2669         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2670                                      /*dest_uninitialized*/true);
2671       StubRoutines::_arrayof_oop_arraycopy_uninit
2672         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2673                                      /*dest_uninitialized*/true);
2674     }
2675 
2676     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2677     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2678     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2679     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2680 
2681     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2682     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2683                                                                         /*dest_uninitialized*/true);
2684 
2685     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2686                                                               entry_jbyte_arraycopy,
2687                                                               entry_jshort_arraycopy,
2688                                                               entry_jint_arraycopy,
2689                                                               entry_jlong_arraycopy);
2690 
2691     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2692                                                                entry_jbyte_arraycopy,
2693                                                                entry_jshort_arraycopy,
2694                                                                entry_jint_arraycopy,
2695                                                                entry_oop_arraycopy,
2696                                                                entry_jlong_arraycopy,
2697                                                                entry_checkcast_arraycopy);
2698 
2699     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2700     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2701     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2702     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2703     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2704     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2705   }
2706 
2707   void generate_math_stubs() { Unimplemented(); }
2708 
2709   // Arguments:
2710   //
2711   // Inputs:
2712   //   c_rarg0   - source byte array address
2713   //   c_rarg1   - destination byte array address
2714   //   c_rarg2   - K (key) in little endian int array
2715   //
2716   address generate_aescrypt_encryptBlock() {
2717     __ align(CodeEntryAlignment);
2718     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2719 
2720     const Register from        = c_rarg0;  // source array address
2721     const Register to          = c_rarg1;  // destination array address
2722     const Register key         = c_rarg2;  // key array address
2723     const Register keylen      = rscratch1;
2724 
2725     address start = __ pc();
2726     __ enter();
2727 
2728     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2729 
2730     __ aesenc_loadkeys(key, keylen);
2731     __ aesecb_encrypt(from, to, keylen);
2732 
2733     __ mov(r0, 0);
2734 
2735     __ leave();
2736     __ ret(lr);
2737 
2738     return start;
2739   }
2740 
2741   // Arguments:
2742   //
2743   // Inputs:
2744   //   c_rarg0   - source byte array address
2745   //   c_rarg1   - destination byte array address
2746   //   c_rarg2   - K (key) in little endian int array
2747   //
2748   address generate_aescrypt_decryptBlock() {
2749     assert(UseAES, "need AES cryptographic extension support");
2750     __ align(CodeEntryAlignment);
2751     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2752     Label L_doLast;
2753 
2754     const Register from        = c_rarg0;  // source array address
2755     const Register to          = c_rarg1;  // destination array address
2756     const Register key         = c_rarg2;  // key array address
2757     const Register keylen      = rscratch1;
2758 
2759     address start = __ pc();
2760     __ enter(); // required for proper stackwalking of RuntimeStub frame
2761 
2762     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2763 
2764     __ aesecb_decrypt(from, to, key, keylen);
2765 
2766     __ mov(r0, 0);
2767 
2768     __ leave();
2769     __ ret(lr);
2770 
2771     return start;
2772   }
2773 
2774   // Arguments:
2775   //
2776   // Inputs:
2777   //   c_rarg0   - source byte array address
2778   //   c_rarg1   - destination byte array address
2779   //   c_rarg2   - K (key) in little endian int array
2780   //   c_rarg3   - r vector byte array address
2781   //   c_rarg4   - input length
2782   //
2783   // Output:
2784   //   r0        - input length
2785   //
2786   address generate_cipherBlockChaining_encryptAESCrypt() {
2787     assert(UseAES, "need AES cryptographic extension support");
2788     __ align(CodeEntryAlignment);
2789     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2790 
2791     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2792 
2793     const Register from        = c_rarg0;  // source array address
2794     const Register to          = c_rarg1;  // destination array address
2795     const Register key         = c_rarg2;  // key array address
2796     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
2797                                            // and left holding the last encrypted block (the chaining value)
2798     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2799     const Register keylen      = rscratch1;
2800 
2801     address start = __ pc();
2802 
2803       __ enter();
2804 
2805       __ movw(rscratch2, len_reg);
2806 
2807       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2808 
2809       __ ld1(v0, __ T16B, rvec);
2810 
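           // The expanded key length in 4-byte words identifies the AES variant:
           // 44 words for AES-128, 52 for AES-192, 60 for AES-256. The condition
           // flags set by the cmpw below stay live throughout L_aes_loop (nothing
           // in the loop body modifies NZCV), so the same test also steers the
           // per-round-count branches inside the loop.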
2811       __ cmpw(keylen, 52);
2812       __ br(Assembler::CC, L_loadkeys_44);
2813       __ br(Assembler::EQ, L_loadkeys_52);
2814 
2815       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2816       __ rev32(v17, __ T16B, v17);
2817       __ rev32(v18, __ T16B, v18);
2818     __ BIND(L_loadkeys_52);
2819       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2820       __ rev32(v19, __ T16B, v19);
2821       __ rev32(v20, __ T16B, v20);
2822     __ BIND(L_loadkeys_44);
2823       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2824       __ rev32(v21, __ T16B, v21);
2825       __ rev32(v22, __ T16B, v22);
2826       __ rev32(v23, __ T16B, v23);
2827       __ rev32(v24, __ T16B, v24);
2828       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2829       __ rev32(v25, __ T16B, v25);
2830       __ rev32(v26, __ T16B, v26);
2831       __ rev32(v27, __ T16B, v27);
2832       __ rev32(v28, __ T16B, v28);
2833       __ ld1(v29, v30, v31, __ T16B, key);
2834       __ rev32(v29, __ T16B, v29);
2835       __ rev32(v30, __ T16B, v30);
2836       __ rev32(v31, __ T16B, v31);
2837 
2838     __ BIND(L_aes_loop);
2839       __ ld1(v1, __ T16B, __ post(from, 16));
2840       __ eor(v0, __ T16B, v0, v1);
2841 
2842       __ br(Assembler::CC, L_rounds_44);
2843       __ br(Assembler::EQ, L_rounds_52);
2844 
2845       __ aese(v0, v17); __ aesmc(v0, v0);
2846       __ aese(v0, v18); __ aesmc(v0, v0);
2847     __ BIND(L_rounds_52);
2848       __ aese(v0, v19); __ aesmc(v0, v0);
2849       __ aese(v0, v20); __ aesmc(v0, v0);
2850     __ BIND(L_rounds_44);
2851       __ aese(v0, v21); __ aesmc(v0, v0);
2852       __ aese(v0, v22); __ aesmc(v0, v0);
2853       __ aese(v0, v23); __ aesmc(v0, v0);
2854       __ aese(v0, v24); __ aesmc(v0, v0);
2855       __ aese(v0, v25); __ aesmc(v0, v0);
2856       __ aese(v0, v26); __ aesmc(v0, v0);
2857       __ aese(v0, v27); __ aesmc(v0, v0);
2858       __ aese(v0, v28); __ aesmc(v0, v0);
2859       __ aese(v0, v29); __ aesmc(v0, v0);
2860       __ aese(v0, v30);
2861       __ eor(v0, __ T16B, v0, v31);
2862 
2863       __ st1(v0, __ T16B, __ post(to, 16));
2864 
2865       __ subw(len_reg, len_reg, 16);
2866       __ cbnzw(len_reg, L_aes_loop);
2867 
2868       __ st1(v0, __ T16B, rvec);
2869 
2870       __ mov(r0, rscratch2);
2871 
2872       __ leave();
2873       __ ret(lr);
2874 
2875       return start;
2876   }
2877 
2878   // Arguments:
2879   //
2880   // Inputs:
2881   //   c_rarg0   - source byte array address
2882   //   c_rarg1   - destination byte array address
2883   //   c_rarg2   - K (key) in little endian int array
2884   //   c_rarg3   - r vector byte array address
2885   //   c_rarg4   - input length
2886   //
2887   // Output:
2888   //   r0        - input length
2889   //
2890   address generate_cipherBlockChaining_decryptAESCrypt() {
2891     assert(UseAES, "need AES cryptographic extension support");
2892     __ align(CodeEntryAlignment);
2893     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2894 
2895     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2896 
2897     const Register from        = c_rarg0;  // source array address
2898     const Register to          = c_rarg1;  // destination array address
2899     const Register key         = c_rarg2;  // key array address
2900     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
2901                                            // and left holding the last ciphertext block (the chaining value)
2902     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2903     const Register keylen      = rscratch1;
2904 
2905     address start = __ pc();
2906 
2907       __ enter();
2908 
2909       __ movw(rscratch2, len_reg);
2910 
2911       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2912 
2913       __ ld1(v2, __ T16B, rvec);
2914 
2915       __ ld1(v31, __ T16B, __ post(key, 16));
2916       __ rev32(v31, __ T16B, v31);
2917 
2918       __ cmpw(keylen, 52);
2919       __ br(Assembler::CC, L_loadkeys_44);
2920       __ br(Assembler::EQ, L_loadkeys_52);
2921 
2922       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2923       __ rev32(v17, __ T16B, v17);
2924       __ rev32(v18, __ T16B, v18);
2925     __ BIND(L_loadkeys_52);
2926       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2927       __ rev32(v19, __ T16B, v19);
2928       __ rev32(v20, __ T16B, v20);
2929     __ BIND(L_loadkeys_44);
2930       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2931       __ rev32(v21, __ T16B, v21);
2932       __ rev32(v22, __ T16B, v22);
2933       __ rev32(v23, __ T16B, v23);
2934       __ rev32(v24, __ T16B, v24);
2935       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2936       __ rev32(v25, __ T16B, v25);
2937       __ rev32(v26, __ T16B, v26);
2938       __ rev32(v27, __ T16B, v27);
2939       __ rev32(v28, __ T16B, v28);
2940       __ ld1(v29, v30, __ T16B, key);
2941       __ rev32(v29, __ T16B, v29);
2942       __ rev32(v30, __ T16B, v30);
2943 
2944     __ BIND(L_aes_loop);
2945       __ ld1(v0, __ T16B, __ post(from, 16));
2946       __ orr(v1, __ T16B, v0, v0);
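           // v1 keeps a copy of the ciphertext block: CBC decryption computes
           // P[i] = D(C[i]) ^ C[i-1], and this saved block becomes the chaining
           // value (copied into v2 at the bottom of the loop) for the next
           // iteration.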
2947 
2948       __ br(Assembler::CC, L_rounds_44);
2949       __ br(Assembler::EQ, L_rounds_52);
2950 
2951       __ aesd(v0, v17); __ aesimc(v0, v0);
2952       __ aesd(v0, v18); __ aesimc(v0, v0);
2953     __ BIND(L_rounds_52);
2954       __ aesd(v0, v19); __ aesimc(v0, v0);
2955       __ aesd(v0, v20); __ aesimc(v0, v0);
2956     __ BIND(L_rounds_44);
2957       __ aesd(v0, v21); __ aesimc(v0, v0);
2958       __ aesd(v0, v22); __ aesimc(v0, v0);
2959       __ aesd(v0, v23); __ aesimc(v0, v0);
2960       __ aesd(v0, v24); __ aesimc(v0, v0);
2961       __ aesd(v0, v25); __ aesimc(v0, v0);
2962       __ aesd(v0, v26); __ aesimc(v0, v0);
2963       __ aesd(v0, v27); __ aesimc(v0, v0);
2964       __ aesd(v0, v28); __ aesimc(v0, v0);
2965       __ aesd(v0, v29); __ aesimc(v0, v0);
2966       __ aesd(v0, v30);
2967       __ eor(v0, __ T16B, v0, v31);
2968       __ eor(v0, __ T16B, v0, v2);
2969 
2970       __ st1(v0, __ T16B, __ post(to, 16));
2971       __ orr(v2, __ T16B, v1, v1);
2972 
2973       __ subw(len_reg, len_reg, 16);
2974       __ cbnzw(len_reg, L_aes_loop);
2975 
2976       __ st1(v2, __ T16B, rvec);
2977 
2978       __ mov(r0, rscratch2);
2979 
2980       __ leave();
2981       __ ret(lr);
2982 
2983     return start;
2984   }
2985 
2986   // Big-endian 128-bit + 64-bit -> 128-bit addition.
2987   // Inputs: in (the 128-bit addend, preserved) and inc (the 64-bit
2988   // increment, preserved; its lower dword must be zero).
2989   // The least-significant 64-bit word is held in the upper dword of each vector.
2990   // Output: result
2991   void be_add_128_64(FloatRegister result, FloatRegister in,
2992                      FloatRegister inc, FloatRegister tmp) {
2993     assert_different_registers(result, tmp, inc);
2994 
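         // Worked example (with the layout above): adding inc = 1 to the value
         // 0x00000000000000FF_FFFFFFFFFFFFFFFF wraps the LSD lane to zero; cm(HI)
         // then yields an all-ones mask in that lane, ext moves the mask into the
         // MSD lane, and subtracting -1 carries into the MSD, giving
         // 0x0000000000000100_0000000000000000.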
2995     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
2996                                            // input
2997     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
2998     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
2999                                            // MSD == 0 (must be!) to LSD
3000     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
3001   }
3002 
3003   // CTR AES crypt.
3004   // Arguments:
3005   //
3006   // Inputs:
3007   //   c_rarg0   - source byte array address
3008   //   c_rarg1   - destination byte array address
3009   //   c_rarg2   - K (key) in little endian int array
3010   //   c_rarg3   - counter vector byte array address
3011   //   c_rarg4   - input length
3012   //   c_rarg5   - saved encryptedCounter start
3013   //   c_rarg6   - saved used length
3014   //
3015   // Output:
3016   //   r0       - input length
3017   //
3018   address generate_counterMode_AESCrypt() {
3019     const Register in = c_rarg0;
3020     const Register out = c_rarg1;
3021     const Register key = c_rarg2;
3022     const Register counter = c_rarg3;
3023     const Register saved_len = c_rarg4, len = r10;
3024     const Register saved_encrypted_ctr = c_rarg5;
3025     const Register used_ptr = c_rarg6, used = r12;
3026 
3027     const Register offset = r7;
3028     const Register keylen = r11;
3029 
3030     const unsigned char block_size = 16;
3031     const int bulk_width = 4;
3032     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3033     // performance with larger data sizes, but it also means that the
3034     // fast path isn't used until there are at least 8 blocks, and up
3035     // to 127 bytes of data will be processed on the slow path. For
3036     // that reason, and also so as not to blow away too much icache, 4
3037     // blocks seems like a sensible compromise.
3038 
3039     // Algorithm:
3040     //
3041     //    if (len == 0) {
3042     //        goto DONE;
3043     //    }
3044     //    int result = len;
3045     //    do {
3046     //        if (used >= blockSize) {
3047     //            if (len >= bulk_width * blockSize) {
3048     //                CTR_large_block();
3049     //                if (len == 0)
3050     //                    goto DONE;
3051     //            }
3052     //            for (;;) {
3053     //                16ByteVector v0 = counter;
3054     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3055     //                used = 0;
3056     //                if (len < blockSize)
3057     //                    break;    /* goto NEXT */
3058     //                16ByteVector v1 = load16Bytes(in, offset);
3059     //                v1 = v1 ^ encryptedCounter;
3060     //                store16Bytes(v1, out, offset);
3061     //                used = blockSize;
3062     //                offset += blockSize;
3063     //                len -= blockSize;
3064     //                if (len == 0)
3065     //                    goto DONE;
3066     //            }
3067     //        }
3068     //      NEXT:
3069     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3070     //        len--;
3071     //    } while (len != 0);
3072     //  DONE:
3073     //    return result;
3074     //
3075     // CTR_large_block()
3076     //    Wide bulk encryption of whole blocks.
3077 
3078     __ align(CodeEntryAlignment);
3079     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3080     const address start = __ pc();
3081     __ enter();
3082 
3083     Label DONE, CTR_large_block, large_block_return;
3084     __ ldrw(used, Address(used_ptr));
3085     __ cbzw(saved_len, DONE);
3086 
3087     __ mov(len, saved_len);
3088     __ mov(offset, 0);
3089 
3090     // Compute #rounds for AES based on the length of the key array
3091     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3092 
3093     __ aesenc_loadkeys(key, keylen);
3094 
3095     {
3096       Label L_CTR_loop, NEXT;
3097 
3098       __ bind(L_CTR_loop);
3099 
3100       __ cmp(used, block_size);
3101       __ br(__ LO, NEXT);
3102 
3103       // Maybe we have a lot of data
3104       __ subsw(rscratch1, len, bulk_width * block_size);
3105       __ br(__ HS, CTR_large_block);
3106       __ BIND(large_block_return);
3107       __ cbzw(len, DONE);
3108 
3109       // Setup the counter
3110       __ movi(v4, __ T4S, 0);
3111       __ movi(v5, __ T4S, 1);
3112       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3113 
3114       // 128-bit big-endian increment
3115       __ ld1(v0, __ T16B, counter);
3116       __ rev64(v16, __ T16B, v0);
3117       be_add_128_64(v16, v16, v4, /*tmp*/v5);
3118       __ rev64(v16, __ T16B, v16);
3119       __ st1(v16, __ T16B, counter);
3120       // Previous counter value is in v0
3121       // v4 contains { 0, 1 }
3122 
3123       {
3124         // We have fewer than bulk_width blocks of data left. Encrypt
3125         // them one by one until there is less than a full block
3126         // remaining, being careful to save both the encrypted counter
3127         // and the counter.
3128 
3129         Label inner_loop;
3130         __ bind(inner_loop);
3131         // Counter to encrypt is in v0
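             // (With noreg source/destination, aesecb_encrypt neither loads nor
             // stores: it encrypts v0 in place, so v0 ends up holding the
             // keystream block.)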
3132         __ aesecb_encrypt(noreg, noreg, keylen);
3133         __ st1(v0, __ T16B, saved_encrypted_ctr);
3134 
3135         // Do we have a remaining full block?
3136 
3137         __ mov(used, 0);
3138         __ cmp(len, block_size);
3139         __ br(__ LO, NEXT);
3140 
3141         // Yes, we have a full block
3142         __ ldrq(v1, Address(in, offset));
3143         __ eor(v1, __ T16B, v1, v0);
3144         __ strq(v1, Address(out, offset));
3145         __ mov(used, block_size);
3146         __ add(offset, offset, block_size);
3147 
3148         __ subw(len, len, block_size);
3149         __ cbzw(len, DONE);
3150 
3151         // Increment the counter, store it back
3152         __ orr(v0, __ T16B, v16, v16);
3153         __ rev64(v16, __ T16B, v16);
3154         be_add_128_64(v16, v16, v4, /*tmp*/v5);
3155         __ rev64(v16, __ T16B, v16);
3156         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3157 
3158         __ b(inner_loop);
3159       }
3160 
3161       __ BIND(NEXT);
3162 
3163       // Encrypt a single byte, and loop.
3164       // We expect this to be a rare event.
3165       __ ldrb(rscratch1, Address(in, offset));
3166       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3167       __ eor(rscratch1, rscratch1, rscratch2);
3168       __ strb(rscratch1, Address(out, offset));
3169       __ add(offset, offset, 1);
3170       __ add(used, used, 1);
3171       __ subw(len, len, 1);
3172       __ cbnzw(len, L_CTR_loop);
3173     }
3174 
3175     __ bind(DONE);
3176     __ strw(used, Address(used_ptr));
3177     __ mov(r0, saved_len);
3178 
3179     __ leave(); // required for proper stackwalking of RuntimeStub frame
3180     __ ret(lr);
3181 
3182     // Bulk encryption
3183 
3184     __ BIND (CTR_large_block);
3185     assert(bulk_width == 4 || bulk_width == 8, "must be");
3186 
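         // v8..v15 are callee-saved SIMD registers under the AArch64 procedure
         // call standard (at least their low 64 bits), so preserve them before
         // using them as scratch in the bulk loop.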
3187     if (bulk_width == 8) {
3188       __ sub(sp, sp, 4 * 16);
3189       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3190     }
3191     __ sub(sp, sp, 4 * 16);
3192     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3193     RegSet saved_regs = (RegSet::of(in, out, offset)
3194                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3195     __ push(saved_regs, sp);
3196     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3197     __ add(in, in, offset);
3198     __ add(out, out, offset);
3199 
3200     // Keys should already be loaded into the correct registers
3201 
3202     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3203     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3204 
3205     // AES/CTR loop
3206     {
3207       Label L_CTR_loop;
3208       __ BIND(L_CTR_loop);
3209 
3210       // Setup the counters
3211       __ movi(v8, __ T4S, 0);
3212       __ movi(v9, __ T4S, 1);
3213       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3214 
3215       for (int i = 0; i < bulk_width; i++) {
3216         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3217         __ rev64(v0_ofs, __ T16B, v16);
3218         be_add_128_64(v16, v16, v8, /*tmp*/v9);
3219       }
3220 
3221       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3222 
3223       // Encrypt the counters
3224       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3225 
3226       if (bulk_width == 8) {
3227         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3228       }
3229 
3230       // XOR the encrypted counters with the inputs
3231       for (int i = 0; i < bulk_width; i++) {
3232         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3233         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3234         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3235       }
3236 
3237       // Write the encrypted data
3238       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3239       if (bulk_width == 8) {
3240         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3241       }
3242 
3243       __ subw(len, len, 16 * bulk_width);
3244       __ cbnzw(len, L_CTR_loop);
3245     }
3246 
3247     // Save the counter back where it goes
3248     __ rev64(v16, __ T16B, v16);
3249     __ st1(v16, __ T16B, counter);
3250 
3251     __ pop(saved_regs, sp);
3252 
3253     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3254     if (bulk_width == 8) {
3255       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3256     }
3257 
3258     __ andr(rscratch1, len, -16 * bulk_width);
3259     __ sub(len, len, rscratch1);
3260     __ add(offset, offset, rscratch1);
3261     __ mov(used, 16);
3262     __ strw(used, Address(used_ptr));
3263     __ b(large_block_return);
3264 
3265     return start;
3266   }
3267 
3268   // Vector AES Galois Counter Mode implementation. Parameters:
3269   //
3270   // in = c_rarg0
3271   // len = c_rarg1
3272   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3273   // out = c_rarg3
3274   // key = c_rarg4
3275   // state = c_rarg5 - GHASH.state
3276   // subkeyHtbl = c_rarg6 - powers of H
3277   // counter = c_rarg7 - 16 bytes of CTR
3278   // return - number of processed bytes
3279   address generate_galoisCounterMode_AESCrypt() {
3280     address ghash_polynomial = __ pc();
3281     __ emit_int64(0x87);  // The low-order bits of the field
3282                           // polynomial (i.e. p = z^7+z^2+z+1)
3283                           // repeated in the low and high parts of a
3284                           // 128-bit vector
3285     __ emit_int64(0x87);
3286 
3287     __ align(CodeEntryAlignment);
3288     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3289     address start = __ pc();
3290     __ enter();
3291 
3292     const Register in = c_rarg0;
3293     const Register len = c_rarg1;
3294     const Register ct = c_rarg2;
3295     const Register out = c_rarg3;
3296     // and updated with the incremented counter in the end
3297 
3298     const Register key = c_rarg4;
3299     const Register state = c_rarg5;
3300 
3301     const Register subkeyHtbl = c_rarg6;
3302 
3303     const Register counter = c_rarg7;
3304 
3305     const Register keylen = r10;
3306     // Save state before entering routine
3307     __ sub(sp, sp, 4 * 16);
3308     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3309     __ sub(sp, sp, 4 * 16);
3310     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3311 
3312     // __ andr(len, len, -512);
3313     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3314     __ str(len, __ pre(sp, -2 * wordSize));
3315 
3316     Label DONE;
3317     __ cbz(len, DONE);
3318 
3319     // Compute #rounds for AES based on the length of the key array
3320     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3321 
3322     __ aesenc_loadkeys(key, keylen);
3323     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3324     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3325 
3326     // AES/CTR loop
3327     {
3328       Label L_CTR_loop;
3329       __ BIND(L_CTR_loop);
3330 
3331       // Setup the counters
3332       __ movi(v8, __ T4S, 0);
3333       __ movi(v9, __ T4S, 1);
3334       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3335 
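           // GCM increments only the low 32 bits of the counter (inc32), so after
           // the rev32 a plain 4S add of { 0, 0, 0, 1 } is sufficient; no 64-bit
           // carry propagation is needed here.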
3336       assert(v0->encoding() < v8->encoding(), "");
3337       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3338         FloatRegister f = as_FloatRegister(i);
3339         __ rev32(f, __ T16B, v16);
3340         __ addv(v16, __ T4S, v16, v8);
3341       }
3342 
3343       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3344 
3345       // Encrypt the counters
3346       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3347 
3348       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3349 
3350       // XOR the encrypted counters with the inputs
3351       for (int i = 0; i < 8; i++) {
3352         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3353         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3354         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3355       }
3356       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3357       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3358 
3359       __ subw(len, len, 16 * 8);
3360       __ cbnzw(len, L_CTR_loop);
3361     }
3362 
3363     __ rev32(v16, __ T16B, v16);
3364     __ st1(v16, __ T16B, counter);
3365 
3366     __ ldr(len, Address(sp));
3367     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3368 
3369     // GHASH/CTR loop
3370     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3371                                 len, /*unrolls*/4);
3372 
3373 #ifdef ASSERT
3374     { Label L;
3375       __ cmp(len, (unsigned char)0);
3376       __ br(Assembler::EQ, L);
3377       __ stop("stubGenerator: abort");
3378       __ bind(L);
3379     }
3380 #endif
3381 
3382     __ bind(DONE);
3383     // Return the number of bytes processed
3384     __ ldr(r0, __ post(sp, 2 * wordSize));
3385 
3386     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3387     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3388 
3389     __ leave(); // required for proper stackwalking of RuntimeStub frame
3390     __ ret(lr);
3391     return start;
3392   }
3393 
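       // Caches a 64-byte input block in eight 64-bit general-purpose registers,
       // each holding two consecutive little-endian 32-bit words; extract_u32(i)
       // retrieves word i with a ubfx from register i/2.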
3394   class Cached64Bytes {
3395   private:
3396     MacroAssembler *_masm;
3397     Register _regs[8];
3398 
3399   public:
3400     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3401       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3402       auto it = rs.begin();
3403       for (auto &r: _regs) {
3404         r = *it;
3405         ++it;
3406       }
3407     }
3408 
3409     void gen_loads(Register base) {
3410       for (int i = 0; i < 8; i += 2) {
3411         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3412       }
3413     }
3414 
3415     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3416     void extract_u32(Register dest, int i) {
3417       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3418     }
3419   };
3420 
3421   // Utility routines for md5.
3422   // Clobbers r10 and r11.
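       // Each helper performs one MD5 step of the form
       //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + X[k] + t, s)
       // where f is, per RFC 1321,
       //   FF: F(b,c,d) = (b & c) | (~b & d), computed here as ((c ^ d) & b) ^ d
       //   GG: G(b,c,d) = (b & d) | (c & ~d)
       //   HH: H(b,c,d) = b ^ c ^ d
       //   II: I(b,c,d) = c ^ (b | ~d)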
3423   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3424               int k, int s, int t) {
3425     Register rscratch3 = r10;
3426     Register rscratch4 = r11;
3427 
3428     __ eorw(rscratch3, r3, r4);
3429     __ movw(rscratch2, t);
3430     __ andw(rscratch3, rscratch3, r2);
3431     __ addw(rscratch4, r1, rscratch2);
3432     reg_cache.extract_u32(rscratch1, k);
3433     __ eorw(rscratch3, rscratch3, r4);
3434     __ addw(rscratch4, rscratch4, rscratch1);
3435     __ addw(rscratch3, rscratch3, rscratch4);
3436     __ rorw(rscratch2, rscratch3, 32 - s);
3437     __ addw(r1, rscratch2, r2);
3438   }
3439 
3440   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3441               int k, int s, int t) {
3442     Register rscratch3 = r10;
3443     Register rscratch4 = r11;
3444 
3445     __ andw(rscratch3, r2, r4);
3446     __ bicw(rscratch4, r3, r4);
3447     reg_cache.extract_u32(rscratch1, k);
3448     __ movw(rscratch2, t);
3449     __ orrw(rscratch3, rscratch3, rscratch4);
3450     __ addw(rscratch4, r1, rscratch2);
3451     __ addw(rscratch4, rscratch4, rscratch1);
3452     __ addw(rscratch3, rscratch3, rscratch4);
3453     __ rorw(rscratch2, rscratch3, 32 - s);
3454     __ addw(r1, rscratch2, r2);
3455   }
3456 
3457   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3458               int k, int s, int t) {
3459     Register rscratch3 = r10;
3460     Register rscratch4 = r11;
3461 
3462     __ eorw(rscratch3, r3, r4);
3463     __ movw(rscratch2, t);
3464     __ addw(rscratch4, r1, rscratch2);
3465     reg_cache.extract_u32(rscratch1, k);
3466     __ eorw(rscratch3, rscratch3, r2);
3467     __ addw(rscratch4, rscratch4, rscratch1);
3468     __ addw(rscratch3, rscratch3, rscratch4);
3469     __ rorw(rscratch2, rscratch3, 32 - s);
3470     __ addw(r1, rscratch2, r2);
3471   }
3472 
3473   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3474               int k, int s, int t) {
3475     Register rscratch3 = r10;
3476     Register rscratch4 = r11;
3477 
3478     __ movw(rscratch3, t);
3479     __ ornw(rscratch2, r2, r4);
3480     __ addw(rscratch4, r1, rscratch3);
3481     reg_cache.extract_u32(rscratch1, k);
3482     __ eorw(rscratch3, rscratch2, r3);
3483     __ addw(rscratch4, rscratch4, rscratch1);
3484     __ addw(rscratch3, rscratch3, rscratch4);
3485     __ rorw(rscratch2, rscratch3, 32 - s);
3486     __ addw(r1, rscratch2, r2);
3487   }
3488 
3489   // Arguments:
3490   //
3491   // Inputs:
3492   //   c_rarg0   - byte[]  source+offset
3493   //   c_rarg1   - int[]   MD5.state
3494   //   c_rarg2   - int     offset
3495   //   c_rarg3   - int     limit
3496   //
3497   address generate_md5_implCompress(bool multi_block, const char *name) {
3498     __ align(CodeEntryAlignment);
3499     StubCodeMark mark(this, "StubRoutines", name);
3500     address start = __ pc();
3501 
3502     Register buf       = c_rarg0;
3503     Register state     = c_rarg1;
3504     Register ofs       = c_rarg2;
3505     Register limit     = c_rarg3;
3506     Register a         = r4;
3507     Register b         = r5;
3508     Register c         = r6;
3509     Register d         = r7;
3510     Register rscratch3 = r10;
3511     Register rscratch4 = r11;
3512 
3513     Register state_regs[2] = { r12, r13 };
3514     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3515     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3516 
3517     __ push(saved_regs, sp);
3518 
3519     __ ldp(state_regs[0], state_regs[1], Address(state));
3520     __ ubfx(a, state_regs[0],  0, 32);
3521     __ ubfx(b, state_regs[0], 32, 32);
3522     __ ubfx(c, state_regs[1],  0, 32);
3523     __ ubfx(d, state_regs[1], 32, 32);
3524 
3525     Label md5_loop;
3526     __ BIND(md5_loop);
3527 
3528     reg_cache.gen_loads(buf);
3529 
3530     // Round 1
3531     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3532     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3533     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3534     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3535     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3536     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3537     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3538     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3539     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3540     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3541     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3542     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3543     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3544     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3545     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3546     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3547 
3548     // Round 2
3549     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3550     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3551     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3552     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3553     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3554     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3555     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3556     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3557     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3558     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3559     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3560     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3561     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3562     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3563     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3564     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3565 
3566     // Round 3
3567     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3568     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3569     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3570     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3571     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3572     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3573     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3574     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3575     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3576     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3577     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3578     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3579     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3580     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3581     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3582     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3583 
3584     // Round 4
3585     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3586     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3587     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3588     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3589     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3590     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3591     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3592     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3593     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3594     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3595     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3596     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3597     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3598     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3599     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3600     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3601 
3602     __ addw(a, state_regs[0], a);
3603     __ ubfx(rscratch2, state_regs[0], 32, 32);
3604     __ addw(b, rscratch2, b);
3605     __ addw(c, state_regs[1], c);
3606     __ ubfx(rscratch4, state_regs[1], 32, 32);
3607     __ addw(d, rscratch4, d);
3608 
3609     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3610     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3611 
3612     if (multi_block) {
3613       __ add(buf, buf, 64);
3614       __ add(ofs, ofs, 64);
3615       __ cmp(ofs, limit);
3616       __ br(Assembler::LE, md5_loop);
3617       __ mov(c_rarg0, ofs); // return ofs
3618     }
3619 
3620     // write hash values back in the correct order
3621     __ stp(state_regs[0], state_regs[1], Address(state));
3622 
3623     __ pop(saved_regs, sp);
3624 
3625     __ ret(lr);
3626 
3627     return start;
3628   }
3629 
3630   // Arguments:
3631   //
3632   // Inputs:
3633   //   c_rarg0   - byte[]  source+offset
3634   //   c_rarg1   - int[]   SHA.state
3635   //   c_rarg2   - int     offset
3636   //   c_rarg3   - int     limit
3637   //
3638   address generate_sha1_implCompress(bool multi_block, const char *name) {
3639     __ align(CodeEntryAlignment);
3640     StubCodeMark mark(this, "StubRoutines", name);
3641     address start = __ pc();
3642 
3643     Register buf   = c_rarg0;
3644     Register state = c_rarg1;
3645     Register ofs   = c_rarg2;
3646     Register limit = c_rarg3;
3647 
3648     Label keys;
3649     Label sha1_loop;
3650 
3651     // load the keys into v0..v3
3652     __ adr(rscratch1, keys);
3653     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
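         // ld4r replicates each of the four 32-bit constants at 'keys' into all
         // lanes of its register: v0 = K0 (rounds 0-19), v1 = K1 (20-39),
         // v2 = K2 (40-59), v3 = K3 (60-79).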
3654     // load 5 words state into v6, v7
3655     __ ldrq(v6, Address(state, 0));
3656     __ ldrs(v7, Address(state, 16));
3657 
3658 
3659     __ BIND(sha1_loop);
3660     // load 64 bytes of data into v16..v19
3661     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3662     __ rev32(v16, __ T16B, v16);
3663     __ rev32(v17, __ T16B, v17);
3664     __ rev32(v18, __ T16B, v18);
3665     __ rev32(v19, __ T16B, v19);
3666 
3667     // do the sha1
3668     __ addv(v4, __ T4S, v16, v0);
3669     __ orr(v20, __ T16B, v6, v6);
3670 
3671     FloatRegister d0 = v16;
3672     FloatRegister d1 = v17;
3673     FloatRegister d2 = v18;
3674     FloatRegister d3 = v19;
3675 
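         // Each iteration handles four of the 80 SHA-1 rounds: iterations 0-4 use
         // Ch (sha1c), 10-14 use Maj (sha1m), and 5-9 plus 15-19 use parity
         // (sha1p), matching rounds 0-19, 40-59 and 20-39/60-79 of the
         // specification.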
3676     for (int round = 0; round < 20; round++) {
3677       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3678       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3679       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3680       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3681       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3682 
3683       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3684       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3685       __ sha1h(tmp2, __ T4S, v20);
3686       if (round < 5)
3687         __ sha1c(v20, __ T4S, tmp3, tmp4);
3688       else if (round < 10 || round >= 15)
3689         __ sha1p(v20, __ T4S, tmp3, tmp4);
3690       else
3691         __ sha1m(v20, __ T4S, tmp3, tmp4);
3692       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3693 
3694       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3695     }
3696 
3697     __ addv(v7, __ T2S, v7, v21);
3698     __ addv(v6, __ T4S, v6, v20);
3699 
3700     if (multi_block) {
3701       __ add(ofs, ofs, 64);
3702       __ cmp(ofs, limit);
3703       __ br(Assembler::LE, sha1_loop);
3704       __ mov(c_rarg0, ofs); // return ofs
3705     }
3706 
3707     __ strq(v6, Address(state, 0));
3708     __ strs(v7, Address(state, 16));
3709 
3710     __ ret(lr);
3711 
3712     __ bind(keys);
3713     __ emit_int32(0x5a827999);
3714     __ emit_int32(0x6ed9eba1);
3715     __ emit_int32(0x8f1bbcdc);
3716     __ emit_int32(0xca62c1d6);
3717 
3718     return start;
3719   }
3720 
3721 
3722   // Arguments:
3723   //
3724   // Inputs:
3725   //   c_rarg0   - byte[]  source+offset
3726   //   c_rarg1   - int[]   SHA.state
3727   //   c_rarg2   - int     offset
3728   //   c_rarg3   - int     limit
3729   //
3730   address generate_sha256_implCompress(bool multi_block, const char *name) {
3731     static const uint32_t round_consts[64] = {
3732       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3733       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3734       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3735       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3736       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3737       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3738       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3739       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3740       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3741       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3742       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3743       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3744       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3745       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3746       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3747       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3748     };
3749     __ align(CodeEntryAlignment);
3750     StubCodeMark mark(this, "StubRoutines", name);
3751     address start = __ pc();
3752 
3753     Register buf   = c_rarg0;
3754     Register state = c_rarg1;
3755     Register ofs   = c_rarg2;
3756     Register limit = c_rarg3;
3757 
3758     Label sha1_loop;
3759 
3760     __ stpd(v8, v9, __ pre(sp, -32));
3761     __ stpd(v10, v11, Address(sp, 16));
3762 
3763     // dga == v0
3764     // dgb == v1
3765     // dg0 == v2
3766     // dg1 == v3
3767     // dg2 == v4
3768     // t0 == v6
3769     // t1 == v7
3770 
3771     // load 16 keys to v16..v31
3772     __ lea(rscratch1, ExternalAddress((address)round_consts));
3773     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3774     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3775     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3776     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3777 
3778     // load 8 words (256 bits) state
3779     __ ldpq(v0, v1, state);
3780 
3781     __ BIND(sha1_loop);
3782     // load 64 bytes of data into v8..v11
3783     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3784     __ rev32(v8, __ T16B, v8);
3785     __ rev32(v9, __ T16B, v9);
3786     __ rev32(v10, __ T16B, v10);
3787     __ rev32(v11, __ T16B, v11);
3788 
3789     __ addv(v6, __ T4S, v8, v16);
3790     __ orr(v2, __ T16B, v0, v0);
3791     __ orr(v3, __ T16B, v1, v1);
3792 
3793     FloatRegister d0 = v8;
3794     FloatRegister d1 = v9;
3795     FloatRegister d2 = v10;
3796     FloatRegister d3 = v11;
3797 
3798 
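         // Each iteration performs four of the 64 SHA-256 rounds. v16..v31 hold
         // the 64 round constants (four per register); d0..d3 carry the message
         // schedule, which sha256su0/sha256su1 extend while more words are needed.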
3799     for (int round = 0; round < 16; round++) {
3800       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3801       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3802       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3803       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3804 
3805       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3806        __ orr(v4, __ T16B, v2, v2);
3807       if (round < 15)
3808         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3809       __ sha256h(v2, __ T4S, v3, tmp2);
3810       __ sha256h2(v3, __ T4S, v4, tmp2);
3811       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3812 
3813       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3814     }
3815 
3816     __ addv(v0, __ T4S, v0, v2);
3817     __ addv(v1, __ T4S, v1, v3);
3818 
3819     if (multi_block) {
3820       __ add(ofs, ofs, 64);
3821       __ cmp(ofs, limit);
3822       __ br(Assembler::LE, sha1_loop);
3823       __ mov(c_rarg0, ofs); // return ofs
3824     }
3825 
3826     __ ldpd(v10, v11, Address(sp, 16));
3827     __ ldpd(v8, v9, __ post(sp, 32));
3828 
3829     __ stpq(v0, v1, state);
3830 
3831     __ ret(lr);
3832 
3833     return start;
3834   }
3835 
3836   // Double rounds for sha512.
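       // Each call performs two of the 80 SHA-512 rounds, so one block takes 40
       // calls (dr = 0..39). While dr < 36 the next pair of round constants is
       // prefetched into vrc1; while dr < 32 the message schedule is extended
       // with sha512su0/sha512su1.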
3837   void sha512_dround(int dr,
3838                      FloatRegister vi0, FloatRegister vi1,
3839                      FloatRegister vi2, FloatRegister vi3,
3840                      FloatRegister vi4, FloatRegister vrc0,
3841                      FloatRegister vrc1, FloatRegister vin0,
3842                      FloatRegister vin1, FloatRegister vin2,
3843                      FloatRegister vin3, FloatRegister vin4) {
3844       if (dr < 36) {
3845         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3846       }
3847       __ addv(v5, __ T2D, vrc0, vin0);
3848       __ ext(v6, __ T16B, vi2, vi3, 8);
3849       __ ext(v5, __ T16B, v5, v5, 8);
3850       __ ext(v7, __ T16B, vi1, vi2, 8);
3851       __ addv(vi3, __ T2D, vi3, v5);
3852       if (dr < 32) {
3853         __ ext(v5, __ T16B, vin3, vin4, 8);
3854         __ sha512su0(vin0, __ T2D, vin1);
3855       }
3856       __ sha512h(vi3, __ T2D, v6, v7);
3857       if (dr < 32) {
3858         __ sha512su1(vin0, __ T2D, vin2, v5);
3859       }
3860       __ addv(vi4, __ T2D, vi1, vi3);
3861       __ sha512h2(vi3, __ T2D, vi1, vi0);
3862   }
3863 
3864   // Arguments:
3865   //
3866   // Inputs:
3867   //   c_rarg0   - byte[]  source+offset
3868   //   c_rarg1   - int[]   SHA.state
3869   //   c_rarg2   - int     offset
3870   //   c_rarg3   - int     limit
3871   //
3872   address generate_sha512_implCompress(bool multi_block, const char *name) {
3873     static const uint64_t round_consts[80] = {
3874       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3875       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3876       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3877       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3878       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3879       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3880       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3881       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3882       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3883       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3884       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3885       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3886       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3887       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3888       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3889       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3890       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3891       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3892       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3893       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3894       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3895       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3896       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3897       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3898       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3899       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3900       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3901     };
3902 
3903     __ align(CodeEntryAlignment);
3904     StubCodeMark mark(this, "StubRoutines", name);
3905     address start = __ pc();
3906 
3907     Register buf   = c_rarg0;
3908     Register state = c_rarg1;
3909     Register ofs   = c_rarg2;
3910     Register limit = c_rarg3;
3911 
3912     __ stpd(v8, v9, __ pre(sp, -64));
3913     __ stpd(v10, v11, Address(sp, 16));
3914     __ stpd(v12, v13, Address(sp, 32));
3915     __ stpd(v14, v15, Address(sp, 48));
3916 
3917     Label sha512_loop;
3918 
3919     // load state
3920     __ ld1(v8, v9, v10, v11, __ T2D, state);
3921 
3922     // load first 4 round constants
3923     __ lea(rscratch1, ExternalAddress((address)round_consts));
3924     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3925 
3926     __ BIND(sha512_loop);
3927     // load 128B of data into v12..v19
3928     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3929     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3930     __ rev64(v12, __ T16B, v12);
3931     __ rev64(v13, __ T16B, v13);
3932     __ rev64(v14, __ T16B, v14);
3933     __ rev64(v15, __ T16B, v15);
3934     __ rev64(v16, __ T16B, v16);
3935     __ rev64(v17, __ T16B, v17);
3936     __ rev64(v18, __ T16B, v18);
3937     __ rev64(v19, __ T16B, v19);
3938 
3939     __ mov(rscratch2, rscratch1);
3940 
3941     __ mov(v0, __ T16B, v8);
3942     __ mov(v1, __ T16B, v9);
3943     __ mov(v2, __ T16B, v10);
3944     __ mov(v3, __ T16B, v11);
3945 
3946     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3947     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3948     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3949     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3950     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3951     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3952     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3953     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3954     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3955     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3956     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3957     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3958     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3959     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3960     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3961     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3962     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3963     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3964     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3965     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3966     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3967     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3968     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3969     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3970     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3971     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3972     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3973     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3974     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3975     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3976     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3977     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3978     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3979     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3980     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3981     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3982     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3983     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3984     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3985     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3986 
3987     __ addv(v8, __ T2D, v8, v0);
3988     __ addv(v9, __ T2D, v9, v1);
3989     __ addv(v10, __ T2D, v10, v2);
3990     __ addv(v11, __ T2D, v11, v3);
3991 
3992     if (multi_block) {
3993       __ add(ofs, ofs, 128);
3994       __ cmp(ofs, limit);
3995       __ br(Assembler::LE, sha512_loop);
3996       __ mov(c_rarg0, ofs); // return ofs
3997     }
3998 
3999     __ st1(v8, v9, v10, v11, __ T2D, state);
4000 
4001     __ ldpd(v14, v15, Address(sp, 48));
4002     __ ldpd(v12, v13, Address(sp, 32));
4003     __ ldpd(v10, v11, Address(sp, 16));
4004     __ ldpd(v8, v9, __ post(sp, 64));
4005 
4006     __ ret(lr);
4007 
4008     return start;
4009   }
4010 
4011   // Arguments:
4012   //
4013   // Inputs:
4014   //   c_rarg0   - byte[]  source+offset
4015   //   c_rarg1   - byte[]  SHA.state
4016   //   c_rarg2   - int     block_size
4017   //   c_rarg3   - int     offset
4018   //   c_rarg4   - int     limit
4019   //
4020   address generate_sha3_implCompress(bool multi_block, const char *name) {
4021     static const uint64_t round_consts[24] = {
4022       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4023       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4024       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4025       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4026       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4027       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4028       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4029       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4030     };
4031 
4032     __ align(CodeEntryAlignment);
4033     StubCodeMark mark(this, "StubRoutines", name);
4034     address start = __ pc();
4035 
4036     Register buf           = c_rarg0;
4037     Register state         = c_rarg1;
4038     Register block_size    = c_rarg2;
4039     Register ofs           = c_rarg3;
4040     Register limit         = c_rarg4;
4041 
4042     Label sha3_loop, rounds24_loop;
4043     Label sha3_512_or_sha3_384, shake128;
4044 
4045     __ stpd(v8, v9, __ pre(sp, -64));
4046     __ stpd(v10, v11, Address(sp, 16));
4047     __ stpd(v12, v13, Address(sp, 32));
4048     __ stpd(v14, v15, Address(sp, 48));
4049 
4050     // load state
4051     __ add(rscratch1, state, 32);
4052     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
4053     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
4054     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4055     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4056     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4057     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4058     __ ld1(v24, __ T1D, rscratch1);
4059 
4060     __ BIND(sha3_loop);
4061 
4062     // 24 keccak rounds
4063     __ movw(rscratch2, 24);
4064 
4065     // load round_constants base
4066     __ lea(rscratch1, ExternalAddress((address) round_consts));
4067 
4068     // load input
4069     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4070     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4071     __ eor(v0, __ T8B, v0, v25);
4072     __ eor(v1, __ T8B, v1, v26);
4073     __ eor(v2, __ T8B, v2, v27);
4074     __ eor(v3, __ T8B, v3, v28);
4075     __ eor(v4, __ T8B, v4, v29);
4076     __ eor(v5, __ T8B, v5, v30);
4077     __ eor(v6, __ T8B, v6, v31);
4078 
4079     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4080     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4081 
4082     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4083     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4084     __ eor(v7, __ T8B, v7, v25);
4085     __ eor(v8, __ T8B, v8, v26);
4086     __ eor(v9, __ T8B, v9, v27);
4087     __ eor(v10, __ T8B, v10, v28);
4088     __ eor(v11, __ T8B, v11, v29);
4089     __ eor(v12, __ T8B, v12, v30);
4090     __ eor(v13, __ T8B, v13, v31);
4091 
4092     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4093     __ eor(v14, __ T8B, v14, v25);
4094     __ eor(v15, __ T8B, v15, v26);
4095     __ eor(v16, __ T8B, v16, v27);
4096 
4097     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4098     __ andw(c_rarg5, block_size, 48);
4099     __ cbzw(c_rarg5, rounds24_loop);
4100 
4101     __ tbnz(block_size, 5, shake128);
4102     // block_size == 144, bit5 == 0, SHA3-224
4103     __ ldrd(v28, __ post(buf, 8));
4104     __ eor(v17, __ T8B, v17, v28);
4105     __ b(rounds24_loop);
4106 
4107     __ BIND(shake128);
4108     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4109     __ eor(v17, __ T8B, v17, v28);
4110     __ eor(v18, __ T8B, v18, v29);
4111     __ eor(v19, __ T8B, v19, v30);
4112     __ eor(v20, __ T8B, v20, v31);
4113     __ b(rounds24_loop); // block_size == 168, SHAKE128
4114 
4115     __ BIND(sha3_512_or_sha3_384);
4116     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4117     __ eor(v7, __ T8B, v7, v25);
4118     __ eor(v8, __ T8B, v8, v26);
4119     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4120 
4121     // SHA3-384
4122     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4123     __ eor(v9,  __ T8B, v9,  v27);
4124     __ eor(v10, __ T8B, v10, v28);
4125     __ eor(v11, __ T8B, v11, v29);
4126     __ eor(v12, __ T8B, v12, v30);
4127 
4128     __ BIND(rounds24_loop);
4129     __ subw(rscratch2, rscratch2, 1);
4130 
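         // One Keccak-f[1600] round, built from the SHA3 extension instructions:
         // the eor3 sequence computes the theta column parities, rax1 forms the
         // theta D values, the xar sequence applies theta together with the
         // rho/pi rotations, bcax implements chi, and the final eor with the
         // constant loaded by ld1r is iota.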
4131     __ eor3(v29, __ T16B, v4, v9, v14);
4132     __ eor3(v26, __ T16B, v1, v6, v11);
4133     __ eor3(v28, __ T16B, v3, v8, v13);
4134     __ eor3(v25, __ T16B, v0, v5, v10);
4135     __ eor3(v27, __ T16B, v2, v7, v12);
4136     __ eor3(v29, __ T16B, v29, v19, v24);
4137     __ eor3(v26, __ T16B, v26, v16, v21);
4138     __ eor3(v28, __ T16B, v28, v18, v23);
4139     __ eor3(v25, __ T16B, v25, v15, v20);
4140     __ eor3(v27, __ T16B, v27, v17, v22);
4141 
4142     __ rax1(v30, __ T2D, v29, v26);
4143     __ rax1(v26, __ T2D, v26, v28);
4144     __ rax1(v28, __ T2D, v28, v25);
4145     __ rax1(v25, __ T2D, v25, v27);
4146     __ rax1(v27, __ T2D, v27, v29);
4147 
4148     __ eor(v0, __ T16B, v0, v30);
4149     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4150     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4151     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4152     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4153     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4154     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4155     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4156     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4157     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4158     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4159     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4160     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4161     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4162     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4163     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4164     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4165     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4166     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4167     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4168     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4169     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4170     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4171     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4172     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4173 
4174     __ bcax(v20, __ T16B, v31, v22, v8);
4175     __ bcax(v21, __ T16B, v8,  v23, v22);
4176     __ bcax(v22, __ T16B, v22, v24, v23);
4177     __ bcax(v23, __ T16B, v23, v31, v24);
4178     __ bcax(v24, __ T16B, v24, v8,  v31);
4179 
4180     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4181 
4182     __ bcax(v17, __ T16B, v25, v19, v3);
4183     __ bcax(v18, __ T16B, v3,  v15, v19);
4184     __ bcax(v19, __ T16B, v19, v16, v15);
4185     __ bcax(v15, __ T16B, v15, v25, v16);
4186     __ bcax(v16, __ T16B, v16, v3,  v25);
4187 
4188     __ bcax(v10, __ T16B, v29, v12, v26);
4189     __ bcax(v11, __ T16B, v26, v13, v12);
4190     __ bcax(v12, __ T16B, v12, v14, v13);
4191     __ bcax(v13, __ T16B, v13, v29, v14);
4192     __ bcax(v14, __ T16B, v14, v26, v29);
4193 
4194     __ bcax(v7, __ T16B, v30, v9,  v4);
4195     __ bcax(v8, __ T16B, v4,  v5,  v9);
4196     __ bcax(v9, __ T16B, v9,  v6,  v5);
4197     __ bcax(v5, __ T16B, v5,  v30, v6);
4198     __ bcax(v6, __ T16B, v6,  v4,  v30);
4199 
4200     __ bcax(v3, __ T16B, v27, v0,  v28);
4201     __ bcax(v4, __ T16B, v28, v1,  v0);
4202     __ bcax(v0, __ T16B, v0,  v2,  v1);
4203     __ bcax(v1, __ T16B, v1,  v27, v2);
4204     __ bcax(v2, __ T16B, v2,  v28, v27);
4205 
4206     __ eor(v0, __ T16B, v0, v31);
4207 
4208     __ cbnzw(rscratch2, rounds24_loop);
4209 
4210     if (multi_block) {
4211       __ add(ofs, ofs, block_size);
4212       __ cmp(ofs, limit);
4213       __ br(Assembler::LE, sha3_loop);
4214       __ mov(c_rarg0, ofs); // return ofs
4215     }
4216 
4217     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4218     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4219     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4220     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4221     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4222     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4223     __ st1(v24, __ T1D, state);
4224 
4225     __ ldpd(v14, v15, Address(sp, 48));
4226     __ ldpd(v12, v13, Address(sp, 32));
4227     __ ldpd(v10, v11, Address(sp, 16));
4228     __ ldpd(v8, v9, __ post(sp, 64));
4229 
4230     __ ret(lr);
4231 
4232     return start;
4233   }
4234 
4235   /**
4236    *  Arguments:
4237    *
4238    * Inputs:
4239    *   c_rarg0   - int crc
4240    *   c_rarg1   - byte* buf
4241    *   c_rarg2   - int length
4242    *
4243    * Output:
4244    *       r0    - int crc result
4245    */
4246   address generate_updateBytesCRC32() {
4247     assert(UseCRC32Intrinsics, "what are we doing here?");
4248 
4249     __ align(CodeEntryAlignment);
4250     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4251 
4252     address start = __ pc();
4253 
4254     const Register crc   = c_rarg0;  // crc
4255     const Register buf   = c_rarg1;  // source java byte array address
4256     const Register len   = c_rarg2;  // length
4257     const Register table0 = c_rarg3; // crc_table address
4258     const Register table1 = c_rarg4;
4259     const Register table2 = c_rarg5;
4260     const Register table3 = c_rarg6;
4261     const Register tmp3 = c_rarg7;
4262 
4263     BLOCK_COMMENT("Entry:");
4264     __ enter(); // required for proper stackwalking of RuntimeStub frame
4265 
4266     __ kernel_crc32(crc, buf, len,
4267               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4268 
4269     __ leave(); // required for proper stackwalking of RuntimeStub frame
4270     __ ret(lr);
4271 
4272     return start;
4273   }
4274 
4275   // ChaCha20 block function.  This version parallelizes by loading
4276   // individual 32-bit state elements into vectors for four blocks
4277   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4278   //
4279   // state (int[16]) = c_rarg0
4280   // keystream (byte[1024]) = c_rarg1
4281   // return - number of bytes of keystream (always 256)
4282   address generate_chacha20Block_blockpar() {
4283     Label L_twoRounds, L_cc20_const;
4284     // The constant data is broken into two 128-bit segments to be loaded
4285     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4286     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4287     // The second 128 bits are a table constant used for 8-bit left rotations.
4288     __ BIND(L_cc20_const);
4289     __ emit_int64(0x0000000100000000UL);
4290     __ emit_int64(0x0000000300000002UL);
4291     __ emit_int64(0x0605040702010003UL);
4292     __ emit_int64(0x0E0D0C0F0A09080BUL);
4293 
4294     __ align(CodeEntryAlignment);
4295     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4296     address start = __ pc();
4297     __ enter();
4298 
4299     int i, j;
4300     const Register state = c_rarg0;
4301     const Register keystream = c_rarg1;
4302     const Register loopCtr = r10;
4303     const Register tmpAddr = r11;
4304 
4305     const FloatRegister stateFirst = v0;
4306     const FloatRegister stateSecond = v1;
4307     const FloatRegister stateThird = v2;
4308     const FloatRegister stateFourth = v3;
4309     const FloatRegister origCtrState = v28;
4310     const FloatRegister scratch = v29;
4311     const FloatRegister lrot8Tbl = v30;
4312 
4313     // Organize SIMD registers in an array that facilitates
4314     // putting repetitive opcodes into loop structures.  It is
4315     // important that each grouping of 4 registers is monotonically
4316     // increasing to support the requirements of multi-register
4317     // instructions (e.g. ld4r, st4, etc.)
4318     const FloatRegister workSt[16] = {
4319          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4320         v20, v21, v22, v23, v24, v25, v26, v27
4321     };
4322 
4323     // Load from memory and interlace across 16 SIMD registers,
4324     // with each word from memory being broadcast to all lanes of
4325     // each successive SIMD register.
4326     //      Addr(0) -> All lanes in workSt[i]
4327     //      Addr(4) -> All lanes workSt[i + 1], etc.
4328     __ mov(tmpAddr, state);
4329     for (i = 0; i < 16; i += 4) {
4330       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4331           __ post(tmpAddr, 16));
4332     }
4333 
4334     // Pull in constant data.  The first 16 bytes are the add overlay
4335     // which is applied to the vector holding the counter (state[12]).
4336     // The second 16 bytes are the index register for the 8-bit left
4337     // rotation tbl instruction.
4338     __ adr(tmpAddr, L_cc20_const);
4339     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4340     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4341 
4342     // Set up the 10 iteration loop and perform all 8 quarter round ops
4343     __ mov(loopCtr, 10);
4344     __ BIND(L_twoRounds);
4345 
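         // Each iteration performs one column round (the first four quarter-round
         // calls below, operating on the columns of the 4x4 state matrix) followed
         // by one diagonal round (the next four calls), so the 10 iterations yield
         // the 20 rounds of the ChaCha20 block function.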
4346     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4347         scratch, lrot8Tbl);
4348     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4349         scratch, lrot8Tbl);
4350     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4351         scratch, lrot8Tbl);
4352     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4353         scratch, lrot8Tbl);
4354 
4355     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4356         scratch, lrot8Tbl);
4357     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4358         scratch, lrot8Tbl);
4359     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4360         scratch, lrot8Tbl);
4361     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4362         scratch, lrot8Tbl);
4363 
4364     // Decrement and iterate
4365     __ sub(loopCtr, loopCtr, 1);
4366     __ cbnz(loopCtr, L_twoRounds);
4367 
4368     __ mov(tmpAddr, state);
4369 
4370     // Add the starting state back to the post-loop keystream
4371     // state.  We read/interlace the state array from memory into
4372     // 4 registers similar to what we did in the beginning.  Then
4373     // add the counter overlay onto workSt[12] at the end.
4374     for (i = 0; i < 16; i += 4) {
4375       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4376           __ post(tmpAddr, 16));
4377       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4378       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4379       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4380       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4381     }
4382     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4383 
4384     // Write to key stream, storing the same element out of workSt[0..15]
4385     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4386     // for the next element position.
4387     for (i = 0; i < 4; i++) {
4388       for (j = 0; j < 16; j += 4) {
4389         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4390             __ post(keystream, 16));
4391       }
4392     }
4393 
4394     __ mov(r0, 256);             // Return length of output keystream
4395     __ leave();
4396     __ ret(lr);
4397 
4398     return start;
4399   }
4400 
4401   /**
4402    *  Arguments:
4403    *
4404    * Inputs:
4405    *   c_rarg0   - int crc
4406    *   c_rarg1   - byte* buf
4407    *   c_rarg2   - int length
4408    *   c_rarg3   - int* table
4409    *
4410    * Output:
4411    *       r0   - int crc result
4412    */
4413   address generate_updateBytesCRC32C() {
4414     assert(UseCRC32CIntrinsics, "what are we doing here?");
4415 
4416     __ align(CodeEntryAlignment);
4417     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4418 
4419     address start = __ pc();
4420 
4421     const Register crc   = c_rarg0;  // crc
4422     const Register buf   = c_rarg1;  // source java byte array address
4423     const Register len   = c_rarg2;  // length
4424     const Register table0 = c_rarg3; // crc_table address
4425     const Register table1 = c_rarg4;
4426     const Register table2 = c_rarg5;
4427     const Register table3 = c_rarg6;
4428     const Register tmp3 = c_rarg7;
4429 
4430     BLOCK_COMMENT("Entry:");
4431     __ enter(); // required for proper stackwalking of RuntimeStub frame
4432 
4433     __ kernel_crc32c(crc, buf, len,
4434               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4435 
4436     __ leave(); // required for proper stackwalking of RuntimeStub frame
4437     __ ret(lr);
4438 
4439     return start;
4440   }
4441 
4442   /***
4443    *  Arguments:
4444    *
4445    *  Inputs:
4446    *   c_rarg0   - int   adler
4447    *   c_rarg1   - byte* buff
4448    *   c_rarg2   - int   len
4449    *
4450    * Output:
4451    *   c_rarg0   - int adler result
4452    */
4453   address generate_updateBytesAdler32() {
4454     __ align(CodeEntryAlignment);
4455     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4456     address start = __ pc();
4457 
4458     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4459 
4460     // Aliases
4461     Register adler  = c_rarg0;
4462     Register s1     = c_rarg0;
4463     Register s2     = c_rarg3;
4464     Register buff   = c_rarg1;
4465     Register len    = c_rarg2;
4466     Register nmax  = r4;
4467     Register base  = r5;
4468     Register count = r6;
4469     Register temp0 = rscratch1;
4470     Register temp1 = rscratch2;
4471     FloatRegister vbytes = v0;
4472     FloatRegister vs1acc = v1;
4473     FloatRegister vs2acc = v2;
4474     FloatRegister vtable = v3;
4475 
4476     // Max number of bytes we can process before having to take the mod
4477     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4478     uint64_t BASE = 0xfff1;
4479     uint64_t NMAX = 0x15B0;
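         // For reference: Adler-32 maintains two sums modulo BASE,
         //   s1 = 1 + (sum of all input bytes)
         //   s2 = (sum of the successive values of s1)
         // and the final checksum is (s2 << 16) | s1.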
4480 
4481     __ mov(base, BASE);
4482     __ mov(nmax, NMAX);
4483 
4484     // Load accumulation coefficients for the upper 16 bits
4485     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4486     __ ld1(vtable, __ T16B, Address(temp0));
4487 
4488     // s1 is initialized to the lower 16 bits of adler
4489     // s2 is initialized to the upper 16 bits of adler
4490     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4491     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4492 
4493     // The pipelined loop needs at least 16 elements for one iteration.
4494     // It does check this itself, but it is cheaper to skip straight to the cleanup loop here.
4495     __ cmp(len, (u1)16);
4496     __ br(Assembler::HS, L_nmax);
4497     __ cbz(len, L_combine);
4498 
4499     __ bind(L_simple_by1_loop);
4500     __ ldrb(temp0, Address(__ post(buff, 1)));
4501     __ add(s1, s1, temp0);
4502     __ add(s2, s2, s1);
4503     __ subs(len, len, 1);
4504     __ br(Assembler::HI, L_simple_by1_loop);
4505 
4506     // s1 = s1 % BASE
4507     __ subs(temp0, s1, base);
4508     __ csel(s1, temp0, s1, Assembler::HS);
4509 
4510     // s2 = s2 % BASE
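         // This reduction uses 2^16 mod BASE == 15: x mod BASE can be folded to
         // (x & 0xffff) + 15 * (x >> 16), where 15 * (x >> 16) is computed below
         // as (x >> 16) * 16 - (x >> 16).  A folding step (applied twice where the
         // sum may have grown larger) followed by a conditional subtract of BASE
         // completes the reduction; s1 just above only needed the subtract.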
4511     __ lsr(temp0, s2, 16);
4512     __ lsl(temp1, temp0, 4);
4513     __ sub(temp1, temp1, temp0);
4514     __ add(s2, temp1, s2, ext::uxth);
4515 
4516     __ subs(temp0, s2, base);
4517     __ csel(s2, temp0, s2, Assembler::HS);
4518 
4519     __ b(L_combine);
4520 
4521     __ bind(L_nmax);
4522     __ subs(len, len, nmax);
4523     __ sub(count, nmax, 16);
4524     __ br(Assembler::LO, L_by16);
4525 
4526     __ bind(L_nmax_loop);
4527 
4528     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4529                                       vbytes, vs1acc, vs2acc, vtable);
4530 
4531     __ subs(count, count, 16);
4532     __ br(Assembler::HS, L_nmax_loop);
4533 
4534     // s1 = s1 % BASE
4535     __ lsr(temp0, s1, 16);
4536     __ lsl(temp1, temp0, 4);
4537     __ sub(temp1, temp1, temp0);
4538     __ add(temp1, temp1, s1, ext::uxth);
4539 
4540     __ lsr(temp0, temp1, 16);
4541     __ lsl(s1, temp0, 4);
4542     __ sub(s1, s1, temp0);
4543     __ add(s1, s1, temp1, ext::uxth);
4544 
4545     __ subs(temp0, s1, base);
4546     __ csel(s1, temp0, s1, Assembler::HS);
4547 
4548     // s2 = s2 % BASE
4549     __ lsr(temp0, s2, 16);
4550     __ lsl(temp1, temp0, 4);
4551     __ sub(temp1, temp1, temp0);
4552     __ add(temp1, temp1, s2, ext::uxth);
4553 
4554     __ lsr(temp0, temp1, 16);
4555     __ lsl(s2, temp0, 4);
4556     __ sub(s2, s2, temp0);
4557     __ add(s2, s2, temp1, ext::uxth);
4558 
4559     __ subs(temp0, s2, base);
4560     __ csel(s2, temp0, s2, Assembler::HS);
4561 
4562     __ subs(len, len, nmax);
4563     __ sub(count, nmax, 16);
4564     __ br(Assembler::HS, L_nmax_loop);
4565 
4566     __ bind(L_by16);
4567     __ adds(len, len, count);
4568     __ br(Assembler::LO, L_by1);
4569 
4570     __ bind(L_by16_loop);
4571 
4572     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4573                                       vbytes, vs1acc, vs2acc, vtable);
4574 
4575     __ subs(len, len, 16);
4576     __ br(Assembler::HS, L_by16_loop);
4577 
4578     __ bind(L_by1);
4579     __ adds(len, len, 15);
4580     __ br(Assembler::LO, L_do_mod);
4581 
4582     __ bind(L_by1_loop);
4583     __ ldrb(temp0, Address(__ post(buff, 1)));
4584     __ add(s1, temp0, s1);
4585     __ add(s2, s2, s1);
4586     __ subs(len, len, 1);
4587     __ br(Assembler::HS, L_by1_loop);
4588 
4589     __ bind(L_do_mod);
4590     // s1 = s1 % BASE
4591     __ lsr(temp0, s1, 16);
4592     __ lsl(temp1, temp0, 4);
4593     __ sub(temp1, temp1, temp0);
4594     __ add(temp1, temp1, s1, ext::uxth);
4595 
4596     __ lsr(temp0, temp1, 16);
4597     __ lsl(s1, temp0, 4);
4598     __ sub(s1, s1, temp0);
4599     __ add(s1, s1, temp1, ext::uxth);
4600 
4601     __ subs(temp0, s1, base);
4602     __ csel(s1, temp0, s1, Assembler::HS);
4603 
4604     // s2 = s2 % BASE
4605     __ lsr(temp0, s2, 16);
4606     __ lsl(temp1, temp0, 4);
4607     __ sub(temp1, temp1, temp0);
4608     __ add(temp1, temp1, s2, ext::uxth);
4609 
4610     __ lsr(temp0, temp1, 16);
4611     __ lsl(s2, temp0, 4);
4612     __ sub(s2, s2, temp0);
4613     __ add(s2, s2, temp1, ext::uxth);
4614 
4615     __ subs(temp0, s2, base);
4616     __ csel(s2, temp0, s2, Assembler::HS);
4617 
4618     // Combine lower bits and higher bits
4619     __ bind(L_combine);
4620     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4621 
4622     __ ret(lr);
4623 
4624     return start;
4625   }
4626 
4627   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4628           Register temp0, Register temp1, FloatRegister vbytes,
4629           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4630     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4631     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4632     // In non-vectorized code, we update s1 and s2 as:
4633     //   s1 <- s1 + b1
4634     //   s2 <- s2 + s1
4635     //   s1 <- s1 + b2
4636     //   s2 <- s2 + s1
4637     //   ...
4638     //   s1 <- s1 + b16
4639     //   s2 <- s2 + s1
4640     // Putting above assignments together, we have:
4641     //   s1_new = s1 + b1 + b2 + ... + b16
4642     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4643     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4644     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
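         // vtable is expected to hold the weights (16, 15, ..., 1) loaded from
         // StubRoutines::aarch64::_adler_table, so the umull/umlal pair below
         // forms the weighted products over the low and high 8 bytes, and the
         // uaddlv instructions then sum the bytes (for s1) and the products
         // (for s2) horizontally.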
4645     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4646 
4647     // s2 = s2 + s1 * 16
4648     __ add(s2, s2, s1, Assembler::LSL, 4);
4649 
4650     // vs1acc = b1 + b2 + b3 + ... + b16
4651     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4652     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4653     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4654     __ uaddlv(vs1acc, __ T16B, vbytes);
4655     __ uaddlv(vs2acc, __ T8H, vs2acc);
4656 
4657     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4658     __ fmovd(temp0, vs1acc);
4659     __ fmovd(temp1, vs2acc);
4660     __ add(s1, s1, temp0);
4661     __ add(s2, s2, temp1);
4662   }
4663 
4664   /**
4665    *  Arguments:
4666    *
4667    *  Input:
4668    *    c_rarg0   - x address
4669    *    c_rarg1   - x length
4670    *    c_rarg2   - y address
4671    *    c_rarg3   - y length
4672    *    c_rarg4   - z address
4673    *    c_rarg5   - z length
4674    */
4675   address generate_multiplyToLen() {
4676     __ align(CodeEntryAlignment);
4677     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4678 
4679     address start = __ pc();
4680     const Register x     = r0;
4681     const Register xlen  = r1;
4682     const Register y     = r2;
4683     const Register ylen  = r3;
4684     const Register z     = r4;
4685     const Register zlen  = r5;
4686 
4687     const Register tmp1  = r10;
4688     const Register tmp2  = r11;
4689     const Register tmp3  = r12;
4690     const Register tmp4  = r13;
4691     const Register tmp5  = r14;
4692     const Register tmp6  = r15;
4693     const Register tmp7  = r16;
4694 
4695     BLOCK_COMMENT("Entry:");
4696     __ enter(); // required for proper stackwalking of RuntimeStub frame
4697     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4698     __ leave(); // required for proper stackwalking of RuntimeStub frame
4699     __ ret(lr);
4700 
4701     return start;
4702   }
4703 
4704   address generate_squareToLen() {
4705     // The squareToLen algorithm for sizes 1..127 described in the Java code works
4706     // faster than multiply_to_len on some CPUs and slower on others, but
4707     // multiply_to_len shows slightly better overall results.
4708     __ align(CodeEntryAlignment);
4709     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4710     address start = __ pc();
4711 
4712     const Register x     = r0;
4713     const Register xlen  = r1;
4714     const Register z     = r2;
4715     const Register zlen  = r3;
4716     const Register y     = r4; // == x
4717     const Register ylen  = r5; // == xlen
4718 
4719     const Register tmp1  = r10;
4720     const Register tmp2  = r11;
4721     const Register tmp3  = r12;
4722     const Register tmp4  = r13;
4723     const Register tmp5  = r14;
4724     const Register tmp6  = r15;
4725     const Register tmp7  = r16;
4726 
4727     RegSet spilled_regs = RegSet::of(y, ylen);
4728     BLOCK_COMMENT("Entry:");
4729     __ enter();
4730     __ push(spilled_regs, sp);
4731     __ mov(y, x);
4732     __ mov(ylen, xlen);
4733     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4734     __ pop(spilled_regs, sp);
4735     __ leave();
4736     __ ret(lr);
4737     return start;
4738   }
4739 
4740   address generate_mulAdd() {
4741     __ align(CodeEntryAlignment);
4742     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4743 
4744     address start = __ pc();
4745 
4746     const Register out     = r0;
4747     const Register in      = r1;
4748     const Register offset  = r2;
4749     const Register len     = r3;
4750     const Register k       = r4;
4751 
4752     BLOCK_COMMENT("Entry:");
4753     __ enter();
4754     __ mul_add(out, in, offset, len, k);
4755     __ leave();
4756     __ ret(lr);
4757 
4758     return start;
4759   }
4760 
4761   // Arguments:
4762   //
4763   // Input:
4764   //   c_rarg0   - newArr address
4765   //   c_rarg1   - oldArr address
4766   //   c_rarg2   - newIdx
4767   //   c_rarg3   - shiftCount
4768   //   c_rarg4   - numIter
4769   //
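       // In essence, each output word combines two adjacent source words
       // (using >>> for an unsigned shift):
       //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
       //                      | (oldArr[i]     <<  (32 - shiftCount))
       // The code below processes 4, 2 or 1 word(s) per iteration depending on
       // how many iterations remain.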
4770   address generate_bigIntegerRightShift() {
4771     __ align(CodeEntryAlignment);
4772     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4773     address start = __ pc();
4774 
4775     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4776 
4777     Register newArr        = c_rarg0;
4778     Register oldArr        = c_rarg1;
4779     Register newIdx        = c_rarg2;
4780     Register shiftCount    = c_rarg3;
4781     Register numIter       = c_rarg4;
4782     Register idx           = numIter;
4783 
4784     Register newArrCur     = rscratch1;
4785     Register shiftRevCount = rscratch2;
4786     Register oldArrCur     = r13;
4787     Register oldArrNext    = r14;
4788 
4789     FloatRegister oldElem0        = v0;
4790     FloatRegister oldElem1        = v1;
4791     FloatRegister newElem         = v2;
4792     FloatRegister shiftVCount     = v3;
4793     FloatRegister shiftVRevCount  = v4;
4794 
4795     __ cbz(idx, Exit);
4796 
4797     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4798 
4799     // left shift count
4800     __ movw(shiftRevCount, 32);
4801     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4802 
4803     // numIter too small to allow a 4-word SIMD loop; fall back to scalar code
4804     __ cmp(numIter, (u1)4);
4805     __ br(Assembler::LT, ShiftThree);
4806 
4807     __ dup(shiftVCount,    __ T4S, shiftCount);
4808     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4809     __ negr(shiftVCount,   __ T4S, shiftVCount);
4810 
4811     __ BIND(ShiftSIMDLoop);
4812 
4813     // Calculate the load addresses
4814     __ sub(idx, idx, 4);
4815     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4816     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4817     __ add(oldArrCur,  oldArrNext, 4);
4818 
4819     // Load 4 words and process
4820     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4821     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4822     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4823     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4824     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4825     __ st1(newElem,   __ T4S,  Address(newArrCur));
4826 
4827     __ cmp(idx, (u1)4);
4828     __ br(Assembler::LT, ShiftTwoLoop);
4829     __ b(ShiftSIMDLoop);
4830 
4831     __ BIND(ShiftTwoLoop);
4832     __ cbz(idx, Exit);
4833     __ cmp(idx, (u1)1);
4834     __ br(Assembler::EQ, ShiftOne);
4835 
4836     // Calculate the load addresses
4837     __ sub(idx, idx, 2);
4838     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4839     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4840     __ add(oldArrCur,  oldArrNext, 4);
4841 
4842     // Load 2 words and process
4843     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4844     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4845     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4846     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4847     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4848     __ st1(newElem,   __ T2S, Address(newArrCur));
4849     __ b(ShiftTwoLoop);
4850 
4851     __ BIND(ShiftThree);
4852     __ tbz(idx, 1, ShiftOne);
4853     __ tbz(idx, 0, ShiftTwo);
4854     __ ldrw(r10,  Address(oldArr, 12));
4855     __ ldrw(r11,  Address(oldArr, 8));
4856     __ lsrvw(r10, r10, shiftCount);
4857     __ lslvw(r11, r11, shiftRevCount);
4858     __ orrw(r12,  r10, r11);
4859     __ strw(r12,  Address(newArr, 8));
4860 
4861     __ BIND(ShiftTwo);
4862     __ ldrw(r10,  Address(oldArr, 8));
4863     __ ldrw(r11,  Address(oldArr, 4));
4864     __ lsrvw(r10, r10, shiftCount);
4865     __ lslvw(r11, r11, shiftRevCount);
4866     __ orrw(r12,  r10, r11);
4867     __ strw(r12,  Address(newArr, 4));
4868 
4869     __ BIND(ShiftOne);
4870     __ ldrw(r10,  Address(oldArr, 4));
4871     __ ldrw(r11,  Address(oldArr));
4872     __ lsrvw(r10, r10, shiftCount);
4873     __ lslvw(r11, r11, shiftRevCount);
4874     __ orrw(r12,  r10, r11);
4875     __ strw(r12,  Address(newArr));
4876 
4877     __ BIND(Exit);
4878     __ ret(lr);
4879 
4880     return start;
4881   }
4882 
4883   // Arguments:
4884   //
4885   // Input:
4886   //   c_rarg0   - newArr address
4887   //   c_rarg1   - oldArr address
4888   //   c_rarg2   - newIdx
4889   //   c_rarg3   - shiftCount
4890   //   c_rarg4   - numIter
4891   //
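       // In essence, each output word combines two adjacent source words
       // (using >>> for an unsigned shift):
       //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
       //                      | (oldArr[i + 1] >>> (32 - shiftCount))
       // processed 4, 2 or 1 word(s) at a time below.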
4892   address generate_bigIntegerLeftShift() {
4893     __ align(CodeEntryAlignment);
4894     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4895     address start = __ pc();
4896 
4897     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4898 
4899     Register newArr        = c_rarg0;
4900     Register oldArr        = c_rarg1;
4901     Register newIdx        = c_rarg2;
4902     Register shiftCount    = c_rarg3;
4903     Register numIter       = c_rarg4;
4904 
4905     Register shiftRevCount = rscratch1;
4906     Register oldArrNext    = rscratch2;
4907 
4908     FloatRegister oldElem0        = v0;
4909     FloatRegister oldElem1        = v1;
4910     FloatRegister newElem         = v2;
4911     FloatRegister shiftVCount     = v3;
4912     FloatRegister shiftVRevCount  = v4;
4913 
4914     __ cbz(numIter, Exit);
4915 
4916     __ add(oldArrNext, oldArr, 4);
4917     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4918 
4919     // right shift count
4920     __ movw(shiftRevCount, 32);
4921     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4922 
4923     // numIter too small to allow a 4-word SIMD loop; fall back to scalar code
4924     __ cmp(numIter, (u1)4);
4925     __ br(Assembler::LT, ShiftThree);
4926 
4927     __ dup(shiftVCount,     __ T4S, shiftCount);
4928     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4929     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4930 
4931     __ BIND(ShiftSIMDLoop);
4932 
4933     // load 4 words and process
4934     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4935     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4936     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4937     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4938     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4939     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4940     __ sub(numIter,   numIter, 4);
4941 
4942     __ cmp(numIter, (u1)4);
4943     __ br(Assembler::LT, ShiftTwoLoop);
4944     __ b(ShiftSIMDLoop);
4945 
4946     __ BIND(ShiftTwoLoop);
4947     __ cbz(numIter, Exit);
4948     __ cmp(numIter, (u1)1);
4949     __ br(Assembler::EQ, ShiftOne);
4950 
4951     // load 2 words and process
4952     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4953     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4954     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4955     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4956     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4957     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4958     __ sub(numIter,   numIter, 2);
4959     __ b(ShiftTwoLoop);
4960 
4961     __ BIND(ShiftThree);
4962     __ ldrw(r10,  __ post(oldArr, 4));
4963     __ ldrw(r11,  __ post(oldArrNext, 4));
4964     __ lslvw(r10, r10, shiftCount);
4965     __ lsrvw(r11, r11, shiftRevCount);
4966     __ orrw(r12,  r10, r11);
4967     __ strw(r12,  __ post(newArr, 4));
4968     __ tbz(numIter, 1, Exit);
4969     __ tbz(numIter, 0, ShiftOne);
4970 
4971     __ BIND(ShiftTwo);
4972     __ ldrw(r10,  __ post(oldArr, 4));
4973     __ ldrw(r11,  __ post(oldArrNext, 4));
4974     __ lslvw(r10, r10, shiftCount);
4975     __ lsrvw(r11, r11, shiftRevCount);
4976     __ orrw(r12,  r10, r11);
4977     __ strw(r12,  __ post(newArr, 4));
4978 
4979     __ BIND(ShiftOne);
4980     __ ldrw(r10,  Address(oldArr));
4981     __ ldrw(r11,  Address(oldArrNext));
4982     __ lslvw(r10, r10, shiftCount);
4983     __ lsrvw(r11, r11, shiftRevCount);
4984     __ orrw(r12,  r10, r11);
4985     __ strw(r12,  Address(newArr));
4986 
4987     __ BIND(Exit);
4988     __ ret(lr);
4989 
4990     return start;
4991   }
4992 
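       // Returns (in result/r0) a count of leading bytes that are guaranteed to be
       // positive (sign bit clear); if no negative byte is seen, the full length is
       // returned.  UPPER_BIT_MASK selects the sign bit of every byte in an 8-byte
       // word, so a single tst against it detects a negative byte in that word.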
4993   address generate_count_positives(address &count_positives_long) {
4994     const u1 large_loop_size = 64;
4995     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4996     int dcache_line = VM_Version::dcache_line_size();
4997 
4998     Register ary1 = r1, len = r2, result = r0;
4999 
5000     __ align(CodeEntryAlignment);
5001 
5002     StubCodeMark mark(this, "StubRoutines", "count_positives");
5003 
5004     address entry = __ pc();
5005 
5006     __ enter();
5007     // precondition: a copy of len is already in result
5008     // __ mov(result, len);
5009 
5010   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
5011         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
5012 
5013   __ cmp(len, (u1)15);
5014   __ br(Assembler::GT, LEN_OVER_15);
5015   // The only case when execution falls into this code is when the pointer is near
5016   // the end of a memory page and we have to avoid reading the next page
5017   __ add(ary1, ary1, len);
5018   __ subs(len, len, 8);
5019   __ br(Assembler::GT, LEN_OVER_8);
5020   __ ldr(rscratch2, Address(ary1, -8));
5021   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
5022   __ lsrv(rscratch2, rscratch2, rscratch1);
5023   __ tst(rscratch2, UPPER_BIT_MASK);
5024   __ csel(result, zr, result, Assembler::NE);
5025   __ leave();
5026   __ ret(lr);
5027   __ bind(LEN_OVER_8);
5028   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
5029   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
5030   __ tst(rscratch2, UPPER_BIT_MASK);
5031   __ br(Assembler::NE, RET_NO_POP);
5032   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
5033   __ lsrv(rscratch1, rscratch1, rscratch2);
5034   __ tst(rscratch1, UPPER_BIT_MASK);
5035   __ bind(RET_NO_POP);
5036   __ csel(result, zr, result, Assembler::NE);
5037   __ leave();
5038   __ ret(lr);
5039 
5040   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
5041   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
5042 
5043   count_positives_long = __ pc(); // 2nd entry point
5044 
5045   __ enter();
5046 
5047   __ bind(LEN_OVER_15);
5048     __ push(spilled_regs, sp);
5049     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
5050     __ cbz(rscratch2, ALIGNED);
5051     __ ldp(tmp6, tmp1, Address(ary1));
5052     __ mov(tmp5, 16);
5053     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
5054     __ add(ary1, ary1, rscratch1);
5055     __ orr(tmp6, tmp6, tmp1);
5056     __ tst(tmp6, UPPER_BIT_MASK);
5057     __ br(Assembler::NE, RET_ADJUST);
5058     __ sub(len, len, rscratch1);
5059 
5060   __ bind(ALIGNED);
5061     __ cmp(len, large_loop_size);
5062     __ br(Assembler::LT, CHECK_16);
5063     // Perform a 16-byte load as an early-return check in the pre-loop to handle
5064     // the situation where an initially aligned large array has negative values
5065     // in its starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1
5066     // (in the worst case), which is slower. Cases with negative bytes further
5067     // ahead won't be affected much; in fact they'll be faster due to the early
5068     // loads and the fewer instructions and branches in LARGE_LOOP.
5069     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5070     __ sub(len, len, 16);
5071     __ orr(tmp6, tmp6, tmp1);
5072     __ tst(tmp6, UPPER_BIT_MASK);
5073     __ br(Assembler::NE, RET_ADJUST_16);
5074     __ cmp(len, large_loop_size);
5075     __ br(Assembler::LT, CHECK_16);
5076 
5077     if (SoftwarePrefetchHintDistance >= 0
5078         && SoftwarePrefetchHintDistance >= dcache_line) {
5079       // initial prefetch
5080       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5081     }
5082   __ bind(LARGE_LOOP);
5083     if (SoftwarePrefetchHintDistance >= 0) {
5084       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5085     }
5086     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
5087     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
5088     // generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3 instructions
5089     // and has fewer branches. The drawback is that this approach disables early
5090     // return, so all 64 bytes are loaded and checked every time.
5091     __ ldp(tmp2, tmp3, Address(ary1));
5092     __ ldp(tmp4, tmp5, Address(ary1, 16));
5093     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5094     __ ldp(tmp6, tmp1, Address(ary1, 48));
5095     __ add(ary1, ary1, large_loop_size);
5096     __ sub(len, len, large_loop_size);
5097     __ orr(tmp2, tmp2, tmp3);
5098     __ orr(tmp4, tmp4, tmp5);
5099     __ orr(rscratch1, rscratch1, rscratch2);
5100     __ orr(tmp6, tmp6, tmp1);
5101     __ orr(tmp2, tmp2, tmp4);
5102     __ orr(rscratch1, rscratch1, tmp6);
5103     __ orr(tmp2, tmp2, rscratch1);
5104     __ tst(tmp2, UPPER_BIT_MASK);
5105     __ br(Assembler::NE, RET_ADJUST_LONG);
5106     __ cmp(len, large_loop_size);
5107     __ br(Assembler::GE, LARGE_LOOP);
5108 
5109   __ bind(CHECK_16); // small 16-byte load pre-loop
5110     __ cmp(len, (u1)16);
5111     __ br(Assembler::LT, POST_LOOP16);
5112 
5113   __ bind(LOOP16); // small 16-byte load loop
5114     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5115     __ sub(len, len, 16);
5116     __ orr(tmp2, tmp2, tmp3);
5117     __ tst(tmp2, UPPER_BIT_MASK);
5118     __ br(Assembler::NE, RET_ADJUST_16);
5119     __ cmp(len, (u1)16);
5120     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5121 
5122   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5123     __ cmp(len, (u1)8);
5124     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5125     __ ldr(tmp3, Address(__ post(ary1, 8)));
5126     __ tst(tmp3, UPPER_BIT_MASK);
5127     __ br(Assembler::NE, RET_ADJUST);
5128     __ sub(len, len, 8);
5129 
5130   __ bind(POST_LOOP16_LOAD_TAIL);
5131     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5132     __ ldr(tmp1, Address(ary1));
5133     __ mov(tmp2, 64);
5134     __ sub(tmp4, tmp2, len, __ LSL, 3);
5135     __ lslv(tmp1, tmp1, tmp4);
5136     __ tst(tmp1, UPPER_BIT_MASK);
5137     __ br(Assembler::NE, RET_ADJUST);
5138     // Fallthrough
5139 
5140   __ bind(RET_LEN);
5141     __ pop(spilled_regs, sp);
5142     __ leave();
5143     __ ret(lr);
5144 
5145     // The difference (result - len) is the count of bytes guaranteed
5146     // to be positive.
5147 
5148   __ bind(RET_ADJUST_LONG);
5149     __ add(len, len, (u1)(large_loop_size - 16));
5150   __ bind(RET_ADJUST_16);
5151     __ add(len, len, 16);
5152   __ bind(RET_ADJUST);
5153     __ pop(spilled_regs, sp);
5154     __ leave();
5155     __ sub(result, result, len);
5156     __ ret(lr);
5157 
5158     return entry;
5159   }
5160 
5161   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5162         bool usePrefetch, Label &NOT_EQUAL) {
5163     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5164         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5165         tmp7 = r12, tmp8 = r13;
5166     Label LOOP;
5167 
5168     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5169     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5170     __ bind(LOOP);
5171     if (usePrefetch) {
5172       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5173       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5174     }
5175     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5176     __ eor(tmp1, tmp1, tmp2);
5177     __ eor(tmp3, tmp3, tmp4);
5178     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5179     __ orr(tmp1, tmp1, tmp3);
5180     __ cbnz(tmp1, NOT_EQUAL);
5181     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5182     __ eor(tmp5, tmp5, tmp6);
5183     __ eor(tmp7, tmp7, tmp8);
5184     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5185     __ orr(tmp5, tmp5, tmp7);
5186     __ cbnz(tmp5, NOT_EQUAL);
5187     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5188     __ eor(tmp1, tmp1, tmp2);
5189     __ eor(tmp3, tmp3, tmp4);
5190     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5191     __ orr(tmp1, tmp1, tmp3);
5192     __ cbnz(tmp1, NOT_EQUAL);
5193     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5194     __ eor(tmp5, tmp5, tmp6);
5195     __ sub(cnt1, cnt1, 8 * wordSize);
5196     __ eor(tmp7, tmp7, tmp8);
5197     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5198     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5199     // cmp) because subs allows an unlimited range of immediate operands.
5200     __ subs(tmp6, cnt1, loopThreshold);
5201     __ orr(tmp5, tmp5, tmp7);
5202     __ cbnz(tmp5, NOT_EQUAL);
5203     __ br(__ GE, LOOP);
5204     // post-loop
5205     __ eor(tmp1, tmp1, tmp2);
5206     __ eor(tmp3, tmp3, tmp4);
5207     __ orr(tmp1, tmp1, tmp3);
5208     __ sub(cnt1, cnt1, 2 * wordSize);
5209     __ cbnz(tmp1, NOT_EQUAL);
5210   }
5211 
5212   void generate_large_array_equals_loop_simd(int loopThreshold,
5213         bool usePrefetch, Label &NOT_EQUAL) {
5214     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5215         tmp2 = rscratch2;
5216     Label LOOP;
5217 
5218     __ bind(LOOP);
5219     if (usePrefetch) {
5220       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5221       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5222     }
5223     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5224     __ sub(cnt1, cnt1, 8 * wordSize);
5225     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5226     __ subs(tmp1, cnt1, loopThreshold);
5227     __ eor(v0, __ T16B, v0, v4);
5228     __ eor(v1, __ T16B, v1, v5);
5229     __ eor(v2, __ T16B, v2, v6);
5230     __ eor(v3, __ T16B, v3, v7);
5231     __ orr(v0, __ T16B, v0, v1);
5232     __ orr(v1, __ T16B, v2, v3);
5233     __ orr(v0, __ T16B, v0, v1);
5234     __ umov(tmp1, v0, __ D, 0);
5235     __ umov(tmp2, v0, __ D, 1);
5236     __ orr(tmp1, tmp1, tmp2);
5237     __ cbnz(tmp1, NOT_EQUAL);
5238     __ br(__ GE, LOOP);
5239   }
5240 
5241   // a1 = r1 - array1 address
5242   // a2 = r2 - array2 address
5243   // result = r0 - return value. Already contains "false"
5244   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
5245   // r3-r5 are reserved temporary registers
5246   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
5247   address generate_large_array_equals() {
5248     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5249         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5250         tmp7 = r12, tmp8 = r13;
5251     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5252         SMALL_LOOP, POST_LOOP;
5253     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5254     // calculate if at least 32 prefetched bytes are used
5255     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5256     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5257     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5258     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5259         tmp5, tmp6, tmp7, tmp8);
5260 
5261     __ align(CodeEntryAlignment);
5262 
5263     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5264 
5265     address entry = __ pc();
5266     __ enter();
5267     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5268     // also advance pointers to use post-increment instead of pre-increment
5269     __ add(a1, a1, wordSize);
5270     __ add(a2, a2, wordSize);
5271     if (AvoidUnalignedAccesses) {
5272       // Both implementations (SIMD/non-SIMD) use relatively large load
5273       // instructions (ld1/ldp), which carry a huge penalty (up to 2x exec time)
5274       // on some CPUs when the address is not at least 16-byte aligned.
5275       // Arrays are currently 8-byte aligned, so, if needed, we can do an extra
5276       // 8-byte load for at least the 1st address to make it 16-byte aligned.
5277       Label ALIGNED16;
5278       __ tbz(a1, 3, ALIGNED16);
5279       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5280       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5281       __ sub(cnt1, cnt1, wordSize);
5282       __ eor(tmp1, tmp1, tmp2);
5283       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5284       __ bind(ALIGNED16);
5285     }
5286     if (UseSIMDForArrayEquals) {
5287       if (SoftwarePrefetchHintDistance >= 0) {
5288         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5289         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5290         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5291             /* prfm = */ true, NOT_EQUAL);
5292         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5293         __ br(__ LT, TAIL);
5294       }
5295       __ bind(NO_PREFETCH_LARGE_LOOP);
5296       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5297           /* prfm = */ false, NOT_EQUAL);
5298     } else {
5299       __ push(spilled_regs, sp);
5300       if (SoftwarePrefetchHintDistance >= 0) {
5301         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5302         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5303         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5304             /* prfm = */ true, NOT_EQUAL);
5305         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5306         __ br(__ LT, TAIL);
5307       }
5308       __ bind(NO_PREFETCH_LARGE_LOOP);
5309       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5310           /* prfm = */ false, NOT_EQUAL);
5311     }
5312     __ bind(TAIL);
5313       __ cbz(cnt1, EQUAL);
5314       __ subs(cnt1, cnt1, wordSize);
5315       __ br(__ LE, POST_LOOP);
5316     __ bind(SMALL_LOOP);
5317       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5318       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5319       __ subs(cnt1, cnt1, wordSize);
5320       __ eor(tmp1, tmp1, tmp2);
5321       __ cbnz(tmp1, NOT_EQUAL);
5322       __ br(__ GT, SMALL_LOOP);
5323     __ bind(POST_LOOP);
5324       __ ldr(tmp1, Address(a1, cnt1));
5325       __ ldr(tmp2, Address(a2, cnt1));
5326       __ eor(tmp1, tmp1, tmp2);
5327       __ cbnz(tmp1, NOT_EQUAL);
5328     __ bind(EQUAL);
5329       __ mov(result, true);
5330     __ bind(NOT_EQUAL);
5331       if (!UseSIMDForArrayEquals) {
5332         __ pop(spilled_regs, sp);
5333       }
5334     __ bind(NOT_EQUAL_NO_POP);
5335     __ leave();
5336     __ ret(lr);
5337     return entry;
5338   }
5339 
5340   address generate_dsin_dcos(bool isCos) {
5341     __ align(CodeEntryAlignment);
5342     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5343     address start = __ pc();
5344     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5345         (address)StubRoutines::aarch64::_two_over_pi,
5346         (address)StubRoutines::aarch64::_pio2,
5347         (address)StubRoutines::aarch64::_dsin_coef,
5348         (address)StubRoutines::aarch64::_dcos_coef);
5349     return start;
5350   }
5351 
5352   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
5353   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5354       Label &DIFF2) {
5355     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5356     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5357 
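         // zip1/zip2 with the zero vector vtmpZ interleave a zero byte after each
         // Latin1 byte, widening the 16 Latin1 characters to UTF-16 so they can be
         // compared 64 bits at a time against the UTF-16 operand.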
5358     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5359     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5360     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5361     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5362 
5363     __ fmovd(tmpL, vtmp3);
5364     __ eor(rscratch2, tmp3, tmpL);
5365     __ cbnz(rscratch2, DIFF2);
5366 
5367     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5368     __ umov(tmpL, vtmp3, __ D, 1);
5369     __ eor(rscratch2, tmpU, tmpL);
5370     __ cbnz(rscratch2, DIFF1);
5371 
5372     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5373     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5374     __ fmovd(tmpL, vtmp);
5375     __ eor(rscratch2, tmp3, tmpL);
5376     __ cbnz(rscratch2, DIFF2);
5377 
5378     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5379     __ umov(tmpL, vtmp, __ D, 1);
5380     __ eor(rscratch2, tmpU, tmpL);
5381     __ cbnz(rscratch2, DIFF1);
5382   }
5383 
5384   // r0  = result
5385   // r1  = str1
5386   // r2  = cnt1
5387   // r3  = str2
5388   // r4  = cnt2
5389   // r10 = tmp1
5390   // r11 = tmp2
5391   address generate_compare_long_string_different_encoding(bool isLU) {
5392     __ align(CodeEntryAlignment);
5393     StubCodeMark mark(this, "StubRoutines", isLU
5394         ? "compare_long_string_different_encoding LU"
5395         : "compare_long_string_different_encoding UL");
5396     address entry = __ pc();
5397     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5398         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5399         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5400     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5401         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5402     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5403     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5404 
5405     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5406 
5407     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5408     // cnt2 == amount of characters left to compare
5409     // Check the already loaded first 4 symbols (vtmp and tmp2(LU)/tmp1(UL))
5410     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5411     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5412     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5413     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5414     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5415     __ eor(rscratch2, tmp1, tmp2);
5416     __ mov(rscratch1, tmp2);
5417     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5418     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5419              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5420     __ push(spilled_regs, sp);
5421     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5422     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5423 
5424     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5425 
5426     if (SoftwarePrefetchHintDistance >= 0) {
5427       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5428       __ br(__ LT, NO_PREFETCH);
5429       __ bind(LARGE_LOOP_PREFETCH);
5430         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5431         __ mov(tmp4, 2);
5432         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5433         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5434           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5435           __ subs(tmp4, tmp4, 1);
5436           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5437           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5438           __ mov(tmp4, 2);
5439         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5440           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5441           __ subs(tmp4, tmp4, 1);
5442           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5443           __ sub(cnt2, cnt2, 64);
5444           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5445           __ br(__ GE, LARGE_LOOP_PREFETCH);
5446     }
5447     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5448     __ bind(NO_PREFETCH);
5449     __ subs(cnt2, cnt2, 16);
5450     __ br(__ LT, TAIL);
5451     __ align(OptoLoopAlignment);
5452     __ bind(SMALL_LOOP); // smaller loop
5453       __ subs(cnt2, cnt2, 16);
5454       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5455       __ br(__ GE, SMALL_LOOP);
5456       __ cmn(cnt2, (u1)16);
5457       __ br(__ EQ, LOAD_LAST);
5458     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5459       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5460       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5461       __ ldr(tmp3, Address(cnt1, -8));
5462       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5463       __ b(LOAD_LAST);
5464     __ bind(DIFF2);
5465       __ mov(tmpU, tmp3);
5466     __ bind(DIFF1);
5467       __ pop(spilled_regs, sp);
5468       __ b(CALCULATE_DIFFERENCE);
5469     __ bind(LOAD_LAST);
5470       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5471       // No need to load them again
5472       __ mov(tmpU, tmp3);
5473       __ pop(spilled_regs, sp);
5474 
5475       // tmp2 points to the address of the last 4 Latin1 characters right now
5476       __ ldrs(vtmp, Address(tmp2));
5477       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5478       __ fmovd(tmpL, vtmp);
5479 
5480       __ eor(rscratch2, tmpU, tmpL);
5481       __ cbz(rscratch2, DONE);
5482 
5483     // Find the first different characters in the longwords and
5484     // compute their difference.
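         // rscratch2 holds the XOR of the two 8-byte chunks: rev + clz locate the
         // lowest differing byte (the characters sit little-endian in the registers),
         // andr(..., -16) rounds that position down to a 16-bit character boundary,
         // and the two characters are then shifted down, zero-extended and subtracted.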
5485     __ bind(CALCULATE_DIFFERENCE);
5486       __ rev(rscratch2, rscratch2);
5487       __ clz(rscratch2, rscratch2);
5488       __ andr(rscratch2, rscratch2, -16);
5489       __ lsrv(tmp1, tmp1, rscratch2);
5490       __ uxthw(tmp1, tmp1);
5491       __ lsrv(rscratch1, rscratch1, rscratch2);
5492       __ uxthw(rscratch1, rscratch1);
5493       __ subw(result, tmp1, rscratch1);
5494     __ bind(DONE);
5495       __ ret(lr);
5496     return entry;
5497   }
5498 
5499   // r0 = input (float16)
5500   // v0 = result (float)
5501   // v1 = temporary float register
5502   address generate_float16ToFloat() {
5503     __ align(CodeEntryAlignment);
5504     StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
5505     address entry = __ pc();
5506     BLOCK_COMMENT("Entry:");
5507     __ flt16_to_flt(v0, r0, v1);
5508     __ ret(lr);
5509     return entry;
5510   }
5511 
5512   // v0 = input (float)
5513   // r0 = result (float16)
5514   // v1 = temporary float register
5515   address generate_floatToFloat16() {
5516     __ align(CodeEntryAlignment);
5517     StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
5518     address entry = __ pc();
5519     BLOCK_COMMENT("Entry:");
5520     __ flt_to_flt16(r0, v0, v1);
5521     __ ret(lr);
5522     return entry;
5523   }
5524 
5525   address generate_method_entry_barrier() {
5526     __ align(CodeEntryAlignment);
5527     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5528 
5529     Label deoptimize_label;
5530 
5531     address start = __ pc();
5532 
5533     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5534 
5535     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5536       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5537       // We can get here despite the nmethod being good, if we have not
5538       // yet applied our cross modification fence (or data fence).
5539       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5540       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5541       __ ldrw(rscratch2, rscratch2);
5542       __ strw(rscratch2, thread_epoch_addr);
5543       __ isb();
5544       __ membar(__ LoadLoad);
5545     }
5546 
5547     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5548 
5549     __ enter();
5550     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5551 
5552     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5553 
5554     __ push_call_clobbered_registers();
5555 
5556     __ mov(c_rarg0, rscratch2);
5557     __ call_VM_leaf
5558          (CAST_FROM_FN_PTR
5559           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5560 
5561     __ reset_last_Java_frame(true);
5562 
5563     __ mov(rscratch1, r0);
5564 
5565     __ pop_call_clobbered_registers();
5566 
5567     __ cbnz(rscratch1, deoptimize_label);
5568 
5569     __ leave();
5570     __ ret(lr);
5571 
5572     __ BIND(deoptimize_label);
5573 
5574     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5575     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5576 
5577     __ mov(sp, rscratch1);
5578     __ br(rscratch2);
5579 
5580     return start;
5581   }
5582 
5583   // r0  = result
5584   // r1  = str1
5585   // r2  = cnt1
5586   // r3  = str2
5587   // r4  = cnt2
5588   // r10 = tmp1
5589   // r11 = tmp2
5590   address generate_compare_long_string_same_encoding(bool isLL) {
5591     __ align(CodeEntryAlignment);
5592     StubCodeMark mark(this, "StubRoutines", isLL
5593         ? "compare_long_string_same_encoding LL"
5594         : "compare_long_string_same_encoding UU");
5595     address entry = __ pc();
5596     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5597         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5598 
5599     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5600 
    // Exit the large loop when fewer than 64 bytes are left to read, or when
    // the next prefetch would reach past the end of the array.
5603     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
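    // e.g. with SoftwarePrefetchHintDistance == 192 the UU variant leaves the
    // large loop once fewer than 192/2 == 96 characters remain (a worked
    // example only; the actual value is CPU-dependent).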
5604 
    // The caller pre-loads 8 bytes from each string before jumping to this
    // stub, so compare those directly.
5606     __ eor(rscratch2, tmp1, tmp2);
5607     __ cbnz(rscratch2, CAL_DIFFERENCE);
5608 
5609     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
    // advance the pointers past the 8 bytes already compared
5611     __ add(str1, str1, wordSize);
5612     __ add(str2, str2, wordSize);
5613     if (SoftwarePrefetchHintDistance >= 0) {
5614       __ align(OptoLoopAlignment);
5615       __ bind(LARGE_LOOP_PREFETCH);
5616         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5617         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5618 
5619         for (int i = 0; i < 4; i++) {
5620           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5621           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5622           __ cmp(tmp1, tmp2);
5623           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5624           __ br(Assembler::NE, DIFF);
5625         }
5626         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5627         __ add(str1, str1, 64);
5628         __ add(str2, str2, 64);
5629         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5630         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5631         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5632     }
5633 
5634     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5635     __ br(Assembler::LE, LESS16);
5636     __ align(OptoLoopAlignment);
5637     __ bind(LOOP_COMPARE16);
5638       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5639       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5640       __ cmp(tmp1, tmp2);
5641       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5642       __ br(Assembler::NE, DIFF);
5643       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5644       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5645       __ br(Assembler::LT, LESS16);
5646 
5647       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5648       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5649       __ cmp(tmp1, tmp2);
5650       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5651       __ br(Assembler::NE, DIFF);
5652       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5653       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5654       __ br(Assembler::GE, LOOP_COMPARE16);
5655       __ cbz(cnt2, LENGTH_DIFF);
5656 
5657     __ bind(LESS16);
      // compare one more 8-byte chunk if more than 8 bytes remain
5659       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5660       __ br(Assembler::LE, LESS8);
5661       __ ldr(tmp1, Address(__ post(str1, 8)));
5662       __ ldr(tmp2, Address(__ post(str2, 8)));
5663       __ eor(rscratch2, tmp1, tmp2);
5664       __ cbnz(rscratch2, CAL_DIFFERENCE);
5665       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5666 
5667     __ bind(LESS8); // directly load last 8 bytes
5668       if (!isLL) {
5669         __ add(cnt2, cnt2, cnt2);
5670       }
5671       __ ldr(tmp1, Address(str1, cnt2));
5672       __ ldr(tmp2, Address(str2, cnt2));
5673       __ eor(rscratch2, tmp1, tmp2);
5674       __ cbz(rscratch2, LENGTH_DIFF);
5675       __ b(CAL_DIFFERENCE);
5676 
5677     __ bind(DIFF);
5678       __ cmp(tmp1, tmp2);
5679       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5680       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5681       // reuse rscratch2 register for the result of eor instruction
5682       __ eor(rscratch2, tmp1, tmp2);
5683 
5684     __ bind(CAL_DIFFERENCE);
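      // rscratch2 == tmp1 ^ tmp2 is non-zero in the first differing byte
      // (the lowest-addressed character sits in the least significant
      // byte/halfword after the little-endian load). rev + clz yield the bit
      // offset of that byte from the low end; rounding it down to a
      // byte/halfword boundary and shifting both operands right by it brings
      // the first differing character into the low bits, where it is
      // zero-extended and subtracted.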
5685       __ rev(rscratch2, rscratch2);
5686       __ clz(rscratch2, rscratch2);
5687       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5688       __ lsrv(tmp1, tmp1, rscratch2);
5689       __ lsrv(tmp2, tmp2, rscratch2);
5690       if (isLL) {
5691         __ uxtbw(tmp1, tmp1);
5692         __ uxtbw(tmp2, tmp2);
5693       } else {
5694         __ uxthw(tmp1, tmp1);
5695         __ uxthw(tmp2, tmp2);
5696       }
5697       __ subw(result, tmp1, tmp2);
5698 
5699     __ bind(LENGTH_DIFF);
5700       __ ret(lr);
5701     return entry;
5702   }
5703 
5704   enum string_compare_mode {
5705     LL,
5706     LU,
5707     UL,
5708     UU,
5709   };
5710 
5711   // The following registers are declared in aarch64.ad
5712   // r0  = result
5713   // r1  = str1
5714   // r2  = cnt1
5715   // r3  = str2
5716   // r4  = cnt2
5717   // r10 = tmp1
5718   // r11 = tmp2
5719   // z0  = ztmp1
5720   // z1  = ztmp2
5721   // p0  = pgtmp1
5722   // p1  = pgtmp2
5723   address generate_compare_long_string_sve(string_compare_mode mode) {
5724     __ align(CodeEntryAlignment);
5725     address entry = __ pc();
5726     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5727              tmp1 = r10, tmp2 = r11;
5728 
5729     Label LOOP, DONE, MISMATCH;
5730     Register vec_len = tmp1;
5731     Register idx = tmp2;
5732     // The minimum of the string lengths has been stored in cnt2.
5733     Register cnt = cnt2;
5734     FloatRegister ztmp1 = z0, ztmp2 = z1;
5735     PRegister pgtmp1 = p0, pgtmp2 = p1;
5736 
5737 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5738     switch (mode) {                                                            \
5739       case LL:                                                                 \
5740         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5741         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5742         break;                                                                 \
5743       case LU:                                                                 \
5744         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5745         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5746         break;                                                                 \
5747       case UL:                                                                 \
5748         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5749         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5750         break;                                                                 \
5751       case UU:                                                                 \
5752         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5753         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5754         break;                                                                 \
5755       default:                                                                 \
5756         ShouldNotReachHere();                                                  \
5757     }
5758 
5759     const char* stubname;
5760     switch (mode) {
5761       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5762       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5763       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5764       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5765       default: ShouldNotReachHere();
5766     }
5767 
5768     StubCodeMark mark(this, "StubRoutines", stubname);
5769 
5770     __ mov(idx, 0);
5771     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5772 
5773     if (mode == LL) {
5774       __ sve_cntb(vec_len);
5775     } else {
5776       __ sve_cnth(vec_len);
5777     }
5778 
5779     __ sub(rscratch1, cnt, vec_len);
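    // Main loop: compare vec_len characters per iteration and stay in the
    // loop while a full vector still fits (idx < cnt - vec_len); the trailing
    // partial vector is handled after the loop with a fresh whilelt predicate.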
5780 
5781     __ bind(LOOP);
5782 
5783       // main loop
5784       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5785       __ add(idx, idx, vec_len);
5786       // Compare strings.
5787       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5788       __ br(__ NE, MISMATCH);
5789       __ cmp(idx, rscratch1);
5790       __ br(__ LT, LOOP);
5791 
5792     // post loop, last iteration
5793     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5794 
5795     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5796     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5797     __ br(__ EQ, DONE);
5798 
5799     __ bind(MISMATCH);
5800 
5801     // Crop the vector to find its location.
5802     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5803     // Extract the first different characters of each string.
5804     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5805     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5806 
5807     // Compute the difference of the first different characters.
5808     __ sub(result, rscratch1, rscratch2);
5809 
5810     __ bind(DONE);
5811     __ ret(lr);
5812 #undef LOAD_PAIR
5813     return entry;
5814   }
5815 
5816   void generate_compare_long_strings() {
5817     if (UseSVE == 0) {
5818       StubRoutines::aarch64::_compare_long_string_LL
5819           = generate_compare_long_string_same_encoding(true);
5820       StubRoutines::aarch64::_compare_long_string_UU
5821           = generate_compare_long_string_same_encoding(false);
5822       StubRoutines::aarch64::_compare_long_string_LU
5823           = generate_compare_long_string_different_encoding(true);
5824       StubRoutines::aarch64::_compare_long_string_UL
5825           = generate_compare_long_string_different_encoding(false);
5826     } else {
5827       StubRoutines::aarch64::_compare_long_string_LL
5828           = generate_compare_long_string_sve(LL);
5829       StubRoutines::aarch64::_compare_long_string_UU
5830           = generate_compare_long_string_sve(UU);
5831       StubRoutines::aarch64::_compare_long_string_LU
5832           = generate_compare_long_string_sve(LU);
5833       StubRoutines::aarch64::_compare_long_string_UL
5834           = generate_compare_long_string_sve(UL);
5835     }
5836   }
5837 
5838   // R0 = result
5839   // R1 = str2
5840   // R2 = cnt1
5841   // R3 = str1
5842   // R4 = cnt2
5843   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
5844   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip the initial load (helps on systems with a
  // single load pipeline)
  // 2) we can use the "fast" algorithm for finding a single character: the
  // first symbol is searched with fewer branches (one branch per loaded
  // register instead of one per symbol), which is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be used to search for every occurrence of the 1st character, saving a
  // few loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
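  //
  // As a minimal C sketch (illustrative only; names like chunk and
  // first_char_splat are not from the generated code), the byte-wise
  // match-detection trick in 2) for the Latin-1 case is:
  //
  //   uint64_t x = chunk ^ first_char_splat;         // 0x00 byte where a byte matches
  //   uint64_t t = (x - 0x0101010101010101ULL)
  //                & ~(x | 0x7f7f7f7f7f7f7f7fULL);   // 0x80 set in each matching byte
  //
  //   t != 0 iff the first pattern character occurs somewhere in this 8-byte
  //   chunk; the UTF-16 variant uses the 0x0001...0001 and 0x7fff...7fff
  //   constants instead.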
5859   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5860     const char* stubName = str1_isL
5861         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5862         : "indexof_linear_uu";
5863     __ align(CodeEntryAlignment);
5864     StubCodeMark mark(this, "StubRoutines", stubName);
5865     address entry = __ pc();
5866 
5867     int str1_chr_size = str1_isL ? 1 : 2;
5868     int str2_chr_size = str2_isL ? 1 : 2;
5869     int str1_chr_shift = str1_isL ? 0 : 1;
5870     int str2_chr_shift = str2_isL ? 0 : 1;
5871     bool isL = str1_isL && str2_isL;
5872    // parameters
5873     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5874     // temporary registers
5875     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5876     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5877     // redefinitions
5878     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5879 
5880     __ push(spilled_regs, sp);
5881     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5882         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5883         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5884         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5885         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5886         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5887     // Read whole register from str1. It is safe, because length >=8 here
5888     __ ldr(ch1, Address(str1));
5889     // Read whole register from str2. It is safe, because length >=8 here
5890     __ ldr(ch2, Address(str2));
5891     __ sub(cnt2, cnt2, cnt1);
5892     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5893     if (str1_isL != str2_isL) {
5894       __ eor(v0, __ T16B, v0, v0);
5895     }
5896     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5897     __ mul(first, first, tmp1);
    // check whether fewer than one register's worth of characters remains
5899     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5900     if (str1_isL != str2_isL) {
5901       __ fmovd(v1, ch1);
5902     }
5903     __ br(__ LE, L_SMALL);
5904     __ eor(ch2, first, ch2);
5905     if (str1_isL != str2_isL) {
5906       __ zip1(v1, __ T16B, v1, v0);
5907     }
5908     __ sub(tmp2, ch2, tmp1);
5909     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5910     __ bics(tmp2, tmp2, ch2);
5911     if (str1_isL != str2_isL) {
5912       __ fmovd(ch1, v1);
5913     }
5914     __ br(__ NE, L_HAS_ZERO);
5915     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5916     __ add(result, result, wordSize/str2_chr_size);
5917     __ add(str2, str2, wordSize);
5918     __ br(__ LT, L_POST_LOOP);
5919     __ BIND(L_LOOP);
5920       __ ldr(ch2, Address(str2));
5921       __ eor(ch2, first, ch2);
5922       __ sub(tmp2, ch2, tmp1);
5923       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5924       __ bics(tmp2, tmp2, ch2);
5925       __ br(__ NE, L_HAS_ZERO);
5926     __ BIND(L_LOOP_PROCEED);
5927       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5928       __ add(str2, str2, wordSize);
5929       __ add(result, result, wordSize/str2_chr_size);
5930       __ br(__ GE, L_LOOP);
5931     __ BIND(L_POST_LOOP);
5932       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5933       __ br(__ LE, NOMATCH);
5934       __ ldr(ch2, Address(str2));
5935       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5936       __ eor(ch2, first, ch2);
5937       __ sub(tmp2, ch2, tmp1);
5938       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5939       __ mov(tmp4, -1); // all bits set
5940       __ b(L_SMALL_PROCEED);
5941     __ align(OptoLoopAlignment);
5942     __ BIND(L_SMALL);
5943       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5944       __ eor(ch2, first, ch2);
5945       if (str1_isL != str2_isL) {
5946         __ zip1(v1, __ T16B, v1, v0);
5947       }
5948       __ sub(tmp2, ch2, tmp1);
5949       __ mov(tmp4, -1); // all bits set
5950       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5951       if (str1_isL != str2_isL) {
        __ fmovd(ch1, v1); // move the 4 converted symbols
5953       }
5954     __ BIND(L_SMALL_PROCEED);
      __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the unused bits
5956       __ bic(tmp2, tmp2, ch2);
5957       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5958       __ rbit(tmp2, tmp2);
5959       __ br(__ EQ, NOMATCH);
5960     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long; up to 4 cycles on some CPUs
5962       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5963       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5964       if (str2_isL) { // LL
5965         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5966         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5967         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5968         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5969         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5970       } else {
5971         __ mov(ch2, 0xE); // all bits in byte set except last one
5972         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5973         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5974         __ lslv(tmp2, tmp2, tmp4);
5975         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5976         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5977         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5978         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5979       }
5980       __ cmp(ch1, ch2);
5981       __ mov(tmp4, wordSize/str2_chr_size);
5982       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5983     __ BIND(L_SMALL_CMP_LOOP);
5984       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5985                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5986       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5987                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5988       __ add(tmp4, tmp4, 1);
5989       __ cmp(tmp4, cnt1);
5990       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5991       __ cmp(first, ch2);
5992       __ br(__ EQ, L_SMALL_CMP_LOOP);
5993     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5994       __ cbz(tmp2, NOMATCH); // no more matches. exit
5995       __ clz(tmp4, tmp2);
5996       __ add(result, result, 1); // advance index
5997       __ add(str2, str2, str2_chr_size); // advance pointer
5998       __ b(L_SMALL_HAS_ZERO_LOOP);
5999     __ align(OptoLoopAlignment);
6000     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
6001       __ cmp(first, ch2);
6002       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6003       __ b(DONE);
6004     __ align(OptoLoopAlignment);
6005     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
6006       if (str2_isL) { // LL
6007         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
6008         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
6009         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
6010         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6011         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6012       } else {
6013         __ mov(ch2, 0xE); // all bits in byte set except last one
6014         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6015         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6016         __ lslv(tmp2, tmp2, tmp4);
6017         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6018         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6019         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6020         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6021       }
6022       __ cmp(ch1, ch2);
6023       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6024       __ b(DONE);
6025     __ align(OptoLoopAlignment);
6026     __ BIND(L_HAS_ZERO);
6027       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long; up to 4 cycles on some CPUs
      // Now compress the two counters (cnt1 and cnt2) into one register:
      // cnt2 |= cnt1 << 32. This is fine because both counters are 32-bit and
      // are not changed in this loop; they are restored on exit. This frees
      // cnt1 for re-use within the loop.
6032       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
6033       __ sub(result, result, 1);
6034     __ BIND(L_HAS_ZERO_LOOP);
6035       __ mov(cnt1, wordSize/str2_chr_size);
6036       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6037       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
6038       if (str2_isL) {
6039         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6040         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6041         __ lslv(tmp2, tmp2, tmp4);
6042         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6043         __ add(tmp4, tmp4, 1);
6044         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6045         __ lsl(tmp2, tmp2, 1);
6046         __ mov(tmp4, wordSize/str2_chr_size);
6047       } else {
6048         __ mov(ch2, 0xE);
6049         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6050         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6051         __ lslv(tmp2, tmp2, tmp4);
6052         __ add(tmp4, tmp4, 1);
6053         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6054         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6055         __ lsl(tmp2, tmp2, 1);
6056         __ mov(tmp4, wordSize/str2_chr_size);
6057         __ sub(str2, str2, str2_chr_size);
6058       }
6059       __ cmp(ch1, ch2);
6060       __ mov(tmp4, wordSize/str2_chr_size);
6061       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6062     __ BIND(L_CMP_LOOP);
6063       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6064                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6065       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6066                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6067       __ add(tmp4, tmp4, 1);
6068       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6069       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
6070       __ cmp(cnt1, ch2);
6071       __ br(__ EQ, L_CMP_LOOP);
6072     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
6074       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6075       __ clz(tmp4, tmp2);
6076       __ add(str2, str2, str2_chr_size); // advance pointer
6077       __ b(L_HAS_ZERO_LOOP);
6078     __ align(OptoLoopAlignment);
6079     __ BIND(L_CMP_LOOP_LAST_CMP);
6080       __ cmp(cnt1, ch2);
6081       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6082       __ b(DONE);
6083     __ align(OptoLoopAlignment);
6084     __ BIND(L_CMP_LOOP_LAST_CMP2);
6085       if (str2_isL) {
6086         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6087         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6088         __ lslv(tmp2, tmp2, tmp4);
6089         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6090         __ add(tmp4, tmp4, 1);
6091         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6092         __ lsl(tmp2, tmp2, 1);
6093       } else {
6094         __ mov(ch2, 0xE);
6095         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6096         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6097         __ lslv(tmp2, tmp2, tmp4);
6098         __ add(tmp4, tmp4, 1);
6099         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6100         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6101         __ lsl(tmp2, tmp2, 1);
6102         __ sub(str2, str2, str2_chr_size);
6103       }
6104       __ cmp(ch1, ch2);
6105       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6106       __ b(DONE);
6107     __ align(OptoLoopAlignment);
6108     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. It was a multiple of wordSize/str2_chr_size
      // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
      // so result was increased by at most wordSize/str2_chr_size - 1 and the
      // corresponding high bits were not changed. L_LOOP_PROCEED will increase
      // result by the number of analyzed characters, so we can simply clear the
      // low bits of result here: 2 bits for UU/UL and 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
      // index of the last analyzed substring inside the current octet, so str2
      // currently points at that substring's start address; advance it to the
      // next octet.
6119       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6120       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6121       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6122       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
      __ movw(cnt2, cnt2); // restore cnt2 by clearing the upper half (which held cnt1)
6124       __ b(L_LOOP_PROCEED);
6125     __ align(OptoLoopAlignment);
6126     __ BIND(NOMATCH);
6127       __ mov(result, -1);
6128     __ BIND(DONE);
6129       __ pop(spilled_regs, sp);
6130       __ ret(lr);
6131     return entry;
6132   }
6133 
6134   void generate_string_indexof_stubs() {
6135     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6136     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6137     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6138   }
6139 
6140   void inflate_and_store_2_fp_registers(bool generatePrfm,
6141       FloatRegister src1, FloatRegister src2) {
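    // Inflate 32 Latin-1 bytes held in src1/src2 to 64 bytes of UTF-16 by
    // interleaving each byte with a zero byte from v0 (zip1 takes the low
    // halves, zip2 the high halves), then store the four result vectors with
    // a single st1, optionally prefetching the destination.
    // e.g. bytes 'a','b','c',... become 'a',0,'b',0,'c',0,... (UTF-16LE).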
6142     Register dst = r1;
6143     __ zip1(v1, __ T16B, src1, v0);
6144     __ zip2(v2, __ T16B, src1, v0);
6145     if (generatePrfm) {
6146       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6147     }
6148     __ zip1(v3, __ T16B, src2, v0);
6149     __ zip2(v4, __ T16B, src2, v0);
6150     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6151   }
6152 
6153   // R0 = src
6154   // R1 = dst
6155   // R2 = len
6156   // R3 = len >> 3
6157   // V0 = 0
6158   // v1 = loaded 8 bytes
6159   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6160   address generate_large_byte_array_inflate() {
6161     __ align(CodeEntryAlignment);
6162     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
6163     address entry = __ pc();
6164     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6165     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6166     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6167 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
6170     __ ldrd(v2, __ post(src, 8));
6171     __ sub(octetCounter, octetCounter, 2);
6172     __ zip1(v1, __ T16B, v1, v0);
6173     __ zip1(v2, __ T16B, v2, v0);
6174     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6175     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6176     __ subs(rscratch1, octetCounter, large_loop_threshold);
6177     __ br(__ LE, LOOP_START);
6178     __ b(LOOP_PRFM_START);
6179     __ bind(LOOP_PRFM);
6180       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6181     __ bind(LOOP_PRFM_START);
6182       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6183       __ sub(octetCounter, octetCounter, 8);
6184       __ subs(rscratch1, octetCounter, large_loop_threshold);
6185       inflate_and_store_2_fp_registers(true, v3, v4);
6186       inflate_and_store_2_fp_registers(true, v5, v6);
6187       __ br(__ GT, LOOP_PRFM);
6188       __ cmp(octetCounter, (u1)8);
6189       __ br(__ LT, DONE);
6190     __ bind(LOOP);
6191       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6192       __ bind(LOOP_START);
6193       __ sub(octetCounter, octetCounter, 8);
6194       __ cmp(octetCounter, (u1)8);
6195       inflate_and_store_2_fp_registers(false, v3, v4);
6196       inflate_and_store_2_fp_registers(false, v5, v6);
6197       __ br(__ GE, LOOP);
6198     __ bind(DONE);
6199       __ ret(lr);
6200     return entry;
6201   }
6202 
6203   /**
6204    *  Arguments:
6205    *
6206    *  Input:
6207    *  c_rarg0   - current state address
6208    *  c_rarg1   - H key address
6209    *  c_rarg2   - data address
6210    *  c_rarg3   - number of blocks
6211    *
6212    *  Output:
6213    *  Updated state at c_rarg0
6214    */
6215   address generate_ghash_processBlocks() {
6216     // Bafflingly, GCM uses little-endian for the byte order, but
6217     // big-endian for the bit order.  For example, the polynomial 1 is
6218     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6219     //
6220     // So, we must either reverse the bytes in each word and do
6221     // everything big-endian or reverse the bits in each byte and do
6222     // it little-endian.  On AArch64 it's more idiomatic to reverse
6223     // the bits in each byte (we have an instruction, RBIT, to do
6224     // that) and keep the data in little-endian bit order through the
6225     // calculation, bit-reversing the inputs and outputs.
6226 
6227     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6228     __ align(wordSize * 2);
6229     address p = __ pc();
6230     __ emit_int64(0x87);  // The low-order bits of the field
6231                           // polynomial (i.e. p = z^7+z^2+z+1)
6232                           // repeated in the low and high parts of a
6233                           // 128-bit vector
6234     __ emit_int64(0x87);
6235 
6236     __ align(CodeEntryAlignment);
6237     address start = __ pc();
6238 
6239     Register state   = c_rarg0;
6240     Register subkeyH = c_rarg1;
6241     Register data    = c_rarg2;
6242     Register blocks  = c_rarg3;
6243 
6244     FloatRegister vzr = v30;
6245     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6246 
6247     __ ldrq(v24, p);    // The field polynomial
6248 
6249     __ ldrq(v0, Address(state));
6250     __ ldrq(v1, Address(subkeyH));
6251 
6252     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6253     __ rbit(v0, __ T16B, v0);
6254     __ rev64(v1, __ T16B, v1);
6255     __ rbit(v1, __ T16B, v1);
6256 
6257     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6258     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
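    // Karatsuba over GF(2): with A = A1:A0 and B = B1:B0 (64-bit halves),
    // A*B = A1*B1*z^128 + (A1*B0 + A0*B1)*z^64 + A0*B0, and the middle term
    // A1*B0 + A0*B1 = (A1 + A0)*(B1 + B0) + A1*B1 + A0*B0 (where + is XOR),
    // so pre-XORing the subkey halves lets ghash_multiply form the middle
    // term with a single extra polynomial multiply.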
6259 
6260     {
6261       Label L_ghash_loop;
6262       __ bind(L_ghash_loop);
6263 
6264       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6265                                                  // reversing each byte
6266       __ rbit(v2, __ T16B, v2);
6267       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6268 
6269       // Multiply state in v2 by subkey in v1
6270       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6271                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6272                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6273       // Reduce v7:v5 by the field polynomial
6274       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6275 
6276       __ sub(blocks, blocks, 1);
6277       __ cbnz(blocks, L_ghash_loop);
6278     }
6279 
6280     // The bit-reversed result is at this point in v0
6281     __ rev64(v0, __ T16B, v0);
6282     __ rbit(v0, __ T16B, v0);
6283 
6284     __ st1(v0, __ T16B, state);
6285     __ ret(lr);
6286 
6287     return start;
6288   }
6289 
6290   address generate_ghash_processBlocks_wide() {
6291     address small = generate_ghash_processBlocks();
6292 
6293     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6294     __ align(wordSize * 2);
6295     address p = __ pc();
6296     __ emit_int64(0x87);  // The low-order bits of the field
6297                           // polynomial (i.e. p = z^7+z^2+z+1)
6298                           // repeated in the low and high parts of a
6299                           // 128-bit vector
6300     __ emit_int64(0x87);
6301 
6302     __ align(CodeEntryAlignment);
6303     address start = __ pc();
6304 
6305     Register state   = c_rarg0;
6306     Register subkeyH = c_rarg1;
6307     Register data    = c_rarg2;
6308     Register blocks  = c_rarg3;
6309 
6310     const int unroll = 4;
6311 
6312     __ cmp(blocks, (unsigned char)(unroll * 2));
6313     __ br(__ LT, small);
6314 
6315     if (unroll > 1) {
      // Save the callee-saved SIMD registers (v8..v15) before the wide routine clobbers them
6317       __ sub(sp, sp, 4 * 16);
6318       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6319       __ sub(sp, sp, 4 * 16);
6320       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6321     }
6322 
6323     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6324 
6325     if (unroll > 1) {
      // Restore the callee-saved SIMD registers
6327       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6328       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6329     }
6330 
6331     __ cmp(blocks, (unsigned char)0);
6332     __ br(__ GT, small);
6333 
6334     __ ret(lr);
6335 
6336     return start;
6337   }
6338 
6339   void generate_base64_encode_simdround(Register src, Register dst,
6340         FloatRegister codec, u8 size) {
6341 
6342     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6343     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6344     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6345 
6346     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6347 
6348     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
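
    // Split each group of 3 input bytes (b0, b1, b2) into four 6-bit indices:
    //   b0 >> 2,
    //   ((b0 & 0x3) << 4) | (b1 >> 4),
    //   ((b1 & 0xF) << 2) | (b2 >> 6),
    //   b2 & 0x3F
    // The shift/or sequences below compute these per byte lane.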
6349 
6350     __ ushr(ind0, arrangement, in0,  2);
6351 
6352     __ ushr(ind1, arrangement, in1,  2);
6353     __ shl(in0,   arrangement, in0,  6);
6354     __ orr(ind1,  arrangement, ind1, in0);
6355     __ ushr(ind1, arrangement, ind1, 2);
6356 
6357     __ ushr(ind2, arrangement, in2,  4);
6358     __ shl(in1,   arrangement, in1,  4);
6359     __ orr(ind2,  arrangement, in1,  ind2);
6360     __ ushr(ind2, arrangement, ind2, 2);
6361 
6362     __ shl(ind3,  arrangement, in2,  2);
6363     __ ushr(ind3, arrangement, ind3, 2);
6364 
6365     __ tbl(out0,  arrangement, codec,  4, ind0);
6366     __ tbl(out1,  arrangement, codec,  4, ind1);
6367     __ tbl(out2,  arrangement, codec,  4, ind2);
6368     __ tbl(out3,  arrangement, codec,  4, ind3);
6369 
6370     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6371   }
6372 
6373    /**
6374    *  Arguments:
6375    *
6376    *  Input:
6377    *  c_rarg0   - src_start
6378    *  c_rarg1   - src_offset
6379    *  c_rarg2   - src_length
6380    *  c_rarg3   - dest_start
6381    *  c_rarg4   - dest_offset
6382    *  c_rarg5   - isURL
6383    *
6384    */
6385   address generate_base64_encodeBlock() {
6386 
6387     static const char toBase64[64] = {
6388       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6389       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6390       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6391       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6392       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6393     };
6394 
6395     static const char toBase64URL[64] = {
6396       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6397       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6398       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6399       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6400       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6401     };
6402 
6403     __ align(CodeEntryAlignment);
6404     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6405     address start = __ pc();
6406 
6407     Register src   = c_rarg0;  // source array
6408     Register soff  = c_rarg1;  // source start offset
6409     Register send  = c_rarg2;  // source end offset
6410     Register dst   = c_rarg3;  // dest array
6411     Register doff  = c_rarg4;  // position for writing to dest array
6412     Register isURL = c_rarg5;  // Base64 or URL character set
6413 
6414     // c_rarg6 and c_rarg7 are free to use as temps
6415     Register codec  = c_rarg6;
6416     Register length = c_rarg7;
6417 
6418     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6419 
6420     __ add(src, src, soff);
6421     __ add(dst, dst, doff);
6422     __ sub(length, send, soff);
6423 
6424     // load the codec base address
6425     __ lea(codec, ExternalAddress((address) toBase64));
6426     __ cbz(isURL, ProcessData);
6427     __ lea(codec, ExternalAddress((address) toBase64URL));
6428 
6429     __ BIND(ProcessData);
6430 
    // too short to form a SIMD loop; fall back to the scalar path
6432     __ cmp(length, (u1)24);
6433     __ br(Assembler::LT, Process3B);
6434 
6435     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6436 
6437     __ BIND(Process48B);
6438     __ cmp(length, (u1)48);
6439     __ br(Assembler::LT, Process24B);
6440     generate_base64_encode_simdround(src, dst, v0, 16);
6441     __ sub(length, length, 48);
6442     __ b(Process48B);
6443 
6444     __ BIND(Process24B);
6445     __ cmp(length, (u1)24);
6446     __ br(Assembler::LT, SIMDExit);
6447     generate_base64_encode_simdround(src, dst, v0, 8);
6448     __ sub(length, length, 24);
6449 
6450     __ BIND(SIMDExit);
6451     __ cbz(length, Exit);
6452 
6453     __ BIND(Process3B);
6454     //  3 src bytes, 24 bits
6455     __ ldrb(r10, __ post(src, 1));
6456     __ ldrb(r11, __ post(src, 1));
6457     __ ldrb(r12, __ post(src, 1));
6458     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6459     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6460     // codec index
6461     __ ubfmw(r15, r12, 18, 23);
6462     __ ubfmw(r14, r12, 12, 17);
6463     __ ubfmw(r13, r12, 6,  11);
6464     __ andw(r12,  r12, 63);
6465     // get the code based on the codec
6466     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6467     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6468     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6469     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6470     __ strb(r15, __ post(dst, 1));
6471     __ strb(r14, __ post(dst, 1));
6472     __ strb(r13, __ post(dst, 1));
6473     __ strb(r12, __ post(dst, 1));
6474     __ sub(length, length, 3);
6475     __ cbnz(length, Process3B);
6476 
6477     __ BIND(Exit);
6478     __ ret(lr);
6479 
6480     return start;
6481   }
6482 
6483   void generate_base64_decode_simdround(Register src, Register dst,
6484         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6485 
6486     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6487     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6488 
6489     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6490     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6491 
6492     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6493 
6494     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6495 
6496     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6497 
    // We need an unsigned saturating subtract so that every input value in the
    // range [0, 63] yields 0 as the index for the higher-half lookup.
6500     __ uqsubv(decH0, __ T16B, in0, v27);
6501     __ uqsubv(decH1, __ T16B, in1, v27);
6502     __ uqsubv(decH2, __ T16B, in2, v27);
6503     __ uqsubv(decH3, __ T16B, in3, v27);
6504 
6505     // lower half lookup
6506     __ tbl(decL0, arrangement, codecL, 4, in0);
6507     __ tbl(decL1, arrangement, codecL, 4, in1);
6508     __ tbl(decL2, arrangement, codecL, 4, in2);
6509     __ tbl(decL3, arrangement, codecL, 4, in3);
6510 
6511     // higher half lookup
6512     __ tbx(decH0, arrangement, codecH, 4, decH0);
6513     __ tbx(decH1, arrangement, codecH, 4, decH1);
6514     __ tbx(decH2, arrangement, codecH, 4, decH2);
6515     __ tbx(decH3, arrangement, codecH, 4, decH3);
6516 
6517     // combine lower and higher
6518     __ orr(decL0, arrangement, decL0, decH0);
6519     __ orr(decL1, arrangement, decL1, decH1);
6520     __ orr(decL2, arrangement, decL2, decH2);
6521     __ orr(decL3, arrangement, decL3, decH3);
6522 
    // check for illegal inputs: any value larger than 63 (the maximum of 6 bits)
6524     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
6525     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
6526     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
6527     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
6528     __ orr(in0, arrangement, decH0, decH1);
6529     __ orr(in1, arrangement, decH2, decH3);
6530     __ orr(in2, arrangement, in0,   in1);
6531     __ umaxv(in3, arrangement, in2);
6532     __ umov(rscratch2, in3, __ B, 0);
6533 
6534     // get the data to output
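    // Per byte lane: b0 = (s0 << 2) | (s1 >> 4), b1 = ((s1 & 0xF) << 4) | (s2 >> 2),
    // b2 = ((s2 & 0x3) << 6) | s3, where s0..s3 are the four 6-bit values.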
6535     __ shl(out0,  arrangement, decL0, 2);
6536     __ ushr(out1, arrangement, decL1, 4);
6537     __ orr(out0,  arrangement, out0,  out1);
6538     __ shl(out1,  arrangement, decL1, 4);
6539     __ ushr(out2, arrangement, decL2, 2);
6540     __ orr(out1,  arrangement, out1,  out2);
6541     __ shl(out2,  arrangement, decL2, 6);
6542     __ orr(out2,  arrangement, out2,  decL3);
6543 
6544     __ cbz(rscratch2, NoIllegalData);
6545 
6546     // handle illegal input
6547     __ umov(r10, in2, __ D, 0);
6548     if (size == 16) {
6549       __ cbnz(r10, ErrorInLowerHalf);
6550 
      // the illegal input is in the higher half; store the lower half now.
6552       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6553 
6554       __ umov(r10, in2,  __ D, 1);
6555       __ umov(r11, out0, __ D, 1);
6556       __ umov(r12, out1, __ D, 1);
6557       __ umov(r13, out2, __ D, 1);
6558       __ b(StoreLegalData);
6559 
6560       __ BIND(ErrorInLowerHalf);
6561     }
6562     __ umov(r11, out0, __ D, 0);
6563     __ umov(r12, out1, __ D, 0);
6564     __ umov(r13, out2, __ D, 0);
6565 
6566     __ BIND(StoreLegalData);
6567     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6568     __ strb(r11, __ post(dst, 1));
6569     __ strb(r12, __ post(dst, 1));
6570     __ strb(r13, __ post(dst, 1));
6571     __ lsr(r10, r10, 8);
6572     __ lsr(r11, r11, 8);
6573     __ lsr(r12, r12, 8);
6574     __ lsr(r13, r13, 8);
6575     __ b(StoreLegalData);
6576 
6577     __ BIND(NoIllegalData);
6578     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6579   }
6580 
6581 
6582    /**
6583    *  Arguments:
6584    *
6585    *  Input:
6586    *  c_rarg0   - src_start
6587    *  c_rarg1   - src_offset
6588    *  c_rarg2   - src_length
6589    *  c_rarg3   - dest_start
6590    *  c_rarg4   - dest_offset
6591    *  c_rarg5   - isURL
6592    *  c_rarg6   - isMIME
6593    *
6594    */
6595   address generate_base64_decodeBlock() {
6596 
6597     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6598     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6599     // titled "Base64 decoding".
6600 
    // The non-SIMD lookup tables are mostly copied from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic: java.util.Base64.fromBase64['='] == -2, while
    // fromBase(URL)64ForNoSIMD['='] == 255 here.
6604     static const uint8_t fromBase64ForNoSIMD[256] = {
6605       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6606       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6607       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6608        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6609       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6610        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6611       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6612        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6613       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6614       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6615       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6616       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6617       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6618       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6619       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6620       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6621     };
6622 
6623     static const uint8_t fromBase64URLForNoSIMD[256] = {
6624       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6625       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6626       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6627        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6628       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6629        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6630       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6631        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6632       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6633       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6634       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6635       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6636       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6637       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6638       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6639       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6640     };
6641 
    // A legal base64 code value is in the range [0, 127].  We need two lookups
    // with tbl/tbx and combine them to get the decoded data. The 1st table vector
    // lookup uses tbl; out-of-range indices are set to 0 in the destination. The
    // 2nd table vector lookup uses tbx; out-of-range indices leave the destination
    // unchanged. Input [64..126] is mapped to index [65, 127] in the second lookup.
    // The value at index 64 is set to 0, so that we know the 1st lookup already
    // produced the decoded data.
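    //
    // A rough per-byte scalar sketch of the combination (illustrative only;
    // tableLo/tableHi stand for the two 64-byte halves of the table):
    //   d   = saturating_sub(c, 63);        // 0 for c <= 63
    //   lo  = (c < 64) ? tableLo[c] : 0;    // tbl zeroes out-of-range indices
    //   hi  = (d < 64) ? tableHi[d] : d;    // tbx leaves them unchanged
    //   dec = lo | hi;                      // any result > 63 flags an illegal byte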
6649     static const uint8_t fromBase64ForSIMD[128] = {
6650       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6651       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6652       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6653        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6654         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6655        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6656       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6657        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6658     };
6659 
6660     static const uint8_t fromBase64URLForSIMD[128] = {
6661       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6662       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6663       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6664        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6665         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6666        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6667        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6668        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6669     };
6670 
6671     __ align(CodeEntryAlignment);
6672     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6673     address start = __ pc();
6674 
6675     Register src    = c_rarg0;  // source array
6676     Register soff   = c_rarg1;  // source start offset
6677     Register send   = c_rarg2;  // source end offset
6678     Register dst    = c_rarg3;  // dest array
6679     Register doff   = c_rarg4;  // position for writing to dest array
6680     Register isURL  = c_rarg5;  // Base64 or URL character set
6681     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6682 
6683     Register length = send;    // reuse send as length of source data to process
6684 
6685     Register simd_codec   = c_rarg6;
6686     Register nosimd_codec = c_rarg7;
6687 
6688     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6689 
6690     __ enter();
6691 
6692     __ add(src, src, soff);
6693     __ add(dst, dst, doff);
6694 
6695     __ mov(doff, dst);
6696 
6697     __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1); // clear the two low bits: round length down to a multiple of 4
6699 
6700     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6701     __ cbz(isURL, ProcessData);
6702     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6703 
6704     __ BIND(ProcessData);
6705     __ mov(rscratch1, length);
6706     __ cmp(length, (u1)144); // 144 = 80 + 64
6707     __ br(Assembler::LT, Process4B);
6708 
6709     // In the MIME case, the line length cannot be more than 76
6710     // bytes (see RFC 2045). This is too short a block for SIMD
6711     // to be worthwhile, so we use non-SIMD here.
6712     __ movw(rscratch1, 79);
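    // The Process4B loop below first consumes 80 bytes (20 iterations of 4
    // bytes), leaving rscratch1 at -1, which distinguishes this path from the
    // pure non-SIMD path (length < 144), where rscratch1 ends at 0.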
6713 
6714     __ BIND(Process4B);
6715     __ ldrw(r14, __ post(src, 4));
6716     __ ubfxw(r10, r14, 0,  8);
6717     __ ubfxw(r11, r14, 8,  8);
6718     __ ubfxw(r12, r14, 16, 8);
6719     __ ubfxw(r13, r14, 24, 8);
6720     // get the de-code
6721     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6722     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6723     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6724     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6725     // error detection, 255u indicates an illegal input
6726     __ orrw(r14, r10, r11);
6727     __ orrw(r15, r12, r13);
6728     __ orrw(r14, r14, r15);
6729     __ tbnz(r14, 7, Exit);
6730     // recover the data
6731     __ lslw(r14, r10, 10);
6732     __ bfiw(r14, r11, 4, 6);
6733     __ bfmw(r14, r12, 2, 5);
6734     __ rev16w(r14, r14);
6735     __ bfiw(r13, r12, 6, 2);
6736     __ strh(r14, __ post(dst, 2));
6737     __ strb(r13, __ post(dst, 1));
6738     // non-simd loop
6739     __ subsw(rscratch1, rscratch1, 4);
6740     __ br(Assembler::GT, Process4B);
6741 
6742     // if exiting from PreProcess80B, rscratch1 == -1;
6743     // otherwise, rscratch1 == 0.
6744     __ cbzw(rscratch1, Exit);
6745     __ sub(length, length, 80);
6746 
6747     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6748     __ cbz(isURL, SIMDEnter);
6749     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6750 
6751     __ BIND(SIMDEnter);
6752     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6753     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6754     __ mov(rscratch1, 63);
6755     __ dup(v27, __ T16B, rscratch1);
6756 
6757     __ BIND(Process64B);
6758     __ cmp(length, (u1)64);
6759     __ br(Assembler::LT, Process32B);
6760     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6761     __ sub(length, length, 64);
6762     __ b(Process64B);
6763 
6764     __ BIND(Process32B);
6765     __ cmp(length, (u1)32);
6766     __ br(Assembler::LT, SIMDExit);
6767     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6768     __ sub(length, length, 32);
6769     __ b(Process32B);
6770 
6771     __ BIND(SIMDExit);
6772     __ cbz(length, Exit);
6773     __ movw(rscratch1, length);
6774     __ b(Process4B);
6775 
6776     __ BIND(Exit);
6777     __ sub(c_rarg0, dst, doff);
6778 
6779     __ leave();
6780     __ ret(lr);
6781 
6782     return start;
6783   }
6784 
6785   // Support for spin waits.
6786   address generate_spin_wait() {
6787     __ align(CodeEntryAlignment);
6788     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6789     address start = __ pc();
6790 
6791     __ spin_wait();
6792     __ ret(lr);
6793 
6794     return start;
6795   }
6796 
6797 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6798 
6799   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6800   //
6801   // If LSE is in use, generate LSE versions of all the stubs. The
6802   // non-LSE versions are in atomic_aarch64.S.
6803 
6804   // class AtomicStubMark records the entry point of a stub and the
6805   // stub pointer which will point to it. The stub pointer is set to
6806   // the entry point when ~AtomicStubMark() is called, which must be
6807   // after ICache::invalidate_range. This ensures safe publication of
6808   // the generated code.
6809   class AtomicStubMark {
6810     address _entry_point;
6811     aarch64_atomic_stub_t *_stub;
6812     MacroAssembler *_masm;
6813   public:
6814     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6815       _masm = masm;
6816       __ align(32);
6817       _entry_point = __ pc();
6818       _stub = stub;
6819     }
6820     ~AtomicStubMark() {
6821       *_stub = (aarch64_atomic_stub_t)_entry_point;
6822     }
6823   };
6824 
6825   // NB: For memory_order_conservative we need a trailing membar after
6826   // LSE atomic operations but not a leading membar.
6827   //
6828   // We don't need a leading membar because a clause in the Arm ARM
6829   // says:
6830   //
6831   //   Barrier-ordered-before
6832   //
6833   //   Barrier instructions order prior Memory effects before subsequent
6834   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6838   //   instruction with both Acquire and Release semantics.
6839   //
6840   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6841   // and Release semantics, therefore we don't need a leading
6842   // barrier. However, there is no corresponding Barrier-ordered-after
6843   // relationship, therefore we need a trailing membar to prevent a
6844   // later store or load from being reordered with the store in an
6845   // atomic instruction.
6846   //
6847   // This was checked by using the herd7 consistency model simulator
6848   // (http://diy.inria.fr/) with this test case:
6849   //
6850   // AArch64 LseCas
6851   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6852   // P0 | P1;
6853   // LDR W4, [X2] | MOV W3, #0;
6854   // DMB LD       | MOV W4, #1;
6855   // LDR W3, [X1] | CASAL W3, W4, [X1];
6856   //              | DMB ISH;
6857   //              | STR W4, [X2];
6858   // exists
6859   // (0:X3=0 /\ 0:X4=1)
6860   //
6861   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6862   // with the store to x in P1. Without the DMB in P1 this may happen.
6863   //
6864   // At the time of writing we don't know of any AArch64 hardware that
6865   // reorders stores in this way, but the Reference Manual permits it.
6866 
6867   void gen_cas_entry(Assembler::operand_size size,
6868                      atomic_memory_order order) {
6869     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6870       exchange_val = c_rarg2;
6871     bool acquire, release;
6872     switch (order) {
6873       case memory_order_relaxed:
6874         acquire = false;
6875         release = false;
6876         break;
6877       case memory_order_release:
6878         acquire = false;
6879         release = true;
6880         break;
6881       default:
6882         acquire = true;
6883         release = true;
6884         break;
6885     }
6886     __ mov(prev, compare_val);
6887     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6888     if (order == memory_order_conservative) {
6889       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6890     }
6891     if (size == Assembler::xword) {
6892       __ mov(r0, prev);
6893     } else {
6894       __ movw(r0, prev);
6895     }
6896     __ ret(lr);
6897   }
6898 
6899   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6900     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6901     // If not relaxed, then default to conservative.  Relaxed is the only
6902     // case we use enough to be worth specializing.
6903     if (order == memory_order_relaxed) {
6904       __ ldadd(size, incr, prev, addr);
6905     } else {
6906       __ ldaddal(size, incr, prev, addr);
6907       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6908     }
6909     if (size == Assembler::xword) {
6910       __ mov(r0, prev);
6911     } else {
6912       __ movw(r0, prev);
6913     }
6914     __ ret(lr);
6915   }
6916 
6917   void gen_swpal_entry(Assembler::operand_size size) {
6918     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6919     __ swpal(size, incr, prev, addr);
6920     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6921     if (size == Assembler::xword) {
6922       __ mov(r0, prev);
6923     } else {
6924       __ movw(r0, prev);
6925     }
6926     __ ret(lr);
6927   }
6928 
6929   void generate_atomic_entry_points() {
6930     if (! UseLSE) {
6931       return;
6932     }
6933 
6934     __ align(CodeEntryAlignment);
6935     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6936     address first_entry = __ pc();
6937 
6938     // ADD, memory_order_conservative
6939     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6940     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6941     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6942     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6943 
6944     // ADD, memory_order_relaxed
6945     AtomicStubMark mark_fetch_add_4_relaxed
6946       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6947     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6948     AtomicStubMark mark_fetch_add_8_relaxed
6949       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6950     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6951 
6952     // XCHG, memory_order_conservative
6953     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6954     gen_swpal_entry(Assembler::word);
6955     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6956     gen_swpal_entry(Assembler::xword);
6957 
6958     // CAS, memory_order_conservative
6959     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6960     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6961     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6962     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6963     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6964     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6965 
6966     // CAS, memory_order_relaxed
6967     AtomicStubMark mark_cmpxchg_1_relaxed
6968       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6969     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6970     AtomicStubMark mark_cmpxchg_4_relaxed
6971       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6972     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6973     AtomicStubMark mark_cmpxchg_8_relaxed
6974       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6975     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6976 
6977     AtomicStubMark mark_cmpxchg_4_release
6978       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6979     gen_cas_entry(MacroAssembler::word, memory_order_release);
6980     AtomicStubMark mark_cmpxchg_8_release
6981       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6982     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6983 
6984     AtomicStubMark mark_cmpxchg_4_seq_cst
6985       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6986     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6987     AtomicStubMark mark_cmpxchg_8_seq_cst
6988       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6989     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6990 
6991     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6992   }
6993 #endif // defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6994 
6995   address generate_cont_thaw(Continuation::thaw_kind kind) {
6996     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
6997     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
6998 
6999     address start = __ pc();
7000 
7001     if (return_barrier) {
7002       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7003       __ mov(sp, rscratch1);
7004     }
7005     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7006 
7007     if (return_barrier) {
7008       // preserve possible return value from a method returning to the return barrier
7009       __ fmovd(rscratch1, v0);
7010       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7011     }
7012 
7013     __ movw(c_rarg1, (return_barrier ? 1 : 0));
7014     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
7015     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
7016 
7017     if (return_barrier) {
7018       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7019       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7020       __ fmovd(v0, rscratch1);
7021     }
7022     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7023 
7024 
7025     Label thaw_success;
7026     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
7027     __ cbnz(rscratch2, thaw_success);
7028     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
7029     __ br(rscratch1);
7030     __ bind(thaw_success);
7031 
7032     // make room for the thawed frames
7033     __ sub(rscratch1, sp, rscratch2);
7034     __ andr(rscratch1, rscratch1, -16); // align
7035     __ mov(sp, rscratch1);
7036 
7037     if (return_barrier) {
7038       // save original return value -- again
7039       __ fmovd(rscratch1, v0);
7040       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7041     }
7042 
7043     // If we want, we can templatize thaw by kind, and have three different entries
7044     __ movw(c_rarg1, (uint32_t)kind);
7045 
7046     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
7047     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
7048 
7049     if (return_barrier) {
7050       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7051       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7052       __ fmovd(v0, rscratch1);
7053     } else {
7054       __ mov(r0, zr); // return 0 (success) from doYield
7055     }
7056 
7057     // we're now on the yield frame (which is at an address above us because the sp has been pushed down)
7058     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
7059     __ mov(rfp, sp);
7060 
7061     if (return_barrier_exception) {
7062       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
7063       __ authenticate_return_address(c_rarg1);
7064       __ verify_oop(r0);
7065       // save return value containing the exception oop in callee-saved R19
7066       __ mov(r19, r0);
7067 
7068       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
7069 
7070       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
7071       // __ reinitialize_ptrue();
7072 
7073       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7074 
7075       __ mov(r1, r0); // the exception handler
7076       __ mov(r0, r19); // restore return value containing the exception oop
7077       __ verify_oop(r0);
7078 
7079       __ leave();
7080       __ mov(r3, lr);
7081       __ br(r1); // the exception handler
7082     } else {
7083       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7084       __ leave();
7085       __ ret(lr);
7086     }
7087 
7088     return start;
7089   }
7090 
7091   address generate_cont_thaw() {
7092     if (!Continuations::enabled()) return nullptr;
7093 
7094     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
7095     address start = __ pc();
7096     generate_cont_thaw(Continuation::thaw_top);
7097     return start;
7098   }
7099 
7100   address generate_cont_returnBarrier() {
7101     if (!Continuations::enabled()) return nullptr;
7102 
7103     // TODO: will probably need multiple return barriers depending on return type
7104     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
7105     address start = __ pc();
7106 
7107     generate_cont_thaw(Continuation::thaw_return_barrier);
7108 
7109     return start;
7110   }
7111 
7112   address generate_cont_returnBarrier_exception() {
7113     if (!Continuations::enabled()) return nullptr;
7114 
7115     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
7116     address start = __ pc();
7117 
7118     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7119 
7120     return start;
7121   }
7122 
7123   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
7124   // are represented as long[5], with BITS_PER_LIMB = 26.
7125   // Pack five 26-bit limbs into three 64-bit registers.
7126   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
7127     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
7128     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
7129     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
7130     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
7131 
7132     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
7133     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
7134     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
7135     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
7136 
7137     if (dest2->is_valid()) {
7138       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7139     } else {
7140 #ifdef ASSERT
7141       Label OK;
7142       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7143       __ br(__ EQ, OK);
7144       __ stop("high bits of Poly1305 integer should be zero");
7145       __ should_not_reach_here();
7146       __ bind(OK);
7147 #endif
7148     }
7149   }
7150 
7151   // As above, but return only a 128-bit integer, packed into two
7152   // 64-bit registers.
7153   void pack_26(Register dest0, Register dest1, Register src) {
7154     pack_26(dest0, dest1, noreg, src);
7155   }
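
       // In other words (illustrative only), pack_26 gathers the value
       //
       //   src[0] + (src[1] << 26) + (src[2] << 52) + (src[3] << 78) + (src[4] << 104)
       //
       // into dest2:dest1:dest0, where dest0 holds bits 0..63, dest1 holds
       // bits 64..127, and dest2 holds bits 128 and up (asserted to be zero
       // when dest2 is noreg).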
7156 
7157   // Multiply and multiply-accumulate unsigned 64-bit registers.
7158   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
7159     __ mul(prod_lo, n, m);
7160     __ umulh(prod_hi, n, m);
7161   }
7162   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
7163     wide_mul(rscratch1, rscratch2, n, m);
7164     __ adds(sum_lo, sum_lo, rscratch1);
7165     __ adc(sum_hi, sum_hi, rscratch2);
7166   }
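
       // In C, approximately (illustrative only):
       //
       //   prod_hi:prod_lo  = (unsigned __int128)n * m;   // wide_mul
       //   sum_hi:sum_lo   += (unsigned __int128)n * m;   // wide_madd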
7167 
7168   // Poly1305, RFC 7539
7169 
7170   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
7171   // description of the tricks used to simplify and accelerate this
7172   // computation.
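       //
       // Per 16-byte block the stub below computes the usual Poly1305 update
       //
       //   U = (U + block + 2^128) * R  (mod 2^130 - 5),
       //
       // keeping U only partially reduced between blocks.  In pseudocode
       // (illustrative only; the helper names are placeholders):
       //
       //   while (length >= BLOCK_LENGTH) {
       //     S = U + load_le128(input) + ((uint192)1 << 128);  // S_2:S_1:S_0
       //     U = partial_reduce(S * R);                        // U_2:U_1:U_0
       //     input += BLOCK_LENGTH;  length -= BLOCK_LENGTH;
       //   }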
7173 
7174   address generate_poly1305_processBlocks() {
7175     __ align(CodeEntryAlignment);
7176     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
7177     address start = __ pc();
7178     Label here;
7179     __ enter();
7180     RegSet callee_saved = RegSet::range(r19, r28);
7181     __ push(callee_saved, sp);
7182 
7183     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
7184 
7185     // Arguments
7186     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
7187 
7188     // R_n is the 128-bit randomly-generated key, packed into two
7189     // registers.  The caller passes this key to us as long[5], with
7190     // BITS_PER_LIMB = 26.
7191     const Register R_0 = *++regs, R_1 = *++regs;
7192     pack_26(R_0, R_1, r_start);
7193 
7194     // RR_n is (R_n >> 2) * 5
7195     const Register RR_0 = *++regs, RR_1 = *++regs;
7196     __ lsr(RR_0, R_0, 2);
7197     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
7198     __ lsr(RR_1, R_1, 2);
7199     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
7200 
7201     // U_n is the current checksum
7202     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
7203     pack_26(U_0, U_1, U_2, acc_start);
7204 
7205     static constexpr int BLOCK_LENGTH = 16;
7206     Label DONE, LOOP;
7207 
7208     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7209     __ br(Assembler::LT, DONE); {
7210       __ bind(LOOP);
7211 
7212       // S_n is to be the sum of U_n and the next block of data
7213       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7214       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
7215       __ adds(S_0, U_0, S_0);
7216       __ adcs(S_1, U_1, S_1);
7217       __ adc(S_2, U_2, zr);
7218       __ add(S_2, S_2, 1);
7219 
7220       const Register U_0HI = *++regs, U_1HI = *++regs;
7221 
7222       // NB: this logic depends on some of the special properties of
7223       // Poly1305 keys. In particular, because we know that the top
7224       // four bits of R_0 and R_1 are zero, we can add together
7225       // partial products without any risk of needing to propagate a
7226       // carry out.
7227       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
7228       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
7229       __ andr(U_2, R_0, 3);
7230       __ mul(U_2, S_2, U_2);
7231 
7232       // Recycle registers S_0, S_1, S_2
7233       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
7234 
7235       // Partial reduction mod 2**130 - 5
7236       __ adds(U_1, U_0HI, U_1);
7237       __ adc(U_2, U_1HI, U_2);
7238       // Sum now in U_2:U_1:U_0.
7239       // Dead: U_0HI, U_1HI.
7240       regs = (regs.remaining() + U_0HI + U_1HI).begin();
7241 
7242       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
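           // (Because 2^130 == 5 (mod 2^130 - 5), the bits of U_2 above bit 1
           // fold back in multiplied by 5; 5*x is formed as x + 4*x, so no
           // multiply is needed.)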
7243 
7244       // First, U_2:U_1:U_0 += (U_2 >> 2)
7245       __ lsr(rscratch1, U_2, 2);
7246       __ andr(U_2, U_2, (u8)3);
7247       __ adds(U_0, U_0, rscratch1);
7248       __ adcs(U_1, U_1, zr);
7249       __ adc(U_2, U_2, zr);
7250       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
7251       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
7252       __ adcs(U_1, U_1, zr);
7253       __ adc(U_2, U_2, zr);
7254 
7255       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
7256       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7257       __ br(~ Assembler::LT, LOOP);
7258     }
7259 
7260     // Further reduce modulo 2^130 - 5
7261     __ lsr(rscratch1, U_2, 2);
7262     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
7263     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
7264     __ adcs(U_1, U_1, zr);
7265     __ andr(U_2, U_2, (u1)3);
7266     __ adc(U_2, U_2, zr);
7267 
7268     // Unpack the sum into five 26-bit limbs and write to memory.
7269     __ ubfiz(rscratch1, U_0, 0, 26);
7270     __ ubfx(rscratch2, U_0, 26, 26);
7271     __ stp(rscratch1, rscratch2, Address(acc_start));
7272     __ ubfx(rscratch1, U_0, 52, 12);
7273     __ bfi(rscratch1, U_1, 12, 14);
7274     __ ubfx(rscratch2, U_1, 14, 26);
7275     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
7276     __ ubfx(rscratch1, U_1, 40, 24);
7277     __ bfi(rscratch1, U_2, 24, 3);
7278     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
7279 
7280     __ bind(DONE);
7281     __ pop(callee_saved, sp);
7282     __ leave();
7283     __ ret(lr);
7284 
7285     return start;
7286   }
7287 
7288 #if INCLUDE_JFR
7289 
7290   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
7291     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7292     __ mov(c_rarg0, thread);
7293   }
7294 
7295   // The jobject handle returned by the checkpoint call below is dereferenced through a load barrier (resolve_global_jobject).
7296   static void jfr_epilogue(MacroAssembler* _masm) {
7297     __ reset_last_Java_frame(true);
7298   }
7299 
7300   // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
7301   // It returns a jobject handle to the event writer.
7302   // The handle is dereferenced and the return value is the event writer oop.
7303   static RuntimeStub* generate_jfr_write_checkpoint() {
7304     enum layout {
7305       rbp_off,
7306       rbpH_off,
7307       return_off,
7308       return_off2,
7309       framesize // inclusive of return address
7310     };
7311 
7312     int insts_size = 1024;
7313     int locs_size = 64;
7314     CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
7315     OopMapSet* oop_maps = new OopMapSet();
7316     MacroAssembler* masm = new MacroAssembler(&code);
7317     MacroAssembler* _masm = masm;
7318 
7319     address start = __ pc();
7320     __ enter();
7321     int frame_complete = __ pc() - start;
7322     address the_pc = __ pc();
7323     jfr_prologue(the_pc, _masm, rthread);
7324     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
7325     jfr_epilogue(_masm);
7326     __ resolve_global_jobject(r0, rscratch1, rscratch2);
7327     __ leave();
7328     __ ret(lr);
7329 
7330     OopMap* map = new OopMap(framesize, 1); // rfp
7331     oop_maps->add_gc_map(the_pc - start, map);
7332 
7333     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7334       RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
7335                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7336                                     oop_maps, false);
7337     return stub;
7338   }
7339 
7340   // For c2: call to return a leased buffer.
7341   static RuntimeStub* generate_jfr_return_lease() {
7342     enum layout {
7343       rbp_off,
7344       rbpH_off,
7345       return_off,
7346       return_off2,
7347       framesize // inclusive of return address
7348     };
7349 
7350     int insts_size = 1024;
7351     int locs_size = 64;
7352     CodeBuffer code("jfr_return_lease", insts_size, locs_size);
7353     OopMapSet* oop_maps = new OopMapSet();
7354     MacroAssembler* masm = new MacroAssembler(&code);
7355     MacroAssembler* _masm = masm;
7356 
7357     address start = __ pc();
7358     __ enter();
7359     int frame_complete = __ pc() - start;
7360     address the_pc = __ pc();
7361     jfr_prologue(the_pc, _masm, rthread);
7362     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
7363     jfr_epilogue(_masm);
7364 
7365     __ leave();
7366     __ ret(lr);
7367 
7368     OopMap* map = new OopMap(framesize, 1); // rfp
7369     oop_maps->add_gc_map(the_pc - start, map);
7370 
7371     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7372       RuntimeStub::new_runtime_stub("jfr_return_lease", &code, frame_complete,
7373                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7374                                     oop_maps, false);
7375     return stub;
7376   }
7377 
7378 #endif // INCLUDE_JFR
7379 
7380   // exception handler for upcall stubs
7381   address generate_upcall_stub_exception_handler() {
7382     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
7383     address start = __ pc();
7384 
7385     // Native caller has no idea how to handle exceptions,
7386     // so we just crash here. Up to callee to catch exceptions.
7387     __ verify_oop(r0);
7388     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
7389     __ blr(rscratch1);
7390     __ should_not_reach_here();
7391 
7392     return start;
7393   }
7394 
7395   // Continuation point for throwing of implicit exceptions that are
7396   // not handled in the current activation. Fabricates an exception
7397   // oop and initiates normal exception dispatching in this
7398   // frame. Since we need to preserve callee-saved values (currently
7399   // only for C2, but done for C1 as well) we need a callee-saved oop
7400   // map and therefore have to make these stubs into RuntimeStubs
7401   // rather than BufferBlobs.  If the compiler needs all registers to
7402   // be preserved between the fault point and the exception handler
7403   // then it must assume responsibility for that in
7404   // AbstractCompiler::continuation_for_implicit_null_exception or
7405   // continuation_for_implicit_division_by_zero_exception. All other
7406   // implicit exceptions (e.g., NullPointerException or
7407   // AbstractMethodError on entry) are either at call sites or
7408   // otherwise assume that stack unwinding will be initiated, so
7409   // caller-saved registers are assumed volatile in the compiler.
7410 
7411 #undef __
7412 #define __ masm->
7413 
7414   address generate_throw_exception(const char* name,
7415                                    address runtime_entry,
7416                                    Register arg1 = noreg,
7417                                    Register arg2 = noreg) {
7418     // Information about frame layout at time of blocking runtime call.
7419     // Note that we only have to preserve callee-saved registers since
7420     // the compilers are responsible for supplying a continuation point
7421     // if they expect all registers to be preserved.
7422     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
7423     enum layout {
7424       rfp_off = 0,
7425       rfp_off2,
7426       return_off,
7427       return_off2,
7428       framesize // inclusive of return address
7429     };
7430 
7431     int insts_size = 512;
7432     int locs_size  = 64;
7433 
7434     CodeBuffer code(name, insts_size, locs_size);
7435     OopMapSet* oop_maps  = new OopMapSet();
7436     MacroAssembler* masm = new MacroAssembler(&code);
7437 
7438     address start = __ pc();
7439 
7440     // This is an inlined and slightly modified version of call_VM
7441     // which has the ability to fetch the return PC out of
7442     // thread-local storage and also sets up last_Java_sp slightly
7443     // differently than the real call_VM
7444 
7445     __ enter(); // Save FP and LR before call
7446 
7447     assert(is_even(framesize/2), "sp not 16-byte aligned");
7448 
7449     // lr and fp are already in place
7450     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
7451 
7452     int frame_complete = __ pc() - start;
7453 
7454     // Set up last_Java_sp and last_Java_fp
7455     address the_pc = __ pc();
7456     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7457 
7458     // Call runtime
7459     if (arg1 != noreg) {
7460       assert(arg2 != c_rarg1, "clobbered");
7461       __ mov(c_rarg1, arg1);
7462     }
7463     if (arg2 != noreg) {
7464       __ mov(c_rarg2, arg2);
7465     }
7466     __ mov(c_rarg0, rthread);
7467     BLOCK_COMMENT("call runtime_entry");
7468     __ mov(rscratch1, runtime_entry);
7469     __ blr(rscratch1);
7470 
7471     // Generate oop map
7472     OopMap* map = new OopMap(framesize, 0);
7473 
7474     oop_maps->add_gc_map(the_pc - start, map);
7475 
7476     __ reset_last_Java_frame(true);
7477 
7478     // Reinitialize the ptrue predicate register, in case the external runtime
7479     // call clobbers ptrue reg, as we may return to SVE compiled code.
7480     __ reinitialize_ptrue();
7481 
7482     __ leave();
7483 
7484     // check for pending exceptions
7485 #ifdef ASSERT
7486     Label L;
7487     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
7488     __ cbnz(rscratch1, L);
7489     __ should_not_reach_here();
7490     __ bind(L);
7491 #endif // ASSERT
7492     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7493 
7494     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7495     RuntimeStub* stub =
7496       RuntimeStub::new_runtime_stub(name,
7497                                     &code,
7498                                     frame_complete,
7499                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7500                                     oop_maps, false);
7501     return stub->entry_point();
7502   }
7503 
7504   class MontgomeryMultiplyGenerator : public MacroAssembler {
7505 
7506     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7507       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7508 
7509     RegSet _toSave;
7510     bool _squaring;
7511 
7512   public:
7513     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7514       : MacroAssembler(as->code()), _squaring(squaring) {
7515 
7516       // Register allocation
7517 
7518       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7519       Pa_base = *regs;       // Argument registers
7520       if (squaring)
7521         Pb_base = Pa_base;
7522       else
7523         Pb_base = *++regs;
7524       Pn_base = *++regs;
7525       Rlen = *++regs;
7526       inv = *++regs;
7527       Pm_base = *++regs;
7528 
7529                           // Working registers:
7530       Ra =  *++regs;        // The current digit of a, b, n, and m.
7531       Rb =  *++regs;
7532       Rm =  *++regs;
7533       Rn =  *++regs;
7534 
7535       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7536       Pb =  *++regs;
7537       Pm =  *++regs;
7538       Pn =  *++regs;
7539 
7540       t0 =  *++regs;        // Three registers which form a
7541       t1 =  *++regs;        // triple-precision accumulator.
7542       t2 =  *++regs;
7543 
7544       Ri =  *++regs;        // Inner and outer loop indexes.
7545       Rj =  *++regs;
7546 
7547       Rhi_ab = *++regs;     // Product registers: low and high parts
7548       Rlo_ab = *++regs;     // of a*b and m*n.
7549       Rhi_mn = *++regs;
7550       Rlo_mn = *++regs;
7551 
7552       // r19 and up are callee-saved.
7553       _toSave = RegSet::range(r19, *regs) + Pm_base;
7554     }
7555 
7556   private:
7557     void save_regs() {
7558       push(_toSave, sp);
7559     }
7560 
7561     void restore_regs() {
7562       pop(_toSave, sp);
7563     }
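
         // unroll_2 emits 'block' twice per loop iteration, first branching
         // into the middle of the loop when the count is odd.  Roughly
         // (illustrative only):
         //
         //   if (count & 1) goto odd;
         //   if (count == 0) goto end;
         //   loop: block();
         //   odd:  block();
         //         count -= 2;
         //         if (count > 0) goto loop;
         //   end:  ;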
7564 
7565     template <typename T>
7566     void unroll_2(Register count, T block) {
7567       Label loop, end, odd;
7568       tbnz(count, 0, odd);
7569       cbz(count, end);
7570       align(16);
7571       bind(loop);
7572       (this->*block)();
7573       bind(odd);
7574       (this->*block)();
7575       subs(count, count, 2);
7576       br(Assembler::GT, loop);
7577       bind(end);
7578     }
7579 
7580     template <typename T>
7581     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7582       Label loop, end, odd;
7583       tbnz(count, 0, odd);
7584       cbz(count, end);
7585       align(16);
7586       bind(loop);
7587       (this->*block)(d, s, tmp);
7588       bind(odd);
7589       (this->*block)(d, s, tmp);
7590       subs(count, count, 2);
7591       br(Assembler::GT, loop);
7592       bind(end);
7593     }
7594 
7595     void pre1(RegisterOrConstant i) {
7596       block_comment("pre1");
7597       // Pa = Pa_base;
7598       // Pb = Pb_base + i;
7599       // Pm = Pm_base;
7600       // Pn = Pn_base + i;
7601       // Ra = *Pa;
7602       // Rb = *Pb;
7603       // Rm = *Pm;
7604       // Rn = *Pn;
7605       ldr(Ra, Address(Pa_base));
7606       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7607       ldr(Rm, Address(Pm_base));
7608       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7609       lea(Pa, Address(Pa_base));
7610       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7611       lea(Pm, Address(Pm_base));
7612       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7613 
7614       // Zero the m*n result.
7615       mov(Rhi_mn, zr);
7616       mov(Rlo_mn, zr);
7617     }
7618 
7619     // The core multiply-accumulate step of a Montgomery
7620     // multiplication.  The idea is to schedule operations as a
7621     // pipeline so that instructions with long latencies (loads and
7622     // multiplies) have time to complete before their results are
7623     // used.  This most benefits in-order implementations of the
7624     // architecture but out-of-order ones also benefit.
7625     void step() {
7626       block_comment("step");
7627       // MACC(Ra, Rb, t0, t1, t2);
7628       // Ra = *++Pa;
7629       // Rb = *--Pb;
7630       umulh(Rhi_ab, Ra, Rb);
7631       mul(Rlo_ab, Ra, Rb);
7632       ldr(Ra, pre(Pa, wordSize));
7633       ldr(Rb, pre(Pb, -wordSize));
7634       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7635                                        // previous iteration.
7636       // MACC(Rm, Rn, t0, t1, t2);
7637       // Rm = *++Pm;
7638       // Rn = *--Pn;
7639       umulh(Rhi_mn, Rm, Rn);
7640       mul(Rlo_mn, Rm, Rn);
7641       ldr(Rm, pre(Pm, wordSize));
7642       ldr(Rn, pre(Pn, -wordSize));
7643       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7644     }
7645 
7646     void post1() {
7647       block_comment("post1");
7648 
7649       // MACC(Ra, Rb, t0, t1, t2);
7650       // Ra = *++Pa;
7651       // Rb = *--Pb;
7652       umulh(Rhi_ab, Ra, Rb);
7653       mul(Rlo_ab, Ra, Rb);
7654       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7655       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7656 
7657       // *Pm = Rm = t0 * inv;
7658       mul(Rm, t0, inv);
7659       str(Rm, Address(Pm));
7660 
7661       // MACC(Rm, Rn, t0, t1, t2);
7662       // t0 = t1; t1 = t2; t2 = 0;
7663       umulh(Rhi_mn, Rm, Rn);
7664 
7665 #ifndef PRODUCT
7666       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7667       {
7668         mul(Rlo_mn, Rm, Rn);
7669         add(Rlo_mn, t0, Rlo_mn);
7670         Label ok;
7671         cbz(Rlo_mn, ok); {
7672           stop("broken Montgomery multiply");
7673         } bind(ok);
7674       }
7675 #endif
7676       // We have very carefully set things up so that
7677       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7678       // the lower half of Rm * Rn because we know the result already:
7679       // it must be -t0.  t0 + (-t0) must generate a carry iff
7680       // t0 != 0.  So, rather than do a mul and an adds we just set
7681       // the carry flag iff t0 is nonzero.
7682       //
7683       // mul(Rlo_mn, Rm, Rn);
7684       // adds(zr, t0, Rlo_mn);
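           // (On AArch64 a subtract sets the carry flag when there is no
           // borrow, so t0 - 1 sets carry exactly when t0 != 0.)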
7685       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7686       adcs(t0, t1, Rhi_mn);
7687       adc(t1, t2, zr);
7688       mov(t2, zr);
7689     }
7690 
7691     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7692       block_comment("pre2");
7693       // Pa = Pa_base + i-len;
7694       // Pb = Pb_base + len;
7695       // Pm = Pm_base + i-len;
7696       // Pn = Pn_base + len;
7697 
7698       if (i.is_register()) {
7699         sub(Rj, i.as_register(), len);
7700       } else {
7701         mov(Rj, i.as_constant());
7702         sub(Rj, Rj, len);
7703       }
7704       // Rj == i-len
7705 
7706       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7707       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7708       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7709       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7710 
7711       // Ra = *++Pa;
7712       // Rb = *--Pb;
7713       // Rm = *++Pm;
7714       // Rn = *--Pn;
7715       ldr(Ra, pre(Pa, wordSize));
7716       ldr(Rb, pre(Pb, -wordSize));
7717       ldr(Rm, pre(Pm, wordSize));
7718       ldr(Rn, pre(Pn, -wordSize));
7719 
7720       mov(Rhi_mn, zr);
7721       mov(Rlo_mn, zr);
7722     }
7723 
7724     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7725       block_comment("post2");
7726       if (i.is_constant()) {
7727         mov(Rj, i.as_constant()-len.as_constant());
7728       } else {
7729         sub(Rj, i.as_register(), len);
7730       }
7731 
7732       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7733 
7734       // As soon as we know the least significant digit of our result,
7735       // store it.
7736       // Pm_base[i-len] = t0;
7737       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7738 
7739       // t0 = t1; t1 = t2; t2 = 0;
7740       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7741       adc(t1, t2, zr);
7742       mov(t2, zr);
7743     }
7744 
7745     // A carry in t0 after Montgomery multiplication means that we
7746     // should subtract multiples of n from our result in m.  We'll
7747     // keep doing that until there is no carry.
7748     void normalize(RegisterOrConstant len) {
7749       block_comment("normalize");
7750       // while (t0)
7751       //   t0 = sub(Pm_base, Pn_base, t0, len);
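           // where sub(m, n, t0, len) subtracts the len-word value n from m
           // in place and returns t0 minus the final borrow.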
7752       Label loop, post, again;
7753       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7754       cbz(t0, post); {
7755         bind(again); {
7756           mov(i, zr);
7757           mov(cnt, len);
7758           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7759           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7760           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7761           align(16);
7762           bind(loop); {
7763             sbcs(Rm, Rm, Rn);
7764             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7765             add(i, i, 1);
7766             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7767             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7768             sub(cnt, cnt, 1);
7769           } cbnz(cnt, loop);
7770           sbc(t0, t0, zr);
7771         } cbnz(t0, again);
7772       } bind(post);
7773     }
7774 
7775     // Move memory at s to d, reversing words.
7776     //    Increments d to end of copied memory
7777     //    Destroys tmp1, tmp2
7778     //    Preserves len
7779     //    Leaves s pointing to the address which was in d at start
7780     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7781       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7782       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7783 
7784       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7785       mov(tmp1, len);
7786       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7787       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7788     }
7789     // where
7790     void reverse1(Register d, Register s, Register tmp) {
7791       ldr(tmp, pre(s, -wordSize));
7792       ror(tmp, tmp, 32);
7793       str(tmp, post(d, wordSize));
7794     }
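
         // In C, approximately (illustrative only):
         //
         //   julong *src = s + len, *dst = d;
         //   for (int i = 0; i < len; i++) {
         //     julong w = *--src;               // walk s backwards
         //     *dst++ = (w << 32) | (w >> 32);  // swap the 32-bit halves
         //   }
         //   s = d;  d = dst;                   // final register values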
7795 
7796     void step_squaring() {
7797       // An extra ACC
7798       step();
7799       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7800     }
7801 
7802     void last_squaring(RegisterOrConstant i) {
7803       Label dont;
7804       // if ((i & 1) == 0) {
7805       tbnz(i.as_register(), 0, dont); {
7806         // MACC(Ra, Rb, t0, t1, t2);
7807         // Ra = *++Pa;
7808         // Rb = *--Pb;
7809         umulh(Rhi_ab, Ra, Rb);
7810         mul(Rlo_ab, Ra, Rb);
7811         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7812       } bind(dont);
7813     }
7814 
7815     void extra_step_squaring() {
7816       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7817 
7818       // MACC(Rm, Rn, t0, t1, t2);
7819       // Rm = *++Pm;
7820       // Rn = *--Pn;
7821       umulh(Rhi_mn, Rm, Rn);
7822       mul(Rlo_mn, Rm, Rn);
7823       ldr(Rm, pre(Pm, wordSize));
7824       ldr(Rn, pre(Pn, -wordSize));
7825     }
7826 
7827     void post1_squaring() {
7828       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7829 
7830       // *Pm = Rm = t0 * inv;
7831       mul(Rm, t0, inv);
7832       str(Rm, Address(Pm));
7833 
7834       // MACC(Rm, Rn, t0, t1, t2);
7835       // t0 = t1; t1 = t2; t2 = 0;
7836       umulh(Rhi_mn, Rm, Rn);
7837 
7838 #ifndef PRODUCT
7839       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7840       {
7841         mul(Rlo_mn, Rm, Rn);
7842         add(Rlo_mn, t0, Rlo_mn);
7843         Label ok;
7844         cbz(Rlo_mn, ok); {
7845           stop("broken Montgomery multiply");
7846         } bind(ok);
7847       }
7848 #endif
7849       // We have very carefully set things up so that
7850       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7851       // the lower half of Rm * Rn because we know the result already:
7852       // it must be -t0.  t0 + (-t0) must generate a carry iff
7853       // t0 != 0.  So, rather than do a mul and an adds we just set
7854       // the carry flag iff t0 is nonzero.
7855       //
7856       // mul(Rlo_mn, Rm, Rn);
7857       // adds(zr, t0, Rlo_mn);
7858       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7859       adcs(t0, t1, Rhi_mn);
7860       adc(t1, t2, zr);
7861       mov(t2, zr);
7862     }
7863 
7864     void acc(Register Rhi, Register Rlo,
7865              Register t0, Register t1, Register t2) {
7866       adds(t0, t0, Rlo);
7867       adcs(t1, t1, Rhi);
7868       adc(t2, t2, zr);
7869     }
7870 
7871   public:
7872     /**
7873      * Fast Montgomery multiplication.  The derivation of the
7874      * algorithm is in A Cryptographic Library for the Motorola
7875      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7876      *
7877      * Arguments:
7878      *
7879      * Inputs for multiplication:
7880      *   c_rarg0   - int array elements a
7881      *   c_rarg1   - int array elements b
7882      *   c_rarg2   - int array elements n (the modulus)
7883      *   c_rarg3   - int length
7884      *   c_rarg4   - int inv
7885      *   c_rarg5   - int array elements m (the result)
7886      *
7887      * Inputs for squaring:
7888      *   c_rarg0   - int array elements a
7889      *   c_rarg1   - int array elements n (the modulus)
7890      *   c_rarg2   - int length
7891      *   c_rarg3   - int inv
7892      *   c_rarg4   - int array elements m (the result)
7893      *
7894      */
7895     address generate_multiply() {
7896       Label argh, nothing;
7897       bind(argh);
7898       stop("MontgomeryMultiply total_allocation must be <= 8192");
7899 
7900       align(CodeEntryAlignment);
7901       address entry = pc();
7902 
7903       cbzw(Rlen, nothing);
7904 
7905       enter();
7906 
7907       // Make room.
7908       cmpw(Rlen, 512);
7909       br(Assembler::HI, argh);
7910       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7911       andr(sp, Ra, -2 * wordSize);
7912 
7913       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7914 
7915       {
7916         // Copy input args, reversing as we go.  We use Ra as a
7917         // temporary variable.
7918         reverse(Ra, Pa_base, Rlen, t0, t1);
7919         if (!_squaring)
7920           reverse(Ra, Pb_base, Rlen, t0, t1);
7921         reverse(Ra, Pn_base, Rlen, t0, t1);
7922       }
7923 
7924       // Push all callee-saved registers and also Pm_base which we'll need
7925       // at the end.
7926       save_regs();
7927 
7928 #ifndef PRODUCT
7929       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7930       {
7931         ldr(Rn, Address(Pn_base, 0));
7932         mul(Rlo_mn, Rn, inv);
7933         subs(zr, Rlo_mn, -1);
7934         Label ok;
7935         br(EQ, ok); {
7936           stop("broken inverse in Montgomery multiply");
7937         } bind(ok);
7938       }
7939 #endif
7940 
7941       mov(Pm_base, Ra);
7942 
7943       mov(t0, zr);
7944       mov(t1, zr);
7945       mov(t2, zr);
7946 
7947       block_comment("for (int i = 0; i < len; i++) {");
7948       mov(Ri, zr); {
7949         Label loop, end;
7950         cmpw(Ri, Rlen);
7951         br(Assembler::GE, end);
7952 
7953         bind(loop);
7954         pre1(Ri);
7955 
7956         block_comment("  for (j = i; j; j--) {"); {
7957           movw(Rj, Ri);
7958           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7959         } block_comment("  } // j");
7960 
7961         post1();
7962         addw(Ri, Ri, 1);
7963         cmpw(Ri, Rlen);
7964         br(Assembler::LT, loop);
7965         bind(end);
7966         block_comment("} // i");
7967       }
7968 
7969       block_comment("for (int i = len; i < 2*len; i++) {");
7970       mov(Ri, Rlen); {
7971         Label loop, end;
7972         cmpw(Ri, Rlen, Assembler::LSL, 1);
7973         br(Assembler::GE, end);
7974 
7975         bind(loop);
7976         pre2(Ri, Rlen);
7977 
7978         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7979           lslw(Rj, Rlen, 1);
7980           subw(Rj, Rj, Ri);
7981           subw(Rj, Rj, 1);
7982           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7983         } block_comment("  } // j");
7984 
7985         post2(Ri, Rlen);
7986         addw(Ri, Ri, 1);
7987         cmpw(Ri, Rlen, Assembler::LSL, 1);
7988         br(Assembler::LT, loop);
7989         bind(end);
7990       }
7991       block_comment("} // i");
7992 
7993       normalize(Rlen);
7994 
7995       mov(Ra, Pm_base);  // Save Pm_base in Ra
7996       restore_regs();  // Restore caller's Pm_base
7997 
7998       // Copy our result into caller's Pm_base
7999       reverse(Pm_base, Ra, Rlen, t0, t1);
8000 
8001       leave();
8002       bind(nothing);
8003       ret(lr);
8004 
8005       return entry;
8006     }
8007     // In C, approximately:
8008 
8009     // void
8010     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
8011     //                     julong Pn_base[], julong Pm_base[],
8012     //                     julong inv, int len) {
8013     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8014     //   julong *Pa, *Pb, *Pn, *Pm;
8015     //   julong Ra, Rb, Rn, Rm;
8016 
8017     //   int i;
8018 
8019     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8020 
8021     //   for (i = 0; i < len; i++) {
8022     //     int j;
8023 
8024     //     Pa = Pa_base;
8025     //     Pb = Pb_base + i;
8026     //     Pm = Pm_base;
8027     //     Pn = Pn_base + i;
8028 
8029     //     Ra = *Pa;
8030     //     Rb = *Pb;
8031     //     Rm = *Pm;
8032     //     Rn = *Pn;
8033 
8034     //     int iters = i;
8035     //     for (j = 0; iters--; j++) {
8036     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8037     //       MACC(Ra, Rb, t0, t1, t2);
8038     //       Ra = *++Pa;
8039     //       Rb = *--Pb;
8040     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8041     //       MACC(Rm, Rn, t0, t1, t2);
8042     //       Rm = *++Pm;
8043     //       Rn = *--Pn;
8044     //     }
8045 
8046     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
8047     //     MACC(Ra, Rb, t0, t1, t2);
8048     //     *Pm = Rm = t0 * inv;
8049     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8050     //     MACC(Rm, Rn, t0, t1, t2);
8051 
8052     //     assert(t0 == 0, "broken Montgomery multiply");
8053 
8054     //     t0 = t1; t1 = t2; t2 = 0;
8055     //   }
8056 
8057     //   for (i = len; i < 2*len; i++) {
8058     //     int j;
8059 
8060     //     Pa = Pa_base + i-len;
8061     //     Pb = Pb_base + len;
8062     //     Pm = Pm_base + i-len;
8063     //     Pn = Pn_base + len;
8064 
8065     //     Ra = *++Pa;
8066     //     Rb = *--Pb;
8067     //     Rm = *++Pm;
8068     //     Rn = *--Pn;
8069 
8070     //     int iters = len*2-i-1;
8071     //     for (j = i-len+1; iters--; j++) {
8072     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8073     //       MACC(Ra, Rb, t0, t1, t2);
8074     //       Ra = *++Pa;
8075     //       Rb = *--Pb;
8076     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8077     //       MACC(Rm, Rn, t0, t1, t2);
8078     //       Rm = *++Pm;
8079     //       Rn = *--Pn;
8080     //     }
8081 
8082     //     Pm_base[i-len] = t0;
8083     //     t0 = t1; t1 = t2; t2 = 0;
8084     //   }
8085 
8086     //   while (t0)
8087     //     t0 = sub(Pm_base, Pn_base, t0, len);
8088     // }
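
         // Here MACC(A, B, t0, t1, t2) accumulates the 128-bit product A*B
         // into the triple-precision accumulator t2:t1:t0 (implemented above
         // with umulh/mul followed by acc()), and MACC2 accumulates the same
         // product twice (used by the squaring code below).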
8089 
8090     /**
8091      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
8092      * multiplies than Montgomery multiplication so it should be up to
8093      * 25% faster.  However, its loop control is more complex and it
8094      * may actually run slower on some machines.
8095      *
8096      * Arguments:
8097      *
8098      * Inputs:
8099      *   c_rarg0   - int array elements a
8100      *   c_rarg1   - int array elements n (the modulus)
8101      *   c_rarg2   - int length
8102      *   c_rarg3   - int inv
8103      *   c_rarg4   - int array elements m (the result)
8104      *
8105      */
8106     address generate_square() {
8107       Label argh;
8108       bind(argh);
8109       stop("MontgomeryMultiply total_allocation must be <= 8192");
8110 
8111       align(CodeEntryAlignment);
8112       address entry = pc();
8113 
8114       enter();
8115 
8116       // Make room.
8117       cmpw(Rlen, 512);
8118       br(Assembler::HI, argh);
8119       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8120       andr(sp, Ra, -2 * wordSize);
8121 
8122       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8123 
8124       {
8125         // Copy input args, reversing as we go.  We use Ra as a
8126         // temporary variable.
8127         reverse(Ra, Pa_base, Rlen, t0, t1);
8128         reverse(Ra, Pn_base, Rlen, t0, t1);
8129       }
8130 
8131       // Push all callee-saved registers and also Pm_base which we'll need
8132       // at the end.
8133       save_regs();
8134 
8135       mov(Pm_base, Ra);
8136 
8137       mov(t0, zr);
8138       mov(t1, zr);
8139       mov(t2, zr);
8140 
8141       block_comment("for (int i = 0; i < len; i++) {");
8142       mov(Ri, zr); {
8143         Label loop, end;
8144         bind(loop);
8145         cmp(Ri, Rlen);
8146         br(Assembler::GE, end);
8147 
8148         pre1(Ri);
8149 
8150         block_comment("for (j = (i+1)/2; j; j--) {"); {
8151           add(Rj, Ri, 1);
8152           lsr(Rj, Rj, 1);
8153           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8154         } block_comment("  } // j");
8155 
8156         last_squaring(Ri);
8157 
8158         block_comment("  for (j = i/2; j; j--) {"); {
8159           lsr(Rj, Ri, 1);
8160           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8161         } block_comment("  } // j");
8162 
8163         post1_squaring();
8164         add(Ri, Ri, 1);
8165         cmp(Ri, Rlen);
8166         br(Assembler::LT, loop);
8167 
8168         bind(end);
8169         block_comment("} // i");
8170       }
8171 
8172       block_comment("for (int i = len; i < 2*len; i++) {");
8173       mov(Ri, Rlen); {
8174         Label loop, end;
8175         bind(loop);
8176         cmp(Ri, Rlen, Assembler::LSL, 1);
8177         br(Assembler::GE, end);
8178 
8179         pre2(Ri, Rlen);
8180 
8181         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
8182           lsl(Rj, Rlen, 1);
8183           sub(Rj, Rj, Ri);
8184           sub(Rj, Rj, 1);
8185           lsr(Rj, Rj, 1);
8186           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8187         } block_comment("  } // j");
8188 
8189         last_squaring(Ri);
8190 
8191         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
8192           lsl(Rj, Rlen, 1);
8193           sub(Rj, Rj, Ri);
8194           lsr(Rj, Rj, 1);
8195           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8196         } block_comment("  } // j");
8197 
8198         post2(Ri, Rlen);
8199         add(Ri, Ri, 1);
8200         cmp(Ri, Rlen, Assembler::LSL, 1);
8201 
8202         br(Assembler::LT, loop);
8203         bind(end);
8204         block_comment("} // i");
8205       }
8206 
8207       normalize(Rlen);
8208 
8209       mov(Ra, Pm_base);  // Save Pm_base in Ra
8210       restore_regs();  // Restore caller's Pm_base
8211 
8212       // Copy our result into caller's Pm_base
8213       reverse(Pm_base, Ra, Rlen, t0, t1);
8214 
8215       leave();
8216       ret(lr);
8217 
8218       return entry;
8219     }
8220     // In C, approximately:
8221 
8222     // void
8223     // montgomery_square(julong Pa_base[], julong Pn_base[],
8224     //                   julong Pm_base[], julong inv, int len) {
8225     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8226     //   julong *Pa, *Pb, *Pn, *Pm;
8227     //   julong Ra, Rb, Rn, Rm;
8228 
8229     //   int i;
8230 
8231     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8232 
8233     //   for (i = 0; i < len; i++) {
8234     //     int j;
8235 
8236     //     Pa = Pa_base;
8237     //     Pb = Pa_base + i;
8238     //     Pm = Pm_base;
8239     //     Pn = Pn_base + i;
8240 
8241     //     Ra = *Pa;
8242     //     Rb = *Pb;
8243     //     Rm = *Pm;
8244     //     Rn = *Pn;
8245 
8246     //     int iters = (i+1)/2;
8247     //     for (j = 0; iters--; j++) {
8248     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8249     //       MACC2(Ra, Rb, t0, t1, t2);
8250     //       Ra = *++Pa;
8251     //       Rb = *--Pb;
8252     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8253     //       MACC(Rm, Rn, t0, t1, t2);
8254     //       Rm = *++Pm;
8255     //       Rn = *--Pn;
8256     //     }
8257     //     if ((i & 1) == 0) {
8258     //       assert(Ra == Pa_base[j], "must be");
8259     //       MACC(Ra, Ra, t0, t1, t2);
8260     //     }
8261     //     iters = i/2;
8262     //     assert(iters == i-j, "must be");
8263     //     for (; iters--; j++) {
8264     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8265     //       MACC(Rm, Rn, t0, t1, t2);
8266     //       Rm = *++Pm;
8267     //       Rn = *--Pn;
8268     //     }
8269 
8270     //     *Pm = Rm = t0 * inv;
8271     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8272     //     MACC(Rm, Rn, t0, t1, t2);
8273 
8274     //     assert(t0 == 0, "broken Montgomery multiply");
8275 
8276     //     t0 = t1; t1 = t2; t2 = 0;
8277     //   }
8278 
8279     //   for (i = len; i < 2*len; i++) {
8280     //     int start = i-len+1;
8281     //     int end = start + (len - start)/2;
8282     //     int j;
8283 
8284     //     Pa = Pa_base + i-len;
8285     //     Pb = Pa_base + len;
8286     //     Pm = Pm_base + i-len;
8287     //     Pn = Pn_base + len;
8288 
8289     //     Ra = *++Pa;
8290     //     Rb = *--Pb;
8291     //     Rm = *++Pm;
8292     //     Rn = *--Pn;
8293 
8294     //     int iters = (2*len-i-1)/2;
8295     //     assert(iters == end-start, "must be");
8296     //     for (j = start; iters--; j++) {
8297     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8298     //       MACC2(Ra, Rb, t0, t1, t2);
8299     //       Ra = *++Pa;
8300     //       Rb = *--Pb;
8301     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8302     //       MACC(Rm, Rn, t0, t1, t2);
8303     //       Rm = *++Pm;
8304     //       Rn = *--Pn;
8305     //     }
8306     //     if ((i & 1) == 0) {
8307     //       assert(Ra == Pa_base[j], "must be");
8308     //       MACC(Ra, Ra, t0, t1, t2);
8309     //     }
8310     //     iters =  (2*len-i)/2;
8311     //     assert(iters == len-j, "must be");
8312     //     for (; iters--; j++) {
8313     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8314     //       MACC(Rm, Rn, t0, t1, t2);
8315     //       Rm = *++Pm;
8316     //       Rn = *--Pn;
8317     //     }
8318     //     Pm_base[i-len] = t0;
8319     //     t0 = t1; t1 = t2; t2 = 0;
8320     //   }
8321 
8322     //   while (t0)
8323     //     t0 = sub(Pm_base, Pn_base, t0, len);
8324     // }
8325   };
8326 
8327 
8328   // Called from the interpreter or compiled code either to load the
8329   // multiple returned values from the inline type instance being
8330   // returned into registers, or to store the returned values into a
8331   // newly allocated inline type instance.
8332   address generate_return_value_stub(address destination, const char* name, bool has_res) {
8333     // We need to save all registers the calling convention may use so
8334     // that the runtime call can read or update those registers. This needs to
8335     // be in sync with SharedRuntime::java_return_convention().
8336     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
8337     enum layout {
8338       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
8339       j_rarg6_off, j_rarg6_2,
8340       j_rarg5_off, j_rarg5_2,
8341       j_rarg4_off, j_rarg4_2,
8342       j_rarg3_off, j_rarg3_2,
8343       j_rarg2_off, j_rarg2_2,
8344       j_rarg1_off, j_rarg1_2,
8345       j_rarg0_off, j_rarg0_2,
8346 
8347       j_farg7_off, j_farg7_2,
8348       j_farg6_off, j_farg6_2,
8349       j_farg5_off, j_farg5_2,
8350       j_farg4_off, j_farg4_2,
8351       j_farg3_off, j_farg3_2,
8352       j_farg2_off, j_farg2_2,
8353       j_farg1_off, j_farg1_2,
8354       j_farg0_off, j_farg0_2,
8355 
8356       rfp_off, rfp_off2,
8357       return_off, return_off2,
8358 
8359       framesize // inclusive of return address
8360     };
8361 
8362     CodeBuffer code(name, 512, 64);
8363     MacroAssembler* masm = new MacroAssembler(&code);
8364 
8365     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
8366     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
8367     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
8368     int frame_size_in_words = frame_size_in_bytes / wordSize;
8369 
8370     OopMapSet* oop_maps = new OopMapSet();
8371     OopMap* map = new OopMap(frame_size_in_slots, 0);
8372 
8373     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
8374     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
8375     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
8376     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
8377     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
8378     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
8379     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
8380     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
8381 
8382     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
8383     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
8384     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
8385     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
8386     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
8387     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
8388     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
8389     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
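         // The oop map records where each Java return-convention register is saved
         // so that a GC occurring during the runtime call can find and update any
         // oops they contain.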
8390 
8391     address start = __ pc();
8392 
8393     __ enter(); // Save FP and LR before call
8394 
8395     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
8396     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
8397     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
8398     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
8399 
8400     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
8401     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
8402     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
8403     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
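         // The pushes above must mirror the 'layout' enum: the last pair pushed
         // (j_rarg7, j_rarg6) sits at the lowest addresses, i.e. stack slots 0..3.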
8404 
8405     int frame_complete = __ offset();
8406 
8407     // Set up last_Java_sp and last_Java_pc (last_Java_fp is not needed here)
8408     address the_pc = __ pc();
8409     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
8410 
8411     // Call runtime
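         // Pass the current thread in c_rarg0 and the raw Java return value
         // (currently in r0) in c_rarg1.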
8412     __ mov(c_rarg1, r0);
8413     __ mov(c_rarg0, rthread);
8414 
8415     __ mov(rscratch1, destination);
8416     __ blr(rscratch1);
8417 
8418     oop_maps->add_gc_map(the_pc - start, map);
8419 
8420     __ reset_last_Java_frame(false);
8421 
8422     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
8423     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
8424     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
8425     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
8426 
8427     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
8428     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
8429     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
8430     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
8431 
8432     __ leave();
8433 
8434     // check for pending exceptions
8435     Label pending;
8436     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
8437     __ cbnz(rscratch1, pending);
8438 
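         // When a result is expected (the buffering variant), the runtime call
         // returns the newly allocated inline type instance via the thread's
         // vm_result; fetch it into r0 here.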
8439     if (has_res) {
8440       __ get_vm_result(r0, rthread);
8441     }
8442 
8443     __ ret(lr);
8444 
8445     __ bind(pending);
8446     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
8447 
8448     // -------------
8449     // make sure all code is generated
8450     masm->flush();
8451 
8452     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
8453     return stub->entry_point();
8454   }
8455 
8456   // Initialization
8457   void generate_initial_stubs() {
8458     // Generates the initial stubs and initializes the entry points
8459 
8460     // entry points that exist on all platforms. Note: This is code
8461     // that could be shared among different platforms - however the
8462     // benefit seems to be smaller than the disadvantage of having a
8463     // much more complicated generator structure. See also comment in
8464     // stubRoutines.hpp.
8465 
8466     StubRoutines::_forward_exception_entry = generate_forward_exception();
8467 
8468     StubRoutines::_call_stub_entry =
8469       generate_call_stub(StubRoutines::_call_stub_return_address);
8470 
8471     // is referenced by megamorphic call
8472     StubRoutines::_catch_exception_entry = generate_catch_exception();
8473 
8474     // Build this early so it's available for the interpreter.
8475     StubRoutines::_throw_StackOverflowError_entry =
8476       generate_throw_exception("StackOverflowError throw_exception",
8477                                CAST_FROM_FN_PTR(address,
8478                                                 SharedRuntime::throw_StackOverflowError));
8479     StubRoutines::_throw_delayed_StackOverflowError_entry =
8480       generate_throw_exception("delayed StackOverflowError throw_exception",
8481                                CAST_FROM_FN_PTR(address,
8482                                                 SharedRuntime::throw_delayed_StackOverflowError));
8483 
8484     // Initialize table for copy memory (arraycopy) check.
8485     if (UnsafeCopyMemory::_table == nullptr) {
8486       UnsafeCopyMemory::create_table(8);
8487     }
8488 
8489     if (UseCRC32Intrinsics) {
8490       // Set the table address before generating the stubs that use it.
8491       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8492       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8493     }
8494 
8495     if (UseCRC32CIntrinsics) {
8496       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8497     }
8498 
8499     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8500       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8501     }
8502 
8503     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8504       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8505     }
8506 
8507     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
8508         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
8509       StubRoutines::_hf2f = generate_float16ToFloat();
8510       StubRoutines::_f2hf = generate_floatToFloat16();
8511     }
8512 
8513     if (InlineTypeReturnedAsFields) {
8514       StubRoutines::_load_inline_type_fields_in_regs =
8515          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
8516       StubRoutines::_store_inline_type_fields_to_buf =
8517          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
8518     }
8519 
8520   }
8521 
8522   void generate_continuation_stubs() {
8523     // Continuation stubs:
8524     StubRoutines::_cont_thaw          = generate_cont_thaw();
8525     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8526     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8527 
8528     JFR_ONLY(generate_jfr_stubs();)
8529   }
8530 
8531 #if INCLUDE_JFR
8532   void generate_jfr_stubs() {
8533     StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();
8534     StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();
8535     StubRoutines::_jfr_return_lease_stub = generate_jfr_return_lease();
8536     StubRoutines::_jfr_return_lease = StubRoutines::_jfr_return_lease_stub->entry_point();
8537   }
8538 #endif // INCLUDE_JFR
8539 
8540   void generate_final_stubs() {
8541     // support for verify_oop (must happen after universe_init)
8542     if (VerifyOops) {
8543       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8544     }
8545     StubRoutines::_throw_AbstractMethodError_entry =
8546       generate_throw_exception("AbstractMethodError throw_exception",
8547                                CAST_FROM_FN_PTR(address,
8548                                                 SharedRuntime::
8549                                                 throw_AbstractMethodError));
8550 
8551     StubRoutines::_throw_IncompatibleClassChangeError_entry =
8552       generate_throw_exception("IncompatibleClassChangeError throw_exception",
8553                                CAST_FROM_FN_PTR(address,
8554                                                 SharedRuntime::
8555                                                 throw_IncompatibleClassChangeError));
8556 
8557     StubRoutines::_throw_NullPointerException_at_call_entry =
8558       generate_throw_exception("NullPointerException at call throw_exception",
8559                                CAST_FROM_FN_PTR(address,
8560                                                 SharedRuntime::
8561                                                 throw_NullPointerException_at_call));
8562 
8563     // arraycopy stubs used by compilers
8564     generate_arraycopy_stubs();
8565 
8566     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8567     if (bs_nm != nullptr) {
8568       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
8569     }
8570 
8571     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8572 
8573     if (UsePoly1305Intrinsics) {
8574       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
8575     }
8576 
8577 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8578 
8579     generate_atomic_entry_points();
8580 
8581 #endif // LINUX && !__ARM_FEATURE_ATOMICS
8582 
8583     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
8584 
8585     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
8586   }
8587 
8588   void generate_compiler_stubs() {
8589 #if COMPILER2_OR_JVMCI
8590 
8591     if (UseSVE == 0) {
8592       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
8593     }
8594 
8595     // array equals stub for large arrays.
8596     if (!UseSimpleArrayEquals) {
8597       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8598     }
8599 
8600     // byte_array_inflate stub for large arrays.
8601     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8602 
8603     // countPositives stub for large arrays.
8604     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8605 
8606     generate_compare_long_strings();
8607 
8608     generate_string_indexof_stubs();
8609 
8610 #ifdef COMPILER2
8611     if (UseMultiplyToLenIntrinsic) {
8612       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8613     }
8614 
8615     if (UseSquareToLenIntrinsic) {
8616       StubRoutines::_squareToLen = generate_squareToLen();
8617     }
8618 
8619     if (UseMulAddIntrinsic) {
8620       StubRoutines::_mulAdd = generate_mulAdd();
8621     }
8622 
8623     if (UseSIMDForBigIntegerShiftIntrinsics) {
8624       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8625       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8626     }
8627 
8628     if (UseMontgomeryMultiplyIntrinsic) {
8629       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
8630       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8631       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8632     }
8633 
8634     if (UseMontgomerySquareIntrinsic) {
8635       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
8636       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8637       // We use generate_multiply() rather than generate_square()
8638       // because it's faster for the sizes of modulus we care about.
8639       StubRoutines::_montgomerySquare = g.generate_multiply();
8640     }
8641 #endif // COMPILER2
8642 
8643     if (UseChaCha20Intrinsics) {
8644       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
8645     }
8646 
8647     if (UseBASE64Intrinsics) {
8648       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8649       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8650     }
8651 
8652     // data cache line writeback
8653     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8654     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8655 
8656     if (UseAESIntrinsics) {
8657       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8658       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8659       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8660       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8661       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8662     }
8663     if (UseGHASHIntrinsics) {
8664       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8665       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8666     }
8667     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8668       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8669     }
8670 
8671     if (UseMD5Intrinsics) {
8672       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
8673       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
8674     }
8675     if (UseSHA1Intrinsics) {
8676       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
8677       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
8678     }
8679     if (UseSHA256Intrinsics) {
8680       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
8681       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
8682     }
8683     if (UseSHA512Intrinsics) {
8684       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
8685       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
8686     }
8687     if (UseSHA3Intrinsics) {
8688       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
8689       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
8690     }
8691 
8692     // generate Adler32 intrinsics code
8693     if (UseAdler32Intrinsics) {
8694       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8695     }
8696 #endif // COMPILER2_OR_JVMCI
8697   }
8698 
8699  public:
8700   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
8701     switch(kind) {
8702     case Initial_stubs:
8703       generate_initial_stubs();
8704       break;
8705     case Continuation_stubs:
8706       generate_continuation_stubs();
8707       break;
8708     case Compiler_stubs:
8709       generate_compiler_stubs();
8710       break;
8711     case Final_stubs:
8712       generate_final_stubs();
8713       break;
8714     default:
8715       fatal("unexpected stubs kind: %d", kind);
8716       break;
8717     };
8718   }
8719 }; // end class declaration
8720 
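     // Entry point used by the shared StubRoutines initialization code: generates
     // the stub group selected by 'kind' into the supplied CodeBuffer.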
8721 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
8722   StubGenerator g(code, kind);
8723 }
8724 
8725 
8726 #if defined (LINUX)
8727 
8728 // Define pointers to atomic stubs and initialize them to point to the
8729 // code in atomic_aarch64.S.
8730 
8731 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8732   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8733     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8734   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8735     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
8736 
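     // For illustration, the first invocation below, DEFAULT_ATOMIC_OP(fetch_add, 4, ),
     // expands along these lines (sketch, whitespace aside):
     //
     //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
     //     = aarch64_atomic_fetch_add_4_default_impl;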
8737 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8738 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8739 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8740 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8741 DEFAULT_ATOMIC_OP(xchg, 4, )
8742 DEFAULT_ATOMIC_OP(xchg, 8, )
8743 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8744 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8745 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8746 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8747 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8748 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8749 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8750 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8751 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8752 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8753 
8754 #undef DEFAULT_ATOMIC_OP
8755 
8756 #endif // LINUX