1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "prims/upcallLinker.hpp"
  45 #include "runtime/arguments.hpp"
  46 #include "runtime/atomic.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/frame.inline.hpp"
  50 #include "runtime/handles.inline.hpp"
  51 #include "runtime/javaThread.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/stubCodeGenerator.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "utilities/align.hpp"
  56 #include "utilities/checkedCast.hpp"
  57 #include "utilities/debug.hpp"
  58 #include "utilities/globalDefinitions.hpp"
  59 #include "utilities/intpow.hpp"
  60 #include "utilities/powerOfTwo.hpp"
  61 #ifdef COMPILER2
  62 #include "opto/runtime.hpp"
  63 #endif
  64 #if INCLUDE_ZGC
  65 #include "gc/z/zThreadLocalData.hpp"
  66 #endif
  67 
  68 // Declaration and definition of StubGenerator (no .hpp file).
  69 // For a more detailed description of the stub routine structure
  70 // see the comment in stubRoutines.hpp
  71 
  72 #undef __
  73 #define __ _masm->
  74 
  75 #ifdef PRODUCT
  76 #define BLOCK_COMMENT(str) /* nothing */
  77 #else
  78 #define BLOCK_COMMENT(str) __ block_comment(str)
  79 #endif
  80 
  81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  82 
  83 // Stub Code definitions
  84 
  85 class StubGenerator: public StubCodeGenerator {
  86  private:
  87 
  88 #ifdef PRODUCT
  89 #define inc_counter_np(counter) ((void)0)
  90 #else
  91   void inc_counter_np_(uint& counter) {
  92     __ incrementw(ExternalAddress((address)&counter));
  93   }
  94 #define inc_counter_np(counter) \
  95   BLOCK_COMMENT("inc_counter " #counter); \
  96   inc_counter_np_(counter);
  97 #endif
  98 
  99   // Call stubs are used to call Java from C
 100   //
 101   // Arguments:
 102   //    c_rarg0:   call wrapper address                   address
 103   //    c_rarg1:   result                                 address
 104   //    c_rarg2:   result type                            BasicType
 105   //    c_rarg3:   method                                 Method*
 106   //    c_rarg4:   (interpreter) entry point              address
 107   //    c_rarg5:   parameters                             intptr_t*
 108   //    c_rarg6:   parameter size (in words)              int
 109   //    c_rarg7:   thread                                 Thread*
 110   //
 111   // There is no return from the stub itself as any Java result
 112   // is written to result
 113   //
 114   // we save r30 (lr) as the return PC at the base of the frame and
 115   // link r29 (fp) below it as the frame pointer installing sp (r31)
 116   // into fp.
 117   //
 118   // we save r0-r7, which accounts for all the c arguments.
 119   //
 120   // TODO: strictly, do we need to save them all? They are treated as
 121   // volatile by C, so could we omit saving the ones we are going to
 122   // place in global registers (thread? method?) or those we only use
 123   // during setup of the Java call?
 124   //
 125   // we don't need to save r8 which C uses as an indirect result location
 126   // return register.
 127   //
 128   // we don't need to save r9-r15 which both C and Java treat as
 129   // volatile
 130   //
 131   // we don't need to save r16-18 because Java does not use them
 132   //
 133   // we save r19-r28 which Java uses as scratch registers and C
 134   // expects to be callee-save
 135   //
 136   // we save the bottom 64 bits of each value stored in v8-v15; it is
 137   // the responsibility of the caller to preserve larger values.
 138   //
 139   // so the stub frame looks like this when we enter Java code
 140   //
 141   //     [ return_from_Java     ] <--- sp
 142   //     [ argument word n      ]
 143   //      ...
 144   // -29 [ argument word 1      ]
 145   // -28 [ saved Floating-point Control Register ]
 146   // -26 [ saved v15            ] <--- sp_after_call
 147   // -25 [ saved v14            ]
 148   // -24 [ saved v13            ]
 149   // -23 [ saved v12            ]
 150   // -22 [ saved v11            ]
 151   // -21 [ saved v10            ]
 152   // -20 [ saved v9             ]
 153   // -19 [ saved v8             ]
 154   // -18 [ saved r28            ]
 155   // -17 [ saved r27            ]
 156   // -16 [ saved r26            ]
 157   // -15 [ saved r25            ]
 158   // -14 [ saved r24            ]
 159   // -13 [ saved r23            ]
 160   // -12 [ saved r22            ]
 161   // -11 [ saved r21            ]
 162   // -10 [ saved r20            ]
 163   //  -9 [ saved r19            ]
 164   //  -8 [ call wrapper    (r0) ]
 165   //  -7 [ result          (r1) ]
 166   //  -6 [ result type     (r2) ]
 167   //  -5 [ method          (r3) ]
 168   //  -4 [ entry point     (r4) ]
 169   //  -3 [ parameters      (r5) ]
 170   //  -2 [ parameter size  (r6) ]
 171   //  -1 [ thread (r7)          ]
 172   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 173   //   1 [ saved lr       (r30) ]
 174 
 175   // Call stub stack layout word offsets from fp
 176   enum call_stub_layout {
 177     sp_after_call_off  = -28,
 178 
 179     fpcr_off           = sp_after_call_off,
 180     d15_off            = -26,
 181     d13_off            = -24,
 182     d11_off            = -22,
 183     d9_off             = -20,
 184 
 185     r28_off            = -18,
 186     r26_off            = -16,
 187     r24_off            = -14,
 188     r22_off            = -12,
 189     r20_off            = -10,
 190     call_wrapper_off   =  -8,
 191     result_off         =  -7,
 192     result_type_off    =  -6,
 193     method_off         =  -5,
 194     entry_point_off    =  -4,
 195     parameter_size_off =  -2,
 196     thread_off         =  -1,
 197     fp_f               =   0,
 198     retaddr_off        =   1,
 199   };
 200 
 201   address generate_call_stub(address& return_address) {
 202     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 203            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 204            "adjust this code");
 205 
 206     StubCodeMark mark(this, "StubRoutines", "call_stub");
 207     address start = __ pc();
 208 
 209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
 210 
 211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
 212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 213     const Address result        (rfp, result_off         * wordSize);
 214     const Address result_type   (rfp, result_type_off    * wordSize);
 215     const Address method        (rfp, method_off         * wordSize);
 216     const Address entry_point   (rfp, entry_point_off    * wordSize);
 217     const Address parameter_size(rfp, parameter_size_off * wordSize);
 218 
 219     const Address thread        (rfp, thread_off         * wordSize);
 220 
 221     const Address d15_save      (rfp, d15_off * wordSize);
 222     const Address d13_save      (rfp, d13_off * wordSize);
 223     const Address d11_save      (rfp, d11_off * wordSize);
 224     const Address d9_save       (rfp, d9_off * wordSize);
 225 
 226     const Address r28_save      (rfp, r28_off * wordSize);
 227     const Address r26_save      (rfp, r26_off * wordSize);
 228     const Address r24_save      (rfp, r24_off * wordSize);
 229     const Address r22_save      (rfp, r22_off * wordSize);
 230     const Address r20_save      (rfp, r20_off * wordSize);
 231 
 232     // stub code
 233 
 234     address aarch64_entry = __ pc();
 235 
 236     // set up frame and move sp to end of save area
 237     __ enter();
 238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
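         // sp now equals rfp + sp_after_call_off * wordSize, i.e. it points at
         // the lowest slot of the save area (the saved FPCR) in the layout above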
 239 
 240     // save register parameters and Java scratch/global registers
 241     // n.b. we save thread even though it gets installed in
 242     // rthread because we want to sanity check rthread later
 243     __ str(c_rarg7,  thread);
 244     __ strw(c_rarg6, parameter_size);
 245     __ stp(c_rarg4, c_rarg5,  entry_point);
 246     __ stp(c_rarg2, c_rarg3,  result_type);
 247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 248 
 249     __ stp(r20, r19,   r20_save);
 250     __ stp(r22, r21,   r22_save);
 251     __ stp(r24, r23,   r24_save);
 252     __ stp(r26, r25,   r26_save);
 253     __ stp(r28, r27,   r28_save);
 254 
 255     __ stpd(v9,  v8,   d9_save);
 256     __ stpd(v11, v10,  d11_save);
 257     __ stpd(v13, v12,  d13_save);
 258     __ stpd(v15, v14,  d15_save);
 259 
 260     __ get_fpcr(rscratch1);
 261     __ str(rscratch1, fpcr_save);
 262     // Set FPCR to the state we need. We do want Round to Nearest. We
 263     // don't want non-IEEE rounding modes or floating-point traps.
 264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
 265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
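         // bits 8..12 are the IOE/DZE/OFE/UFE/IXE trap-enable bits; with them
         // clear, FP exceptions just set cumulative status flags in FPSR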
 266     __ set_fpcr(rscratch1);
 267 
 268     // install Java thread in global register now we have saved
 269     // whatever value it held
 270     __ mov(rthread, c_rarg7);
 271     // And method
 272     __ mov(rmethod, c_rarg3);
 273 
 274     // set up the heapbase register
 275     __ reinit_heapbase();
 276 
 277 #ifdef ASSERT
 278     // make sure we have no pending exceptions
 279     {
 280       Label L;
 281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 282       __ cmp(rscratch1, (u1)NULL_WORD);
 283       __ br(Assembler::EQ, L);
 284       __ stop("StubRoutines::call_stub: entered with pending exception");
 285       __ BIND(L);
 286     }
 287 #endif
 288     // pass parameters if any
 289     __ mov(esp, sp);
 290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 291     __ andr(sp, rscratch1, -2 * wordSize);
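         // sp is now rounded down to the 16-byte alignment the AArch64 ABI
         // requires, with room below the old sp for the parameter words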
 292 
 293     BLOCK_COMMENT("pass parameters if any");
 294     Label parameters_done;
 295     // parameter count is still in c_rarg6
 296     // and parameter pointer identifying param 1 is in c_rarg5
 297     __ cbzw(c_rarg6, parameters_done);
 298 
 299     address loop = __ pc();
 300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 301     __ subsw(c_rarg6, c_rarg6, 1);
 302     __ push(rscratch1);
 303     __ br(Assembler::GT, loop);
 304 
 305     __ BIND(parameters_done);
 306 
 307     // call Java entry -- passing Method* and current sp
 308     //      rmethod: Method*
 309     //      r19_sender_sp: sender sp
 310     BLOCK_COMMENT("call Java function");
 311     __ mov(r19_sender_sp, sp);
 312     __ blr(c_rarg4);
 313 
 314     // we do this here because the notify will already have been done
 315     // if we get to the next instruction via an exception
 316     //
 317     // n.b. adding this instruction here affects the calculation of
 318     // whether or not a routine returns to the call stub (used when
 319     // doing stack walks) since the normal test is to check the return
 320     // pc against the address saved below. so we may need to allow for
 321     // this extra instruction in the check.
 322 
 323     // save current address for use by exception handling code
 324 
 325     return_address = __ pc();
 326 
 327     // store result depending on type (everything that is not
 328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 329     // n.b. this assumes Java returns an integral result in r0
 330     // and a floating result in j_farg0
 331     __ ldr(j_rarg2, result);
 332     Label is_long, is_float, is_double, exit;
 333     __ ldr(j_rarg1, result_type);
 334     __ cmp(j_rarg1, (u1)T_OBJECT);
 335     __ br(Assembler::EQ, is_long);
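         // a T_OBJECT result is a full 64-bit value in r0, so it is stored by
         // the same 64-bit str as T_LONG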
 336     __ cmp(j_rarg1, (u1)T_LONG);
 337     __ br(Assembler::EQ, is_long);
 338     __ cmp(j_rarg1, (u1)T_FLOAT);
 339     __ br(Assembler::EQ, is_float);
 340     __ cmp(j_rarg1, (u1)T_DOUBLE);
 341     __ br(Assembler::EQ, is_double);
 342 
 343     // handle T_INT case
 344     __ strw(r0, Address(j_rarg2));
 345 
 346     __ BIND(exit);
 347 
 348     // pop parameters
 349     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 350 
 351 #ifdef ASSERT
 352     // verify that threads correspond
 353     {
 354       Label L, S;
 355       __ ldr(rscratch1, thread);
 356       __ cmp(rthread, rscratch1);
 357       __ br(Assembler::NE, S);
 358       __ get_thread(rscratch1);
 359       __ cmp(rthread, rscratch1);
 360       __ br(Assembler::EQ, L);
 361       __ BIND(S);
 362       __ stop("StubRoutines::call_stub: threads must correspond");
 363       __ BIND(L);
 364     }
 365 #endif
 366 
 367     __ pop_cont_fastpath(rthread);
 368 
 369     // restore callee-save registers
 370     __ ldpd(v15, v14,  d15_save);
 371     __ ldpd(v13, v12,  d13_save);
 372     __ ldpd(v11, v10,  d11_save);
 373     __ ldpd(v9,  v8,   d9_save);
 374 
 375     __ ldp(r28, r27,   r28_save);
 376     __ ldp(r26, r25,   r26_save);
 377     __ ldp(r24, r23,   r24_save);
 378     __ ldp(r22, r21,   r22_save);
 379     __ ldp(r20, r19,   r20_save);
 380 
 381     // restore fpcr
 382     __ ldr(rscratch1,  fpcr_save);
 383     __ set_fpcr(rscratch1);
 384 
 385     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 386     __ ldrw(c_rarg2, result_type);
 387     __ ldr(c_rarg3,  method);
 388     __ ldp(c_rarg4, c_rarg5,  entry_point);
 389     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 390 
 391     // leave frame and return to caller
 392     __ leave();
 393     __ ret(lr);
 394 
 395     // handle return types different from T_INT
 396 
 397     __ BIND(is_long);
 398     __ str(r0, Address(j_rarg2, 0));
 399     __ br(Assembler::AL, exit);
 400 
 401     __ BIND(is_float);
 402     __ strs(j_farg0, Address(j_rarg2, 0));
 403     __ br(Assembler::AL, exit);
 404 
 405     __ BIND(is_double);
 406     __ strd(j_farg0, Address(j_rarg2, 0));
 407     __ br(Assembler::AL, exit);
 408 
 409     return start;
 410   }
 411 
 412   // Return point for a Java call if there's an exception thrown in
 413   // Java code.  The exception is caught and transformed into a
 414   // pending exception stored in JavaThread that can be tested from
 415   // within the VM.
 416   //
 417   // Note: Usually the parameters are removed by the callee. In case
 418   // of an exception crossing an activation frame boundary, that is
 419   // not the case if the callee is compiled code => we need to set up
 420   // the sp.
 421   //
 422   // r0: exception oop
 423 
 424   address generate_catch_exception() {
 425     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 426     address start = __ pc();
 427 
 428     // same as in generate_call_stub():
 429     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 430     const Address thread        (rfp, thread_off         * wordSize);
 431 
 432 #ifdef ASSERT
 433     // verify that threads correspond
 434     {
 435       Label L, S;
 436       __ ldr(rscratch1, thread);
 437       __ cmp(rthread, rscratch1);
 438       __ br(Assembler::NE, S);
 439       __ get_thread(rscratch1);
 440       __ cmp(rthread, rscratch1);
 441       __ br(Assembler::EQ, L);
 442       __ bind(S);
 443       __ stop("StubRoutines::catch_exception: threads must correspond");
 444       __ bind(L);
 445     }
 446 #endif
 447 
 448     // set pending exception
 449     __ verify_oop(r0);
 450 
 451     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 452     __ mov(rscratch1, (address)__FILE__);
 453     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 454     __ movw(rscratch1, (int)__LINE__);
 455     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 456 
 457     // complete return to VM
 458     assert(StubRoutines::_call_stub_return_address != nullptr,
 459            "_call_stub_return_address must have been generated before");
 460     __ b(StubRoutines::_call_stub_return_address);
 461 
 462     return start;
 463   }
 464 
 465   // Continuation point for runtime calls returning with a pending
 466   // exception.  The pending exception check happened in the runtime
 467   // or native call stub.  The pending exception in Thread is
 468   // converted into a Java-level exception.
 469   //
 470   // Contract with Java-level exception handlers:
 471   // r0: exception
 472   // r3: throwing pc
 473   //
 474   // NOTE: At entry of this stub, exception-pc must be in LR !!
 475 
 476   // NOTE: this is always used as a jump target within generated code
 477   // so it just needs to be generated code with no prolog
 478 
 479   address generate_forward_exception() {
 480     StubCodeMark mark(this, "StubRoutines", "forward exception");
 481     address start = __ pc();
 482 
 483     // Upon entry, LR points to the return address returning into
 484     // Java (interpreted or compiled) code; i.e., the return address
 485     // becomes the throwing pc.
 486     //
 487     // Arguments pushed before the runtime call are still on the stack
 488     // but the exception handler will reset the stack pointer ->
 489     // ignore them.  A potential result in registers can be ignored as
 490     // well.
 491 
 492 #ifdef ASSERT
 493     // make sure this code is only executed if there is a pending exception
 494     {
 495       Label L;
 496       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 497       __ cbnz(rscratch1, L);
 498       __ stop("StubRoutines::forward exception: no pending exception (1)");
 499       __ bind(L);
 500     }
 501 #endif
 502 
 503     // compute exception handler into r19
 504 
 505     // call the VM to find the handler address associated with the
 506     // caller address. pass thread in r0 and caller pc (ret address)
 507     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 508     // the stack.
 509     __ mov(c_rarg1, lr);
 510     // lr will be trashed by the VM call so we move it to R19
 511     // (callee-saved) because we also need to pass it to the handler
 512     // returned by this call.
 513     __ mov(r19, lr);
 514     BLOCK_COMMENT("call exception_handler_for_return_address");
 515     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 516                          SharedRuntime::exception_handler_for_return_address),
 517                     rthread, c_rarg1);
 518     // Reinitialize the ptrue predicate register, in case the external runtime
 519     // call clobbers ptrue reg, as we may return to SVE compiled code.
 520     __ reinitialize_ptrue();
 521 
 522     // we should not really care that lr is no longer the callee
 523     // address. we saved the value the handler needs in r19 so we can
 524     // just copy it to r3. however, the C2 handler will push its own
 525     // frame and then call into the VM and the VM code asserts that
 526     // the PC for the frame above the handler belongs to a compiled
 527     // Java method. So, we restore lr here to satisfy that assert.
 528     __ mov(lr, r19);
 529     // setup r0 & r3 & clear pending exception
 530     __ mov(r3, r19);
 531     __ mov(r19, r0);
 532     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 533     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 534 
 535 #ifdef ASSERT
 536     // make sure exception is set
 537     {
 538       Label L;
 539       __ cbnz(r0, L);
 540       __ stop("StubRoutines::forward exception: no pending exception (2)");
 541       __ bind(L);
 542     }
 543 #endif
 544 
 545     // continue at exception handler
 546     // r0: exception
 547     // r3: throwing pc
 548     // r19: exception handler
 549     __ verify_oop(r0);
 550     __ br(r19);
 551 
 552     return start;
 553   }
 554 
 555   // Non-destructive plausibility checks for oops
 556   //
 557   // Arguments:
 558   //    r0: oop to verify
 559   //    rscratch1: error message
 560   //
 561   // Stack after saving c_rarg3:
 562   //    [tos + 0]: saved c_rarg3
 563   //    [tos + 1]: saved c_rarg2
 564   //    [tos + 2]: saved lr
 565   //    [tos + 3]: saved rscratch2
 566   //    [tos + 4]: saved r0
 567   //    [tos + 5]: saved rscratch1
 568   address generate_verify_oop() {
 569 
 570     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 571     address start = __ pc();
 572 
 573     Label exit, error;
 574 
 575     // save c_rarg2 and c_rarg3
 576     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 577 
 578     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 579     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 580     __ ldr(c_rarg3, Address(c_rarg2));
 581     __ add(c_rarg3, c_rarg3, 1);
 582     __ str(c_rarg3, Address(c_rarg2));
 583 
 584     // object is in r0
 585     // make sure object is 'reasonable'
 586     __ cbz(r0, exit); // if obj is null it is OK
 587 
 588     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 589     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 590 
 591     // return if everything seems ok
 592     __ bind(exit);
 593 
 594     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 595     __ ret(lr);
 596 
 597     // handle errors
 598     __ bind(error);
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600 
 601     __ push(RegSet::range(r0, r29), sp);
 602     // debug(char* msg, int64_t pc, int64_t regs[])
 603     __ mov(c_rarg0, rscratch1);      // pass address of error message
 604     __ mov(c_rarg1, lr);             // pass return address
 605     __ mov(c_rarg2, sp);             // pass address of regs on stack
 606 #ifndef PRODUCT
 607     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 608 #endif
 609     BLOCK_COMMENT("call MacroAssembler::debug");
 610     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 611     __ blr(rscratch1);
 612     __ hlt(0);
 613 
 614     return start;
 615   }
 616 
 617   // Generate indices for iota vector.
 618   address generate_iota_indices(const char *stub_name) {
 619     __ align(CodeEntryAlignment);
 620     StubCodeMark mark(this, "StubRoutines", stub_name);
 621     address start = __ pc();
 622     // B
 623     __ emit_data64(0x0706050403020100, relocInfo::none);
 624     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 625     // H
 626     __ emit_data64(0x0003000200010000, relocInfo::none);
 627     __ emit_data64(0x0007000600050004, relocInfo::none);
 628     // S
 629     __ emit_data64(0x0000000100000000, relocInfo::none);
 630     __ emit_data64(0x0000000300000002, relocInfo::none);
 631     // D
 632     __ emit_data64(0x0000000000000000, relocInfo::none);
 633     __ emit_data64(0x0000000000000001, relocInfo::none);
 634     // S - FP
 635     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 636     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 637     // D - FP
 638     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 639     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 640     return start;
 641   }
 642 
 643   // The inner part of zero_words().  This is the bulk operation,
 644   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 645   // caller is responsible for zeroing the last few words.
 646   //
 647   // Inputs:
 648   // r10: the HeapWord-aligned base address of an array to zero.
 649   // r11: the count in HeapWords, r11 > 0.
 650   //
 651   // Returns r10 and r11, adjusted for the caller to clear.
 652   // r10: the base address of the tail of words left to clear.
 653   // r11: the number of words in the tail.
 654   //      r11 < MacroAssembler::zero_words_block_size.
 655 
 656   address generate_zero_blocks() {
 657     Label done;
 658     Label base_aligned;
 659 
 660     Register base = r10, cnt = r11;
 661 
 662     __ align(CodeEntryAlignment);
 663     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 664     address start = __ pc();
 665 
 666     if (UseBlockZeroing) {
 667       int zva_length = VM_Version::zva_length();
 668 
 669       // Ensure ZVA length can be divided by 16. This is required by
 670       // the subsequent operations.
 671       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 672 
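           // if base is only 8-byte aligned (bit 3 set), zero one word first so
           // that base is 16-byte aligned for the block-zeroing code below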
 673       __ tbz(base, 3, base_aligned);
 674       __ str(zr, Address(__ post(base, 8)));
 675       __ sub(cnt, cnt, 1);
 676       __ bind(base_aligned);
 677 
 678       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 679       // alignment.
 680       Label small;
 681       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
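           // low_limit is in bytes while cnt is in words, hence the shift by
           // LogBytesPerWord (3) in the comparison below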
 682       __ subs(rscratch1, cnt, low_limit >> 3);
 683       __ br(Assembler::LT, small);
 684       __ zero_dcache_blocks(base, cnt);
 685       __ bind(small);
 686     }
 687 
 688     {
 689       // Number of stp instructions we'll unroll
 690       const int unroll =
 691         MacroAssembler::zero_words_block_size / 2;
 692       // Clear the remaining blocks.
 693       Label loop;
 694       __ subs(cnt, cnt, unroll * 2);
 695       __ br(Assembler::LT, done);
 696       __ bind(loop);
 697       for (int i = 0; i < unroll; i++)
 698         __ stp(zr, zr, __ post(base, 16));
 699       __ subs(cnt, cnt, unroll * 2);
 700       __ br(Assembler::GE, loop);
 701       __ bind(done);
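           // cnt currently holds (remaining - unroll * 2); add the bias back so
           // the caller sees the true tail count, < zero_words_block_size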
 702       __ add(cnt, cnt, unroll * 2);
 703     }
 704 
 705     __ ret(lr);
 706 
 707     return start;
 708   }
 709 
 710 
 711   typedef enum {
 712     copy_forwards = 1,
 713     copy_backwards = -1
 714   } copy_direction;
 715 
 716   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 717   // for arraycopy stubs.
 718   class ArrayCopyBarrierSetHelper : StackObj {
 719     BarrierSetAssembler* _bs_asm;
 720     MacroAssembler* _masm;
 721     DecoratorSet _decorators;
 722     BasicType _type;
 723     Register _gct1;
 724     Register _gct2;
 725     Register _gct3;
 726     FloatRegister _gcvt1;
 727     FloatRegister _gcvt2;
 728     FloatRegister _gcvt3;
 729 
 730   public:
 731     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 732                               DecoratorSet decorators,
 733                               BasicType type,
 734                               Register gct1,
 735                               Register gct2,
 736                               Register gct3,
 737                               FloatRegister gcvt1,
 738                               FloatRegister gcvt2,
 739                               FloatRegister gcvt3)
 740       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 741         _masm(masm),
 742         _decorators(decorators),
 743         _type(type),
 744         _gct1(gct1),
 745         _gct2(gct2),
 746         _gct3(gct3),
 747         _gcvt1(gcvt1),
 748         _gcvt2(gcvt2),
 749         _gcvt3(gcvt3) {
 750     }
 751 
 752     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 753       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 754                             dst1, dst2, src,
 755                             _gct1, _gct2, _gcvt1);
 756     }
 757 
 758     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 759       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 760                              dst, src1, src2,
 761                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 762     }
 763 
 764     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 765       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 766                             dst1, dst2, src,
 767                             _gct1);
 768     }
 769 
 770     void copy_store_at_16(Address dst, Register src1, Register src2) {
 771       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 772                              dst, src1, src2,
 773                              _gct1, _gct2, _gct3);
 774     }
 775 
 776     void copy_load_at_8(Register dst, Address src) {
 777       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 778                             dst, noreg, src,
 779                             _gct1);
 780     }
 781 
 782     void copy_store_at_8(Address dst, Register src) {
 783       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 784                              dst, src, noreg,
 785                              _gct1, _gct2, _gct3);
 786     }
 787   };
 788 
 789   // Bulk copy of blocks of 8 words.
 790   //
 791   // count is a count of words.
 792   //
 793   // Precondition: count >= 8
 794   //
 795   // Postconditions:
 796   //
 797   // The least significant bit of count contains the remaining count
 798   // of words to copy.  The rest of count is trash.
 799   //
 800   // s and d are adjusted to point to the remaining words to copy
 801   //
 802   void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
 803                            copy_direction direction) {
 804     int unit = wordSize * direction;
 805     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 806 
 807     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 808       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 809     const Register stride = r14;
 810     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 811     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 812     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 813 
 814     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 815     assert_different_registers(s, d, count, rscratch1, rscratch2);
 816 
 817     Label again, drain;
 818     const char *stub_name;
 819     if (direction == copy_forwards)
 820       stub_name = "forward_copy_longs";
 821     else
 822       stub_name = "backward_copy_longs";
 823 
 824     __ align(CodeEntryAlignment);
 825 
 826     StubCodeMark mark(this, "StubRoutines", stub_name);
 827 
 828     __ bind(start);
 829 
 830     Label unaligned_copy_long;
 831     if (AvoidUnalignedAccesses) {
 832       __ tbnz(d, 3, unaligned_copy_long);
 833     }
 834 
 835     if (direction == copy_forwards) {
 836       __ sub(s, s, bias);
 837       __ sub(d, d, bias);
 838     }
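         // biasing s and d down by one batch lets the copy loop below use
         // positive offsets (2/4/6/8 * unit scalar, 4 * unit SIMD) and advance
         // each pointer with a single pre-indexed update of 8 * unit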
 839 
 840 #ifdef ASSERT
 841     // Make sure we are never given < 8 words
 842     {
 843       Label L;
 844       __ cmp(count, (u1)8);
 845       __ br(Assembler::GE, L);
 846       __ stop("generate_copy_longs called with < 8 words");
 847       __ bind(L);
 848     }
 849 #endif
 850 
 851     // Fill 8 registers
 852     if (UseSIMDForMemoryOps) {
 853       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 854       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 855     } else {
 856       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 857       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 858       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 859       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 860     }
 861 
 862     __ subs(count, count, 16);
 863     __ br(Assembler::LO, drain);
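         // software pipelining: eight words are already in registers; if fewer
         // than eight more remain, skip the main loop and just drain them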
 864 
 865     int prefetch = PrefetchCopyIntervalInBytes;
 866     bool use_stride = false;
 867     if (direction == copy_backwards) {
 868        use_stride = prefetch > 256;
 869        prefetch = -prefetch;
 870        if (use_stride) __ mov(stride, prefetch);
 871     }
 872 
 873     __ bind(again);
 874 
 875     if (PrefetchCopyIntervalInBytes > 0)
 876       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 877 
 878     if (UseSIMDForMemoryOps) {
 879       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 880       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 881       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 882       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 883     } else {
 884       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 885       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 886       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 887       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 888       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 889       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 890       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 891       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 892     }
 893 
 894     __ subs(count, count, 8);
 895     __ br(Assembler::HS, again);
 896 
 897     // Drain
 898     __ bind(drain);
 899     if (UseSIMDForMemoryOps) {
 900       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 901       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 902     } else {
 903       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 904       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 905       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 906       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 907     }
 908 
 909     {
 910       Label L1, L2;
 911       __ tbz(count, exact_log2(4), L1);
 912       if (UseSIMDForMemoryOps) {
 913         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 914         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 915       } else {
 916         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 917         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 918         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 919         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 920       }
 921       __ bind(L1);
 922 
 923       if (direction == copy_forwards) {
 924         __ add(s, s, bias);
 925         __ add(d, d, bias);
 926       }
 927 
 928       __ tbz(count, 1, L2);
 929       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 930       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 931       __ bind(L2);
 932     }
 933 
 934     __ ret(lr);
 935 
 936     if (AvoidUnalignedAccesses) {
 937       Label drain, again;
 938       // Register order for storing. Order is different for backward copy.
 939 
 940       __ bind(unaligned_copy_long);
 941 
 942       // source address is even aligned, target odd aligned
 943       //
 944       // when forward copying word pairs we read long pairs at offsets
 945       // {0, 2, 4, 6} (in long words). when backwards copying we read
 946       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 947       // address by -2 in the forwards case so we can compute the
 948       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 949       // or -1.
 950       //
 951       // when forward copying we need to store 1 word, 3 pairs and
 952       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 953       // zero offset we adjust the destination by -1, which means we
 954       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 955       //
 956       // When backwards copying we need to store 1 word, 3 pairs and
 957       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 958       // offsets {1, 3, 5, 7, 8} * unit.
 959 
 960       if (direction == copy_forwards) {
 961         __ sub(s, s, 16);
 962         __ sub(d, d, 8);
 963       }
 964 
 965       // Fill 8 registers
 966       //
 967       // for forwards copy s was offset by -16 from the original input
 968       // value of s so the register contents are at these offsets
 969       // relative to the 64 byte block addressed by that original input
 970       // and so on for each successive 64 byte block when s is updated
 971       //
 972       // t0 at offset 0,  t1 at offset 8
 973       // t2 at offset 16, t3 at offset 24
 974       // t4 at offset 32, t5 at offset 40
 975       // t6 at offset 48, t7 at offset 56
 976 
 977       // for backwards copy s was not offset so the register contents
 978       // are at these offsets into the preceding 64 byte block
 979       // relative to that original input and so on for each successive
 980       // preceding 64 byte block when s is updated. this explains the
 981       // slightly counter-intuitive looking pattern of register usage
 982       // in the stp instructions for backwards copy.
 983       //
 984       // t0 at offset -16, t1 at offset -8
 985       // t2 at offset -32, t3 at offset -24
 986       // t4 at offset -48, t5 at offset -40
 987       // t6 at offset -64, t7 at offset -56
 988 
 989       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 990       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 991       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 992       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 993 
 994       __ subs(count, count, 16);
 995       __ br(Assembler::LO, drain);
 996 
 997       int prefetch = PrefetchCopyIntervalInBytes;
 998       bool use_stride = false;
 999       if (direction == copy_backwards) {
1000          use_stride = prefetch > 256;
1001          prefetch = -prefetch;
1002          if (use_stride) __ mov(stride, prefetch);
1003       }
1004 
1005       __ bind(again);
1006 
1007       if (PrefetchCopyIntervalInBytes > 0)
1008         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1009 
1010       if (direction == copy_forwards) {
1011        // allowing for the offset of -8 the store instructions place
1012        // registers into the target 64 byte block at the following
1013        // offsets
1014        //
1015        // t0 at offset 0
1016        // t1 at offset 8,  t2 at offset 16
1017        // t3 at offset 24, t4 at offset 32
1018        // t5 at offset 40, t6 at offset 48
1019        // t7 at offset 56
1020 
1021         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1022         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1023         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1024         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1025         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1026         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1027         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1028         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1029         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1030       } else {
1031        // d was not offset when we started so the registers are
1032        // written into the 64 byte block preceding d with the following
1033        // offsets
1034        //
1035        // t1 at offset -8
1036        // t3 at offset -24, t0 at offset -16
1037        // t5 at offset -40, t2 at offset -32
1038        // t7 at offset -56, t4 at offset -48
1039        //                   t6 at offset -64
1040        //
1041        // note that this matches the offsets previously noted for the
1042        // loads
1043 
1044         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1045         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1046         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1047         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1048         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1049         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1050         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1051         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1052         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1053       }
1054 
1055       __ subs(count, count, 8);
1056       __ br(Assembler::HS, again);
1057 
1058       // Drain
1059       //
1060       // this uses the same pattern of offsets and register arguments
1061       // as above
1062       __ bind(drain);
1063       if (direction == copy_forwards) {
1064         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1065         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1066         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1067         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1068         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1069       } else {
1070         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1071         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1072         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1073         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1074         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1075       }
1076       // now we need to copy any remaining partial block, which may
1077       // include a 4 word subblock and/or a 2 word subblock.
1078       // bits 2 and 1 in the count tell us whether we have each such
1079       // subblock
1080       {
1081         Label L1, L2;
1082         __ tbz(count, exact_log2(4), L1);
1083        // this is the same as above but copying only 4 longs hence
1084        // with only one intervening stp between the str instructions
1085        // but note that the offsets and registers still follow the
1086        // same pattern
1087         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1088         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1089         if (direction == copy_forwards) {
1090           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1091           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1092           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1093         } else {
1094           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1095           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1096           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1097         }
1098         __ bind(L1);
1099 
1100         __ tbz(count, 1, L2);
1101        // this is the same as above but copying only 2 longs hence
1102        // there is no intervening stp between the str instructions
1103        // but note that the offset and register patterns are still
1104        // the same
1105         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1106         if (direction == copy_forwards) {
1107           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1108           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1109         } else {
1110           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1111           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1112         }
1113         __ bind(L2);
1114 
1115        // for forwards copy we need to re-adjust the offsets we
1116        // applied so that s and d follow the last words written
1117 
1118        if (direction == copy_forwards) {
1119          __ add(s, s, 16);
1120          __ add(d, d, 8);
1121        }
1122 
1123       }
1124 
1125       __ ret(lr);
1126       }
1127   }
1128 
1129   // Small copy: less than 16 bytes.
1130   //
1131   // NB: Ignores all of the bits of count which represent more than 15
1132   // bytes, so a caller doesn't have to mask them.
1133 
1134   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1135     bool is_backwards = step < 0;
1136     size_t granularity = uabs(step);
1137     int direction = is_backwards ? -1 : 1;
1138 
1139     Label Lword, Lint, Lshort, Lbyte;
1140 
1141     assert(granularity
1142            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1143 
1144     const Register t0 = r3;
1145     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1146     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1147 
1148     // ??? I don't know if this bit-test-and-branch is the right thing
1149     // to do.  It does a lot of jumping, resulting in several
1150     // mispredicted branches.  It might make more sense to do this
1151     // with something like Duff's device with a single computed branch.
1152 
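         // each tbz below tests the bit of count that corresponds to an 8-, 4-,
         // 2- or 1-byte chunk (scaled by the element granularity) and copies
         // that chunk if the bit is set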
1153     __ tbz(count, 3 - exact_log2(granularity), Lword);
1154     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1155     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1156     __ bind(Lword);
1157 
1158     if (granularity <= sizeof (jint)) {
1159       __ tbz(count, 2 - exact_log2(granularity), Lint);
1160       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1161       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1162       __ bind(Lint);
1163     }
1164 
1165     if (granularity <= sizeof (jshort)) {
1166       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1167       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1168       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1169       __ bind(Lshort);
1170     }
1171 
1172     if (granularity <= sizeof (jbyte)) {
1173       __ tbz(count, 0, Lbyte);
1174       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1175       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1176       __ bind(Lbyte);
1177     }
1178   }
1179 
1180   Label copy_f, copy_b;
1181   Label copy_obj_f, copy_obj_b;
1182   Label copy_obj_uninit_f, copy_obj_uninit_b;
1183 
1184   // All-singing all-dancing memory copy.
1185   //
1186   // Copy count units of memory from s to d.  The size of a unit is
1187   // step, which can be positive or negative depending on the direction
1188   // of copy.  If is_aligned is false, we align the source address.
1189   //
1190 
1191   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1192                    Register s, Register d, Register count, int step) {
1193     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1194     bool is_backwards = step < 0;
1195     unsigned int granularity = uabs(step);
1196     const Register t0 = r3, t1 = r4;
1197 
1198     // Copies of <= 80 (or 96 with SIMD) bytes are done inline. Direction doesn't
1199     // matter because we always load all the data before writing anything.
1200     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1201     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1202     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1203     const Register send = r17, dend = r16;
1204     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1205     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1206     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1207 
1208     if (PrefetchCopyIntervalInBytes > 0)
1209       __ prfm(Address(s, 0), PLDL1KEEP);
1210     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1211     __ br(Assembler::HI, copy_big);
1212 
1213     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1214     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1215 
1216     __ cmp(count, u1(16/granularity));
1217     __ br(Assembler::LS, copy16);
1218 
1219     __ cmp(count, u1(64/granularity));
1220     __ br(Assembler::HI, copy80);
1221 
1222     __ cmp(count, u1(32/granularity));
1223     __ br(Assembler::LS, copy32);
1224 
1225     // 33..64 bytes
1226     if (UseSIMDForMemoryOps) {
1227       bs.copy_load_at_32(v0, v1, Address(s, 0));
1228       bs.copy_load_at_32(v2, v3, Address(send, -32));
1229       bs.copy_store_at_32(Address(d, 0), v0, v1);
1230       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1231     } else {
1232       bs.copy_load_at_16(t0, t1, Address(s, 0));
1233       bs.copy_load_at_16(t2, t3, Address(s, 16));
1234       bs.copy_load_at_16(t4, t5, Address(send, -32));
1235       bs.copy_load_at_16(t6, t7, Address(send, -16));
1236 
1237       bs.copy_store_at_16(Address(d, 0), t0, t1);
1238       bs.copy_store_at_16(Address(d, 16), t2, t3);
1239       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1240       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1241     }
1242     __ b(finish);
1243 
1244     // 17..32 bytes
1245     __ bind(copy32);
1246     bs.copy_load_at_16(t0, t1, Address(s, 0));
1247     bs.copy_load_at_16(t6, t7, Address(send, -16));
1248 
1249     bs.copy_store_at_16(Address(d, 0), t0, t1);
1250     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1251     __ b(finish);
1252 
1253     // 65..80/96 bytes
1254     // (96 bytes if SIMD because we do 32 bytes per instruction)
1255     __ bind(copy80);
1256     if (UseSIMDForMemoryOps) {
1257       bs.copy_load_at_32(v0, v1, Address(s, 0));
1258       bs.copy_load_at_32(v2, v3, Address(s, 32));
1259       // Unaligned pointers can be an issue for copying.
1260       // The issue is more likely to occur when the granularity of the data is
1261       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1262       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1263       // The largest performance drop has been seen for the 65-80 byte range.
1264       // For such cases, using a pair of ldp/stp instead of the third pair of
1265       // ldpq/stpq fixes the performance issue.
1266       if (granularity < sizeof (jint)) {
1267         Label copy96;
1268         __ cmp(count, u1(80/granularity));
1269         __ br(Assembler::HI, copy96);
1270         bs.copy_load_at_16(t0, t1, Address(send, -16));
1271 
1272         bs.copy_store_at_32(Address(d, 0), v0, v1);
1273         bs.copy_store_at_32(Address(d, 32), v2, v3);
1274 
1275         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1276         __ b(finish);
1277 
1278         __ bind(copy96);
1279       }
1280       bs.copy_load_at_32(v4, v5, Address(send, -32));
1281 
1282       bs.copy_store_at_32(Address(d, 0), v0, v1);
1283       bs.copy_store_at_32(Address(d, 32), v2, v3);
1284 
1285       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1286     } else {
1287       bs.copy_load_at_16(t0, t1, Address(s, 0));
1288       bs.copy_load_at_16(t2, t3, Address(s, 16));
1289       bs.copy_load_at_16(t4, t5, Address(s, 32));
1290       bs.copy_load_at_16(t6, t7, Address(s, 48));
1291       bs.copy_load_at_16(t8, t9, Address(send, -16));
1292 
1293       bs.copy_store_at_16(Address(d, 0), t0, t1);
1294       bs.copy_store_at_16(Address(d, 16), t2, t3);
1295       bs.copy_store_at_16(Address(d, 32), t4, t5);
1296       bs.copy_store_at_16(Address(d, 48), t6, t7);
1297       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1298     }
1299     __ b(finish);
1300 
1301     // 0..16 bytes
1302     __ bind(copy16);
1303     __ cmp(count, u1(8/granularity));
1304     __ br(Assembler::LO, copy8);
1305 
1306     // 8..16 bytes
1307     bs.copy_load_at_8(t0, Address(s, 0));
1308     bs.copy_load_at_8(t1, Address(send, -8));
1309     bs.copy_store_at_8(Address(d, 0), t0);
1310     bs.copy_store_at_8(Address(dend, -8), t1);
1311     __ b(finish);
1312 
1313     if (granularity < 8) {
1314       // 4..7 bytes
1315       __ bind(copy8);
1316       __ tbz(count, 2 - exact_log2(granularity), copy4);
1317       __ ldrw(t0, Address(s, 0));
1318       __ ldrw(t1, Address(send, -4));
1319       __ strw(t0, Address(d, 0));
1320       __ strw(t1, Address(dend, -4));
1321       __ b(finish);
1322       if (granularity < 4) {
1323         // 0..3 bytes
1324         __ bind(copy4);
1325         __ cbz(count, finish); // get rid of 0 case
1326         if (granularity == 2) {
1327           __ ldrh(t0, Address(s, 0));
1328           __ strh(t0, Address(d, 0));
1329         } else { // granularity == 1
1330           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1331           // the first and last byte.
1332           // Handle the 3 byte case by loading and storing base + count/2
1333           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)).
1334           // This does mean that in the 1 byte case we load/store the same
1335           // byte 3 times.
1336           __ lsr(count, count, 1);
1337           __ ldrb(t0, Address(s, 0));
1338           __ ldrb(t1, Address(send, -1));
1339           __ ldrb(t2, Address(s, count));
1340           __ strb(t0, Address(d, 0));
1341           __ strb(t1, Address(dend, -1));
1342           __ strb(t2, Address(d, count));
1343         }
1344         __ b(finish);
1345       }
1346     }
1347 
1348     __ bind(copy_big);
1349     if (is_backwards) {
1350       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1351       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1352     }
1353 
1354     // Now that we've got the small case out of the way, we can align the
1355     // source address on a 2-word boundary.
1356 
1357     // Here we will materialize a count in r15, which is used by copy_memory_small
1358     // and the various generate_copy_longs stubs that we use once the source is
1359     // 2-word aligned. Up until here we have used t9, which aliases r15, but from
1360     // here on that register cannot be used as a temp register, as it contains the count.
1361 
1362     Label aligned;
1363 
1364     if (is_aligned) {
1365       // We may have to adjust by 1 word to get s 2-word-aligned.
1366       __ tbz(s, exact_log2(wordSize), aligned);
1367       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1368       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1369       __ sub(count, count, wordSize/granularity);
1370     } else {
1371       if (is_backwards) {
1372         __ andr(r15, s, 2 * wordSize - 1);
1373       } else {
1374         __ neg(r15, s);
1375         __ andr(r15, r15, 2 * wordSize - 1);
1376       }
1377       // r15 is the byte adjustment needed to align s.
1378       __ cbz(r15, aligned);
1379       int shift = exact_log2(granularity);
1380       if (shift > 0) {
1381         __ lsr(r15, r15, shift);
1382       }
1383       __ sub(count, count, r15);
1384 
1385 #if 0
1386       // ?? This code is only correct for a disjoint copy.  It may or
1387       // may not make sense to use it in that case.
1388 
1389       // Copy the first pair; s and d may not be aligned.
1390       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1391       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1392 
1393       // Align s and d, adjust count
1394       if (is_backwards) {
1395         __ sub(s, s, r15);
1396         __ sub(d, d, r15);
1397       } else {
1398         __ add(s, s, r15);
1399         __ add(d, d, r15);
1400       }
1401 #else
1402       copy_memory_small(decorators, type, s, d, r15, step);
1403 #endif
1404     }
1405 
1406     __ bind(aligned);
1407 
1408     // s is now 2-word-aligned.
1409 
1410     // We have a count of units and some trailing bytes. Adjust the
1411     // count and do a bulk copy of words. If the shift is zero,
1412     // perform a move instead to benefit from zero-latency moves.
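         // For example, a jint copy has granularity 4, so shift == 1 and r15
         // receives count >> 1, the number of 8-byte words; for a jlong copy
         // shift == 0 and the element count is already a word count.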
1413     int shift = exact_log2(wordSize/granularity);
1414     if (shift > 0) {
1415       __ lsr(r15, count, shift);
1416     } else {
1417       __ mov(r15, count);
1418     }
1419     if (direction == copy_forwards) {
1420       if (type != T_OBJECT) {
1421         __ bl(copy_f);
1422       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1423         __ bl(copy_obj_uninit_f);
1424       } else {
1425         __ bl(copy_obj_f);
1426       }
1427     } else {
1428       if (type != T_OBJECT) {
1429         __ bl(copy_b);
1430       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1431         __ bl(copy_obj_uninit_b);
1432       } else {
1433         __ bl(copy_obj_b);
1434       }
1435     }
1436 
1437     // And the tail.
1438     copy_memory_small(decorators, type, s, d, count, step);
1439 
1440     if (granularity >= 8) __ bind(copy8);
1441     if (granularity >= 4) __ bind(copy4);
1442     __ bind(finish);
1443   }
1444 
1445 
1446   void clobber_registers() {
1447 #ifdef ASSERT
1448     RegSet clobbered
1449       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1450     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1451     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1452     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1453       __ mov(*it, rscratch1);
1454     }
1455 #endif
1456 
1457   }
1458 
1459   // Scan over array at a for count oops, verifying each one.
1460   // Preserves a and count, clobbers rscratch1 and rscratch2.
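       // In effect: for (i = 0; i < count; i++) verify_oop(a[i]), decoding
       // narrow oops first when size == 4 (compressed oops).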
1461   void verify_oop_array (int size, Register a, Register count, Register temp) {
1462     Label loop, end;
1463     __ mov(rscratch1, a);
1464     __ mov(rscratch2, zr);
1465     __ bind(loop);
1466     __ cmp(rscratch2, count);
1467     __ br(Assembler::HS, end);
1468     if (size == wordSize) {
1469       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1470       __ verify_oop(temp);
1471     } else {
1472       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1473       __ decode_heap_oop(temp); // calls verify_oop
1474     }
1475     __ add(rscratch2, rscratch2, 1);
1476     __ b(loop);
1477     __ bind(end);
1478   }
1479 
1480   // Arguments:
1481   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1482   //             ignored
1483   //   is_oop  - true => oop array, so generate store check code
1484   //   name    - stub name string
1485   //
1486   // Inputs:
1487   //   c_rarg0   - source array address
1488   //   c_rarg1   - destination array address
1489   //   c_rarg2   - element count, treated as ssize_t, can be zero
1490   //
1491   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1492   // the hardware handle it.  The two dwords within qwords that span
1493   // cache line boundaries will still be loaded and stored atomically.
1494   //
1495   // Side Effects:
1496   //   If 'entry' is non-null, *entry is set to the no-overlap entry point,
1497   //   for use by the corresponding conjoint copy stub.
1498   //
1499   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1500                                   const char *name, bool dest_uninitialized = false) {
1501     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1502     RegSet saved_reg = RegSet::of(s, d, count);
1503     __ align(CodeEntryAlignment);
1504     StubCodeMark mark(this, "StubRoutines", name);
1505     address start = __ pc();
1506     __ enter();
1507 
1508     if (entry != nullptr) {
1509       *entry = __ pc();
1510       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1511       BLOCK_COMMENT("Entry:");
1512     }
1513 
1514     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1515     if (dest_uninitialized) {
1516       decorators |= IS_DEST_UNINITIALIZED;
1517     }
1518     if (aligned) {
1519       decorators |= ARRAYCOPY_ALIGNED;
1520     }
1521 
1522     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1523     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1524 
1525     if (is_oop) {
1526       // save regs before copy_memory
1527       __ push(RegSet::of(d, count), sp);
1528     }
1529     {
1530       // UnsafeMemoryAccess page error: continue after unsafe access
1531       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1532       UnsafeMemoryAccessMark umam(this, add_entry, true);
1533       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1534     }
1535 
1536     if (is_oop) {
1537       __ pop(RegSet::of(d, count), sp);
1538       if (VerifyOops)
1539         verify_oop_array(size, d, count, r16);
1540     }
1541 
1542     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1543 
1544     __ leave();
1545     __ mov(r0, zr); // return 0
1546     __ ret(lr);
1547     return start;
1548   }
1549 
1550   // Arguments:
1551   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1552   //             ignored
1553   //   is_oop  - true => oop array, so generate store check code
1554   //   name    - stub name string
1555   //
1556   // Inputs:
1557   //   c_rarg0   - source array address
1558   //   c_rarg1   - destination array address
1559   //   c_rarg2   - element count, treated as ssize_t, can be zero
1560   //
1561   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1562   // the hardware handle it.  The two dwords within qwords that span
1563   // cache line boundaries will still be loaded and stored atomically.
1564   //
1565   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1566                                  address *entry, const char *name,
1567                                  bool dest_uninitialized = false) {
1568     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1569     RegSet saved_regs = RegSet::of(s, d, count);
1570     StubCodeMark mark(this, "StubRoutines", name);
1571     address start = __ pc();
1572     __ enter();
1573 
1574     if (entry != nullptr) {
1575       *entry = __ pc();
1576       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1577       BLOCK_COMMENT("Entry:");
1578     }
1579 
1580     // use fwd copy when (d-s) above_equal (count*size)
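         // The unsigned comparison also sends the d < s case to the forward
         // copy: there d - s wraps to a large unsigned value, which is
         // above_equal count*size.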
1581     __ sub(rscratch1, d, s);
1582     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1583     __ br(Assembler::HS, nooverlap_target);
1584 
1585     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1586     if (dest_uninitialized) {
1587       decorators |= IS_DEST_UNINITIALIZED;
1588     }
1589     if (aligned) {
1590       decorators |= ARRAYCOPY_ALIGNED;
1591     }
1592 
1593     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1594     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1595 
1596     if (is_oop) {
1597       // save regs before copy_memory
1598       __ push(RegSet::of(d, count), sp);
1599     }
1600     {
1601       // UnsafeMemoryAccess page error: continue after unsafe access
1602       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1603       UnsafeMemoryAccessMark umam(this, add_entry, true);
1604       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1605     }
1606     if (is_oop) {
1607       __ pop(RegSet::of(d, count), sp);
1608       if (VerifyOops)
1609         verify_oop_array(size, d, count, r16);
1610     }
1611     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1612     __ leave();
1613     __ mov(r0, zr); // return 0
1614     __ ret(lr);
1615     return start;
1616   }
1617 
1618   // Arguments:
1619   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1620   //             ignored
1621   //   name    - stub name string
1622   //
1623   // Inputs:
1624   //   c_rarg0   - source array address
1625   //   c_rarg1   - destination array address
1626   //   c_rarg2   - element count, treated as ssize_t, can be zero
1627   //
1628   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1629   // we let the hardware handle it.  The one to eight bytes within words,
1630   // dwords or qwords that span cache line boundaries will still be loaded
1631   // and stored atomically.
1632   //
1640   // Side Effects:
1641   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1642   //   used by generate_conjoint_byte_copy().
1643   //
1644   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1645     const bool not_oop = false;
1646     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1647   }
1648 
1649   // Arguments:
1650   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1651   //             ignored
1652   //   name    - stub name string
1653   //
1654   // Inputs:
1655   //   c_rarg0   - source array address
1656   //   c_rarg1   - destination array address
1657   //   c_rarg2   - element count, treated as ssize_t, can be zero
1658   //
1659   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1660   // we let the hardware handle it.  The one to eight bytes within words,
1661   // dwords or qwords that span cache line boundaries will still be loaded
1662   // and stored atomically.
1663   //
1664   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1665                                       address* entry, const char *name) {
1666     const bool not_oop = false;
1667     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1668   }
1669 
1670   // Arguments:
1671   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1672   //             ignored
1673   //   name    - stub name string
1674   //
1675   // Inputs:
1676   //   c_rarg0   - source array address
1677   //   c_rarg1   - destination array address
1678   //   c_rarg2   - element count, treated as ssize_t, can be zero
1679   //
1680   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1681   // let the hardware handle it.  The two or four words within dwords
1682   // or qwords that span cache line boundaries will still be loaded
1683   // and stored atomically.
1684   //
1685   // Side Effects:
1686   //   disjoint_short_copy_entry is set to the no-overlap entry point
1687   //   used by generate_conjoint_short_copy().
1688   //
1689   address generate_disjoint_short_copy(bool aligned,
1690                                        address* entry, const char *name) {
1691     const bool not_oop = false;
1692     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1693   }
1694 
1695   // Arguments:
1696   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1697   //             ignored
1698   //   name    - stub name string
1699   //
1700   // Inputs:
1701   //   c_rarg0   - source array address
1702   //   c_rarg1   - destination array address
1703   //   c_rarg2   - element count, treated as ssize_t, can be zero
1704   //
1705   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1706   // let the hardware handle it.  The two or four words within dwords
1707   // or qwords that span cache line boundaries will still be loaded
1708   // and stored atomically.
1709   //
1710   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1711                                        address *entry, const char *name) {
1712     const bool not_oop = false;
1713     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1714   }
1715
1716   // Arguments:
1717   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1718   //             ignored
1719   //   name    - stub name string
1720   //
1721   // Inputs:
1722   //   c_rarg0   - source array address
1723   //   c_rarg1   - destination array address
1724   //   c_rarg2   - element count, treated as ssize_t, can be zero
1725   //
1726   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1727   // the hardware handle it.  The two dwords within qwords that span
1728   // cache line boundaries will still be loaded and stored atomically.
1729   //
1730   // Side Effects:
1731   //   disjoint_int_copy_entry is set to the no-overlap entry point
1732   //   used by generate_conjoint_int_oop_copy().
1733   //
1734   address generate_disjoint_int_copy(bool aligned, address *entry,
1735                                          const char *name, bool dest_uninitialized = false) {
1736     const bool not_oop = false;
1737     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1738   }
1739 
1740   // Arguments:
1741   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1742   //             ignored
1743   //   name    - stub name string
1744   //
1745   // Inputs:
1746   //   c_rarg0   - source array address
1747   //   c_rarg1   - destination array address
1748   //   c_rarg2   - element count, treated as ssize_t, can be zero
1749   //
1750   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1751   // the hardware handle it.  The two dwords within qwords that span
1752   // cache line boundaries will still be loaded and stored atomically.
1753   //
1754   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1755                                      address *entry, const char *name,
1756                                      bool dest_uninitialized = false) {
1757     const bool not_oop = false;
1758     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1759   }
1760 
1761 
1762   // Arguments:
1763   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1764   //             ignored
1765   //   name    - stub name string
1766   //
1767   // Inputs:
1768   //   c_rarg0   - source array address
1769   //   c_rarg1   - destination array address
1770   //   c_rarg2   - element count, treated as size_t, can be zero
1771   //
1772   // Side Effects:
1773   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1774   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1775   //
1776   address generate_disjoint_long_copy(bool aligned, address *entry,
1777                                           const char *name, bool dest_uninitialized = false) {
1778     const bool not_oop = false;
1779     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1780   }
1781 
1782   // Arguments:
1783   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1784   //             ignored
1785   //   name    - stub name string
1786   //
1787   // Inputs:
1788   //   c_rarg0   - source array address
1789   //   c_rarg1   - destination array address
1790   //   c_rarg2   - element count, treated as size_t, can be zero
1791   //
1792   address generate_conjoint_long_copy(bool aligned,
1793                                       address nooverlap_target, address *entry,
1794                                       const char *name, bool dest_uninitialized = false) {
1795     const bool not_oop = false;
1796     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1797   }
1798 
1799   // Arguments:
1800   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1801   //             ignored
1802   //   name    - stub name string
1803   //
1804   // Inputs:
1805   //   c_rarg0   - source array address
1806   //   c_rarg1   - destination array address
1807   //   c_rarg2   - element count, treated as size_t, can be zero
1808   //
1809   // Side Effects:
1810   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1811   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1812   //
1813   address generate_disjoint_oop_copy(bool aligned, address *entry,
1814                                      const char *name, bool dest_uninitialized) {
1815     const bool is_oop = true;
1816     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1817     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1818   }
1819 
1820   // Arguments:
1821   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1822   //             ignored
1823   //   name    - stub name string
1824   //
1825   // Inputs:
1826   //   c_rarg0   - source array address
1827   //   c_rarg1   - destination array address
1828   //   c_rarg2   - element count, treated as size_t, can be zero
1829   //
1830   address generate_conjoint_oop_copy(bool aligned,
1831                                      address nooverlap_target, address *entry,
1832                                      const char *name, bool dest_uninitialized) {
1833     const bool is_oop = true;
1834     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1835     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1836                                   name, dest_uninitialized);
1837   }
1838 
1839 
1840   // Helper for generating a dynamic type check.
1841   // Smashes rscratch1, rscratch2.
1842   void generate_type_check(Register sub_klass,
1843                            Register super_check_offset,
1844                            Register super_klass,
1845                            Register temp1,
1846                            Register temp2,
1847                            Register result,
1848                            Label& L_success) {
1849     assert_different_registers(sub_klass, super_check_offset, super_klass);
1850 
1851     BLOCK_COMMENT("type_check:");
1852 
1853     Label L_miss;
1854 
1855     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1856                                      super_check_offset);
1857     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
1858 
1859     // Fall through on failure!
1860     __ BIND(L_miss);
1861   }
1862 
1863   //
1864   //  Generate checkcasting array copy stub
1865   //
1866   //  Input:
1867   //    c_rarg0   - source array address
1868   //    c_rarg1   - destination array address
1869   //    c_rarg2   - element count, treated as ssize_t, can be zero
1870   //    c_rarg3   - size_t ckoff (super_check_offset)
1871   //    c_rarg4   - oop ckval (super_klass)
1872   //
1873   //  Output:
1874   //    r0 ==  0  -  success
1875   //    r0 == -1^K - failure, where K is partial transfer count
1876   //
1877   address generate_checkcast_copy(const char *name, address *entry,
1878                                   bool dest_uninitialized = false) {
1879 
1880     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1881 
1882     // Input registers (after setup_arg_regs)
1883     const Register from        = c_rarg0;   // source array address
1884     const Register to          = c_rarg1;   // destination array address
1885     const Register count       = c_rarg2;   // elements count
1886     const Register ckoff       = c_rarg3;   // super_check_offset
1887     const Register ckval       = c_rarg4;   // super_klass
1888 
1889     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1890     RegSet wb_post_saved_regs = RegSet::of(count);
1891 
1892     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1893     const Register copied_oop  = r22;       // actual oop copied
1894     const Register count_save  = r21;       // orig elements count
1895     const Register start_to    = r20;       // destination array start address
1896     const Register r19_klass   = r19;       // oop._klass
1897 
1898     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1899     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1900 
1901     //---------------------------------------------------------------
1902     // Assembler stub will be used for this call to arraycopy
1903     // if the two arrays are subtypes of Object[] but the
1904     // destination array type is not equal to or a supertype
1905     // of the source type.  Each element must be separately
1906     // checked.
1907 
1908     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1909                                copied_oop, r19_klass, count_save);
1910 
1911     __ align(CodeEntryAlignment);
1912     StubCodeMark mark(this, "StubRoutines", name);
1913     address start = __ pc();
1914 
1915     __ enter(); // required for proper stackwalking of RuntimeStub frame
1916 
1917 #ifdef ASSERT
1918     // caller guarantees that the arrays really are different
1919     // otherwise, we would have to make conjoint checks
1920     { Label L;
1921       __ b(L);                  // conjoint check not yet implemented
1922       __ stop("checkcast_copy within a single array");
1923       __ bind(L);
1924     }
1925 #endif //ASSERT
1926 
1927     // Caller of this entry point must set up the argument registers.
1928     if (entry != nullptr) {
1929       *entry = __ pc();
1930       BLOCK_COMMENT("Entry:");
1931     }
1932 
1933     // Empty array:  Nothing to do.
1934     __ cbz(count, L_done);
1935     __ push(RegSet::of(r19, r20, r21, r22), sp);
1936 
1937 #ifdef ASSERT
1938     BLOCK_COMMENT("assert consistent ckoff/ckval");
1939     // The ckoff and ckval must be mutually consistent,
1940     // even though caller generates both.
1941     { Label L;
1942       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1943       __ ldrw(start_to, Address(ckval, sco_offset));
1944       __ cmpw(ckoff, start_to);
1945       __ br(Assembler::EQ, L);
1946       __ stop("super_check_offset inconsistent");
1947       __ bind(L);
1948     }
1949 #endif //ASSERT
1950 
1951     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1952     bool is_oop = true;
1953     int element_size = UseCompressedOops ? 4 : 8;
1954     if (dest_uninitialized) {
1955       decorators |= IS_DEST_UNINITIALIZED;
1956     }
1957 
1958     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1959     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1960 
1961     // save the original count
1962     __ mov(count_save, count);
1963 
1964     // Copy from low to high addresses
1965     __ mov(start_to, to);              // Save destination array start address
1966     __ b(L_load_element);
1967 
1968     // ======== begin loop ========
1969     // (Loop is rotated; its entry is L_load_element.)
1970     // Loop control:
1971     //   for (; count != 0; count--) {
1972     //     copied_oop = load_heap_oop(from++);
1973     //     ... generate_type_check ...;
1974     //     store_heap_oop(to++, copied_oop);
1975     //   }
1976     __ align(OptoLoopAlignment);
1977 
1978     __ BIND(L_store_element);
1979     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1980                       __ post(to, element_size), copied_oop, noreg,
1981                       gct1, gct2, gct3);
1982     __ sub(count, count, 1);
1983     __ cbz(count, L_do_card_marks);
1984 
1985     // ======== loop entry is here ========
1986     __ BIND(L_load_element);
1987     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1988                      copied_oop, noreg, __ post(from, element_size),
1989                      gct1);
1990     __ cbz(copied_oop, L_store_element);
1991 
1992     __ load_klass(r19_klass, copied_oop);// query the object klass
1993 
1994     BLOCK_COMMENT("type_check:");
1995     generate_type_check(/*sub_klass*/r19_klass,
1996                         /*super_check_offset*/ckoff,
1997                         /*super_klass*/ckval,
1998                         /*r_array_base*/gct1,
1999                         /*temp2*/gct2,
2000                         /*result*/r10, L_store_element);
2001 
2002     // Fall through on failure!
2003 
2004     // ======== end loop ========
2005 
2006     // It was a real error; we must depend on the caller to finish the job.
2007     // Register count = remaining oops, count_orig = total oops.
2008     // Emit GC store barriers for the oops we have copied and report
2009     // their number to the caller.
2010 
2011     __ subs(count, count_save, count);     // K = partially copied oop count
2012     __ eon(count, count, zr);              // report (-1^K) to caller
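         // (eon with zr computes the bitwise NOT, so count now holds ~K, i.e.
         // -1^K. The EQ branch below tests the flags set by the subs above and
         // skips the card marks when no elements were copied.)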
2013     __ br(Assembler::EQ, L_done_pop);
2014 
2015     __ BIND(L_do_card_marks);
2016     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
2017 
2018     __ bind(L_done_pop);
2019     __ pop(RegSet::of(r19, r20, r21, r22), sp);
2020     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2021 
2022     __ bind(L_done);
2023     __ mov(r0, count);
2024     __ leave();
2025     __ ret(lr);
2026 
2027     return start;
2028   }
2029 
2030   // Perform range checks on the proposed arraycopy.
2031   // Kills temp, but nothing else.
2032   // Also, clean the sign bits of src_pos and dst_pos.
2033   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2034                               Register src_pos, // source position (c_rarg1)
2035                               Register dst,     // destination array oop (c_rarg2)
2036                               Register dst_pos, // destination position (c_rarg3)
2037                               Register length,
2038                               Register temp,
2039                               Label& L_failed) {
2040     BLOCK_COMMENT("arraycopy_range_checks:");
2041 
2042     assert_different_registers(rscratch1, temp);
2043 
2044     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2045     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2046     __ addw(temp, length, src_pos);
2047     __ cmpw(temp, rscratch1);
2048     __ br(Assembler::HI, L_failed);
2049 
2050     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2051     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2052     __ addw(temp, length, dst_pos);
2053     __ cmpw(temp, rscratch1);
2054     __ br(Assembler::HI, L_failed);
2055 
2056     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
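         // (Writing a 32-bit W register zeroes the upper 32 bits, so movw with
         // the same source and destination acts as a zero-extension.)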
2057     __ movw(src_pos, src_pos);
2058     __ movw(dst_pos, dst_pos);
2059 
2060     BLOCK_COMMENT("arraycopy_range_checks done");
2061   }
2062 
2063   // These stubs get called from some dumb test routine.
2064   // I'll write them properly when they're called from
2065   // something that's actually doing something.
2066   static void fake_arraycopy_stub(address src, address dst, int count) {
2067     assert(count == 0, "huh?");
2068   }
2069 
2070 
2071   //
2072   //  Generate 'unsafe' array copy stub
2073   //  Though just as safe as the other stubs, it takes an unscaled
2074   //  size_t argument instead of an element count.
2075   //
2076   //  Input:
2077   //    c_rarg0   - source array address
2078   //    c_rarg1   - destination array address
2079   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2080   //
2081   // Examines the alignment of the operands and dispatches
2082   // to a long, int, short, or byte copy loop.
2083   //
2084   address generate_unsafe_copy(const char *name,
2085                                address byte_copy_entry,
2086                                address short_copy_entry,
2087                                address int_copy_entry,
2088                                address long_copy_entry) {
2089     Label L_long_aligned, L_int_aligned, L_short_aligned;
2090     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2091 
2092     __ align(CodeEntryAlignment);
2093     StubCodeMark mark(this, "StubRoutines", name);
2094     address start = __ pc();
2095     __ enter(); // required for proper stackwalking of RuntimeStub frame
2096 
2097     // bump this on entry, not on exit:
2098     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2099 
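         // In effect:
         //   if (((s | d | count) & 7) == 0) goto long copy;
         //   else if (((s | d | count) & 3) == 0) goto int copy;
         //   else if (((s | d | count) & 1) == 0) goto short copy;
         //   else goto byte copy;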
2100     __ orr(rscratch1, s, d);
2101     __ orr(rscratch1, rscratch1, count);
2102 
2103     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2104     __ cbz(rscratch1, L_long_aligned);
2105     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2106     __ cbz(rscratch1, L_int_aligned);
2107     __ tbz(rscratch1, 0, L_short_aligned);
2108     __ b(RuntimeAddress(byte_copy_entry));
2109 
2110     __ BIND(L_short_aligned);
2111     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2112     __ b(RuntimeAddress(short_copy_entry));
2113     __ BIND(L_int_aligned);
2114     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2115     __ b(RuntimeAddress(int_copy_entry));
2116     __ BIND(L_long_aligned);
2117     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2118     __ b(RuntimeAddress(long_copy_entry));
2119 
2120     return start;
2121   }
2122 
2123   //
2124   //  Generate generic array copy stubs
2125   //
2126   //  Input:
2127   //    c_rarg0    -  src oop
2128   //    c_rarg1    -  src_pos (32-bits)
2129   //    c_rarg2    -  dst oop
2130   //    c_rarg3    -  dst_pos (32-bits)
2131   //    c_rarg4    -  element count (32-bits)
2132   //
2133   //  Output:
2134   //    r0 ==  0  -  success
2135   //    r0 == -1^K - failure, where K is partial transfer count
2136   //
2137   address generate_generic_copy(const char *name,
2138                                 address byte_copy_entry, address short_copy_entry,
2139                                 address int_copy_entry, address oop_copy_entry,
2140                                 address long_copy_entry, address checkcast_copy_entry) {
2141 
2142     Label L_failed, L_objArray;
2143     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2144 
2145     // Input registers
2146     const Register src        = c_rarg0;  // source array oop
2147     const Register src_pos    = c_rarg1;  // source position
2148     const Register dst        = c_rarg2;  // destination array oop
2149     const Register dst_pos    = c_rarg3;  // destination position
2150     const Register length     = c_rarg4;
2151 
2152 
2153     // Registers used as temps
2154     const Register dst_klass  = c_rarg5;
2155 
2156     __ align(CodeEntryAlignment);
2157 
2158     StubCodeMark mark(this, "StubRoutines", name);
2159 
2160     address start = __ pc();
2161 
2162     __ enter(); // required for proper stackwalking of RuntimeStub frame
2163 
2164     // bump this on entry, not on exit:
2165     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2166 
2167     //-----------------------------------------------------------------------
2168     // Assembler stub will be used for this call to arraycopy
2169     // if the following conditions are met:
2170     //
2171     // (1) src and dst must not be null.
2172     // (2) src_pos must not be negative.
2173     // (3) dst_pos must not be negative.
2174     // (4) length  must not be negative.
2175     // (5) src klass and dst klass should be the same and not null.
2176     // (6) src and dst should be arrays.
2177     // (7) src_pos + length must not exceed length of src.
2178     // (8) dst_pos + length must not exceed length of dst.
2179     //
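         // If any of these checks fails we return -1, and the caller falls
         // back to the slower, fully checked arraycopy path.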
2180 
2181     //  if (src == nullptr) return -1;
2182     __ cbz(src, L_failed);
2183 
2184     //  if (src_pos < 0) return -1;
2185     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2186 
2187     //  if (dst == nullptr) return -1;
2188     __ cbz(dst, L_failed);
2189 
2190     //  if (dst_pos < 0) return -1;
2191     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2192 
2193     // registers used as temp
2194     const Register scratch_length    = r16; // elements count to copy
2195     const Register scratch_src_klass = r17; // array klass
2196     const Register lh                = r15; // layout helper
2197 
2198     //  if (length < 0) return -1;
2199     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2200     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2201 
2202     __ load_klass(scratch_src_klass, src);
2203 #ifdef ASSERT
2204     //  assert(src->klass() != nullptr);
2205     {
2206       BLOCK_COMMENT("assert klasses not null {");
2207       Label L1, L2;
2208       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2209       __ bind(L1);
2210       __ stop("broken null klass");
2211       __ bind(L2);
2212       __ load_klass(rscratch1, dst);
2213       __ cbz(rscratch1, L1);     // this would be broken also
2214       BLOCK_COMMENT("} assert klasses not null done");
2215     }
2216 #endif
2217 
2218     // Load layout helper (32-bits)
2219     //
2220     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2221     // 32        30    24            16              8     2                 0
2222     //
2223     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2224     //
2225 
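         // For example, a typeArray of jints has array_tag == 0x3,
         // element_type == T_INT, log2_element_size == 2, and header_size
         // equal to the array header size in bytes.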
2226     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2227 
2228     // Handle objArrays completely differently...
2229     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2230     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2231     __ movw(rscratch1, objArray_lh);
2232     __ eorw(rscratch2, lh, rscratch1);
2233     __ cbzw(rscratch2, L_objArray);
2234 
2235     //  if (src->klass() != dst->klass()) return -1;
2236     __ load_klass(rscratch2, dst);
2237     __ eor(rscratch2, rscratch2, scratch_src_klass);
2238     __ cbnz(rscratch2, L_failed);
2239 
2240     //  if (!src->is_Array()) return -1;
2241     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2242 
2243     // At this point, it is known to be a typeArray (array_tag 0x3).
2244 #ifdef ASSERT
2245     {
2246       BLOCK_COMMENT("assert primitive array {");
2247       Label L;
2248       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2249       __ cmpw(lh, rscratch2);
2250       __ br(Assembler::GE, L);
2251       __ stop("must be a primitive array");
2252       __ bind(L);
2253       BLOCK_COMMENT("} assert primitive array done");
2254     }
2255 #endif
2256 
2257     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2258                            rscratch2, L_failed);
2259 
2260     // TypeArrayKlass
2261     //
2262     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2263     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2264     //
2265 
2266     const Register rscratch1_offset = rscratch1;    // array offset
2267     const Register r15_elsize = lh; // element size
2268 
2269     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2270            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2271     __ add(src, src, rscratch1_offset);           // src array offset
2272     __ add(dst, dst, rscratch1_offset);           // dst array offset
2273     BLOCK_COMMENT("choose copy loop based on element size");
2274 
2275     // next registers should be set before the jump to corresponding stub
2276     const Register from     = c_rarg0;  // source array address
2277     const Register to       = c_rarg1;  // destination array address
2278     const Register count    = c_rarg2;  // elements count
2279 
2280     // 'from', 'to' and 'count' must be set in exactly this order,
2281     // since they alias 'src', 'src_pos' and 'dst' respectively.
2282 
2283     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2284 
2285     // The possible values of elsize are 0-3, i.e. exact_log2(element
2286     // size in bytes).  We do a simple bitwise binary search.
2287   __ BIND(L_copy_bytes);
2288     __ tbnz(r15_elsize, 1, L_copy_ints);
2289     __ tbnz(r15_elsize, 0, L_copy_shorts);
2290     __ lea(from, Address(src, src_pos));// src_addr
2291     __ lea(to,   Address(dst, dst_pos));// dst_addr
2292     __ movw(count, scratch_length); // length
2293     __ b(RuntimeAddress(byte_copy_entry));
2294 
2295   __ BIND(L_copy_shorts);
2296     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2297     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2298     __ movw(count, scratch_length); // length
2299     __ b(RuntimeAddress(short_copy_entry));
2300 
2301   __ BIND(L_copy_ints);
2302     __ tbnz(r15_elsize, 0, L_copy_longs);
2303     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2304     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2305     __ movw(count, scratch_length); // length
2306     __ b(RuntimeAddress(int_copy_entry));
2307 
2308   __ BIND(L_copy_longs);
2309 #ifdef ASSERT
2310     {
2311       BLOCK_COMMENT("assert long copy {");
2312       Label L;
2313       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2314       __ cmpw(r15_elsize, LogBytesPerLong);
2315       __ br(Assembler::EQ, L);
2316       __ stop("must be long copy, but elsize is wrong");
2317       __ bind(L);
2318       BLOCK_COMMENT("} assert long copy done");
2319     }
2320 #endif
2321     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2322     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2323     __ movw(count, scratch_length); // length
2324     __ b(RuntimeAddress(long_copy_entry));
2325 
2326     // ObjArrayKlass
2327   __ BIND(L_objArray);
2328     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2329 
2330     Label L_plain_copy, L_checkcast_copy;
2331     //  test array classes for subtyping
2332     __ load_klass(r15, dst);
2333     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2334     __ br(Assembler::NE, L_checkcast_copy);
2335 
2336     // Identically typed arrays can be copied without element-wise checks.
2337     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2338                            rscratch2, L_failed);
2339 
2340     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2341     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2342     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2343     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2344     __ movw(count, scratch_length); // length
2345   __ BIND(L_plain_copy);
2346     __ b(RuntimeAddress(oop_copy_entry));
2347 
2348   __ BIND(L_checkcast_copy);
2349     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2350     {
2351       // Before looking at dst.length, make sure dst is also an objArray.
2352       __ ldrw(rscratch1, Address(r15, lh_offset));
2353       __ movw(rscratch2, objArray_lh);
2354       __ eorw(rscratch1, rscratch1, rscratch2);
2355       __ cbnzw(rscratch1, L_failed);
2356 
2357       // It is safe to examine both src.length and dst.length.
2358       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2359                              r15, L_failed);
2360 
2361       __ load_klass(dst_klass, dst); // reload
2362 
2363       // Marshal the base address arguments now, freeing registers.
2364       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2365       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2366       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2367       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2368       __ movw(count, length);           // length (reloaded)
2369       Register sco_temp = c_rarg3;      // this register is free now
2370       assert_different_registers(from, to, count, sco_temp,
2371                                  dst_klass, scratch_src_klass);
2372       // assert_clean_int(count, sco_temp);
2373 
2374       // Generate the type check.
2375       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2376       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2377 
2378       // Smashes rscratch1, rscratch2
2379       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2380                           L_plain_copy);
2381 
2382       // Fetch destination element klass from the ObjArrayKlass header.
2383       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2384       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2385       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2386 
2387       // the checkcast_copy loop needs two extra arguments:
2388       assert(c_rarg3 == sco_temp, "#3 already in place");
2389       // Set up arguments for checkcast_copy_entry.
2390       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2391       __ b(RuntimeAddress(checkcast_copy_entry));
2392     }
2393 
2394   __ BIND(L_failed);
2395     __ mov(r0, -1);
2396     __ leave();   // required for proper stackwalking of RuntimeStub frame
2397     __ ret(lr);
2398 
2399     return start;
2400   }
2401 
2402   //
2403   // Generate stub for array fill. If "aligned" is true, the
2404   // "to" address is assumed to be heapword aligned.
2405   //
2406   // Arguments for generated stub:
2407   //   to:    c_rarg0
2408   //   value: c_rarg1
2409   //   count: c_rarg2 treated as signed
2410   //
2411   address generate_fill(BasicType t, bool aligned, const char *name) {
2412     __ align(CodeEntryAlignment);
2413     StubCodeMark mark(this, "StubRoutines", name);
2414     address start = __ pc();
2415 
2416     BLOCK_COMMENT("Entry:");
2417 
2418     const Register to        = c_rarg0;  // destination array address
2419     const Register value     = c_rarg1;  // value
2420     const Register count     = c_rarg2;  // elements count
2421 
2422     const Register bz_base = r10;        // base for block_zero routine
2423     const Register cnt_words = r11;      // temp register
2424 
2425     __ enter();
2426 
2427     Label L_fill_elements, L_exit1;
2428 
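         // Widen the fill value so that a single store can write several
         // elements: a byte is replicated to 16 and 32 bits here, and every
         // type is widened to 64 bits below, so the bulk loop stores full words.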
2429     int shift = -1;
2430     switch (t) {
2431       case T_BYTE:
2432         shift = 0;
2433         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2434         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2435         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2436         __ br(Assembler::LO, L_fill_elements);
2437         break;
2438       case T_SHORT:
2439         shift = 1;
2440         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2441         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2442         __ br(Assembler::LO, L_fill_elements);
2443         break;
2444       case T_INT:
2445         shift = 2;
2446         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2447         __ br(Assembler::LO, L_fill_elements);
2448         break;
2449       default: ShouldNotReachHere();
2450     }
2451 
2452     // Align the destination address to an 8-byte boundary.
2453     Label L_skip_align1, L_skip_align2, L_skip_align4;
2454     if (!aligned) {
2455       switch (t) {
2456         case T_BYTE:
2457           // One byte misalignment happens only for byte arrays.
2458           __ tbz(to, 0, L_skip_align1);
2459           __ strb(value, Address(__ post(to, 1)));
2460           __ subw(count, count, 1);
2461           __ bind(L_skip_align1);
2462           // Fallthrough
2463         case T_SHORT:
2464           // Two bytes misalignment happens only for byte and short (char) arrays.
2465           __ tbz(to, 1, L_skip_align2);
2466           __ strh(value, Address(__ post(to, 2)));
2467           __ subw(count, count, 2 >> shift);
2468           __ bind(L_skip_align2);
2469           // Fallthrough
2470         case T_INT:
2471           // Align to 8 bytes; we know we are 4-byte aligned at this point.
2472           __ tbz(to, 2, L_skip_align4);
2473           __ strw(value, Address(__ post(to, 4)));
2474           __ subw(count, count, 4 >> shift);
2475           __ bind(L_skip_align4);
2476           break;
2477         default: ShouldNotReachHere();
2478       }
2479     }
2480 
2481     //
2482     //  Fill large chunks
2483     //
2484     __ lsrw(cnt_words, count, 3 - shift); // number of words
2485     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2486     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2487     if (UseBlockZeroing) {
2488       Label non_block_zeroing, rest;
2489       // If the fill value is zero we can use the fast zero_words().
2490       __ cbnz(value, non_block_zeroing);
2491       __ mov(bz_base, to);
2492       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2493       address tpc = __ zero_words(bz_base, cnt_words);
2494       if (tpc == nullptr) {
2495         fatal("CodeCache is full at generate_fill");
2496       }
2497       __ b(rest);
2498       __ bind(non_block_zeroing);
2499       __ fill_words(to, cnt_words, value);
2500       __ bind(rest);
2501     } else {
2502       __ fill_words(to, cnt_words, value);
2503     }
2504 
2505     // Remaining count is less than 8 bytes. Fill it by a single store.
2506     // Note that the total length is no less than 8 bytes.
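         // The store below writes the last 8 bytes of the array, overlapping
         // elements that were already filled; this is harmless because the
         // register holds the replicated fill value.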
2507     if (t == T_BYTE || t == T_SHORT) {
2508       Label L_exit1;
2509       __ cbzw(count, L_exit1);
2510       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2511       __ str(value, Address(to, -8));    // overwrite some elements
2512       __ bind(L_exit1);
2513       __ leave();
2514       __ ret(lr);
2515     }
2516 
2517     // Handle fills of less than 8 bytes.
2518     Label L_fill_2, L_fill_4, L_exit2;
2519     __ bind(L_fill_elements);
2520     switch (t) {
2521       case T_BYTE:
2522         __ tbz(count, 0, L_fill_2);
2523         __ strb(value, Address(__ post(to, 1)));
2524         __ bind(L_fill_2);
2525         __ tbz(count, 1, L_fill_4);
2526         __ strh(value, Address(__ post(to, 2)));
2527         __ bind(L_fill_4);
2528         __ tbz(count, 2, L_exit2);
2529         __ strw(value, Address(to));
2530         break;
2531       case T_SHORT:
2532         __ tbz(count, 0, L_fill_4);
2533         __ strh(value, Address(__ post(to, 2)));
2534         __ bind(L_fill_4);
2535         __ tbz(count, 1, L_exit2);
2536         __ strw(value, Address(to));
2537         break;
2538       case T_INT:
2539         __ cbzw(count, L_exit2);
2540         __ strw(value, Address(to));
2541         break;
2542       default: ShouldNotReachHere();
2543     }
2544     __ bind(L_exit2);
2545     __ leave();
2546     __ ret(lr);
2547     return start;
2548   }
2549 
2550   address generate_data_cache_writeback() {
2551     const Register line        = c_rarg0;  // address of line to write back
2552 
2553     __ align(CodeEntryAlignment);
2554 
2555     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2556 
2557     address start = __ pc();
2558     __ enter();
2559     __ cache_wb(Address(line, 0));
2560     __ leave();
2561     __ ret(lr);
2562 
2563     return start;
2564   }
2565 
2566   address generate_data_cache_writeback_sync() {
2567     const Register is_pre     = c_rarg0;  // pre or post sync
2568 
2569     __ align(CodeEntryAlignment);
2570 
2571     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2572 
2573     // pre wbsync is a no-op
2574     // post wbsync translates to a store barrier (the equivalent of an x86 sfence)
2575 
2576     Label skip;
2577     address start = __ pc();
2578     __ enter();
2579     __ cbnz(is_pre, skip);
2580     __ cache_wbsync(false);
2581     __ bind(skip);
2582     __ leave();
2583     __ ret(lr);
2584 
2585     return start;
2586   }
2587 
2588   void generate_arraycopy_stubs() {
2589     address entry;
2590     address entry_jbyte_arraycopy;
2591     address entry_jshort_arraycopy;
2592     address entry_jint_arraycopy;
2593     address entry_oop_arraycopy;
2594     address entry_jlong_arraycopy;
2595     address entry_checkcast_arraycopy;
2596 
2597     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards);
2598     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards);
2599 
2600     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards);
2601     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards);
2602 
2603     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards);
2604     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards);
2605 
2606     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2607 
2608     //*** jbyte
2609     // Always need aligned and unaligned versions
2610     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2611                                                                                   "jbyte_disjoint_arraycopy");
2612     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2613                                                                                   &entry_jbyte_arraycopy,
2614                                                                                   "jbyte_arraycopy");
2615     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2616                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2617     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, nullptr,
2618                                                                                   "arrayof_jbyte_arraycopy");
2619 
2620     //*** jshort
2621     // Always need aligned and unaligned versions
2622     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2623                                                                                     "jshort_disjoint_arraycopy");
2624     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2625                                                                                     &entry_jshort_arraycopy,
2626                                                                                     "jshort_arraycopy");
2627     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2628                                                                                     "arrayof_jshort_disjoint_arraycopy");
2629     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2630                                                                                     "arrayof_jshort_arraycopy");
2631 
2632     //*** jint
2633     // Aligned versions
2634     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2635                                                                                 "arrayof_jint_disjoint_arraycopy");
2636     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2637                                                                                 "arrayof_jint_arraycopy");
2638     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2639     // entry_jint_arraycopy always points to the unaligned version
2640     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2641                                                                                 "jint_disjoint_arraycopy");
2642     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2643                                                                                 &entry_jint_arraycopy,
2644                                                                                 "jint_arraycopy");
2645 
2646     //*** jlong
2647     // It is always aligned
2648     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2649                                                                                   "arrayof_jlong_disjoint_arraycopy");
2650     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2651                                                                                   "arrayof_jlong_arraycopy");
2652     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2653     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2654 
2655     //*** oops
2656     {
2657       // With compressed oops we need unaligned versions; notice that
2658       // we overwrite entry_oop_arraycopy.
2659       bool aligned = !UseCompressedOops;
2660 
2661       StubRoutines::_arrayof_oop_disjoint_arraycopy
2662         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2663                                      /*dest_uninitialized*/false);
2664       StubRoutines::_arrayof_oop_arraycopy
2665         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2666                                      /*dest_uninitialized*/false);
2667       // Aligned versions without pre-barriers
2668       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2669         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2670                                      /*dest_uninitialized*/true);
2671       StubRoutines::_arrayof_oop_arraycopy_uninit
2672         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2673                                      /*dest_uninitialized*/true);
2674     }
2675 
2676     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2677     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2678     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2679     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2680 
2681     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2682     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2683                                                                         /*dest_uninitialized*/true);
2684 
2685     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2686                                                               entry_jbyte_arraycopy,
2687                                                               entry_jshort_arraycopy,
2688                                                               entry_jint_arraycopy,
2689                                                               entry_jlong_arraycopy);
2690 
2691     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2692                                                                entry_jbyte_arraycopy,
2693                                                                entry_jshort_arraycopy,
2694                                                                entry_jint_arraycopy,
2695                                                                entry_oop_arraycopy,
2696                                                                entry_jlong_arraycopy,
2697                                                                entry_checkcast_arraycopy);
2698 
2699     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2700     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2701     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2702     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2703     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2704     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2705   }
2706 
2707   void generate_math_stubs() { Unimplemented(); }
2708 
2709   // Arguments:
2710   //
2711   // Inputs:
2712   //   c_rarg0   - source byte array address
2713   //   c_rarg1   - destination byte array address
2714   //   c_rarg2   - K (key) in little endian int array
2715   //
2716   address generate_aescrypt_encryptBlock() {
2717     __ align(CodeEntryAlignment);
2718     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2719 
2720     const Register from        = c_rarg0;  // source array address
2721     const Register to          = c_rarg1;  // destination array address
2722     const Register key         = c_rarg2;  // key array address
2723     const Register keylen      = rscratch1;
2724 
2725     address start = __ pc();
2726     __ enter(); // required for proper stackwalking of RuntimeStub frame
2727 
2728     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2729 
2730     __ aesenc_loadkeys(key, keylen);
2731     __ aesecb_encrypt(from, to, keylen);
2732 
2733     __ mov(r0, 0);
2734 
2735     __ leave();
2736     __ ret(lr);
2737 
2738     return start;
2739   }
2740 
2741   // Arguments:
2742   //
2743   // Inputs:
2744   //   c_rarg0   - source byte array address
2745   //   c_rarg1   - destination byte array address
2746   //   c_rarg2   - K (key) in little endian int array
2747   //
2748   address generate_aescrypt_decryptBlock() {
2749     assert(UseAES, "need AES cryptographic extension support");
2750     __ align(CodeEntryAlignment);
2751     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2752     Label L_doLast;
2753 
2754     const Register from        = c_rarg0;  // source array address
2755     const Register to          = c_rarg1;  // destination array address
2756     const Register key         = c_rarg2;  // key array address
2757     const Register keylen      = rscratch1;
2758 
2759     address start = __ pc();
2760     __ enter(); // required for proper stackwalking of RuntimeStub frame
2761 
2762     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2763 
2764     __ aesecb_decrypt(from, to, key, keylen);
2765 
2766     __ mov(r0, 0);
2767 
2768     __ leave();
2769     __ ret(lr);
2770 
2771     return start;
2772   }
2773 
2774   // Arguments:
2775   //
2776   // Inputs:
2777   //   c_rarg0   - source byte array address
2778   //   c_rarg1   - destination byte array address
2779   //   c_rarg2   - K (key) in little endian int array
2780   //   c_rarg3   - r vector byte array address
2781   //   c_rarg4   - input length
2782   //
2783   // Output:
2784   //   x0        - input length
2785   //
2786   address generate_cipherBlockChaining_encryptAESCrypt() {
2787     assert(UseAES, "need AES cryptographic extension support");
2788     __ align(CodeEntryAlignment);
2789     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2790 
2791     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2792 
2793     const Register from        = c_rarg0;  // source array address
2794     const Register to          = c_rarg1;  // destination array address
2795     const Register key         = c_rarg2;  // key array address
2796     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector,
2797                                            // and left holding the last encrypted block on exit
2798     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2799     const Register keylen      = rscratch1;
2800 
2801     address start = __ pc();
2802 
2803       __ enter();
2804 
2805       __ movw(rscratch2, len_reg);
2806 
2807       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2808 
2809       __ ld1(v0, __ T16B, rvec);
2810 
2811       __ cmpw(keylen, 52);
2812       __ br(Assembler::CC, L_loadkeys_44);
2813       __ br(Assembler::EQ, L_loadkeys_52);
2814 
2815       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2816       __ rev32(v17, __ T16B, v17);
2817       __ rev32(v18, __ T16B, v18);
2818     __ BIND(L_loadkeys_52);
2819       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2820       __ rev32(v19, __ T16B, v19);
2821       __ rev32(v20, __ T16B, v20);
2822     __ BIND(L_loadkeys_44);
2823       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2824       __ rev32(v21, __ T16B, v21);
2825       __ rev32(v22, __ T16B, v22);
2826       __ rev32(v23, __ T16B, v23);
2827       __ rev32(v24, __ T16B, v24);
2828       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2829       __ rev32(v25, __ T16B, v25);
2830       __ rev32(v26, __ T16B, v26);
2831       __ rev32(v27, __ T16B, v27);
2832       __ rev32(v28, __ T16B, v28);
2833       __ ld1(v29, v30, v31, __ T16B, key);
2834       __ rev32(v29, __ T16B, v29);
2835       __ rev32(v30, __ T16B, v30);
2836       __ rev32(v31, __ T16B, v31);
2837 
2838     __ BIND(L_aes_loop);
2839       __ ld1(v1, __ T16B, __ post(from, 16));
2840       __ eor(v0, __ T16B, v0, v1);
2841 
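           // Note: these branches still test the flags set by the cmpw(keylen, 52)
           // above; none of the intervening instructions (vector loads, rev32, the
           // AES instructions, the non-flag-setting subw, or cbnzw) touch NZCV.
           // The decrypt stub below relies on the same property.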
2842       __ br(Assembler::CC, L_rounds_44);
2843       __ br(Assembler::EQ, L_rounds_52);
2844 
2845       __ aese(v0, v17); __ aesmc(v0, v0);
2846       __ aese(v0, v18); __ aesmc(v0, v0);
2847     __ BIND(L_rounds_52);
2848       __ aese(v0, v19); __ aesmc(v0, v0);
2849       __ aese(v0, v20); __ aesmc(v0, v0);
2850     __ BIND(L_rounds_44);
2851       __ aese(v0, v21); __ aesmc(v0, v0);
2852       __ aese(v0, v22); __ aesmc(v0, v0);
2853       __ aese(v0, v23); __ aesmc(v0, v0);
2854       __ aese(v0, v24); __ aesmc(v0, v0);
2855       __ aese(v0, v25); __ aesmc(v0, v0);
2856       __ aese(v0, v26); __ aesmc(v0, v0);
2857       __ aese(v0, v27); __ aesmc(v0, v0);
2858       __ aese(v0, v28); __ aesmc(v0, v0);
2859       __ aese(v0, v29); __ aesmc(v0, v0);
2860       __ aese(v0, v30);
2861       __ eor(v0, __ T16B, v0, v31);
2862 
2863       __ st1(v0, __ T16B, __ post(to, 16));
2864 
2865       __ subw(len_reg, len_reg, 16);
2866       __ cbnzw(len_reg, L_aes_loop);
2867 
2868       __ st1(v0, __ T16B, rvec);
2869 
2870       __ mov(r0, rscratch2);
2871 
2872       __ leave();
2873       __ ret(lr);
2874 
2875     return start;
2876   }
2877 
2878   // Arguments:
2879   //
2880   // Inputs:
2881   //   c_rarg0   - source byte array address
2882   //   c_rarg1   - destination byte array address
2883   //   c_rarg2   - K (key) in little endian int array
2884   //   c_rarg3   - r vector byte array address
2885   //   c_rarg4   - input length
2886   //
2887   // Output:
2888   //   r0        - input length
2889   //
2890   address generate_cipherBlockChaining_decryptAESCrypt() {
2891     assert(UseAES, "need AES cryptographic extension support");
2892     __ align(CodeEntryAlignment);
2893     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2894 
2895     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2896 
2897     const Register from        = c_rarg0;  // source array address
2898     const Register to          = c_rarg1;  // destination array address
2899     const Register key         = c_rarg2;  // key array address
2900     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector,
2901                                            // and left holding the last ciphertext block on exit
2902     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2903     const Register keylen      = rscratch1;
2904 
2905     address start = __ pc();
2906 
2907       __ enter();
2908 
2909       __ movw(rscratch2, len_reg);
2910 
2911       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2912 
2913       __ ld1(v2, __ T16B, rvec);
2914 
2915       __ ld1(v31, __ T16B, __ post(key, 16));
2916       __ rev32(v31, __ T16B, v31);
2917 
2918       __ cmpw(keylen, 52);
2919       __ br(Assembler::CC, L_loadkeys_44);
2920       __ br(Assembler::EQ, L_loadkeys_52);
2921 
2922       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2923       __ rev32(v17, __ T16B, v17);
2924       __ rev32(v18, __ T16B, v18);
2925     __ BIND(L_loadkeys_52);
2926       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2927       __ rev32(v19, __ T16B, v19);
2928       __ rev32(v20, __ T16B, v20);
2929     __ BIND(L_loadkeys_44);
2930       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2931       __ rev32(v21, __ T16B, v21);
2932       __ rev32(v22, __ T16B, v22);
2933       __ rev32(v23, __ T16B, v23);
2934       __ rev32(v24, __ T16B, v24);
2935       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2936       __ rev32(v25, __ T16B, v25);
2937       __ rev32(v26, __ T16B, v26);
2938       __ rev32(v27, __ T16B, v27);
2939       __ rev32(v28, __ T16B, v28);
2940       __ ld1(v29, v30, __ T16B, key);
2941       __ rev32(v29, __ T16B, v29);
2942       __ rev32(v30, __ T16B, v30);
2943 
2944     __ BIND(L_aes_loop);
2945       __ ld1(v0, __ T16B, __ post(from, 16));
2946       __ orr(v1, __ T16B, v0, v0);
2947 
2948       __ br(Assembler::CC, L_rounds_44);
2949       __ br(Assembler::EQ, L_rounds_52);
2950 
2951       __ aesd(v0, v17); __ aesimc(v0, v0);
2952       __ aesd(v0, v18); __ aesimc(v0, v0);
2953     __ BIND(L_rounds_52);
2954       __ aesd(v0, v19); __ aesimc(v0, v0);
2955       __ aesd(v0, v20); __ aesimc(v0, v0);
2956     __ BIND(L_rounds_44);
2957       __ aesd(v0, v21); __ aesimc(v0, v0);
2958       __ aesd(v0, v22); __ aesimc(v0, v0);
2959       __ aesd(v0, v23); __ aesimc(v0, v0);
2960       __ aesd(v0, v24); __ aesimc(v0, v0);
2961       __ aesd(v0, v25); __ aesimc(v0, v0);
2962       __ aesd(v0, v26); __ aesimc(v0, v0);
2963       __ aesd(v0, v27); __ aesimc(v0, v0);
2964       __ aesd(v0, v28); __ aesimc(v0, v0);
2965       __ aesd(v0, v29); __ aesimc(v0, v0);
2966       __ aesd(v0, v30);
2967       __ eor(v0, __ T16B, v0, v31);
2968       __ eor(v0, __ T16B, v0, v2);
2969 
2970       __ st1(v0, __ T16B, __ post(to, 16));
2971       __ orr(v2, __ T16B, v1, v1);
2972 
2973       __ subw(len_reg, len_reg, 16);
2974       __ cbnzw(len_reg, L_aes_loop);
2975 
2976       __ st1(v2, __ T16B, rvec);
2977 
2978       __ mov(r0, rscratch2);
2979 
2980       __ leave();
2981       __ ret(lr);
2982 
2983     return start;
2984   }
2985 
2986   // Big-endian 128-bit + 64-bit -> 128-bit addition.
2987   // Inputs: in (the 128-bit addend, which may alias result) and inc (the 64-bit
2988   // increment, preserved; its lower dword must be zero). The least-significant
2989   // 64-bit word lives in the upper dword of each vector.
2990   // Output: result. tmp is clobbered and must differ from result and inc.
2991   void be_add_128_64(FloatRegister result, FloatRegister in,
2992                      FloatRegister inc, FloatRegister tmp) {
2993     assert_different_registers(result, tmp, inc);
2994 
2995     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
2996                                            // input
2997     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
2998     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
2999                                            // MSD == 0 (must be!) to LSD
3000     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from (i.e. add 1 to) the MSD if the LSD overflowed
3001   }
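
       // For reference, a scalar sketch of what the vector sequence above computes
       // (illustration only, not generated code), with the 128-bit value viewed as
       // two 64-bit words {hi, lo}:
       //
       //   uint64_t lo2 = lo + inc;     // the addv (inc's other dword is zero)
       //   if (lo2 < inc) hi += 1;      // the cm(HI)/ext/subv carry propagation
       //   // result = {hi, lo2}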
3002 
3003   // CTR AES crypt.
3004   // Arguments:
3005   //
3006   // Inputs:
3007   //   c_rarg0   - source byte array address
3008   //   c_rarg1   - destination byte array address
3009   //   c_rarg2   - K (key) in little endian int array
3010   //   c_rarg3   - counter vector byte array address
3011   //   c_rarg4   - input length
3012   //   c_rarg5   - saved encryptedCounter start
3013   //   c_rarg6   - saved used length
3014   //
3015   // Output:
3016   //   r0       - input length
3017   //
3018   address generate_counterMode_AESCrypt() {
3019     const Register in = c_rarg0;
3020     const Register out = c_rarg1;
3021     const Register key = c_rarg2;
3022     const Register counter = c_rarg3;
3023     const Register saved_len = c_rarg4, len = r10;
3024     const Register saved_encrypted_ctr = c_rarg5;
3025     const Register used_ptr = c_rarg6, used = r12;
3026 
3027     const Register offset = r7;
3028     const Register keylen = r11;
3029 
3030     const unsigned char block_size = 16;
3031     const int bulk_width = 4;
3032     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3033     // performance with larger data sizes, but it also means that the
3034     // fast path isn't used until there are at least 8 blocks, and up
3035     // to 127 bytes of data will be handled on the slow path. For
3036     // that reason, and also so as not to blow away too much icache, 4
3037     // blocks seems like a sensible compromise.
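         // Concretely (illustrative arithmetic): with bulk_width == 4 the bulk path
         // only engages once len >= 4 * 16 = 64 bytes, so at most 63 bytes are left
         // to the non-bulk path below; with bulk_width == 8 that worst case grows
         // to 127 bytes.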
3038 
3039     // Algorithm:
3040     //
3041     //    if (len == 0) {
3042     //        goto DONE;
3043     //    }
3044     //    int result = len;
3045     //    do {
3046     //        if (used >= blockSize) {
3047     //            if (len >= bulk_width * blockSize) {
3048     //                CTR_large_block();
3049     //                if (len == 0)
3050     //                    goto DONE;
3051     //            }
3052     //            for (;;) {
3053     //                16ByteVector v0 = counter;
3054     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3055     //                used = 0;
3056     //                if (len < blockSize)
3057     //                    break;    /* goto NEXT */
3058     //                16ByteVector v1 = load16Bytes(in, offset);
3059     //                v1 = v1 ^ encryptedCounter;
3060     //                store16Bytes(v1, out, offset);
3061     //                used = blockSize;
3062     //                offset += blockSize;
3063     //                len -= blockSize;
3064     //                if (len == 0)
3065     //                    goto DONE;
3066     //            }
3067     //        }
3068     //      NEXT:
3069     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3070     //        len--;
3071     //    } while (len != 0);
3072     //  DONE:
3073     //    return result;
3074     //
3075     // CTR_large_block()
3076     //    Wide bulk encryption of whole blocks.
3077 
3078     __ align(CodeEntryAlignment);
3079     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3080     const address start = __ pc();
3081     __ enter();
3082 
3083     Label DONE, CTR_large_block, large_block_return;
3084     __ ldrw(used, Address(used_ptr));
3085     __ cbzw(saved_len, DONE);
3086 
3087     __ mov(len, saved_len);
3088     __ mov(offset, 0);
3089 
3090     // Compute #rounds for AES based on the length of the key array
3091     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3092 
3093     __ aesenc_loadkeys(key, keylen);
3094 
3095     {
3096       Label L_CTR_loop, NEXT;
3097 
3098       __ bind(L_CTR_loop);
3099 
3100       __ cmp(used, block_size);
3101       __ br(__ LO, NEXT);
3102 
3103       // Maybe we have a lot of data
3104       __ subsw(rscratch1, len, bulk_width * block_size);
3105       __ br(__ HS, CTR_large_block);
3106       __ BIND(large_block_return);
3107       __ cbzw(len, DONE);
3108 
3109       // Setup the counter
3110       __ movi(v4, __ T4S, 0);
3111       __ movi(v5, __ T4S, 1);
3112       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3113 
3114       // 128-bit big-endian increment
3115       __ ld1(v0, __ T16B, counter);
3116       __ rev64(v16, __ T16B, v0);
3117       be_add_128_64(v16, v16, v4, /*tmp*/v5);
3118       __ rev64(v16, __ T16B, v16);
3119       __ st1(v16, __ T16B, counter);
3120       // Previous counter value is in v0
3121       // v4 contains { 0, 1 }
3122 
3123       {
3124         // We have fewer than bulk_width blocks of data left. Encrypt
3125         // them one by one until there is less than a full block
3126         // remaining, being careful to save both the encrypted counter
3127         // and the counter.
3128 
3129         Label inner_loop;
3130         __ bind(inner_loop);
3131         // Counter to encrypt is in v0
3132         __ aesecb_encrypt(noreg, noreg, keylen);
3133         __ st1(v0, __ T16B, saved_encrypted_ctr);
3134 
3135         // Do we have a remaining full block?
3136 
3137         __ mov(used, 0);
3138         __ cmp(len, block_size);
3139         __ br(__ LO, NEXT);
3140 
3141         // Yes, we have a full block
3142         __ ldrq(v1, Address(in, offset));
3143         __ eor(v1, __ T16B, v1, v0);
3144         __ strq(v1, Address(out, offset));
3145         __ mov(used, block_size);
3146         __ add(offset, offset, block_size);
3147 
3148         __ subw(len, len, block_size);
3149         __ cbzw(len, DONE);
3150 
3151         // Increment the counter, store it back
3152         __ orr(v0, __ T16B, v16, v16);
3153         __ rev64(v16, __ T16B, v16);
3154         be_add_128_64(v16, v16, v4, /*tmp*/v5);
3155         __ rev64(v16, __ T16B, v16);
3156         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3157 
3158         __ b(inner_loop);
3159       }
3160 
3161       __ BIND(NEXT);
3162 
3163       // Encrypt a single byte, and loop.
3164       // We expect this to be a rare event.
3165       __ ldrb(rscratch1, Address(in, offset));
3166       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3167       __ eor(rscratch1, rscratch1, rscratch2);
3168       __ strb(rscratch1, Address(out, offset));
3169       __ add(offset, offset, 1);
3170       __ add(used, used, 1);
3171       __ subw(len, len, 1);
3172       __ cbnzw(len, L_CTR_loop);
3173     }
3174 
3175     __ bind(DONE);
3176     __ strw(used, Address(used_ptr));
3177     __ mov(r0, saved_len);
3178 
3179     __ leave(); // required for proper stackwalking of RuntimeStub frame
3180     __ ret(lr);
3181 
3182     // Bulk encryption
3183 
3184     __ BIND(CTR_large_block);
3185     assert(bulk_width == 4 || bulk_width == 8, "must be");
3186 
3187     if (bulk_width == 8) {
3188       __ sub(sp, sp, 4 * 16);
3189       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3190     }
3191     __ sub(sp, sp, 4 * 16);
3192     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3193     RegSet saved_regs = (RegSet::of(in, out, offset)
3194                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3195     __ push(saved_regs, sp);
3196     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3197     __ add(in, in, offset);
3198     __ add(out, out, offset);
3199 
3200     // Keys should already be loaded into the correct registers
3201 
3202     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3203     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3204 
3205     // AES/CTR loop
3206     {
3207       Label L_CTR_loop;
3208       __ BIND(L_CTR_loop);
3209 
3210       // Setup the counters
3211       __ movi(v8, __ T4S, 0);
3212       __ movi(v9, __ T4S, 1);
3213       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3214 
3215       for (int i = 0; i < bulk_width; i++) {
3216         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3217         __ rev64(v0_ofs, __ T16B, v16);
3218         be_add_128_64(v16, v16, v8, /*tmp*/v9);
3219       }
3220 
3221       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3222 
3223       // Encrypt the counters
3224       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3225 
3226       if (bulk_width == 8) {
3227         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3228       }
3229 
3230       // XOR the encrypted counters with the inputs
3231       for (int i = 0; i < bulk_width; i++) {
3232         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3233         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3234         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3235       }
3236 
3237       // Write the encrypted data
3238       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3239       if (bulk_width == 8) {
3240         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3241       }
3242 
3243       __ subw(len, len, 16 * bulk_width);
3244       __ cbnzw(len, L_CTR_loop);
3245     }
3246 
3247     // Save the counter back where it goes
3248     __ rev64(v16, __ T16B, v16);
3249     __ st1(v16, __ T16B, counter);
3250 
3251     __ pop(saved_regs, sp);
3252 
3253     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3254     if (bulk_width == 8) {
3255       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3256     }
3257 
3258     __ andr(rscratch1, len, -16 * bulk_width);
3259     __ sub(len, len, rscratch1);
3260     __ add(offset, offset, rscratch1);
3261     __ mov(used, 16);
3262     __ strw(used, Address(used_ptr));
3263     __ b(large_block_return);
3264 
3265     return start;
3266   }
3267 
3268   // Vector AES Galois Counter Mode implementation. Parameters:
3269   //
3270   // in = c_rarg0
3271   // len = c_rarg1
3272   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3273   // out = c_rarg3
3274   // key = c_rarg4
3275   // state = c_rarg5 - GHASH.state
3276   // subkeyHtbl = c_rarg6 - powers of H
3277   // counter = c_rarg7 - 16 bytes of CTR
3278   // return - number of processed bytes
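       //
       // In outline (a summary of the generated code below): len is first rounded
       // down to a multiple of 8 blocks (128 bytes); an 8-way unrolled AES/CTR pass
       // encrypts the counters, XORs them with the input and writes the result to
       // out; the incremented counter is stored back; a 4-block-wide GHASH pass
       // then folds the ciphertext in ct into state; the number of bytes processed
       // is returned in r0.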
3279   address generate_galoisCounterMode_AESCrypt() {
3280     address ghash_polynomial = __ pc();
3281     __ emit_int64(0x87);  // The low-order bits of the field
3282                           // polynomial (i.e. p = z^7+z^2+z+1)
3283                           // repeated in the low and high parts of a
3284                           // 128-bit vector
3285     __ emit_int64(0x87);
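         // (For reference: the full GHASH field polynomial is
         // z^128 + z^7 + z^2 + z + 1; 0x87 encodes just its low-order terms, which
         // is all the reduction step needs.)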
3286 
3287     __ align(CodeEntryAlignment);
3288     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3289     address start = __ pc();
3290     __ enter();
3291 
3292     const Register in = c_rarg0;
3293     const Register len = c_rarg1;
3294     const Register ct = c_rarg2;
3295     const Register out = c_rarg3;
3297 
3298     const Register key = c_rarg4;
3299     const Register state = c_rarg5;
3300 
3301     const Register subkeyHtbl = c_rarg6;
3302 
3303     const Register counter = c_rarg7;  // 16 bytes of CTR; updated with the incremented counter at the end
3304 
3305     const Register keylen = r10;
3306     // Save the callee-saved vector registers (v8..v15) before clobbering them
3307     __ sub(sp, sp, 4 * 16);
3308     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3309     __ sub(sp, sp, 4 * 16);
3310     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3311 
3313     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3314     __ str(len, __ pre(sp, -2 * wordSize));
3315 
3316     Label DONE;
3317     __ cbz(len, DONE);
3318 
3319     // Compute #rounds for AES based on the length of the key array
3320     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3321 
3322     __ aesenc_loadkeys(key, keylen);
3323     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3324     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3325 
3326     // AES/CTR loop
3327     {
3328       Label L_CTR_loop;
3329       __ BIND(L_CTR_loop);
3330 
3331       // Setup the counters
3332       __ movi(v8, __ T4S, 0);
3333       __ movi(v9, __ T4S, 1);
3334       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3335 
3336       assert(v0->encoding() < v8->encoding(), "");
3337       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3338         FloatRegister f = as_FloatRegister(i);
3339         __ rev32(f, __ T16B, v16);
3340         __ addv(v16, __ T4S, v16, v8);
3341       }
3342 
3343       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3344 
3345       // Encrypt the counters
3346       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3347 
3348       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3349 
3350       // XOR the encrypted counters with the inputs
3351       for (int i = 0; i < 8; i++) {
3352         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3353         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3354         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3355       }
3356       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3357       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3358 
3359       __ subw(len, len, 16 * 8);
3360       __ cbnzw(len, L_CTR_loop);
3361     }
3362 
3363     __ rev32(v16, __ T16B, v16);
3364     __ st1(v16, __ T16B, counter);
3365 
3366     __ ldr(len, Address(sp));
3367     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3368 
3369     // GHASH/CTR loop
3370     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3371                                 len, /*unrolls*/4);
3372 
3373 #ifdef ASSERT
3374     { Label L;
3375       __ cmp(len, (unsigned char)0);
3376       __ br(Assembler::EQ, L);
3377       __ stop("stubGenerator: abort");
3378       __ bind(L);
3379     }
3380 #endif
3381 
3382     __ bind(DONE);
3383     // Return the number of bytes processed
3384     __ ldr(r0, __ post(sp, 2 * wordSize));
3385 
3386     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3387     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3388 
3389     __ leave(); // required for proper stackwalking of RuntimeStub frame
3390     __ ret(lr);
3391     return start;
3392   }
3393 
3394   class Cached64Bytes {
3395   private:
3396     MacroAssembler *_masm;
3397     Register _regs[8];
3398 
3399   public:
3400     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3401       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3402       auto it = rs.begin();
3403       for (auto &r: _regs) {
3404         r = *it;
3405         ++it;
3406       }
3407     }
3408 
3409     void gen_loads(Register base) {
3410       for (int i = 0; i < 8; i += 2) {
3411         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3412       }
3413     }
3414 
3415     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3416     void extract_u32(Register dest, int i) {
3417       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3418     }
3419   };
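
       // Usage sketch (illustrative only): after gen_loads(buf) the 64-byte block at
       // buf lives in eight 64-bit registers, and extract_u32(dest, k) emits a single
       // ubfx selecting 32 bits from _regs[k / 2] -- e.g. k == 5 extracts bits [63:32]
       // of _regs[2].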
3420 
3421   // Utility routines for MD5; reference formulas for the four round functions
3422   // follow md5_II below. These helpers clobber r10 and r11.
3423   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3424               int k, int s, int t) {
3425     Register rscratch3 = r10;
3426     Register rscratch4 = r11;
3427 
3428     __ eorw(rscratch3, r3, r4);
3429     __ movw(rscratch2, t);
3430     __ andw(rscratch3, rscratch3, r2);
3431     __ addw(rscratch4, r1, rscratch2);
3432     reg_cache.extract_u32(rscratch1, k);
3433     __ eorw(rscratch3, rscratch3, r4);
3434     __ addw(rscratch4, rscratch4, rscratch1);
3435     __ addw(rscratch3, rscratch3, rscratch4);
3436     __ rorw(rscratch2, rscratch3, 32 - s);
3437     __ addw(r1, rscratch2, r2);
3438   }
3439 
3440   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3441               int k, int s, int t) {
3442     Register rscratch3 = r10;
3443     Register rscratch4 = r11;
3444 
3445     reg_cache.extract_u32(rscratch1, k);
3446     __ movw(rscratch2, t);
3447     __ addw(rscratch4, r1, rscratch2);
3448     __ addw(rscratch4, rscratch4, rscratch1);
3449     __ bicw(rscratch2, r3, r4);
3450     __ andw(rscratch3, r2, r4);
3451     __ addw(rscratch2, rscratch2, rscratch4);
3452     __ addw(rscratch2, rscratch2, rscratch3);
3453     __ rorw(rscratch2, rscratch2, 32 - s);
3454     __ addw(r1, rscratch2, r2);
3455   }
3456 
3457   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3458               int k, int s, int t) {
3459     Register rscratch3 = r10;
3460     Register rscratch4 = r11;
3461 
3462     __ eorw(rscratch3, r3, r4);
3463     __ movw(rscratch2, t);
3464     __ addw(rscratch4, r1, rscratch2);
3465     reg_cache.extract_u32(rscratch1, k);
3466     __ eorw(rscratch3, rscratch3, r2);
3467     __ addw(rscratch4, rscratch4, rscratch1);
3468     __ addw(rscratch3, rscratch3, rscratch4);
3469     __ rorw(rscratch2, rscratch3, 32 - s);
3470     __ addw(r1, rscratch2, r2);
3471   }
3472 
3473   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3474               int k, int s, int t) {
3475     Register rscratch3 = r10;
3476     Register rscratch4 = r11;
3477 
3478     __ movw(rscratch3, t);
3479     __ ornw(rscratch2, r2, r4);
3480     __ addw(rscratch4, r1, rscratch3);
3481     reg_cache.extract_u32(rscratch1, k);
3482     __ eorw(rscratch3, rscratch2, r3);
3483     __ addw(rscratch4, rscratch4, rscratch1);
3484     __ addw(rscratch3, rscratch3, rscratch4);
3485     __ rorw(rscratch2, rscratch3, 32 - s);
3486     __ addw(r1, rscratch2, r2);
3487   }
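
       // For reference, the round functions these four helpers implement (RFC 1321),
       // each followed by the common update a = b + rotl(a + f(b,c,d) + x[k] + t, s):
       //   FF: F(b,c,d) = (b & c) | (~b & d)   (computed above as ((c ^ d) & b) ^ d)
       //   GG: G(b,c,d) = (b & d) | (c & ~d)   (computed with bic/and)
       //   HH: H(b,c,d) = b ^ c ^ d
       //   II: I(b,c,d) = c ^ (b | ~d)         (computed with orn)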
3488 
3489   // Arguments:
3490   //
3491   // Inputs:
3492   //   c_rarg0   - byte[]  source+offset
3493   //   c_rarg1   - int[]   MD5.state
3494   //   c_rarg2   - int     offset
3495   //   c_rarg3   - int     limit
3496   //
3497   address generate_md5_implCompress(bool multi_block, const char *name) {
3498     __ align(CodeEntryAlignment);
3499     StubCodeMark mark(this, "StubRoutines", name);
3500     address start = __ pc();
3501 
3502     Register buf       = c_rarg0;
3503     Register state     = c_rarg1;
3504     Register ofs       = c_rarg2;
3505     Register limit     = c_rarg3;
3506     Register a         = r4;
3507     Register b         = r5;
3508     Register c         = r6;
3509     Register d         = r7;
3510     Register rscratch3 = r10;
3511     Register rscratch4 = r11;
3512 
3513     Register state_regs[2] = { r12, r13 };
3514     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3515     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3516 
3517     __ push(saved_regs, sp);
3518 
3519     __ ldp(state_regs[0], state_regs[1], Address(state));
3520     __ ubfx(a, state_regs[0],  0, 32);
3521     __ ubfx(b, state_regs[0], 32, 32);
3522     __ ubfx(c, state_regs[1],  0, 32);
3523     __ ubfx(d, state_regs[1], 32, 32);
3524 
3525     Label md5_loop;
3526     __ BIND(md5_loop);
3527 
3528     reg_cache.gen_loads(buf);
3529 
3530     // Round 1
3531     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3532     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3533     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3534     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3535     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3536     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3537     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3538     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3539     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3540     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3541     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3542     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3543     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3544     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3545     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3546     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3547 
3548     // Round 2
3549     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3550     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3551     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3552     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3553     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3554     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3555     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3556     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3557     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3558     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3559     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3560     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3561     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3562     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3563     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3564     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3565 
3566     // Round 3
3567     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3568     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3569     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3570     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3571     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3572     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3573     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3574     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3575     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3576     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3577     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3578     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3579     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3580     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3581     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3582     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3583 
3584     // Round 4
3585     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3586     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3587     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3588     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3589     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3590     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3591     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3592     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3593     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3594     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3595     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3596     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3597     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3598     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3599     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3600     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3601 
3602     __ addw(a, state_regs[0], a);
3603     __ ubfx(rscratch2, state_regs[0], 32, 32);
3604     __ addw(b, rscratch2, b);
3605     __ addw(c, state_regs[1], c);
3606     __ ubfx(rscratch4, state_regs[1], 32, 32);
3607     __ addw(d, rscratch4, d);
3608 
3609     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3610     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3611 
3612     if (multi_block) {
3613       __ add(buf, buf, 64);
3614       __ add(ofs, ofs, 64);
3615       __ cmp(ofs, limit);
3616       __ br(Assembler::LE, md5_loop);
3617       __ mov(c_rarg0, ofs); // return ofs
3618     }
3619 
3620     // write hash values back in the correct order
3621     __ stp(state_regs[0], state_regs[1], Address(state));
3622 
3623     __ pop(saved_regs, sp);
3624 
3625     __ ret(lr);
3626 
3627     return start;
3628   }
3629 
3630   // Arguments:
3631   //
3632   // Inputs:
3633   //   c_rarg0   - byte[]  source+offset
3634   //   c_rarg1   - int[]   SHA.state
3635   //   c_rarg2   - int     offset
3636   //   c_rarg3   - int     limit
3637   //
3638   address generate_sha1_implCompress(bool multi_block, const char *name) {
3639     __ align(CodeEntryAlignment);
3640     StubCodeMark mark(this, "StubRoutines", name);
3641     address start = __ pc();
3642 
3643     Register buf   = c_rarg0;
3644     Register state = c_rarg1;
3645     Register ofs   = c_rarg2;
3646     Register limit = c_rarg3;
3647 
3648     Label keys;
3649     Label sha1_loop;
3650 
3651     // load the keys into v0..v3
3652     __ adr(rscratch1, keys);
3653     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3654     // load the 5-word state into v6, v7
3655     __ ldrq(v6, Address(state, 0));
3656     __ ldrs(v7, Address(state, 16));
3657 
3658 
3659     __ BIND(sha1_loop);
3660     // load 64 bytes of data into v16..v19
3661     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3662     __ rev32(v16, __ T16B, v16);
3663     __ rev32(v17, __ T16B, v17);
3664     __ rev32(v18, __ T16B, v18);
3665     __ rev32(v19, __ T16B, v19);
3666 
3667     // do the sha1
3668     __ addv(v4, __ T4S, v16, v0);
3669     __ orr(v20, __ T16B, v6, v6);
3670 
3671     FloatRegister d0 = v16;
3672     FloatRegister d1 = v17;
3673     FloatRegister d2 = v18;
3674     FloatRegister d3 = v19;
3675 
3676     for (int round = 0; round < 20; round++) {
3677       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3678       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3679       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3680       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3681       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3682 
3683       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3684       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3685       __ sha1h(tmp2, __ T4S, v20);
3686       if (round < 5)
3687         __ sha1c(v20, __ T4S, tmp3, tmp4);
3688       else if (round < 10 || round >= 15)
3689         __ sha1p(v20, __ T4S, tmp3, tmp4);
3690       else
3691         __ sha1m(v20, __ T4S, tmp3, tmp4);
3692       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3693 
3694       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3695     }
3696 
3697     __ addv(v7, __ T2S, v7, v21);
3698     __ addv(v6, __ T4S, v6, v20);
3699 
3700     if (multi_block) {
3701       __ add(ofs, ofs, 64);
3702       __ cmp(ofs, limit);
3703       __ br(Assembler::LE, sha1_loop);
3704       __ mov(c_rarg0, ofs); // return ofs
3705     }
3706 
3707     __ strq(v6, Address(state, 0));
3708     __ strs(v7, Address(state, 16));
3709 
3710     __ ret(lr);
3711 
3712     __ bind(keys);
3713     __ emit_int32(0x5a827999);
3714     __ emit_int32(0x6ed9eba1);
3715     __ emit_int32(0x8f1bbcdc);
3716     __ emit_int32(0xca62c1d6);
3717 
3718     return start;
3719   }
3720 
3721 
3722   // Arguments:
3723   //
3724   // Inputs:
3725   //   c_rarg0   - byte[]  source+offset
3726   //   c_rarg1   - int[]   SHA.state
3727   //   c_rarg2   - int     offset
3728   //   c_rarg3   - int     limit
3729   //
3730   address generate_sha256_implCompress(bool multi_block, const char *name) {
3731     static const uint32_t round_consts[64] = {
3732       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3733       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3734       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3735       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3736       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3737       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3738       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3739       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3740       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3741       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3742       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3743       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3744       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3745       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3746       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3747       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3748     };
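         // (These are the standard SHA-256 round constants: the first 32 bits of the
         // fractional parts of the cube roots of the first 64 primes, per FIPS 180-4.)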
3749     __ align(CodeEntryAlignment);
3750     StubCodeMark mark(this, "StubRoutines", name);
3751     address start = __ pc();
3752 
3753     Register buf   = c_rarg0;
3754     Register state = c_rarg1;
3755     Register ofs   = c_rarg2;
3756     Register limit = c_rarg3;
3757 
3758     Label sha256_loop;
3759 
3760     __ stpd(v8, v9, __ pre(sp, -32));
3761     __ stpd(v10, v11, Address(sp, 16));
3762 
3763     // dga == v0
3764     // dgb == v1
3765     // dg0 == v2
3766     // dg1 == v3
3767     // dg2 == v4
3768     // t0 == v6
3769     // t1 == v7
3770 
3771     // load the 64 round constants into v16..v31
3772     __ lea(rscratch1, ExternalAddress((address)round_consts));
3773     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3774     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3775     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3776     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3777 
3778     // load 8 words (256 bits) state
3779     __ ldpq(v0, v1, state);
3780 
3781     __ BIND(sha256_loop);
3782     // load 64 bytes of data into v8..v11
3783     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3784     __ rev32(v8, __ T16B, v8);
3785     __ rev32(v9, __ T16B, v9);
3786     __ rev32(v10, __ T16B, v10);
3787     __ rev32(v11, __ T16B, v11);
3788 
3789     __ addv(v6, __ T4S, v8, v16);
3790     __ orr(v2, __ T16B, v0, v0);
3791     __ orr(v3, __ T16B, v1, v1);
3792 
3793     FloatRegister d0 = v8;
3794     FloatRegister d1 = v9;
3795     FloatRegister d2 = v10;
3796     FloatRegister d3 = v11;
3797 
3798 
3799     for (int round = 0; round < 16; round++) {
3800       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3801       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3802       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3803       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3804 
3805       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3806       __ orr(v4, __ T16B, v2, v2);
3807       if (round < 15)
3808         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3809       __ sha256h(v2, __ T4S, v3, tmp2);
3810       __ sha256h2(v3, __ T4S, v4, tmp2);
3811       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3812 
3813       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3814     }
3815 
3816     __ addv(v0, __ T4S, v0, v2);
3817     __ addv(v1, __ T4S, v1, v3);
3818 
3819     if (multi_block) {
3820       __ add(ofs, ofs, 64);
3821       __ cmp(ofs, limit);
3822       __ br(Assembler::LE, sha256_loop);
3823       __ mov(c_rarg0, ofs); // return ofs
3824     }
3825 
3826     __ ldpd(v10, v11, Address(sp, 16));
3827     __ ldpd(v8, v9, __ post(sp, 32));
3828 
3829     __ stpq(v0, v1, state);
3830 
3831     __ ret(lr);
3832 
3833     return start;
3834   }
3835 
3836   // Double rounds for sha512.
3837   void sha512_dround(int dr,
3838                      FloatRegister vi0, FloatRegister vi1,
3839                      FloatRegister vi2, FloatRegister vi3,
3840                      FloatRegister vi4, FloatRegister vrc0,
3841                      FloatRegister vrc1, FloatRegister vin0,
3842                      FloatRegister vin1, FloatRegister vin2,
3843                      FloatRegister vin3, FloatRegister vin4) {
3844       if (dr < 36) {
3845         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3846       }
3847       __ addv(v5, __ T2D, vrc0, vin0);
3848       __ ext(v6, __ T16B, vi2, vi3, 8);
3849       __ ext(v5, __ T16B, v5, v5, 8);
3850       __ ext(v7, __ T16B, vi1, vi2, 8);
3851       __ addv(vi3, __ T2D, vi3, v5);
3852       if (dr < 32) {
3853         __ ext(v5, __ T16B, vin3, vin4, 8);
3854         __ sha512su0(vin0, __ T2D, vin1);
3855       }
3856       __ sha512h(vi3, __ T2D, v6, v7);
3857       if (dr < 32) {
3858         __ sha512su1(vin0, __ T2D, vin2, v5);
3859       }
3860       __ addv(vi4, __ T2D, vi1, vi3);
3861       __ sha512h2(vi3, __ T2D, vi1, vi0);
3862   }
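
       // Note: the calls below rotate the five working-state registers (vi0..vi4)
       // and cycle through the round-constant registers; each call consumes the
       // pair of constants already sitting in vrc0 while preloading a following
       // pair into vrc1 (hence the dr < 36 guard above: the final four
       // double-rounds have no further constants to fetch).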
3863 
3864   // Arguments:
3865   //
3866   // Inputs:
3867   //   c_rarg0   - byte[]  source+offset
3868   //   c_rarg1   - long[]  SHA.state
3869   //   c_rarg2   - int     offset
3870   //   c_rarg3   - int     limit
3871   //
3872   address generate_sha512_implCompress(bool multi_block, const char *name) {
3873     static const uint64_t round_consts[80] = {
3874       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3875       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3876       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3877       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3878       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3879       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3880       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3881       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3882       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3883       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3884       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3885       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3886       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3887       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3888       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3889       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3890       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3891       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3892       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3893       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3894       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3895       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3896       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3897       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3898       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3899       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3900       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3901     };
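         // (These are the standard SHA-512 round constants: the first 64 bits of the
         // fractional parts of the cube roots of the first 80 primes, per FIPS 180-4.)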
3902 
3903     __ align(CodeEntryAlignment);
3904     StubCodeMark mark(this, "StubRoutines", name);
3905     address start = __ pc();
3906 
3907     Register buf   = c_rarg0;
3908     Register state = c_rarg1;
3909     Register ofs   = c_rarg2;
3910     Register limit = c_rarg3;
3911 
3912     __ stpd(v8, v9, __ pre(sp, -64));
3913     __ stpd(v10, v11, Address(sp, 16));
3914     __ stpd(v12, v13, Address(sp, 32));
3915     __ stpd(v14, v15, Address(sp, 48));
3916 
3917     Label sha512_loop;
3918 
3919     // load state
3920     __ ld1(v8, v9, v10, v11, __ T2D, state);
3921 
3922     // load the first four pairs of round constants (v20..v23)
3923     __ lea(rscratch1, ExternalAddress((address)round_consts));
3924     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3925 
3926     __ BIND(sha512_loop);
3927     // load 128B of data into v12..v19
3928     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3929     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3930     __ rev64(v12, __ T16B, v12);
3931     __ rev64(v13, __ T16B, v13);
3932     __ rev64(v14, __ T16B, v14);
3933     __ rev64(v15, __ T16B, v15);
3934     __ rev64(v16, __ T16B, v16);
3935     __ rev64(v17, __ T16B, v17);
3936     __ rev64(v18, __ T16B, v18);
3937     __ rev64(v19, __ T16B, v19);
3938 
3939     __ mov(rscratch2, rscratch1);
3940 
3941     __ mov(v0, __ T16B, v8);
3942     __ mov(v1, __ T16B, v9);
3943     __ mov(v2, __ T16B, v10);
3944     __ mov(v3, __ T16B, v11);
3945 
3946     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3947     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3948     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3949     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3950     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3951     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3952     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3953     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3954     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3955     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3956     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3957     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3958     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3959     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3960     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3961     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3962     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3963     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3964     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3965     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3966     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3967     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3968     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3969     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3970     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3971     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3972     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3973     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3974     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3975     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3976     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3977     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3978     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3979     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3980     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3981     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3982     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3983     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3984     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3985     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3986 
3987     __ addv(v8, __ T2D, v8, v0);
3988     __ addv(v9, __ T2D, v9, v1);
3989     __ addv(v10, __ T2D, v10, v2);
3990     __ addv(v11, __ T2D, v11, v3);
3991 
3992     if (multi_block) {
3993       __ add(ofs, ofs, 128);
3994       __ cmp(ofs, limit);
3995       __ br(Assembler::LE, sha512_loop);
3996       __ mov(c_rarg0, ofs); // return ofs
3997     }
3998 
3999     __ st1(v8, v9, v10, v11, __ T2D, state);
4000 
4001     __ ldpd(v14, v15, Address(sp, 48));
4002     __ ldpd(v12, v13, Address(sp, 32));
4003     __ ldpd(v10, v11, Address(sp, 16));
4004     __ ldpd(v8, v9, __ post(sp, 64));
4005 
4006     __ ret(lr);
4007 
4008     return start;
4009   }
4010 
4011   // Arguments:
4012   //
4013   // Inputs:
4014   //   c_rarg0   - byte[]  source+offset
4015   //   c_rarg1   - byte[]  SHA.state
4016   //   c_rarg2   - int     block_size
4017   //   c_rarg3   - int     offset
4018   //   c_rarg4   - int     limit
4019   //
4020   address generate_sha3_implCompress(bool multi_block, const char *name) {
4021     static const uint64_t round_consts[24] = {
4022       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4023       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4024       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4025       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4026       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4027       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4028       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4029       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4030     };
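         // (These are the 24 iota round constants of the Keccak-f[1600] permutation,
         // per FIPS 202.)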
4031 
4032     __ align(CodeEntryAlignment);
4033     StubCodeMark mark(this, "StubRoutines", name);
4034     address start = __ pc();
4035 
4036     Register buf           = c_rarg0;
4037     Register state         = c_rarg1;
4038     Register block_size    = c_rarg2;
4039     Register ofs           = c_rarg3;
4040     Register limit         = c_rarg4;
4041 
4042     Label sha3_loop, rounds24_loop;
4043     Label sha3_512_or_sha3_384, shake128;
4044 
4045     __ stpd(v8, v9, __ pre(sp, -64));
4046     __ stpd(v10, v11, Address(sp, 16));
4047     __ stpd(v12, v13, Address(sp, 32));
4048     __ stpd(v14, v15, Address(sp, 48));
4049 
4050     // load state
4051     __ add(rscratch1, state, 32);
4052     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
4053     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
4054     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4055     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4056     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4057     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4058     __ ld1(v24, __ T1D, rscratch1);
4059 
4060     __ BIND(sha3_loop);
4061 
4062     // 24 keccak rounds
4063     __ movw(rscratch2, 24);
4064 
4065     // load round_constants base
4066     __ lea(rscratch1, ExternalAddress((address) round_consts));
4067 
4068     // load input
4069     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4070     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4071     __ eor(v0, __ T8B, v0, v25);
4072     __ eor(v1, __ T8B, v1, v26);
4073     __ eor(v2, __ T8B, v2, v27);
4074     __ eor(v3, __ T8B, v3, v28);
4075     __ eor(v4, __ T8B, v4, v29);
4076     __ eor(v5, __ T8B, v5, v30);
4077     __ eor(v6, __ T8B, v6, v31);
4078 
4079     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4080     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4081 
4082     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4083     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4084     __ eor(v7, __ T8B, v7, v25);
4085     __ eor(v8, __ T8B, v8, v26);
4086     __ eor(v9, __ T8B, v9, v27);
4087     __ eor(v10, __ T8B, v10, v28);
4088     __ eor(v11, __ T8B, v11, v29);
4089     __ eor(v12, __ T8B, v12, v30);
4090     __ eor(v13, __ T8B, v13, v31);
4091 
4092     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4093     __ eor(v14, __ T8B, v14, v25);
4094     __ eor(v15, __ T8B, v15, v26);
4095     __ eor(v16, __ T8B, v16, v27);
4096 
4097     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4098     __ andw(c_rarg5, block_size, 48);
4099     __ cbzw(c_rarg5, rounds24_loop);
4100 
4101     __ tbnz(block_size, 5, shake128);
4102     // block_size == 144, bit5 == 0, SHA3-224
4103     __ ldrd(v28, __ post(buf, 8));
4104     __ eor(v17, __ T8B, v17, v28);
4105     __ b(rounds24_loop);
4106 
4107     __ BIND(shake128);
4108     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4109     __ eor(v17, __ T8B, v17, v28);
4110     __ eor(v18, __ T8B, v18, v29);
4111     __ eor(v19, __ T8B, v19, v30);
4112     __ eor(v20, __ T8B, v20, v31);
4113     __ b(rounds24_loop); // block_size == 168, SHAKE128
4114 
4115     __ BIND(sha3_512_or_sha3_384);
4116     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4117     __ eor(v7, __ T8B, v7, v25);
4118     __ eor(v8, __ T8B, v8, v26);
4119     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4120 
4121     // SHA3-384
4122     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4123     __ eor(v9,  __ T8B, v9,  v27);
4124     __ eor(v10, __ T8B, v10, v28);
4125     __ eor(v11, __ T8B, v11, v29);
4126     __ eor(v12, __ T8B, v12, v30);
4127 
4128     __ BIND(rounds24_loop);
4129     __ subw(rscratch2, rscratch2, 1);
4130 
4131     __ eor3(v29, __ T16B, v4, v9, v14);
4132     __ eor3(v26, __ T16B, v1, v6, v11);
4133     __ eor3(v28, __ T16B, v3, v8, v13);
4134     __ eor3(v25, __ T16B, v0, v5, v10);
4135     __ eor3(v27, __ T16B, v2, v7, v12);
4136     __ eor3(v29, __ T16B, v29, v19, v24);
4137     __ eor3(v26, __ T16B, v26, v16, v21);
4138     __ eor3(v28, __ T16B, v28, v18, v23);
4139     __ eor3(v25, __ T16B, v25, v15, v20);
4140     __ eor3(v27, __ T16B, v27, v17, v22);
4141 
4142     __ rax1(v30, __ T2D, v29, v26);
4143     __ rax1(v26, __ T2D, v26, v28);
4144     __ rax1(v28, __ T2D, v28, v25);
4145     __ rax1(v25, __ T2D, v25, v27);
4146     __ rax1(v27, __ T2D, v27, v29);
4147 
4148     __ eor(v0, __ T16B, v0, v30);
4149     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4150     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4151     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4152     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4153     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4154     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4155     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4156     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4157     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4158     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4159     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4160     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4161     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4162     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4163     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4164     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4165     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4166     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4167     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4168     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4169     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4170     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4171     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4172     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4173 
4174     __ bcax(v20, __ T16B, v31, v22, v8);
4175     __ bcax(v21, __ T16B, v8,  v23, v22);
4176     __ bcax(v22, __ T16B, v22, v24, v23);
4177     __ bcax(v23, __ T16B, v23, v31, v24);
4178     __ bcax(v24, __ T16B, v24, v8,  v31);
4179 
4180     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4181 
4182     __ bcax(v17, __ T16B, v25, v19, v3);
4183     __ bcax(v18, __ T16B, v3,  v15, v19);
4184     __ bcax(v19, __ T16B, v19, v16, v15);
4185     __ bcax(v15, __ T16B, v15, v25, v16);
4186     __ bcax(v16, __ T16B, v16, v3,  v25);
4187 
4188     __ bcax(v10, __ T16B, v29, v12, v26);
4189     __ bcax(v11, __ T16B, v26, v13, v12);
4190     __ bcax(v12, __ T16B, v12, v14, v13);
4191     __ bcax(v13, __ T16B, v13, v29, v14);
4192     __ bcax(v14, __ T16B, v14, v26, v29);
4193 
4194     __ bcax(v7, __ T16B, v30, v9,  v4);
4195     __ bcax(v8, __ T16B, v4,  v5,  v9);
4196     __ bcax(v9, __ T16B, v9,  v6,  v5);
4197     __ bcax(v5, __ T16B, v5,  v30, v6);
4198     __ bcax(v6, __ T16B, v6,  v4,  v30);
4199 
4200     __ bcax(v3, __ T16B, v27, v0,  v28);
4201     __ bcax(v4, __ T16B, v28, v1,  v0);
4202     __ bcax(v0, __ T16B, v0,  v2,  v1);
4203     __ bcax(v1, __ T16B, v1,  v27, v2);
4204     __ bcax(v2, __ T16B, v2,  v28, v27);
4205 
4206     __ eor(v0, __ T16B, v0, v31);
4207 
4208     __ cbnzw(rscratch2, rounds24_loop);
4209 
4210     if (multi_block) {
4211       __ add(ofs, ofs, block_size);
4212       __ cmp(ofs, limit);
4213       __ br(Assembler::LE, sha3_loop);
4214       __ mov(c_rarg0, ofs); // return ofs
4215     }
4216 
4217     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4218     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4219     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4220     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4221     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4222     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4223     __ st1(v24, __ T1D, state);
4224 
4225     __ ldpd(v14, v15, Address(sp, 48));
4226     __ ldpd(v12, v13, Address(sp, 32));
4227     __ ldpd(v10, v11, Address(sp, 16));
4228     __ ldpd(v8, v9, __ post(sp, 64));
4229 
4230     __ ret(lr);
4231 
4232     return start;
4233   }
4234 
4235   /**
4236    *  Arguments:
4237    *
4238    * Inputs:
4239    *   c_rarg0   - int crc
4240    *   c_rarg1   - byte* buf
4241    *   c_rarg2   - int length
4242    *
4243    * Output:
4244    *       r0   - int crc result
4245    */
4246   address generate_updateBytesCRC32() {
4247     assert(UseCRC32Intrinsics, "what are we doing here?");
4248 
4249     __ align(CodeEntryAlignment);
4250     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4251 
4252     address start = __ pc();
4253 
4254     const Register crc   = c_rarg0;  // crc
4255     const Register buf   = c_rarg1;  // source java byte array address
4256     const Register len   = c_rarg2;  // length
4257     const Register table0 = c_rarg3; // crc_table address
4258     const Register table1 = c_rarg4;
4259     const Register table2 = c_rarg5;
4260     const Register table3 = c_rarg6;
4261     const Register tmp3 = c_rarg7;
4262 
4263     BLOCK_COMMENT("Entry:");
4264     __ enter(); // required for proper stackwalking of RuntimeStub frame
4265 
4266     __ kernel_crc32(crc, buf, len,
4267               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4268 
4269     __ leave(); // required for proper stackwalking of RuntimeStub frame
4270     __ ret(lr);
4271 
4272     return start;
4273   }
4274 
4275   // ChaCha20 block function.  This version parallelizes by loading
4276   // individual 32-bit state elements into vectors for four blocks
4277   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4278   //
4279   // state (int[16]) = c_rarg0
4280   // keystream (byte[1024]) = c_rarg1
4281   // return - number of bytes of keystream (always 256)
4282   address generate_chacha20Block_blockpar() {
4283     Label L_twoRounds, L_cc20_const;
4284     // The constant data is broken into two 128-bit segments to be loaded
4285     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4286     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4287     // The second 128 bits are a table constant used for 8-bit left rotations.
4288     __ BIND(L_cc20_const);
4289     __ emit_int64(0x0000000100000000UL);
4290     __ emit_int64(0x0000000300000002UL);
4291     __ emit_int64(0x0605040702010003UL);
4292     __ emit_int64(0x0E0D0C0F0A09080BUL);
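         // Note: the second constant above is a byte-index table; used with tbl it
         // selects bytes <3, 0, 1, 2> of each 32-bit lane, i.e. it rotates each
         // lane left by 8 bits.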
4293 
4294     __ align(CodeEntryAlignment);
4295     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4296     address start = __ pc();
4297     __ enter();
4298 
4299     int i, j;
4300     const Register state = c_rarg0;
4301     const Register keystream = c_rarg1;
4302     const Register loopCtr = r10;
4303     const Register tmpAddr = r11;
4304 
4305     const FloatRegister stateFirst = v0;
4306     const FloatRegister stateSecond = v1;
4307     const FloatRegister stateThird = v2;
4308     const FloatRegister stateFourth = v3;
4309     const FloatRegister origCtrState = v28;
4310     const FloatRegister scratch = v29;
4311     const FloatRegister lrot8Tbl = v30;
4312 
4313     // Organize SIMD registers in an array that facilitates
4314     // putting repetitive opcodes into loop structures.  It is
4315     // important that each grouping of 4 registers is monotonically
4316     // increasing to support the requirements of multi-register
4317     // instructions (e.g. ld4r, st4, etc.)
4318     const FloatRegister workSt[16] = {
4319          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4320         v20, v21, v22, v23, v24, v25, v26, v27
4321     };
4322 
4323     // Load from memory and interlace across 16 SIMD registers,
4324     // with each word from memory being broadcast to all lanes of
4325     // each successive SIMD register.
4326     //      Addr(0) -> All lanes in workSt[i]
4327     //      Addr(4) -> All lanes workSt[i + 1], etc.
4328     __ mov(tmpAddr, state);
4329     for (i = 0; i < 16; i += 4) {
4330       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4331           __ post(tmpAddr, 16));
4332     }
4333 
4334     // Pull in constant data.  The first 16 bytes are the add overlay
4335     // which is applied to the vector holding the counter (state[12]).
4336     // The second 16 bytes are the index vector for the 8-bit left
4337     // rotation tbl instruction.
4338     __ adr(tmpAddr, L_cc20_const);
4339     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4340     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4341 
4342     // Set up the 10 iteration loop and perform all 8 quarter round ops
4343     __ mov(loopCtr, 10);
4344     __ BIND(L_twoRounds);
4345 
4346     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4347         scratch, lrot8Tbl);
4348     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4349         scratch, lrot8Tbl);
4350     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4351         scratch, lrot8Tbl);
4352     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4353         scratch, lrot8Tbl);
4354 
4355     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4356         scratch, lrot8Tbl);
4357     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4358         scratch, lrot8Tbl);
4359     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4360         scratch, lrot8Tbl);
4361     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4362         scratch, lrot8Tbl);
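         // The first four quarter rounds above mix the columns of the 4x4 state;
         // the second four mix the diagonals. Ten iterations of this pair give
         // the 20 rounds of ChaCha20.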
4363 
4364     // Decrement and iterate
4365     __ sub(loopCtr, loopCtr, 1);
4366     __ cbnz(loopCtr, L_twoRounds);
4367 
4368     __ mov(tmpAddr, state);
4369 
4370     // Add the starting state back to the post-loop keystream
4371     // state.  We read/interlace the state array from memory into
4372     // 4 registers similar to what we did in the beginning.  Then
4373     // add the counter overlay onto workSt[12] at the end.
4374     for (i = 0; i < 16; i += 4) {
4375       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4376           __ post(tmpAddr, 16));
4377       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4378       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4379       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4380       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4381     }
4382     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4383 
4384     // Write to key stream, storing the same element out of workSt[0..15]
4385     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4386     // for the next element position.
4387     for (i = 0; i < 4; i++) {
4388       for (j = 0; j < 16; j += 4) {
4389         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4390             __ post(keystream, 16));
4391       }
4392     }
4393 
4394     __ mov(r0, 256);             // Return length of output keystream
4395     __ leave();
4396     __ ret(lr);
4397 
4398     return start;
4399   }
4400 
4401   /**
4402    *  Arguments:
4403    *
4404    * Inputs:
4405    *   c_rarg0   - int crc
4406    *   c_rarg1   - byte* buf
4407    *   c_rarg2   - int length
4408    *   c_rarg3   - int* table
4409    *
4410    * Output:
4411    *       r0   - int crc result
4412    */
4413   address generate_updateBytesCRC32C() {
4414     assert(UseCRC32CIntrinsics, "what are we doing here?");
4415 
4416     __ align(CodeEntryAlignment);
4417     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4418 
4419     address start = __ pc();
4420 
4421     const Register crc   = c_rarg0;  // crc
4422     const Register buf   = c_rarg1;  // source java byte array address
4423     const Register len   = c_rarg2;  // length
4424     const Register table0 = c_rarg3; // crc_table address
4425     const Register table1 = c_rarg4;
4426     const Register table2 = c_rarg5;
4427     const Register table3 = c_rarg6;
4428     const Register tmp3 = c_rarg7;
4429 
4430     BLOCK_COMMENT("Entry:");
4431     __ enter(); // required for proper stackwalking of RuntimeStub frame
4432 
4433     __ kernel_crc32c(crc, buf, len,
4434               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4435 
4436     __ leave(); // required for proper stackwalking of RuntimeStub frame
4437     __ ret(lr);
4438 
4439     return start;
4440   }
4441 
4442   /***
4443    *  Arguments:
4444    *
4445    *  Inputs:
4446    *   c_rarg0   - int   adler
4447    *   c_rarg1   - byte* buff
4448    *   c_rarg2   - int   len
4449    *
4450    * Output:
4451    *   c_rarg0   - int adler result
4452    */
4453   address generate_updateBytesAdler32() {
4454     __ align(CodeEntryAlignment);
4455     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4456     address start = __ pc();
4457 
4458     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4459 
4460     // Aliases
4461     Register adler  = c_rarg0;
4462     Register s1     = c_rarg0;
4463     Register s2     = c_rarg3;
4464     Register buff   = c_rarg1;
4465     Register len    = c_rarg2;
4466     Register nmax  = r4;
4467     Register base  = r5;
4468     Register count = r6;
4469     Register temp0 = rscratch1;
4470     Register temp1 = rscratch2;
4471     FloatRegister vbytes = v0;
4472     FloatRegister vs1acc = v1;
4473     FloatRegister vs2acc = v2;
4474     FloatRegister vtable = v3;
4475 
4476     // Max number of bytes we can process before having to take the mod
4477     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4478     uint64_t BASE = 0xfff1;
4479     uint64_t NMAX = 0x15B0;
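         // BASE = 0xfff1 = 65521, the largest prime below 2^16 and the Adler-32 modulus.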
4480 
4481     __ mov(base, BASE);
4482     __ mov(nmax, NMAX);
4483 
4484     // Load accumulation coefficients for the upper 16 bits
4485     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4486     __ ld1(vtable, __ T16B, Address(temp0));
4487 
4488     // s1 is initialized to the lower 16 bits of adler
4489     // s2 is initialized to the upper 16 bits of adler
4490     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4491     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4492 
4493     // The pipelined loop needs at least 16 elements for 1 iteration.
4494     // It checks this itself, but it is more efficient to skip straight to the cleanup loop.
4495     __ cmp(len, (u1)16);
4496     __ br(Assembler::HS, L_nmax);
4497     __ cbz(len, L_combine);
4498 
4499     __ bind(L_simple_by1_loop);
4500     __ ldrb(temp0, Address(__ post(buff, 1)));
4501     __ add(s1, s1, temp0);
4502     __ add(s2, s2, s1);
4503     __ subs(len, len, 1);
4504     __ br(Assembler::HI, L_simple_by1_loop);
4505 
4506     // s1 = s1 % BASE
4507     __ subs(temp0, s1, base);
4508     __ csel(s1, temp0, s1, Assembler::HS);
4509 
4510     // s2 = s2 % BASE
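         // Reduce modulo BASE by folding: since 2^16 == 15 (mod 65521),
         // x mod BASE can be computed from (x >> 16) * 15 + (x & 0xffff),
         // with a conditional subtract of BASE to finish the reduction.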
4511     __ lsr(temp0, s2, 16);
4512     __ lsl(temp1, temp0, 4);
4513     __ sub(temp1, temp1, temp0);
4514     __ add(s2, temp1, s2, ext::uxth);
4515 
4516     __ subs(temp0, s2, base);
4517     __ csel(s2, temp0, s2, Assembler::HS);
4518 
4519     __ b(L_combine);
4520 
4521     __ bind(L_nmax);
4522     __ subs(len, len, nmax);
4523     __ sub(count, nmax, 16);
4524     __ br(Assembler::LO, L_by16);
4525 
4526     __ bind(L_nmax_loop);
4527 
4528     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4529                                       vbytes, vs1acc, vs2acc, vtable);
4530 
4531     __ subs(count, count, 16);
4532     __ br(Assembler::HS, L_nmax_loop);
4533 
4534     // s1 = s1 % BASE
4535     __ lsr(temp0, s1, 16);
4536     __ lsl(temp1, temp0, 4);
4537     __ sub(temp1, temp1, temp0);
4538     __ add(temp1, temp1, s1, ext::uxth);
4539 
4540     __ lsr(temp0, temp1, 16);
4541     __ lsl(s1, temp0, 4);
4542     __ sub(s1, s1, temp0);
4543     __ add(s1, s1, temp1, ext::uxth);
4544 
4545     __ subs(temp0, s1, base);
4546     __ csel(s1, temp0, s1, Assembler::HS);
4547 
4548     // s2 = s2 % BASE
4549     __ lsr(temp0, s2, 16);
4550     __ lsl(temp1, temp0, 4);
4551     __ sub(temp1, temp1, temp0);
4552     __ add(temp1, temp1, s2, ext::uxth);
4553 
4554     __ lsr(temp0, temp1, 16);
4555     __ lsl(s2, temp0, 4);
4556     __ sub(s2, s2, temp0);
4557     __ add(s2, s2, temp1, ext::uxth);
4558 
4559     __ subs(temp0, s2, base);
4560     __ csel(s2, temp0, s2, Assembler::HS);
4561 
4562     __ subs(len, len, nmax);
4563     __ sub(count, nmax, 16);
4564     __ br(Assembler::HS, L_nmax_loop);
4565 
4566     __ bind(L_by16);
4567     __ adds(len, len, count);
4568     __ br(Assembler::LO, L_by1);
4569 
4570     __ bind(L_by16_loop);
4571 
4572     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4573                                       vbytes, vs1acc, vs2acc, vtable);
4574 
4575     __ subs(len, len, 16);
4576     __ br(Assembler::HS, L_by16_loop);
4577 
4578     __ bind(L_by1);
4579     __ adds(len, len, 15);
4580     __ br(Assembler::LO, L_do_mod);
4581 
4582     __ bind(L_by1_loop);
4583     __ ldrb(temp0, Address(__ post(buff, 1)));
4584     __ add(s1, temp0, s1);
4585     __ add(s2, s2, s1);
4586     __ subs(len, len, 1);
4587     __ br(Assembler::HS, L_by1_loop);
4588 
4589     __ bind(L_do_mod);
4590     // s1 = s1 % BASE
4591     __ lsr(temp0, s1, 16);
4592     __ lsl(temp1, temp0, 4);
4593     __ sub(temp1, temp1, temp0);
4594     __ add(temp1, temp1, s1, ext::uxth);
4595 
4596     __ lsr(temp0, temp1, 16);
4597     __ lsl(s1, temp0, 4);
4598     __ sub(s1, s1, temp0);
4599     __ add(s1, s1, temp1, ext::uxth);
4600 
4601     __ subs(temp0, s1, base);
4602     __ csel(s1, temp0, s1, Assembler::HS);
4603 
4604     // s2 = s2 % BASE
4605     __ lsr(temp0, s2, 16);
4606     __ lsl(temp1, temp0, 4);
4607     __ sub(temp1, temp1, temp0);
4608     __ add(temp1, temp1, s2, ext::uxth);
4609 
4610     __ lsr(temp0, temp1, 16);
4611     __ lsl(s2, temp0, 4);
4612     __ sub(s2, s2, temp0);
4613     __ add(s2, s2, temp1, ext::uxth);
4614 
4615     __ subs(temp0, s2, base);
4616     __ csel(s2, temp0, s2, Assembler::HS);
4617 
4618     // Combine lower bits and higher bits
4619     __ bind(L_combine);
4620     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4621 
4622     __ ret(lr);
4623 
4624     return start;
4625   }
4626 
4627   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4628           Register temp0, Register temp1, FloatRegister vbytes,
4629           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4630     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4631     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4632     // In non-vectorized code, we update s1 and s2 as:
4633     //   s1 <- s1 + b1
4634     //   s2 <- s2 + s1
4635     //   s1 <- s1 + b2
4636     //   s2 <- s2 + s1
4637     //   ...
4638     //   s1 <- s1 + b16
4639     //   s2 <- s2 + s1
4640     // Putting above assignments together, we have:
4641     //   s1_new = s1 + b1 + b2 + ... + b16
4642     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4643     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4644     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4645     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4646 
4647     // s2 = s2 + s1 * 16
4648     __ add(s2, s2, s1, Assembler::LSL, 4);
4649 
4650     // vs1acc = b1 + b2 + b3 + ... + b16
4651     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4652     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4653     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4654     __ uaddlv(vs1acc, __ T16B, vbytes);
4655     __ uaddlv(vs2acc, __ T8H, vs2acc);
4656 
4657     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4658     __ fmovd(temp0, vs1acc);
4659     __ fmovd(temp1, vs2acc);
4660     __ add(s1, s1, temp0);
4661     __ add(s2, s2, temp1);
4662   }
4663 
4664   /**
4665    *  Arguments:
4666    *
4667    *  Input:
4668    *    c_rarg0   - x address
4669    *    c_rarg1   - x length
4670    *    c_rarg2   - y address
4671    *    c_rarg3   - y length
4672    *    c_rarg4   - z address
4673    */
4674   address generate_multiplyToLen() {
4675     __ align(CodeEntryAlignment);
4676     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4677 
4678     address start = __ pc();
4679     const Register x     = r0;
4680     const Register xlen  = r1;
4681     const Register y     = r2;
4682     const Register ylen  = r3;
4683     const Register z     = r4;
4684 
4685     const Register tmp0  = r5;
4686     const Register tmp1  = r10;
4687     const Register tmp2  = r11;
4688     const Register tmp3  = r12;
4689     const Register tmp4  = r13;
4690     const Register tmp5  = r14;
4691     const Register tmp6  = r15;
4692     const Register tmp7  = r16;
4693 
4694     BLOCK_COMMENT("Entry:");
4695     __ enter(); // required for proper stackwalking of RuntimeStub frame
4696     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4697     __ leave(); // required for proper stackwalking of RuntimeStub frame
4698     __ ret(lr);
4699 
4700     return start;
4701   }
4702 
4703   address generate_squareToLen() {
4704     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
4705     // faster than multiply_to_len on some CPUs and slower on others, but
4706     // multiply_to_len shows slightly better overall results
4707     __ align(CodeEntryAlignment);
4708     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4709     address start = __ pc();
4710 
4711     const Register x     = r0;
4712     const Register xlen  = r1;
4713     const Register z     = r2;
4714     const Register y     = r4; // == x
4715     const Register ylen  = r5; // == xlen
4716 
4717     const Register tmp0  = r3;
4718     const Register tmp1  = r10;
4719     const Register tmp2  = r11;
4720     const Register tmp3  = r12;
4721     const Register tmp4  = r13;
4722     const Register tmp5  = r14;
4723     const Register tmp6  = r15;
4724     const Register tmp7  = r16;
4725 
4726     RegSet spilled_regs = RegSet::of(y, ylen);
4727     BLOCK_COMMENT("Entry:");
4728     __ enter();
4729     __ push(spilled_regs, sp);
4730     __ mov(y, x);
4731     __ mov(ylen, xlen);
4732     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4733     __ pop(spilled_regs, sp);
4734     __ leave();
4735     __ ret(lr);
4736     return start;
4737   }
4738 
4739   address generate_mulAdd() {
4740     __ align(CodeEntryAlignment);
4741     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4742 
4743     address start = __ pc();
4744 
4745     const Register out     = r0;
4746     const Register in      = r1;
4747     const Register offset  = r2;
4748     const Register len     = r3;
4749     const Register k       = r4;
4750 
4751     BLOCK_COMMENT("Entry:");
4752     __ enter();
4753     __ mul_add(out, in, offset, len, k);
4754     __ leave();
4755     __ ret(lr);
4756 
4757     return start;
4758   }
4759 
4760   // Arguments:
4761   //
4762   // Input:
4763   //   c_rarg0   - newArr address
4764   //   c_rarg1   - oldArr address
4765   //   c_rarg2   - newIdx
4766   //   c_rarg3   - shiftCount
4767   //   c_rarg4   - numIter
4768   //
4769   address generate_bigIntegerRightShift() {
4770     __ align(CodeEntryAlignment);
4771     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4772     address start = __ pc();
4773 
4774     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4775 
4776     Register newArr        = c_rarg0;
4777     Register oldArr        = c_rarg1;
4778     Register newIdx        = c_rarg2;
4779     Register shiftCount    = c_rarg3;
4780     Register numIter       = c_rarg4;
4781     Register idx           = numIter;
4782 
4783     Register newArrCur     = rscratch1;
4784     Register shiftRevCount = rscratch2;
4785     Register oldArrCur     = r13;
4786     Register oldArrNext    = r14;
4787 
4788     FloatRegister oldElem0        = v0;
4789     FloatRegister oldElem1        = v1;
4790     FloatRegister newElem         = v2;
4791     FloatRegister shiftVCount     = v3;
4792     FloatRegister shiftVRevCount  = v4;
4793 
4794     __ cbz(idx, Exit);
4795 
4796     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4797 
4798     // left shift count
4799     __ movw(shiftRevCount, 32);
4800     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4801 
4802     // numIter is too small to allow a 4-word SIMD loop; fall back to scalar code
4803     __ cmp(numIter, (u1)4);
4804     __ br(Assembler::LT, ShiftThree);
4805 
4806     __ dup(shiftVCount,    __ T4S, shiftCount);
4807     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4808     __ negr(shiftVCount,   __ T4S, shiftVCount);
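         // ushl shifts each lane left by a signed per-lane amount, so negating the
         // count turns it into a logical right shift.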
4809 
4810     __ BIND(ShiftSIMDLoop);
4811 
4812     // Calculate the load addresses
4813     __ sub(idx, idx, 4);
4814     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4815     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4816     __ add(oldArrCur,  oldArrNext, 4);
4817 
4818     // Load 4 words and process
4819     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4820     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4821     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4822     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4823     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4824     __ st1(newElem,   __ T4S,  Address(newArrCur));
4825 
4826     __ cmp(idx, (u1)4);
4827     __ br(Assembler::LT, ShiftTwoLoop);
4828     __ b(ShiftSIMDLoop);
4829 
4830     __ BIND(ShiftTwoLoop);
4831     __ cbz(idx, Exit);
4832     __ cmp(idx, (u1)1);
4833     __ br(Assembler::EQ, ShiftOne);
4834 
4835     // Calculate the load addresses
4836     __ sub(idx, idx, 2);
4837     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4838     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4839     __ add(oldArrCur,  oldArrNext, 4);
4840 
4841     // Load 2 words and process
4842     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4843     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4844     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4845     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4846     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4847     __ st1(newElem,   __ T2S, Address(newArrCur));
4848     __ b(ShiftTwoLoop);
4849 
4850     __ BIND(ShiftThree);
4851     __ tbz(idx, 1, ShiftOne);
4852     __ tbz(idx, 0, ShiftTwo);
4853     __ ldrw(r10,  Address(oldArr, 12));
4854     __ ldrw(r11,  Address(oldArr, 8));
4855     __ lsrvw(r10, r10, shiftCount);
4856     __ lslvw(r11, r11, shiftRevCount);
4857     __ orrw(r12,  r10, r11);
4858     __ strw(r12,  Address(newArr, 8));
4859 
4860     __ BIND(ShiftTwo);
4861     __ ldrw(r10,  Address(oldArr, 8));
4862     __ ldrw(r11,  Address(oldArr, 4));
4863     __ lsrvw(r10, r10, shiftCount);
4864     __ lslvw(r11, r11, shiftRevCount);
4865     __ orrw(r12,  r10, r11);
4866     __ strw(r12,  Address(newArr, 4));
4867 
4868     __ BIND(ShiftOne);
4869     __ ldrw(r10,  Address(oldArr, 4));
4870     __ ldrw(r11,  Address(oldArr));
4871     __ lsrvw(r10, r10, shiftCount);
4872     __ lslvw(r11, r11, shiftRevCount);
4873     __ orrw(r12,  r10, r11);
4874     __ strw(r12,  Address(newArr));
4875 
4876     __ BIND(Exit);
4877     __ ret(lr);
4878 
4879     return start;
4880   }
4881 
4882   // Arguments:
4883   //
4884   // Input:
4885   //   c_rarg0   - newArr address
4886   //   c_rarg1   - oldArr address
4887   //   c_rarg2   - newIdx
4888   //   c_rarg3   - shiftCount
4889   //   c_rarg4   - numIter
4890   //
4891   address generate_bigIntegerLeftShift() {
4892     __ align(CodeEntryAlignment);
4893     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4894     address start = __ pc();
4895 
4896     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4897 
4898     Register newArr        = c_rarg0;
4899     Register oldArr        = c_rarg1;
4900     Register newIdx        = c_rarg2;
4901     Register shiftCount    = c_rarg3;
4902     Register numIter       = c_rarg4;
4903 
4904     Register shiftRevCount = rscratch1;
4905     Register oldArrNext    = rscratch2;
4906 
4907     FloatRegister oldElem0        = v0;
4908     FloatRegister oldElem1        = v1;
4909     FloatRegister newElem         = v2;
4910     FloatRegister shiftVCount     = v3;
4911     FloatRegister shiftVRevCount  = v4;
4912 
4913     __ cbz(numIter, Exit);
4914 
4915     __ add(oldArrNext, oldArr, 4);
4916     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4917 
4918     // right shift count
4919     __ movw(shiftRevCount, 32);
4920     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4921 
4922     // numIter is too small to allow a 4-word SIMD loop; fall back to scalar code
4923     __ cmp(numIter, (u1)4);
4924     __ br(Assembler::LT, ShiftThree);
4925 
4926     __ dup(shiftVCount,     __ T4S, shiftCount);
4927     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4928     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4929 
4930     __ BIND(ShiftSIMDLoop);
4931 
4932     // load 4 words and process
4933     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4934     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4935     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4936     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4937     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4938     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4939     __ sub(numIter,   numIter, 4);
4940 
4941     __ cmp(numIter, (u1)4);
4942     __ br(Assembler::LT, ShiftTwoLoop);
4943     __ b(ShiftSIMDLoop);
4944 
4945     __ BIND(ShiftTwoLoop);
4946     __ cbz(numIter, Exit);
4947     __ cmp(numIter, (u1)1);
4948     __ br(Assembler::EQ, ShiftOne);
4949 
4950     // load 2 words and process
4951     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4952     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4953     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4954     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4955     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4956     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4957     __ sub(numIter,   numIter, 2);
4958     __ b(ShiftTwoLoop);
4959 
4960     __ BIND(ShiftThree);
4961     __ ldrw(r10,  __ post(oldArr, 4));
4962     __ ldrw(r11,  __ post(oldArrNext, 4));
4963     __ lslvw(r10, r10, shiftCount);
4964     __ lsrvw(r11, r11, shiftRevCount);
4965     __ orrw(r12,  r10, r11);
4966     __ strw(r12,  __ post(newArr, 4));
4967     __ tbz(numIter, 1, Exit);
4968     __ tbz(numIter, 0, ShiftOne);
4969 
4970     __ BIND(ShiftTwo);
4971     __ ldrw(r10,  __ post(oldArr, 4));
4972     __ ldrw(r11,  __ post(oldArrNext, 4));
4973     __ lslvw(r10, r10, shiftCount);
4974     __ lsrvw(r11, r11, shiftRevCount);
4975     __ orrw(r12,  r10, r11);
4976     __ strw(r12,  __ post(newArr, 4));
4977 
4978     __ BIND(ShiftOne);
4979     __ ldrw(r10,  Address(oldArr));
4980     __ ldrw(r11,  Address(oldArrNext));
4981     __ lslvw(r10, r10, shiftCount);
4982     __ lsrvw(r11, r11, shiftRevCount);
4983     __ orrw(r12,  r10, r11);
4984     __ strw(r12,  Address(newArr));
4985 
4986     __ BIND(Exit);
4987     __ ret(lr);
4988 
4989     return start;
4990   }
4991 
4992   address generate_count_positives(address &count_positives_long) {
4993     const u1 large_loop_size = 64;
4994     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
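         // 0x80 in every byte lane: testing 8 bytes at a time against this mask
         // detects any byte with its sign bit set (i.e. any negative byte).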
4995     int dcache_line = VM_Version::dcache_line_size();
4996 
4997     Register ary1 = r1, len = r2, result = r0;
4998 
4999     __ align(CodeEntryAlignment);
5000 
5001     StubCodeMark mark(this, "StubRoutines", "count_positives");
5002 
5003     address entry = __ pc();
5004 
5005     __ enter();
5006     // precondition: a copy of len is already in result
5007     // __ mov(result, len);
5008 
5009   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
5010         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
5011 
5012   __ cmp(len, (u1)15);
5013   __ br(Assembler::GT, LEN_OVER_15);
5014   // The only case when execution falls into this code is when the pointer is near
5015   // the end of a memory page and we have to avoid reading the next page
5016   __ add(ary1, ary1, len);
5017   __ subs(len, len, 8);
5018   __ br(Assembler::GT, LEN_OVER_8);
5019   __ ldr(rscratch2, Address(ary1, -8));
5020   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
5021   __ lsrv(rscratch2, rscratch2, rscratch1);
5022   __ tst(rscratch2, UPPER_BIT_MASK);
5023   __ csel(result, zr, result, Assembler::NE);
5024   __ leave();
5025   __ ret(lr);
5026   __ bind(LEN_OVER_8);
5027   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
5028   __ sub(len, len, 8); // no data dependency, so the sub can execute while loading
5029   __ tst(rscratch2, UPPER_BIT_MASK);
5030   __ br(Assembler::NE, RET_NO_POP);
5031   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
5032   __ lsrv(rscratch1, rscratch1, rscratch2);
5033   __ tst(rscratch1, UPPER_BIT_MASK);
5034   __ bind(RET_NO_POP);
5035   __ csel(result, zr, result, Assembler::NE);
5036   __ leave();
5037   __ ret(lr);
5038 
5039   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
5040   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
5041 
5042   count_positives_long = __ pc(); // 2nd entry point
5043 
5044   __ enter();
5045 
5046   __ bind(LEN_OVER_15);
5047     __ push(spilled_regs, sp);
5048     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
5049     __ cbz(rscratch2, ALIGNED);
5050     __ ldp(tmp6, tmp1, Address(ary1));
5051     __ mov(tmp5, 16);
5052     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
5053     __ add(ary1, ary1, rscratch1);
5054     __ orr(tmp6, tmp6, tmp1);
5055     __ tst(tmp6, UPPER_BIT_MASK);
5056     __ br(Assembler::NE, RET_ADJUST);
5057     __ sub(len, len, rscratch1);
5058 
5059   __ bind(ALIGNED);
5060     __ cmp(len, large_loop_size);
5061     __ br(Assembler::LT, CHECK_16);
5062     // Perform a 16-byte load as an early return in the pre-loop to handle the
5063     // situation where an initially aligned large array has negative values at its
5064     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 (in the
5065     // worst case), which is slower. Cases with negative bytes further ahead won't
5066     // be affected much. In fact, they'll be faster due to the early loads and the
5067     // fewer instructions and branches in LARGE_LOOP.
5068     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5069     __ sub(len, len, 16);
5070     __ orr(tmp6, tmp6, tmp1);
5071     __ tst(tmp6, UPPER_BIT_MASK);
5072     __ br(Assembler::NE, RET_ADJUST_16);
5073     __ cmp(len, large_loop_size);
5074     __ br(Assembler::LT, CHECK_16);
5075 
5076     if (SoftwarePrefetchHintDistance >= 0
5077         && SoftwarePrefetchHintDistance >= dcache_line) {
5078       // initial prefetch
5079       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5080     }
5081   __ bind(LARGE_LOOP);
5082     if (SoftwarePrefetchHintDistance >= 0) {
5083       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5084     }
5085     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
5086     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp),
5087     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
5088     // 3 instructions and has fewer branches. However, this approach disables the
5089     // early return, so all 64 bytes are loaded and checked every time.
5090     __ ldp(tmp2, tmp3, Address(ary1));
5091     __ ldp(tmp4, tmp5, Address(ary1, 16));
5092     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5093     __ ldp(tmp6, tmp1, Address(ary1, 48));
5094     __ add(ary1, ary1, large_loop_size);
5095     __ sub(len, len, large_loop_size);
5096     __ orr(tmp2, tmp2, tmp3);
5097     __ orr(tmp4, tmp4, tmp5);
5098     __ orr(rscratch1, rscratch1, rscratch2);
5099     __ orr(tmp6, tmp6, tmp1);
5100     __ orr(tmp2, tmp2, tmp4);
5101     __ orr(rscratch1, rscratch1, tmp6);
5102     __ orr(tmp2, tmp2, rscratch1);
5103     __ tst(tmp2, UPPER_BIT_MASK);
5104     __ br(Assembler::NE, RET_ADJUST_LONG);
5105     __ cmp(len, large_loop_size);
5106     __ br(Assembler::GE, LARGE_LOOP);
5107 
5108   __ bind(CHECK_16); // small 16-byte load pre-loop
5109     __ cmp(len, (u1)16);
5110     __ br(Assembler::LT, POST_LOOP16);
5111 
5112   __ bind(LOOP16); // small 16-byte load loop
5113     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5114     __ sub(len, len, 16);
5115     __ orr(tmp2, tmp2, tmp3);
5116     __ tst(tmp2, UPPER_BIT_MASK);
5117     __ br(Assembler::NE, RET_ADJUST_16);
5118     __ cmp(len, (u1)16);
5119     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5120 
5121   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5122     __ cmp(len, (u1)8);
5123     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5124     __ ldr(tmp3, Address(__ post(ary1, 8)));
5125     __ tst(tmp3, UPPER_BIT_MASK);
5126     __ br(Assembler::NE, RET_ADJUST);
5127     __ sub(len, len, 8);
5128 
5129   __ bind(POST_LOOP16_LOAD_TAIL);
5130     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5131     __ ldr(tmp1, Address(ary1));
5132     __ mov(tmp2, 64);
5133     __ sub(tmp4, tmp2, len, __ LSL, 3);
5134     __ lslv(tmp1, tmp1, tmp4);
5135     __ tst(tmp1, UPPER_BIT_MASK);
5136     __ br(Assembler::NE, RET_ADJUST);
5137     // Fallthrough
5138 
5139   __ bind(RET_LEN);
5140     __ pop(spilled_regs, sp);
5141     __ leave();
5142     __ ret(lr);
5143 
5144     // The difference (result - len) is the count of bytes guaranteed
5145     // to be positive
5146 
5147   __ bind(RET_ADJUST_LONG);
5148     __ add(len, len, (u1)(large_loop_size - 16));
5149   __ bind(RET_ADJUST_16);
5150     __ add(len, len, 16);
5151   __ bind(RET_ADJUST);
5152     __ pop(spilled_regs, sp);
5153     __ leave();
5154     __ sub(result, result, len);
5155     __ ret(lr);
5156 
5157     return entry;
5158   }
5159 
5160   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5161         bool usePrefetch, Label &NOT_EQUAL) {
5162     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5163         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5164         tmp7 = r12, tmp8 = r13;
5165     Label LOOP;
5166 
5167     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5168     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5169     __ bind(LOOP);
5170     if (usePrefetch) {
5171       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5172       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5173     }
5174     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5175     __ eor(tmp1, tmp1, tmp2);
5176     __ eor(tmp3, tmp3, tmp4);
5177     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5178     __ orr(tmp1, tmp1, tmp3);
5179     __ cbnz(tmp1, NOT_EQUAL);
5180     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5181     __ eor(tmp5, tmp5, tmp6);
5182     __ eor(tmp7, tmp7, tmp8);
5183     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5184     __ orr(tmp5, tmp5, tmp7);
5185     __ cbnz(tmp5, NOT_EQUAL);
5186     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5187     __ eor(tmp1, tmp1, tmp2);
5188     __ eor(tmp3, tmp3, tmp4);
5189     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5190     __ orr(tmp1, tmp1, tmp3);
5191     __ cbnz(tmp1, NOT_EQUAL);
5192     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5193     __ eor(tmp5, tmp5, tmp6);
5194     __ sub(cnt1, cnt1, 8 * wordSize);
5195     __ eor(tmp7, tmp7, tmp8);
5196     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5197     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5198     // cmp) because subs allows an unlimited range of immediate operands.
5199     __ subs(tmp6, cnt1, loopThreshold);
5200     __ orr(tmp5, tmp5, tmp7);
5201     __ cbnz(tmp5, NOT_EQUAL);
5202     __ br(__ GE, LOOP);
5203     // post-loop
5204     __ eor(tmp1, tmp1, tmp2);
5205     __ eor(tmp3, tmp3, tmp4);
5206     __ orr(tmp1, tmp1, tmp3);
5207     __ sub(cnt1, cnt1, 2 * wordSize);
5208     __ cbnz(tmp1, NOT_EQUAL);
5209   }
5210 
5211   void generate_large_array_equals_loop_simd(int loopThreshold,
5212         bool usePrefetch, Label &NOT_EQUAL) {
5213     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5214         tmp2 = rscratch2;
5215     Label LOOP;
5216 
5217     __ bind(LOOP);
5218     if (usePrefetch) {
5219       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5220       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5221     }
5222     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5223     __ sub(cnt1, cnt1, 8 * wordSize);
5224     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5225     __ subs(tmp1, cnt1, loopThreshold);
5226     __ eor(v0, __ T16B, v0, v4);
5227     __ eor(v1, __ T16B, v1, v5);
5228     __ eor(v2, __ T16B, v2, v6);
5229     __ eor(v3, __ T16B, v3, v7);
5230     __ orr(v0, __ T16B, v0, v1);
5231     __ orr(v1, __ T16B, v2, v3);
5232     __ orr(v0, __ T16B, v0, v1);
5233     __ umov(tmp1, v0, __ D, 0);
5234     __ umov(tmp2, v0, __ D, 1);
5235     __ orr(tmp1, tmp1, tmp2);
5236     __ cbnz(tmp1, NOT_EQUAL);
5237     __ br(__ GE, LOOP);
5238   }
5239 
5240   // a1 = r1 - array1 address
5241   // a2 = r2 - array2 address
5242   // result = r0 - return value. Already contains "false"
5243   // cnt1 = r10 - number of elements left to check, reduced by wordSize
5244   // r3-r5 are reserved temporary registers
5245   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
5246   address generate_large_array_equals() {
5247     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5248         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5249         tmp7 = r12, tmp8 = r13;
5250     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5251         SMALL_LOOP, POST_LOOP;
5252     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5253     // calculate if at least 32 prefetched bytes are used
5254     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5255     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5256     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5257     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5258         tmp5, tmp6, tmp7, tmp8);
5259 
5260     __ align(CodeEntryAlignment);
5261 
5262     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5263 
5264     address entry = __ pc();
5265     __ enter();
5266     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5267     // also advance pointers to use post-increment instead of pre-increment
5268     __ add(a1, a1, wordSize);
5269     __ add(a2, a2, wordSize);
5270     if (AvoidUnalignedAccesses) {
5271       // Both implementations (SIMD/non-SIMD) use relatively large load
5272       // instructions (ld1/ldp), which have a huge penalty (up to 2x execution time)
5273       // on some CPUs when the address is not at least 16-byte aligned.
5274       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte load
5275       // if needed, at least for the 1st address, to make it 16-byte aligned.
5276       Label ALIGNED16;
5277       __ tbz(a1, 3, ALIGNED16);
5278       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5279       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5280       __ sub(cnt1, cnt1, wordSize);
5281       __ eor(tmp1, tmp1, tmp2);
5282       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5283       __ bind(ALIGNED16);
5284     }
5285     if (UseSIMDForArrayEquals) {
5286       if (SoftwarePrefetchHintDistance >= 0) {
5287         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5288         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5289         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5290             /* prfm = */ true, NOT_EQUAL);
5291         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5292         __ br(__ LT, TAIL);
5293       }
5294       __ bind(NO_PREFETCH_LARGE_LOOP);
5295       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5296           /* prfm = */ false, NOT_EQUAL);
5297     } else {
5298       __ push(spilled_regs, sp);
5299       if (SoftwarePrefetchHintDistance >= 0) {
5300         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5301         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5302         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5303             /* prfm = */ true, NOT_EQUAL);
5304         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5305         __ br(__ LT, TAIL);
5306       }
5307       __ bind(NO_PREFETCH_LARGE_LOOP);
5308       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5309           /* prfm = */ false, NOT_EQUAL);
5310     }
5311     __ bind(TAIL);
5312       __ cbz(cnt1, EQUAL);
5313       __ subs(cnt1, cnt1, wordSize);
5314       __ br(__ LE, POST_LOOP);
5315     __ bind(SMALL_LOOP);
5316       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5317       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5318       __ subs(cnt1, cnt1, wordSize);
5319       __ eor(tmp1, tmp1, tmp2);
5320       __ cbnz(tmp1, NOT_EQUAL);
5321       __ br(__ GT, SMALL_LOOP);
5322     __ bind(POST_LOOP);
5323       __ ldr(tmp1, Address(a1, cnt1));
5324       __ ldr(tmp2, Address(a2, cnt1));
5325       __ eor(tmp1, tmp1, tmp2);
5326       __ cbnz(tmp1, NOT_EQUAL);
5327     __ bind(EQUAL);
5328       __ mov(result, true);
5329     __ bind(NOT_EQUAL);
5330       if (!UseSIMDForArrayEquals) {
5331         __ pop(spilled_regs, sp);
5332       }
5333     __ bind(NOT_EQUAL_NO_POP);
5334     __ leave();
5335     __ ret(lr);
5336     return entry;
5337   }
5338 
5339   // result = r0 - return value. Contains initial hashcode value on entry.
5340   // ary = r1 - array address
5341   // cnt = r2 - elements count
5342   // Clobbers: v0-v13, rscratch1, rscratch2
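       // Computes the same value as the scalar loop "result = 31 * result + a[i]"
       // over the array, starting from the incoming result, vectorized with
       // vectorization factor vf and unrolled by 4.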
5343   address generate_large_arrays_hashcode(BasicType eltype) {
5344     const Register result = r0, ary = r1, cnt = r2;
5345     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
5346     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
5347     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
5348     const FloatRegister vpowm = v13;
5349 
5350     ARRAYS_HASHCODE_REGISTERS;
5351 
5352     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
5353 
5354     unsigned int vf; // vectorization factor
5355     bool multiply_by_halves;
5356     Assembler::SIMD_Arrangement load_arrangement;
5357     switch (eltype) {
5358     case T_BOOLEAN:
5359     case T_BYTE:
5360       load_arrangement = Assembler::T8B;
5361       multiply_by_halves = true;
5362       vf = 8;
5363       break;
5364     case T_CHAR:
5365     case T_SHORT:
5366       load_arrangement = Assembler::T8H;
5367       multiply_by_halves = true;
5368       vf = 8;
5369       break;
5370     case T_INT:
5371       load_arrangement = Assembler::T4S;
5372       multiply_by_halves = false;
5373       vf = 4;
5374       break;
5375     default:
5376       ShouldNotReachHere();
5377     }
5378 
5379     // Unroll factor
5380     const unsigned uf = 4;
5381 
5382     // Effective vectorization factor
5383     const unsigned evf = vf * uf;
5384 
5385     __ align(CodeEntryAlignment);
5386 
5387     const char *mark_name = "";
5388     switch (eltype) {
5389     case T_BOOLEAN:
5390       mark_name = "_large_arrays_hashcode_boolean";
5391       break;
5392     case T_BYTE:
5393       mark_name = "_large_arrays_hashcode_byte";
5394       break;
5395     case T_CHAR:
5396       mark_name = "_large_arrays_hashcode_char";
5397       break;
5398     case T_SHORT:
5399       mark_name = "_large_arrays_hashcode_short";
5400       break;
5401     case T_INT:
5402       mark_name = "_large_arrays_hashcode_int";
5403       break;
5404     default:
5405       mark_name = "_large_arrays_hashcode_incorrect_type";
5406       __ should_not_reach_here();
5407     };
5408 
5409     StubCodeMark mark(this, "StubRoutines", mark_name);
5410 
5411     address entry = __ pc();
5412     __ enter();
5413 
5414     // Put the 0th to 3rd powers of 31 into a single SIMD register. The register will be used in
5415     // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
5416     // value shouldn't change throughout both loops.
5417     __ movw(rscratch1, intpow(31U, 3));
5418     __ mov(vpow, Assembler::S, 0, rscratch1);
5419     __ movw(rscratch1, intpow(31U, 2));
5420     __ mov(vpow, Assembler::S, 1, rscratch1);
5421     __ movw(rscratch1, intpow(31U, 1));
5422     __ mov(vpow, Assembler::S, 2, rscratch1);
5423     __ movw(rscratch1, intpow(31U, 0));
5424     __ mov(vpow, Assembler::S, 3, rscratch1);
5425 
5426     __ mov(vmul0, Assembler::T16B, 0);
5427     __ mov(vmul0, Assembler::S, 3, result);
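         // vmul0 now holds <0, 0, 0, result>. In effect, each loop iteration scales
         // all four lanes by a power of 31 and accumulates vf new elements, so the
         // lanes carry partial hashes of interleaved element positions; the epilogues
         // recombine them by scaling with vpow = <31^3, 31^2, 31, 1> and summing
         // across lanes.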
5428 
5429     __ andr(rscratch2, cnt, (uf - 1) * vf);
5430     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
5431 
5432     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
5433     __ mov(vpowm, Assembler::S, 0, rscratch1);
5434 
5435     // SMALL LOOP
5436     __ bind(SMALL_LOOP);
5437 
5438     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
5439     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5440     __ subsw(rscratch2, rscratch2, vf);
5441 
5442     if (load_arrangement == Assembler::T8B) {
5443       // Extend 8B to 8H to be able to use vector multiply
5444       // instructions
5445       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
5446       if (is_signed_subword_type(eltype)) {
5447         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5448       } else {
5449         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5450       }
5451     }
5452 
5453     switch (load_arrangement) {
5454     case Assembler::T4S:
5455       __ addv(vmul0, load_arrangement, vmul0, vdata0);
5456       break;
5457     case Assembler::T8B:
5458     case Assembler::T8H:
5459       assert(is_subword_type(eltype), "subword type expected");
5460       if (is_signed_subword_type(eltype)) {
5461         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5462       } else {
5463         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5464       }
5465       break;
5466     default:
5467       __ should_not_reach_here();
5468     }
5469 
5470     // Process the upper half of a vector
5471     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
5472       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5473       if (is_signed_subword_type(eltype)) {
5474         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5475       } else {
5476         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5477       }
5478     }
5479 
5480     __ br(Assembler::HI, SMALL_LOOP);
5481 
5482     // SMALL LOOP'S EPILOGUE
5483     __ lsr(rscratch2, cnt, exact_log2(evf));
5484     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
5485 
5486     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
5487     __ addv(vmul0, Assembler::T4S, vmul0);
5488     __ umov(result, vmul0, Assembler::S, 0);
5489 
5490     // TAIL
5491     __ bind(TAIL);
5492 
5493     // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
5494     // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
5495     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
5496     __ andr(rscratch2, cnt, vf - 1);
5497     __ bind(TAIL_SHORTCUT);
5498     __ adr(rscratch1, BR_BASE);
5499     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
5500     __ movw(rscratch2, 0x1f);
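         // 0x1f == 31, the multiplier used by the maddw instructions in the scalar
         // tail below (result = result * 31 + element).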
5501     __ br(rscratch1);
5502 
5503     for (size_t i = 0; i < vf - 1; ++i) {
5504       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
5505                                    eltype);
5506       __ maddw(result, result, rscratch2, rscratch1);
5507     }
5508     __ bind(BR_BASE);
5509 
5510     __ leave();
5511     __ ret(lr);
5512 
5513     // LARGE LOOP
5514     __ bind(LARGE_LOOP_PREHEADER);
5515 
5516     __ lsr(rscratch2, cnt, exact_log2(evf));
5517 
5518     if (multiply_by_halves) {
5519       // 31^4 - multiplier between lower and upper parts of a register
5520       __ movw(rscratch1, intpow(31U, vf / 2));
5521       __ mov(vpowm, Assembler::S, 1, rscratch1);
5522       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
5523       __ movw(rscratch1, intpow(31U, evf - vf / 2));
5524       __ mov(vpowm, Assembler::S, 0, rscratch1);
5525     } else {
5526       // 31^16
5527       __ movw(rscratch1, intpow(31U, evf));
5528       __ mov(vpowm, Assembler::S, 0, rscratch1);
5529     }
5530 
5531     __ mov(vmul3, Assembler::T16B, 0);
5532     __ mov(vmul2, Assembler::T16B, 0);
5533     __ mov(vmul1, Assembler::T16B, 0);
5534 
5535     __ bind(LARGE_LOOP);
5536 
5537     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
5538     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
5539     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
5540     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5541 
5542     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
5543            Address(__ post(ary, evf * type2aelembytes(eltype))));
5544 
5545     if (load_arrangement == Assembler::T8B) {
5546       // Extend 8B to 8H to be able to use vector multiply
5547       // instructions
5548       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
5549       if (is_signed_subword_type(eltype)) {
5550         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
5551         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
5552         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
5553         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5554       } else {
5555         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
5556         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
5557         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
5558         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5559       }
5560     }
5561 
5562     switch (load_arrangement) {
5563     case Assembler::T4S:
5564       __ addv(vmul3, load_arrangement, vmul3, vdata3);
5565       __ addv(vmul2, load_arrangement, vmul2, vdata2);
5566       __ addv(vmul1, load_arrangement, vmul1, vdata1);
5567       __ addv(vmul0, load_arrangement, vmul0, vdata0);
5568       break;
5569     case Assembler::T8B:
5570     case Assembler::T8H:
5571       assert(is_subword_type(eltype), "subword type expected");
5572       if (is_signed_subword_type(eltype)) {
5573         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
5574         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
5575         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
5576         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5577       } else {
5578         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
5579         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
5580         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
5581         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5582       }
5583       break;
5584     default:
5585       __ should_not_reach_here();
5586     }
5587 
5588     // Process the upper half of a vector
5589     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
5590       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
5591       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
5592       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
5593       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
5594       if (is_signed_subword_type(eltype)) {
5595         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
5596         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
5597         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
5598         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5599       } else {
5600         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
5601         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
5602         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
5603         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5604       }
5605     }
5606 
5607     __ subsw(rscratch2, rscratch2, 1);
5608     __ br(Assembler::HI, LARGE_LOOP);
5609 
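         // What follows is the horizontal reduction of the four vector accumulators
         // into the scalar result.  A rough scalar sketch of the combination step
         // (h3..h0 denote the per-register reductions produced by the mulv/addv
         // pairs below, vf the vectorization factor; illustrative only):
         //
         //   int pow = intpow(31U, vf);       // 31^vf, kept in rscratch2 below
         //   int result = h3;
         //   result = result * pow + h2;      // the maddw steps
         //   result = result * pow + h1;
         //   result = result * pow + h0;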
5610     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
5611     __ addv(vmul3, Assembler::T4S, vmul3);
5612     __ umov(result, vmul3, Assembler::S, 0);
5613 
5614     __ mov(rscratch2, intpow(31U, vf));
5615 
5616     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
5617     __ addv(vmul2, Assembler::T4S, vmul2);
5618     __ umov(rscratch1, vmul2, Assembler::S, 0);
5619     __ maddw(result, result, rscratch2, rscratch1);
5620 
5621     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
5622     __ addv(vmul1, Assembler::T4S, vmul1);
5623     __ umov(rscratch1, vmul1, Assembler::S, 0);
5624     __ maddw(result, result, rscratch2, rscratch1);
5625 
5626     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
5627     __ addv(vmul0, Assembler::T4S, vmul0);
5628     __ umov(rscratch1, vmul0, Assembler::S, 0);
5629     __ maddw(result, result, rscratch2, rscratch1);
5630 
5631     __ andr(rscratch2, cnt, vf - 1);
5632     __ cbnz(rscratch2, TAIL_SHORTCUT);
5633 
5634     __ leave();
5635     __ ret(lr);
5636 
5637     return entry;
5638   }
5639 
5640   address generate_dsin_dcos(bool isCos) {
5641     __ align(CodeEntryAlignment);
5642     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5643     address start = __ pc();
5644     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5645         (address)StubRoutines::aarch64::_two_over_pi,
5646         (address)StubRoutines::aarch64::_pio2,
5647         (address)StubRoutines::aarch64::_dsin_coef,
5648         (address)StubRoutines::aarch64::_dcos_coef);
5649     return start;
5650   }
5651 
5652   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings.
5653   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5654       Label &DIFF2) {
5655     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5656     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5657 
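         // A rough scalar model of this helper, where latin1[]/utf16[] stand for the
         // Latin1 data at tmp2 and the UTF-16 data at cnt1 (illustrative only; the
         // real code works on 8-byte chunks and keeps one UTF-16 load in flight in
         // tmp3/tmpU):
         //
         //   for (int i = 0; i < 16; i++) {
         //     if ((jchar)(latin1[i] & 0xff) != utf16[i]) {
         //       goto DIFF;  // differing 8-byte chunks are left in tmpL and tmpU/tmp3
         //     }
         //   }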
5658     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5659     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5660     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5661     // now we have 32 bytes of characters (converted to UTF-16) in vtmp:vtmp3
5662 
5663     __ fmovd(tmpL, vtmp3);
5664     __ eor(rscratch2, tmp3, tmpL);
5665     __ cbnz(rscratch2, DIFF2);
5666 
5667     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5668     __ umov(tmpL, vtmp3, __ D, 1);
5669     __ eor(rscratch2, tmpU, tmpL);
5670     __ cbnz(rscratch2, DIFF1);
5671 
5672     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5673     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5674     __ fmovd(tmpL, vtmp);
5675     __ eor(rscratch2, tmp3, tmpL);
5676     __ cbnz(rscratch2, DIFF2);
5677 
5678     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5679     __ umov(tmpL, vtmp, __ D, 1);
5680     __ eor(rscratch2, tmpU, tmpL);
5681     __ cbnz(rscratch2, DIFF1);
5682   }
5683 
5684   // r0  = result
5685   // r1  = str1
5686   // r2  = cnt1
5687   // r3  = str2
5688   // r4  = cnt2
5689   // r10 = tmp1
5690   // r11 = tmp2
5691   address generate_compare_long_string_different_encoding(bool isLU) {
5692     __ align(CodeEntryAlignment);
5693     StubCodeMark mark(this, "StubRoutines", isLU
5694         ? "compare_long_string_different_encoding LU"
5695         : "compare_long_string_different_encoding UL");
5696     address entry = __ pc();
5697     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5698         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5699         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5700     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5701         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5702     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5703     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5704 
5705     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5706 
5707     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5708     // cnt2 == number of characters left to compare
5709     // Check the already loaded first 4 characters (vtmp and tmp2 (LU) / tmp1 (UL))
5710     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5711     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5712     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5713     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5714     __ subw(cnt2, cnt2, 8); // 4 characters already loaded; the last 4 are a special case.
5715     __ eor(rscratch2, tmp1, tmp2);
5716     __ mov(rscratch1, tmp2);
5717     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5718     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5719              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5720     __ push(spilled_regs, sp);
5721     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5722     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5723 
5724     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5725 
5726     if (SoftwarePrefetchHintDistance >= 0) {
5727       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5728       __ br(__ LT, NO_PREFETCH);
5729       __ bind(LARGE_LOOP_PREFETCH);
5730         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5731         __ mov(tmp4, 2);
5732         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5733         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5734           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5735           __ subs(tmp4, tmp4, 1);
5736           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5737           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5738           __ mov(tmp4, 2);
5739         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5740           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5741           __ subs(tmp4, tmp4, 1);
5742           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5743           __ sub(cnt2, cnt2, 64);
5744           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5745           __ br(__ GE, LARGE_LOOP_PREFETCH);
5746     }
5747     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5748     __ bind(NO_PREFETCH);
5749     __ subs(cnt2, cnt2, 16);
5750     __ br(__ LT, TAIL);
5751     __ align(OptoLoopAlignment);
5752     __ bind(SMALL_LOOP); // smaller loop
5753       __ subs(cnt2, cnt2, 16);
5754       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5755       __ br(__ GE, SMALL_LOOP);
5756       __ cmn(cnt2, (u1)16);
5757       __ br(__ EQ, LOAD_LAST);
5758     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5759       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5760       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5761       __ ldr(tmp3, Address(cnt1, -8));
5762       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5763       __ b(LOAD_LAST);
5764     __ bind(DIFF2);
5765       __ mov(tmpU, tmp3);
5766     __ bind(DIFF1);
5767       __ pop(spilled_regs, sp);
5768       __ b(CALCULATE_DIFFERENCE);
5769     __ bind(LOAD_LAST);
5770       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5771       // No need to load them again.
5772       __ mov(tmpU, tmp3);
5773       __ pop(spilled_regs, sp);
5774 
5775       // tmp2 points to the address of the last 4 Latin1 characters right now
5776       __ ldrs(vtmp, Address(tmp2));
5777       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5778       __ fmovd(tmpL, vtmp);
5779 
5780       __ eor(rscratch2, tmpU, tmpL);
5781       __ cbz(rscratch2, DONE);
5782 
5783     // Find the first different characters in the longwords and
5784     // compute their difference.
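         // A scalar sketch of this step (illustrative only; assumes little-endian byte
         // order): with x = rscratch2, the XOR of the two 8-byte chunks being compared
         // (held in tmp1 and rscratch1), and x != 0:
         //
         //   int bit = count_trailing_zeros(x) & ~15;   // rev + clz + andr(-16) below
         //   int c1  = (int)((tmp1      >> bit) & 0xffff);
         //   int c2  = (int)((rscratch1 >> bit) & 0xffff);
         //   result  = c1 - c2;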
5785     __ bind(CALCULATE_DIFFERENCE);
5786       __ rev(rscratch2, rscratch2);
5787       __ clz(rscratch2, rscratch2);
5788       __ andr(rscratch2, rscratch2, -16);
5789       __ lsrv(tmp1, tmp1, rscratch2);
5790       __ uxthw(tmp1, tmp1);
5791       __ lsrv(rscratch1, rscratch1, rscratch2);
5792       __ uxthw(rscratch1, rscratch1);
5793       __ subw(result, tmp1, rscratch1);
5794     __ bind(DONE);
5795       __ ret(lr);
5796     return entry;
5797   }
5798 
5799   // r0 = input (float16)
5800   // v0 = result (float)
5801   // v1 = temporary float register
5802   address generate_float16ToFloat() {
5803     __ align(CodeEntryAlignment);
5804     StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
5805     address entry = __ pc();
5806     BLOCK_COMMENT("Entry:");
5807     __ flt16_to_flt(v0, r0, v1);
5808     __ ret(lr);
5809     return entry;
5810   }
5811 
5812   // v0 = input (float)
5813   // r0 = result (float16)
5814   // v1 = temporary float register
5815   address generate_floatToFloat16() {
5816     __ align(CodeEntryAlignment);
5817     StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
5818     address entry = __ pc();
5819     BLOCK_COMMENT("Entry:");
5820     __ flt_to_flt16(r0, v0, v1);
5821     __ ret(lr);
5822     return entry;
5823   }
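       // The two stubs above perform scalar binary16 <-> binary32 conversion (the
       // half-precision value travels in the low 16 bits of r0).  As a worked
       // example, the binary16 bit pattern 0x3E00 converts to 1.5f, and converting
       // 1.5f back yields 0x3E00.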
5824 
5825   address generate_method_entry_barrier() {
5826     __ align(CodeEntryAlignment);
5827     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5828 
5829     Label deoptimize_label;
5830 
5831     address start = __ pc();
5832 
5833     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5834 
5835     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5836       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5837       // We can get here despite the nmethod being good, if we have not
5838       // yet applied our cross modification fence (or data fence).
5839       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5840       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5841       __ ldrw(rscratch2, rscratch2);
5842       __ strw(rscratch2, thread_epoch_addr);
5843       __ isb();
5844       __ membar(__ LoadLoad);
5845     }
5846 
5847     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5848 
5849     __ enter();
5850     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5851 
5852     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5853 
5854     __ push_call_clobbered_registers();
5855 
5856     __ mov(c_rarg0, rscratch2);
5857     __ call_VM_leaf
5858          (CAST_FROM_FN_PTR
5859           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5860 
5861     __ reset_last_Java_frame(true);
5862 
5863     __ mov(rscratch1, r0);
5864 
5865     __ pop_call_clobbered_registers();
5866 
5867     __ cbnz(rscratch1, deoptimize_label);
5868 
5869     __ leave();
5870     __ ret(lr);
5871 
5872     __ BIND(deoptimize_label);
5873 
5874     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5875     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5876 
5877     __ mov(sp, rscratch1);
5878     __ br(rscratch2);
5879 
5880     return start;
5881   }
5882 
5883   // r0  = result
5884   // r1  = str1
5885   // r2  = cnt1
5886   // r3  = str2
5887   // r4  = cnt2
5888   // r10 = tmp1
5889   // r11 = tmp2
5890   address generate_compare_long_string_same_encoding(bool isLL) {
5891     __ align(CodeEntryAlignment);
5892     StubCodeMark mark(this, "StubRoutines", isLL
5893         ? "compare_long_string_same_encoding LL"
5894         : "compare_long_string_same_encoding UU");
5895     address entry = __ pc();
5896     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5897         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5898 
5899     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5900 
5901     // Exit the large loop when fewer than 64 bytes are left to read or when the
5902     // next prefetch would reach beyond the array border.
5903     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5904 
5905     // The caller pre-loads 8 bytes from each string before jumping to the stub, so compare them directly.
5906     __ eor(rscratch2, tmp1, tmp2);
5907     __ cbnz(rscratch2, CAL_DIFFERENCE);
5908 
5909     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5910     // update the pointers to account for the bytes already read
5911     __ add(str1, str1, wordSize);
5912     __ add(str2, str2, wordSize);
5913     if (SoftwarePrefetchHintDistance >= 0) {
5914       __ align(OptoLoopAlignment);
5915       __ bind(LARGE_LOOP_PREFETCH);
5916         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5917         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5918 
5919         for (int i = 0; i < 4; i++) {
5920           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5921           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5922           __ cmp(tmp1, tmp2);
5923           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5924           __ br(Assembler::NE, DIFF);
5925         }
5926         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5927         __ add(str1, str1, 64);
5928         __ add(str2, str2, 64);
5929         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5930         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5931         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5932     }
5933 
5934     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5935     __ br(Assembler::LE, LESS16);
5936     __ align(OptoLoopAlignment);
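         // Each compare below checks 16 bytes with a cmp/ccmp pair; the loop body is
         // unrolled twice.  A scalar sketch of one 16-byte step, where load8() is just
         // shorthand for an 8-byte load (illustrative only):
         //
         //   uint64_t a_lo = load8(str1), a_hi = load8(str1 + 8);  str1 += 16;
         //   uint64_t b_lo = load8(str2), b_hi = load8(str2 + 8);  str2 += 16;
         //   if (a_lo != b_lo || a_hi != b_hi) goto DIFF;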
5937     __ bind(LOOP_COMPARE16);
5938       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5939       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5940       __ cmp(tmp1, tmp2);
5941       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5942       __ br(Assembler::NE, DIFF);
5943       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5944       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5945       __ br(Assembler::LT, LESS16);
5946 
5947       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5948       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5949       __ cmp(tmp1, tmp2);
5950       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5951       __ br(Assembler::NE, DIFF);
5952       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5953       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5954       __ br(Assembler::GE, LOOP_COMPARE16);
5955       __ cbz(cnt2, LENGTH_DIFF);
5956 
5957     __ bind(LESS16);
5958       // compare 8 bytes at a time (8 Latin1 chars or 4 UTF-16 chars)
5959       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5960       __ br(Assembler::LE, LESS8);
5961       __ ldr(tmp1, Address(__ post(str1, 8)));
5962       __ ldr(tmp2, Address(__ post(str2, 8)));
5963       __ eor(rscratch2, tmp1, tmp2);
5964       __ cbnz(rscratch2, CAL_DIFFERENCE);
5965       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5966 
5967     __ bind(LESS8); // directly load last 8 bytes
5968       if (!isLL) {
5969         __ add(cnt2, cnt2, cnt2);
5970       }
5971       __ ldr(tmp1, Address(str1, cnt2));
5972       __ ldr(tmp2, Address(str2, cnt2));
5973       __ eor(rscratch2, tmp1, tmp2);
5974       __ cbz(rscratch2, LENGTH_DIFF);
5975       __ b(CAL_DIFFERENCE);
5976 
5977     __ bind(DIFF);
5978       __ cmp(tmp1, tmp2);
5979       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5980       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5981       // reuse rscratch2 register for the result of eor instruction
5982       __ eor(rscratch2, tmp1, tmp2);
5983 
5984     __ bind(CAL_DIFFERENCE);
5985       __ rev(rscratch2, rscratch2);
5986       __ clz(rscratch2, rscratch2);
5987       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5988       __ lsrv(tmp1, tmp1, rscratch2);
5989       __ lsrv(tmp2, tmp2, rscratch2);
5990       if (isLL) {
5991         __ uxtbw(tmp1, tmp1);
5992         __ uxtbw(tmp2, tmp2);
5993       } else {
5994         __ uxthw(tmp1, tmp1);
5995         __ uxthw(tmp2, tmp2);
5996       }
5997       __ subw(result, tmp1, tmp2);
5998 
5999     __ bind(LENGTH_DIFF);
6000       __ ret(lr);
6001     return entry;
6002   }
6003 
6004   enum string_compare_mode {
6005     LL,
6006     LU,
6007     UL,
6008     UU,
6009   };
6010 
6011   // The following registers are declared in aarch64.ad
6012   // r0  = result
6013   // r1  = str1
6014   // r2  = cnt1
6015   // r3  = str2
6016   // r4  = cnt2
6017   // r10 = tmp1
6018   // r11 = tmp2
6019   // z0  = ztmp1
6020   // z1  = ztmp2
6021   // p0  = pgtmp1
6022   // p1  = pgtmp2
6023   address generate_compare_long_string_sve(string_compare_mode mode) {
6024     __ align(CodeEntryAlignment);
6025     address entry = __ pc();
6026     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
6027              tmp1 = r10, tmp2 = r11;
6028 
6029     Label LOOP, DONE, MISMATCH;
6030     Register vec_len = tmp1;
6031     Register idx = tmp2;
6032     // The minimum of the string lengths has been stored in cnt2.
6033     Register cnt = cnt2;
6034     FloatRegister ztmp1 = z0, ztmp2 = z1;
6035     PRegister pgtmp1 = p0, pgtmp2 = p1;
6036 
6037 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
6038     switch (mode) {                                                            \
6039       case LL:                                                                 \
6040         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
6041         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
6042         break;                                                                 \
6043       case LU:                                                                 \
6044         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
6045         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
6046         break;                                                                 \
6047       case UL:                                                                 \
6048         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
6049         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
6050         break;                                                                 \
6051       case UU:                                                                 \
6052         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
6053         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
6054         break;                                                                 \
6055       default:                                                                 \
6056         ShouldNotReachHere();                                                  \
6057     }
6058 
6059     const char* stubname;
6060     switch (mode) {
6061       case LL: stubname = "compare_long_string_same_encoding LL";      break;
6062       case LU: stubname = "compare_long_string_different_encoding LU"; break;
6063       case UL: stubname = "compare_long_string_different_encoding UL"; break;
6064       case UU: stubname = "compare_long_string_same_encoding UU";      break;
6065       default: ShouldNotReachHere();
6066     }
6067 
6068     StubCodeMark mark(this, "StubRoutines", stubname);
6069 
6070     __ mov(idx, 0);
6071     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
6072 
6073     if (mode == LL) {
6074       __ sve_cntb(vec_len);
6075     } else {
6076       __ sve_cnth(vec_len);
6077     }
6078 
6079     __ sub(rscratch1, cnt, vec_len);
6080 
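         // A rough pseudocode model of the predicated loop below (vl is the number of
         // characters per vector register, i.e. sve_cntb/sve_cnth above; illustrative
         // only):
         //
         //   idx = 0; pred = whilelt(idx, cnt);
         //   do {
         //     load vl characters from each string under pred; idx += vl;
         //     if (any lane differs under pred) goto MISMATCH;
         //   } while (idx < cnt - vl);
         //   pred = whilelt(idx, cnt);  // partial predicate covering the tail
         //   compare the tail under pred, then fall through to DONE or MISMATCH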
6081     __ bind(LOOP);
6082 
6083       // main loop
6084       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
6085       __ add(idx, idx, vec_len);
6086       // Compare strings.
6087       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
6088       __ br(__ NE, MISMATCH);
6089       __ cmp(idx, rscratch1);
6090       __ br(__ LT, LOOP);
6091 
6092     // post loop, last iteration
6093     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
6094 
6095     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
6096     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
6097     __ br(__ EQ, DONE);
6098 
6099     __ bind(MISMATCH);
6100 
6101     // Crop the vector to find its location.
6102     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
6103     // Extract the first different characters of each string.
6104     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
6105     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
6106 
6107     // Compute the difference of the first different characters.
6108     __ sub(result, rscratch1, rscratch2);
6109 
6110     __ bind(DONE);
6111     __ ret(lr);
6112 #undef LOAD_PAIR
6113     return entry;
6114   }
6115 
6116   void generate_compare_long_strings() {
6117     if (UseSVE == 0) {
6118       StubRoutines::aarch64::_compare_long_string_LL
6119           = generate_compare_long_string_same_encoding(true);
6120       StubRoutines::aarch64::_compare_long_string_UU
6121           = generate_compare_long_string_same_encoding(false);
6122       StubRoutines::aarch64::_compare_long_string_LU
6123           = generate_compare_long_string_different_encoding(true);
6124       StubRoutines::aarch64::_compare_long_string_UL
6125           = generate_compare_long_string_different_encoding(false);
6126     } else {
6127       StubRoutines::aarch64::_compare_long_string_LL
6128           = generate_compare_long_string_sve(LL);
6129       StubRoutines::aarch64::_compare_long_string_UU
6130           = generate_compare_long_string_sve(UU);
6131       StubRoutines::aarch64::_compare_long_string_LU
6132           = generate_compare_long_string_sve(LU);
6133       StubRoutines::aarch64::_compare_long_string_UL
6134           = generate_compare_long_string_sve(UL);
6135     }
6136   }
6137 
6138   // R0 = result
6139   // R1 = str2
6140   // R2 = cnt1
6141   // R3 = str1
6142   // R4 = cnt2
6143   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
6144   //
6145   // This generic linear code uses a few additional ideas that make it faster:
6146   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
6147   // in order to skip the initial load (helps on systems with a single load pipeline)
6148   // 2) we can use a "fast" SWAR algorithm to find the first occurrence of the search
6149   // character with fewer branches (1 branch per loaded register instead of a branch
6150   // per character); this is where constants like 0x0101...01, 0x00010001...0001,
6151   // 0x7f7f...7f and 0x7fff7fff...7fff come from (see the scalar sketch below)
6152   // 3) after loading and analyzing the 1st register of the source string, it can be
6153   // reused to locate every occurrence of the first pattern character, saving a few
6154   // loads compared with a simpler-but-slower implementation
6155   // 4) in order to avoid lots of push/pop operations, the code below heavily reuses,
6156   // re-initializes and compresses register values, which makes the code larger and
6157   // a bit less readable; however, most of the extra operations are issued during
6158   // loads or branches, so the penalty is minimal
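       //
       // A scalar sketch of the SWAR trick behind those constants, for the byte case
       // (the UTF-16 case uses 0x0001... and 0x7fff... instead); x is the loaded str2
       // chunk XORed with the broadcast first pattern character:
       //
       //   uint64_t has_zero_byte(uint64_t x) {
       //     // nonzero iff some byte of x is 0x00, i.e. some character matched
       //     return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
       //   }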
6159   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
6160     const char* stubName = str1_isL
6161         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
6162         : "indexof_linear_uu";
6163     __ align(CodeEntryAlignment);
6164     StubCodeMark mark(this, "StubRoutines", stubName);
6165     address entry = __ pc();
6166 
6167     int str1_chr_size = str1_isL ? 1 : 2;
6168     int str2_chr_size = str2_isL ? 1 : 2;
6169     int str1_chr_shift = str1_isL ? 0 : 1;
6170     int str2_chr_shift = str2_isL ? 0 : 1;
6171     bool isL = str1_isL && str2_isL;
6172     // parameters
6173     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
6174     // temporary registers
6175     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
6176     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
6177     // redefinitions
6178     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
6179 
6180     __ push(spilled_regs, sp);
6181     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
6182         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
6183         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
6184         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
6185         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
6186         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
6187     // Read whole register from str1. It is safe, because length >=8 here
6188     __ ldr(ch1, Address(str1));
6189     // Read whole register from str2. It is safe, because length >=8 here
6190     __ ldr(ch2, Address(str2));
6191     __ sub(cnt2, cnt2, cnt1);
6192     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
6193     if (str1_isL != str2_isL) {
6194       __ eor(v0, __ T16B, v0, v0);
6195     }
6196     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
6197     __ mul(first, first, tmp1);
6198     // check if fewer characters than fit in one register are left to check
6199     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
6200     if (str1_isL != str2_isL) {
6201       __ fmovd(v1, ch1);
6202     }
6203     __ br(__ LE, L_SMALL);
6204     __ eor(ch2, first, ch2);
6205     if (str1_isL != str2_isL) {
6206       __ zip1(v1, __ T16B, v1, v0);
6207     }
6208     __ sub(tmp2, ch2, tmp1);
6209     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6210     __ bics(tmp2, tmp2, ch2);
6211     if (str1_isL != str2_isL) {
6212       __ fmovd(ch1, v1);
6213     }
6214     __ br(__ NE, L_HAS_ZERO);
6215     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
6216     __ add(result, result, wordSize/str2_chr_size);
6217     __ add(str2, str2, wordSize);
6218     __ br(__ LT, L_POST_LOOP);
6219     __ BIND(L_LOOP);
6220       __ ldr(ch2, Address(str2));
6221       __ eor(ch2, first, ch2);
6222       __ sub(tmp2, ch2, tmp1);
6223       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6224       __ bics(tmp2, tmp2, ch2);
6225       __ br(__ NE, L_HAS_ZERO);
6226     __ BIND(L_LOOP_PROCEED);
6227       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
6228       __ add(str2, str2, wordSize);
6229       __ add(result, result, wordSize/str2_chr_size);
6230       __ br(__ GE, L_LOOP);
6231     __ BIND(L_POST_LOOP);
6232       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
6233       __ br(__ LE, NOMATCH);
6234       __ ldr(ch2, Address(str2));
6235       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
6236       __ eor(ch2, first, ch2);
6237       __ sub(tmp2, ch2, tmp1);
6238       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6239       __ mov(tmp4, -1); // all bits set
6240       __ b(L_SMALL_PROCEED);
6241     __ align(OptoLoopAlignment);
6242     __ BIND(L_SMALL);
6243       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
6244       __ eor(ch2, first, ch2);
6245       if (str1_isL != str2_isL) {
6246         __ zip1(v1, __ T16B, v1, v0);
6247       }
6248       __ sub(tmp2, ch2, tmp1);
6249       __ mov(tmp4, -1); // all bits set
6250       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6251       if (str1_isL != str2_isL) {
6252         __ fmovd(ch1, v1); // move converted 4 symbols
6253       }
6254     __ BIND(L_SMALL_PROCEED);
6255       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
6256       __ bic(tmp2, tmp2, ch2);
6257       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
6258       __ rbit(tmp2, tmp2);
6259       __ br(__ EQ, NOMATCH);
6260     __ BIND(L_SMALL_HAS_ZERO_LOOP);
6261       __ clz(tmp4, tmp2); // potentially slow; up to 4 cycles on some CPUs
6262       __ cmp(cnt1, u1(wordSize/str2_chr_size));
6263       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
6264       if (str2_isL) { // LL
6265         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
6266         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
6267         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
6268         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6269         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6270       } else {
6271         __ mov(ch2, 0xE); // all bits in byte set except last one
6272         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6273         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6274         __ lslv(tmp2, tmp2, tmp4);
6275         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6276         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6277         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6278         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6279       }
6280       __ cmp(ch1, ch2);
6281       __ mov(tmp4, wordSize/str2_chr_size);
6282       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6283     __ BIND(L_SMALL_CMP_LOOP);
6284       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6285                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6286       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6287                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6288       __ add(tmp4, tmp4, 1);
6289       __ cmp(tmp4, cnt1);
6290       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
6291       __ cmp(first, ch2);
6292       __ br(__ EQ, L_SMALL_CMP_LOOP);
6293     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
6294       __ cbz(tmp2, NOMATCH); // no more matches. exit
6295       __ clz(tmp4, tmp2);
6296       __ add(result, result, 1); // advance index
6297       __ add(str2, str2, str2_chr_size); // advance pointer
6298       __ b(L_SMALL_HAS_ZERO_LOOP);
6299     __ align(OptoLoopAlignment);
6300     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
6301       __ cmp(first, ch2);
6302       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6303       __ b(DONE);
6304     __ align(OptoLoopAlignment);
6305     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
6306       if (str2_isL) { // LL
6307         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
6308         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
6309         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
6310         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6311         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6312       } else {
6313         __ mov(ch2, 0xE); // all bits in byte set except last one
6314         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6315         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6316         __ lslv(tmp2, tmp2, tmp4);
6317         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6318         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6319         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6320         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6321       }
6322       __ cmp(ch1, ch2);
6323       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6324       __ b(DONE);
6325     __ align(OptoLoopAlignment);
6326     __ BIND(L_HAS_ZERO);
6327       __ rbit(tmp2, tmp2);
6328       __ clz(tmp4, tmp2); // potentially slow; up to 4 cycles on some CPUs
6329       // Now compress the counters (cnt2 and cnt1) into one register. This is
6330       // fine because both counters are 32-bit and are not changed in this loop;
6331       // they are restored on exit, so cnt1 can be reused in this loop.
6332       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
6333       __ sub(result, result, 1);
6334     __ BIND(L_HAS_ZERO_LOOP);
6335       __ mov(cnt1, wordSize/str2_chr_size);
6336       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6337       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
6338       if (str2_isL) {
6339         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6340         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6341         __ lslv(tmp2, tmp2, tmp4);
6342         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6343         __ add(tmp4, tmp4, 1);
6344         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6345         __ lsl(tmp2, tmp2, 1);
6346         __ mov(tmp4, wordSize/str2_chr_size);
6347       } else {
6348         __ mov(ch2, 0xE);
6349         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6350         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6351         __ lslv(tmp2, tmp2, tmp4);
6352         __ add(tmp4, tmp4, 1);
6353         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6354         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6355         __ lsl(tmp2, tmp2, 1);
6356         __ mov(tmp4, wordSize/str2_chr_size);
6357         __ sub(str2, str2, str2_chr_size);
6358       }
6359       __ cmp(ch1, ch2);
6360       __ mov(tmp4, wordSize/str2_chr_size);
6361       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6362     __ BIND(L_CMP_LOOP);
6363       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6364                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6365       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6366                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6367       __ add(tmp4, tmp4, 1);
6368       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6369       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
6370       __ cmp(cnt1, ch2);
6371       __ br(__ EQ, L_CMP_LOOP);
6372     __ BIND(L_CMP_LOOP_NOMATCH);
6373       // we did not match here
6374       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6375       __ clz(tmp4, tmp2);
6376       __ add(str2, str2, str2_chr_size); // advance pointer
6377       __ b(L_HAS_ZERO_LOOP);
6378     __ align(OptoLoopAlignment);
6379     __ BIND(L_CMP_LOOP_LAST_CMP);
6380       __ cmp(cnt1, ch2);
6381       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6382       __ b(DONE);
6383     __ align(OptoLoopAlignment);
6384     __ BIND(L_CMP_LOOP_LAST_CMP2);
6385       if (str2_isL) {
6386         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6387         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6388         __ lslv(tmp2, tmp2, tmp4);
6389         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6390         __ add(tmp4, tmp4, 1);
6391         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6392         __ lsl(tmp2, tmp2, 1);
6393       } else {
6394         __ mov(ch2, 0xE);
6395         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6396         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6397         __ lslv(tmp2, tmp2, tmp4);
6398         __ add(tmp4, tmp4, 1);
6399         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6400         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6401         __ lsl(tmp2, tmp2, 1);
6402         __ sub(str2, str2, str2_chr_size);
6403       }
6404       __ cmp(ch1, ch2);
6405       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6406       __ b(DONE);
6407     __ align(OptoLoopAlignment);
6408     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
6409       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until
6410       // the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP, so
6411       // result was increased by at most wordSize/str2_chr_size - 1 and the
6412       // corresponding high bits were not changed. L_LOOP_PROCEED will increase result
6413       // by the number of analyzed characters, so we can simply reset the low bits of
6414       // result here: clear the 2 low bits for UU/UL and 3 bits for LL.
6415       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
6416       // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the index of
6417       // the last analyzed substring inside the current octet, so str2 is at the
6418       // corresponding start address; we need to advance it to the next octet.
6419       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6420       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6421       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6422       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
6423       __ movw(cnt2, cnt2);
6424       __ b(L_LOOP_PROCEED);
6425     __ align(OptoLoopAlignment);
6426     __ BIND(NOMATCH);
6427       __ mov(result, -1);
6428     __ BIND(DONE);
6429       __ pop(spilled_regs, sp);
6430       __ ret(lr);
6431     return entry;
6432   }
6433 
6434   void generate_string_indexof_stubs() {
6435     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6436     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6437     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6438   }
6439 
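       // Inflate 32 Latin-1 bytes (in src1:src2) to 32 UTF-16 chars and store them
       // at r1.  A scalar sketch of what the helper below stores (illustrative only):
       //
       //   for (int i = 0; i < 32; i++) {
       //     dst[i] = (jchar)(src[i] & 0xff);  // zip1/zip2 against the zero vector v0
       //   }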
6440   void inflate_and_store_2_fp_registers(bool generatePrfm,
6441       FloatRegister src1, FloatRegister src2) {
6442     Register dst = r1;
6443     __ zip1(v1, __ T16B, src1, v0);
6444     __ zip2(v2, __ T16B, src1, v0);
6445     if (generatePrfm) {
6446       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6447     }
6448     __ zip1(v3, __ T16B, src2, v0);
6449     __ zip2(v4, __ T16B, src2, v0);
6450     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6451   }
6452 
6453   // R0 = src
6454   // R1 = dst
6455   // R2 = len
6456   // R3 = len >> 3
6457   // V0 = 0
6458   // v1 = loaded 8 bytes
6459   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6460   address generate_large_byte_array_inflate() {
6461     __ align(CodeEntryAlignment);
6462     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
6463     address entry = __ pc();
6464     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6465     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6466     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6467 
6468     // Do one more 8-byte read so that the address is 16-byte aligned in most
6469     // cases, and use a single store instruction.
6470     __ ldrd(v2, __ post(src, 8));
6471     __ sub(octetCounter, octetCounter, 2);
6472     __ zip1(v1, __ T16B, v1, v0);
6473     __ zip1(v2, __ T16B, v2, v0);
6474     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6475     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6476     __ subs(rscratch1, octetCounter, large_loop_threshold);
6477     __ br(__ LE, LOOP_START);
6478     __ b(LOOP_PRFM_START);
6479     __ bind(LOOP_PRFM);
6480       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6481     __ bind(LOOP_PRFM_START);
6482       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6483       __ sub(octetCounter, octetCounter, 8);
6484       __ subs(rscratch1, octetCounter, large_loop_threshold);
6485       inflate_and_store_2_fp_registers(true, v3, v4);
6486       inflate_and_store_2_fp_registers(true, v5, v6);
6487       __ br(__ GT, LOOP_PRFM);
6488       __ cmp(octetCounter, (u1)8);
6489       __ br(__ LT, DONE);
6490     __ bind(LOOP);
6491       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6492       __ bind(LOOP_START);
6493       __ sub(octetCounter, octetCounter, 8);
6494       __ cmp(octetCounter, (u1)8);
6495       inflate_and_store_2_fp_registers(false, v3, v4);
6496       inflate_and_store_2_fp_registers(false, v5, v6);
6497       __ br(__ GE, LOOP);
6498     __ bind(DONE);
6499       __ ret(lr);
6500     return entry;
6501   }
6502 
6503   /**
6504    *  Arguments:
6505    *
6506    *  Input:
6507    *  c_rarg0   - current state address
6508    *  c_rarg1   - H key address
6509    *  c_rarg2   - data address
6510    *  c_rarg3   - number of blocks
6511    *
6512    *  Output:
6513    *  Updated state at c_rarg0
6514    */
6515   address generate_ghash_processBlocks() {
6516     // Bafflingly, GCM uses little-endian for the byte order, but
6517     // big-endian for the bit order.  For example, the polynomial 1 is
6518     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6519     //
6520     // So, we must either reverse the bytes in each word and do
6521     // everything big-endian or reverse the bits in each byte and do
6522     // it little-endian.  On AArch64 it's more idiomatic to reverse
6523     // the bits in each byte (we have an instruction, RBIT, to do
6524     // that) and keep the data in little-endian bit order through the
6525     // calculation, bit-reversing the inputs and outputs.
6526 
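         // Mathematically, the loop below evaluates the standard GHASH recurrence
         //
         //   state <- (state ^ X[i]) * H   in GF(2^128) mod x^128 + x^7 + x^2 + x + 1
         //
         // for each 16-byte block X[i], with all operands kept in the bit-reversed
         // representation described above.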
6527     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6528     __ align(wordSize * 2);
6529     address p = __ pc();
6530     __ emit_int64(0x87);  // The low-order bits of the field
6531                           // polynomial (i.e. p = z^7+z^2+z+1)
6532                           // repeated in the low and high parts of a
6533                           // 128-bit vector
6534     __ emit_int64(0x87);
6535 
6536     __ align(CodeEntryAlignment);
6537     address start = __ pc();
6538 
6539     Register state   = c_rarg0;
6540     Register subkeyH = c_rarg1;
6541     Register data    = c_rarg2;
6542     Register blocks  = c_rarg3;
6543 
6544     FloatRegister vzr = v30;
6545     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6546 
6547     __ ldrq(v24, p);    // The field polynomial
6548 
6549     __ ldrq(v0, Address(state));
6550     __ ldrq(v1, Address(subkeyH));
6551 
6552     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6553     __ rbit(v0, __ T16B, v0);
6554     __ rev64(v1, __ T16B, v1);
6555     __ rbit(v1, __ T16B, v1);
6556 
6557     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6558     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
6559 
6560     {
6561       Label L_ghash_loop;
6562       __ bind(L_ghash_loop);
6563 
6564       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6565                                                  // reversing each byte
6566       __ rbit(v2, __ T16B, v2);
6567       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6568 
6569       // Multiply state in v2 by subkey in v1
6570       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6571                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6572                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6573       // Reduce v7:v5 by the field polynomial
6574       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6575 
6576       __ sub(blocks, blocks, 1);
6577       __ cbnz(blocks, L_ghash_loop);
6578     }
6579 
6580     // The bit-reversed result is at this point in v0
6581     __ rev64(v0, __ T16B, v0);
6582     __ rbit(v0, __ T16B, v0);
6583 
6584     __ st1(v0, __ T16B, state);
6585     __ ret(lr);
6586 
6587     return start;
6588   }
6589 
6590   address generate_ghash_processBlocks_wide() {
6591     address small = generate_ghash_processBlocks();
6592 
6593     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6594     __ align(wordSize * 2);
6595     address p = __ pc();
6596     __ emit_int64(0x87);  // The low-order bits of the field
6597                           // polynomial (i.e. p = z^7+z^2+z+1)
6598                           // repeated in the low and high parts of a
6599                           // 128-bit vector
6600     __ emit_int64(0x87);
6601 
6602     __ align(CodeEntryAlignment);
6603     address start = __ pc();
6604 
6605     Register state   = c_rarg0;
6606     Register subkeyH = c_rarg1;
6607     Register data    = c_rarg2;
6608     Register blocks  = c_rarg3;
6609 
6610     const int unroll = 4;
6611 
6612     __ cmp(blocks, (unsigned char)(unroll * 2));
6613     __ br(__ LT, small);
6614 
6615     if (unroll > 1) {
6616       // Save state before entering routine
6617       __ sub(sp, sp, 4 * 16);
6618       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6619       __ sub(sp, sp, 4 * 16);
6620       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6621     }
6622 
6623     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6624 
6625     if (unroll > 1) {
6626       // And restore state
6627       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6628       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6629     }
6630 
6631     __ cmp(blocks, (unsigned char)0);
6632     __ br(__ GT, small);
6633 
6634     __ ret(lr);
6635 
6636     return start;
6637   }
6638 
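       // One SIMD round of the Base64 encoder: 'size' groups of 3 input bytes are
       // turned into 'size' groups of 4 output symbols.  A scalar sketch of one
       // 3-byte group, mirroring the Process3B tail of the encode stub
       // (illustrative only):
       //
       //   int bits = (b0 << 16) | (b1 << 8) | b2;
       //   out[0] = codec[(bits >> 18) & 0x3f];
       //   out[1] = codec[(bits >> 12) & 0x3f];
       //   out[2] = codec[(bits >>  6) & 0x3f];
       //   out[3] = codec[ bits        & 0x3f];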
6639   void generate_base64_encode_simdround(Register src, Register dst,
6640         FloatRegister codec, u8 size) {
6641 
6642     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6643     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6644     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6645 
6646     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6647 
6648     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6649 
6650     __ ushr(ind0, arrangement, in0,  2);
6651 
6652     __ ushr(ind1, arrangement, in1,  2);
6653     __ shl(in0,   arrangement, in0,  6);
6654     __ orr(ind1,  arrangement, ind1, in0);
6655     __ ushr(ind1, arrangement, ind1, 2);
6656 
6657     __ ushr(ind2, arrangement, in2,  4);
6658     __ shl(in1,   arrangement, in1,  4);
6659     __ orr(ind2,  arrangement, in1,  ind2);
6660     __ ushr(ind2, arrangement, ind2, 2);
6661 
6662     __ shl(ind3,  arrangement, in2,  2);
6663     __ ushr(ind3, arrangement, ind3, 2);
6664 
6665     __ tbl(out0,  arrangement, codec,  4, ind0);
6666     __ tbl(out1,  arrangement, codec,  4, ind1);
6667     __ tbl(out2,  arrangement, codec,  4, ind2);
6668     __ tbl(out3,  arrangement, codec,  4, ind3);
6669 
6670     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6671   }
6672 
6673    /**
6674    *  Arguments:
6675    *
6676    *  Input:
6677    *  c_rarg0   - src_start
6678    *  c_rarg1   - src_offset
6679    *  c_rarg2   - src_length
6680    *  c_rarg3   - dest_start
6681    *  c_rarg4   - dest_offset
6682    *  c_rarg5   - isURL
6683    *
6684    */
6685   address generate_base64_encodeBlock() {
6686 
6687     static const char toBase64[64] = {
6688       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6689       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6690       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6691       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6692       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6693     };
6694 
6695     static const char toBase64URL[64] = {
6696       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6697       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6698       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6699       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6700       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6701     };
6702 
6703     __ align(CodeEntryAlignment);
6704     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6705     address start = __ pc();
6706 
6707     Register src   = c_rarg0;  // source array
6708     Register soff  = c_rarg1;  // source start offset
6709     Register send  = c_rarg2;  // source end offset
6710     Register dst   = c_rarg3;  // dest array
6711     Register doff  = c_rarg4;  // position for writing to dest array
6712     Register isURL = c_rarg5;  // Base64 or URL character set
6713 
6714     // c_rarg6 and c_rarg7 are free to use as temps
6715     Register codec  = c_rarg6;
6716     Register length = c_rarg7;
6717 
6718     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6719 
6720     __ add(src, src, soff);
6721     __ add(dst, dst, doff);
6722     __ sub(length, send, soff);
6723 
6724     // load the codec base address
6725     __ lea(codec, ExternalAddress((address) toBase64));
6726     __ cbz(isURL, ProcessData);
6727     __ lea(codec, ExternalAddress((address) toBase64URL));
6728 
6729     __ BIND(ProcessData);
6730 
6731     // too short to form a SIMD loop; fall back to the scalar path
6732     __ cmp(length, (u1)24);
6733     __ br(Assembler::LT, Process3B);
6734 
6735     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6736 
6737     __ BIND(Process48B);
6738     __ cmp(length, (u1)48);
6739     __ br(Assembler::LT, Process24B);
6740     generate_base64_encode_simdround(src, dst, v0, 16);
6741     __ sub(length, length, 48);
6742     __ b(Process48B);
6743 
6744     __ BIND(Process24B);
6745     __ cmp(length, (u1)24);
6746     __ br(Assembler::LT, SIMDExit);
6747     generate_base64_encode_simdround(src, dst, v0, 8);
6748     __ sub(length, length, 24);
6749 
6750     __ BIND(SIMDExit);
6751     __ cbz(length, Exit);
6752 
6753     __ BIND(Process3B);
6754     //  3 src bytes, 24 bits
6755     __ ldrb(r10, __ post(src, 1));
6756     __ ldrb(r11, __ post(src, 1));
6757     __ ldrb(r12, __ post(src, 1));
6758     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6759     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6760     // codec index
6761     __ ubfmw(r15, r12, 18, 23);
6762     __ ubfmw(r14, r12, 12, 17);
6763     __ ubfmw(r13, r12, 6,  11);
6764     __ andw(r12,  r12, 63);
6765     // get the code based on the codec
6766     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6767     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6768     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6769     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6770     __ strb(r15, __ post(dst, 1));
6771     __ strb(r14, __ post(dst, 1));
6772     __ strb(r13, __ post(dst, 1));
6773     __ strb(r12, __ post(dst, 1));
6774     __ sub(length, length, 3);
6775     __ cbnz(length, Process3B);
6776 
6777     __ BIND(Exit);
6778     __ ret(lr);
6779 
6780     return start;
6781   }
6782 
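       // One SIMD round of the Base64 decoder: 'size' groups of 4 input symbols are
       // decoded into 'size' groups of 3 output bytes.  A scalar sketch of the
       // recombination computed below, with d0..d3 the decoded 6-bit values
       // (uint8_t arithmetic, illustrative only):
       //
       //   out0 = (uint8_t)((d0 << 2) | (d1 >> 4));
       //   out1 = (uint8_t)((d1 << 4) | (d2 >> 2));
       //   out2 = (uint8_t)((d2 << 6) |  d3);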
6783   void generate_base64_decode_simdround(Register src, Register dst,
6784         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6785 
6786     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6787     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6788 
6789     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6790     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6791 
6792     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6793 
6794     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6795 
6796     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6797 
6798     // We need an unsigned saturating subtract to make sure all input values
6799     // in the range [0, 63] map to index 0 in the higher-half lookup.
6800     __ uqsubv(decH0, __ T16B, in0, v27);
6801     __ uqsubv(decH1, __ T16B, in1, v27);
6802     __ uqsubv(decH2, __ T16B, in2, v27);
6803     __ uqsubv(decH3, __ T16B, in3, v27);
6804 
6805     // lower half lookup
6806     __ tbl(decL0, arrangement, codecL, 4, in0);
6807     __ tbl(decL1, arrangement, codecL, 4, in1);
6808     __ tbl(decL2, arrangement, codecL, 4, in2);
6809     __ tbl(decL3, arrangement, codecL, 4, in3);
6810 
6811     // higher half lookup
6812     __ tbx(decH0, arrangement, codecH, 4, decH0);
6813     __ tbx(decH1, arrangement, codecH, 4, decH1);
6814     __ tbx(decH2, arrangement, codecH, 4, decH2);
6815     __ tbx(decH3, arrangement, codecH, 4, decH3);
6816 
6817     // combine lower and higher
6818     __ orr(decL0, arrangement, decL0, decH0);
6819     __ orr(decL1, arrangement, decL1, decH1);
6820     __ orr(decL2, arrangement, decL2, decH2);
6821     __ orr(decL3, arrangement, decL3, decH3);
6822 
6823     // check illegal inputs, value larger than 63 (maximum of 6 bits)
6824     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
6825     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
6826     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
6827     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
6828     __ orr(in0, arrangement, decH0, decH1);
6829     __ orr(in1, arrangement, decH2, decH3);
6830     __ orr(in2, arrangement, in0,   in1);
6831     __ umaxv(in3, arrangement, in2);
6832     __ umov(rscratch2, in3, __ B, 0);
6833 
6834     // get the data to output
6835     __ shl(out0,  arrangement, decL0, 2);
6836     __ ushr(out1, arrangement, decL1, 4);
6837     __ orr(out0,  arrangement, out0,  out1);
6838     __ shl(out1,  arrangement, decL1, 4);
6839     __ ushr(out2, arrangement, decL2, 2);
6840     __ orr(out1,  arrangement, out1,  out2);
6841     __ shl(out2,  arrangement, decL2, 6);
6842     __ orr(out2,  arrangement, out2,  decL3);
6843 
6844     __ cbz(rscratch2, NoIllegalData);
6845 
6846     // handle illegal input
6847     __ umov(r10, in2, __ D, 0);
6848     if (size == 16) {
6849       __ cbnz(r10, ErrorInLowerHalf);
6850 
6851       // illegal input is in higher half, store the lower half now.
6852       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6853 
6854       __ umov(r10, in2,  __ D, 1);
6855       __ umov(r11, out0, __ D, 1);
6856       __ umov(r12, out1, __ D, 1);
6857       __ umov(r13, out2, __ D, 1);
6858       __ b(StoreLegalData);
6859 
6860       __ BIND(ErrorInLowerHalf);
6861     }
6862     __ umov(r11, out0, __ D, 0);
6863     __ umov(r12, out1, __ D, 0);
6864     __ umov(r13, out2, __ D, 0);
6865 
6866     __ BIND(StoreLegalData);
6867     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6868     __ strb(r11, __ post(dst, 1));
6869     __ strb(r12, __ post(dst, 1));
6870     __ strb(r13, __ post(dst, 1));
6871     __ lsr(r10, r10, 8);
6872     __ lsr(r11, r11, 8);
6873     __ lsr(r12, r12, 8);
6874     __ lsr(r13, r13, 8);
6875     __ b(StoreLegalData);
6876 
6877     __ BIND(NoIllegalData);
6878     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6879   }
6880 
6881 
6882    /**
6883    *  Arguments:
6884    *
6885    *  Input:
6886    *  c_rarg0   - src_start
6887    *  c_rarg1   - src_offset
6888    *  c_rarg2   - src_length
6889    *  c_rarg3   - dest_start
6890    *  c_rarg4   - dest_offset
6891    *  c_rarg5   - isURL
6892    *  c_rarg6   - isMIME
6893    *
6894    */
6895   address generate_base64_decodeBlock() {
6896 
6897     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6898     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6899     // titled "Base64 decoding".
6900 
6901     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
6902     // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That
6903     // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
6904     static const uint8_t fromBase64ForNoSIMD[256] = {
6905       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6906       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6907       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6908        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6909       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6910        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6911       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6912        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6913       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6914       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6915       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6916       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6917       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6918       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6919       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6920       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6921     };
6922 
6923     static const uint8_t fromBase64URLForNoSIMD[256] = {
6924       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6925       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6926       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6927        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6928       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6929        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6930       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6931        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6932       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6933       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6934       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6935       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6936       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6937       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6938       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6939       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6940     };
6941 
6942     // A legal Base64 code value is in the range [0, 127].  We need two table
6943     // lookups with tbl/tbx and combine them to get the decoded data. The 1st table
6944     // vector lookup uses tbl; out-of-range indices are set to 0 in the destination.
6945     // The 2nd table vector lookup uses tbx; out-of-range indices leave the
6946     // destination unchanged. Input bytes in [64, 126] are mapped to indices [65, 127]
6947     // in the second lookup. The entry at index 64 is set to 0, so we know the
6948     // decoded data was already obtained by the 1st lookup.
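         //
         // In C, approximately (a sketch of the lane-wise effect of the two lookups;
         // idx2 is assumed here to be the input adjusted, in the simdround code above,
         // to select into the upper 64 bytes of the 128-byte table):
         //
         //   d[i] = (idx[i]  < 64) ? table_lo[idx[i]]  : 0;     // tbl
         //   d[i] = (idx2[i] < 64) ? table_hi[idx2[i]] : d[i];  // tbx
         //   // d[i] == 255 flags an illegal input byte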
6949     static const uint8_t fromBase64ForSIMD[128] = {
6950       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6951       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6952       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6953        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6954         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6955        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6956       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6957        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6958     };
6959 
6960     static const uint8_t fromBase64URLForSIMD[128] = {
6961       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6962       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6963       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6964        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6965         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6966        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6967        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6968        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6969     };
6970 
6971     __ align(CodeEntryAlignment);
6972     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6973     address start = __ pc();
6974 
6975     Register src    = c_rarg0;  // source array
6976     Register soff   = c_rarg1;  // source start offset
6977     Register send   = c_rarg2;  // source end offset
6978     Register dst    = c_rarg3;  // dest array
6979     Register doff   = c_rarg4;  // position for writing to dest array
6980     Register isURL  = c_rarg5;  // Base64 or URL character set
6981     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6982 
6983     Register length = send;    // reuse send as length of source data to process
6984 
6985     Register simd_codec   = c_rarg6;
6986     Register nosimd_codec = c_rarg7;
6987 
6988     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6989 
6990     __ enter();
6991 
6992     __ add(src, src, soff);
6993     __ add(dst, dst, doff);
6994 
6995     __ mov(doff, dst);
6996 
6997     __ sub(length, send, soff);
6998     __ bfm(length, zr, 0, 1);   // clear the low two bits, i.e. round length down to a multiple of 4
6999 
7000     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
7001     __ cbz(isURL, ProcessData);
7002     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
7003 
7004     __ BIND(ProcessData);
7005     __ mov(rscratch1, length);
7006     __ cmp(length, (u1)144); // 144 = 80 + 64
7007     __ br(Assembler::LT, Process4B);
7008 
7009     // In the MIME case, the line length cannot be more than 76
7010     // bytes (see RFC 2045). This is too short a block for SIMD
7011     // to be worthwhile, so we use non-SIMD here.
7012     __ movw(rscratch1, 79);
7013 
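         // In C, approximately (a sketch of one non-SIMD iteration; the register
         // assignment and the rev16/strh store used below differ, but the decoded
         // bytes are the same):
         //
         //   uint8_t c0 = codec[src[0]], c1 = codec[src[1]],
         //           c2 = codec[src[2]], c3 = codec[src[3]];
         //   if ((c0 | c1 | c2 | c3) & 0x80) goto Exit;   // 255 marks illegal input
         //   dst[0] = (uint8_t)((c0 << 2) | (c1 >> 4));
         //   dst[1] = (uint8_t)((c1 << 4) | (c2 >> 2));
         //   dst[2] = (uint8_t)((c2 << 6) |  c3);
         //   src += 4; dst += 3;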
7014     __ BIND(Process4B);
7015     __ ldrw(r14, __ post(src, 4));
7016     __ ubfxw(r10, r14, 0,  8);
7017     __ ubfxw(r11, r14, 8,  8);
7018     __ ubfxw(r12, r14, 16, 8);
7019     __ ubfxw(r13, r14, 24, 8);
7020     // look up the decoded values
7021     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
7022     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
7023     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
7024     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
7025     // error detection: 255u indicates an illegal input
7026     __ orrw(r14, r10, r11);
7027     __ orrw(r15, r12, r13);
7028     __ orrw(r14, r14, r15);
7029     __ tbnz(r14, 7, Exit);
7030     // recover the data
7031     __ lslw(r14, r10, 10);
7032     __ bfiw(r14, r11, 4, 6);
7033     __ bfmw(r14, r12, 2, 5);
7034     __ rev16w(r14, r14);
7035     __ bfiw(r13, r12, 6, 2);
7036     __ strh(r14, __ post(dst, 2));
7037     __ strb(r13, __ post(dst, 1));
7038     // non-simd loop
7039     __ subsw(rscratch1, rscratch1, 4);
7040     __ br(Assembler::GT, Process4B);
7041 
7042     // If we came through the 80-byte pre-processing path above (rscratch1 was set to 79),
7043     // rscratch1 == -1 here; otherwise rscratch1 started as a multiple of 4 and is now 0.
7044     __ cbzw(rscratch1, Exit);
7045     __ sub(length, length, 80);
7046 
7047     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
7048     __ cbz(isURL, SIMDEnter);
7049     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
7050 
7051     __ BIND(SIMDEnter);
7052     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
7053     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
7054     __ mov(rscratch1, 63);
7055     __ dup(v27, __ T16B, rscratch1);
7056 
7057     __ BIND(Process64B);
7058     __ cmp(length, (u1)64);
7059     __ br(Assembler::LT, Process32B);
7060     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
7061     __ sub(length, length, 64);
7062     __ b(Process64B);
7063 
7064     __ BIND(Process32B);
7065     __ cmp(length, (u1)32);
7066     __ br(Assembler::LT, SIMDExit);
7067     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
7068     __ sub(length, length, 32);
7069     __ b(Process32B);
7070 
7071     __ BIND(SIMDExit);
7072     __ cbz(length, Exit);
7073     __ movw(rscratch1, length);
7074     __ b(Process4B);
7075 
7076     __ BIND(Exit);
7077     __ sub(c_rarg0, dst, doff);
7078 
7079     __ leave();
7080     __ ret(lr);
7081 
7082     return start;
7083   }
7084 
7085   // Support for spin waits.
7086   address generate_spin_wait() {
7087     __ align(CodeEntryAlignment);
7088     StubCodeMark mark(this, "StubRoutines", "spin_wait");
7089     address start = __ pc();
7090 
7091     __ spin_wait();
7092     __ ret(lr);
7093 
7094     return start;
7095   }
7096 
7097   address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
7098     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");
7099 
7100     address start = __ pc();
7101     const Register
7102       r_super_klass  = r0,
7103       r_array_base   = r1,
7104       r_array_length = r2,
7105       r_array_index  = r3,
7106       r_sub_klass    = r4,
7107       r_bitmap       = rscratch2,
7108       result         = r5;
7109     const FloatRegister
7110       vtemp          = v0;
7111 
7112     Label L_success;
7113     __ enter();
7114     __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
7115                                            r_array_base, r_array_length, r_array_index,
7116                                            vtemp, result, super_klass_index,
7117                                            /*stub_is_near*/true);
7118     __ leave();
7119     __ ret(lr);
7120 
7121     return start;
7122   }
7123 
7124   // Slow path implementation for UseSecondarySupersTable.
7125   address generate_lookup_secondary_supers_table_slow_path_stub() {
7126     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path");
7127 
7128     address start = __ pc();
7129     const Register
7130       r_super_klass  = r0,        // argument
7131       r_array_base   = r1,        // argument
7132       temp1          = r2,        // temp
7133       r_array_index  = r3,        // argument
7134       r_bitmap       = rscratch2, // argument
7135       result         = r5;        // argument
7136 
7137     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
7138     __ ret(lr);
7139 
7140     return start;
7141   }
7142 
7143 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
7144 
7145   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
7146   //
7147   // If LSE is in use, generate LSE versions of all the stubs. The
7148   // non-LSE versions are in atomic_aarch64.S.
7149 
7150   // class AtomicStubMark records the entry point of a stub and the
7151   // stub pointer which will point to it. The stub pointer is set to
7152   // the entry point when ~AtomicStubMark() is called, which must be
7153   // after ICache::invalidate_range. This ensures safe publication of
7154   // the generated code.
7155   class AtomicStubMark {
7156     address _entry_point;
7157     aarch64_atomic_stub_t *_stub;
7158     MacroAssembler *_masm;
7159   public:
7160     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
7161       _masm = masm;
7162       __ align(32);
7163       _entry_point = __ pc();
7164       _stub = stub;
7165     }
7166     ~AtomicStubMark() {
7167       *_stub = (aarch64_atomic_stub_t)_entry_point;
7168     }
7169   };
7170 
7171   // NB: For memory_order_conservative we need a trailing membar after
7172   // LSE atomic operations but not a leading membar.
7173   //
7174   // We don't need a leading membar because a clause in the Arm ARM
7175   // says:
7176   //
7177   //   Barrier-ordered-before
7178   //
7179   //   Barrier instructions order prior Memory effects before subsequent
7180   //   Memory effects generated by the same Observer. A read or a write
7181   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
7182   //   Observer if and only if RW1 appears in program order before RW2
7183   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
7184   //   instruction with both Acquire and Release semantics.
7185   //
7186   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
7187   // and Release semantics, therefore we don't need a leading
7188   // barrier. However, there is no corresponding Barrier-ordered-after
7189   // relationship, therefore we need a trailing membar to prevent a
7190   // later store or load from being reordered with the store in an
7191   // atomic instruction.
7192   //
7193   // This was checked by using the herd7 consistency model simulator
7194   // (http://diy.inria.fr/) with this test case:
7195   //
7196   // AArch64 LseCas
7197   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
7198   // P0 | P1;
7199   // LDR W4, [X2] | MOV W3, #0;
7200   // DMB LD       | MOV W4, #1;
7201   // LDR W3, [X1] | CASAL W3, W4, [X1];
7202   //              | DMB ISH;
7203   //              | STR W4, [X2];
7204   // exists
7205   // (0:X3=0 /\ 0:X4=1)
7206   //
7207   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
7208   // with the store to x in P1. Without the DMB in P1 this may happen.
7209   //
7210   // At the time of writing we don't know of any AArch64 hardware that
7211   // reorders stores in this way, but the Reference Manual permits it.
7212 
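       // In C, approximately (a sketch of the contract of the stub emitted below by
       // gen_cas_entry, not of the LSE encoding; T stands for the 8/16/32/64-bit type
       // selected by 'size', the relaxed/release variants use correspondingly weaker
       // orderings, and the trailing fence is emitted only for memory_order_conservative):
       //
       //   T cas(T *ptr, T compare_val, T exchange_val) {
       //     T expected = compare_val;
       //     __atomic_compare_exchange_n(ptr, &expected, exchange_val, /*weak*/false,
       //                                 __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
       //     __atomic_thread_fence(__ATOMIC_SEQ_CST);  // trailing membar
       //     return expected;                          // previous value, returned in r0
       //   }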
7213   void gen_cas_entry(Assembler::operand_size size,
7214                      atomic_memory_order order) {
7215     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
7216       exchange_val = c_rarg2;
7217     bool acquire, release;
7218     switch (order) {
7219       case memory_order_relaxed:
7220         acquire = false;
7221         release = false;
7222         break;
7223       case memory_order_release:
7224         acquire = false;
7225         release = true;
7226         break;
7227       default:
7228         acquire = true;
7229         release = true;
7230         break;
7231     }
7232     __ mov(prev, compare_val);
7233     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
7234     if (order == memory_order_conservative) {
7235       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7236     }
7237     if (size == Assembler::xword) {
7238       __ mov(r0, prev);
7239     } else {
7240       __ movw(r0, prev);
7241     }
7242     __ ret(lr);
7243   }
7244 
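       // In C, approximately (a sketch of the stub emitted below; the trailing fence
       // is emitted only in the non-relaxed, i.e. conservative, case):
       //
       //   T fetch_add(T *addr, T incr) {
       //     T prev = __atomic_fetch_add(addr, incr, __ATOMIC_ACQ_REL);
       //     __atomic_thread_fence(__ATOMIC_SEQ_CST);
       //     return prev;                              // previous value, returned in r0
       //   }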
7245   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
7246     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
7247     // If not relaxed, then default to conservative.  Relaxed is the only
7248     // case we use enough to be worth specializing.
7249     if (order == memory_order_relaxed) {
7250       __ ldadd(size, incr, prev, addr);
7251     } else {
7252       __ ldaddal(size, incr, prev, addr);
7253       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7254     }
7255     if (size == Assembler::xword) {
7256       __ mov(r0, prev);
7257     } else {
7258       __ movw(r0, prev);
7259     }
7260     __ ret(lr);
7261   }
7262 
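       // In C, approximately (a sketch of the stub emitted below):
       //
       //   T xchg(T *addr, T new_value) {
       //     T prev = __atomic_exchange_n(addr, new_value, __ATOMIC_ACQ_REL);
       //     __atomic_thread_fence(__ATOMIC_SEQ_CST);  // trailing membar
       //     return prev;                              // previous value, returned in r0
       //   }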
7263   void gen_swpal_entry(Assembler::operand_size size) {
7264     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
7265     __ swpal(size, incr, prev, addr);
7266     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7267     if (size == Assembler::xword) {
7268       __ mov(r0, prev);
7269     } else {
7270       __ movw(r0, prev);
7271     }
7272     __ ret(lr);
7273   }
7274 
7275   void generate_atomic_entry_points() {
7276     if (! UseLSE) {
7277       return;
7278     }
7279 
7280     __ align(CodeEntryAlignment);
7281     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
7282     address first_entry = __ pc();
7283 
7284     // ADD, memory_order_conservative
7285     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
7286     gen_ldadd_entry(Assembler::word, memory_order_conservative);
7287     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
7288     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
7289 
7290     // ADD, memory_order_relaxed
7291     AtomicStubMark mark_fetch_add_4_relaxed
7292       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
7293     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
7294     AtomicStubMark mark_fetch_add_8_relaxed
7295       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
7296     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
7297 
7298     // XCHG, memory_order_conservative
7299     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
7300     gen_swpal_entry(Assembler::word);
7301     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
7302     gen_swpal_entry(Assembler::xword);
7303 
7304     // CAS, memory_order_conservative
7305     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
7306     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
7307     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
7308     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
7309     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
7310     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
7311 
7312     // CAS, memory_order_relaxed
7313     AtomicStubMark mark_cmpxchg_1_relaxed
7314       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
7315     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
7316     AtomicStubMark mark_cmpxchg_4_relaxed
7317       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
7318     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
7319     AtomicStubMark mark_cmpxchg_8_relaxed
7320       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
7321     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
7322 
7323     AtomicStubMark mark_cmpxchg_4_release
7324       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
7325     gen_cas_entry(MacroAssembler::word, memory_order_release);
7326     AtomicStubMark mark_cmpxchg_8_release
7327       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
7328     gen_cas_entry(MacroAssembler::xword, memory_order_release);
7329 
7330     AtomicStubMark mark_cmpxchg_4_seq_cst
7331       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
7332     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
7333     AtomicStubMark mark_cmpxchg_8_seq_cst
7334       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
7335     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
7336 
7337     ICache::invalidate_range(first_entry, __ pc() - first_entry);
7338   }
7339 #endif // LINUX
7340 
7341   address generate_cont_thaw(Continuation::thaw_kind kind) {
7342     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
7343     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
7344 
7345     address start = __ pc();
7346 
7347     if (return_barrier) {
7348       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7349       __ mov(sp, rscratch1);
7350     }
7351     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7352 
7353     if (return_barrier) {
7354       // preserve possible return value from a method returning to the return barrier
7355       __ fmovd(rscratch1, v0);
7356       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7357     }
7358 
7359     __ movw(c_rarg1, (return_barrier ? 1 : 0));
7360     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
7361     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
7362 
7363     if (return_barrier) {
7364       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7365       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7366       __ fmovd(v0, rscratch1);
7367     }
7368     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7369 
7370 
7371     Label thaw_success;
7372     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
7373     __ cbnz(rscratch2, thaw_success);
7374     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
7375     __ br(rscratch1);
7376     __ bind(thaw_success);
7377 
7378     // make room for the thawed frames
7379     __ sub(rscratch1, sp, rscratch2);
7380     __ andr(rscratch1, rscratch1, -16); // align
7381     __ mov(sp, rscratch1);
7382 
7383     if (return_barrier) {
7384       // save original return value -- again
7385       __ fmovd(rscratch1, v0);
7386       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7387     }
7388 
7389     // If we want, we can templatize thaw by kind, and have three different entries
7390     __ movw(c_rarg1, (uint32_t)kind);
7391 
7392     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
7393     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
7394 
7395     if (return_barrier) {
7396       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7397       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7398       __ fmovd(v0, rscratch1);
7399     } else {
7400       __ mov(r0, zr); // return 0 (success) from doYield
7401     }
7402 
7403     // We're now on the yield frame (which is at an address above us, because sp has been pushed down)
7404     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
7405     __ mov(rfp, sp);
7406 
7407     if (return_barrier_exception) {
7408       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
7409       __ authenticate_return_address(c_rarg1);
7410       __ verify_oop(r0);
7411       // save return value containing the exception oop in callee-saved R19
7412       __ mov(r19, r0);
7413 
7414       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
7415 
7416       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
7417       // __ reinitialize_ptrue();
7418 
7419       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7420 
7421       __ mov(r1, r0); // the exception handler
7422       __ mov(r0, r19); // restore return value containing the exception oop
7423       __ verify_oop(r0);
7424 
7425       __ leave();
7426       __ mov(r3, lr);
7427       __ br(r1); // the exception handler
7428     } else {
7429       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7430       __ leave();
7431       __ ret(lr);
7432     }
7433 
7434     return start;
7435   }
7436 
7437   address generate_cont_thaw() {
7438     if (!Continuations::enabled()) return nullptr;
7439 
7440     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
7441     address start = __ pc();
7442     generate_cont_thaw(Continuation::thaw_top);
7443     return start;
7444   }
7445 
7446   address generate_cont_returnBarrier() {
7447     if (!Continuations::enabled()) return nullptr;
7448 
7449     // TODO: will probably need multiple return barriers depending on return type
7450     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
7451     address start = __ pc();
7452 
7453     generate_cont_thaw(Continuation::thaw_return_barrier);
7454 
7455     return start;
7456   }
7457 
7458   address generate_cont_returnBarrier_exception() {
7459     if (!Continuations::enabled()) return nullptr;
7460 
7461     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
7462     address start = __ pc();
7463 
7464     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7465 
7466     return start;
7467   }
7468 
7469   address generate_cont_preempt_stub() {
7470     if (!Continuations::enabled()) return nullptr;
7471     StubCodeMark mark(this, "StubRoutines","Continuation preempt stub");
7472     address start = __ pc();
7473 
7474     __ reset_last_Java_frame(true);
7475 
7476     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
7477     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
7478     __ mov(sp, rscratch2);
7479 
7480     Label preemption_cancelled;
7481     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
7482     __ cbnz(rscratch1, preemption_cancelled);
7483 
7484     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
7485     SharedRuntime::continuation_enter_cleanup(_masm);
7486     __ leave();
7487     __ ret(lr);
7488 
7489     // We acquired the monitor after freezing the frames, so call thaw to continue execution.
7490     __ bind(preemption_cancelled);
7491     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
7492     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
7493     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
7494     __ ldr(rscratch1, Address(rscratch1));
7495     __ br(rscratch1);
7496 
7497     return start;
7498   }
7499 
7500   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
7501   // are represented as long[5], with BITS_PER_LIMB = 26.
7502   // Pack five 26-bit limbs into three 64-bit registers.
7503   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
7504     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
7505     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
7506     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
7507     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
7508 
7509     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
7510     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
7511     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
7512     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
7513 
7514     if (dest2->is_valid()) {
7515       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7516     } else {
7517 #ifdef ASSERT
7518       Label OK;
7519       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7520       __ br(__ EQ, OK);
7521       __ stop("high bits of Poly1305 integer should be zero");
7522       __ should_not_reach_here();
7523       __ bind(OK);
7524 #endif
7525     }
7526   }
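       // In C, approximately (a sketch; limbs[] is the long[5] representation
       // described above):
       //
       //   dest0 = limbs[0] | (limbs[1] << 26) | (limbs[2] << 52);
       //   dest1 = (limbs[2] >> 12) | (limbs[3] << 14) | (limbs[4] << 40);
       //   dest2 = limbs[4] >> 24;   // at most 2 bits; must be zero when dest2 is noreg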
7527 
7528   // As above, but return only a 128-bit integer, packed into two
7529   // 64-bit registers.
7530   void pack_26(Register dest0, Register dest1, Register src) {
7531     pack_26(dest0, dest1, noreg, src);
7532   }
7533 
7534   // Multiply and multiply-accumulate unsigned 64-bit registers.
7535   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
7536     __ mul(prod_lo, n, m);
7537     __ umulh(prod_hi, n, m);
7538   }
7539   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
7540     wide_mul(rscratch1, rscratch2, n, m);
7541     __ adds(sum_lo, sum_lo, rscratch1);
7542     __ adc(sum_hi, sum_hi, rscratch2);
7543   }
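       // In C, approximately (a sketch using a 128-bit type):
       //
       //   // wide_mul:
       //   unsigned __int128 p = (unsigned __int128)n * m;
       //   prod_lo = (julong)p;   prod_hi = (julong)(p >> 64);
       //   // wide_madd: accumulate n * m into the 128-bit value sum_hi:sum_lo
       //   unsigned __int128 s = ((unsigned __int128)sum_hi << 64) | sum_lo;
       //   s += (unsigned __int128)n * m;
       //   sum_lo = (julong)s;    sum_hi = (julong)(s >> 64);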
7544 
7545   // Poly1305, RFC 7539
7546 
7547   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
7548   // description of the tricks used to simplify and accelerate this
7549   // computation.
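       //
       // In pseudocode, the per-block update implemented below is approximately
       // (U is the 130-bit accumulator, R the clamped 128-bit key):
       //
       //   for (each 16-byte little-endian block B of the input)
       //     U = ((U + B + 2^128) * R) mod (2^130 - 5)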
7550 
7551   address generate_poly1305_processBlocks() {
7552     __ align(CodeEntryAlignment);
7553     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
7554     address start = __ pc();
7555     Label here;
7556     __ enter();
7557     RegSet callee_saved = RegSet::range(r19, r28);
7558     __ push(callee_saved, sp);
7559 
7560     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
7561 
7562     // Arguments
7563     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
7564 
7565     // R_n is the 128-bit randomly-generated key, packed into two
7566     // registers.  The caller passes this key to us as long[5], with
7567     // BITS_PER_LIMB = 26.
7568     const Register R_0 = *++regs, R_1 = *++regs;
7569     pack_26(R_0, R_1, r_start);
7570 
7571     // RR_n is (R_n >> 2) * 5
7572     const Register RR_0 = *++regs, RR_1 = *++regs;
7573     __ lsr(RR_0, R_0, 2);
7574     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
7575     __ lsr(RR_1, R_1, 2);
7576     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
7577 
7578     // U_n is the current checksum
7579     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
7580     pack_26(U_0, U_1, U_2, acc_start);
7581 
7582     static constexpr int BLOCK_LENGTH = 16;
7583     Label DONE, LOOP;
7584 
7585     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7586     __ br(Assembler::LT, DONE); {
7587       __ bind(LOOP);
7588 
7589       // S_n is to be the sum of U_n and the next block of data
7590       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7591       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
7592       __ adds(S_0, U_0, S_0);
7593       __ adcs(S_1, U_1, S_1);
7594       __ adc(S_2, U_2, zr);
7595       __ add(S_2, S_2, 1);
7596 
7597       const Register U_0HI = *++regs, U_1HI = *++regs;
7598 
7599       // NB: this logic depends on some of the special properties of
7600       // Poly1305 keys. In particular, because we know that the top
7601       // four bits of R_0 and R_1 are zero, we can add together
7602       // partial products without any risk of needing to propagate a
7603       // carry out.
7604       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
7605       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
7606       __ andr(U_2, R_0, 3);
7607       __ mul(U_2, S_2, U_2);
7608 
7609       // Recycle registers S_0, S_1, S_2
7610       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
7611 
7612       // Partial reduction mod 2**130 - 5
7613       __ adds(U_1, U_0HI, U_1);
7614       __ adc(U_2, U_1HI, U_2);
7615       // Sum now in U_2:U_1:U_0.
7616       // Dead: U_0HI, U_1HI.
7617       regs = (regs.remaining() + U_0HI + U_1HI).begin();
7618 
7619       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
7620 
7621       // First, U_2:U_1:U_0 += (U_2 >> 2)
7622       __ lsr(rscratch1, U_2, 2);
7623       __ andr(U_2, U_2, (u8)3);
7624       __ adds(U_0, U_0, rscratch1);
7625       __ adcs(U_1, U_1, zr);
7626       __ adc(U_2, U_2, zr);
7627       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
7628       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
7629       __ adcs(U_1, U_1, zr);
7630       __ adc(U_2, U_2, zr);
7631 
7632       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
7633       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7634       __ br(~ Assembler::LT, LOOP);
7635     }
7636 
7637     // Further reduce modulo 2^130 - 5
7638     __ lsr(rscratch1, U_2, 2);
7639     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
7640     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
7641     __ adcs(U_1, U_1, zr);
7642     __ andr(U_2, U_2, (u1)3);
7643     __ adc(U_2, U_2, zr);
7644 
7645     // Unpack the sum into five 26-bit limbs and write to memory.
7646     __ ubfiz(rscratch1, U_0, 0, 26);
7647     __ ubfx(rscratch2, U_0, 26, 26);
7648     __ stp(rscratch1, rscratch2, Address(acc_start));
7649     __ ubfx(rscratch1, U_0, 52, 12);
7650     __ bfi(rscratch1, U_1, 12, 14);
7651     __ ubfx(rscratch2, U_1, 14, 26);
7652     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
7653     __ ubfx(rscratch1, U_1, 40, 24);
7654     __ bfi(rscratch1, U_2, 24, 3);
7655     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
7656 
7657     __ bind(DONE);
7658     __ pop(callee_saved, sp);
7659     __ leave();
7660     __ ret(lr);
7661 
7662     return start;
7663   }
7664 
7665   // exception handler for upcall stubs
7666   address generate_upcall_stub_exception_handler() {
7667     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
7668     address start = __ pc();
7669 
7670     // Native caller has no idea how to handle exceptions,
7671     // so we just crash here. Up to callee to catch exceptions.
7672     __ verify_oop(r0);
7673     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
7674     __ blr(rscratch1);
7675     __ should_not_reach_here();
7676 
7677     return start;
7678   }
7679 
7680   // load Method* target of MethodHandle
7681   // j_rarg0 = jobject receiver
7682   // rmethod = result
7683   address generate_upcall_stub_load_target() {
7684     StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target");
7685     address start = __ pc();
7686 
7687     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
7688     // Load target method from receiver
7689     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
7690     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
7691     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
7692     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
7693                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
7694                       noreg, noreg);
7695     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
7696 
7697     __ ret(lr);
7698 
7699     return start;
7700   }
7701 
7702 #undef __
7703 #define __ masm->
7704 
7705   class MontgomeryMultiplyGenerator : public MacroAssembler {
7706 
7707     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7708       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7709 
7710     RegSet _toSave;
7711     bool _squaring;
7712 
7713   public:
7714     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7715       : MacroAssembler(as->code()), _squaring(squaring) {
7716 
7717       // Register allocation
7718 
7719       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7720       Pa_base = *regs;       // Argument registers
7721       if (squaring)
7722         Pb_base = Pa_base;
7723       else
7724         Pb_base = *++regs;
7725       Pn_base = *++regs;
7726       Rlen= *++regs;
7727       inv = *++regs;
7728       Pm_base = *++regs;
7729 
7730                           // Working registers:
7731       Ra =  *++regs;        // The current digit of a, b, n, and m.
7732       Rb =  *++regs;
7733       Rm =  *++regs;
7734       Rn =  *++regs;
7735 
7736       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7737       Pb =  *++regs;
7738       Pm =  *++regs;
7739       Pn =  *++regs;
7740 
7741       t0 =  *++regs;        // Three registers which form a
7742       t1 =  *++regs;        // triple-precision accumulator.
7743       t2 =  *++regs;
7744 
7745       Ri =  *++regs;        // Inner and outer loop indexes.
7746       Rj =  *++regs;
7747 
7748       Rhi_ab = *++regs;     // Product registers: low and high parts
7749       Rlo_ab = *++regs;     // of a*b and m*n.
7750       Rhi_mn = *++regs;
7751       Rlo_mn = *++regs;
7752 
7753       // r19 and up are callee-saved.
7754       _toSave = RegSet::range(r19, *regs) + Pm_base;
7755     }
7756 
7757   private:
7758     void save_regs() {
7759       push(_toSave, sp);
7760     }
7761 
7762     void restore_regs() {
7763       pop(_toSave, sp);
7764     }
7765 
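         // Emit a loop that executes the code generated by `block` a total of
         // `count` times (count is a runtime value), unrolled by two: an odd
         // count enters at the second copy via label `odd`, and a count of zero
         // skips the loop entirely.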
7766     template <typename T>
7767     void unroll_2(Register count, T block) {
7768       Label loop, end, odd;
7769       tbnz(count, 0, odd);
7770       cbz(count, end);
7771       align(16);
7772       bind(loop);
7773       (this->*block)();
7774       bind(odd);
7775       (this->*block)();
7776       subs(count, count, 2);
7777       br(Assembler::GT, loop);
7778       bind(end);
7779     }
7780 
7781     template <typename T>
7782     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7783       Label loop, end, odd;
7784       tbnz(count, 0, odd);
7785       cbz(count, end);
7786       align(16);
7787       bind(loop);
7788       (this->*block)(d, s, tmp);
7789       bind(odd);
7790       (this->*block)(d, s, tmp);
7791       subs(count, count, 2);
7792       br(Assembler::GT, loop);
7793       bind(end);
7794     }
7795 
7796     void pre1(RegisterOrConstant i) {
7797       block_comment("pre1");
7798       // Pa = Pa_base;
7799       // Pb = Pb_base + i;
7800       // Pm = Pm_base;
7801       // Pn = Pn_base + i;
7802       // Ra = *Pa;
7803       // Rb = *Pb;
7804       // Rm = *Pm;
7805       // Rn = *Pn;
7806       ldr(Ra, Address(Pa_base));
7807       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7808       ldr(Rm, Address(Pm_base));
7809       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7810       lea(Pa, Address(Pa_base));
7811       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7812       lea(Pm, Address(Pm_base));
7813       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7814 
7815       // Zero the m*n result.
7816       mov(Rhi_mn, zr);
7817       mov(Rlo_mn, zr);
7818     }
7819 
7820     // The core multiply-accumulate step of a Montgomery
7821     // multiplication.  The idea is to schedule operations as a
7822     // pipeline so that instructions with long latencies (loads and
7823     // multiplies) have time to complete before their results are
7824     // used.  This most benefits in-order implementations of the
7825     // architecture but out-of-order ones also benefit.
7826     void step() {
7827       block_comment("step");
7828       // MACC(Ra, Rb, t0, t1, t2);
7829       // Ra = *++Pa;
7830       // Rb = *--Pb;
7831       umulh(Rhi_ab, Ra, Rb);
7832       mul(Rlo_ab, Ra, Rb);
7833       ldr(Ra, pre(Pa, wordSize));
7834       ldr(Rb, pre(Pb, -wordSize));
7835       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7836                                        // previous iteration.
7837       // MACC(Rm, Rn, t0, t1, t2);
7838       // Rm = *++Pm;
7839       // Rn = *--Pn;
7840       umulh(Rhi_mn, Rm, Rn);
7841       mul(Rlo_mn, Rm, Rn);
7842       ldr(Rm, pre(Pm, wordSize));
7843       ldr(Rn, pre(Pn, -wordSize));
7844       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7845     }
7846 
7847     void post1() {
7848       block_comment("post1");
7849 
7850       // MACC(Ra, Rb, t0, t1, t2);
7851       // Ra = *++Pa;
7852       // Rb = *--Pb;
7853       umulh(Rhi_ab, Ra, Rb);
7854       mul(Rlo_ab, Ra, Rb);
7855       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7856       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7857 
7858       // *Pm = Rm = t0 * inv;
7859       mul(Rm, t0, inv);
7860       str(Rm, Address(Pm));
7861 
7862       // MACC(Rm, Rn, t0, t1, t2);
7863       // t0 = t1; t1 = t2; t2 = 0;
7864       umulh(Rhi_mn, Rm, Rn);
7865 
7866 #ifndef PRODUCT
7867       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7868       {
7869         mul(Rlo_mn, Rm, Rn);
7870         add(Rlo_mn, t0, Rlo_mn);
7871         Label ok;
7872         cbz(Rlo_mn, ok); {
7873           stop("broken Montgomery multiply");
7874         } bind(ok);
7875       }
7876 #endif
7877       // We have very carefully set things up so that
7878       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7879       // the lower half of Rm * Rn because we know the result already:
7880       // it must be -t0.  t0 + (-t0) must generate a carry iff
7881       // t0 != 0.  So, rather than do a mul and an adds we just set
7882       // the carry flag iff t0 is nonzero.
7883       //
7884       // mul(Rlo_mn, Rm, Rn);
7885       // adds(zr, t0, Rlo_mn);
7886       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7887       adcs(t0, t1, Rhi_mn);
7888       adc(t1, t2, zr);
7889       mov(t2, zr);
7890     }
7891 
7892     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7893       block_comment("pre2");
7894       // Pa = Pa_base + i-len;
7895       // Pb = Pb_base + len;
7896       // Pm = Pm_base + i-len;
7897       // Pn = Pn_base + len;
7898 
7899       if (i.is_register()) {
7900         sub(Rj, i.as_register(), len);
7901       } else {
7902         mov(Rj, i.as_constant());
7903         sub(Rj, Rj, len);
7904       }
7905       // Rj == i-len
7906 
7907       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7908       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7909       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7910       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7911 
7912       // Ra = *++Pa;
7913       // Rb = *--Pb;
7914       // Rm = *++Pm;
7915       // Rn = *--Pn;
7916       ldr(Ra, pre(Pa, wordSize));
7917       ldr(Rb, pre(Pb, -wordSize));
7918       ldr(Rm, pre(Pm, wordSize));
7919       ldr(Rn, pre(Pn, -wordSize));
7920 
7921       mov(Rhi_mn, zr);
7922       mov(Rlo_mn, zr);
7923     }
7924 
7925     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7926       block_comment("post2");
7927       if (i.is_constant()) {
7928         mov(Rj, i.as_constant()-len.as_constant());
7929       } else {
7930         sub(Rj, i.as_register(), len);
7931       }
7932 
7933       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7934 
7935       // As soon as we know the least significant digit of our result,
7936       // store it.
7937       // Pm_base[i-len] = t0;
7938       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7939 
7940       // t0 = t1; t1 = t2; t2 = 0;
7941       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7942       adc(t1, t2, zr);
7943       mov(t2, zr);
7944     }
7945 
7946     // A carry in t0 after Montgomery multiplication means that we
7947     // should subtract multiples of n from our result in m.  We'll
7948     // keep doing that until there is no carry.
7949     void normalize(RegisterOrConstant len) {
7950       block_comment("normalize");
7951       // while (t0)
7952       //   t0 = sub(Pm_base, Pn_base, t0, len);
7953       Label loop, post, again;
7954       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7955       cbz(t0, post); {
7956         bind(again); {
7957           mov(i, zr);
7958           mov(cnt, len);
7959           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7960           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7961           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7962           align(16);
7963           bind(loop); {
7964             sbcs(Rm, Rm, Rn);
7965             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7966             add(i, i, 1);
7967             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7968             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7969             sub(cnt, cnt, 1);
7970           } cbnz(cnt, loop);
7971           sbc(t0, t0, zr);
7972         } cbnz(t0, again);
7973       } bind(post);
7974     }
7975 
7976     // Move memory at s to d, reversing words.
7977     //    Increments d to end of copied memory
7978     //    Destroys tmp1, tmp2
7979     //    Preserves len
7980     //    Leaves s pointing to the address which was in d at start
7981     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7982       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7983       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7984 
7985       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7986       mov(tmp1, len);
7987       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7988       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7989     }
7990     // where
7991     void reverse1(Register d, Register s, Register tmp) {
7992       ldr(tmp, pre(s, -wordSize));
7993       ror(tmp, tmp, 32);
7994       str(tmp, post(d, wordSize));
7995     }
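         // In C, approximately, reverse() performs (viewing the data as juints):
         //
         //   juint *from = (juint *)s, *to = (juint *)d;
         //   for (int i = 0; i < 2 * len; i++)
         //     to[i] = from[2 * len - 1 - i];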
7996 
7997     void step_squaring() {
7998       // An extra ACC
7999       step();
8000       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
8001     }
8002 
8003     void last_squaring(RegisterOrConstant i) {
8004       Label dont;
8005       // if ((i & 1) == 0) {
8006       tbnz(i.as_register(), 0, dont); {
8007         // MACC(Ra, Rb, t0, t1, t2);
8008         // Ra = *++Pa;
8009         // Rb = *--Pb;
8010         umulh(Rhi_ab, Ra, Rb);
8011         mul(Rlo_ab, Ra, Rb);
8012         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
8013       } bind(dont);
8014     }
8015 
8016     void extra_step_squaring() {
8017       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
8018 
8019       // MACC(Rm, Rn, t0, t1, t2);
8020       // Rm = *++Pm;
8021       // Rn = *--Pn;
8022       umulh(Rhi_mn, Rm, Rn);
8023       mul(Rlo_mn, Rm, Rn);
8024       ldr(Rm, pre(Pm, wordSize));
8025       ldr(Rn, pre(Pn, -wordSize));
8026     }
8027 
8028     void post1_squaring() {
8029       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
8030 
8031       // *Pm = Rm = t0 * inv;
8032       mul(Rm, t0, inv);
8033       str(Rm, Address(Pm));
8034 
8035       // MACC(Rm, Rn, t0, t1, t2);
8036       // t0 = t1; t1 = t2; t2 = 0;
8037       umulh(Rhi_mn, Rm, Rn);
8038 
8039 #ifndef PRODUCT
8040       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
8041       {
8042         mul(Rlo_mn, Rm, Rn);
8043         add(Rlo_mn, t0, Rlo_mn);
8044         Label ok;
8045         cbz(Rlo_mn, ok); {
8046           stop("broken Montgomery multiply");
8047         } bind(ok);
8048       }
8049 #endif
8050       // We have very carefully set things up so that
8051       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
8052       // the lower half of Rm * Rn because we know the result already:
8053       // it must be -t0.  t0 + (-t0) must generate a carry iff
8054       // t0 != 0.  So, rather than do a mul and an adds we just set
8055       // the carry flag iff t0 is nonzero.
8056       //
8057       // mul(Rlo_mn, Rm, Rn);
8058       // adds(zr, t0, Rlo_mn);
8059       subs(zr, t0, 1); // Set carry iff t0 is nonzero
8060       adcs(t0, t1, Rhi_mn);
8061       adc(t1, t2, zr);
8062       mov(t2, zr);
8063     }
8064 
8065     void acc(Register Rhi, Register Rlo,
8066              Register t0, Register t1, Register t2) {
8067       adds(t0, t0, Rlo);
8068       adcs(t1, t1, Rhi);
8069       adc(t2, t2, zr);
8070     }
8071 
8072   public:
8073     /**
8074      * Fast Montgomery multiplication.  The derivation of the
8075      * algorithm is in A Cryptographic Library for the Motorola
8076      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
8077      *
8078      * Arguments:
8079      *
8080      * Inputs for multiplication:
8081      *   c_rarg0   - int array elements a
8082      *   c_rarg1   - int array elements b
8083      *   c_rarg2   - int array elements n (the modulus)
8084      *   c_rarg3   - int length
8085      *   c_rarg4   - int inv
8086      *   c_rarg5   - int array elements m (the result)
8087      *
8088      * Inputs for squaring:
8089      *   c_rarg0   - int array elements a
8090      *   c_rarg1   - int array elements n (the modulus)
8091      *   c_rarg2   - int length
8092      *   c_rarg3   - int inv
8093      *   c_rarg4   - int array elements m (the result)
8094      *
8095      */
8096     address generate_multiply() {
8097       Label argh, nothing;
8098       bind(argh);
8099       stop("MontgomeryMultiply total_allocation must be <= 8192");
8100 
8101       align(CodeEntryAlignment);
8102       address entry = pc();
8103 
8104       cbzw(Rlen, nothing);
8105 
8106       enter();
8107 
8108       // Make room.
8109       cmpw(Rlen, 512);
8110       br(Assembler::HI, argh);
8111       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8112       andr(sp, Ra, -2 * wordSize);
8113 
8114       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8115 
8116       {
8117         // Copy input args, reversing as we go.  We use Ra as a
8118         // temporary variable.
8119         reverse(Ra, Pa_base, Rlen, t0, t1);
8120         if (!_squaring)
8121           reverse(Ra, Pb_base, Rlen, t0, t1);
8122         reverse(Ra, Pn_base, Rlen, t0, t1);
8123       }
8124 
8125       // Push all call-saved registers and also Pm_base which we'll need
8126       // at the end.
8127       save_regs();
8128 
8129 #ifndef PRODUCT
8130       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
8131       {
8132         ldr(Rn, Address(Pn_base, 0));
8133         mul(Rlo_mn, Rn, inv);
8134         subs(zr, Rlo_mn, -1);
8135         Label ok;
8136         br(EQ, ok); {
8137           stop("broken inverse in Montgomery multiply");
8138         } bind(ok);
8139       }
8140 #endif
8141 
8142       mov(Pm_base, Ra);
8143 
8144       mov(t0, zr);
8145       mov(t1, zr);
8146       mov(t2, zr);
8147 
8148       block_comment("for (int i = 0; i < len; i++) {");
8149       mov(Ri, zr); {
8150         Label loop, end;
8151         cmpw(Ri, Rlen);
8152         br(Assembler::GE, end);
8153 
8154         bind(loop);
8155         pre1(Ri);
8156 
8157         block_comment("  for (j = i; j; j--) {"); {
8158           movw(Rj, Ri);
8159           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
8160         } block_comment("  } // j");
8161 
8162         post1();
8163         addw(Ri, Ri, 1);
8164         cmpw(Ri, Rlen);
8165         br(Assembler::LT, loop);
8166         bind(end);
8167         block_comment("} // i");
8168       }
8169 
8170       block_comment("for (int i = len; i < 2*len; i++) {");
8171       mov(Ri, Rlen); {
8172         Label loop, end;
8173         cmpw(Ri, Rlen, Assembler::LSL, 1);
8174         br(Assembler::GE, end);
8175 
8176         bind(loop);
8177         pre2(Ri, Rlen);
8178 
8179         block_comment("  for (j = len*2-i-1; j; j--) {"); {
8180           lslw(Rj, Rlen, 1);
8181           subw(Rj, Rj, Ri);
8182           subw(Rj, Rj, 1);
8183           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
8184         } block_comment("  } // j");
8185 
8186         post2(Ri, Rlen);
8187         addw(Ri, Ri, 1);
8188         cmpw(Ri, Rlen, Assembler::LSL, 1);
8189         br(Assembler::LT, loop);
8190         bind(end);
8191       }
8192       block_comment("} // i");
8193 
8194       normalize(Rlen);
8195 
8196       mov(Ra, Pm_base);  // Save Pm_base in Ra
8197       restore_regs();  // Restore caller's Pm_base
8198 
8199       // Copy our result into caller's Pm_base
8200       reverse(Pm_base, Ra, Rlen, t0, t1);
8201 
8202       leave();
8203       bind(nothing);
8204       ret(lr);
8205 
8206       return entry;
8207     }
8208     // In C, approximately:
8209 
8210     // void
8211     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
8212     //                     julong Pn_base[], julong Pm_base[],
8213     //                     julong inv, int len) {
8214     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8215     //   julong *Pa, *Pb, *Pn, *Pm;
8216     //   julong Ra, Rb, Rn, Rm;
8217 
8218     //   int i;
8219 
8220     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8221 
8222     //   for (i = 0; i < len; i++) {
8223     //     int j;
8224 
8225     //     Pa = Pa_base;
8226     //     Pb = Pb_base + i;
8227     //     Pm = Pm_base;
8228     //     Pn = Pn_base + i;
8229 
8230     //     Ra = *Pa;
8231     //     Rb = *Pb;
8232     //     Rm = *Pm;
8233     //     Rn = *Pn;
8234 
8235     //     int iters = i;
8236     //     for (j = 0; iters--; j++) {
8237     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8238     //       MACC(Ra, Rb, t0, t1, t2);
8239     //       Ra = *++Pa;
8240     //       Rb = *--Pb;
8241     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8242     //       MACC(Rm, Rn, t0, t1, t2);
8243     //       Rm = *++Pm;
8244     //       Rn = *--Pn;
8245     //     }
8246 
8247     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
8248     //     MACC(Ra, Rb, t0, t1, t2);
8249     //     *Pm = Rm = t0 * inv;
8250     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8251     //     MACC(Rm, Rn, t0, t1, t2);
8252 
8253     //     assert(t0 == 0, "broken Montgomery multiply");
8254 
8255     //     t0 = t1; t1 = t2; t2 = 0;
8256     //   }
8257 
8258     //   for (i = len; i < 2*len; i++) {
8259     //     int j;
8260 
8261     //     Pa = Pa_base + i-len;
8262     //     Pb = Pb_base + len;
8263     //     Pm = Pm_base + i-len;
8264     //     Pn = Pn_base + len;
8265 
8266     //     Ra = *++Pa;
8267     //     Rb = *--Pb;
8268     //     Rm = *++Pm;
8269     //     Rn = *--Pn;
8270 
8271     //     int iters = len*2-i-1;
8272     //     for (j = i-len+1; iters--; j++) {
8273     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8274     //       MACC(Ra, Rb, t0, t1, t2);
8275     //       Ra = *++Pa;
8276     //       Rb = *--Pb;
8277     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8278     //       MACC(Rm, Rn, t0, t1, t2);
8279     //       Rm = *++Pm;
8280     //       Rn = *--Pn;
8281     //     }
8282 
8283     //     Pm_base[i-len] = t0;
8284     //     t0 = t1; t1 = t2; t2 = 0;
8285     //   }
8286 
8287     //   while (t0)
8288     //     t0 = sub(Pm_base, Pn_base, t0, len);
8289     // }
8290 
8291     /**
8292      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
8293      * multiplies than Montgomery multiplication so it should be up to
8294      * 25% faster.  However, its loop control is more complex and it
8295      * may actually run slower on some machines.
8296      *
8297      * Arguments:
8298      *
8299      * Inputs:
8300      *   c_rarg0   - int array elements a
8301      *   c_rarg1   - int array elements n (the modulus)
8302      *   c_rarg2   - int length
8303      *   c_rarg3   - int inv
8304      *   c_rarg4   - int array elements m (the result)
8305      *
8306      */
8307     address generate_square() {
8308       Label argh;
8309       bind(argh);
8310       stop("MontgomeryMultiply total_allocation must be <= 8192");
8311 
8312       align(CodeEntryAlignment);
8313       address entry = pc();
8314 
8315       enter();
8316 
8317       // Make room.
8318       cmpw(Rlen, 512);
8319       br(Assembler::HI, argh);
8320       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8321       andr(sp, Ra, -2 * wordSize);
8322 
8323       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8324 
8325       {
8326         // Copy input args, reversing as we go.  We use Ra as a
8327         // temporary variable.
8328         reverse(Ra, Pa_base, Rlen, t0, t1);
8329         reverse(Ra, Pn_base, Rlen, t0, t1);
8330       }
8331 
8332       // Push all call-saved registers and also Pm_base which we'll need
8333       // at the end.
8334       save_regs();
8335 
8336       mov(Pm_base, Ra);
8337 
8338       mov(t0, zr);
8339       mov(t1, zr);
8340       mov(t2, zr);
8341 
8342       block_comment("for (int i = 0; i < len; i++) {");
8343       mov(Ri, zr); {
8344         Label loop, end;
8345         bind(loop);
8346         cmp(Ri, Rlen);
8347         br(Assembler::GE, end);
8348 
8349         pre1(Ri);
8350 
8351         block_comment("for (j = (i+1)/2; j; j--) {"); {
8352           add(Rj, Ri, 1);
8353           lsr(Rj, Rj, 1);
8354           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8355         } block_comment("  } // j");
8356 
8357         last_squaring(Ri);
8358 
8359         block_comment("  for (j = i/2; j; j--) {"); {
8360           lsr(Rj, Ri, 1);
8361           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8362         } block_comment("  } // j");
8363 
8364         post1_squaring();
8365         add(Ri, Ri, 1);
8366         cmp(Ri, Rlen);
8367         br(Assembler::LT, loop);
8368 
8369         bind(end);
8370         block_comment("} // i");
8371       }
8372 
8373       block_comment("for (int i = len; i < 2*len; i++) {");
8374       mov(Ri, Rlen); {
8375         Label loop, end;
8376         bind(loop);
8377         cmp(Ri, Rlen, Assembler::LSL, 1);
8378         br(Assembler::GE, end);
8379 
8380         pre2(Ri, Rlen);
8381 
8382         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
8383           lsl(Rj, Rlen, 1);
8384           sub(Rj, Rj, Ri);
8385           sub(Rj, Rj, 1);
8386           lsr(Rj, Rj, 1);
8387           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8388         } block_comment("  } // j");
8389 
8390         last_squaring(Ri);
8391 
8392         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
8393           lsl(Rj, Rlen, 1);
8394           sub(Rj, Rj, Ri);
8395           lsr(Rj, Rj, 1);
8396           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8397         } block_comment("  } // j");
8398 
8399         post2(Ri, Rlen);
8400         add(Ri, Ri, 1);
8401         cmp(Ri, Rlen, Assembler::LSL, 1);
8402 
8403         br(Assembler::LT, loop);
8404         bind(end);
8405         block_comment("} // i");
8406       }
8407 
8408       normalize(Rlen);
8409 
8410       mov(Ra, Pm_base);  // Save Pm_base in Ra
8411       restore_regs();  // Restore caller's Pm_base
8412 
8413       // Copy our result into caller's Pm_base
8414       reverse(Pm_base, Ra, Rlen, t0, t1);
8415 
8416       leave();
8417       ret(lr);
8418 
8419       return entry;
8420     }
8421     // In C, approximately:
8422 
8423     // void
8424     // montgomery_square(julong Pa_base[], julong Pn_base[],
8425     //                   julong Pm_base[], julong inv, int len) {
8426     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8427     //   julong *Pa, *Pb, *Pn, *Pm;
8428     //   julong Ra, Rb, Rn, Rm;
8429 
8430     //   int i;
8431 
8432     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8433 
8434     //   for (i = 0; i < len; i++) {
8435     //     int j;
8436 
8437     //     Pa = Pa_base;
8438     //     Pb = Pa_base + i;
8439     //     Pm = Pm_base;
8440     //     Pn = Pn_base + i;
8441 
8442     //     Ra = *Pa;
8443     //     Rb = *Pb;
8444     //     Rm = *Pm;
8445     //     Rn = *Pn;
8446 
8447     //     int iters = (i+1)/2;
8448     //     for (j = 0; iters--; j++) {
8449     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8450     //       MACC2(Ra, Rb, t0, t1, t2);
8451     //       Ra = *++Pa;
8452     //       Rb = *--Pb;
8453     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8454     //       MACC(Rm, Rn, t0, t1, t2);
8455     //       Rm = *++Pm;
8456     //       Rn = *--Pn;
8457     //     }
8458     //     if ((i & 1) == 0) {
8459     //       assert(Ra == Pa_base[j], "must be");
8460     //       MACC(Ra, Ra, t0, t1, t2);
8461     //     }
8462     //     iters = i/2;
8463     //     assert(iters == i-j, "must be");
8464     //     for (; iters--; j++) {
8465     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8466     //       MACC(Rm, Rn, t0, t1, t2);
8467     //       Rm = *++Pm;
8468     //       Rn = *--Pn;
8469     //     }
8470 
8471     //     *Pm = Rm = t0 * inv;
8472     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8473     //     MACC(Rm, Rn, t0, t1, t2);
8474 
8475     //     assert(t0 == 0, "broken Montgomery multiply");
8476 
8477     //     t0 = t1; t1 = t2; t2 = 0;
8478     //   }
8479 
8480     //   for (i = len; i < 2*len; i++) {
8481     //     int start = i-len+1;
8482     //     int end = start + (len - start)/2;
8483     //     int j;
8484 
8485     //     Pa = Pa_base + i-len;
8486     //     Pb = Pa_base + len;
8487     //     Pm = Pm_base + i-len;
8488     //     Pn = Pn_base + len;
8489 
8490     //     Ra = *++Pa;
8491     //     Rb = *--Pb;
8492     //     Rm = *++Pm;
8493     //     Rn = *--Pn;
8494 
8495     //     int iters = (2*len-i-1)/2;
8496     //     assert(iters == end-start, "must be");
8497     //     for (j = start; iters--; j++) {
8498     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8499     //       MACC2(Ra, Rb, t0, t1, t2);
8500     //       Ra = *++Pa;
8501     //       Rb = *--Pb;
8502     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8503     //       MACC(Rm, Rn, t0, t1, t2);
8504     //       Rm = *++Pm;
8505     //       Rn = *--Pn;
8506     //     }
8507     //     if ((i & 1) == 0) {
8508     //       assert(Ra == Pa_base[j], "must be");
8509     //       MACC(Ra, Ra, t0, t1, t2);
8510     //     }
8511     //     iters = (2*len-i)/2;
8512     //     assert(iters == len-j, "must be");
8513     //     for (; iters--; j++) {
8514     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8515     //       MACC(Rm, Rn, t0, t1, t2);
8516     //       Rm = *++Pm;
8517     //       Rn = *--Pn;
8518     //     }
8519     //     Pm_base[i-len] = t0;
8520     //     t0 = t1; t1 = t2; t2 = 0;
8521     //   }
8522 
8523     //   while (t0)
8524     //     t0 = sub(Pm_base, Pn_base, t0, len);
8525     // }
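         // In the pseudocode above, MACC and MACC2 denote multiply-accumulate
         // into the triple-precision accumulator t0:t1:t2, adding the 128-bit
         // product once and twice respectively (cross terms of a square occur
         // twice).  A rough sketch of that semantics, purely illustrative and
         // assuming unsigned __int128 is available:
         //
         //   void MACC(julong a, julong b, julong &t0, julong &t1, julong &t2) {
         //     unsigned __int128 p = (unsigned __int128)a * b;
         //     unsigned __int128 s = (unsigned __int128)t0 + (julong)p;   // add low half
         //     t0 = (julong)s;
         //     s = (unsigned __int128)t1 + (julong)(p >> 64) + (julong)(s >> 64);  // high half + carry
         //     t1 = (julong)s;
         //     t2 += (julong)(s >> 64);                                   // propagate carry
         //   }
         //
         //   void MACC2(julong a, julong b, julong &t0, julong &t1, julong &t2) {
         //     MACC(a, b, t0, t1, t2);
         //     MACC(a, b, t0, t1, t2);
         //   }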
8526   };
8527 
8528   void generate_vector_math_stubs() {
8529     // Get native vector math stub routine addresses
8530     void* libsleef = nullptr;
8531     char ebuf[1024];
8532     char dll_name[JVM_MAXPATHLEN];
8533     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
8534       libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
8535     }
8536     if (libsleef == nullptr) {
8537       log_info(library)("Failed to load native vector math library, %s!", ebuf);
8538       return;
8539     }
8540     // Method naming convention
8541     //   All the methods are named as <OP><T><N>_<U><suffix>
8542     //   Where:
8543     //     <OP>     is the operation name, e.g. sin
8544     //     <T>      optionally indicates float/double
8545     //              "f/d" for vector float/double operations
8546     //     <N>      is the number of elements in the vector
8547     //              "2/4" for neon, and "x" for sve
8548     //     <U>      is the precision level
8549     //              "u10/u05" represents 1.0/0.5 ULP error bounds
8550     //               We use "u10" for all operations by default,
8551     //               but for functions that do not have u10 support we use "u05" instead
8552     //     <suffix> indicates neon/sve
8553     //              "sve/advsimd" for sve/neon implementations
8554     //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions
8555     //          cosd2_u10advsimd is the method for computing a 2-element vector double cos using NEON instructions
8556     //
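         // As a concrete illustration of the snprintf patterns used below
         // (the local names here are purely illustrative), the "sin" entry's
         // SVE float lookup is composed roughly as:
         //
         //   char name[64];
         //   snprintf(name, sizeof(name), "%sfx_%ssve", "sin", "u10");  // -> "sinfx_u10sve"
         //   address fn = (address)os::dll_lookup(libsleef, name);
         //
         // while the matching NEON double lookup resolves "sind2_u10advsimd".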
8557     log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
8558 
8559     // Math vector stubs implemented with SVE for scalable vector size.
8560     if (UseSVE > 0) {
8561       for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
8562         int vop = VectorSupport::VECTOR_OP_MATH_START + op;
8563         // Skip "tanh" because of a performance regression
8564         if (vop == VectorSupport::VECTOR_OP_TANH) {
8565           continue;
8566         }
8567 
8568         // The native library does not support u10 level of "hypot".
8569         const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
8570 
8571         snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
8572         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
8573 
8574         snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
8575         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
8576       }
8577     }
8578 
8579     // Math vector stubs implemented with NEON for 64/128 bits vector size.
8580     for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
8581       int vop = VectorSupport::VECTOR_OP_MATH_START + op;
8582       // Skip "tanh" because of a performance regression
8583       if (vop == VectorSupport::VECTOR_OP_TANH) {
8584         continue;
8585       }
8586 
8587       // The native library does not support u10 level of "hypot".
8588       const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
8589 
8590       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
8591       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
8592 
8593       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
8594       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
8595 
8596       snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
8597       StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
8598     }
8599   }
8600 
8601   // Initialization
8602   void generate_initial_stubs() {
8603     // Generates the initial stubs and initializes the entry points
8604 
8605     // Entry points that exist on all platforms. Note: This is code
8606     // that could be shared among different platforms - however the
8607     // benefit seems to be smaller than the disadvantage of having a
8608     // much more complicated generator structure. See also comment in
8609     // stubRoutines.hpp.
8610 
8611     StubRoutines::_forward_exception_entry = generate_forward_exception();
8612 
8613     StubRoutines::_call_stub_entry =
8614       generate_call_stub(StubRoutines::_call_stub_return_address);
8615 
8616     // is referenced by megamorphic calls
8617     StubRoutines::_catch_exception_entry = generate_catch_exception();
8618 
8619     // Initialize table for copy memory (arraycopy) check.
8620     if (UnsafeMemoryAccess::_table == nullptr) {
8621       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
8622     }
8623 
8624     if (UseCRC32Intrinsics) {
8625       // Set the table address before generating the stubs that use it
8626       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8627       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8628     }
8629 
8630     if (UseCRC32CIntrinsics) {
8631       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8632     }
8633 
8634     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8635       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8636     }
8637 
8638     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8639       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8640     }
8641 
8642     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
8643         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
8644       StubRoutines::_hf2f = generate_float16ToFloat();
8645       StubRoutines::_f2hf = generate_floatToFloat16();
8646     }
8647   }
8648 
8649   void generate_continuation_stubs() {
8650     // Continuation stubs:
8651     StubRoutines::_cont_thaw          = generate_cont_thaw();
8652     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8653     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8654     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
8655   }
8656 
8657   void generate_final_stubs() {
8658     // support for verify_oop (must happen after universe_init)
8659     if (VerifyOops) {
8660       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8661     }
8662 
8663     // arraycopy stubs used by compilers
8664     generate_arraycopy_stubs();
8665 
8666     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8667     if (bs_nm != nullptr) {
8668       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
8669     }
8670 
8671     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8672 
8673     if (UsePoly1305Intrinsics) {
8674       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
8675     }
8676 
8677 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8678 
8679     generate_atomic_entry_points();
8680 
8681 #endif // LINUX
8682 
8683 #ifdef COMPILER2
8684     if (UseSecondarySupersTable) {
8685       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
8686       if (! InlineSecondarySupersTest) {
8687         for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
8688           StubRoutines::_lookup_secondary_supers_table_stubs[slot]
8689             = generate_lookup_secondary_supers_table_stub(slot);
8690         }
8691       }
8692     }
8693 #endif
8694 
8695     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
8696     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
8697 
8698     StubRoutines::aarch64::set_completed(); // Indicate that the arraycopy and zero_blocks stubs have been generated
8699   }
8700 
8701   void generate_compiler_stubs() {
8702 #if COMPILER2_OR_JVMCI
8703 
8704     if (UseSVE == 0) {
8705       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
8706     }
8707 
8708     // array equals stub for large arrays.
8709     if (!UseSimpleArrayEquals) {
8710       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8711     }
8712 
8713     // arrays_hashcode stubs for large arrays.
8714     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
8715     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
8716     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
8717     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
8718     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
8719 
8720     // byte_array_inflate stub for large arrays.
8721     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8722 
8723     // countPositives stub for large arrays.
8724     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8725 
8726     generate_compare_long_strings();
8727 
8728     generate_string_indexof_stubs();
8729 
8730 #ifdef COMPILER2
8731     if (UseMultiplyToLenIntrinsic) {
8732       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8733     }
8734 
8735     if (UseSquareToLenIntrinsic) {
8736       StubRoutines::_squareToLen = generate_squareToLen();
8737     }
8738 
8739     if (UseMulAddIntrinsic) {
8740       StubRoutines::_mulAdd = generate_mulAdd();
8741     }
8742 
8743     if (UseSIMDForBigIntegerShiftIntrinsics) {
8744       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8745       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8746     }
8747 
8748     if (UseMontgomeryMultiplyIntrinsic) {
8749       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
8750       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8751       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8752     }
8753 
8754     if (UseMontgomerySquareIntrinsic) {
8755       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
8756       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8757       // We use generate_multiply() rather than generate_square()
8758       // because it's faster for the sizes of modulus we care about.
8759       StubRoutines::_montgomerySquare = g.generate_multiply();
8760     }
8761 
8762     generate_vector_math_stubs();
8763 
8764 #endif // COMPILER2
8765 
8766     if (UseChaCha20Intrinsics) {
8767       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
8768     }
8769 
8770     if (UseBASE64Intrinsics) {
8771       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8772       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8773     }
8774 
8775     // data cache line writeback
8776     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8777     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8778 
8779     if (UseAESIntrinsics) {
8780       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8781       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8782       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8783       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8784       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8785     }
8786     if (UseGHASHIntrinsics) {
8787       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8788       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8789     }
8790     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8791       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8792     }
8793 
8794     if (UseMD5Intrinsics) {
8795       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
8796       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
8797     }
8798     if (UseSHA1Intrinsics) {
8799       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
8800       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
8801     }
8802     if (UseSHA256Intrinsics) {
8803       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
8804       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
8805     }
8806     if (UseSHA512Intrinsics) {
8807       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
8808       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
8809     }
8810     if (UseSHA3Intrinsics) {
8811       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
8812       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
8813     }
8814 
8815     // generate Adler32 intrinsics code
8816     if (UseAdler32Intrinsics) {
8817       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8818     }
8819 
8820 #endif // COMPILER2_OR_JVMCI
8821   }
8822 
8823  public:
8824   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
8825     switch(kind) {
8826     case Initial_stubs:
8827       generate_initial_stubs();
8828       break;
8829     case Continuation_stubs:
8830       generate_continuation_stubs();
8831       break;
8832     case Compiler_stubs:
8833       generate_compiler_stubs();
8834       break;
8835     case Final_stubs:
8836       generate_final_stubs();
8837       break;
8838     default:
8839       fatal("unexpected stubs kind: %d", kind);
8840       break;
8841     }
8842   }
8843 }; // end class declaration
8844 
8845 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
8846   StubGenerator g(code, kind);
8847 }
8848 
8849 
8850 #if defined (LINUX)
8851 
8852 // Define pointers to atomic stubs and initialize them to point to the
8853 // code in atomic_aarch64.S.
8854 
8855 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8856   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8857     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8858   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8859     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
8860 
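     // As a rough illustration, the first instantiation below,
     // DEFAULT_ATOMIC_OP(fetch_add, 4, ), expands to approximately:
     //
     //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
     //     = aarch64_atomic_fetch_add_4_default_impl;
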
8861 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8862 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8863 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8864 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8865 DEFAULT_ATOMIC_OP(xchg, 4, )
8866 DEFAULT_ATOMIC_OP(xchg, 8, )
8867 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8868 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8869 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8870 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8871 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8872 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8873 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8874 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8875 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8876 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8877 
8878 #undef DEFAULT_ATOMIC_OP
8879 
8880 #endif // LINUX