1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "code/SCCache.hpp"
  32 #include "compiler/oopMap.hpp"
  33 #include "gc/shared/barrierSet.hpp"
  34 #include "gc/shared/barrierSetAssembler.hpp"
  35 #include "gc/shared/gc_globals.hpp"
  36 #include "gc/shared/tlab_globals.hpp"
  37 #include "interpreter/interpreter.hpp"
  38 #include "memory/universe.hpp"
  39 #include "nativeInst_aarch64.hpp"
  40 #include "oops/instanceOop.hpp"
  41 #include "oops/method.hpp"
  42 #include "oops/objArrayKlass.hpp"
  43 #include "oops/oop.inline.hpp"
  44 #include "prims/methodHandles.hpp"
  45 #include "prims/upcallLinker.hpp"
  46 #include "runtime/atomic.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/frame.inline.hpp"
  50 #include "runtime/handles.inline.hpp"
  51 #include "runtime/javaThread.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/stubCodeGenerator.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "utilities/align.hpp"
  56 #include "utilities/checkedCast.hpp"
  57 #include "utilities/globalDefinitions.hpp"
  58 #include "utilities/powerOfTwo.hpp"
  59 #ifdef COMPILER2
  60 #include "opto/runtime.hpp"
  61 #endif
  62 #if INCLUDE_ZGC
  63 #include "gc/z/zThreadLocalData.hpp"
  64 #endif
  65 
  66 // Declaration and definition of StubGenerator (no .hpp file).
  67 // For a more detailed description of the stub routine structure
  68 // see the comment in stubRoutines.hpp
  69 
  70 #undef __
  71 #define __ _masm->
  72 
  73 #ifdef PRODUCT
  74 #define BLOCK_COMMENT(str) /* nothing */
  75 #else
  76 #define BLOCK_COMMENT(str) __ block_comment(str)
  77 #endif
  78 
  79 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  80 
  81 // Stub Code definitions
  82 
  83 class StubGenerator: public StubCodeGenerator {
  84  private:
  85 
  86 #ifdef PRODUCT
  87 #define inc_counter_np(counter) ((void)0)
  88 #else
  89   void inc_counter_np_(uint& counter) {
  90     __ incrementw(ExternalAddress((address)&counter));
  91   }
  92 #define inc_counter_np(counter) \
  93   BLOCK_COMMENT("inc_counter " #counter); \
  94   inc_counter_np_(counter);
  95 #endif
  96 
  97   // Call stubs are used to call Java from C
  98   //
  99   // Arguments:
 100   //    c_rarg0:   call wrapper address                   address
 101   //    c_rarg1:   result                                 address
 102   //    c_rarg2:   result type                            BasicType
 103   //    c_rarg3:   method                                 Method*
 104   //    c_rarg4:   (interpreter) entry point              address
 105   //    c_rarg5:   parameters                             intptr_t*
 106   //    c_rarg6:   parameter size (in words)              int
 107   //    c_rarg7:   thread                                 Thread*
 108   //
 109   // The stub itself does not return a value: any Java result
 110   // is written through the result address
 111   //
 112   // we save r30 (lr) as the return PC at the base of the frame and
 113   // save r29 (fp) just below it, then install sp (r31) into fp so
 114   // that fp acts as the frame pointer.
 115   //
 116   // we save r0-r7, which accounts for all the c arguments.
 117   //
 118   // TODO: strictly do we need to save them all? they are treated as
 119   // volatile by C so could we omit saving the ones we are going to
 120   // place in global registers (thread? method?) or those we only use
 121   // during setup of the Java call?
 122   //
 123   // we don't need to save r8 which C uses as the indirect result
 124   // location register.
 125   //
 126   // we don't need to save r9-r15 which both C and Java treat as
 127   // volatile
 128   //
 129   // we don't need to save r16-r18 because Java does not use them
 130   //
 131   // we save r19-r28 which Java uses as scratch registers and C
 132   // expects to be callee-save
 133   //
 134   // we save the bottom 64 bits of each value stored in v8-v15; it is
 135   // the responsibility of the caller to preserve larger values.
 136   //
 137   // so the stub frame looks like this when we enter Java code
 138   //
 139   //     [ return_from_Java     ] <--- sp
 140   //     [ argument word n      ]
 141   //      ...
 142   // -29 [ argument word 1      ]
 143   // -28 [ saved Floating-point Control Register ] <--- sp_after_call
 144   // -26 [ saved v15            ]
 145   // -25 [ saved v14            ]
 146   // -24 [ saved v13            ]
 147   // -23 [ saved v12            ]
 148   // -22 [ saved v11            ]
 149   // -21 [ saved v10            ]
 150   // -20 [ saved v9             ]
 151   // -19 [ saved v8             ]
 152   // -18 [ saved r28            ]
 153   // -17 [ saved r27            ]
 154   // -16 [ saved r26            ]
 155   // -15 [ saved r25            ]
 156   // -14 [ saved r24            ]
 157   // -13 [ saved r23            ]
 158   // -12 [ saved r22            ]
 159   // -11 [ saved r21            ]
 160   // -10 [ saved r20            ]
 161   //  -9 [ saved r19            ]
 162   //  -8 [ call wrapper    (r0) ]
 163   //  -7 [ result          (r1) ]
 164   //  -6 [ result type     (r2) ]
 165   //  -5 [ method          (r3) ]
 166   //  -4 [ entry point     (r4) ]
 167   //  -3 [ parameters      (r5) ]
 168   //  -2 [ parameter size  (r6) ]
 169   //  -1 [ thread (r7)          ]
 170   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 171   //   1 [ saved lr       (r30) ]
 172 
 173   // Call stub stack layout word offsets from fp
 174   enum call_stub_layout {
 175     sp_after_call_off  = -28,
 176 
 177     fpcr_off           = sp_after_call_off,
 178     d15_off            = -26,
 179     d13_off            = -24,
 180     d11_off            = -22,
 181     d9_off             = -20,
 182 
 183     r28_off            = -18,
 184     r26_off            = -16,
 185     r24_off            = -14,
 186     r22_off            = -12,
 187     r20_off            = -10,
 188     call_wrapper_off   =  -8,
 189     result_off         =  -7,
 190     result_type_off    =  -6,
 191     method_off         =  -5,
 192     entry_point_off    =  -4,
 193     parameter_size_off =  -2,
 194     thread_off         =  -1,
 195     fp_f               =   0,
 196     retaddr_off        =   1,
 197   };
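       // n.b. the offsets above are in words relative to rfp. As a worked
       // example (wordSize == 8 on aarch64): thread_off == -1 places the
       // saved thread at rfp - 8, and sp_after_call_off == -28 places the
       // bottom of the register save area at rfp - 224, which is where sp
       // is positioned once the save area has been laid out.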
 198 
 199   address generate_call_stub(address& return_address) {
 200     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 201            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 202            "adjust this code");
 203 
 204     StubCodeMark mark(this, "StubRoutines", "call_stub");
 205     address start = __ pc();
 206 
 207     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
 208 
 209     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
 210     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 211     const Address result        (rfp, result_off         * wordSize);
 212     const Address result_type   (rfp, result_type_off    * wordSize);
 213     const Address method        (rfp, method_off         * wordSize);
 214     const Address entry_point   (rfp, entry_point_off    * wordSize);
 215     const Address parameter_size(rfp, parameter_size_off * wordSize);
 216 
 217     const Address thread        (rfp, thread_off         * wordSize);
 218 
 219     const Address d15_save      (rfp, d15_off * wordSize);
 220     const Address d13_save      (rfp, d13_off * wordSize);
 221     const Address d11_save      (rfp, d11_off * wordSize);
 222     const Address d9_save       (rfp, d9_off * wordSize);
 223 
 224     const Address r28_save      (rfp, r28_off * wordSize);
 225     const Address r26_save      (rfp, r26_off * wordSize);
 226     const Address r24_save      (rfp, r24_off * wordSize);
 227     const Address r22_save      (rfp, r22_off * wordSize);
 228     const Address r20_save      (rfp, r20_off * wordSize);
 229 
 230     // stub code
 231 
 232     address aarch64_entry = __ pc();
 233 
 234     // set up frame and move sp to end of save area
 235     __ enter();
 236     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 237 
 238     // save register parameters and Java scratch/global registers
 239     // n.b. we save thread even though it gets installed in
 240     // rthread because we want to sanity check rthread later
 241     __ str(c_rarg7,  thread);
 242     __ strw(c_rarg6, parameter_size);
 243     __ stp(c_rarg4, c_rarg5,  entry_point);
 244     __ stp(c_rarg2, c_rarg3,  result_type);
 245     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 246 
 247     __ stp(r20, r19,   r20_save);
 248     __ stp(r22, r21,   r22_save);
 249     __ stp(r24, r23,   r24_save);
 250     __ stp(r26, r25,   r26_save);
 251     __ stp(r28, r27,   r28_save);
 252 
 253     __ stpd(v9,  v8,   d9_save);
 254     __ stpd(v11, v10,  d11_save);
 255     __ stpd(v13, v12,  d13_save);
 256     __ stpd(v15, v14,  d15_save);
 257 
 258     __ get_fpcr(rscratch1);
 259     __ str(rscratch1, fpcr_save);
 260     // Set FPCR to the state we need. We do want Round to Nearest. We
 261     // don't want non-IEEE rounding modes or floating-point traps.
 262     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
 263     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
 264     __ set_fpcr(rscratch1);
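         // n.b. bfi(reg, zr, lsb, width) inserts width zero bits starting at
         // bit lsb, so the two bfi instructions above clear FPCR bits 22-25
         // (RMode, FZ, DN) and bits 8-12 (the IOE/DZE/OFE/UFE/IXE
         // trap-enable bits) respectively.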
 265 
 266     // install Java thread in global register now we have saved
 267     // whatever value it held
 268     __ mov(rthread, c_rarg7);
 269     // And method
 270     __ mov(rmethod, c_rarg3);
 271 
 272     // set up the heapbase register
 273     __ reinit_heapbase();
 274 
 275 #ifdef ASSERT
 276     // make sure we have no pending exceptions
 277     {
 278       Label L;
 279       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 280       __ cmp(rscratch1, (u1)NULL_WORD);
 281       __ br(Assembler::EQ, L);
 282       __ stop("StubRoutines::call_stub: entered with pending exception");
 283       __ BIND(L);
 284     }
 285 #endif
 286     // pass parameters if any
 287     __ mov(esp, sp);
 288     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 289     __ andr(sp, rscratch1, -2 * wordSize);
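         // n.b. masking with -2 * wordSize (i.e. -16) rounds sp down to the
         // 16-byte alignment AArch64 requires for the stack pointer.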
 290 
 291     BLOCK_COMMENT("pass parameters if any");
 292     Label parameters_done;
 293     // parameter count is still in c_rarg6
 294     // and parameter pointer identifying param 1 is in c_rarg5
 295     __ cbzw(c_rarg6, parameters_done);
 296 
 297     address loop = __ pc();
 298     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 299     __ subsw(c_rarg6, c_rarg6, 1);
 300     __ push(rscratch1);
 301     __ br(Assembler::GT, loop);
 302 
 303     __ BIND(parameters_done);
 304 
 305     // call Java entry -- passing Method* and current sp
 306     //      rmethod: Method*
 307     //      r19_sender_sp: sender sp
 308     BLOCK_COMMENT("call Java function");
 309     __ mov(r19_sender_sp, sp);
 310     __ blr(c_rarg4);
 311 
 312     // we do this here because the notify will already have been done
 313     // if we get to the next instruction via an exception
 314     //
 315     // n.b. adding this instruction here affects the calculation of
 316     // whether or not a routine returns to the call stub (used when
 317     // doing stack walks) since the normal test is to check the return
 318     // pc against the address saved below. so we may need to allow for
 319     // this extra instruction in the check.
 320 
 321     // save current address for use by exception handling code
 322 
 323     return_address = __ pc();
 324 
 325     // store result depending on type (everything that is not
 326     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 327     // n.b. this assumes Java returns an integral result in r0
 328     // and a floating result in j_farg0
 329     __ ldr(j_rarg2, result);
 330     Label is_long, is_float, is_double, exit;
 331     __ ldr(j_rarg1, result_type);
 332     __ cmp(j_rarg1, (u1)T_OBJECT);
 333     __ br(Assembler::EQ, is_long);
 334     __ cmp(j_rarg1, (u1)T_LONG);
 335     __ br(Assembler::EQ, is_long);
 336     __ cmp(j_rarg1, (u1)T_FLOAT);
 337     __ br(Assembler::EQ, is_float);
 338     __ cmp(j_rarg1, (u1)T_DOUBLE);
 339     __ br(Assembler::EQ, is_double);
 340 
 341     // handle T_INT case
 342     __ strw(r0, Address(j_rarg2));
 343 
 344     __ BIND(exit);
 345 
 346     // pop parameters
 347     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 348 
 349 #ifdef ASSERT
 350     // verify that threads correspond
 351     {
 352       Label L, S;
 353       __ ldr(rscratch1, thread);
 354       __ cmp(rthread, rscratch1);
 355       __ br(Assembler::NE, S);
 356       __ get_thread(rscratch1);
 357       __ cmp(rthread, rscratch1);
 358       __ br(Assembler::EQ, L);
 359       __ BIND(S);
 360       __ stop("StubRoutines::call_stub: threads must correspond");
 361       __ BIND(L);
 362     }
 363 #endif
 364 
 365     __ pop_cont_fastpath(rthread);
 366 
 367     // restore callee-save registers
 368     __ ldpd(v15, v14,  d15_save);
 369     __ ldpd(v13, v12,  d13_save);
 370     __ ldpd(v11, v10,  d11_save);
 371     __ ldpd(v9,  v8,   d9_save);
 372 
 373     __ ldp(r28, r27,   r28_save);
 374     __ ldp(r26, r25,   r26_save);
 375     __ ldp(r24, r23,   r24_save);
 376     __ ldp(r22, r21,   r22_save);
 377     __ ldp(r20, r19,   r20_save);
 378 
 379     // restore fpcr
 380     __ ldr(rscratch1,  fpcr_save);
 381     __ set_fpcr(rscratch1);
 382 
 383     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 384     __ ldrw(c_rarg2, result_type);
 385     __ ldr(c_rarg3,  method);
 386     __ ldp(c_rarg4, c_rarg5,  entry_point);
 387     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 388 
 389     // leave frame and return to caller
 390     __ leave();
 391     __ ret(lr);
 392 
 393     // handle return types different from T_INT
 394 
 395     __ BIND(is_long);
 396     __ str(r0, Address(j_rarg2, 0));
 397     __ br(Assembler::AL, exit);
 398 
 399     __ BIND(is_float);
 400     __ strs(j_farg0, Address(j_rarg2, 0));
 401     __ br(Assembler::AL, exit);
 402 
 403     __ BIND(is_double);
 404     __ strd(j_farg0, Address(j_rarg2, 0));
 405     __ br(Assembler::AL, exit);
 406 
 407     return start;
 408   }
 409 
 410   // Return point for a Java call if there's an exception thrown in
 411   // Java code.  The exception is caught and transformed into a
 412   // pending exception stored in JavaThread that can be tested from
 413   // within the VM.
 414   //
 415   // Note: Usually the parameters are removed by the callee. In case
 416   // of an exception crossing an activation frame boundary, that is
 417   // not the case if the callee is compiled code => need to set up
 418   // the stack pointer.
 419   //
 420   // r0: exception oop
 421 
 422   address generate_catch_exception() {
 423     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 424     address start = __ pc();
 425 
 426     // same as in generate_call_stub():
 427     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 428     const Address thread        (rfp, thread_off         * wordSize);
 429 
 430 #ifdef ASSERT
 431     // verify that threads correspond
 432     {
 433       Label L, S;
 434       __ ldr(rscratch1, thread);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::NE, S);
 437       __ get_thread(rscratch1);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::EQ, L);
 440       __ bind(S);
 441       __ stop("StubRoutines::catch_exception: threads must correspond");
 442       __ bind(L);
 443     }
 444 #endif
 445 
 446     // set pending exception
 447     __ verify_oop(r0);
 448 
 449     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 450     __ mov(rscratch1, (address)__FILE__);
 451     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 452     __ movw(rscratch1, (int)__LINE__);
 453     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 454 
 455     // complete return to VM
 456     assert(StubRoutines::_call_stub_return_address != nullptr,
 457            "_call_stub_return_address must have been generated before");
 458     __ b(StubRoutines::_call_stub_return_address);
 459 
 460     return start;
 461   }
 462 
 463   // Continuation point for runtime calls returning with a pending
 464   // exception.  The pending exception check happened in the runtime
 465   // or native call stub.  The pending exception in Thread is
 466   // converted into a Java-level exception.
 467   //
 468   // Contract with Java-level exception handlers:
 469   // r0: exception
 470   // r3: throwing pc
 471   //
 472   // NOTE: At entry of this stub, exception-pc must be in LR !!
 473 
 474   // NOTE: this is always used as a jump target within generated code
 475   // so it just needs to be generated code with no prolog
 476 
 477   address generate_forward_exception() {
 478     StubCodeMark mark(this, "StubRoutines", "forward exception");
 479     address start = __ pc();
 480 
 481     // Upon entry, LR points to the return address returning into
 482     // Java (interpreted or compiled) code; i.e., the return address
 483     // becomes the throwing pc.
 484     //
 485     // Arguments pushed before the runtime call are still on the stack
 486     // but the exception handler will reset the stack pointer ->
 487     // ignore them.  A potential result in registers can be ignored as
 488     // well.
 489 
 490 #ifdef ASSERT
 491     // make sure this code is only executed if there is a pending exception
 492     {
 493       Label L;
 494       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 495       __ cbnz(rscratch1, L);
 496       __ stop("StubRoutines::forward exception: no pending exception (1)");
 497       __ bind(L);
 498     }
 499 #endif
 500 
 501     // compute exception handler into r19
 502 
 503     // call the VM to find the handler address associated with the
 504     // caller address. pass thread in r0 and caller pc (ret address)
 505     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 506     // the stack.
 507     __ mov(c_rarg1, lr);
 508     // lr will be trashed by the VM call so we move it to R19
 509     // (callee-saved) because we also need to pass it to the handler
 510     // returned by this call.
 511     __ mov(r19, lr);
 512     BLOCK_COMMENT("call exception_handler_for_return_address");
 513     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 514                          SharedRuntime::exception_handler_for_return_address),
 515                     rthread, c_rarg1);
 516     // Reinitialize the ptrue predicate register, in case the external runtime
 517     // call clobbers ptrue reg, as we may return to SVE compiled code.
 518     __ reinitialize_ptrue();
 519 
 520     // we should not really care that lr is no longer the callee
 521     // address. we saved the value the handler needs in r19 so we can
 522     // just copy it to r3. however, the C2 handler will push its own
 523     // frame and then calls into the VM and the VM code asserts that
 524     // the PC for the frame above the handler belongs to a compiled
 525     // Java method. So, we restore lr here to satisfy that assert.
 526     __ mov(lr, r19);
 527     // setup r0 & r3 & clear pending exception
 528     __ mov(r3, r19);
 529     __ mov(r19, r0);
 530     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 531     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 532 
 533 #ifdef ASSERT
 534     // make sure exception is set
 535     {
 536       Label L;
 537       __ cbnz(r0, L);
 538       __ stop("StubRoutines::forward exception: no pending exception (2)");
 539       __ bind(L);
 540     }
 541 #endif
 542 
 543     // continue at exception handler
 544     // r0: exception
 545     // r3: throwing pc
 546     // r19: exception handler
 547     __ verify_oop(r0);
 548     __ br(r19);
 549 
 550     return start;
 551   }
 552 
 553   // Non-destructive plausibility checks for oops
 554   //
 555   // Arguments:
 556   //    r0: oop to verify
 557   //    rscratch1: error message
 558   //
 559   // Stack after saving c_rarg3:
 560   //    [tos + 0]: saved c_rarg3
 561   //    [tos + 1]: saved c_rarg2
 562   //    [tos + 2]: saved lr
 563   //    [tos + 3]: saved rscratch2
 564   //    [tos + 4]: saved r0
 565   //    [tos + 5]: saved rscratch1
 566   address generate_verify_oop() {
 567 
 568     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 569     address start = __ pc();
 570 
 571     Label exit, error;
 572 
 573     // save c_rarg2 and c_rarg3
 574     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 575 
 576     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 577     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 578     __ ldr(c_rarg3, Address(c_rarg2));
 579     __ add(c_rarg3, c_rarg3, 1);
 580     __ str(c_rarg3, Address(c_rarg2));
 581 
 582     // object is in r0
 583     // make sure object is 'reasonable'
 584     __ cbz(r0, exit); // if obj is null it is OK
 585 
 586     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 587     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 588 
 589     // return if everything seems ok
 590     __ bind(exit);
 591 
 592     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 593     __ ret(lr);
 594 
 595     // handle errors
 596     __ bind(error);
 597     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 598 
 599     __ push(RegSet::range(r0, r29), sp);
 600     // debug(char* msg, int64_t pc, int64_t regs[])
 601     __ mov(c_rarg0, rscratch1);      // pass address of error message
 602     __ mov(c_rarg1, lr);             // pass return address
 603     __ mov(c_rarg2, sp);             // pass address of regs on stack
 604 #ifndef PRODUCT
 605     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 606 #endif
 607     BLOCK_COMMENT("call MacroAssembler::debug");
 608     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 609     __ blr(rscratch1);
 610     __ hlt(0);
 611 
 612     return start;
 613   }
 614 
 615   // Generate indices for iota vector.
 616   address generate_iota_indices(const char *stub_name) {
 617     __ align(CodeEntryAlignment);
 618     StubCodeMark mark(this, "StubRoutines", stub_name);
 619     address start = __ pc();
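         // Each pair of emit_data64 values below forms one 16-byte constant
         // holding ascending lane indices for the given element size. The
         // data is emitted little-endian, so e.g. 0x0706050403020100 lays
         // down the bytes 00 01 02 ... 07 in memory.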
 620     // B
 621     __ emit_data64(0x0706050403020100, relocInfo::none);
 622     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 623     // H
 624     __ emit_data64(0x0003000200010000, relocInfo::none);
 625     __ emit_data64(0x0007000600050004, relocInfo::none);
 626     // S
 627     __ emit_data64(0x0000000100000000, relocInfo::none);
 628     __ emit_data64(0x0000000300000002, relocInfo::none);
 629     // D
 630     __ emit_data64(0x0000000000000000, relocInfo::none);
 631     __ emit_data64(0x0000000000000001, relocInfo::none);
 632     // S - FP
 633     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 634     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 635     // D - FP
 636     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 637     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 638     return start;
 639   }
 640 
 641   // The inner part of zero_words().  This is the bulk operation,
 642   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 643   // caller is responsible for zeroing the last few words.
 644   //
 645   // Inputs:
 646   // r10: the HeapWord-aligned base address of an array to zero.
 647   // r11: the count in HeapWords, r11 > 0.
 648   //
 649   // Returns r10 and r11, adjusted for the caller to clear.
 650   // r10: the base address of the tail of words left to clear.
 651   // r11: the number of words in the tail.
 652   //      r11 < MacroAssembler::zero_words_block_size.
 653 
 654   address generate_zero_blocks() {
 655     Label done;
 656     Label base_aligned;
 657 
 658     Register base = r10, cnt = r11;
 659 
 660     __ align(CodeEntryAlignment);
 661     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 662     address start = __ pc();
 663 
 664     if (UseBlockZeroing) {
 665       int zva_length = VM_Version::zva_length();
 666 
 667       // Ensure ZVA length can be divided by 16. This is required by
 668       // the subsequent operations.
 669       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 670 
 671       __ tbz(base, 3, base_aligned);
 672       __ str(zr, Address(__ post(base, 8)));
 673       __ sub(cnt, cnt, 1);
 674       __ bind(base_aligned);
 675 
 676       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 677       // alignment.
 678       Label small;
 679       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 680       __ subs(rscratch1, cnt, low_limit >> 3);
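           // n.b. low_limit is in bytes while cnt is in words, hence the
           // >> 3 (LogBytesPerWord) in the comparison above.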
 681       __ br(Assembler::LT, small);
 682       __ zero_dcache_blocks(base, cnt);
 683       __ bind(small);
 684     }
 685 
 686     {
 687       // Number of stp instructions we'll unroll
 688       const int unroll =
 689         MacroAssembler::zero_words_block_size / 2;
 690       // Clear the remaining blocks.
 691       Label loop;
 692       __ subs(cnt, cnt, unroll * 2);
 693       __ br(Assembler::LT, done);
 694       __ bind(loop);
 695       for (int i = 0; i < unroll; i++)
 696         __ stp(zr, zr, __ post(base, 16));
 697       __ subs(cnt, cnt, unroll * 2);
 698       __ br(Assembler::GE, loop);
 699       __ bind(done);
 700       __ add(cnt, cnt, unroll * 2);
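           // cnt went negative in the termination test above; adding back
           // unroll * 2 leaves the 0 <= cnt < zero_words_block_size words
           // that the caller is expected to clear.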
 701     }
 702 
 703     __ ret(lr);
 704 
 705     return start;
 706   }
 707 
 708 
 709   typedef enum {
 710     copy_forwards = 1,
 711     copy_backwards = -1
 712   } copy_direction;
 713 
 714   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 715   // for arraycopy stubs.
 716   class ArrayCopyBarrierSetHelper : StackObj {
 717     BarrierSetAssembler* _bs_asm;
 718     MacroAssembler* _masm;
 719     DecoratorSet _decorators;
 720     BasicType _type;
 721     Register _gct1;
 722     Register _gct2;
 723     Register _gct3;
 724     FloatRegister _gcvt1;
 725     FloatRegister _gcvt2;
 726     FloatRegister _gcvt3;
 727 
 728   public:
 729     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 730                               DecoratorSet decorators,
 731                               BasicType type,
 732                               Register gct1,
 733                               Register gct2,
 734                               Register gct3,
 735                               FloatRegister gcvt1,
 736                               FloatRegister gcvt2,
 737                               FloatRegister gcvt3)
 738       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 739         _masm(masm),
 740         _decorators(decorators),
 741         _type(type),
 742         _gct1(gct1),
 743         _gct2(gct2),
 744         _gct3(gct3),
 745         _gcvt1(gcvt1),
 746         _gcvt2(gcvt2),
 747         _gcvt3(gcvt3) {
 748     }
 749 
 750     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 751       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 752                             dst1, dst2, src,
 753                             _gct1, _gct2, _gcvt1);
 754     }
 755 
 756     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 757       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 758                              dst, src1, src2,
 759                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 760     }
 761 
 762     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 763       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 764                             dst1, dst2, src,
 765                             _gct1);
 766     }
 767 
 768     void copy_store_at_16(Address dst, Register src1, Register src2) {
 769       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 770                              dst, src1, src2,
 771                              _gct1, _gct2, _gct3);
 772     }
 773 
 774     void copy_load_at_8(Register dst, Address src) {
 775       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 776                             dst, noreg, src,
 777                             _gct1);
 778     }
 779 
 780     void copy_store_at_8(Address dst, Register src) {
 781       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 782                              dst, src, noreg,
 783                              _gct1, _gct2, _gct3);
 784     }
 785   };
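       // n.b. gct1-gct3 and gcvt1-gcvt3 above are general-purpose and vector
       // temporaries handed to the BarrierSetAssembler; collectors whose
       // barriers need scratch registers while copying oops use them, the
       // default implementation simply ignores them.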
 786 
 787   // Bulk copy of blocks of 8 words.
 788   //
 789   // count is a count of words.
 790   //
 791   // Precondition: count >= 8
 792   //
 793   // Postconditions:
 794   //
 795   // The least significant bit of count contains the remaining count
 796   // of words to copy.  The rest of count is trash.
 797   //
 798   // s and d are adjusted to point to the remaining words to copy
 799   //
 800   void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
 801                            copy_direction direction) {
 802     int unit = wordSize * direction;
 803     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 804 
 805     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 806       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 807     const Register stride = r14;
 808     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 809     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 810     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 811 
 812     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 813     assert_different_registers(s, d, count, rscratch1, rscratch2);
 814 
 815     Label again, drain;
 816     const char *stub_name;
 817     if (direction == copy_forwards)
 818       stub_name = "forward_copy_longs";
 819     else
 820       stub_name = "backward_copy_longs";
 821 
 822     __ align(CodeEntryAlignment);
 823 
 824     StubCodeMark mark(this, "StubRoutines", stub_name);
 825 
 826     __ bind(start);
 827 
 828     Label unaligned_copy_long;
 829     if (AvoidUnalignedAccesses) {
 830       __ tbnz(d, 3, unaligned_copy_long);
 831     }
 832 
 833     if (direction == copy_forwards) {
 834       __ sub(s, s, bias);
 835       __ sub(d, d, bias);
 836     }
 837 
 838 #ifdef ASSERT
 839     // Make sure we are never given < 8 words
 840     {
 841       Label L;
 842       __ cmp(count, (u1)8);
 843       __ br(Assembler::GE, L);
 844       __ stop("generate_copy_longs called with < 8 words");
 845       __ bind(L);
 846     }
 847 #endif
 848 
 849     // Fill 8 registers
 850     if (UseSIMDForMemoryOps) {
 851       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 852       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 853     } else {
 854       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 855       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 856       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 857       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 858     }
 859 
 860     __ subs(count, count, 16);
 861     __ br(Assembler::LO, drain);
 862 
 863     int prefetch = PrefetchCopyIntervalInBytes;
 864     bool use_stride = false;
 865     if (direction == copy_backwards) {
 866        use_stride = prefetch > 256;
 867        prefetch = -prefetch;
 868        if (use_stride) __ mov(stride, prefetch);
 869     }
 870 
 871     __ bind(again);
 872 
 873     if (PrefetchCopyIntervalInBytes > 0)
 874       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 875 
 876     if (UseSIMDForMemoryOps) {
 877       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 878       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 879       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 880       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 881     } else {
 882       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 883       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 884       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 885       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 886       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 887       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 888       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 889       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 890     }
 891 
 892     __ subs(count, count, 8);
 893     __ br(Assembler::HS, again);
 894 
 895     // Drain
 896     __ bind(drain);
 897     if (UseSIMDForMemoryOps) {
 898       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 899       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 900     } else {
 901       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 902       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 903       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 904       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 905     }
 906 
 907     {
 908       Label L1, L2;
 909       __ tbz(count, exact_log2(4), L1);
 910       if (UseSIMDForMemoryOps) {
 911         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 912         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 913       } else {
 914         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 915         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 916         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 917         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 918       }
 919       __ bind(L1);
 920 
 921       if (direction == copy_forwards) {
 922         __ add(s, s, bias);
 923         __ add(d, d, bias);
 924       }
 925 
 926       __ tbz(count, 1, L2);
 927       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 928       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 929       __ bind(L2);
 930     }
 931 
 932     __ ret(lr);
 933 
 934     if (AvoidUnalignedAccesses) {
 935       Label drain, again;
 936       // Register order for storing. Order is different for backward copy.
 937 
 938       __ bind(unaligned_copy_long);
 939 
 940       // source address is even-word (16 byte) aligned, target is odd-word aligned
 941       //
 942       // when forward copying word pairs we read long pairs at offsets
 943       // {0, 2, 4, 6} (in long words). when backwards copying we read
 944       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 945       // address by -2 in the forwards case so we can compute the
 946       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 947       // or -1.
 948       //
 949       // when forward copying we need to store 1 word, 3 pairs and
 950       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 951       // zero offset we adjust the destination by -1 which means we
 952       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 953       //
 954       // When backwards copying we need to store 1 word, 3 pairs and
 955       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 956       // offsets {1, 3, 5, 7, 8} * unit.
 957 
 958       if (direction == copy_forwards) {
 959         __ sub(s, s, 16);
 960         __ sub(d, d, 8);
 961       }
 962 
 963       // Fill 8 registers
 964       //
 965       // for forwards copy s was offset by -16 from the original input
 966       // value of s so the register contents are at these offsets
 967       // relative to the 64 byte block addressed by that original input
 968       // and so on for each successive 64 byte block when s is updated
 969       //
 970       // t0 at offset 0,  t1 at offset 8
 971       // t2 at offset 16, t3 at offset 24
 972       // t4 at offset 32, t5 at offset 40
 973       // t6 at offset 48, t7 at offset 56
 974 
 975       // for backwards copy s was not offset so the register contents
 976       // are at these offsets into the preceding 64 byte block
 977       // relative to that original input and so on for each successive
 978       // preceding 64 byte block when s is updated. this explains the
 979       // slightly counter-intuitive looking pattern of register usage
 980       // in the stp instructions for backwards copy.
 981       //
 982       // t0 at offset -16, t1 at offset -8
 983       // t2 at offset -32, t3 at offset -24
 984       // t4 at offset -48, t5 at offset -40
 985       // t6 at offset -64, t7 at offset -56
 986 
 987       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 988       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 989       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 990       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 991 
 992       __ subs(count, count, 16);
 993       __ br(Assembler::LO, drain);
 994 
 995       int prefetch = PrefetchCopyIntervalInBytes;
 996       bool use_stride = false;
 997       if (direction == copy_backwards) {
 998          use_stride = prefetch > 256;
 999          prefetch = -prefetch;
1000          if (use_stride) __ mov(stride, prefetch);
1001       }
1002 
1003       __ bind(again);
1004 
1005       if (PrefetchCopyIntervalInBytes > 0)
1006         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1007 
1008       if (direction == copy_forwards) {
1009        // allowing for the offset of -8 the store instructions place
1010        // registers into the target 64 byte block at the following
1011        // offsets
1012        //
1013        // t0 at offset 0
1014        // t1 at offset 8,  t2 at offset 16
1015        // t3 at offset 24, t4 at offset 32
1016        // t5 at offset 40, t6 at offset 48
1017        // t7 at offset 56
1018 
1019         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1020         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1021         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1022         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1023         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1024         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1025         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1026         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1027         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1028       } else {
1029        // d was not offset when we started so the registers are
1030        // written into the 64 byte block preceding d with the following
1031        // offsets
1032        //
1033        // t1 at offset -8
1034        // t3 at offset -24, t0 at offset -16
1035        // t5 at offset -40, t2 at offset -32
1036        // t7 at offset -56, t4 at offset -48
1037        //                   t6 at offset -64
1038        //
1039        // note that this matches the offsets previously noted for the
1040        // loads
1041 
1042         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1043         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1044         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1045         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1046         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1047         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1048         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1049         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1050         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1051       }
1052 
1053       __ subs(count, count, 8);
1054       __ br(Assembler::HS, again);
1055 
1056       // Drain
1057       //
1058       // this uses the same pattern of offsets and register arguments
1059       // as above
1060       __ bind(drain);
1061       if (direction == copy_forwards) {
1062         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1063         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1064         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1065         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1066         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1067       } else {
1068         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1069         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1070         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1071         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1072         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1073       }
1074       // now we need to copy any remaining part block which may
1075       // include a 4 word subblock and/or a 2 word subblock.
1076       // bits 2 and 1 in the count are the tell-tale for whether we
1077       // have each such subblock
1078       {
1079         Label L1, L2;
1080         __ tbz(count, exact_log2(4), L1);
1081        // this is the same as above but copying only 4 longs hence
1082        // with only one intervening stp between the str instructions
1083        // but note that the offsets and registers still follow the
1084        // same pattern
1085         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1086         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1087         if (direction == copy_forwards) {
1088           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1089           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1090           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1091         } else {
1092           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1093           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1094           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1095         }
1096         __ bind(L1);
1097 
1098         __ tbz(count, 1, L2);
1099        // this is the same as above but copying only 2 longs hence
1100        // there is no intervening stp between the str instructions
1101        // but note that the offset and register patterns are still
1102        // the same
1103         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1104         if (direction == copy_forwards) {
1105           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1106           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1107         } else {
1108           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1109           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1110         }
1111         __ bind(L2);
1112 
1113        // for forwards copy we need to re-adjust the offsets we
1114        // applied so that s and d follow the last words written
1115 
1116        if (direction == copy_forwards) {
1117          __ add(s, s, 16);
1118          __ add(d, d, 8);
1119        }
1120 
1121       }
1122 
1123       __ ret(lr);
1124       }
1125   }
1126 
1127   // Small copy: less than 16 bytes.
1128   //
1129   // NB: Ignores all of the bits of count which represent more than 15
1130   // bytes, so a caller doesn't have to mask them.
1131 
1132   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1133     bool is_backwards = step < 0;
1134     size_t granularity = uabs(step);
1135     int direction = is_backwards ? -1 : 1;
1136 
1137     Label Lword, Lint, Lshort, Lbyte;
1138 
1139     assert(granularity
1140            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1141 
1142     const Register t0 = r3;
1143     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1144     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1145 
1146     // ??? I don't know if this bit-test-and-branch is the right thing
1147     // to do.  It does a lot of jumping, resulting in several
1148     // mispredicted branches.  It might make more sense to do this
1149     // with something like Duff's device with a single computed branch.
1150 
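         // Worked example (hypothetical values): for a byte copy
         // (granularity == 1) with count == 13 == 0b1101, bit 3 copies 8
         // bytes, bit 2 copies 4 and bit 0 copies the final byte; bit 1 is
         // clear so the 2-byte step is skipped.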
1151     __ tbz(count, 3 - exact_log2(granularity), Lword);
1152     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1153     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1154     __ bind(Lword);
1155 
1156     if (granularity <= sizeof (jint)) {
1157       __ tbz(count, 2 - exact_log2(granularity), Lint);
1158       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1159       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1160       __ bind(Lint);
1161     }
1162 
1163     if (granularity <= sizeof (jshort)) {
1164       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1165       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1166       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1167       __ bind(Lshort);
1168     }
1169 
1170     if (granularity <= sizeof (jbyte)) {
1171       __ tbz(count, 0, Lbyte);
1172       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1173       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1174       __ bind(Lbyte);
1175     }
1176   }
1177 
1178   Label copy_f, copy_b;
1179   Label copy_obj_f, copy_obj_b;
1180   Label copy_obj_uninit_f, copy_obj_uninit_b;
1181 
1182   // All-singing all-dancing memory copy.
1183   //
1184   // Copy count units of memory from s to d.  The size of a unit is
1185   // step, which can be positive or negative depending on the direction
1186   // of copy.  If is_aligned is false, we align the source address.
1187   //
1188 
1189   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1190                    Register s, Register d, Register count, int step) {
1191     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1192     bool is_backwards = step < 0;
1193     unsigned int granularity = uabs(step);
1194     const Register t0 = r3, t1 = r4;
1195 
1196     // Copies of <= 80 bytes (or 96 with SIMD) are done inline. Direction doesn't
1197     // matter because we always load all the data before writing anything.
1198     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1199     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1200     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1201     const Register send = r17, dend = r16;
1202     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1203     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1204     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1205 
1206     if (PrefetchCopyIntervalInBytes > 0)
1207       __ prfm(Address(s, 0), PLDL1KEEP);
1208     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1209     __ br(Assembler::HI, copy_big);
1210 
1211     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1212     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1213 
1214     __ cmp(count, u1(16/granularity));
1215     __ br(Assembler::LS, copy16);
1216 
1217     __ cmp(count, u1(64/granularity));
1218     __ br(Assembler::HI, copy80);
1219 
1220     __ cmp(count, u1(32/granularity));
1221     __ br(Assembler::LS, copy32);
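         // At this point the dispatch on size (count * granularity bytes) is:
         //   > 80 (96 with SIMD)  -> copy_big
         //   <= 16                -> copy16
         //   65..80 (96)          -> copy80
         //   17..32               -> copy32
         //   33..64               -> fall through below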
1222 
1223     // 33..64 bytes
1224     if (UseSIMDForMemoryOps) {
1225       bs.copy_load_at_32(v0, v1, Address(s, 0));
1226       bs.copy_load_at_32(v2, v3, Address(send, -32));
1227       bs.copy_store_at_32(Address(d, 0), v0, v1);
1228       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1229     } else {
1230       bs.copy_load_at_16(t0, t1, Address(s, 0));
1231       bs.copy_load_at_16(t2, t3, Address(s, 16));
1232       bs.copy_load_at_16(t4, t5, Address(send, -32));
1233       bs.copy_load_at_16(t6, t7, Address(send, -16));
1234 
1235       bs.copy_store_at_16(Address(d, 0), t0, t1);
1236       bs.copy_store_at_16(Address(d, 16), t2, t3);
1237       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1238       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1239     }
1240     __ b(finish);
1241 
1242     // 17..32 bytes
1243     __ bind(copy32);
1244     bs.copy_load_at_16(t0, t1, Address(s, 0));
1245     bs.copy_load_at_16(t6, t7, Address(send, -16));
1246 
1247     bs.copy_store_at_16(Address(d, 0), t0, t1);
1248     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1249     __ b(finish);
1250 
1251     // 65..80/96 bytes
1252     // (96 bytes if SIMD because we do 32 bytes per instruction)
1253     __ bind(copy80);
1254     if (UseSIMDForMemoryOps) {
1255       bs.copy_load_at_32(v0, v1, Address(s, 0));
1256       bs.copy_load_at_32(v2, v3, Address(s, 32));
1257       // Unaligned pointers can be an issue for copying.
1258       // The issue is more likely to occur when the granularity of the data is
1259       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1260       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1261       // The largest performance drop has been seen for the range 65-80 bytes.
1262       // For such cases using the pair of ldp/stp instead of the third pair of
1263       // ldpq/stpq fixes the performance issue.
1264       if (granularity < sizeof (jint)) {
1265         Label copy96;
1266         __ cmp(count, u1(80/granularity));
1267         __ br(Assembler::HI, copy96);
1268         bs.copy_load_at_16(t0, t1, Address(send, -16));
1269 
1270         bs.copy_store_at_32(Address(d, 0), v0, v1);
1271         bs.copy_store_at_32(Address(d, 32), v2, v3);
1272 
1273         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1274         __ b(finish);
1275 
1276         __ bind(copy96);
1277       }
1278       bs.copy_load_at_32(v4, v5, Address(send, -32));
1279 
1280       bs.copy_store_at_32(Address(d, 0), v0, v1);
1281       bs.copy_store_at_32(Address(d, 32), v2, v3);
1282 
1283       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1284     } else {
1285       bs.copy_load_at_16(t0, t1, Address(s, 0));
1286       bs.copy_load_at_16(t2, t3, Address(s, 16));
1287       bs.copy_load_at_16(t4, t5, Address(s, 32));
1288       bs.copy_load_at_16(t6, t7, Address(s, 48));
1289       bs.copy_load_at_16(t8, t9, Address(send, -16));
1290 
1291       bs.copy_store_at_16(Address(d, 0), t0, t1);
1292       bs.copy_store_at_16(Address(d, 16), t2, t3);
1293       bs.copy_store_at_16(Address(d, 32), t4, t5);
1294       bs.copy_store_at_16(Address(d, 48), t6, t7);
1295       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1296     }
1297     __ b(finish);
1298 
1299     // 0..16 bytes
1300     __ bind(copy16);
1301     __ cmp(count, u1(8/granularity));
1302     __ br(Assembler::LO, copy8);
1303 
1304     // 8..16 bytes
1305     bs.copy_load_at_8(t0, Address(s, 0));
1306     bs.copy_load_at_8(t1, Address(send, -8));
1307     bs.copy_store_at_8(Address(d, 0), t0);
1308     bs.copy_store_at_8(Address(dend, -8), t1);
1309     __ b(finish);
1310 
1311     if (granularity < 8) {
1312       // 4..7 bytes
1313       __ bind(copy8);
1314       __ tbz(count, 2 - exact_log2(granularity), copy4);
1315       __ ldrw(t0, Address(s, 0));
1316       __ ldrw(t1, Address(send, -4));
1317       __ strw(t0, Address(d, 0));
1318       __ strw(t1, Address(dend, -4));
1319       __ b(finish);
1320       if (granularity < 4) {
1321         // 0..3 bytes
1322         __ bind(copy4);
1323         __ cbz(count, finish); // get rid of 0 case
1324         if (granularity == 2) {
1325           __ ldrh(t0, Address(s, 0));
1326           __ strh(t0, Address(d, 0));
1327         } else { // granularity == 1
1328           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1329           // the first and last byte.
1330           // Handle the 3 byte case by loading and storing base + count/2
1331           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1332           // This does mean that in the 1 byte case we load/store the same
1333           // byte 3 times.
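               // Worked example (hypothetical counts): count == 3 stores
               // s+0 -> d+0, s+2 -> d+2 and, once count has been halved
               // below, s+1 -> d+1; count == 1 stores the single byte
               // three times.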
1334           __ lsr(count, count, 1);
1335           __ ldrb(t0, Address(s, 0));
1336           __ ldrb(t1, Address(send, -1));
1337           __ ldrb(t2, Address(s, count));
1338           __ strb(t0, Address(d, 0));
1339           __ strb(t1, Address(dend, -1));
1340           __ strb(t2, Address(d, count));
1341         }
1342         __ b(finish);
1343       }
1344     }
1345 
1346     __ bind(copy_big);
1347     if (is_backwards) {
1348       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1349       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1350     }
1351 
1352     // Now that we've got the small case out of the way, we can align
1353     // the source address on a 2-word boundary.
1354 
1355     // Here we will materialize a count in r15, which is used by copy_memory_small
1356     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1357     // Up until here, we have used t9, which aliases r15, but from here on, that register
1358     // can not be used as a temp register, as it contains the count.
1359 
1360     Label aligned;
1361 
1362     if (is_aligned) {
1363       // We may have to adjust by 1 word to get s 2-word-aligned.
1364       __ tbz(s, exact_log2(wordSize), aligned);
1365       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1366       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1367       __ sub(count, count, wordSize/granularity);
1368     } else {
1369       if (is_backwards) {
1370         __ andr(r15, s, 2 * wordSize - 1);
1371       } else {
1372         __ neg(r15, s);
1373         __ andr(r15, r15, 2 * wordSize - 1);
1374       }
1375       // r15 is the byte adjustment needed to align s.
1376       __ cbz(r15, aligned);
1377       int shift = exact_log2(granularity);
1378       if (shift)  __ lsr(r15, r15, shift);
1379       __ sub(count, count, r15);
1380 
1381 #if 0
1382       // ?? This code is only correct for a disjoint copy.  It may or
1383       // may not make sense to use it in that case.
1384 
1385       // Copy the first pair; s and d may not be aligned.
1386       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1387       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1388 
1389       // Align s and d, adjust count
1390       if (is_backwards) {
1391         __ sub(s, s, r15);
1392         __ sub(d, d, r15);
1393       } else {
1394         __ add(s, s, r15);
1395         __ add(d, d, r15);
1396       }
1397 #else
1398       copy_memory_small(decorators, type, s, d, r15, step);
1399 #endif
1400     }
1401 
1402     __ bind(aligned);
1403 
1404     // s is now 2-word-aligned.
1405 
1406     // We have a count of units and some trailing bytes.  Adjust the
1407     // count and do a bulk copy of words.
1408     __ lsr(r15, count, exact_log2(wordSize/granularity));
1409     if (direction == copy_forwards) {
1410       if (type != T_OBJECT) {
1411         __ bl(copy_f);
1412       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1413         __ bl(copy_obj_uninit_f);
1414       } else {
1415         __ bl(copy_obj_f);
1416       }
1417     } else {
1418       if (type != T_OBJECT) {
1419         __ bl(copy_b);
1420       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1421         __ bl(copy_obj_uninit_b);
1422       } else {
1423         __ bl(copy_obj_b);
1424       }
1425     }
1426 
1427     // And the tail.
1428     copy_memory_small(decorators, type, s, d, count, step);
1429 
1430     if (granularity >= 8) __ bind(copy8);
1431     if (granularity >= 4) __ bind(copy4);
1432     __ bind(finish);
1433   }
1434 
1435 
1436   void clobber_registers() {
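    // Debug-only: fill every call-clobbered general-purpose register
    // (except rscratch1, which holds the fill pattern) with
    // 0xdeadbeefdeadbeef so that reads of stale register contents are
    // easy to spot.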
1437 #ifdef ASSERT
1438     RegSet clobbered
1439       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1440     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1441     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1442     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1443       __ mov(*it, rscratch1);
1444     }
1445 #endif
1446 
1447   }
1448 
  // Scan over the array at a for count oops, verifying each one.
  // Preserves a and count; clobbers temp, rscratch1 and rscratch2.
1451   void verify_oop_array (int size, Register a, Register count, Register temp) {
1452     Label loop, end;
1453     __ mov(rscratch1, a);
1454     __ mov(rscratch2, zr);
1455     __ bind(loop);
1456     __ cmp(rscratch2, count);
1457     __ br(Assembler::HS, end);
1458     if (size == wordSize) {
1459       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1460       __ verify_oop(temp);
1461     } else {
1462       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1463       __ decode_heap_oop(temp); // calls verify_oop
1464     }
1465     __ add(rscratch2, rscratch2, 1);
1466     __ b(loop);
1467     __ bind(end);
1468   }
1469 
1470   // Arguments:
1471   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1472   //             ignored
1473   //   is_oop  - true => oop array, so generate store check code
1474   //   name    - stub name string
1475   //
1476   // Inputs:
1477   //   c_rarg0   - source array address
1478   //   c_rarg1   - destination array address
1479   //   c_rarg2   - element count, treated as ssize_t, can be zero
1480   //
1481   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1482   // the hardware handle it.  The two dwords within qwords that span
1483   // cache line boundaries will still be loaded and stored atomically.
1484   //
  // Side Effects:
  //   If entry is not null, *entry is set to the no-overlap entry point,
  //   for use by the corresponding generate_conjoint_copy() stub.
1488   //
1489   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1490                                   const char *name, bool dest_uninitialized = false) {
1491     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1492     RegSet saved_reg = RegSet::of(s, d, count);
1493     __ align(CodeEntryAlignment);
1494     StubCodeMark mark(this, "StubRoutines", name);
1495     address start = __ pc();
1496     __ enter();
1497 
1498     if (entry != nullptr) {
1499       *entry = __ pc();
1500       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1501       BLOCK_COMMENT("Entry:");
1502     }
1503 
1504     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1505     if (dest_uninitialized) {
1506       decorators |= IS_DEST_UNINITIALIZED;
1507     }
1508     if (aligned) {
1509       decorators |= ARRAYCOPY_ALIGNED;
1510     }
1511 
1512     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1513     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1514 
1515     if (is_oop) {
1516       // save regs before copy_memory
1517       __ push(RegSet::of(d, count), sp);
1518     }
1519     {
1520       // UnsafeMemoryAccess page error: continue after unsafe access
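      // Only primitive copies reachable from Unsafe.copyMemory need a
      // recovery entry; for the aligned variants that is presumably only
      // the jlong stub, which aliases the unaligned one.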
1521       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1522       UnsafeMemoryAccessMark umam(this, add_entry, true);
1523       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1524     }
1525 
1526     if (is_oop) {
1527       __ pop(RegSet::of(d, count), sp);
1528       if (VerifyOops)
1529         verify_oop_array(size, d, count, r16);
1530     }
1531 
1532     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1533 
1534     __ leave();
1535     __ mov(r0, zr); // return 0
1536     __ ret(lr);
1537     return start;
1538   }
1539 
1540   // Arguments:
1541   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1542   //             ignored
1543   //   is_oop  - true => oop array, so generate store check code
1544   //   name    - stub name string
1545   //
1546   // Inputs:
1547   //   c_rarg0   - source array address
1548   //   c_rarg1   - destination array address
1549   //   c_rarg2   - element count, treated as ssize_t, can be zero
1550   //
1551   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1552   // the hardware handle it.  The two dwords within qwords that span
1553   // cache line boundaries will still be loaded and stored atomically.
1554   //
1555   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1556                                  address *entry, const char *name,
1557                                  bool dest_uninitialized = false) {
1558     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1559     RegSet saved_regs = RegSet::of(s, d, count);
1560     StubCodeMark mark(this, "StubRoutines", name);
1561     address start = __ pc();
1562     __ enter();
1563 
1564     if (entry != nullptr) {
1565       *entry = __ pc();
1566       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1567       BLOCK_COMMENT("Entry:");
1568     }
1569 
1570     // use fwd copy when (d-s) above_equal (count*size)
1571     __ sub(rscratch1, d, s);
1572     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1573     __ br(Assembler::HS, nooverlap_target);
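    // Note: the comparison above is unsigned. If d < s the subtraction
    // wraps to a large value, so the forward (disjoint) copy is taken,
    // which is safe when the destination precedes the source. If
    // d >= s + count*size the regions do not overlap at all. Only when
    // s < d < s + count*size do we fall through to the backward copy.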
1574 
1575     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1576     if (dest_uninitialized) {
1577       decorators |= IS_DEST_UNINITIALIZED;
1578     }
1579     if (aligned) {
1580       decorators |= ARRAYCOPY_ALIGNED;
1581     }
1582 
1583     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1584     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1585 
1586     if (is_oop) {
1587       // save regs before copy_memory
1588       __ push(RegSet::of(d, count), sp);
1589     }
1590     {
1591       // UnsafeMemoryAccess page error: continue after unsafe access
1592       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1593       UnsafeMemoryAccessMark umam(this, add_entry, true);
1594       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1595     }
1596     if (is_oop) {
1597       __ pop(RegSet::of(d, count), sp);
1598       if (VerifyOops)
1599         verify_oop_array(size, d, count, r16);
1600     }
1601     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1602     __ leave();
1603     __ mov(r0, zr); // return 0
1604     __ ret(lr);
1605     return start;
1606 }
1607 
1608   // Arguments:
1609   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1610   //             ignored
1611   //   name    - stub name string
1612   //
1613   // Inputs:
1614   //   c_rarg0   - source array address
1615   //   c_rarg1   - destination array address
1616   //   c_rarg2   - element count, treated as ssize_t, can be zero
1617   //
1618   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1619   // we let the hardware handle it.  The one to eight bytes within words,
1620   // dwords or qwords that span cache line boundaries will still be loaded
1621   // and stored atomically.
1622   //
1630   // Side Effects:
1631   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1632   //   used by generate_conjoint_byte_copy().
1633   //
1634   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1635     const bool not_oop = false;
1636     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1637   }
1638 
1639   // Arguments:
1640   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1641   //             ignored
1642   //   name    - stub name string
1643   //
1644   // Inputs:
1645   //   c_rarg0   - source array address
1646   //   c_rarg1   - destination array address
1647   //   c_rarg2   - element count, treated as ssize_t, can be zero
1648   //
1649   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1650   // we let the hardware handle it.  The one to eight bytes within words,
1651   // dwords or qwords that span cache line boundaries will still be loaded
1652   // and stored atomically.
1653   //
1654   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1655                                       address* entry, const char *name) {
1656     const bool not_oop = false;
1657     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1658   }
1659 
1660   // Arguments:
1661   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1662   //             ignored
1663   //   name    - stub name string
1664   //
1665   // Inputs:
1666   //   c_rarg0   - source array address
1667   //   c_rarg1   - destination array address
1668   //   c_rarg2   - element count, treated as ssize_t, can be zero
1669   //
1670   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1671   // let the hardware handle it.  The two or four words within dwords
1672   // or qwords that span cache line boundaries will still be loaded
1673   // and stored atomically.
1674   //
1675   // Side Effects:
1676   //   disjoint_short_copy_entry is set to the no-overlap entry point
1677   //   used by generate_conjoint_short_copy().
1678   //
1679   address generate_disjoint_short_copy(bool aligned,
1680                                        address* entry, const char *name) {
1681     const bool not_oop = false;
1682     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1683   }
1684 
1685   // Arguments:
1686   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1687   //             ignored
1688   //   name    - stub name string
1689   //
1690   // Inputs:
1691   //   c_rarg0   - source array address
1692   //   c_rarg1   - destination array address
1693   //   c_rarg2   - element count, treated as ssize_t, can be zero
1694   //
1695   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1696   // let the hardware handle it.  The two or four words within dwords
1697   // or qwords that span cache line boundaries will still be loaded
1698   // and stored atomically.
1699   //
1700   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1701                                        address *entry, const char *name) {
1702     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1707   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1708   //             ignored
1709   //   name    - stub name string
1710   //
1711   // Inputs:
1712   //   c_rarg0   - source array address
1713   //   c_rarg1   - destination array address
1714   //   c_rarg2   - element count, treated as ssize_t, can be zero
1715   //
1716   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1717   // the hardware handle it.  The two dwords within qwords that span
1718   // cache line boundaries will still be loaded and stored atomically.
1719   //
1720   // Side Effects:
1721   //   disjoint_int_copy_entry is set to the no-overlap entry point
1722   //   used by generate_conjoint_int_oop_copy().
1723   //
1724   address generate_disjoint_int_copy(bool aligned, address *entry,
1725                                          const char *name, bool dest_uninitialized = false) {
1726     const bool not_oop = false;
1727     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1728   }
1729 
1730   // Arguments:
1731   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1732   //             ignored
1733   //   name    - stub name string
1734   //
1735   // Inputs:
1736   //   c_rarg0   - source array address
1737   //   c_rarg1   - destination array address
1738   //   c_rarg2   - element count, treated as ssize_t, can be zero
1739   //
1740   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1741   // the hardware handle it.  The two dwords within qwords that span
1742   // cache line boundaries will still be loaded and stored atomically.
1743   //
1744   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1745                                      address *entry, const char *name,
1746                                      bool dest_uninitialized = false) {
1747     const bool not_oop = false;
1748     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1749   }
1750 
1751 
1752   // Arguments:
1753   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1754   //             ignored
1755   //   name    - stub name string
1756   //
1757   // Inputs:
1758   //   c_rarg0   - source array address
1759   //   c_rarg1   - destination array address
1760   //   c_rarg2   - element count, treated as size_t, can be zero
1761   //
1762   // Side Effects:
1763   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1764   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1765   //
1766   address generate_disjoint_long_copy(bool aligned, address *entry,
1767                                           const char *name, bool dest_uninitialized = false) {
1768     const bool not_oop = false;
1769     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1770   }
1771 
1772   // Arguments:
1773   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1774   //             ignored
1775   //   name    - stub name string
1776   //
1777   // Inputs:
1778   //   c_rarg0   - source array address
1779   //   c_rarg1   - destination array address
1780   //   c_rarg2   - element count, treated as size_t, can be zero
1781   //
1782   address generate_conjoint_long_copy(bool aligned,
1783                                       address nooverlap_target, address *entry,
1784                                       const char *name, bool dest_uninitialized = false) {
1785     const bool not_oop = false;
1786     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1787   }
1788 
1789   // Arguments:
1790   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1791   //             ignored
1792   //   name    - stub name string
1793   //
1794   // Inputs:
1795   //   c_rarg0   - source array address
1796   //   c_rarg1   - destination array address
1797   //   c_rarg2   - element count, treated as size_t, can be zero
1798   //
1799   // Side Effects:
1800   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1801   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1802   //
1803   address generate_disjoint_oop_copy(bool aligned, address *entry,
1804                                      const char *name, bool dest_uninitialized) {
1805     const bool is_oop = true;
1806     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1807     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1808   }
1809 
1810   // Arguments:
1811   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1812   //             ignored
1813   //   name    - stub name string
1814   //
1815   // Inputs:
1816   //   c_rarg0   - source array address
1817   //   c_rarg1   - destination array address
1818   //   c_rarg2   - element count, treated as size_t, can be zero
1819   //
1820   address generate_conjoint_oop_copy(bool aligned,
1821                                      address nooverlap_target, address *entry,
1822                                      const char *name, bool dest_uninitialized) {
1823     const bool is_oop = true;
1824     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1825     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1826                                   name, dest_uninitialized);
1827   }
1828 
1829 
1830   // Helper for generating a dynamic type check.
1831   // Smashes rscratch1, rscratch2.
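  // Branches to L_success if sub_klass is a subtype of super_klass;
  // falls through on failure.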
1832   void generate_type_check(Register sub_klass,
1833                            Register super_check_offset,
1834                            Register super_klass,
1835                            Label& L_success) {
1836     assert_different_registers(sub_klass, super_check_offset, super_klass);
1837 
1838     BLOCK_COMMENT("type_check:");
1839 
1840     Label L_miss;
1841 
1842     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1843                                      super_check_offset);
1844     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
1845 
1846     // Fall through on failure!
1847     __ BIND(L_miss);
1848   }
1849 
1850   //
1851   //  Generate checkcasting array copy stub
1852   //
1853   //  Input:
1854   //    c_rarg0   - source array address
1855   //    c_rarg1   - destination array address
1856   //    c_rarg2   - element count, treated as ssize_t, can be zero
1857   //    c_rarg3   - size_t ckoff (super_check_offset)
1858   //    c_rarg4   - oop ckval (super_klass)
1859   //
1860   //  Output:
1861   //    r0 ==  0  -  success
1862   //    r0 == -1^K - failure, where K is partial transfer count
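  //               (i.e. r0 == ~K; e.g. if 3 elements were copied before
  //               the failing element, r0 == -4)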
1863   //
1864   address generate_checkcast_copy(const char *name, address *entry,
1865                                   bool dest_uninitialized = false) {
1866 
1867     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1868 
1869     // Input registers (after setup_arg_regs)
1870     const Register from        = c_rarg0;   // source array address
1871     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1873     const Register ckoff       = c_rarg3;   // super_check_offset
1874     const Register ckval       = c_rarg4;   // super_klass
1875 
1876     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1877     RegSet wb_post_saved_regs = RegSet::of(count);
1878 
1879     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1880     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
1882     const Register start_to    = r20;       // destination array start address
1883     const Register r19_klass   = r19;       // oop._klass
1884 
1885     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1886     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1887 
1888     //---------------------------------------------------------------
1889     // Assembler stub will be used for this call to arraycopy
1890     // if the two arrays are subtypes of Object[] but the
1891     // destination array type is not equal to or a supertype
1892     // of the source type.  Each element must be separately
1893     // checked.
1894 
1895     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1896                                copied_oop, r19_klass, count_save);
1897 
1898     __ align(CodeEntryAlignment);
1899     StubCodeMark mark(this, "StubRoutines", name);
1900     address start = __ pc();
1901 
1902     __ enter(); // required for proper stackwalking of RuntimeStub frame
1903 
1904 #ifdef ASSERT
    // caller guarantees that the arrays really are different;
    // otherwise, we would have to make conjoint checks
1907     { Label L;
1908       __ b(L);                  // conjoint check not yet implemented
1909       __ stop("checkcast_copy within a single array");
1910       __ bind(L);
1911     }
1912 #endif //ASSERT
1913 
1914     // Caller of this entry point must set up the argument registers.
1915     if (entry != nullptr) {
1916       *entry = __ pc();
1917       BLOCK_COMMENT("Entry:");
1918     }
1919 
    // Empty array:  Nothing to do.
1921     __ cbz(count, L_done);
1922     __ push(RegSet::of(r19, r20, r21, r22), sp);
1923 
1924 #ifdef ASSERT
1925     BLOCK_COMMENT("assert consistent ckoff/ckval");
1926     // The ckoff and ckval must be mutually consistent,
1927     // even though caller generates both.
1928     { Label L;
1929       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1930       __ ldrw(start_to, Address(ckval, sco_offset));
1931       __ cmpw(ckoff, start_to);
1932       __ br(Assembler::EQ, L);
1933       __ stop("super_check_offset inconsistent");
1934       __ bind(L);
1935     }
1936 #endif //ASSERT
1937 
1938     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1939     bool is_oop = true;
1940     int element_size = UseCompressedOops ? 4 : 8;
1941     if (dest_uninitialized) {
1942       decorators |= IS_DEST_UNINITIALIZED;
1943     }
1944 
1945     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1946     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1947 
1948     // save the original count
1949     __ mov(count_save, count);
1950 
1951     // Copy from low to high addresses
1952     __ mov(start_to, to);              // Save destination array start address
1953     __ b(L_load_element);
1954 
1955     // ======== begin loop ========
1956     // (Loop is rotated; its entry is L_load_element.)
1957     // Loop control:
1958     //   for (; count != 0; count--) {
1959     //     copied_oop = load_heap_oop(from++);
1960     //     ... generate_type_check ...;
1961     //     store_heap_oop(to++, copied_oop);
1962     //   }
1963     __ align(OptoLoopAlignment);
1964 
1965     __ BIND(L_store_element);
1966     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1967                       __ post(to, element_size), copied_oop, noreg,
1968                       gct1, gct2, gct3);
1969     __ sub(count, count, 1);
1970     __ cbz(count, L_do_card_marks);
1971 
1972     // ======== loop entry is here ========
1973     __ BIND(L_load_element);
1974     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1975                      copied_oop, noreg, __ post(from, element_size),
1976                      gct1);
1977     __ cbz(copied_oop, L_store_element);
1978 
1979     __ load_klass(r19_klass, copied_oop);// query the object klass
1980     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1981     // ======== end loop ========
1982 
1983     // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
1985     // Emit GC store barriers for the oops we have copied and report
1986     // their number to the caller.
1987 
1988     __ subs(count, count_save, count);     // K = partially copied oop count
1989     __ eon(count, count, zr);                   // report (-1^K) to caller
1990     __ br(Assembler::EQ, L_done_pop);
1991 
1992     __ BIND(L_do_card_marks);
1993     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1994 
1995     __ bind(L_done_pop);
1996     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1997     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1998 
1999     __ bind(L_done);
2000     __ mov(r0, count);
2001     __ leave();
2002     __ ret(lr);
2003 
2004     return start;
2005   }
2006 
2007   // Perform range checks on the proposed arraycopy.
  // Kills temp and rscratch1, but nothing else.
2009   // Also, clean the sign bits of src_pos and dst_pos.
2010   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2011                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
2013                               Register dst_pos, // destination position (c_rarg3)
2014                               Register length,
2015                               Register temp,
2016                               Label& L_failed) {
2017     BLOCK_COMMENT("arraycopy_range_checks:");
2018 
2019     assert_different_registers(rscratch1, temp);
2020 
2021     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2022     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2023     __ addw(temp, length, src_pos);
2024     __ cmpw(temp, rscratch1);
2025     __ br(Assembler::HI, L_failed);
2026 
2027     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2028     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2029     __ addw(temp, length, dst_pos);
2030     __ cmpw(temp, rscratch1);
2031     __ br(Assembler::HI, L_failed);
2032 
2033     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2034     __ movw(src_pos, src_pos);
2035     __ movw(dst_pos, dst_pos);
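    // (a 32-bit movw write zero-extends into the full 64-bit register)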
2036 
2037     BLOCK_COMMENT("arraycopy_range_checks done");
2038   }
2039 
  // This stub is only called from a simple test routine; it will be
  // written properly once it is called from code that actually needs it.
2043   static void fake_arraycopy_stub(address src, address dst, int count) {
2044     assert(count == 0, "huh?");
2045   }
2046 
2047 
2048   //
2049   //  Generate 'unsafe' array copy stub
2050   //  Though just as safe as the other stubs, it takes an unscaled
2051   //  size_t argument instead of an element count.
2052   //
2053   //  Input:
2054   //    c_rarg0   - source array address
2055   //    c_rarg1   - destination array address
2056   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2057   //
2058   // Examines the alignment of the operands and dispatches
2059   // to a long, int, short, or byte copy loop.
2060   //
2061   address generate_unsafe_copy(const char *name,
2062                                address byte_copy_entry,
2063                                address short_copy_entry,
2064                                address int_copy_entry,
2065                                address long_copy_entry) {
2066     Label L_long_aligned, L_int_aligned, L_short_aligned;
2067     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2068 
2069     __ align(CodeEntryAlignment);
2070     StubCodeMark mark(this, "StubRoutines", name);
2071     address start = __ pc();
2072     __ enter(); // required for proper stackwalking of RuntimeStub frame
2073 
2074     // bump this on entry, not on exit:
2075     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2076 
2077     __ orr(rscratch1, s, d);
2078     __ orr(rscratch1, rscratch1, count);
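    // The OR of s, d and count has a low bit set wherever any of the
    // three is misaligned at that granularity, so its low bits select the
    // widest element size all of them allow. For example, if the
    // addresses are 8-byte aligned but count is only a multiple of 4,
    // bit 2 is set and the int copy is chosen.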
2079 
2080     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2081     __ cbz(rscratch1, L_long_aligned);
2082     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2083     __ cbz(rscratch1, L_int_aligned);
2084     __ tbz(rscratch1, 0, L_short_aligned);
2085     __ b(RuntimeAddress(byte_copy_entry));
2086 
2087     __ BIND(L_short_aligned);
2088     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2089     __ b(RuntimeAddress(short_copy_entry));
2090     __ BIND(L_int_aligned);
2091     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2092     __ b(RuntimeAddress(int_copy_entry));
2093     __ BIND(L_long_aligned);
2094     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2095     __ b(RuntimeAddress(long_copy_entry));
2096 
2097     return start;
2098   }
2099 
2100   //
2101   //  Generate generic array copy stubs
2102   //
2103   //  Input:
2104   //    c_rarg0    -  src oop
2105   //    c_rarg1    -  src_pos (32-bits)
2106   //    c_rarg2    -  dst oop
2107   //    c_rarg3    -  dst_pos (32-bits)
2108   //    c_rarg4    -  element count (32-bits)
2109   //
2110   //  Output:
2111   //    r0 ==  0  -  success
2112   //    r0 == -1^K - failure, where K is partial transfer count
2113   //
2114   address generate_generic_copy(const char *name,
2115                                 address byte_copy_entry, address short_copy_entry,
2116                                 address int_copy_entry, address oop_copy_entry,
2117                                 address long_copy_entry, address checkcast_copy_entry) {
2118 
2119     Label L_failed, L_objArray;
2120     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2121 
2122     // Input registers
2123     const Register src        = c_rarg0;  // source array oop
2124     const Register src_pos    = c_rarg1;  // source position
2125     const Register dst        = c_rarg2;  // destination array oop
2126     const Register dst_pos    = c_rarg3;  // destination position
2127     const Register length     = c_rarg4;
2128 
2129 
2130     // Registers used as temps
2131     const Register dst_klass  = c_rarg5;
2132 
2133     __ align(CodeEntryAlignment);
2134 
2135     StubCodeMark mark(this, "StubRoutines", name);
2136 
2137     address start = __ pc();
2138 
2139     __ enter(); // required for proper stackwalking of RuntimeStub frame
2140 
2141     // bump this on entry, not on exit:
2142     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2143 
2144     //-----------------------------------------------------------------------
2145     // Assembler stub will be used for this call to arraycopy
2146     // if the following conditions are met:
2147     //
2148     // (1) src and dst must not be null.
2149     // (2) src_pos must not be negative.
2150     // (3) dst_pos must not be negative.
2151     // (4) length  must not be negative.
2152     // (5) src klass and dst klass should be the same and not null.
2153     // (6) src and dst should be arrays.
2154     // (7) src_pos + length must not exceed length of src.
2155     // (8) dst_pos + length must not exceed length of dst.
2156     //
2157 
2158     //  if (src == nullptr) return -1;
2159     __ cbz(src, L_failed);
2160 
2161     //  if (src_pos < 0) return -1;
2162     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2163 
2164     //  if (dst == nullptr) return -1;
2165     __ cbz(dst, L_failed);
2166 
2167     //  if (dst_pos < 0) return -1;
2168     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2169 
2170     // registers used as temp
2171     const Register scratch_length    = r16; // elements count to copy
2172     const Register scratch_src_klass = r17; // array klass
2173     const Register lh                = r15; // layout helper
2174 
2175     //  if (length < 0) return -1;
2176     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2177     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2178 
2179     __ load_klass(scratch_src_klass, src);
2180 #ifdef ASSERT
2181     //  assert(src->klass() != nullptr);
2182     {
2183       BLOCK_COMMENT("assert klasses not null {");
2184       Label L1, L2;
2185       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2186       __ bind(L1);
2187       __ stop("broken null klass");
2188       __ bind(L2);
2189       __ load_klass(rscratch1, dst);
2190       __ cbz(rscratch1, L1);     // this would be broken also
2191       BLOCK_COMMENT("} assert klasses not null done");
2192     }
2193 #endif
2194 
2195     // Load layout helper (32-bits)
2196     //
2197     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2198     // 32        30    24            16              8     2                 0
2199     //
2200     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2201     //
2202 
2203     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2204 
2205     // Handle objArrays completely differently...
2206     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2207     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2208     __ movw(rscratch1, objArray_lh);
2209     __ eorw(rscratch2, lh, rscratch1);
2210     __ cbzw(rscratch2, L_objArray);
2211 
2212     //  if (src->klass() != dst->klass()) return -1;
2213     __ load_klass(rscratch2, dst);
2214     __ eor(rscratch2, rscratch2, scratch_src_klass);
2215     __ cbnz(rscratch2, L_failed);
2216 
2217     //  if (!src->is_Array()) return -1;
2218     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2219 
2220     // At this point, it is known to be a typeArray (array_tag 0x3).
2221 #ifdef ASSERT
2222     {
2223       BLOCK_COMMENT("assert primitive array {");
2224       Label L;
2225       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2226       __ cmpw(lh, rscratch2);
2227       __ br(Assembler::GE, L);
2228       __ stop("must be a primitive array");
2229       __ bind(L);
2230       BLOCK_COMMENT("} assert primitive array done");
2231     }
2232 #endif
2233 
2234     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2235                            rscratch2, L_failed);
2236 
2237     // TypeArrayKlass
2238     //
2239     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2240     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2241     //
2242 
2243     const Register rscratch1_offset = rscratch1;    // array offset
2244     const Register r15_elsize = lh; // element size
2245 
2246     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2247            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2248     __ add(src, src, rscratch1_offset);           // src array offset
2249     __ add(dst, dst, rscratch1_offset);           // dst array offset
2250     BLOCK_COMMENT("choose copy loop based on element size");
2251 
2252     // next registers should be set before the jump to corresponding stub
2253     const Register from     = c_rarg0;  // source array address
2254     const Register to       = c_rarg1;  // destination array address
2255     const Register count    = c_rarg2;  // elements count
2256 
2257     // 'from', 'to', 'count' registers should be set in such order
2258     // since they are the same as 'src', 'src_pos', 'dst'.
2259 
2260     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2261 
2262     // The possible values of elsize are 0-3, i.e. exact_log2(element
2263     // size in bytes).  We do a simple bitwise binary search.
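    // Bits 1:0 of r15_elsize (the log2 element size) select the loop:
    //   00 -> byte copy, 01 -> short copy, 10 -> int copy, 11 -> long copy.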
2264   __ BIND(L_copy_bytes);
2265     __ tbnz(r15_elsize, 1, L_copy_ints);
2266     __ tbnz(r15_elsize, 0, L_copy_shorts);
2267     __ lea(from, Address(src, src_pos));// src_addr
2268     __ lea(to,   Address(dst, dst_pos));// dst_addr
2269     __ movw(count, scratch_length); // length
2270     __ b(RuntimeAddress(byte_copy_entry));
2271 
2272   __ BIND(L_copy_shorts);
2273     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2274     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2275     __ movw(count, scratch_length); // length
2276     __ b(RuntimeAddress(short_copy_entry));
2277 
2278   __ BIND(L_copy_ints);
2279     __ tbnz(r15_elsize, 0, L_copy_longs);
2280     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2281     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2282     __ movw(count, scratch_length); // length
2283     __ b(RuntimeAddress(int_copy_entry));
2284 
2285   __ BIND(L_copy_longs);
2286 #ifdef ASSERT
2287     {
2288       BLOCK_COMMENT("assert long copy {");
2289       Label L;
2290       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2291       __ cmpw(r15_elsize, LogBytesPerLong);
2292       __ br(Assembler::EQ, L);
2293       __ stop("must be long copy, but elsize is wrong");
2294       __ bind(L);
2295       BLOCK_COMMENT("} assert long copy done");
2296     }
2297 #endif
2298     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2299     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2300     __ movw(count, scratch_length); // length
2301     __ b(RuntimeAddress(long_copy_entry));
2302 
2303     // ObjArrayKlass
2304   __ BIND(L_objArray);
2305     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2306 
2307     Label L_plain_copy, L_checkcast_copy;
2308     //  test array classes for subtyping
2309     __ load_klass(r15, dst);
2310     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2311     __ br(Assembler::NE, L_checkcast_copy);
2312 
2313     // Identically typed arrays can be copied without element-wise checks.
2314     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2315                            rscratch2, L_failed);
2316 
2317     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2318     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2319     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2320     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2321     __ movw(count, scratch_length); // length
2322   __ BIND(L_plain_copy);
2323     __ b(RuntimeAddress(oop_copy_entry));
2324 
2325   __ BIND(L_checkcast_copy);
2326     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2327     {
2328       // Before looking at dst.length, make sure dst is also an objArray.
2329       __ ldrw(rscratch1, Address(r15, lh_offset));
2330       __ movw(rscratch2, objArray_lh);
2331       __ eorw(rscratch1, rscratch1, rscratch2);
2332       __ cbnzw(rscratch1, L_failed);
2333 
2334       // It is safe to examine both src.length and dst.length.
2335       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2336                              r15, L_failed);
2337 
2338       __ load_klass(dst_klass, dst); // reload
2339 
2340       // Marshal the base address arguments now, freeing registers.
2341       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2342       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2343       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2344       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2345       __ movw(count, length);           // length (reloaded)
2346       Register sco_temp = c_rarg3;      // this register is free now
2347       assert_different_registers(from, to, count, sco_temp,
2348                                  dst_klass, scratch_src_klass);
2349       // assert_clean_int(count, sco_temp);
2350 
2351       // Generate the type check.
2352       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2353       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2354 
2355       // Smashes rscratch1, rscratch2
2356       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2357 
2358       // Fetch destination element klass from the ObjArrayKlass header.
2359       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2360       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2361       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2362 
2363       // the checkcast_copy loop needs two extra arguments:
2364       assert(c_rarg3 == sco_temp, "#3 already in place");
2365       // Set up arguments for checkcast_copy_entry.
2366       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2367       __ b(RuntimeAddress(checkcast_copy_entry));
2368     }
2369 
2370   __ BIND(L_failed);
2371     __ mov(r0, -1);
2372     __ leave();   // required for proper stackwalking of RuntimeStub frame
2373     __ ret(lr);
2374 
2375     return start;
2376   }
2377 
2378   //
2379   // Generate stub for array fill. If "aligned" is true, the
2380   // "to" address is assumed to be heapword aligned.
2381   //
2382   // Arguments for generated stub:
2383   //   to:    c_rarg0
2384   //   value: c_rarg1
2385   //   count: c_rarg2 treated as signed
2386   //
2387   address generate_fill(BasicType t, bool aligned, const char *name) {
2388     __ align(CodeEntryAlignment);
2389     StubCodeMark mark(this, "StubRoutines", name);
2390     address start = __ pc();
2391 
2392     BLOCK_COMMENT("Entry:");
2393 
    const Register to        = c_rarg0;  // destination array address
2395     const Register value     = c_rarg1;  // value
2396     const Register count     = c_rarg2;  // elements count
2397 
2398     const Register bz_base = r10;        // base for block_zero routine
2399     const Register cnt_words = r11;      // temp register
2400 
2401     __ enter();
2402 
2403     Label L_fill_elements, L_exit1;
2404 
2405     int shift = -1;
2406     switch (t) {
2407       case T_BYTE:
2408         shift = 0;
2409         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2410         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2411         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2412         __ br(Assembler::LO, L_fill_elements);
2413         break;
2414       case T_SHORT:
2415         shift = 1;
2416         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2417         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2418         __ br(Assembler::LO, L_fill_elements);
2419         break;
2420       case T_INT:
2421         shift = 2;
2422         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2423         __ br(Assembler::LO, L_fill_elements);
2424         break;
2425       default: ShouldNotReachHere();
2426     }
2427 
    // Align the destination ('to') address at an 8-byte boundary.
2429     Label L_skip_align1, L_skip_align2, L_skip_align4;
2430     if (!aligned) {
2431       switch (t) {
2432         case T_BYTE:
2433           // One byte misalignment happens only for byte arrays.
2434           __ tbz(to, 0, L_skip_align1);
2435           __ strb(value, Address(__ post(to, 1)));
2436           __ subw(count, count, 1);
2437           __ bind(L_skip_align1);
2438           // Fallthrough
2439         case T_SHORT:
2440           // Two bytes misalignment happens only for byte and short (char) arrays.
2441           __ tbz(to, 1, L_skip_align2);
2442           __ strh(value, Address(__ post(to, 2)));
2443           __ subw(count, count, 2 >> shift);
2444           __ bind(L_skip_align2);
2445           // Fallthrough
2446         case T_INT:
          // Align to 8 bytes; we know we are 4-byte aligned to start.
2448           __ tbz(to, 2, L_skip_align4);
2449           __ strw(value, Address(__ post(to, 4)));
2450           __ subw(count, count, 4 >> shift);
2451           __ bind(L_skip_align4);
2452           break;
2453         default: ShouldNotReachHere();
2454       }
2455     }
2456 
2457     //
2458     //  Fill large chunks
2459     //
2460     __ lsrw(cnt_words, count, 3 - shift); // number of words
2461     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2462     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
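    // At this point 'value' has been replicated to all 64 bits (a byte
    // fill of 0xAB is now 0xABABABABABABABAB), cnt_words holds the number
    // of whole 8-byte words to fill, and count the trailing elements.
    // For example, filling 37 shorts (shift == 1) gives cnt_words == 9
    // and count == 1.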
2463     if (UseBlockZeroing) {
2464       Label non_block_zeroing, rest;
2465       // If the fill value is zero we can use the fast zero_words().
2466       __ cbnz(value, non_block_zeroing);
2467       __ mov(bz_base, to);
2468       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2469       address tpc = __ zero_words(bz_base, cnt_words);
2470       if (tpc == nullptr) {
2471         fatal("CodeCache is full at generate_fill");
2472       }
2473       __ b(rest);
2474       __ bind(non_block_zeroing);
2475       __ fill_words(to, cnt_words, value);
2476       __ bind(rest);
2477     } else {
2478       __ fill_words(to, cnt_words, value);
2479     }
2480 
2481     // Remaining count is less than 8 bytes. Fill it by a single store.
2482     // Note that the total length is no less than 8 bytes.
2483     if (t == T_BYTE || t == T_SHORT) {
2484       Label L_exit1;
2485       __ cbzw(count, L_exit1);
2486       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2487       __ str(value, Address(to, -8));    // overwrite some elements
2488       __ bind(L_exit1);
2489       __ leave();
2490       __ ret(lr);
2491     }
2492 
    // Handle fills of less than 8 bytes.
2494     Label L_fill_2, L_fill_4, L_exit2;
2495     __ bind(L_fill_elements);
2496     switch (t) {
2497       case T_BYTE:
2498         __ tbz(count, 0, L_fill_2);
2499         __ strb(value, Address(__ post(to, 1)));
2500         __ bind(L_fill_2);
2501         __ tbz(count, 1, L_fill_4);
2502         __ strh(value, Address(__ post(to, 2)));
2503         __ bind(L_fill_4);
2504         __ tbz(count, 2, L_exit2);
2505         __ strw(value, Address(to));
2506         break;
2507       case T_SHORT:
2508         __ tbz(count, 0, L_fill_4);
2509         __ strh(value, Address(__ post(to, 2)));
2510         __ bind(L_fill_4);
2511         __ tbz(count, 1, L_exit2);
2512         __ strw(value, Address(to));
2513         break;
2514       case T_INT:
2515         __ cbzw(count, L_exit2);
2516         __ strw(value, Address(to));
2517         break;
2518       default: ShouldNotReachHere();
2519     }
2520     __ bind(L_exit2);
2521     __ leave();
2522     __ ret(lr);
2523     return start;
2524   }
2525 
2526   address generate_data_cache_writeback() {
2527     const Register line        = c_rarg0;  // address of line to write back
2528 
2529     __ align(CodeEntryAlignment);
2530 
2531     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2532 
2533     address start = __ pc();
2534     __ enter();
2535     __ cache_wb(Address(line, 0));
2536     __ leave();
2537     __ ret(lr);
2538 
2539     return start;
2540   }
2541 
2542   address generate_data_cache_writeback_sync() {
2543     const Register is_pre     = c_rarg0;  // pre or post sync
2544 
2545     __ align(CodeEntryAlignment);
2546 
2547     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2548 
    // pre wbsync is a no-op
    // post wbsync is implemented as a memory barrier
2551 
2552     Label skip;
2553     address start = __ pc();
2554     __ enter();
2555     __ cbnz(is_pre, skip);
2556     __ cache_wbsync(false);
2557     __ bind(skip);
2558     __ leave();
2559     __ ret(lr);
2560 
2561     return start;
2562   }
2563 
2564   void generate_arraycopy_stubs() {
2565     address entry;
2566     address entry_jbyte_arraycopy;
2567     address entry_jshort_arraycopy;
2568     address entry_jint_arraycopy;
2569     address entry_oop_arraycopy;
2570     address entry_jlong_arraycopy;
2571     address entry_checkcast_arraycopy;
2572 
2573     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards);
2574     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards);
2575 
2576     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards);
2577     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards);
2578 
2579     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards);
2580     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards);
2581 
2582     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2583 
2584     //*** jbyte
2585     // Always need aligned and unaligned versions
2586     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2587                                                                                   "jbyte_disjoint_arraycopy");
2588     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2589                                                                                   &entry_jbyte_arraycopy,
2590                                                                                   "jbyte_arraycopy");
2591     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2592                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2593     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, nullptr,
2594                                                                                   "arrayof_jbyte_arraycopy");
2595 
2596     //*** jshort
2597     // Always need aligned and unaligned versions
2598     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2599                                                                                     "jshort_disjoint_arraycopy");
2600     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2601                                                                                     &entry_jshort_arraycopy,
2602                                                                                     "jshort_arraycopy");
2603     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2604                                                                                     "arrayof_jshort_disjoint_arraycopy");
2605     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2606                                                                                     "arrayof_jshort_arraycopy");
2607 
2608     //*** jint
2609     // Aligned versions
2610     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2611                                                                                 "arrayof_jint_disjoint_arraycopy");
2612     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2613                                                                                 "arrayof_jint_arraycopy");
2614     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2615     // entry_jint_arraycopy always points to the unaligned version
2616     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2617                                                                                 "jint_disjoint_arraycopy");
2618     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2619                                                                                 &entry_jint_arraycopy,
2620                                                                                 "jint_arraycopy");
2621 
2622     //*** jlong
2623     // It is always aligned
2624     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2625                                                                                   "arrayof_jlong_disjoint_arraycopy");
2626     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2627                                                                                   "arrayof_jlong_arraycopy");
2628     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2629     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2630 
2631     //*** oops
2632     {
2633       // With compressed oops we need unaligned versions; notice that
2634       // we overwrite entry_oop_arraycopy.
2635       bool aligned = !UseCompressedOops;
2636 
2637       StubRoutines::_arrayof_oop_disjoint_arraycopy
2638         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2639                                      /*dest_uninitialized*/false);
2640       StubRoutines::_arrayof_oop_arraycopy
2641         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2642                                      /*dest_uninitialized*/false);
2643       // Aligned versions without pre-barriers
2644       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2645         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2646                                      /*dest_uninitialized*/true);
2647       StubRoutines::_arrayof_oop_arraycopy_uninit
2648         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2649                                      /*dest_uninitialized*/true);
2650     }
2651 
2652     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2653     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2654     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2655     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2656 
2657     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2658     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2659                                                                         /*dest_uninitialized*/true);
2660 
2661     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2662                                                               entry_jbyte_arraycopy,
2663                                                               entry_jshort_arraycopy,
2664                                                               entry_jint_arraycopy,
2665                                                               entry_jlong_arraycopy);
2666 
2667     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2668                                                                entry_jbyte_arraycopy,
2669                                                                entry_jshort_arraycopy,
2670                                                                entry_jint_arraycopy,
2671                                                                entry_oop_arraycopy,
2672                                                                entry_jlong_arraycopy,
2673                                                                entry_checkcast_arraycopy);
2674 
2675     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2676     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2677     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2678     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2679     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2680     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2681   }
2682 
2683   void generate_math_stubs() { Unimplemented(); }
2684 
2685   // Arguments:
2686   //
2687   // Inputs:
2688   //   c_rarg0   - source byte array address
2689   //   c_rarg1   - destination byte array address
2690   //   c_rarg2   - K (key) in little endian int array
2691   //
2692   address generate_aescrypt_encryptBlock() {
2693     __ align(CodeEntryAlignment);
2694     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2695 
2696     const Register from        = c_rarg0;  // source array address
2697     const Register to          = c_rarg1;  // destination array address
2698     const Register key         = c_rarg2;  // key array address
2699     const Register keylen      = rscratch1;
2700 
2701     address start = __ pc();
2702     __ enter();
2703 
2704     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2705 
2706     __ aesenc_loadkeys(key, keylen);
2707     __ aesecb_encrypt(from, to, keylen);
2708 
2709     __ mov(r0, 0);
2710 
2711     __ leave();
2712     __ ret(lr);
2713 
2714     return start;
2715   }
2716 
2717   // Arguments:
2718   //
2719   // Inputs:
2720   //   c_rarg0   - source byte array address
2721   //   c_rarg1   - destination byte array address
2722   //   c_rarg2   - K (key) in little endian int array
2723   //
2724   address generate_aescrypt_decryptBlock() {
2725     assert(UseAES, "need AES cryptographic extension support");
2726     __ align(CodeEntryAlignment);
2727     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2728     Label L_doLast;
2729 
2730     const Register from        = c_rarg0;  // source array address
2731     const Register to          = c_rarg1;  // destination array address
2732     const Register key         = c_rarg2;  // key array address
2733     const Register keylen      = rscratch1;
2734 
2735     address start = __ pc();
2736     __ enter(); // required for proper stackwalking of RuntimeStub frame
2737 
2738     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2739 
2740     __ aesecb_decrypt(from, to, key, keylen);
2741 
2742     __ mov(r0, 0);
2743 
2744     __ leave();
2745     __ ret(lr);
2746 
2747     return start;
2748   }
2749 
2750   // Arguments:
2751   //
2752   // Inputs:
2753   //   c_rarg0   - source byte array address
2754   //   c_rarg1   - destination byte array address
2755   //   c_rarg2   - K (key) in little endian int array
2756   //   c_rarg3   - r vector byte array address
2757   //   c_rarg4   - input length
2758   //
2759   // Output:
2760   //   r0        - input length
2761   //
2762   address generate_cipherBlockChaining_encryptAESCrypt() {
2763     assert(UseAES, "need AES cryptographic extension support");
2764     __ align(CodeEntryAlignment);
2765     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2766 
2767     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2768 
2769     const Register from        = c_rarg0;  // source array address
2770     const Register to          = c_rarg1;  // destination array address
2771     const Register key         = c_rarg2;  // key array address
2772     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector, and
2773                                            // left holding the last encrypted block on exit
2774     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2775     const Register keylen      = rscratch1;
2776 
2777     address start = __ pc();
2778 
2779       __ enter();
2780 
2781       __ movw(rscratch2, len_reg);
2782 
2783       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2784 
2785       __ ld1(v0, __ T16B, rvec);
2786 
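           // keylen is the number of expanded round-key words: 44, 52 or 60
           // for AES-128, AES-192 and AES-256; the longer keys need the extra
           // round keys loaded below, while AES-128 branches to L_loadkeys_44.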
2787       __ cmpw(keylen, 52);
2788       __ br(Assembler::CC, L_loadkeys_44);
2789       __ br(Assembler::EQ, L_loadkeys_52);
2790 
2791       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2792       __ rev32(v17, __ T16B, v17);
2793       __ rev32(v18, __ T16B, v18);
2794     __ BIND(L_loadkeys_52);
2795       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2796       __ rev32(v19, __ T16B, v19);
2797       __ rev32(v20, __ T16B, v20);
2798     __ BIND(L_loadkeys_44);
2799       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2800       __ rev32(v21, __ T16B, v21);
2801       __ rev32(v22, __ T16B, v22);
2802       __ rev32(v23, __ T16B, v23);
2803       __ rev32(v24, __ T16B, v24);
2804       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2805       __ rev32(v25, __ T16B, v25);
2806       __ rev32(v26, __ T16B, v26);
2807       __ rev32(v27, __ T16B, v27);
2808       __ rev32(v28, __ T16B, v28);
2809       __ ld1(v29, v30, v31, __ T16B, key);
2810       __ rev32(v29, __ T16B, v29);
2811       __ rev32(v30, __ T16B, v30);
2812       __ rev32(v31, __ T16B, v31);
2813 
2814     __ BIND(L_aes_loop);
2815       __ ld1(v1, __ T16B, __ post(from, 16));
2816       __ eor(v0, __ T16B, v0, v1);
2817 
2818       __ br(Assembler::CC, L_rounds_44);
2819       __ br(Assembler::EQ, L_rounds_52);
2820 
2821       __ aese(v0, v17); __ aesmc(v0, v0);
2822       __ aese(v0, v18); __ aesmc(v0, v0);
2823     __ BIND(L_rounds_52);
2824       __ aese(v0, v19); __ aesmc(v0, v0);
2825       __ aese(v0, v20); __ aesmc(v0, v0);
2826     __ BIND(L_rounds_44);
2827       __ aese(v0, v21); __ aesmc(v0, v0);
2828       __ aese(v0, v22); __ aesmc(v0, v0);
2829       __ aese(v0, v23); __ aesmc(v0, v0);
2830       __ aese(v0, v24); __ aesmc(v0, v0);
2831       __ aese(v0, v25); __ aesmc(v0, v0);
2832       __ aese(v0, v26); __ aesmc(v0, v0);
2833       __ aese(v0, v27); __ aesmc(v0, v0);
2834       __ aese(v0, v28); __ aesmc(v0, v0);
2835       __ aese(v0, v29); __ aesmc(v0, v0);
2836       __ aese(v0, v30);
2837       __ eor(v0, __ T16B, v0, v31);
2838 
2839       __ st1(v0, __ T16B, __ post(to, 16));
2840 
2841       __ subw(len_reg, len_reg, 16);
2842       __ cbnzw(len_reg, L_aes_loop);
2843 
2844       __ st1(v0, __ T16B, rvec);
2845 
2846       __ mov(r0, rscratch2);
2847 
2848       __ leave();
2849       __ ret(lr);
2850 
2851     return start;
2852   }
2853 
2854   // Arguments:
2855   //
2856   // Inputs:
2857   //   c_rarg0   - source byte array address
2858   //   c_rarg1   - destination byte array address
2859   //   c_rarg2   - K (key) in little endian int array
2860   //   c_rarg3   - r vector byte array address
2861   //   c_rarg4   - input length
2862   //
2863   // Output:
2864   //   r0        - input length
2865   //
2866   address generate_cipherBlockChaining_decryptAESCrypt() {
2867     assert(UseAES, "need AES cryptographic extension support");
2868     __ align(CodeEntryAlignment);
2869     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2870 
2871     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2872 
2873     const Register from        = c_rarg0;  // source array address
2874     const Register to          = c_rarg1;  // destination array address
2875     const Register key         = c_rarg2;  // key array address
2876     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector, and
2877                                            // left holding the last ciphertext block on exit
2878     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2879     const Register keylen      = rscratch1;
2880 
2881     address start = __ pc();
2882 
2883       __ enter();
2884 
2885       __ movw(rscratch2, len_reg);
2886 
2887       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2888 
2889       __ ld1(v2, __ T16B, rvec);
2890 
2891       __ ld1(v31, __ T16B, __ post(key, 16));
2892       __ rev32(v31, __ T16B, v31);
2893 
2894       __ cmpw(keylen, 52);
2895       __ br(Assembler::CC, L_loadkeys_44);
2896       __ br(Assembler::EQ, L_loadkeys_52);
2897 
2898       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2899       __ rev32(v17, __ T16B, v17);
2900       __ rev32(v18, __ T16B, v18);
2901     __ BIND(L_loadkeys_52);
2902       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2903       __ rev32(v19, __ T16B, v19);
2904       __ rev32(v20, __ T16B, v20);
2905     __ BIND(L_loadkeys_44);
2906       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2907       __ rev32(v21, __ T16B, v21);
2908       __ rev32(v22, __ T16B, v22);
2909       __ rev32(v23, __ T16B, v23);
2910       __ rev32(v24, __ T16B, v24);
2911       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2912       __ rev32(v25, __ T16B, v25);
2913       __ rev32(v26, __ T16B, v26);
2914       __ rev32(v27, __ T16B, v27);
2915       __ rev32(v28, __ T16B, v28);
2916       __ ld1(v29, v30, __ T16B, key);
2917       __ rev32(v29, __ T16B, v29);
2918       __ rev32(v30, __ T16B, v30);
2919 
2920     __ BIND(L_aes_loop);
2921       __ ld1(v0, __ T16B, __ post(from, 16));
2922       __ orr(v1, __ T16B, v0, v0);
2923 
2924       __ br(Assembler::CC, L_rounds_44);
2925       __ br(Assembler::EQ, L_rounds_52);
2926 
2927       __ aesd(v0, v17); __ aesimc(v0, v0);
2928       __ aesd(v0, v18); __ aesimc(v0, v0);
2929     __ BIND(L_rounds_52);
2930       __ aesd(v0, v19); __ aesimc(v0, v0);
2931       __ aesd(v0, v20); __ aesimc(v0, v0);
2932     __ BIND(L_rounds_44);
2933       __ aesd(v0, v21); __ aesimc(v0, v0);
2934       __ aesd(v0, v22); __ aesimc(v0, v0);
2935       __ aesd(v0, v23); __ aesimc(v0, v0);
2936       __ aesd(v0, v24); __ aesimc(v0, v0);
2937       __ aesd(v0, v25); __ aesimc(v0, v0);
2938       __ aesd(v0, v26); __ aesimc(v0, v0);
2939       __ aesd(v0, v27); __ aesimc(v0, v0);
2940       __ aesd(v0, v28); __ aesimc(v0, v0);
2941       __ aesd(v0, v29); __ aesimc(v0, v0);
2942       __ aesd(v0, v30);
2943       __ eor(v0, __ T16B, v0, v31);
2944       __ eor(v0, __ T16B, v0, v2);
2945 
2946       __ st1(v0, __ T16B, __ post(to, 16));
2947       __ orr(v2, __ T16B, v1, v1);
2948 
2949       __ subw(len_reg, len_reg, 16);
2950       __ cbnzw(len_reg, L_aes_loop);
2951 
2952       __ st1(v2, __ T16B, rvec);
2953 
2954       __ mov(r0, rscratch2);
2955 
2956       __ leave();
2957       __ ret(lr);
2958 
2959     return start;
2960   }
2961 
2962   // Big-endian 128-bit + 64-bit -> 128-bit addition.
2963   // Inputs: in (the 128-bit value) and inc (the 64-bit increment); both are
2964   //   preserved, and the lower dword of inc must be zero.
2965   // The least-significant 64-bit word is in the upper dword of each vector.
2966   // Output: result
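       //
       // A rough scalar equivalent of the computation (illustrative only, not
       // generated code):
       //   new_lo = lo + inc;
       //   new_hi = hi + (new_lo < lo ? 1 : 0);   // carry into the high word
       // The vector code derives a carry mask with an unsigned compare
       // (inc >u result), moves it into the most-significant word's lane with
       // EXT, and subtracts the all-ones mask, i.e. adds 1 on overflow.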
2967   void be_add_128_64(FloatRegister result, FloatRegister in,
2968                      FloatRegister inc, FloatRegister tmp) {
2969     assert_different_registers(result, tmp, inc);
2970 
2971     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
2972                                            // input
2973     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
2974     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
2975                                            // MSD == 0 (must be!) to LSD
2976     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
2977   }
2978 
2979   // CTR AES crypt.
2980   // Arguments:
2981   //
2982   // Inputs:
2983   //   c_rarg0   - source byte array address
2984   //   c_rarg1   - destination byte array address
2985   //   c_rarg2   - K (key) in little endian int array
2986   //   c_rarg3   - counter vector byte array address
2987   //   c_rarg4   - input length
2988   //   c_rarg5   - saved encryptedCounter start
2989   //   c_rarg6   - saved used length
2990   //
2991   // Output:
2992   //   r0       - input length
2993   //
2994   address generate_counterMode_AESCrypt() {
2995     const Register in = c_rarg0;
2996     const Register out = c_rarg1;
2997     const Register key = c_rarg2;
2998     const Register counter = c_rarg3;
2999     const Register saved_len = c_rarg4, len = r10;
3000     const Register saved_encrypted_ctr = c_rarg5;
3001     const Register used_ptr = c_rarg6, used = r12;
3002 
3003     const Register offset = r7;
3004     const Register keylen = r11;
3005 
3006     const unsigned char block_size = 16;
3007     const int bulk_width = 4;
3008     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3009     // performance with larger data sizes, but it also means that the
3010     // fast path isn't used until there are at least 8 blocks, and up
3011     // to 127 bytes of data will be processed on the slow path. For
3012     // that reason, and also so as not to blow away too much icache, 4
3013     // blocks seems like a sensible compromise.
3014 
3015     // Algorithm:
3016     //
3017     //    if (len == 0) {
3018     //        goto DONE;
3019     //    }
3020     //    int result = len;
3021     //    do {
3022     //        if (used >= blockSize) {
3023     //            if (len >= bulk_width * blockSize) {
3024     //                CTR_large_block();
3025     //                if (len == 0)
3026     //                    goto DONE;
3027     //            }
3028     //            for (;;) {
3029     //                16ByteVector v0 = counter;
3030     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3031     //                used = 0;
3032     //                if (len < blockSize)
3033     //                    break;    /* goto NEXT */
3034     //                16ByteVector v1 = load16Bytes(in, offset);
3035     //                v1 = v1 ^ encryptedCounter;
3036     //                store16Bytes(out, offset);
3037     //                used = blockSize;
3038     //                offset += blockSize;
3039     //                len -= blockSize;
3040     //                if (len == 0)
3041     //                    goto DONE;
3042     //            }
3043     //        }
3044     //      NEXT:
3045     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3046     //        len--;
3047     //    } while (len != 0);
3048     //  DONE:
3049     //    return result;
3050     //
3051     // CTR_large_block()
3052     //    Wide bulk encryption of whole blocks.
3053 
3054     __ align(CodeEntryAlignment);
3055     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3056     const address start = __ pc();
3057     __ enter();
3058 
3059     Label DONE, CTR_large_block, large_block_return;
3060     __ ldrw(used, Address(used_ptr));
3061     __ cbzw(saved_len, DONE);
3062 
3063     __ mov(len, saved_len);
3064     __ mov(offset, 0);
3065 
3066     // Compute #rounds for AES based on the length of the key array
3067     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3068 
3069     __ aesenc_loadkeys(key, keylen);
3070 
3071     {
3072       Label L_CTR_loop, NEXT;
3073 
3074       __ bind(L_CTR_loop);
3075 
3076       __ cmp(used, block_size);
3077       __ br(__ LO, NEXT);
3078 
3079       // Maybe we have a lot of data
3080       __ subsw(rscratch1, len, bulk_width * block_size);
3081       __ br(__ HS, CTR_large_block);
3082       __ BIND(large_block_return);
3083       __ cbzw(len, DONE);
3084 
3085       // Setup the counter
3086       __ movi(v4, __ T4S, 0);
3087       __ movi(v5, __ T4S, 1);
3088       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3089 
3090       // 128-bit big-endian increment
3091       __ ld1(v0, __ T16B, counter);
3092       __ rev64(v16, __ T16B, v0);
3093       be_add_128_64(v16, v16, v4, /*tmp*/v5);
3094       __ rev64(v16, __ T16B, v16);
3095       __ st1(v16, __ T16B, counter);
3096       // Previous counter value is in v0
3097       // v4 contains { 0, 1 }
3098 
3099       {
3100         // We have fewer than bulk_width blocks of data left. Encrypt
3101         // them one by one until there is less than a full block
3102         // remaining, being careful to save both the encrypted counter
3103         // and the counter.
3104 
3105         Label inner_loop;
3106         __ bind(inner_loop);
3107         // Counter to encrypt is in v0
3108         __ aesecb_encrypt(noreg, noreg, keylen);
3109         __ st1(v0, __ T16B, saved_encrypted_ctr);
3110 
3111         // Do we have a remaining full block?
3112 
3113         __ mov(used, 0);
3114         __ cmp(len, block_size);
3115         __ br(__ LO, NEXT);
3116 
3117         // Yes, we have a full block
3118         __ ldrq(v1, Address(in, offset));
3119         __ eor(v1, __ T16B, v1, v0);
3120         __ strq(v1, Address(out, offset));
3121         __ mov(used, block_size);
3122         __ add(offset, offset, block_size);
3123 
3124         __ subw(len, len, block_size);
3125         __ cbzw(len, DONE);
3126 
3127         // Increment the counter, store it back
3128         __ orr(v0, __ T16B, v16, v16);
3129         __ rev64(v16, __ T16B, v16);
3130         be_add_128_64(v16, v16, v4, /*tmp*/v5);
3131         __ rev64(v16, __ T16B, v16);
3132         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3133 
3134         __ b(inner_loop);
3135       }
3136 
3137       __ BIND(NEXT);
3138 
3139       // Encrypt a single byte, and loop.
3140       // We expect this to be a rare event.
3141       __ ldrb(rscratch1, Address(in, offset));
3142       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3143       __ eor(rscratch1, rscratch1, rscratch2);
3144       __ strb(rscratch1, Address(out, offset));
3145       __ add(offset, offset, 1);
3146       __ add(used, used, 1);
3147       __ subw(len, len, 1);
3148       __ cbnzw(len, L_CTR_loop);
3149     }
3150 
3151     __ bind(DONE);
3152     __ strw(used, Address(used_ptr));
3153     __ mov(r0, saved_len);
3154 
3155     __ leave(); // required for proper stackwalking of RuntimeStub frame
3156     __ ret(lr);
3157 
3158     // Bulk encryption
3159 
3160     __ BIND (CTR_large_block);
3161     assert(bulk_width == 4 || bulk_width == 8, "must be");
3162 
3163     if (bulk_width == 8) {
3164       __ sub(sp, sp, 4 * 16);
3165       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3166     }
3167     __ sub(sp, sp, 4 * 16);
3168     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3169     RegSet saved_regs = (RegSet::of(in, out, offset)
3170                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3171     __ push(saved_regs, sp);
3172     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3173     __ add(in, in, offset);
3174     __ add(out, out, offset);
3175 
3176     // Keys should already be loaded into the correct registers
3177 
3178     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3179     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3180 
3181     // AES/CTR loop
3182     {
3183       Label L_CTR_loop;
3184       __ BIND(L_CTR_loop);
3185 
3186       // Setup the counters
3187       __ movi(v8, __ T4S, 0);
3188       __ movi(v9, __ T4S, 1);
3189       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3190 
3191       for (int i = 0; i < bulk_width; i++) {
3192         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3193         __ rev64(v0_ofs, __ T16B, v16);
3194         be_add_128_64(v16, v16, v8, /*tmp*/v9);
3195       }
3196 
3197       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3198 
3199       // Encrypt the counters
3200       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3201 
3202       if (bulk_width == 8) {
3203         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3204       }
3205 
3206       // XOR the encrypted counters with the inputs
3207       for (int i = 0; i < bulk_width; i++) {
3208         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3209         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3210         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3211       }
3212 
3213       // Write the encrypted data
3214       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3215       if (bulk_width == 8) {
3216         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3217       }
3218 
3219       __ subw(len, len, 16 * bulk_width);
3220       __ cbnzw(len, L_CTR_loop);
3221     }
3222 
3223     // Save the counter back where it goes
3224     __ rev64(v16, __ T16B, v16);
3225     __ st1(v16, __ T16B, counter);
3226 
3227     __ pop(saved_regs, sp);
3228 
3229     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3230     if (bulk_width == 8) {
3231       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3232     }
3233 
3234     __ andr(rscratch1, len, -16 * bulk_width);
3235     __ sub(len, len, rscratch1);
3236     __ add(offset, offset, rscratch1);
3237     __ mov(used, 16);
3238     __ strw(used, Address(used_ptr));
3239     __ b(large_block_return);
3240 
3241     return start;
3242   }
3243 
3244   // Vector AES Galois Counter Mode implementation. Parameters:
3245   //
3246   // in = c_rarg0
3247   // len = c_rarg1
3248   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3249   // out = c_rarg3
3250   // key = c_rarg4
3251   // state = c_rarg5 - GHASH.state
3252   // subkeyHtbl = c_rarg6 - powers of H
3253   // counter = c_rarg7 - 16 bytes of CTR
3254   // return - number of processed bytes
3255   address generate_galoisCounterMode_AESCrypt() {
3256     address ghash_polynomial = __ pc();
3257     __ emit_int64(0x87);  // The low-order bits of the field
3258                           // polynomial (i.e. p = z^7+z^2+z+1)
3259                           // repeated in the low and high parts of a
3260                           // 128-bit vector
3261     __ emit_int64(0x87);
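         // (The full GHASH field polynomial is x^128 + x^7 + x^2 + x + 1;
         //  only its low-order bits, 0x87, are needed for the reduction.)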
3262 
3263     __ align(CodeEntryAlignment);
3264     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3265     address start = __ pc();
3266     __ enter();
3267 
3268     const Register in = c_rarg0;
3269     const Register len = c_rarg1;
3270     const Register ct = c_rarg2;
3271     const Register out = c_rarg3;
3272     // and updated with the incremented counter in the end
3273 
3274     const Register key = c_rarg4;
3275     const Register state = c_rarg5;
3276 
3277     const Register subkeyHtbl = c_rarg6;
3278 
3279     const Register counter = c_rarg7;
3280 
3281     const Register keylen = r10;
3282     // Save state before entering routine
3283     __ sub(sp, sp, 4 * 16);
3284     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3285     __ sub(sp, sp, 4 * 16);
3286     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3287 
3288     // __ andr(len, len, -512);
3289     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3290     __ str(len, __ pre(sp, -2 * wordSize));
3291 
3292     Label DONE;
3293     __ cbz(len, DONE);
3294 
3295     // Compute #rounds for AES based on the length of the key array
3296     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3297 
3298     __ aesenc_loadkeys(key, keylen);
3299     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3300     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3301 
3302     // AES/CTR loop
3303     {
3304       Label L_CTR_loop;
3305       __ BIND(L_CTR_loop);
3306 
3307       // Setup the counters
3308       __ movi(v8, __ T4S, 0);
3309       __ movi(v9, __ T4S, 1);
3310       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
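           // GCM's inc32 increments only the low 32 bits of the counter block
           // (wrapping modulo 2^32), so a plain 4S add with no carry
           // propagation between words is sufficient here.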
3311 
3312       assert(v0->encoding() < v8->encoding(), "");
3313       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3314         FloatRegister f = as_FloatRegister(i);
3315         __ rev32(f, __ T16B, v16);
3316         __ addv(v16, __ T4S, v16, v8);
3317       }
3318 
3319       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3320 
3321       // Encrypt the counters
3322       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3323 
3324       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3325 
3326       // XOR the encrypted counters with the inputs
3327       for (int i = 0; i < 8; i++) {
3328         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3329         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3330         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3331       }
3332       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3333       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3334 
3335       __ subw(len, len, 16 * 8);
3336       __ cbnzw(len, L_CTR_loop);
3337     }
3338 
3339     __ rev32(v16, __ T16B, v16);
3340     __ st1(v16, __ T16B, counter);
3341 
3342     __ ldr(len, Address(sp));
3343     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3344 
3345     // GHASH/CTR loop
3346     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3347                                 len, /*unrolls*/4);
3348 
3349 #ifdef ASSERT
3350     { Label L;
3351       __ cmp(len, (unsigned char)0);
3352       __ br(Assembler::EQ, L);
3353       __ stop("stubGenerator: abort");
3354       __ bind(L);
3355     }
3356 #endif
3357 
3358     __ bind(DONE);
3359     // Return the number of bytes processed
3360     __ ldr(r0, __ post(sp, 2 * wordSize));
3361 
3362     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3363     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3364 
3365     __ leave(); // required for proper stackwalking of RuntimeStub frame
3366     __ ret(lr);
3367     return start;
3368   }
3369 
3370   class Cached64Bytes {
3371   private:
3372     MacroAssembler *_masm;
3373     Register _regs[8];
3374 
3375   public:
3376     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3377       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3378       auto it = rs.begin();
3379       for (auto &r: _regs) {
3380         r = *it;
3381         ++it;
3382       }
3383     }
3384 
3385     void gen_loads(Register base) {
3386       for (int i = 0; i < 8; i += 2) {
3387         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3388       }
3389     }
3390 
3391     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
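         // For example, extract_u32(dest, 5) yields bits [32, 64) of _regs[2],
         // i.e. the sixth little-endian 32-bit word of the cached block.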
3392     void extract_u32(Register dest, int i) {
3393       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3394     }
3395   };
3396 
3397   // Utility routines for md5.
3398   // Clobbers r10 and r11.
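       //
       // Each helper below computes one MD5 step (see RFC 1321):
       //   r1 = r2 + rotate_left(r1 + f(r2, r3, r4) + X[k] + t, s)
       // where f is, respectively:
       //   FF: (r2 & r3) | (~r2 & r4)        GG: (r2 & r4) | (r3 & ~r4)
       //   HH: r2 ^ r3 ^ r4                  II: r3 ^ (r2 | ~r4)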
3399   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3400               int k, int s, int t) {
3401     Register rscratch3 = r10;
3402     Register rscratch4 = r11;
3403 
3404     __ eorw(rscratch3, r3, r4);
3405     __ movw(rscratch2, t);
3406     __ andw(rscratch3, rscratch3, r2);
3407     __ addw(rscratch4, r1, rscratch2);
3408     reg_cache.extract_u32(rscratch1, k);
3409     __ eorw(rscratch3, rscratch3, r4);
3410     __ addw(rscratch4, rscratch4, rscratch1);
3411     __ addw(rscratch3, rscratch3, rscratch4);
3412     __ rorw(rscratch2, rscratch3, 32 - s);
3413     __ addw(r1, rscratch2, r2);
3414   }
3415 
3416   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3417               int k, int s, int t) {
3418     Register rscratch3 = r10;
3419     Register rscratch4 = r11;
3420 
3421     __ andw(rscratch3, r2, r4);
3422     __ bicw(rscratch4, r3, r4);
3423     reg_cache.extract_u32(rscratch1, k);
3424     __ movw(rscratch2, t);
3425     __ orrw(rscratch3, rscratch3, rscratch4);
3426     __ addw(rscratch4, r1, rscratch2);
3427     __ addw(rscratch4, rscratch4, rscratch1);
3428     __ addw(rscratch3, rscratch3, rscratch4);
3429     __ rorw(rscratch2, rscratch3, 32 - s);
3430     __ addw(r1, rscratch2, r2);
3431   }
3432 
3433   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3434               int k, int s, int t) {
3435     Register rscratch3 = r10;
3436     Register rscratch4 = r11;
3437 
3438     __ eorw(rscratch3, r3, r4);
3439     __ movw(rscratch2, t);
3440     __ addw(rscratch4, r1, rscratch2);
3441     reg_cache.extract_u32(rscratch1, k);
3442     __ eorw(rscratch3, rscratch3, r2);
3443     __ addw(rscratch4, rscratch4, rscratch1);
3444     __ addw(rscratch3, rscratch3, rscratch4);
3445     __ rorw(rscratch2, rscratch3, 32 - s);
3446     __ addw(r1, rscratch2, r2);
3447   }
3448 
3449   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3450               int k, int s, int t) {
3451     Register rscratch3 = r10;
3452     Register rscratch4 = r11;
3453 
3454     __ movw(rscratch3, t);
3455     __ ornw(rscratch2, r2, r4);
3456     __ addw(rscratch4, r1, rscratch3);
3457     reg_cache.extract_u32(rscratch1, k);
3458     __ eorw(rscratch3, rscratch2, r3);
3459     __ addw(rscratch4, rscratch4, rscratch1);
3460     __ addw(rscratch3, rscratch3, rscratch4);
3461     __ rorw(rscratch2, rscratch3, 32 - s);
3462     __ addw(r1, rscratch2, r2);
3463   }
3464 
3465   // Arguments:
3466   //
3467   // Inputs:
3468   //   c_rarg0   - byte[]  source+offset
3469   //   c_rarg1   - int[]   MD5.state
3470   //   c_rarg2   - int     offset
3471   //   c_rarg3   - int     limit
3472   //
3473   address generate_md5_implCompress(bool multi_block, const char *name) {
3474     __ align(CodeEntryAlignment);
3475     StubCodeMark mark(this, "StubRoutines", name);
3476     address start = __ pc();
3477 
3478     Register buf       = c_rarg0;
3479     Register state     = c_rarg1;
3480     Register ofs       = c_rarg2;
3481     Register limit     = c_rarg3;
3482     Register a         = r4;
3483     Register b         = r5;
3484     Register c         = r6;
3485     Register d         = r7;
3486     Register rscratch3 = r10;
3487     Register rscratch4 = r11;
3488 
3489     Register state_regs[2] = { r12, r13 };
3490     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3491     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3492 
3493     __ push(saved_regs, sp);
3494 
3495     __ ldp(state_regs[0], state_regs[1], Address(state));
3496     __ ubfx(a, state_regs[0],  0, 32);
3497     __ ubfx(b, state_regs[0], 32, 32);
3498     __ ubfx(c, state_regs[1],  0, 32);
3499     __ ubfx(d, state_regs[1], 32, 32);
3500 
3501     Label md5_loop;
3502     __ BIND(md5_loop);
3503 
3504     reg_cache.gen_loads(buf);
3505 
3506     // Round 1
3507     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3508     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3509     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3510     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3511     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3512     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3513     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3514     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3515     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3516     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3517     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3518     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3519     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3520     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3521     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3522     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3523 
3524     // Round 2
3525     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3526     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3527     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3528     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3529     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3530     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3531     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3532     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3533     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3534     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3535     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3536     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3537     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3538     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3539     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3540     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3541 
3542     // Round 3
3543     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3544     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3545     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3546     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3547     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3548     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3549     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3550     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3551     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3552     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3553     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3554     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3555     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3556     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3557     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3558     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3559 
3560     // Round 4
3561     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3562     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3563     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3564     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3565     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3566     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3567     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3568     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3569     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3570     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3571     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3572     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3573     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3574     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3575     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3576     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3577 
3578     __ addw(a, state_regs[0], a);
3579     __ ubfx(rscratch2, state_regs[0], 32, 32);
3580     __ addw(b, rscratch2, b);
3581     __ addw(c, state_regs[1], c);
3582     __ ubfx(rscratch4, state_regs[1], 32, 32);
3583     __ addw(d, rscratch4, d);
3584 
3585     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3586     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3587 
3588     if (multi_block) {
3589       __ add(buf, buf, 64);
3590       __ add(ofs, ofs, 64);
3591       __ cmp(ofs, limit);
3592       __ br(Assembler::LE, md5_loop);
3593       __ mov(c_rarg0, ofs); // return ofs
3594     }
3595 
3596     // write hash values back in the correct order
3597     __ stp(state_regs[0], state_regs[1], Address(state));
3598 
3599     __ pop(saved_regs, sp);
3600 
3601     __ ret(lr);
3602 
3603     return start;
3604   }
3605 
3606   // Arguments:
3607   //
3608   // Inputs:
3609   //   c_rarg0   - byte[]  source+offset
3610   //   c_rarg1   - int[]   SHA.state
3611   //   c_rarg2   - int     offset
3612   //   c_rarg3   - int     limit
3613   //
3614   address generate_sha1_implCompress(bool multi_block, const char *name) {
3615     __ align(CodeEntryAlignment);
3616     StubCodeMark mark(this, "StubRoutines", name);
3617     address start = __ pc();
3618 
3619     Register buf   = c_rarg0;
3620     Register state = c_rarg1;
3621     Register ofs   = c_rarg2;
3622     Register limit = c_rarg3;
3623 
3624     Label keys;
3625     Label sha1_loop;
3626 
3627     // load the keys into v0..v3
3628     __ adr(rscratch1, keys);
3629     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3630     // load 5 words state into v6, v7
3631     __ ldrq(v6, Address(state, 0));
3632     __ ldrs(v7, Address(state, 16));
3633 
3634 
3635     __ BIND(sha1_loop);
3636     // load 64 bytes of data into v16..v19
3637     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3638     __ rev32(v16, __ T16B, v16);
3639     __ rev32(v17, __ T16B, v17);
3640     __ rev32(v18, __ T16B, v18);
3641     __ rev32(v19, __ T16B, v19);
3642 
3643     // do the sha1
3644     __ addv(v4, __ T4S, v16, v0);
3645     __ orr(v20, __ T16B, v6, v6);
3646 
3647     FloatRegister d0 = v16;
3648     FloatRegister d1 = v17;
3649     FloatRegister d2 = v18;
3650     FloatRegister d3 = v19;
3651 
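         // Each iteration below covers 4 of the 80 SHA-1 rounds: sha1su0 and
         // sha1su1 extend the message schedule, sha1h saves the rotated 'e'
         // word, and sha1c/sha1p/sha1m apply the Ch, Parity and Maj round
         // functions for rounds 0-19, 20-39/60-79 and 40-59 respectively.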
3652     for (int round = 0; round < 20; round++) {
3653       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3654       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3655       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3656       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3657       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3658 
3659       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3660       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3661       __ sha1h(tmp2, __ T4S, v20);
3662       if (round < 5)
3663         __ sha1c(v20, __ T4S, tmp3, tmp4);
3664       else if (round < 10 || round >= 15)
3665         __ sha1p(v20, __ T4S, tmp3, tmp4);
3666       else
3667         __ sha1m(v20, __ T4S, tmp3, tmp4);
3668       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3669 
3670       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3671     }
3672 
3673     __ addv(v7, __ T2S, v7, v21);
3674     __ addv(v6, __ T4S, v6, v20);
3675 
3676     if (multi_block) {
3677       __ add(ofs, ofs, 64);
3678       __ cmp(ofs, limit);
3679       __ br(Assembler::LE, sha1_loop);
3680       __ mov(c_rarg0, ofs); // return ofs
3681     }
3682 
3683     __ strq(v6, Address(state, 0));
3684     __ strs(v7, Address(state, 16));
3685 
3686     __ ret(lr);
3687 
3688     __ bind(keys);
3689     __ emit_int32(0x5a827999);
3690     __ emit_int32(0x6ed9eba1);
3691     __ emit_int32(0x8f1bbcdc);
3692     __ emit_int32(0xca62c1d6);
3693 
3694     return start;
3695   }
3696 
3697 
3698   // Arguments:
3699   //
3700   // Inputs:
3701   //   c_rarg0   - byte[]  source+offset
3702   //   c_rarg1   - int[]   SHA.state
3703   //   c_rarg2   - int     offset
3704   //   c_rarg3   - int     limit
3705   //
3706   address generate_sha256_implCompress(bool multi_block, const char *name) {
3707     static const uint32_t round_consts[64] = {
3708       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3709       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3710       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3711       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3712       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3713       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3714       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3715       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3716       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3717       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3718       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3719       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3720       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3721       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3722       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3723       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3724     };
3725     __ align(CodeEntryAlignment);
3726     StubCodeMark mark(this, "StubRoutines", name);
3727     address start = __ pc();
3728 
3729     Register buf   = c_rarg0;
3730     Register state = c_rarg1;
3731     Register ofs   = c_rarg2;
3732     Register limit = c_rarg3;
3733 
3734     Label sha1_loop;
3735 
3736     __ stpd(v8, v9, __ pre(sp, -32));
3737     __ stpd(v10, v11, Address(sp, 16));
3738 
3739 // dga == v0
3740 // dgb == v1
3741 // dg0 == v2
3742 // dg1 == v3
3743 // dg2 == v4
3744 // t0 == v6
3745 // t1 == v7
3746 
3747     // load 16 keys to v16..v31
3748     __ lea(rscratch1, ExternalAddress((address)round_consts));
3749     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3750     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3751     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3752     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3753 
3754     // load 8 words (256 bits) state
3755     __ ldpq(v0, v1, state);
3756 
3757     __ BIND(sha1_loop);
3758     // load 64 bytes of data into v8..v11
3759     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3760     __ rev32(v8, __ T16B, v8);
3761     __ rev32(v9, __ T16B, v9);
3762     __ rev32(v10, __ T16B, v10);
3763     __ rev32(v11, __ T16B, v11);
3764 
3765     __ addv(v6, __ T4S, v8, v16);
3766     __ orr(v2, __ T16B, v0, v0);
3767     __ orr(v3, __ T16B, v1, v1);
3768 
3769     FloatRegister d0 = v8;
3770     FloatRegister d1 = v9;
3771     FloatRegister d2 = v10;
3772     FloatRegister d3 = v11;
3773 
3774 
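         // Each iteration below covers 4 of the 64 SHA-256 rounds:
         // sha256su0/sha256su1 extend the message schedule while sha256h and
         // sha256h2 update the two 128-bit halves of the working state.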
3775     for (int round = 0; round < 16; round++) {
3776       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3777       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3778       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3779       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3780 
3781       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3782        __ orr(v4, __ T16B, v2, v2);
3783       if (round < 15)
3784         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3785       __ sha256h(v2, __ T4S, v3, tmp2);
3786       __ sha256h2(v3, __ T4S, v4, tmp2);
3787       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3788 
3789       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3790     }
3791 
3792     __ addv(v0, __ T4S, v0, v2);
3793     __ addv(v1, __ T4S, v1, v3);
3794 
3795     if (multi_block) {
3796       __ add(ofs, ofs, 64);
3797       __ cmp(ofs, limit);
3798       __ br(Assembler::LE, sha1_loop);
3799       __ mov(c_rarg0, ofs); // return ofs
3800     }
3801 
3802     __ ldpd(v10, v11, Address(sp, 16));
3803     __ ldpd(v8, v9, __ post(sp, 32));
3804 
3805     __ stpq(v0, v1, state);
3806 
3807     __ ret(lr);
3808 
3809     return start;
3810   }
3811 
3812   // Double rounds for sha512.
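       // Each call performs two of the 80 SHA-512 rounds using sha512h and
       // sha512h2, loads the next round-constant pair while dr < 36, and
       // extends the message schedule with sha512su0/sha512su1 while dr < 32
       // (the final 16 rounds need no new schedule words).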
3813   void sha512_dround(int dr,
3814                      FloatRegister vi0, FloatRegister vi1,
3815                      FloatRegister vi2, FloatRegister vi3,
3816                      FloatRegister vi4, FloatRegister vrc0,
3817                      FloatRegister vrc1, FloatRegister vin0,
3818                      FloatRegister vin1, FloatRegister vin2,
3819                      FloatRegister vin3, FloatRegister vin4) {
3820       if (dr < 36) {
3821         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3822       }
3823       __ addv(v5, __ T2D, vrc0, vin0);
3824       __ ext(v6, __ T16B, vi2, vi3, 8);
3825       __ ext(v5, __ T16B, v5, v5, 8);
3826       __ ext(v7, __ T16B, vi1, vi2, 8);
3827       __ addv(vi3, __ T2D, vi3, v5);
3828       if (dr < 32) {
3829         __ ext(v5, __ T16B, vin3, vin4, 8);
3830         __ sha512su0(vin0, __ T2D, vin1);
3831       }
3832       __ sha512h(vi3, __ T2D, v6, v7);
3833       if (dr < 32) {
3834         __ sha512su1(vin0, __ T2D, vin2, v5);
3835       }
3836       __ addv(vi4, __ T2D, vi1, vi3);
3837       __ sha512h2(vi3, __ T2D, vi1, vi0);
3838   }
3839 
3840   // Arguments:
3841   //
3842   // Inputs:
3843   //   c_rarg0   - byte[]  source+offset
3844   //   c_rarg1   - long[]  SHA.state
3845   //   c_rarg2   - int     offset
3846   //   c_rarg3   - int     limit
3847   //
3848   address generate_sha512_implCompress(bool multi_block, const char *name) {
3849     static const uint64_t round_consts[80] = {
3850       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3851       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3852       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3853       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3854       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3855       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3856       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3857       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3858       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3859       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3860       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3861       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3862       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3863       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3864       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3865       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3866       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3867       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3868       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3869       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3870       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3871       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3872       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3873       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3874       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3875       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3876       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3877     };
3878 
3879     __ align(CodeEntryAlignment);
3880     StubCodeMark mark(this, "StubRoutines", name);
3881     address start = __ pc();
3882 
3883     Register buf   = c_rarg0;
3884     Register state = c_rarg1;
3885     Register ofs   = c_rarg2;
3886     Register limit = c_rarg3;
3887 
3888     __ stpd(v8, v9, __ pre(sp, -64));
3889     __ stpd(v10, v11, Address(sp, 16));
3890     __ stpd(v12, v13, Address(sp, 32));
3891     __ stpd(v14, v15, Address(sp, 48));
3892 
3893     Label sha512_loop;
3894 
3895     // load state
3896     __ ld1(v8, v9, v10, v11, __ T2D, state);
3897 
3898     // load first 4 round constants
3899     __ lea(rscratch1, ExternalAddress((address)round_consts));
3900     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3901 
3902     __ BIND(sha512_loop);
3903     // load 128B of data into v12..v19
3904     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3905     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3906     __ rev64(v12, __ T16B, v12);
3907     __ rev64(v13, __ T16B, v13);
3908     __ rev64(v14, __ T16B, v14);
3909     __ rev64(v15, __ T16B, v15);
3910     __ rev64(v16, __ T16B, v16);
3911     __ rev64(v17, __ T16B, v17);
3912     __ rev64(v18, __ T16B, v18);
3913     __ rev64(v19, __ T16B, v19);
3914 
3915     __ mov(rscratch2, rscratch1);
3916 
3917     __ mov(v0, __ T16B, v8);
3918     __ mov(v1, __ T16B, v9);
3919     __ mov(v2, __ T16B, v10);
3920     __ mov(v3, __ T16B, v11);
3921 
3922     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3923     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3924     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3925     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3926     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3927     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3928     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3929     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3930     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3931     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3932     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3933     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3934     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3935     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3936     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3937     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3938     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3939     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3940     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3941     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3942     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3943     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3944     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3945     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3946     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3947     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3948     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3949     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3950     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3951     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3952     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3953     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3954     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3955     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3956     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3957     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3958     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3959     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3960     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3961     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3962 
3963     __ addv(v8, __ T2D, v8, v0);
3964     __ addv(v9, __ T2D, v9, v1);
3965     __ addv(v10, __ T2D, v10, v2);
3966     __ addv(v11, __ T2D, v11, v3);
3967 
3968     if (multi_block) {
3969       __ add(ofs, ofs, 128);
3970       __ cmp(ofs, limit);
3971       __ br(Assembler::LE, sha512_loop);
3972       __ mov(c_rarg0, ofs); // return ofs
3973     }
3974 
3975     __ st1(v8, v9, v10, v11, __ T2D, state);
3976 
3977     __ ldpd(v14, v15, Address(sp, 48));
3978     __ ldpd(v12, v13, Address(sp, 32));
3979     __ ldpd(v10, v11, Address(sp, 16));
3980     __ ldpd(v8, v9, __ post(sp, 64));
3981 
3982     __ ret(lr);
3983 
3984     return start;
3985   }
3986 
3987   // Arguments:
3988   //
3989   // Inputs:
3990   //   c_rarg0   - byte[]  source+offset
3991   //   c_rarg1   - byte[]  SHA.state
3992   //   c_rarg2   - int     block_size
3993   //   c_rarg3   - int     offset
3994   //   c_rarg4   - int     limit
3995   //
3996   address generate_sha3_implCompress(bool multi_block, const char *name) {
3997     static const uint64_t round_consts[24] = {
3998       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3999       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4000       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4001       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4002       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4003       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4004       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4005       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4006     };
4007 
4008     __ align(CodeEntryAlignment);
4009     StubCodeMark mark(this, "StubRoutines", name);
4010     address start = __ pc();
4011 
4012     Register buf           = c_rarg0;
4013     Register state         = c_rarg1;
4014     Register block_size    = c_rarg2;
4015     Register ofs           = c_rarg3;
4016     Register limit         = c_rarg4;
4017 
4018     Label sha3_loop, rounds24_loop;
4019     Label sha3_512_or_sha3_384, shake128;
4020 
4021     __ stpd(v8, v9, __ pre(sp, -64));
4022     __ stpd(v10, v11, Address(sp, 16));
4023     __ stpd(v12, v13, Address(sp, 32));
4024     __ stpd(v14, v15, Address(sp, 48));
4025 
4026     // load state
4027     __ add(rscratch1, state, 32);
4028     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
4029     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
4030     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4031     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4032     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4033     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4034     __ ld1(v24, __ T1D, rscratch1);
4035 
4036     __ BIND(sha3_loop);
4037 
4038     // 24 keccak rounds
4039     __ movw(rscratch2, 24);
4040 
4041     // load round_constants base
4042     __ lea(rscratch1, ExternalAddress((address) round_consts));
4043 
4044     // load input
4045     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4046     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4047     __ eor(v0, __ T8B, v0, v25);
4048     __ eor(v1, __ T8B, v1, v26);
4049     __ eor(v2, __ T8B, v2, v27);
4050     __ eor(v3, __ T8B, v3, v28);
4051     __ eor(v4, __ T8B, v4, v29);
4052     __ eor(v5, __ T8B, v5, v30);
4053     __ eor(v6, __ T8B, v6, v31);
4054 
4055     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4056     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4057 
4058     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4059     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4060     __ eor(v7, __ T8B, v7, v25);
4061     __ eor(v8, __ T8B, v8, v26);
4062     __ eor(v9, __ T8B, v9, v27);
4063     __ eor(v10, __ T8B, v10, v28);
4064     __ eor(v11, __ T8B, v11, v29);
4065     __ eor(v12, __ T8B, v12, v30);
4066     __ eor(v13, __ T8B, v13, v31);
4067 
4068     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4069     __ eor(v14, __ T8B, v14, v25);
4070     __ eor(v15, __ T8B, v15, v26);
4071     __ eor(v16, __ T8B, v16, v27);
4072 
4073     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4074     __ andw(c_rarg5, block_size, 48);
4075     __ cbzw(c_rarg5, rounds24_loop);
4076 
4077     __ tbnz(block_size, 5, shake128);
4078     // block_size == 144, bit5 == 0, SHA3-224
4079     __ ldrd(v28, __ post(buf, 8));
4080     __ eor(v17, __ T8B, v17, v28);
4081     __ b(rounds24_loop);
4082 
4083     __ BIND(shake128);
4084     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4085     __ eor(v17, __ T8B, v17, v28);
4086     __ eor(v18, __ T8B, v18, v29);
4087     __ eor(v19, __ T8B, v19, v30);
4088     __ eor(v20, __ T8B, v20, v31);
4089     __ b(rounds24_loop); // block_size == 168, SHAKE128
4090 
4091     __ BIND(sha3_512_or_sha3_384);
4092     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4093     __ eor(v7, __ T8B, v7, v25);
4094     __ eor(v8, __ T8B, v8, v26);
4095     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4096 
4097     // SHA3-384
4098     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4099     __ eor(v9,  __ T8B, v9,  v27);
4100     __ eor(v10, __ T8B, v10, v28);
4101     __ eor(v11, __ T8B, v11, v29);
4102     __ eor(v12, __ T8B, v12, v30);
4103 
4104     __ BIND(rounds24_loop);
4105     __ subw(rscratch2, rscratch2, 1);
4106 
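         // One Keccak-f[1600] round: the eor3/rax1 sequence computes the
         // theta parities, the xar instructions fold them in and apply the
         // rho/pi rotations, bcax implements chi, and the final eor with the
         // loaded round constant is iota.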
4107     __ eor3(v29, __ T16B, v4, v9, v14);
4108     __ eor3(v26, __ T16B, v1, v6, v11);
4109     __ eor3(v28, __ T16B, v3, v8, v13);
4110     __ eor3(v25, __ T16B, v0, v5, v10);
4111     __ eor3(v27, __ T16B, v2, v7, v12);
4112     __ eor3(v29, __ T16B, v29, v19, v24);
4113     __ eor3(v26, __ T16B, v26, v16, v21);
4114     __ eor3(v28, __ T16B, v28, v18, v23);
4115     __ eor3(v25, __ T16B, v25, v15, v20);
4116     __ eor3(v27, __ T16B, v27, v17, v22);
4117 
4118     __ rax1(v30, __ T2D, v29, v26);
4119     __ rax1(v26, __ T2D, v26, v28);
4120     __ rax1(v28, __ T2D, v28, v25);
4121     __ rax1(v25, __ T2D, v25, v27);
4122     __ rax1(v27, __ T2D, v27, v29);
4123 
4124     __ eor(v0, __ T16B, v0, v30);
4125     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4126     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4127     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4128     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4129     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4130     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4131     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4132     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4133     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4134     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4135     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4136     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4137     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4138     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4139     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4140     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4141     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4142     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4143     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4144     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4145     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4146     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4147     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4148     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4149 
4150     __ bcax(v20, __ T16B, v31, v22, v8);
4151     __ bcax(v21, __ T16B, v8,  v23, v22);
4152     __ bcax(v22, __ T16B, v22, v24, v23);
4153     __ bcax(v23, __ T16B, v23, v31, v24);
4154     __ bcax(v24, __ T16B, v24, v8,  v31);
4155 
4156     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4157 
4158     __ bcax(v17, __ T16B, v25, v19, v3);
4159     __ bcax(v18, __ T16B, v3,  v15, v19);
4160     __ bcax(v19, __ T16B, v19, v16, v15);
4161     __ bcax(v15, __ T16B, v15, v25, v16);
4162     __ bcax(v16, __ T16B, v16, v3,  v25);
4163 
4164     __ bcax(v10, __ T16B, v29, v12, v26);
4165     __ bcax(v11, __ T16B, v26, v13, v12);
4166     __ bcax(v12, __ T16B, v12, v14, v13);
4167     __ bcax(v13, __ T16B, v13, v29, v14);
4168     __ bcax(v14, __ T16B, v14, v26, v29);
4169 
4170     __ bcax(v7, __ T16B, v30, v9,  v4);
4171     __ bcax(v8, __ T16B, v4,  v5,  v9);
4172     __ bcax(v9, __ T16B, v9,  v6,  v5);
4173     __ bcax(v5, __ T16B, v5,  v30, v6);
4174     __ bcax(v6, __ T16B, v6,  v4,  v30);
4175 
4176     __ bcax(v3, __ T16B, v27, v0,  v28);
4177     __ bcax(v4, __ T16B, v28, v1,  v0);
4178     __ bcax(v0, __ T16B, v0,  v2,  v1);
4179     __ bcax(v1, __ T16B, v1,  v27, v2);
4180     __ bcax(v2, __ T16B, v2,  v28, v27);
4181 
4182     __ eor(v0, __ T16B, v0, v31);
4183 
4184     __ cbnzw(rscratch2, rounds24_loop);
4185 
4186     if (multi_block) {
4187       __ add(ofs, ofs, block_size);
4188       __ cmp(ofs, limit);
4189       __ br(Assembler::LE, sha3_loop);
4190       __ mov(c_rarg0, ofs); // return ofs
4191     }
4192 
4193     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4194     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4195     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4196     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4197     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4198     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4199     __ st1(v24, __ T1D, state);
4200 
4201     __ ldpd(v14, v15, Address(sp, 48));
4202     __ ldpd(v12, v13, Address(sp, 32));
4203     __ ldpd(v10, v11, Address(sp, 16));
4204     __ ldpd(v8, v9, __ post(sp, 64));
4205 
4206     __ ret(lr);
4207 
4208     return start;
4209   }
4210 
4211   /**
4212    *  Arguments:
4213    *
4214    * Inputs:
4215    *   c_rarg0   - int crc
4216    *   c_rarg1   - byte* buf
4217    *   c_rarg2   - int length
4218    *
4219    * Output:
4220    *       r0    - int crc result
4221    */
4222   address generate_updateBytesCRC32() {
4223     assert(UseCRC32Intrinsics, "what are we doing here?");
4224 
4225     __ align(CodeEntryAlignment);
4226     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4227 
4228     address start = __ pc();
4229 
4230     const Register crc   = c_rarg0;  // crc
4231     const Register buf   = c_rarg1;  // source java byte array address
4232     const Register len   = c_rarg2;  // length
4233     const Register table0 = c_rarg3; // crc_table address
4234     const Register table1 = c_rarg4;
4235     const Register table2 = c_rarg5;
4236     const Register table3 = c_rarg6;
4237     const Register tmp3 = c_rarg7;
4238 
4239     BLOCK_COMMENT("Entry:");
4240     __ enter(); // required for proper stackwalking of RuntimeStub frame
4241 
4242     __ kernel_crc32(crc, buf, len,
4243               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4244 
4245     __ leave(); // required for proper stackwalking of RuntimeStub frame
4246     __ ret(lr);
4247 
4248     return start;
4249   }
4250 
4251   // ChaCha20 block function.  This version parallelizes by loading
4252   // individual 32-bit state elements into vectors for four blocks
4253   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4254   //
4255   // state (int[16]) = c_rarg0
4256   // keystream (byte[1024]) = c_rarg1
4257   // return - number of bytes of keystream (always 256)
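       //
       // For reference, one scalar ChaCha20 quarter round (RFC 7539), which
       // cc20_quarter_round applies lane-wise to all four blocks at once
       // (an illustrative sketch only, not emitted by this stub):
       //   a += b; d ^= a; d = rotl32(d, 16);
       //   c += d; b ^= c; b = rotl32(b, 12);
       //   a += b; d ^= a; d = rotl32(d,  8);
       //   c += d; b ^= c; b = rotl32(b,  7);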
4258   address generate_chacha20Block_blockpar() {
4259     Label L_twoRounds, L_cc20_const;
4260     // The constant data is broken into two 128-bit segments to be loaded
4261     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4262     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4263     // The second 128 bits are a table constant used for 8-bit left rotations.
4264     __ BIND(L_cc20_const);
4265     __ emit_int64(0x0000000100000000UL);
4266     __ emit_int64(0x0000000300000002UL);
4267     __ emit_int64(0x0605040702010003UL);
4268     __ emit_int64(0x0E0D0C0F0A09080BUL);
4269 
4270     __ align(CodeEntryAlignment);
4271     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4272     address start = __ pc();
4273     __ enter();
4274 
4275     int i, j;
4276     const Register state = c_rarg0;
4277     const Register keystream = c_rarg1;
4278     const Register loopCtr = r10;
4279     const Register tmpAddr = r11;
4280 
4281     const FloatRegister stateFirst = v0;
4282     const FloatRegister stateSecond = v1;
4283     const FloatRegister stateThird = v2;
4284     const FloatRegister stateFourth = v3;
4285     const FloatRegister origCtrState = v28;
4286     const FloatRegister scratch = v29;
4287     const FloatRegister lrot8Tbl = v30;
4288 
4289     // Organize SIMD registers in an array that facilitates
4290     // putting repetitive opcodes into loop structures.  It is
4291     // important that each grouping of 4 registers is monotonically
4292     // increasing to support the requirements of multi-register
4293     // instructions (e.g. ld4r, st4, etc.)
4294     const FloatRegister workSt[16] = {
4295          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4296         v20, v21, v22, v23, v24, v25, v26, v27
4297     };
4298 
4299     // Load from memory and interlace across 16 SIMD registers,
4300     // with each word from memory broadcast to all lanes of
4301     // each successive SIMD register.
4302     //      Addr(0) -> All lanes in workSt[i]
4303     //      Addr(4) -> All lanes workSt[i + 1], etc.
4304     __ mov(tmpAddr, state);
4305     for (i = 0; i < 16; i += 4) {
4306       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4307           __ post(tmpAddr, 16));
4308     }
4309 
4310     // Pull in constant data.  The first 16 bytes are the add overlay
4311     // which is applied to the vector holding the counter (state[12]).
4312     // The second 16 bytes is the index register for the 8-bit left
4313     // rotation tbl instruction.
4314     __ adr(tmpAddr, L_cc20_const);
4315     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4316     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4317 
4318     // Set up the 10 iteration loop and perform all 8 quarter round ops
4319     __ mov(loopCtr, 10);
4320     __ BIND(L_twoRounds);
4321 
4322     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4323         scratch, lrot8Tbl);
4324     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4325         scratch, lrot8Tbl);
4326     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4327         scratch, lrot8Tbl);
4328     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4329         scratch, lrot8Tbl);
4330 
4331     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4332         scratch, lrot8Tbl);
4333     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4334         scratch, lrot8Tbl);
4335     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4336         scratch, lrot8Tbl);
4337     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4338         scratch, lrot8Tbl);
4339 
4340     // Decrement and iterate
4341     __ sub(loopCtr, loopCtr, 1);
4342     __ cbnz(loopCtr, L_twoRounds);
4343 
4344     __ mov(tmpAddr, state);
4345 
4346     // Add the starting state back to the post-loop keystream
4347     // state.  We read/interlace the state array from memory into
4348     // 4 registers similar to what we did in the beginning.  Then
4349     // add the counter overlay onto workSt[12] at the end.
4350     for (i = 0; i < 16; i += 4) {
4351       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4352           __ post(tmpAddr, 16));
4353       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4354       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4355       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4356       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4357     }
4358     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4359 
4360     // Write to key stream, storing the same element out of workSt[0..15]
4361     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4362     // for the next element position.
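         // Concretely, lane i of workSt[0..15] holds block i's 16 state words,
         // so each outer iteration below emits one complete 64-byte block and
         // the 256-byte keystream ends up laid out as block0 || block1 ||
         // block2 || block3.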
4363     for (i = 0; i < 4; i++) {
4364       for (j = 0; j < 16; j += 4) {
4365         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4366             __ post(keystream, 16));
4367       }
4368     }
4369 
4370     __ mov(r0, 256);             // Return length of output keystream
4371     __ leave();
4372     __ ret(lr);
4373 
4374     return start;
4375   }
4376 
4377   /**
4378    *  Arguments:
4379    *
4380    * Inputs:
4381    *   c_rarg0   - int crc
4382    *   c_rarg1   - byte* buf
4383    *   c_rarg2   - int length
4384    *   c_rarg3   - int* table
4385    *
4386    * Output:
4387    *       r0   - int crc result
4388    */
4389   address generate_updateBytesCRC32C() {
4390     assert(UseCRC32CIntrinsics, "what are we doing here?");
4391 
4392     __ align(CodeEntryAlignment);
4393     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4394 
4395     address start = __ pc();
4396 
4397     const Register crc   = c_rarg0;  // crc
4398     const Register buf   = c_rarg1;  // source java byte array address
4399     const Register len   = c_rarg2;  // length
4400     const Register table0 = c_rarg3; // crc_table address
4401     const Register table1 = c_rarg4;
4402     const Register table2 = c_rarg5;
4403     const Register table3 = c_rarg6;
4404     const Register tmp3 = c_rarg7;
4405 
4406     BLOCK_COMMENT("Entry:");
4407     __ enter(); // required for proper stackwalking of RuntimeStub frame
4408 
4409     __ kernel_crc32c(crc, buf, len,
4410               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4411 
4412     __ leave(); // required for proper stackwalking of RuntimeStub frame
4413     __ ret(lr);
4414 
4415     return start;
4416   }
4417 
4418   /***
4419    *  Arguments:
4420    *
4421    *  Inputs:
4422    *   c_rarg0   - int   adler
4423    *   c_rarg1   - byte* buff
4424    *   c_rarg2   - int   len
4425    *
4426    * Output:
4427    *   c_rarg0   - int adler result
4428    */
4429   address generate_updateBytesAdler32() {
4430     __ align(CodeEntryAlignment);
4431     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4432     address start = __ pc();
4433 
4434     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4435 
4436     // Aliases
4437     Register adler  = c_rarg0;
4438     Register s1     = c_rarg0;
4439     Register s2     = c_rarg3;
4440     Register buff   = c_rarg1;
4441     Register len    = c_rarg2;
4442     Register nmax  = r4;
4443     Register base  = r5;
4444     Register count = r6;
4445     Register temp0 = rscratch1;
4446     Register temp1 = rscratch2;
4447     FloatRegister vbytes = v0;
4448     FloatRegister vs1acc = v1;
4449     FloatRegister vs2acc = v2;
4450     FloatRegister vtable = v3;
4451 
4452     // Max number of bytes we can process before having to take the mod
4453     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4454     uint64_t BASE = 0xfff1;
4455     uint64_t NMAX = 0x15B0;
4456 
4457     __ mov(base, BASE);
4458     __ mov(nmax, NMAX);
4459 
4460     // Load accumulation coefficients for the upper 16 bits
4461     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4462     __ ld1(vtable, __ T16B, Address(temp0));
4463 
4464     // s1 is initialized to the lower 16 bits of adler
4465     // s2 is initialized to the upper 16 bits of adler
4466     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4467     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4468 
4469     // The pipelined loop needs at least 16 elements for one iteration.
4470     // It checks this itself, but it is more efficient to skip straight to the cleanup loop here.
4471     __ cmp(len, (u1)16);
4472     __ br(Assembler::HS, L_nmax);
4473     __ cbz(len, L_combine);
4474 
4475     __ bind(L_simple_by1_loop);
4476     __ ldrb(temp0, Address(__ post(buff, 1)));
4477     __ add(s1, s1, temp0);
4478     __ add(s2, s2, s1);
4479     __ subs(len, len, 1);
4480     __ br(Assembler::HI, L_simple_by1_loop);
4481 
4482     // s1 = s1 % BASE
4483     __ subs(temp0, s1, base);
4484     __ csel(s1, temp0, s1, Assembler::HS);
4485 
4486     // s2 = s2 % BASE
4487     __ lsr(temp0, s2, 16);
4488     __ lsl(temp1, temp0, 4);
4489     __ sub(temp1, temp1, temp0);
4490     __ add(s2, temp1, s2, ext::uxth);
4491 
4492     __ subs(temp0, s2, base);
4493     __ csel(s2, temp0, s2, Assembler::HS);
4494 
4495     __ b(L_combine);
4496 
4497     __ bind(L_nmax);
4498     __ subs(len, len, nmax);
4499     __ sub(count, nmax, 16);
4500     __ br(Assembler::LO, L_by16);
4501 
4502     __ bind(L_nmax_loop);
4503 
4504     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4505                                       vbytes, vs1acc, vs2acc, vtable);
4506 
4507     __ subs(count, count, 16);
4508     __ br(Assembler::HS, L_nmax_loop);
4509 
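         // The reductions below rely on 2^16 mod 65521 == 15, so
         //   x == (x >> 16) * 65536 + (x & 0xffff)
         //     == (x >> 16) * 15    + (x & 0xffff)   (mod BASE).
         // Each lsr/lsl/sub/add group computes hi * 15 + lo; applying it twice
         // brings the sum below 2 * BASE, and the trailing subs/csel finishes
         // the reduction with a single conditional subtract.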
4510     // s1 = s1 % BASE
4511     __ lsr(temp0, s1, 16);
4512     __ lsl(temp1, temp0, 4);
4513     __ sub(temp1, temp1, temp0);
4514     __ add(temp1, temp1, s1, ext::uxth);
4515 
4516     __ lsr(temp0, temp1, 16);
4517     __ lsl(s1, temp0, 4);
4518     __ sub(s1, s1, temp0);
4519     __ add(s1, s1, temp1, ext::uxth);
4520 
4521     __ subs(temp0, s1, base);
4522     __ csel(s1, temp0, s1, Assembler::HS);
4523 
4524     // s2 = s2 % BASE
4525     __ lsr(temp0, s2, 16);
4526     __ lsl(temp1, temp0, 4);
4527     __ sub(temp1, temp1, temp0);
4528     __ add(temp1, temp1, s2, ext::uxth);
4529 
4530     __ lsr(temp0, temp1, 16);
4531     __ lsl(s2, temp0, 4);
4532     __ sub(s2, s2, temp0);
4533     __ add(s2, s2, temp1, ext::uxth);
4534 
4535     __ subs(temp0, s2, base);
4536     __ csel(s2, temp0, s2, Assembler::HS);
4537 
4538     __ subs(len, len, nmax);
4539     __ sub(count, nmax, 16);
4540     __ br(Assembler::HS, L_nmax_loop);
4541 
4542     __ bind(L_by16);
4543     __ adds(len, len, count);
4544     __ br(Assembler::LO, L_by1);
4545 
4546     __ bind(L_by16_loop);
4547 
4548     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4549                                       vbytes, vs1acc, vs2acc, vtable);
4550 
4551     __ subs(len, len, 16);
4552     __ br(Assembler::HS, L_by16_loop);
4553 
4554     __ bind(L_by1);
4555     __ adds(len, len, 15);
4556     __ br(Assembler::LO, L_do_mod);
4557 
4558     __ bind(L_by1_loop);
4559     __ ldrb(temp0, Address(__ post(buff, 1)));
4560     __ add(s1, temp0, s1);
4561     __ add(s2, s2, s1);
4562     __ subs(len, len, 1);
4563     __ br(Assembler::HS, L_by1_loop);
4564 
4565     __ bind(L_do_mod);
4566     // s1 = s1 % BASE
4567     __ lsr(temp0, s1, 16);
4568     __ lsl(temp1, temp0, 4);
4569     __ sub(temp1, temp1, temp0);
4570     __ add(temp1, temp1, s1, ext::uxth);
4571 
4572     __ lsr(temp0, temp1, 16);
4573     __ lsl(s1, temp0, 4);
4574     __ sub(s1, s1, temp0);
4575     __ add(s1, s1, temp1, ext::uxth);
4576 
4577     __ subs(temp0, s1, base);
4578     __ csel(s1, temp0, s1, Assembler::HS);
4579 
4580     // s2 = s2 % BASE
4581     __ lsr(temp0, s2, 16);
4582     __ lsl(temp1, temp0, 4);
4583     __ sub(temp1, temp1, temp0);
4584     __ add(temp1, temp1, s2, ext::uxth);
4585 
4586     __ lsr(temp0, temp1, 16);
4587     __ lsl(s2, temp0, 4);
4588     __ sub(s2, s2, temp0);
4589     __ add(s2, s2, temp1, ext::uxth);
4590 
4591     __ subs(temp0, s2, base);
4592     __ csel(s2, temp0, s2, Assembler::HS);
4593 
4594     // Combine lower bits and higher bits
4595     __ bind(L_combine);
4596     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4597 
4598     __ ret(lr);
4599 
4600     return start;
4601   }
4602 
4603   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4604           Register temp0, Register temp1, FloatRegister vbytes,
4605           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4606     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4607     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4608     // In non-vectorized code, we update s1 and s2 as:
4609     //   s1 <- s1 + b1
4610     //   s2 <- s2 + s1
4611     //   s1 <- s1 + b2
4612     //   s2 <- s2 + s1
4613     //   ...
4614     //   s1 <- s1 + b16
4615     //   s2 <- s2 + s1
4616     // Putting above assignments together, we have:
4617     //   s1_new = s1 + b1 + b2 + ... + b16
4618     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4619     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4620     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
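         //
         // A scalar model of one 16-byte chunk (illustrative only; b[] is a
         // hypothetical name for the bytes loaded below, not a register):
         //   for (int i = 0; i < 16; i++) { s1 += b[i]; s2 += s1; }
         // which the vector code below evaluates as
         //   s2 += 16 * s1; s1 += sum(b); s2 += dot(b, {16, 15, ..., 1});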
4621     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4622 
4623     // s2 = s2 + s1 * 16
4624     __ add(s2, s2, s1, Assembler::LSL, 4);
4625 
4626     // vs1acc = b1 + b2 + b3 + ... + b16
4627     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4628     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4629     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4630     __ uaddlv(vs1acc, __ T16B, vbytes);
4631     __ uaddlv(vs2acc, __ T8H, vs2acc);
4632 
4633     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4634     __ fmovd(temp0, vs1acc);
4635     __ fmovd(temp1, vs2acc);
4636     __ add(s1, s1, temp0);
4637     __ add(s2, s2, temp1);
4638   }
4639 
4640   /**
4641    *  Arguments:
4642    *
4643    *  Input:
4644    *    c_rarg0   - x address
4645    *    c_rarg1   - x length
4646    *    c_rarg2   - y address
4647    *    c_rarg3   - y length
4648    *    c_rarg4   - z address
4649    */
4650   address generate_multiplyToLen() {
4651     __ align(CodeEntryAlignment);
4652     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4653 
4654     address start = __ pc();
4655  
4656     if (SCCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) {
4657       return start;
4658     }
4659     const Register x     = r0;
4660     const Register xlen  = r1;
4661     const Register y     = r2;
4662     const Register ylen  = r3;
4663     const Register z     = r4;
4664 
4665     const Register tmp0  = r5;
4666     const Register tmp1  = r10;
4667     const Register tmp2  = r11;
4668     const Register tmp3  = r12;
4669     const Register tmp4  = r13;
4670     const Register tmp5  = r14;
4671     const Register tmp6  = r15;
4672     const Register tmp7  = r16;
4673 
4674     BLOCK_COMMENT("Entry:");
4675     __ enter(); // required for proper stackwalking of RuntimeStub frame
4676     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4677     __ leave(); // required for proper stackwalking of RuntimeStub frame
4678     __ ret(lr);
4679 
4680     SCCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start);
4681     return start;
4682   }
4683 
4684   address generate_squareToLen() {
4685     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
4686     // faster than multiply_to_len on some CPUs and slower on others, but
4687     // multiply_to_len shows slightly better results overall.
4688     __ align(CodeEntryAlignment);
4689     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4690     address start = __ pc();
4691 
4692     if (SCCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) {
4693       return start;
4694     }
4695     const Register x     = r0;
4696     const Register xlen  = r1;
4697     const Register z     = r2;
4698     const Register y     = r4; // == x
4699     const Register ylen  = r5; // == xlen
4700 
4701     const Register tmp0  = r3;
4702     const Register tmp1  = r10;
4703     const Register tmp2  = r11;
4704     const Register tmp3  = r12;
4705     const Register tmp4  = r13;
4706     const Register tmp5  = r14;
4707     const Register tmp6  = r15;
4708     const Register tmp7  = r16;
4709 
4710     RegSet spilled_regs = RegSet::of(y, ylen);
4711     BLOCK_COMMENT("Entry:");
4712     __ enter();
4713     __ push(spilled_regs, sp);
4714     __ mov(y, x);
4715     __ mov(ylen, xlen);
4716     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4717     __ pop(spilled_regs, sp);
4718     __ leave();
4719     __ ret(lr);
4720 
4721     SCCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start);
4722     return start;
4723   }
4724 
4725   address generate_mulAdd() {
4726     __ align(CodeEntryAlignment);
4727     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4728 
4729     address start = __ pc();
4730 
4731     if (SCCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) {
4732       return start;
4733     }
4734     const Register out     = r0;
4735     const Register in      = r1;
4736     const Register offset  = r2;
4737     const Register len     = r3;
4738     const Register k       = r4;
4739 
4740     BLOCK_COMMENT("Entry:");
4741     __ enter();
4742     __ mul_add(out, in, offset, len, k);
4743     __ leave();
4744     __ ret(lr);
4745 
4746     SCCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start);
4747     return start;
4748   }
4749 
4750   // Arguments:
4751   //
4752   // Input:
4753   //   c_rarg0   - newArr address
4754   //   c_rarg1   - oldArr address
4755   //   c_rarg2   - newIdx
4756   //   c_rarg3   - shiftCount
4757   //   c_rarg4   - numIter
4758   //
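       // Per output word this computes, with all words treated as unsigned
       // 32-bit values (a sketch of the intent, not the Java library source):
       //   newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)
       //                      | (oldArr[i]     << (32 - shiftCount));
       // with i walked from numIter - 1 down to 0.
       //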
4759   address generate_bigIntegerRightShift() {
4760     __ align(CodeEntryAlignment);
4761     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4762     address start = __ pc();
4763 
4764     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4765 
4766     Register newArr        = c_rarg0;
4767     Register oldArr        = c_rarg1;
4768     Register newIdx        = c_rarg2;
4769     Register shiftCount    = c_rarg3;
4770     Register numIter       = c_rarg4;
4771     Register idx           = numIter;
4772 
4773     Register newArrCur     = rscratch1;
4774     Register shiftRevCount = rscratch2;
4775     Register oldArrCur     = r13;
4776     Register oldArrNext    = r14;
4777 
4778     FloatRegister oldElem0        = v0;
4779     FloatRegister oldElem1        = v1;
4780     FloatRegister newElem         = v2;
4781     FloatRegister shiftVCount     = v3;
4782     FloatRegister shiftVRevCount  = v4;
4783 
4784     __ cbz(idx, Exit);
4785 
4786     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4787 
4788     // left shift count
4789     __ movw(shiftRevCount, 32);
4790     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4791 
4792     // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
4793     __ cmp(numIter, (u1)4);
4794     __ br(Assembler::LT, ShiftThree);
4795 
4796     __ dup(shiftVCount,    __ T4S, shiftCount);
4797     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4798     __ negr(shiftVCount,   __ T4S, shiftVCount);
4799 
4800     __ BIND(ShiftSIMDLoop);
4801 
4802     // Calculate the load addresses
4803     __ sub(idx, idx, 4);
4804     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4805     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4806     __ add(oldArrCur,  oldArrNext, 4);
4807 
4808     // Load 4 words and process
4809     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4810     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4811     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4812     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4813     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4814     __ st1(newElem,   __ T4S,  Address(newArrCur));
4815 
4816     __ cmp(idx, (u1)4);
4817     __ br(Assembler::LT, ShiftTwoLoop);
4818     __ b(ShiftSIMDLoop);
4819 
4820     __ BIND(ShiftTwoLoop);
4821     __ cbz(idx, Exit);
4822     __ cmp(idx, (u1)1);
4823     __ br(Assembler::EQ, ShiftOne);
4824 
4825     // Calculate the load addresses
4826     __ sub(idx, idx, 2);
4827     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4828     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4829     __ add(oldArrCur,  oldArrNext, 4);
4830 
4831     // Load 2 words and process
4832     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4833     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4834     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4835     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4836     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4837     __ st1(newElem,   __ T2S, Address(newArrCur));
4838     __ b(ShiftTwoLoop);
4839 
4840     __ BIND(ShiftThree);
4841     __ tbz(idx, 1, ShiftOne);
4842     __ tbz(idx, 0, ShiftTwo);
4843     __ ldrw(r10,  Address(oldArr, 12));
4844     __ ldrw(r11,  Address(oldArr, 8));
4845     __ lsrvw(r10, r10, shiftCount);
4846     __ lslvw(r11, r11, shiftRevCount);
4847     __ orrw(r12,  r10, r11);
4848     __ strw(r12,  Address(newArr, 8));
4849 
4850     __ BIND(ShiftTwo);
4851     __ ldrw(r10,  Address(oldArr, 8));
4852     __ ldrw(r11,  Address(oldArr, 4));
4853     __ lsrvw(r10, r10, shiftCount);
4854     __ lslvw(r11, r11, shiftRevCount);
4855     __ orrw(r12,  r10, r11);
4856     __ strw(r12,  Address(newArr, 4));
4857 
4858     __ BIND(ShiftOne);
4859     __ ldrw(r10,  Address(oldArr, 4));
4860     __ ldrw(r11,  Address(oldArr));
4861     __ lsrvw(r10, r10, shiftCount);
4862     __ lslvw(r11, r11, shiftRevCount);
4863     __ orrw(r12,  r10, r11);
4864     __ strw(r12,  Address(newArr));
4865 
4866     __ BIND(Exit);
4867     __ ret(lr);
4868 
4869     return start;
4870   }
4871 
4872   // Arguments:
4873   //
4874   // Input:
4875   //   c_rarg0   - newArr address
4876   //   c_rarg1   - oldArr address
4877   //   c_rarg2   - newIdx
4878   //   c_rarg3   - shiftCount
4879   //   c_rarg4   - numIter
4880   //
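       // Per output word this computes, with all words treated as unsigned
       // 32-bit values (a sketch of the intent, not the Java library source):
       //   newArr[newIdx + i] = (oldArr[i]     << shiftCount)
       //                      | (oldArr[i + 1] >> (32 - shiftCount));
       // with i walked upward from 0 to numIter - 1.
       //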
4881   address generate_bigIntegerLeftShift() {
4882     __ align(CodeEntryAlignment);
4883     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4884     address start = __ pc();
4885 
4886     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4887 
4888     Register newArr        = c_rarg0;
4889     Register oldArr        = c_rarg1;
4890     Register newIdx        = c_rarg2;
4891     Register shiftCount    = c_rarg3;
4892     Register numIter       = c_rarg4;
4893 
4894     Register shiftRevCount = rscratch1;
4895     Register oldArrNext    = rscratch2;
4896 
4897     FloatRegister oldElem0        = v0;
4898     FloatRegister oldElem1        = v1;
4899     FloatRegister newElem         = v2;
4900     FloatRegister shiftVCount     = v3;
4901     FloatRegister shiftVRevCount  = v4;
4902 
4903     __ cbz(numIter, Exit);
4904 
4905     __ add(oldArrNext, oldArr, 4);
4906     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4907 
4908     // right shift count
4909     __ movw(shiftRevCount, 32);
4910     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4911 
4912     // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
4913     __ cmp(numIter, (u1)4);
4914     __ br(Assembler::LT, ShiftThree);
4915 
4916     __ dup(shiftVCount,     __ T4S, shiftCount);
4917     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4918     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4919 
4920     __ BIND(ShiftSIMDLoop);
4921 
4922     // load 4 words and process
4923     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4924     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4925     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4926     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4927     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4928     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4929     __ sub(numIter,   numIter, 4);
4930 
4931     __ cmp(numIter, (u1)4);
4932     __ br(Assembler::LT, ShiftTwoLoop);
4933     __ b(ShiftSIMDLoop);
4934 
4935     __ BIND(ShiftTwoLoop);
4936     __ cbz(numIter, Exit);
4937     __ cmp(numIter, (u1)1);
4938     __ br(Assembler::EQ, ShiftOne);
4939 
4940     // load 2 words and process
4941     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4942     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4943     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4944     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4945     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4946     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4947     __ sub(numIter,   numIter, 2);
4948     __ b(ShiftTwoLoop);
4949 
4950     __ BIND(ShiftThree);
4951     __ ldrw(r10,  __ post(oldArr, 4));
4952     __ ldrw(r11,  __ post(oldArrNext, 4));
4953     __ lslvw(r10, r10, shiftCount);
4954     __ lsrvw(r11, r11, shiftRevCount);
4955     __ orrw(r12,  r10, r11);
4956     __ strw(r12,  __ post(newArr, 4));
4957     __ tbz(numIter, 1, Exit);
4958     __ tbz(numIter, 0, ShiftOne);
4959 
4960     __ BIND(ShiftTwo);
4961     __ ldrw(r10,  __ post(oldArr, 4));
4962     __ ldrw(r11,  __ post(oldArrNext, 4));
4963     __ lslvw(r10, r10, shiftCount);
4964     __ lsrvw(r11, r11, shiftRevCount);
4965     __ orrw(r12,  r10, r11);
4966     __ strw(r12,  __ post(newArr, 4));
4967 
4968     __ BIND(ShiftOne);
4969     __ ldrw(r10,  Address(oldArr));
4970     __ ldrw(r11,  Address(oldArrNext));
4971     __ lslvw(r10, r10, shiftCount);
4972     __ lsrvw(r11, r11, shiftRevCount);
4973     __ orrw(r12,  r10, r11);
4974     __ strw(r12,  Address(newArr));
4975 
4976     __ BIND(Exit);
4977     __ ret(lr);
4978 
4979     return start;
4980   }
4981 
4982   address generate_count_positives(address &count_positives_long) {
4983     const u1 large_loop_size = 64;
4984     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
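         // A signed byte is negative exactly when its top bit is set, so testing
         // an 8-byte word (or the OR of several words) against UPPER_BIT_MASK
         // detects any negative byte without inspecting the bytes individually.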
4985     int dcache_line = VM_Version::dcache_line_size();
4986 
4987     Register ary1 = r1, len = r2, result = r0;
4988 
4989     __ align(CodeEntryAlignment);
4990 
4991     StubCodeMark mark(this, "StubRoutines", "count_positives");
4992 
4993     address entry = __ pc();
4994 
4995     __ enter();
4996     // precondition: a copy of len is already in result
4997     // __ mov(result, len);
4998 
4999   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
5000         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
5001 
5002   __ cmp(len, (u1)15);
5003   __ br(Assembler::GT, LEN_OVER_15);
5004   // The only case when execution falls into this code is when the pointer is near
5005   // the end of a memory page and we have to avoid reading the next page
5006   __ add(ary1, ary1, len);
5007   __ subs(len, len, 8);
5008   __ br(Assembler::GT, LEN_OVER_8);
5009   __ ldr(rscratch2, Address(ary1, -8));
5010   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
5011   __ lsrv(rscratch2, rscratch2, rscratch1);
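       // The 8-byte load above ends at the last array byte; its low-order bytes
       // come from before the array, and the right shift by (8 - length) * 8 bits
       // discards them, leaving only the valid array bytes to be tested.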
5012   __ tst(rscratch2, UPPER_BIT_MASK);
5013   __ csel(result, zr, result, Assembler::NE);
5014   __ leave();
5015   __ ret(lr);
5016   __ bind(LEN_OVER_8);
5017   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
5018   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
5019   __ tst(rscratch2, UPPER_BIT_MASK);
5020   __ br(Assembler::NE, RET_NO_POP);
5021   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
5022   __ lsrv(rscratch1, rscratch1, rscratch2);
5023   __ tst(rscratch1, UPPER_BIT_MASK);
5024   __ bind(RET_NO_POP);
5025   __ csel(result, zr, result, Assembler::NE);
5026   __ leave();
5027   __ ret(lr);
5028 
5029   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
5030   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
5031 
5032   count_positives_long = __ pc(); // 2nd entry point
5033 
5034   __ enter();
5035 
5036   __ bind(LEN_OVER_15);
5037     __ push(spilled_regs, sp);
5038     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
5039     __ cbz(rscratch2, ALIGNED);
5040     __ ldp(tmp6, tmp1, Address(ary1));
5041     __ mov(tmp5, 16);
5042     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
5043     __ add(ary1, ary1, rscratch1);
5044     __ orr(tmp6, tmp6, tmp1);
5045     __ tst(tmp6, UPPER_BIT_MASK);
5046     __ br(Assembler::NE, RET_ADJUST);
5047     __ sub(len, len, rscratch1);
5048 
5049   __ bind(ALIGNED);
5050     __ cmp(len, large_loop_size);
5051     __ br(Assembler::LT, CHECK_16);
5052     // Perform a 16-byte load as an early return in the pre-loop, to handle the case
5053     // where an initially aligned large array has negative values in its starting bytes:
5054     // otherwise LARGE_LOOP would do 4 reads instead of 1 (in the worst case), which is
5055     // slower. Cases with negative bytes further ahead are barely affected; in fact they
5056     // become faster due to the early loads, fewer instructions and fewer branches in
5057     // LARGE_LOOP.
5058     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5059     __ sub(len, len, 16);
5060     __ orr(tmp6, tmp6, tmp1);
5061     __ tst(tmp6, UPPER_BIT_MASK);
5062     __ br(Assembler::NE, RET_ADJUST_16);
5063     __ cmp(len, large_loop_size);
5064     __ br(Assembler::LT, CHECK_16);
5065 
5066     if (SoftwarePrefetchHintDistance >= 0
5067         && SoftwarePrefetchHintDistance >= dcache_line) {
5068       // initial prefetch
5069       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5070     }
5071   __ bind(LARGE_LOOP);
5072     if (SoftwarePrefetchHintDistance >= 0) {
5073       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5074     }
5075     // Issue the load instructions first, since this can save a few CPU/MEM cycles. Also,
5076     // instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp), it is
5077     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
5078     // instructions per iteration and has fewer branches. The downside is that it disables
5079     // the early return, so all 64 bytes are loaded and checked every time.
5080     __ ldp(tmp2, tmp3, Address(ary1));
5081     __ ldp(tmp4, tmp5, Address(ary1, 16));
5082     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5083     __ ldp(tmp6, tmp1, Address(ary1, 48));
5084     __ add(ary1, ary1, large_loop_size);
5085     __ sub(len, len, large_loop_size);
5086     __ orr(tmp2, tmp2, tmp3);
5087     __ orr(tmp4, tmp4, tmp5);
5088     __ orr(rscratch1, rscratch1, rscratch2);
5089     __ orr(tmp6, tmp6, tmp1);
5090     __ orr(tmp2, tmp2, tmp4);
5091     __ orr(rscratch1, rscratch1, tmp6);
5092     __ orr(tmp2, tmp2, rscratch1);
5093     __ tst(tmp2, UPPER_BIT_MASK);
5094     __ br(Assembler::NE, RET_ADJUST_LONG);
5095     __ cmp(len, large_loop_size);
5096     __ br(Assembler::GE, LARGE_LOOP);
5097 
5098   __ bind(CHECK_16); // small 16-byte load pre-loop
5099     __ cmp(len, (u1)16);
5100     __ br(Assembler::LT, POST_LOOP16);
5101 
5102   __ bind(LOOP16); // small 16-byte load loop
5103     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5104     __ sub(len, len, 16);
5105     __ orr(tmp2, tmp2, tmp3);
5106     __ tst(tmp2, UPPER_BIT_MASK);
5107     __ br(Assembler::NE, RET_ADJUST_16);
5108     __ cmp(len, (u1)16);
5109     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5110 
5111   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5112     __ cmp(len, (u1)8);
5113     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5114     __ ldr(tmp3, Address(__ post(ary1, 8)));
5115     __ tst(tmp3, UPPER_BIT_MASK);
5116     __ br(Assembler::NE, RET_ADJUST);
5117     __ sub(len, len, 8);
5118 
5119   __ bind(POST_LOOP16_LOAD_TAIL);
5120     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5121     __ ldr(tmp1, Address(ary1));
5122     __ mov(tmp2, 64);
5123     __ sub(tmp4, tmp2, len, __ LSL, 3);
5124     __ lslv(tmp1, tmp1, tmp4);
5125     __ tst(tmp1, UPPER_BIT_MASK);
5126     __ br(Assembler::NE, RET_ADJUST);
5127     // Fallthrough
5128 
5129   __ bind(RET_LEN);
5130     __ pop(spilled_regs, sp);
5131     __ leave();
5132     __ ret(lr);
5133 
5134     // The difference (result - len) is the count of bytes guaranteed
5135     // to be positive
5136 
5137   __ bind(RET_ADJUST_LONG);
5138     __ add(len, len, (u1)(large_loop_size - 16));
5139   __ bind(RET_ADJUST_16);
5140     __ add(len, len, 16);
5141   __ bind(RET_ADJUST);
5142     __ pop(spilled_regs, sp);
5143     __ leave();
5144     __ sub(result, result, len);
5145     __ ret(lr);
5146 
5147     return entry;
5148   }
5149 
5150   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5151         bool usePrefetch, Label &NOT_EQUAL) {
5152     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5153         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5154         tmp7 = r12, tmp8 = r13;
5155     Label LOOP;
5156 
5157     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5158     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5159     __ bind(LOOP);
5160     if (usePrefetch) {
5161       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5162       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5163     }
5164     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5165     __ eor(tmp1, tmp1, tmp2);
5166     __ eor(tmp3, tmp3, tmp4);
5167     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5168     __ orr(tmp1, tmp1, tmp3);
5169     __ cbnz(tmp1, NOT_EQUAL);
5170     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5171     __ eor(tmp5, tmp5, tmp6);
5172     __ eor(tmp7, tmp7, tmp8);
5173     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5174     __ orr(tmp5, tmp5, tmp7);
5175     __ cbnz(tmp5, NOT_EQUAL);
5176     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5177     __ eor(tmp1, tmp1, tmp2);
5178     __ eor(tmp3, tmp3, tmp4);
5179     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5180     __ orr(tmp1, tmp1, tmp3);
5181     __ cbnz(tmp1, NOT_EQUAL);
5182     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5183     __ eor(tmp5, tmp5, tmp6);
5184     __ sub(cnt1, cnt1, 8 * wordSize);
5185     __ eor(tmp7, tmp7, tmp8);
5186     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5187     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5188     // cmp) because subs allows an unlimited range of immediate operands.
5189     __ subs(tmp6, cnt1, loopThreshold);
5190     __ orr(tmp5, tmp5, tmp7);
5191     __ cbnz(tmp5, NOT_EQUAL);
5192     __ br(__ GE, LOOP);
5193     // post-loop
5194     __ eor(tmp1, tmp1, tmp2);
5195     __ eor(tmp3, tmp3, tmp4);
5196     __ orr(tmp1, tmp1, tmp3);
5197     __ sub(cnt1, cnt1, 2 * wordSize);
5198     __ cbnz(tmp1, NOT_EQUAL);
5199   }
5200 
5201   void generate_large_array_equals_loop_simd(int loopThreshold,
5202         bool usePrefetch, Label &NOT_EQUAL) {
5203     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5204         tmp2 = rscratch2;
5205     Label LOOP;
5206 
5207     __ bind(LOOP);
5208     if (usePrefetch) {
5209       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5210       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5211     }
5212     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5213     __ sub(cnt1, cnt1, 8 * wordSize);
5214     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5215     __ subs(tmp1, cnt1, loopThreshold);
5216     __ eor(v0, __ T16B, v0, v4);
5217     __ eor(v1, __ T16B, v1, v5);
5218     __ eor(v2, __ T16B, v2, v6);
5219     __ eor(v3, __ T16B, v3, v7);
5220     __ orr(v0, __ T16B, v0, v1);
5221     __ orr(v1, __ T16B, v2, v3);
5222     __ orr(v0, __ T16B, v0, v1);
5223     __ umov(tmp1, v0, __ D, 0);
5224     __ umov(tmp2, v0, __ D, 1);
5225     __ orr(tmp1, tmp1, tmp2);
5226     __ cbnz(tmp1, NOT_EQUAL);
5227     __ br(__ GE, LOOP);
5228   }
5229 
5230   // a1 = r1 - array1 address
5231   // a2 = r2 - array2 address
5232   // result = r0 - return value. Already contains "false"
5233   // cnt1 = r10 - number of elements left to check, reduced by wordSize
5234   // r3-r5 are reserved temporary registers
5235   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
5236   address generate_large_array_equals() {
5237     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5238         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5239         tmp7 = r12, tmp8 = r13;
5240     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5241         SMALL_LOOP, POST_LOOP;
5242     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5243     // calculate if at least 32 prefetched bytes are used
5244     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5245     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5246     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5247     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5248         tmp5, tmp6, tmp7, tmp8);
5249 
5250     __ align(CodeEntryAlignment);
5251 
5252     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5253 
5254     address entry = __ pc();
5255     __ enter();
5256     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5257     // also advance pointers to use post-increment instead of pre-increment
5258     __ add(a1, a1, wordSize);
5259     __ add(a2, a2, wordSize);
5260     if (AvoidUnalignedAccesses) {
5261       // Both implementations (SIMD/non-SIMD) use relatively large load
5262       // instructions (ld1/ldp), which incur a huge penalty (up to 2x execution
5263       // time) on some CPUs when the address is not at least 16-byte aligned.
5264       // Arrays are currently 8-byte aligned, so, if needed, do an additional
5265       // 8-byte load for the first address to make it 16-byte aligned.
5266       Label ALIGNED16;
5267       __ tbz(a1, 3, ALIGNED16);
5268       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5269       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5270       __ sub(cnt1, cnt1, wordSize);
5271       __ eor(tmp1, tmp1, tmp2);
5272       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5273       __ bind(ALIGNED16);
5274     }
5275     if (UseSIMDForArrayEquals) {
5276       if (SoftwarePrefetchHintDistance >= 0) {
5277         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5278         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5279         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5280             /* prfm = */ true, NOT_EQUAL);
5281         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5282         __ br(__ LT, TAIL);
5283       }
5284       __ bind(NO_PREFETCH_LARGE_LOOP);
5285       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5286           /* prfm = */ false, NOT_EQUAL);
5287     } else {
5288       __ push(spilled_regs, sp);
5289       if (SoftwarePrefetchHintDistance >= 0) {
5290         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5291         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5292         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5293             /* prfm = */ true, NOT_EQUAL);
5294         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5295         __ br(__ LT, TAIL);
5296       }
5297       __ bind(NO_PREFETCH_LARGE_LOOP);
5298       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5299           /* prfm = */ false, NOT_EQUAL);
5300     }
5301     __ bind(TAIL);
5302       __ cbz(cnt1, EQUAL);
5303       __ subs(cnt1, cnt1, wordSize);
5304       __ br(__ LE, POST_LOOP);
5305     __ bind(SMALL_LOOP);
5306       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5307       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5308       __ subs(cnt1, cnt1, wordSize);
5309       __ eor(tmp1, tmp1, tmp2);
5310       __ cbnz(tmp1, NOT_EQUAL);
5311       __ br(__ GT, SMALL_LOOP);
5312     __ bind(POST_LOOP);
5313       __ ldr(tmp1, Address(a1, cnt1));
5314       __ ldr(tmp2, Address(a2, cnt1));
5315       __ eor(tmp1, tmp1, tmp2);
5316       __ cbnz(tmp1, NOT_EQUAL);
5317     __ bind(EQUAL);
5318       __ mov(result, true);
5319     __ bind(NOT_EQUAL);
5320       if (!UseSIMDForArrayEquals) {
5321         __ pop(spilled_regs, sp);
5322       }
5323     __ bind(NOT_EQUAL_NO_POP);
5324     __ leave();
5325     __ ret(lr);
5326     return entry;
5327   }
5328 
5329   address generate_dsin_dcos(bool isCos) {
5330     __ align(CodeEntryAlignment);
5331     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5332     address start = __ pc();
5333     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5334         (address)StubRoutines::aarch64::_two_over_pi,
5335         (address)StubRoutines::aarch64::_pio2,
5336         (address)StubRoutines::aarch64::_dsin_coef,
5337         (address)StubRoutines::aarch64::_dcos_coef);
5338     return start;
5339   }
5340 
5341   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
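       // (zip1/zip2 with the zero vector interleaves each Latin1 byte with 0x00,
       // widening it to the equivalent UTF-16 code unit; e.g. 0x41 'A' becomes
       // 0x0041 in the little-endian lane layout.)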
5342   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5343       Label &DIFF2) {
5344     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5345     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5346 
5347     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5348     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5349     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5350     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5351 
5352     __ fmovd(tmpL, vtmp3);
5353     __ eor(rscratch2, tmp3, tmpL);
5354     __ cbnz(rscratch2, DIFF2);
5355 
5356     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5357     __ umov(tmpL, vtmp3, __ D, 1);
5358     __ eor(rscratch2, tmpU, tmpL);
5359     __ cbnz(rscratch2, DIFF1);
5360 
5361     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5362     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5363     __ fmovd(tmpL, vtmp);
5364     __ eor(rscratch2, tmp3, tmpL);
5365     __ cbnz(rscratch2, DIFF2);
5366 
5367     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5368     __ umov(tmpL, vtmp, __ D, 1);
5369     __ eor(rscratch2, tmpU, tmpL);
5370     __ cbnz(rscratch2, DIFF1);
5371   }
5372 
5373   // r0  = result
5374   // r1  = str1
5375   // r2  = cnt1
5376   // r3  = str2
5377   // r4  = cnt2
5378   // r10 = tmp1
5379   // r11 = tmp2
5380   address generate_compare_long_string_different_encoding(bool isLU) {
5381     __ align(CodeEntryAlignment);
5382     StubCodeMark mark(this, "StubRoutines", isLU
5383         ? "compare_long_string_different_encoding LU"
5384         : "compare_long_string_different_encoding UL");
5385     address entry = __ pc();
5386     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5387         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5388         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5389     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5390         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5391     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5392     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5393 
5394     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5395 
5396     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5397     // cnt2 == number of characters left to compare
5398     // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
5399     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5400     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5401     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5402     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5403     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5404     __ eor(rscratch2, tmp1, tmp2);
5405     __ mov(rscratch1, tmp2);
5406     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5407     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5408              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5409     __ push(spilled_regs, sp);
5410     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5411     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5412 
5413     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5414 
5415     if (SoftwarePrefetchHintDistance >= 0) {
5416       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5417       __ br(__ LT, NO_PREFETCH);
5418       __ bind(LARGE_LOOP_PREFETCH);
5419         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5420         __ mov(tmp4, 2);
5421         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5422         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5423           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5424           __ subs(tmp4, tmp4, 1);
5425           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5426           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5427           __ mov(tmp4, 2);
5428         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5429           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5430           __ subs(tmp4, tmp4, 1);
5431           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5432           __ sub(cnt2, cnt2, 64);
5433           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5434           __ br(__ GE, LARGE_LOOP_PREFETCH);
5435     }
5436     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5437     __ bind(NO_PREFETCH);
5438     __ subs(cnt2, cnt2, 16);
5439     __ br(__ LT, TAIL);
5440     __ align(OptoLoopAlignment);
5441     __ bind(SMALL_LOOP); // smaller loop
5442       __ subs(cnt2, cnt2, 16);
5443       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5444       __ br(__ GE, SMALL_LOOP);
5445       __ cmn(cnt2, (u1)16);
5446       __ br(__ EQ, LOAD_LAST);
5447     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5448       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5449       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5450       __ ldr(tmp3, Address(cnt1, -8));
5451       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5452       __ b(LOAD_LAST);
5453     __ bind(DIFF2);
5454       __ mov(tmpU, tmp3);
5455     __ bind(DIFF1);
5456       __ pop(spilled_regs, sp);
5457       __ b(CALCULATE_DIFFERENCE);
5458     __ bind(LOAD_LAST);
5459       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5460       // No need to load them again
5461       __ mov(tmpU, tmp3);
5462       __ pop(spilled_regs, sp);
5463 
5464       // tmp2 points to the address of the last 4 Latin1 characters right now
5465       __ ldrs(vtmp, Address(tmp2));
5466       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5467       __ fmovd(tmpL, vtmp);
5468 
5469       __ eor(rscratch2, tmpU, tmpL);
5470       __ cbz(rscratch2, DONE);
5471 
5472     // Find the first different characters in the longwords and
5473     // compute their difference.
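         // rev + clz on the XOR of the longwords gives the bit position of the
         // lowest differing byte (the strings sit in the registers in little-endian
         // order); rounding down to a multiple of 16 yields the offset of the first
         // differing char, which the lsrv/uxthw below then extract from each side.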
5474     __ bind(CALCULATE_DIFFERENCE);
5475       __ rev(rscratch2, rscratch2);
5476       __ clz(rscratch2, rscratch2);
5477       __ andr(rscratch2, rscratch2, -16);
5478       __ lsrv(tmp1, tmp1, rscratch2);
5479       __ uxthw(tmp1, tmp1);
5480       __ lsrv(rscratch1, rscratch1, rscratch2);
5481       __ uxthw(rscratch1, rscratch1);
5482       __ subw(result, tmp1, rscratch1);
5483     __ bind(DONE);
5484       __ ret(lr);
5485     return entry;
5486   }
5487 
5488   // r0 = input (float16)
5489   // v0 = result (float)
5490   // v1 = temporary float register
5491   address generate_float16ToFloat() {
5492     __ align(CodeEntryAlignment);
5493     StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
5494     address entry = __ pc();
5495     BLOCK_COMMENT("Entry:");
5496     __ flt16_to_flt(v0, r0, v1);
5497     __ ret(lr);
5498     return entry;
5499   }
5500 
5501   // v0 = input (float)
5502   // r0 = result (float16)
5503   // v1 = temporary float register
5504   address generate_floatToFloat16() {
5505     __ align(CodeEntryAlignment);
5506     StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
5507     address entry = __ pc();
5508     BLOCK_COMMENT("Entry:");
5509     __ flt_to_flt16(r0, v0, v1);
5510     __ ret(lr);
5511     return entry;
5512   }
5513 
5514   address generate_method_entry_barrier() {
5515     __ align(CodeEntryAlignment);
5516     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5517 
5518     Label deoptimize_label;
5519 
5520     address start = __ pc();
5521 
5522     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5523 
5524     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5525       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5526       // We can get here despite the nmethod being good, if we have not
5527       // yet applied our cross modification fence (or data fence).
5528       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5529       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5530       __ ldrw(rscratch2, rscratch2);
5531       __ strw(rscratch2, thread_epoch_addr);
5532       __ isb();
5533       __ membar(__ LoadLoad);
5534     }
5535 
5536     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5537 
5538     __ enter();
5539     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5540 
5541     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5542 
5543     __ push_call_clobbered_registers();
5544 
5545     __ mov(c_rarg0, rscratch2);
5546     __ call_VM_leaf
5547          (CAST_FROM_FN_PTR
5548           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5549 
5550     __ reset_last_Java_frame(true);
5551 
5552     __ mov(rscratch1, r0);
5553 
5554     __ pop_call_clobbered_registers();
5555 
5556     __ cbnz(rscratch1, deoptimize_label);
5557 
5558     __ leave();
5559     __ ret(lr);
5560 
5561     __ BIND(deoptimize_label);
5562 
5563     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5564     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5565 
5566     __ mov(sp, rscratch1);
5567     __ br(rscratch2);
5568 
5569     return start;
5570   }
5571 
5572   // r0  = result
5573   // r1  = str1
5574   // r2  = cnt1
5575   // r3  = str2
5576   // r4  = cnt2
5577   // r10 = tmp1
5578   // r11 = tmp2
5579   address generate_compare_long_string_same_encoding(bool isLL) {
5580     __ align(CodeEntryAlignment);
5581     StubCodeMark mark(this, "StubRoutines", isLL
5582         ? "compare_long_string_same_encoding LL"
5583         : "compare_long_string_same_encoding UU");
5584     address entry = __ pc();
5585     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5586         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5587 
5588     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5589 
5590     // Exit the large loop when fewer than 64 bytes are left to read, or when we are
5591     // about to prefetch memory beyond the array boundary.
5592     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5593 
5594     // The caller pre-loads 8 bytes before jumping to the stub, so do the comparison directly
5595     __ eor(rscratch2, tmp1, tmp2);
5596     __ cbnz(rscratch2, CAL_DIFFERENCE);
5597 
5598     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5599     // update pointers to account for the 8 bytes already read
5600     __ add(str1, str1, wordSize);
5601     __ add(str2, str2, wordSize);
5602     if (SoftwarePrefetchHintDistance >= 0) {
5603       __ align(OptoLoopAlignment);
5604       __ bind(LARGE_LOOP_PREFETCH);
5605         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5606         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5607 
5608         for (int i = 0; i < 4; i++) {
5609           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5610           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5611           __ cmp(tmp1, tmp2);
5612           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5613           __ br(Assembler::NE, DIFF);
5614         }
5615         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5616         __ add(str1, str1, 64);
5617         __ add(str2, str2, 64);
5618         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5619         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5620         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5621     }
5622 
5623     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5624     __ br(Assembler::LE, LESS16);
5625     __ align(OptoLoopAlignment);
5626     __ bind(LOOP_COMPARE16);
5627       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5628       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5629       __ cmp(tmp1, tmp2);
5630       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5631       __ br(Assembler::NE, DIFF);
5632       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5633       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5634       __ br(Assembler::LT, LESS16);
5635 
5636       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5637       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5638       __ cmp(tmp1, tmp2);
5639       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5640       __ br(Assembler::NE, DIFF);
5641       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5642       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5643       __ br(Assembler::GE, LOOP_COMPARE16);
5644       __ cbz(cnt2, LENGTH_DIFF);
5645 
5646     __ bind(LESS16);
      // compare 8 bytes (8 Latin-1 or 4 UTF-16 chars) at a time
5648       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5649       __ br(Assembler::LE, LESS8);
5650       __ ldr(tmp1, Address(__ post(str1, 8)));
5651       __ ldr(tmp2, Address(__ post(str2, 8)));
5652       __ eor(rscratch2, tmp1, tmp2);
5653       __ cbnz(rscratch2, CAL_DIFFERENCE);
5654       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5655 
5656     __ bind(LESS8); // directly load last 8 bytes
5657       if (!isLL) {
5658         __ add(cnt2, cnt2, cnt2);
5659       }
5660       __ ldr(tmp1, Address(str1, cnt2));
5661       __ ldr(tmp2, Address(str2, cnt2));
5662       __ eor(rscratch2, tmp1, tmp2);
5663       __ cbz(rscratch2, LENGTH_DIFF);
5664       __ b(CAL_DIFFERENCE);
5665 
5666     __ bind(DIFF);
5667       __ cmp(tmp1, tmp2);
5668       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5669       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5670       // reuse rscratch2 register for the result of eor instruction
5671       __ eor(rscratch2, tmp1, tmp2);
5672 
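    // At this point rscratch2 holds str1_chunk ^ str2_chunk with at least one
    // non-zero byte. The loads above are little-endian, so reversing the bytes lets
    // CLZ locate the first differing byte in memory order; AND-ing with -8/-16
    // rounds that bit index down to a character boundary, and the shifts below
    // bring the first differing characters of both chunks down to bit 0.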
5673     __ bind(CAL_DIFFERENCE);
5674       __ rev(rscratch2, rscratch2);
5675       __ clz(rscratch2, rscratch2);
5676       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5677       __ lsrv(tmp1, tmp1, rscratch2);
5678       __ lsrv(tmp2, tmp2, rscratch2);
5679       if (isLL) {
5680         __ uxtbw(tmp1, tmp1);
5681         __ uxtbw(tmp2, tmp2);
5682       } else {
5683         __ uxthw(tmp1, tmp1);
5684         __ uxthw(tmp2, tmp2);
5685       }
5686       __ subw(result, tmp1, tmp2);
5687 
5688     __ bind(LENGTH_DIFF);
5689       __ ret(lr);
5690     return entry;
5691   }
5692 
5693   enum string_compare_mode {
5694     LL,
5695     LU,
5696     UL,
5697     UU,
5698   };
5699 
5700   // The following registers are declared in aarch64.ad
5701   // r0  = result
5702   // r1  = str1
5703   // r2  = cnt1
5704   // r3  = str2
5705   // r4  = cnt2
5706   // r10 = tmp1
5707   // r11 = tmp2
5708   // z0  = ztmp1
5709   // z1  = ztmp2
5710   // p0  = pgtmp1
5711   // p1  = pgtmp2
5712   address generate_compare_long_string_sve(string_compare_mode mode) {
5713     __ align(CodeEntryAlignment);
5714     address entry = __ pc();
5715     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5716              tmp1 = r10, tmp2 = r11;
5717 
5718     Label LOOP, DONE, MISMATCH;
5719     Register vec_len = tmp1;
5720     Register idx = tmp2;
5721     // The minimum of the string lengths has been stored in cnt2.
5722     Register cnt = cnt2;
5723     FloatRegister ztmp1 = z0, ztmp2 = z1;
5724     PRegister pgtmp1 = p0, pgtmp2 = p1;
5725 
5726 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5727     switch (mode) {                                                            \
5728       case LL:                                                                 \
5729         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5730         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5731         break;                                                                 \
5732       case LU:                                                                 \
5733         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5734         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5735         break;                                                                 \
5736       case UL:                                                                 \
5737         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5738         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5739         break;                                                                 \
5740       case UU:                                                                 \
5741         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5742         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5743         break;                                                                 \
5744       default:                                                                 \
5745         ShouldNotReachHere();                                                  \
5746     }
5747 
5748     const char* stubname;
5749     switch (mode) {
5750       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5751       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5752       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5753       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5754       default: ShouldNotReachHere();
5755     }
5756 
5757     StubCodeMark mark(this, "StubRoutines", stubname);
5758 
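    // The loop is driven by SVE predication: sve_whilelt builds a predicate that is
    // active for min(cnt - idx, vec_len) lanes, so the final partial vector after
    // the loop is handled by the same load/compare sequence under a shorter
    // predicate.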
5759     __ mov(idx, 0);
5760     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5761 
5762     if (mode == LL) {
5763       __ sve_cntb(vec_len);
5764     } else {
5765       __ sve_cnth(vec_len);
5766     }
5767 
5768     __ sub(rscratch1, cnt, vec_len);
5769 
5770     __ bind(LOOP);
5771 
5772       // main loop
5773       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5774       __ add(idx, idx, vec_len);
5775       // Compare strings.
5776       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5777       __ br(__ NE, MISMATCH);
5778       __ cmp(idx, rscratch1);
5779       __ br(__ LT, LOOP);
5780 
5781     // post loop, last iteration
5782     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5783 
5784     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5785     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5786     __ br(__ EQ, DONE);
5787 
5788     __ bind(MISMATCH);
5789 
    // Crop the predicate at the first mismatching lane so that the LASTA
    // instructions below extract the first differing characters.
5791     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5792     // Extract the first different characters of each string.
5793     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5794     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5795 
5796     // Compute the difference of the first different characters.
5797     __ sub(result, rscratch1, rscratch2);
5798 
5799     __ bind(DONE);
5800     __ ret(lr);
5801 #undef LOAD_PAIR
5802     return entry;
5803   }
5804 
5805   void generate_compare_long_strings() {
5806     if (UseSVE == 0) {
5807       StubRoutines::aarch64::_compare_long_string_LL
5808           = generate_compare_long_string_same_encoding(true);
5809       StubRoutines::aarch64::_compare_long_string_UU
5810           = generate_compare_long_string_same_encoding(false);
5811       StubRoutines::aarch64::_compare_long_string_LU
5812           = generate_compare_long_string_different_encoding(true);
5813       StubRoutines::aarch64::_compare_long_string_UL
5814           = generate_compare_long_string_different_encoding(false);
5815     } else {
5816       StubRoutines::aarch64::_compare_long_string_LL
5817           = generate_compare_long_string_sve(LL);
5818       StubRoutines::aarch64::_compare_long_string_UU
5819           = generate_compare_long_string_sve(UU);
5820       StubRoutines::aarch64::_compare_long_string_LU
5821           = generate_compare_long_string_sve(LU);
5822       StubRoutines::aarch64::_compare_long_string_UL
5823           = generate_compare_long_string_sve(UL);
5824     }
5825   }
5826 
5827   // R0 = result
5828   // R1 = str2
5829   // R2 = cnt1
5830   // R3 = str1
5831   // R4 = cnt2
5832   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
5833   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the first register of the pattern (since
  // length >= 8) in order to skip the initial load (this helps on systems with a
  // single load pipeline)
  // 2) we use a "fast" algorithm for finding the first occurrence of the single
  // character being searched for, with fewer branches (one branch per loaded
  // register instead of one per character); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the first register of the source string, it can
  // be used to search for every occurrence of the first character, saving a few
  // loads compared with a simpler-but-slower implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  // re-initializes and compresses register values, which makes the code larger and
  // a bit less readable; however, most of the extra operations are issued during
  // loads or branches, so the penalty is minimal
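  // A minimal C sketch (illustrative only) of the single-character search from
  // idea (2), for the Latin-1 case; the UTF-16 case uses the 16-bit constants
  // 0x0001000100010001 and 0x7fff7fff7fff7fff instead:
  //
  //   uint64_t x    = str2_chunk ^ (first_char * 0x0101010101010101ull);
  //   uint64_t mask = (x - 0x0101010101010101ull) & ~(x | 0x7f7f7f7f7f7f7f7full);
  //
  // The lowest set bit of mask marks the first byte lane where str2_chunk equals
  // first_char (higher lanes may contain false positives from borrow propagation,
  // which is harmless here because every candidate is re-verified). This is exactly
  // the eor/sub/orr/bics sequence used below.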
5848   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5849     const char* stubName = str1_isL
5850         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5851         : "indexof_linear_uu";
5852     __ align(CodeEntryAlignment);
5853     StubCodeMark mark(this, "StubRoutines", stubName);
5854     address entry = __ pc();
5855 
5856     int str1_chr_size = str1_isL ? 1 : 2;
5857     int str2_chr_size = str2_isL ? 1 : 2;
5858     int str1_chr_shift = str1_isL ? 0 : 1;
5859     int str2_chr_shift = str2_isL ? 0 : 1;
5860     bool isL = str1_isL && str2_isL;
    // parameters
5862     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5863     // temporary registers
5864     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5865     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5866     // redefinitions
5867     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5868 
5869     __ push(spilled_regs, sp);
5870     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5871         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5872         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5873         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5874         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5875         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read a whole register from str1; this is safe because length >= 8 here
    __ ldr(ch1, Address(str1));
    // Read a whole register from str2; this is safe because length >= 8 here
5879     __ ldr(ch2, Address(str2));
5880     __ sub(cnt2, cnt2, cnt1);
5881     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5882     if (str1_isL != str2_isL) {
5883       __ eor(v0, __ T16B, v0, v0);
5884     }
5885     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5886     __ mul(first, first, tmp1);
    // check whether fewer than one full register of characters remains
5888     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5889     if (str1_isL != str2_isL) {
5890       __ fmovd(v1, ch1);
5891     }
5892     __ br(__ LE, L_SMALL);
5893     __ eor(ch2, first, ch2);
5894     if (str1_isL != str2_isL) {
5895       __ zip1(v1, __ T16B, v1, v0);
5896     }
5897     __ sub(tmp2, ch2, tmp1);
5898     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5899     __ bics(tmp2, tmp2, ch2);
5900     if (str1_isL != str2_isL) {
5901       __ fmovd(ch1, v1);
5902     }
5903     __ br(__ NE, L_HAS_ZERO);
5904     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5905     __ add(result, result, wordSize/str2_chr_size);
5906     __ add(str2, str2, wordSize);
5907     __ br(__ LT, L_POST_LOOP);
5908     __ BIND(L_LOOP);
5909       __ ldr(ch2, Address(str2));
5910       __ eor(ch2, first, ch2);
5911       __ sub(tmp2, ch2, tmp1);
5912       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5913       __ bics(tmp2, tmp2, ch2);
5914       __ br(__ NE, L_HAS_ZERO);
5915     __ BIND(L_LOOP_PROCEED);
5916       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5917       __ add(str2, str2, wordSize);
5918       __ add(result, result, wordSize/str2_chr_size);
5919       __ br(__ GE, L_LOOP);
5920     __ BIND(L_POST_LOOP);
5921       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5922       __ br(__ LE, NOMATCH);
5923       __ ldr(ch2, Address(str2));
5924       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5925       __ eor(ch2, first, ch2);
5926       __ sub(tmp2, ch2, tmp1);
5927       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5928       __ mov(tmp4, -1); // all bits set
5929       __ b(L_SMALL_PROCEED);
5930     __ align(OptoLoopAlignment);
5931     __ BIND(L_SMALL);
5932       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5933       __ eor(ch2, first, ch2);
5934       if (str1_isL != str2_isL) {
5935         __ zip1(v1, __ T16B, v1, v0);
5936       }
5937       __ sub(tmp2, ch2, tmp1);
5938       __ mov(tmp4, -1); // all bits set
5939       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5940       if (str1_isL != str2_isL) {
5941         __ fmovd(ch1, v1); // move converted 4 symbols
5942       }
5943     __ BIND(L_SMALL_PROCEED);
5944       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5945       __ bic(tmp2, tmp2, ch2);
5946       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5947       __ rbit(tmp2, tmp2);
5948       __ br(__ EQ, NOMATCH);
5949     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
5951       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5952       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5953       if (str2_isL) { // LL
5954         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5955         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5956         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5957         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5958         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5959       } else {
5960         __ mov(ch2, 0xE); // all bits in byte set except last one
5961         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5962         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5963         __ lslv(tmp2, tmp2, tmp4);
5964         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5965         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5966         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5967         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5968       }
5969       __ cmp(ch1, ch2);
5970       __ mov(tmp4, wordSize/str2_chr_size);
5971       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5972     __ BIND(L_SMALL_CMP_LOOP);
5973       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5974                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5975       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5976                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5977       __ add(tmp4, tmp4, 1);
5978       __ cmp(tmp4, cnt1);
5979       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5980       __ cmp(first, ch2);
5981       __ br(__ EQ, L_SMALL_CMP_LOOP);
5982     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5983       __ cbz(tmp2, NOMATCH); // no more matches. exit
5984       __ clz(tmp4, tmp2);
5985       __ add(result, result, 1); // advance index
5986       __ add(str2, str2, str2_chr_size); // advance pointer
5987       __ b(L_SMALL_HAS_ZERO_LOOP);
5988     __ align(OptoLoopAlignment);
5989     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5990       __ cmp(first, ch2);
5991       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5992       __ b(DONE);
5993     __ align(OptoLoopAlignment);
5994     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5995       if (str2_isL) { // LL
5996         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5997         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5998         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5999         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6000         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6001       } else {
6002         __ mov(ch2, 0xE); // all bits in byte set except last one
6003         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6004         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6005         __ lslv(tmp2, tmp2, tmp4);
6006         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6007         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6008         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6009         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6010       }
6011       __ cmp(ch1, ch2);
6012       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6013       __ b(DONE);
6014     __ align(OptoLoopAlignment);
6015     __ BIND(L_HAS_ZERO);
6016       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
      // Now compress the counters (cnt2 and cnt1) into one register: cnt1 goes into
      // the upper 32 bits of cnt2. This is fine because both counters are 32-bit and
      // neither is changed in this loop; they are restored on exit
      // (see L_HAS_ZERO_LOOP_NOMATCH), so cnt1 can be re-used inside the loop.
6021       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
6022       __ sub(result, result, 1);
6023     __ BIND(L_HAS_ZERO_LOOP);
6024       __ mov(cnt1, wordSize/str2_chr_size);
6025       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6026       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
6027       if (str2_isL) {
6028         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6029         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6030         __ lslv(tmp2, tmp2, tmp4);
6031         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6032         __ add(tmp4, tmp4, 1);
6033         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6034         __ lsl(tmp2, tmp2, 1);
6035         __ mov(tmp4, wordSize/str2_chr_size);
6036       } else {
6037         __ mov(ch2, 0xE);
6038         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6039         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6040         __ lslv(tmp2, tmp2, tmp4);
6041         __ add(tmp4, tmp4, 1);
6042         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6043         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6044         __ lsl(tmp2, tmp2, 1);
6045         __ mov(tmp4, wordSize/str2_chr_size);
6046         __ sub(str2, str2, str2_chr_size);
6047       }
6048       __ cmp(ch1, ch2);
6049       __ mov(tmp4, wordSize/str2_chr_size);
6050       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6051     __ BIND(L_CMP_LOOP);
6052       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6053                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6054       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6055                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6056       __ add(tmp4, tmp4, 1);
6057       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6058       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
6059       __ cmp(cnt1, ch2);
6060       __ br(__ EQ, L_CMP_LOOP);
6061     __ BIND(L_CMP_LOOP_NOMATCH);
      // the candidate position did not match
6063       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6064       __ clz(tmp4, tmp2);
6065       __ add(str2, str2, str2_chr_size); // advance pointer
6066       __ b(L_HAS_ZERO_LOOP);
6067     __ align(OptoLoopAlignment);
6068     __ BIND(L_CMP_LOOP_LAST_CMP);
6069       __ cmp(cnt1, ch2);
6070       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6071       __ b(DONE);
6072     __ align(OptoLoopAlignment);
6073     __ BIND(L_CMP_LOOP_LAST_CMP2);
6074       if (str2_isL) {
6075         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6076         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6077         __ lslv(tmp2, tmp2, tmp4);
6078         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6079         __ add(tmp4, tmp4, 1);
6080         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6081         __ lsl(tmp2, tmp2, 1);
6082       } else {
6083         __ mov(ch2, 0xE);
6084         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6085         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6086         __ lslv(tmp2, tmp2, tmp4);
6087         __ add(tmp4, tmp4, 1);
6088         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6089         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6090         __ lsl(tmp2, tmp2, 1);
6091         __ sub(str2, str2, str2_chr_size);
6092       }
6093       __ cmp(ch1, ch2);
6094       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6095       __ b(DONE);
6096     __ align(OptoLoopAlignment);
6097     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. It was a multiple of wordSize/str2_chr_size
      // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
      // so result has been increased by at most wordSize/str2_chr_size - 1 and the
      // respective high bits are unchanged. L_LOOP_PROCEED will increase result by
      // the number of analyzed characters, so we can simply clear the lower bits of
      // result here: 2 lower bits for UU/UL, 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
      // (UU/UL) is the index of the last analyzed substring inside the current
      // octet, so str2 is at the respective start address and must be advanced to
      // the next octet.
6108       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6109       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6110       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6111       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
6112       __ movw(cnt2, cnt2);
6113       __ b(L_LOOP_PROCEED);
6114     __ align(OptoLoopAlignment);
6115     __ BIND(NOMATCH);
6116       __ mov(result, -1);
6117     __ BIND(DONE);
6118       __ pop(spilled_regs, sp);
6119       __ ret(lr);
6120     return entry;
6121   }
6122 
6123   void generate_string_indexof_stubs() {
6124     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6125     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6126     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6127   }
6128 
6129   void inflate_and_store_2_fp_registers(bool generatePrfm,
6130       FloatRegister src1, FloatRegister src2) {
6131     Register dst = r1;
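    // assumes v0 == 0 (guaranteed by the caller, see the register comment below):
    // zip1/zip2 interleave each source byte with a zero byte, widening the Latin-1
    // bytes to 16-bit chars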
6132     __ zip1(v1, __ T16B, src1, v0);
6133     __ zip2(v2, __ T16B, src1, v0);
6134     if (generatePrfm) {
6135       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6136     }
6137     __ zip1(v3, __ T16B, src2, v0);
6138     __ zip2(v4, __ T16B, src2, v0);
6139     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6140   }
6141 
6142   // R0 = src
6143   // R1 = dst
6144   // R2 = len
6145   // R3 = len >> 3
6146   // V0 = 0
6147   // v1 = loaded 8 bytes
6148   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6149   address generate_large_byte_array_inflate() {
6150     __ align(CodeEntryAlignment);
6151     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
6152     address entry = __ pc();
6153     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6154     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6155     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6156 
    // do one more 8-byte read so that the address is 16-byte aligned in most cases;
    // this also lets us use a single store instruction
6159     __ ldrd(v2, __ post(src, 8));
6160     __ sub(octetCounter, octetCounter, 2);
6161     __ zip1(v1, __ T16B, v1, v0);
6162     __ zip1(v2, __ T16B, v2, v0);
6163     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6164     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6165     __ subs(rscratch1, octetCounter, large_loop_threshold);
6166     __ br(__ LE, LOOP_START);
6167     __ b(LOOP_PRFM_START);
6168     __ bind(LOOP_PRFM);
6169       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6170     __ bind(LOOP_PRFM_START);
6171       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6172       __ sub(octetCounter, octetCounter, 8);
6173       __ subs(rscratch1, octetCounter, large_loop_threshold);
6174       inflate_and_store_2_fp_registers(true, v3, v4);
6175       inflate_and_store_2_fp_registers(true, v5, v6);
6176       __ br(__ GT, LOOP_PRFM);
6177       __ cmp(octetCounter, (u1)8);
6178       __ br(__ LT, DONE);
6179     __ bind(LOOP);
6180       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6181       __ bind(LOOP_START);
6182       __ sub(octetCounter, octetCounter, 8);
6183       __ cmp(octetCounter, (u1)8);
6184       inflate_and_store_2_fp_registers(false, v3, v4);
6185       inflate_and_store_2_fp_registers(false, v5, v6);
6186       __ br(__ GE, LOOP);
6187     __ bind(DONE);
6188       __ ret(lr);
6189     return entry;
6190   }
6191 
6192   /**
6193    *  Arguments:
6194    *
6195    *  Input:
6196    *  c_rarg0   - current state address
6197    *  c_rarg1   - H key address
6198    *  c_rarg2   - data address
6199    *  c_rarg3   - number of blocks
6200    *
6201    *  Output:
6202    *  Updated state at c_rarg0
6203    */
6204   address generate_ghash_processBlocks() {
6205     // Bafflingly, GCM uses little-endian for the byte order, but
6206     // big-endian for the bit order.  For example, the polynomial 1 is
6207     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6208     //
6209     // So, we must either reverse the bytes in each word and do
6210     // everything big-endian or reverse the bits in each byte and do
6211     // it little-endian.  On AArch64 it's more idiomatic to reverse
6212     // the bits in each byte (we have an instruction, RBIT, to do
6213     // that) and keep the data in little-endian bit order through the
6214     // calculation, bit-reversing the inputs and outputs.
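    //
    // For example, under this convention the polynomial "1" arrives as the byte
    // 0x80 followed by fifteen 0x00 bytes; after RBIT each byte is bit-reversed,
    // so that leading byte becomes 0x01 and the value is an ordinary
    // little-endian 1 for the rest of the calculation.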
6215 
6216     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6217     __ align(wordSize * 2);
6218     address p = __ pc();
6219     __ emit_int64(0x87);  // The low-order bits of the field
6220                           // polynomial (i.e. p = z^7+z^2+z+1)
6221                           // repeated in the low and high parts of a
6222                           // 128-bit vector
6223     __ emit_int64(0x87);
6224 
6225     __ align(CodeEntryAlignment);
6226     address start = __ pc();
6227 
6228     Register state   = c_rarg0;
6229     Register subkeyH = c_rarg1;
6230     Register data    = c_rarg2;
6231     Register blocks  = c_rarg3;
6232 
6233     FloatRegister vzr = v30;
6234     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6235 
6236     __ ldrq(v24, p);    // The field polynomial
6237 
6238     __ ldrq(v0, Address(state));
6239     __ ldrq(v1, Address(subkeyH));
6240 
6241     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6242     __ rbit(v0, __ T16B, v0);
6243     __ rev64(v1, __ T16B, v1);
6244     __ rbit(v1, __ T16B, v1);
6245 
6246     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6247     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
6248 
6249     {
6250       Label L_ghash_loop;
6251       __ bind(L_ghash_loop);
6252 
6253       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6254                                                  // reversing each byte
6255       __ rbit(v2, __ T16B, v2);
6256       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6257 
6258       // Multiply state in v2 by subkey in v1
6259       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6260                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6261                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6262       // Reduce v7:v5 by the field polynomial
6263       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6264 
6265       __ sub(blocks, blocks, 1);
6266       __ cbnz(blocks, L_ghash_loop);
6267     }
6268 
6269     // The bit-reversed result is at this point in v0
6270     __ rev64(v0, __ T16B, v0);
6271     __ rbit(v0, __ T16B, v0);
6272 
6273     __ st1(v0, __ T16B, state);
6274     __ ret(lr);
6275 
6276     return start;
6277   }
6278 
6279   address generate_ghash_processBlocks_wide() {
6280     address small = generate_ghash_processBlocks();
6281 
6282     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6283     __ align(wordSize * 2);
6284     address p = __ pc();
6285     __ emit_int64(0x87);  // The low-order bits of the field
6286                           // polynomial (i.e. p = z^7+z^2+z+1)
6287                           // repeated in the low and high parts of a
6288                           // 128-bit vector
6289     __ emit_int64(0x87);
6290 
6291     __ align(CodeEntryAlignment);
6292     address start = __ pc();
6293 
6294     Register state   = c_rarg0;
6295     Register subkeyH = c_rarg1;
6296     Register data    = c_rarg2;
6297     Register blocks  = c_rarg3;
6298 
6299     const int unroll = 4;
6300 
6301     __ cmp(blocks, (unsigned char)(unroll * 2));
6302     __ br(__ LT, small);
6303 
6304     if (unroll > 1) {
      // Save the SIMD registers v8..v15 (callee-saved under the AAPCS64) before
      // the wide routine uses them
6306       __ sub(sp, sp, 4 * 16);
6307       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6308       __ sub(sp, sp, 4 * 16);
6309       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6310     }
6311 
6312     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6313 
6314     if (unroll > 1) {
6315       // And restore state
6316       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6317       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6318     }
6319 
6320     __ cmp(blocks, (unsigned char)0);
6321     __ br(__ GT, small);
6322 
6323     __ ret(lr);
6324 
6325     return start;
6326   }
6327 
6328   void generate_base64_encode_simdround(Register src, Register dst,
6329         FloatRegister codec, u8 size) {
6330 
6331     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6332     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6333     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6334 
6335     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6336 
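    // Each round loads 3 * size input bytes, de-interleaved into in0..in2, and
    // splits every 3-byte group {a, b, c} into the four 6-bit indices
    //   a >> 2, ((a & 3) << 4) | (b >> 4), ((b & 0xf) << 2) | (c >> 6), c & 0x3f,
    // which are then mapped through the 64-entry codec table with TBL and stored
    // interleaved as 4 * size output characters.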
6337     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6338 
6339     __ ushr(ind0, arrangement, in0,  2);
6340 
6341     __ ushr(ind1, arrangement, in1,  2);
6342     __ shl(in0,   arrangement, in0,  6);
6343     __ orr(ind1,  arrangement, ind1, in0);
6344     __ ushr(ind1, arrangement, ind1, 2);
6345 
6346     __ ushr(ind2, arrangement, in2,  4);
6347     __ shl(in1,   arrangement, in1,  4);
6348     __ orr(ind2,  arrangement, in1,  ind2);
6349     __ ushr(ind2, arrangement, ind2, 2);
6350 
6351     __ shl(ind3,  arrangement, in2,  2);
6352     __ ushr(ind3, arrangement, ind3, 2);
6353 
6354     __ tbl(out0,  arrangement, codec,  4, ind0);
6355     __ tbl(out1,  arrangement, codec,  4, ind1);
6356     __ tbl(out2,  arrangement, codec,  4, ind2);
6357     __ tbl(out3,  arrangement, codec,  4, ind3);
6358 
6359     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6360   }
6361 
  /**
6363    *  Arguments:
6364    *
6365    *  Input:
6366    *  c_rarg0   - src_start
6367    *  c_rarg1   - src_offset
6368    *  c_rarg2   - src_length
6369    *  c_rarg3   - dest_start
6370    *  c_rarg4   - dest_offset
6371    *  c_rarg5   - isURL
6372    *
6373    */
6374   address generate_base64_encodeBlock() {
6375 
6376     static const char toBase64[64] = {
6377       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6378       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6379       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6380       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6381       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6382     };
6383 
6384     static const char toBase64URL[64] = {
6385       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6386       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6387       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6388       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6389       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6390     };
6391 
6392     __ align(CodeEntryAlignment);
6393     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6394     address start = __ pc();
6395 
6396     Register src   = c_rarg0;  // source array
6397     Register soff  = c_rarg1;  // source start offset
6398     Register send  = c_rarg2;  // source end offset
6399     Register dst   = c_rarg3;  // dest array
6400     Register doff  = c_rarg4;  // position for writing to dest array
6401     Register isURL = c_rarg5;  // Base64 or URL character set
6402 
6403     // c_rarg6 and c_rarg7 are free to use as temps
6404     Register codec  = c_rarg6;
6405     Register length = c_rarg7;
6406 
6407     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6408 
6409     __ add(src, src, soff);
6410     __ add(dst, dst, doff);
6411     __ sub(length, send, soff);
6412 
6413     // load the codec base address
6414     __ lea(codec, ExternalAddress((address) toBase64));
6415     __ cbz(isURL, ProcessData);
6416     __ lea(codec, ExternalAddress((address) toBase64URL));
6417 
6418     __ BIND(ProcessData);
6419 
    // too short to form a SIMD loop; fall back to the scalar 3-byte loop
6421     __ cmp(length, (u1)24);
6422     __ br(Assembler::LT, Process3B);
6423 
6424     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6425 
6426     __ BIND(Process48B);
6427     __ cmp(length, (u1)48);
6428     __ br(Assembler::LT, Process24B);
6429     generate_base64_encode_simdround(src, dst, v0, 16);
6430     __ sub(length, length, 48);
6431     __ b(Process48B);
6432 
6433     __ BIND(Process24B);
6434     __ cmp(length, (u1)24);
6435     __ br(Assembler::LT, SIMDExit);
6436     generate_base64_encode_simdround(src, dst, v0, 8);
6437     __ sub(length, length, 24);
6438 
6439     __ BIND(SIMDExit);
6440     __ cbz(length, Exit);
6441 
6442     __ BIND(Process3B);
6443     //  3 src bytes, 24 bits
6444     __ ldrb(r10, __ post(src, 1));
6445     __ ldrb(r11, __ post(src, 1));
6446     __ ldrb(r12, __ post(src, 1));
6447     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6448     __ orrw(r12, r12, r11, Assembler::LSL, 8);
    // extract the four 6-bit codec indices (bits 23..18, 17..12, 11..6 and 5..0)
6450     __ ubfmw(r15, r12, 18, 23);
6451     __ ubfmw(r14, r12, 12, 17);
6452     __ ubfmw(r13, r12, 6,  11);
6453     __ andw(r12,  r12, 63);
6454     // get the code based on the codec
6455     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6456     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6457     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6458     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6459     __ strb(r15, __ post(dst, 1));
6460     __ strb(r14, __ post(dst, 1));
6461     __ strb(r13, __ post(dst, 1));
6462     __ strb(r12, __ post(dst, 1));
6463     __ sub(length, length, 3);
6464     __ cbnz(length, Process3B);
6465 
6466     __ BIND(Exit);
6467     __ ret(lr);
6468 
6469     return start;
6470   }
6471 
6472   void generate_base64_decode_simdround(Register src, Register dst,
6473         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6474 
6475     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6476     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6477 
6478     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6479     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6480 
6481     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6482 
6483     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6484 
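    // Each round loads 4 * size Base64 characters, de-interleaved into in0..in3,
    // maps each character to its 6-bit value with the two-stage tbl/tbx lookup
    // below (255 marks an illegal character), and packs the results back into
    // 3 * size output bytes.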
6485     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6486 
    // we need an unsigned saturating subtract so that all input values in the
    // range [0, 63] produce index 0 for the higher-half lookup
6489     __ uqsubv(decH0, __ T16B, in0, v27);
6490     __ uqsubv(decH1, __ T16B, in1, v27);
6491     __ uqsubv(decH2, __ T16B, in2, v27);
6492     __ uqsubv(decH3, __ T16B, in3, v27);
6493 
6494     // lower half lookup
6495     __ tbl(decL0, arrangement, codecL, 4, in0);
6496     __ tbl(decL1, arrangement, codecL, 4, in1);
6497     __ tbl(decL2, arrangement, codecL, 4, in2);
6498     __ tbl(decL3, arrangement, codecL, 4, in3);
6499 
6500     // higher half lookup
6501     __ tbx(decH0, arrangement, codecH, 4, decH0);
6502     __ tbx(decH1, arrangement, codecH, 4, decH1);
6503     __ tbx(decH2, arrangement, codecH, 4, decH2);
6504     __ tbx(decH3, arrangement, codecH, 4, decH3);
6505 
6506     // combine lower and higher
6507     __ orr(decL0, arrangement, decL0, decH0);
6508     __ orr(decL1, arrangement, decL1, decH1);
6509     __ orr(decL2, arrangement, decL2, decH2);
6510     __ orr(decL3, arrangement, decL3, decH3);
6511 
    // check for illegal inputs: any value larger than 63 (the maximum of 6 bits)
6513     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
6514     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
6515     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
6516     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
6517     __ orr(in0, arrangement, decH0, decH1);
6518     __ orr(in1, arrangement, decH2, decH3);
6519     __ orr(in2, arrangement, in0,   in1);
6520     __ umaxv(in3, arrangement, in2);
6521     __ umov(rscratch2, in3, __ B, 0);
6522 
    // pack the four 6-bit values d0..d3 back into three output bytes:
    // (d0 << 2) | (d1 >> 4), (d1 << 4) | (d2 >> 2), (d2 << 6) | d3
6524     __ shl(out0,  arrangement, decL0, 2);
6525     __ ushr(out1, arrangement, decL1, 4);
6526     __ orr(out0,  arrangement, out0,  out1);
6527     __ shl(out1,  arrangement, decL1, 4);
6528     __ ushr(out2, arrangement, decL2, 2);
6529     __ orr(out1,  arrangement, out1,  out2);
6530     __ shl(out2,  arrangement, decL2, 6);
6531     __ orr(out2,  arrangement, out2,  decL3);
6532 
6533     __ cbz(rscratch2, NoIllegalData);
6534 
6535     // handle illegal input
6536     __ umov(r10, in2, __ D, 0);
6537     if (size == 16) {
6538       __ cbnz(r10, ErrorInLowerHalf);
6539 
6540       // illegal input is in higher half, store the lower half now.
6541       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6542 
6543       __ umov(r10, in2,  __ D, 1);
6544       __ umov(r11, out0, __ D, 1);
6545       __ umov(r12, out1, __ D, 1);
6546       __ umov(r13, out2, __ D, 1);
6547       __ b(StoreLegalData);
6548 
6549       __ BIND(ErrorInLowerHalf);
6550     }
6551     __ umov(r11, out0, __ D, 0);
6552     __ umov(r12, out1, __ D, 0);
6553     __ umov(r13, out2, __ D, 0);
6554 
6555     __ BIND(StoreLegalData);
6556     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6557     __ strb(r11, __ post(dst, 1));
6558     __ strb(r12, __ post(dst, 1));
6559     __ strb(r13, __ post(dst, 1));
6560     __ lsr(r10, r10, 8);
6561     __ lsr(r11, r11, 8);
6562     __ lsr(r12, r12, 8);
6563     __ lsr(r13, r13, 8);
6564     __ b(StoreLegalData);
6565 
6566     __ BIND(NoIllegalData);
6567     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6568   }
6569 
  /**
6572    *  Arguments:
6573    *
6574    *  Input:
6575    *  c_rarg0   - src_start
6576    *  c_rarg1   - src_offset
6577    *  c_rarg2   - src_length
6578    *  c_rarg3   - dest_start
6579    *  c_rarg4   - dest_offset
6580    *  c_rarg5   - isURL
6581    *  c_rarg6   - isMIME
6582    *
6583    */
6584   address generate_base64_decodeBlock() {
6585 
6586     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6587     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6588     // titled "Base64 decoding".
6589 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] == -2,
    // while fromBase(URL)64ForNoSIMD['='] == 255 here.
6593     static const uint8_t fromBase64ForNoSIMD[256] = {
6594       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6595       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6596       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6597        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6598       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6599        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6600       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6601        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6602       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6603       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6604       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6605       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6606       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6607       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6608       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6609       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6610     };
6611 
6612     static const uint8_t fromBase64URLForNoSIMD[256] = {
6613       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6614       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6615       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6616        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6617       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6618        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6619       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6620        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6621       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6622       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6623       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6624       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6625       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6626       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6627       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6628       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6629     };
6630 
    // A legal Base64 code value is in the range [0, 127].  We need two table lookups
    // with tbl/tbx and combine them to get the decoded data. The 1st table vector
    // lookup uses tbl, which sets out-of-range indices to 0 in the destination. The
    // 2nd table vector lookup uses tbx, which leaves out-of-range indices unchanged
    // in the destination. Input values [64..126] map to entries [65..127] of the
    // second table. The entry at index 64 is set to 0, so that inputs already
    // decoded by the 1st lookup are left unchanged when the two results are combined.
6638     static const uint8_t fromBase64ForSIMD[128] = {
6639       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6640       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6641       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6642        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6643         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6644        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6645       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6646        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6647     };
6648 
6649     static const uint8_t fromBase64URLForSIMD[128] = {
6650       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6651       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6652       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6653        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6654         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6655        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6656        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6657        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6658     };
6659 
6660     __ align(CodeEntryAlignment);
6661     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6662     address start = __ pc();
6663 
6664     Register src    = c_rarg0;  // source array
6665     Register soff   = c_rarg1;  // source start offset
6666     Register send   = c_rarg2;  // source end offset
6667     Register dst    = c_rarg3;  // dest array
6668     Register doff   = c_rarg4;  // position for writing to dest array
6669     Register isURL  = c_rarg5;  // Base64 or URL character set
6670     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6671 
6672     Register length = send;    // reuse send as length of source data to process
6673 
6674     Register simd_codec   = c_rarg6;
6675     Register nosimd_codec = c_rarg7;
6676 
6677     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6678 
6679     __ enter();
6680 
6681     __ add(src, src, soff);
6682     __ add(dst, dst, doff);
6683 
6684     __ mov(doff, dst);
6685 
6686     __ sub(length, send, soff);
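    // round the length down to a multiple of 4 by clearing its two low bits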
6687     __ bfm(length, zr, 0, 1);
6688 
6689     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6690     __ cbz(isURL, ProcessData);
6691     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6692 
6693     __ BIND(ProcessData);
6694     __ mov(rscratch1, length);
6695     __ cmp(length, (u1)144); // 144 = 80 + 64
6696     __ br(Assembler::LT, Process4B);
6697 
6698     // In the MIME case, the line length cannot be more than 76
6699     // bytes (see RFC 2045). This is too short a block for SIMD
6700     // to be worthwhile, so we use non-SIMD here.
6701     __ movw(rscratch1, 79);
6702 
6703     __ BIND(Process4B);
6704     __ ldrw(r14, __ post(src, 4));
6705     __ ubfxw(r10, r14, 0,  8);
6706     __ ubfxw(r11, r14, 8,  8);
6707     __ ubfxw(r12, r14, 16, 8);
6708     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
6710     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6711     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6712     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6713     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6714     // error detection, 255u indicates an illegal input
6715     __ orrw(r14, r10, r11);
6716     __ orrw(r15, r12, r13);
6717     __ orrw(r14, r14, r15);
6718     __ tbnz(r14, 7, Exit);
6719     // recover the data
6720     __ lslw(r14, r10, 10);
6721     __ bfiw(r14, r11, 4, 6);
6722     __ bfmw(r14, r12, 2, 5);
6723     __ rev16w(r14, r14);
6724     __ bfiw(r13, r12, 6, 2);
6725     __ strh(r14, __ post(dst, 2));
6726     __ strb(r13, __ post(dst, 1));
6727     // non-simd loop
6728     __ subsw(rscratch1, rscratch1, 4);
6729     __ br(Assembler::GT, Process4B);
6730 
    // if we are exiting from the 80-byte pre-processing above (rscratch1 started
    // at 79), rscratch1 == -1; otherwise, rscratch1 == 0.
6733     __ cbzw(rscratch1, Exit);
6734     __ sub(length, length, 80);
6735 
6736     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6737     __ cbz(isURL, SIMDEnter);
6738     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6739 
6740     __ BIND(SIMDEnter);
6741     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6742     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
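    // Load 63 into every byte lane of v27; the SIMD rounds use it both for the
    // saturating subtract that forms the higher-half lookup indices and as the
    // "greater than 63" illegal-value threshold.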
6743     __ mov(rscratch1, 63);
6744     __ dup(v27, __ T16B, rscratch1);
6745 
6746     __ BIND(Process64B);
6747     __ cmp(length, (u1)64);
6748     __ br(Assembler::LT, Process32B);
6749     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6750     __ sub(length, length, 64);
6751     __ b(Process64B);
6752 
6753     __ BIND(Process32B);
6754     __ cmp(length, (u1)32);
6755     __ br(Assembler::LT, SIMDExit);
6756     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6757     __ sub(length, length, 32);
6758     __ b(Process32B);
6759 
6760     __ BIND(SIMDExit);
6761     __ cbz(length, Exit);
6762     __ movw(rscratch1, length);
6763     __ b(Process4B);
6764 
6765     __ BIND(Exit);
6766     __ sub(c_rarg0, dst, doff);
6767 
6768     __ leave();
6769     __ ret(lr);
6770 
6771     return start;
6772   }
6773 
6774   // Support for spin waits.
6775   address generate_spin_wait() {
6776     __ align(CodeEntryAlignment);
6777     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6778     address start = __ pc();
6779 
6780     __ spin_wait();
6781     __ ret(lr);
6782 
6783     return start;
6784   }
6785 
6786   address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
6787     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");
6788 
6789     address start = __ pc();
6790     const Register
6791       r_super_klass  = r0,
6792       r_array_base   = r1,
6793       r_array_length = r2,
6794       r_array_index  = r3,
6795       r_sub_klass    = r4,
6796       r_bitmap       = rscratch2,
6797       result         = r5;
6798     const FloatRegister
6799       vtemp          = v0;
6800 
6801     Label L_success;
6802     __ enter();
6803     __ lookup_secondary_supers_table(r_sub_klass, r_super_klass,
6804                                      r_array_base, r_array_length, r_array_index,
6805                                      vtemp, result, super_klass_index,
6806                                      /*stub_is_near*/true);
6807     __ leave();
6808     __ ret(lr);
6809 
6810     return start;
6811   }
6812 
6813   // Slow path implementation for UseSecondarySupersTable.
6814   address generate_lookup_secondary_supers_table_slow_path_stub() {
6815     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path");
6816 
6817     address start = __ pc();
6818     const Register
6819       r_super_klass  = r0,        // argument
6820       r_array_base   = r1,        // argument
6821       temp1          = r2,        // temp
6822       r_array_index  = r3,        // argument
6823       r_bitmap       = rscratch2, // argument
6824       result         = r5;        // argument
6825 
6826     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
6827     __ ret(lr);
6828 
6829     return start;
6830   }
6831 
6832 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6833 
6834   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6835   //
6836   // If LSE is in use, generate LSE versions of all the stubs. The
6837   // non-LSE versions are in atomic_aarch64.S.
6838 
6839   // class AtomicStubMark records the entry point of a stub and the
6840   // stub pointer which will point to it. The stub pointer is set to
6841   // the entry point when ~AtomicStubMark() is called, which must be
6842   // after ICache::invalidate_range. This ensures safe publication of
6843   // the generated code.
6844   class AtomicStubMark {
6845     address _entry_point;
6846     aarch64_atomic_stub_t *_stub;
6847     MacroAssembler *_masm;
6848   public:
6849     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6850       _masm = masm;
6851       __ align(32);
6852       _entry_point = __ pc();
6853       _stub = stub;
6854     }
6855     ~AtomicStubMark() {
6856       *_stub = (aarch64_atomic_stub_t)_entry_point;
6857     }
6858   };
6859 
6860   // NB: For memory_order_conservative we need a trailing membar after
6861   // LSE atomic operations but not a leading membar.
6862   //
6863   // We don't need a leading membar because a clause in the Arm ARM
6864   // says:
6865   //
6866   //   Barrier-ordered-before
6867   //
6868   //   Barrier instructions order prior Memory effects before subsequent
6869   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6873   //   instruction with both Acquire and Release semantics.
6874   //
6875   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6876   // and Release semantics, therefore we don't need a leading
6877   // barrier. However, there is no corresponding Barrier-ordered-after
6878   // relationship, therefore we need a trailing membar to prevent a
6879   // later store or load from being reordered with the store in an
6880   // atomic instruction.
6881   //
6882   // This was checked by using the herd7 consistency model simulator
6883   // (http://diy.inria.fr/) with this test case:
6884   //
6885   // AArch64 LseCas
6886   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6887   // P0 | P1;
6888   // LDR W4, [X2] | MOV W3, #0;
6889   // DMB LD       | MOV W4, #1;
6890   // LDR W3, [X1] | CASAL W3, W4, [X1];
6891   //              | DMB ISH;
6892   //              | STR W4, [X2];
6893   // exists
6894   // (0:X3=0 /\ 0:X4=1)
6895   //
6896   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6897   // with the store to x in P1. Without the DMB in P1 this may happen.
6898   //
6899   // At the time of writing we don't know of any AArch64 hardware that
6900   // reorders stores in this way, but the Reference Manual permits it.
6901 
6902   void gen_cas_entry(Assembler::operand_size size,
6903                      atomic_memory_order order) {
6904     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6905       exchange_val = c_rarg2;
6906     bool acquire, release;
6907     switch (order) {
6908       case memory_order_relaxed:
6909         acquire = false;
6910         release = false;
6911         break;
6912       case memory_order_release:
6913         acquire = false;
6914         release = true;
6915         break;
6916       default:
6917         acquire = true;
6918         release = true;
6919         break;
6920     }
6921     __ mov(prev, compare_val);
6922     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6923     if (order == memory_order_conservative) {
6924       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6925     }
6926     if (size == Assembler::xword) {
6927       __ mov(r0, prev);
6928     } else {
6929       __ movw(r0, prev);
6930     }
6931     __ ret(lr);
6932   }
6933 
6934   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6935     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6936     // If not relaxed, then default to conservative.  Relaxed is the only
6937     // case we use enough to be worth specializing.
6938     if (order == memory_order_relaxed) {
6939       __ ldadd(size, incr, prev, addr);
6940     } else {
6941       __ ldaddal(size, incr, prev, addr);
6942       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6943     }
6944     if (size == Assembler::xword) {
6945       __ mov(r0, prev);
6946     } else {
6947       __ movw(r0, prev);
6948     }
6949     __ ret(lr);
6950   }
6951 
6952   void gen_swpal_entry(Assembler::operand_size size) {
6953     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6954     __ swpal(size, incr, prev, addr);
6955     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6956     if (size == Assembler::xword) {
6957       __ mov(r0, prev);
6958     } else {
6959       __ movw(r0, prev);
6960     }
6961     __ ret(lr);
6962   }
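     
       // In C, approximately, what the three entry generators above emit
       // (a sketch, ignoring operand size and memory ordering; each stub
       // receives (ptr, arg1, arg2) and returns the previous memory value
       // in r0):
       //
       //   uint64_t cmpxchg(volatile uint64_t *ptr, uint64_t compare, uint64_t exchange) {
       //     uint64_t prev = *ptr;                   // all of this atomically
       //     if (prev == compare) *ptr = exchange;
       //     return prev;
       //   }
       //   uint64_t fetch_add(volatile uint64_t *ptr, uint64_t incr) {
       //     uint64_t prev = *ptr; *ptr = prev + incr; return prev;
       //   }
       //   uint64_t xchg(volatile uint64_t *ptr, uint64_t value) {
       //     uint64_t prev = *ptr; *ptr = value; return prev;
       //   }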
6963 
6964   void generate_atomic_entry_points() {
6965     if (! UseLSE) {
6966       return;
6967     }
6968 
6969     __ align(CodeEntryAlignment);
6970     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6971     address first_entry = __ pc();
6972 
6973     // ADD, memory_order_conservative
6974     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6975     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6976     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6977     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6978 
6979     // ADD, memory_order_relaxed
6980     AtomicStubMark mark_fetch_add_4_relaxed
6981       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6982     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6983     AtomicStubMark mark_fetch_add_8_relaxed
6984       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6985     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6986 
6987     // XCHG, memory_order_conservative
6988     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6989     gen_swpal_entry(Assembler::word);
6990     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6991     gen_swpal_entry(Assembler::xword);
6992 
6993     // CAS, memory_order_conservative
6994     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6995     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6996     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6997     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6998     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6999     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
7000 
7001     // CAS, memory_order_relaxed
7002     AtomicStubMark mark_cmpxchg_1_relaxed
7003       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
7004     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
7005     AtomicStubMark mark_cmpxchg_4_relaxed
7006       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
7007     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
7008     AtomicStubMark mark_cmpxchg_8_relaxed
7009       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
7010     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
7011 
7012     AtomicStubMark mark_cmpxchg_4_release
7013       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
7014     gen_cas_entry(MacroAssembler::word, memory_order_release);
7015     AtomicStubMark mark_cmpxchg_8_release
7016       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
7017     gen_cas_entry(MacroAssembler::xword, memory_order_release);
7018 
7019     AtomicStubMark mark_cmpxchg_4_seq_cst
7020       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
7021     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
7022     AtomicStubMark mark_cmpxchg_8_seq_cst
7023       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
7024     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
7025 
7026     ICache::invalidate_range(first_entry, __ pc() - first_entry);
7027   }
7028 #endif // LINUX
7029 
7030   address generate_cont_thaw(Continuation::thaw_kind kind) {
7031     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
7032     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
7033 
7034     address start = __ pc();
7035 
7036     if (return_barrier) {
7037       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7038       __ mov(sp, rscratch1);
7039     }
7040     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7041 
7042     if (return_barrier) {
7043       // preserve possible return value from a method returning to the return barrier
7044       __ fmovd(rscratch1, v0);
7045       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7046     }
7047 
7048     __ movw(c_rarg1, (return_barrier ? 1 : 0));
7049     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
7050     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
7051 
7052     if (return_barrier) {
7053       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7054       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7055       __ fmovd(v0, rscratch1);
7056     }
7057     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7058 
7059 
7060     Label thaw_success;
7061     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
7062     __ cbnz(rscratch2, thaw_success);
7063     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
7064     __ br(rscratch1);
7065     __ bind(thaw_success);
7066 
7067     // make room for the thawed frames
7068     __ sub(rscratch1, sp, rscratch2);
7069     __ andr(rscratch1, rscratch1, -16); // align
7070     __ mov(sp, rscratch1);
7071 
7072     if (return_barrier) {
7073       // save original return value -- again
7074       __ fmovd(rscratch1, v0);
7075       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7076     }
7077 
7078     // If we want, we can templatize thaw by kind, and have three different entries
7079     __ movw(c_rarg1, (uint32_t)kind);
7080 
7081     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
7082     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
7083 
7084     if (return_barrier) {
7085       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7086       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7087       __ fmovd(v0, rscratch1);
7088     } else {
7089       __ mov(r0, zr); // return 0 (success) from doYield
7090     }
7091 
7092     // we're now on the yield frame (which is at an address above us because sp has been pushed down)
7093     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
7094     __ mov(rfp, sp);
7095 
7096     if (return_barrier_exception) {
7097       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
7098       __ authenticate_return_address(c_rarg1);
7099       __ verify_oop(r0);
7100       // save return value containing the exception oop in callee-saved R19
7101       __ mov(r19, r0);
7102 
7103       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
7104 
7105       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
7106       // __ reinitialize_ptrue();
7107 
7108       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7109 
7110       __ mov(r1, r0); // the exception handler
7111       __ mov(r0, r19); // restore return value containing the exception oop
7112       __ verify_oop(r0);
7113 
7114       __ leave();
7115       __ mov(r3, lr);
7116       __ br(r1); // the exception handler
7117     } else {
7118       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7119       __ leave();
7120       __ ret(lr);
7121     }
7122 
7123     return start;
7124   }
7125 
7126   address generate_cont_thaw() {
7127     if (!Continuations::enabled()) return nullptr;
7128 
7129     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
7130     address start = __ pc();
7131     generate_cont_thaw(Continuation::thaw_top);
7132     return start;
7133   }
7134 
7135   address generate_cont_returnBarrier() {
7136     if (!Continuations::enabled()) return nullptr;
7137 
7138     // TODO: will probably need multiple return barriers depending on return type
7139     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
7140     address start = __ pc();
7141 
7142     generate_cont_thaw(Continuation::thaw_return_barrier);
7143 
7144     return start;
7145   }
7146 
7147   address generate_cont_returnBarrier_exception() {
7148     if (!Continuations::enabled()) return nullptr;
7149 
7150     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
7151     address start = __ pc();
7152 
7153     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7154 
7155     return start;
7156   }
7157 
7158   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
7159   // are represented as long[5], with BITS_PER_LIMB = 26.
7160   // Pack five 26-bit limbs into three 64-bit registers.
7161   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
7162     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
7163     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
7164     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
7165     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
7166 
7167     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
7168     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
7169     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
7170     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
7171 
7172     if (dest2->is_valid()) {
7173       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7174     } else {
7175 #ifdef ASSERT
7176       Label OK;
7177       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7178       __ br(__ EQ, OK);
7179       __ stop("high bits of Poly1305 integer should be zero");
7180       __ should_not_reach_here();
7181       __ bind(OK);
7182 #endif
7183     }
7184   }
7185 
7186   // As above, but return only a 128-bit integer, packed into two
7187   // 64-bit registers.
7188   void pack_26(Register dest0, Register dest1, Register src) {
7189     pack_26(dest0, dest1, noreg, src);
7190   }
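     
       // In C, approximately, what pack_26 computes (a sketch; src holds the
       // five 26-bit limbs as a julong[5], and the bit ranges are disjoint so
       // the adds above behave like ors):
       //
       //   dest0 = src[0] | (src[1] << 26) | (src[2] << 52);          // low 64 bits
       //   dest1 = (src[2] >> 12) | (src[3] << 14) | (src[4] << 40);  // next 64 bits
       //   dest2 = src[4] >> 24;                                      // at most 2 bits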
7191 
7192   // Multiply and multiply-accumulate unsigned 64-bit registers.
7193   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
7194     __ mul(prod_lo, n, m);
7195     __ umulh(prod_hi, n, m);
7196   }
7197   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
7198     wide_mul(rscratch1, rscratch2, n, m);
7199     __ adds(sum_lo, sum_lo, rscratch1);
7200     __ adc(sum_hi, sum_hi, rscratch2);
7201   }
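     
       // In C, approximately (a sketch using a 128-bit type for clarity):
       //
       //   void wide_mul(julong &lo, julong &hi, julong n, julong m) {
       //     unsigned __int128 p = (unsigned __int128)n * m;
       //     lo = (julong)p;  hi = (julong)(p >> 64);
       //   }
       //   void wide_madd(julong &lo, julong &hi, julong n, julong m) {
       //     unsigned __int128 s = ((unsigned __int128)hi << 64 | lo)
       //                         + (unsigned __int128)n * m;
       //     lo = (julong)s;  hi = (julong)(s >> 64);
       //   }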
7202 
7203   // Poly1305, RFC 7539
7204 
7205   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
7206   // description of the tricks used to simplify and accelerate this
7207   // computation.
7208 
7209   address generate_poly1305_processBlocks() {
7210     __ align(CodeEntryAlignment);
7211     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
7212     address start = __ pc();
7213     Label here;
7214     __ enter();
7215     RegSet callee_saved = RegSet::range(r19, r28);
7216     __ push(callee_saved, sp);
7217 
7218     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
7219 
7220     // Arguments
7221     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
7222 
7223     // R_n is the 128-bit randomly-generated key, packed into two
7224     // registers.  The caller passes this key to us as long[5], with
7225     // BITS_PER_LIMB = 26.
7226     const Register R_0 = *++regs, R_1 = *++regs;
7227     pack_26(R_0, R_1, r_start);
7228 
7229     // RR_n is (R_n >> 2) * 5
7230     const Register RR_0 = *++regs, RR_1 = *++regs;
7231     __ lsr(RR_0, R_0, 2);
7232     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
7233     __ lsr(RR_1, R_1, 2);
7234     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
7235 
7236     // U_n is the current checksum
7237     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
7238     pack_26(U_0, U_1, U_2, acc_start);
7239 
7240     static constexpr int BLOCK_LENGTH = 16;
7241     Label DONE, LOOP;
7242 
7243     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7244     __ br(Assembler::LT, DONE); {
7245       __ bind(LOOP);
7246 
7247       // S_n is to be the sum of U_n and the next block of data
7248       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7249       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
7250       __ adds(S_0, U_0, S_0);
7251       __ adcs(S_1, U_1, S_1);
7252       __ adc(S_2, U_2, zr);
7253       __ add(S_2, S_2, 1);
7254 
7255       const Register U_0HI = *++regs, U_1HI = *++regs;
7256 
7257       // NB: this logic depends on some of the special properties of
7258       // Poly1305 keys. In particular, because we know that the top
7259       // four bits of R_0 and R_1 are zero, we can add together
7260       // partial products without any risk of needing to propagate a
7261       // carry out.
7262       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
7263       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
7264       __ andr(U_2, R_0, 3);
7265       __ mul(U_2, S_2, U_2);
7266 
7267       // Recycle registers S_0, S_1, S_2
7268       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
7269 
7270       // Partial reduction mod 2**130 - 5
7271       __ adds(U_1, U_0HI, U_1);
7272       __ adc(U_2, U_1HI, U_2);
7273       // Sum now in U_2:U_1:U_0.
7274       // Dead: U_0HI, U_1HI.
7275       regs = (regs.remaining() + U_0HI + U_1HI).begin();
7276 
7277       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
7278 
7279       // First, U_2:U_1:U_0 += (U_2 >> 2)
7280       __ lsr(rscratch1, U_2, 2);
7281       __ andr(U_2, U_2, (u8)3);
7282       __ adds(U_0, U_0, rscratch1);
7283       __ adcs(U_1, U_1, zr);
7284       __ adc(U_2, U_2, zr);
7285       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
7286       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
7287       __ adcs(U_1, U_1, zr);
7288       __ adc(U_2, U_2, zr);
7289 
7290       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
7291       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7292       __ br(~ Assembler::LT, LOOP);
7293     }
7294 
7295     // Further reduce modulo 2^130 - 5
7296     __ lsr(rscratch1, U_2, 2);
7297     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
7298     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
7299     __ adcs(U_1, U_1, zr);
7300     __ andr(U_2, U_2, (u1)3);
7301     __ adc(U_2, U_2, zr);
7302 
7303     // Unpack the sum into five 26-bit limbs and write to memory.
7304     __ ubfiz(rscratch1, U_0, 0, 26);
7305     __ ubfx(rscratch2, U_0, 26, 26);
7306     __ stp(rscratch1, rscratch2, Address(acc_start));
7307     __ ubfx(rscratch1, U_0, 52, 12);
7308     __ bfi(rscratch1, U_1, 12, 14);
7309     __ ubfx(rscratch2, U_1, 14, 26);
7310     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
7311     __ ubfx(rscratch1, U_1, 40, 24);
7312     __ bfi(rscratch1, U_2, 24, 3);
7313     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
7314 
7315     __ bind(DONE);
7316     __ pop(callee_saved, sp);
7317     __ leave();
7318     __ ret(lr);
7319 
7320     return start;
7321   }
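     
       // In C, approximately, the per-block update performed by the loop above
       // (a sketch written with an imaginary wide integer type; the loop keeps
       // U only partially reduced, and the full reduction modulo 2^130 - 5
       // happens after the loop):
       //
       //   while (length >= BLOCK_LENGTH) {
       //     S = U + next_16_byte_block + 2^128;     // the Poly1305 padding bit
       //     U = (S * R) mod (2^130 - 5);            // done via the (U >> 2) * 5 trick
       //     length -= BLOCK_LENGTH;
       //   }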
7322 
7323   // exception handler for upcall stubs
7324   address generate_upcall_stub_exception_handler() {
7325     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
7326     address start = __ pc();
7327 
7328     // Native caller has no idea how to handle exceptions,
7329     // so we just crash here. Up to callee to catch exceptions.
7330     __ verify_oop(r0);
7331     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
7332     __ blr(rscratch1);
7333     __ should_not_reach_here();
7334 
7335     return start;
7336   }
7337 
7338 #undef __
7339 #define __ masm->
7340 
7341   class MontgomeryMultiplyGenerator : public MacroAssembler {
7342 
7343     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7344       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7345 
7346     RegSet _toSave;
7347     bool _squaring;
7348 
7349   public:
7350     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7351       : MacroAssembler(as->code()), _squaring(squaring) {
7352 
7353       // Register allocation
7354 
7355       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7356       Pa_base = *regs;       // Argument registers
7357       if (squaring)
7358         Pb_base = Pa_base;
7359       else
7360         Pb_base = *++regs;
7361       Pn_base = *++regs;
7362       Rlen = *++regs;
7363       inv = *++regs;
7364       Pm_base = *++regs;
7365 
7366                           // Working registers:
7367       Ra =  *++regs;        // The current digit of a, b, n, and m.
7368       Rb =  *++regs;
7369       Rm =  *++regs;
7370       Rn =  *++regs;
7371 
7372       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7373       Pb =  *++regs;
7374       Pm =  *++regs;
7375       Pn =  *++regs;
7376 
7377       t0 =  *++regs;        // Three registers which form a
7378       t1 =  *++regs;        // triple-precision accumulator.
7379       t2 =  *++regs;
7380 
7381       Ri =  *++regs;        // Inner and outer loop indexes.
7382       Rj =  *++regs;
7383 
7384       Rhi_ab = *++regs;     // Product registers: low and high parts
7385       Rlo_ab = *++regs;     // of a*b and m*n.
7386       Rhi_mn = *++regs;
7387       Rlo_mn = *++regs;
7388 
7389       // r19 and up are callee-saved.
7390       _toSave = RegSet::range(r19, *regs) + Pm_base;
7391     }
7392 
7393   private:
7394     void save_regs() {
7395       push(_toSave, sp);
7396     }
7397 
7398     void restore_regs() {
7399       pop(_toSave, sp);
7400     }
7401 
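         // unroll_2 runs 'block' count times, emitting two copies of the block
         // per loop iteration; an odd count first branches into the second
         // copy so that the remaining passes execute the block in pairs.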
7402     template <typename T>
7403     void unroll_2(Register count, T block) {
7404       Label loop, end, odd;
7405       tbnz(count, 0, odd);
7406       cbz(count, end);
7407       align(16);
7408       bind(loop);
7409       (this->*block)();
7410       bind(odd);
7411       (this->*block)();
7412       subs(count, count, 2);
7413       br(Assembler::GT, loop);
7414       bind(end);
7415     }
7416 
7417     template <typename T>
7418     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7419       Label loop, end, odd;
7420       tbnz(count, 0, odd);
7421       cbz(count, end);
7422       align(16);
7423       bind(loop);
7424       (this->*block)(d, s, tmp);
7425       bind(odd);
7426       (this->*block)(d, s, tmp);
7427       subs(count, count, 2);
7428       br(Assembler::GT, loop);
7429       bind(end);
7430     }
7431 
7432     void pre1(RegisterOrConstant i) {
7433       block_comment("pre1");
7434       // Pa = Pa_base;
7435       // Pb = Pb_base + i;
7436       // Pm = Pm_base;
7437       // Pn = Pn_base + i;
7438       // Ra = *Pa;
7439       // Rb = *Pb;
7440       // Rm = *Pm;
7441       // Rn = *Pn;
7442       ldr(Ra, Address(Pa_base));
7443       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7444       ldr(Rm, Address(Pm_base));
7445       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7446       lea(Pa, Address(Pa_base));
7447       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7448       lea(Pm, Address(Pm_base));
7449       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7450 
7451       // Zero the m*n result.
7452       mov(Rhi_mn, zr);
7453       mov(Rlo_mn, zr);
7454     }
7455 
7456     // The core multiply-accumulate step of a Montgomery
7457     // multiplication.  The idea is to schedule operations as a
7458     // pipeline so that instructions with long latencies (loads and
7459     // multiplies) have time to complete before their results are
7460     // used.  This benefits in-order implementations of the architecture
7461     // the most, but out-of-order ones also benefit.
7462     void step() {
7463       block_comment("step");
7464       // MACC(Ra, Rb, t0, t1, t2);
7465       // Ra = *++Pa;
7466       // Rb = *--Pb;
7467       umulh(Rhi_ab, Ra, Rb);
7468       mul(Rlo_ab, Ra, Rb);
7469       ldr(Ra, pre(Pa, wordSize));
7470       ldr(Rb, pre(Pb, -wordSize));
7471       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7472                                        // previous iteration.
7473       // MACC(Rm, Rn, t0, t1, t2);
7474       // Rm = *++Pm;
7475       // Rn = *--Pn;
7476       umulh(Rhi_mn, Rm, Rn);
7477       mul(Rlo_mn, Rm, Rn);
7478       ldr(Rm, pre(Pm, wordSize));
7479       ldr(Rn, pre(Pn, -wordSize));
7480       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7481     }
7482 
7483     void post1() {
7484       block_comment("post1");
7485 
7486       // MACC(Ra, Rb, t0, t1, t2);
7487       // Ra = *++Pa;
7488       // Rb = *--Pb;
7489       umulh(Rhi_ab, Ra, Rb);
7490       mul(Rlo_ab, Ra, Rb);
7491       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7492       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7493 
7494       // *Pm = Rm = t0 * inv;
7495       mul(Rm, t0, inv);
7496       str(Rm, Address(Pm));
7497 
7498       // MACC(Rm, Rn, t0, t1, t2);
7499       // t0 = t1; t1 = t2; t2 = 0;
7500       umulh(Rhi_mn, Rm, Rn);
7501 
7502 #ifndef PRODUCT
7503       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7504       {
7505         mul(Rlo_mn, Rm, Rn);
7506         add(Rlo_mn, t0, Rlo_mn);
7507         Label ok;
7508         cbz(Rlo_mn, ok); {
7509           stop("broken Montgomery multiply");
7510         } bind(ok);
7511       }
7512 #endif
7513       // We have very carefully set things up so that
7514       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7515       // the lower half of Rm * Rn because we know the result already:
7516       // it must be -t0.  t0 + (-t0) must generate a carry iff
7517       // t0 != 0.  So, rather than do a mul and an adds we just set
7518       // the carry flag iff t0 is nonzero.
7519       //
7520       // mul(Rlo_mn, Rm, Rn);
7521       // adds(zr, t0, Rlo_mn);
7522       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7523       adcs(t0, t1, Rhi_mn);
7524       adc(t1, t2, zr);
7525       mov(t2, zr);
7526     }
7527 
7528     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7529       block_comment("pre2");
7530       // Pa = Pa_base + i-len;
7531       // Pb = Pb_base + len;
7532       // Pm = Pm_base + i-len;
7533       // Pn = Pn_base + len;
7534 
7535       if (i.is_register()) {
7536         sub(Rj, i.as_register(), len);
7537       } else {
7538         mov(Rj, i.as_constant());
7539         sub(Rj, Rj, len);
7540       }
7541       // Rj == i-len
7542 
7543       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7544       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7545       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7546       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7547 
7548       // Ra = *++Pa;
7549       // Rb = *--Pb;
7550       // Rm = *++Pm;
7551       // Rn = *--Pn;
7552       ldr(Ra, pre(Pa, wordSize));
7553       ldr(Rb, pre(Pb, -wordSize));
7554       ldr(Rm, pre(Pm, wordSize));
7555       ldr(Rn, pre(Pn, -wordSize));
7556 
7557       mov(Rhi_mn, zr);
7558       mov(Rlo_mn, zr);
7559     }
7560 
7561     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7562       block_comment("post2");
7563       if (i.is_constant()) {
7564         mov(Rj, i.as_constant()-len.as_constant());
7565       } else {
7566         sub(Rj, i.as_register(), len);
7567       }
7568 
7569       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7570 
7571       // As soon as we know the least significant digit of our result,
7572       // store it.
7573       // Pm_base[i-len] = t0;
7574       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7575 
7576       // t0 = t1; t1 = t2; t2 = 0;
7577       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7578       adc(t1, t2, zr);
7579       mov(t2, zr);
7580     }
7581 
7582     // A carry in t0 after Montgomery multiplication means that we
7583     // should subtract multiples of n from our result in m.  We'll
7584     // keep doing that until there is no carry.
7585     void normalize(RegisterOrConstant len) {
7586       block_comment("normalize");
7587       // while (t0)
7588       //   t0 = sub(Pm_base, Pn_base, t0, len);
7589       Label loop, post, again;
7590       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7591       cbz(t0, post); {
7592         bind(again); {
7593           mov(i, zr);
7594           mov(cnt, len);
7595           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7596           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7597           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7598           align(16);
7599           bind(loop); {
7600             sbcs(Rm, Rm, Rn);
7601             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7602             add(i, i, 1);
7603             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7604             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7605             sub(cnt, cnt, 1);
7606           } cbnz(cnt, loop);
7607           sbc(t0, t0, zr);
7608         } cbnz(t0, again);
7609       } bind(post);
7610     }
7611 
7612     // Move memory at s to d, reversing words.
7613     //    Increments d to end of copied memory
7614     //    Destroys tmp1, tmp2
7615     //    Preserves len
7616     //    Leaves s pointing to the address which was in d at start
7617     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7618       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7619       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7620 
7621       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7622       mov(tmp1, len);
7623       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7624       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7625     }
7626     // where
7627     void reverse1(Register d, Register s, Register tmp) {
7628       ldr(tmp, pre(s, -wordSize));
7629       ror(tmp, tmp, 32);
7630       str(tmp, post(d, wordSize));
7631     }
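     
         // In C, approximately, what reverse/reverse1 do (a sketch;
         // rotate_right_64 is just notation for a 64-bit rotate, which swaps
         // the two 32-bit halves because the caller's digits are ints stored
         // in the opposite order):
         //
         //   for (int i = 0; i < len; i++)
         //     d[i] = rotate_right_64(s[len - 1 - i], 32);
         //   s = old d;  d = old d + len;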
7632 
7633     void step_squaring() {
7634       // An extra ACC
7635       step();
7636       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7637     }
7638 
7639     void last_squaring(RegisterOrConstant i) {
7640       Label dont;
7641       // if ((i & 1) == 0) {
7642       tbnz(i.as_register(), 0, dont); {
7643         // MACC(Ra, Rb, t0, t1, t2);
7644         // Ra = *++Pa;
7645         // Rb = *--Pb;
7646         umulh(Rhi_ab, Ra, Rb);
7647         mul(Rlo_ab, Ra, Rb);
7648         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7649       } bind(dont);
7650     }
7651 
7652     void extra_step_squaring() {
7653       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7654 
7655       // MACC(Rm, Rn, t0, t1, t2);
7656       // Rm = *++Pm;
7657       // Rn = *--Pn;
7658       umulh(Rhi_mn, Rm, Rn);
7659       mul(Rlo_mn, Rm, Rn);
7660       ldr(Rm, pre(Pm, wordSize));
7661       ldr(Rn, pre(Pn, -wordSize));
7662     }
7663 
7664     void post1_squaring() {
7665       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7666 
7667       // *Pm = Rm = t0 * inv;
7668       mul(Rm, t0, inv);
7669       str(Rm, Address(Pm));
7670 
7671       // MACC(Rm, Rn, t0, t1, t2);
7672       // t0 = t1; t1 = t2; t2 = 0;
7673       umulh(Rhi_mn, Rm, Rn);
7674 
7675 #ifndef PRODUCT
7676       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7677       {
7678         mul(Rlo_mn, Rm, Rn);
7679         add(Rlo_mn, t0, Rlo_mn);
7680         Label ok;
7681         cbz(Rlo_mn, ok); {
7682           stop("broken Montgomery multiply");
7683         } bind(ok);
7684       }
7685 #endif
7686       // We have very carefully set things up so that
7687       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7688       // the lower half of Rm * Rn because we know the result already:
7689       // it must be -t0.  t0 + (-t0) must generate a carry iff
7690       // t0 != 0.  So, rather than do a mul and an adds we just set
7691       // the carry flag iff t0 is nonzero.
7692       //
7693       // mul(Rlo_mn, Rm, Rn);
7694       // adds(zr, t0, Rlo_mn);
7695       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7696       adcs(t0, t1, Rhi_mn);
7697       adc(t1, t2, zr);
7698       mov(t2, zr);
7699     }
7700 
7701     void acc(Register Rhi, Register Rlo,
7702              Register t0, Register t1, Register t2) {
7703       adds(t0, t0, Rlo);
7704       adcs(t1, t1, Rhi);
7705       adc(t2, t2, zr);
7706     }
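     
         // In C, approximately: add the 128-bit product Rhi:Rlo into the
         // triple-precision accumulator t2:t1:t0 (a sketch):
         //
         //   (t2:t1:t0) += (Rhi:Rlo);   // 192-bit += 128-bit, with carry propagation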
7707 
7708   public:
7709     /**
7710      * Fast Montgomery multiplication.  The derivation of the
7711      * algorithm is in A Cryptographic Library for the Motorola
7712      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7713      *
7714      * Arguments:
7715      *
7716      * Inputs for multiplication:
7717      *   c_rarg0   - int array elements a
7718      *   c_rarg1   - int array elements b
7719      *   c_rarg2   - int array elements n (the modulus)
7720      *   c_rarg3   - int length
7721      *   c_rarg4   - int inv
7722      *   c_rarg5   - int array elements m (the result)
7723      *
7724      * Inputs for squaring:
7725      *   c_rarg0   - int array elements a
7726      *   c_rarg1   - int array elements n (the modulus)
7727      *   c_rarg2   - int length
7728      *   c_rarg3   - int inv
7729      *   c_rarg4   - int array elements m (the result)
7730      *
7731      */
7732     address generate_multiply() {
7733       Label argh, nothing;
7734       bind(argh);
7735       stop("MontgomeryMultiply total_allocation must be <= 8192");
7736 
7737       align(CodeEntryAlignment);
7738       address entry = pc();
7739 
7740       cbzw(Rlen, nothing);
7741 
7742       enter();
7743 
7744       // Make room.
7745       cmpw(Rlen, 512);
7746       br(Assembler::HI, argh);
7747       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7748       andr(sp, Ra, -2 * wordSize);
7749 
7750       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7751 
7752       {
7753         // Copy input args, reversing as we go.  We use Ra as a
7754         // temporary variable.
7755         reverse(Ra, Pa_base, Rlen, t0, t1);
7756         if (!_squaring)
7757           reverse(Ra, Pb_base, Rlen, t0, t1);
7758         reverse(Ra, Pn_base, Rlen, t0, t1);
7759       }
7760 
7761       // Push all callee-saved registers and also Pm_base, which we'll need
7762       // at the end.
7763       save_regs();
7764 
7765 #ifndef PRODUCT
7766       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7767       {
7768         ldr(Rn, Address(Pn_base, 0));
7769         mul(Rlo_mn, Rn, inv);
7770         subs(zr, Rlo_mn, -1);
7771         Label ok;
7772         br(EQ, ok); {
7773           stop("broken inverse in Montgomery multiply");
7774         } bind(ok);
7775       }
7776 #endif
7777 
7778       mov(Pm_base, Ra);
7779 
7780       mov(t0, zr);
7781       mov(t1, zr);
7782       mov(t2, zr);
7783 
7784       block_comment("for (int i = 0; i < len; i++) {");
7785       mov(Ri, zr); {
7786         Label loop, end;
7787         cmpw(Ri, Rlen);
7788         br(Assembler::GE, end);
7789 
7790         bind(loop);
7791         pre1(Ri);
7792 
7793         block_comment("  for (j = i; j; j--) {"); {
7794           movw(Rj, Ri);
7795           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7796         } block_comment("  } // j");
7797 
7798         post1();
7799         addw(Ri, Ri, 1);
7800         cmpw(Ri, Rlen);
7801         br(Assembler::LT, loop);
7802         bind(end);
7803         block_comment("} // i");
7804       }
7805 
7806       block_comment("for (int i = len; i < 2*len; i++) {");
7807       mov(Ri, Rlen); {
7808         Label loop, end;
7809         cmpw(Ri, Rlen, Assembler::LSL, 1);
7810         br(Assembler::GE, end);
7811 
7812         bind(loop);
7813         pre2(Ri, Rlen);
7814 
7815         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7816           lslw(Rj, Rlen, 1);
7817           subw(Rj, Rj, Ri);
7818           subw(Rj, Rj, 1);
7819           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7820         } block_comment("  } // j");
7821 
7822         post2(Ri, Rlen);
7823         addw(Ri, Ri, 1);
7824         cmpw(Ri, Rlen, Assembler::LSL, 1);
7825         br(Assembler::LT, loop);
7826         bind(end);
7827       }
7828       block_comment("} // i");
7829 
7830       normalize(Rlen);
7831 
7832       mov(Ra, Pm_base);  // Save Pm_base in Ra
7833       restore_regs();  // Restore caller's Pm_base
7834 
7835       // Copy our result into caller's Pm_base
7836       reverse(Pm_base, Ra, Rlen, t0, t1);
7837 
7838       leave();
7839       bind(nothing);
7840       ret(lr);
7841 
7842       return entry;
7843     }
7844     // In C, approximately:
7845 
7846     // void
7847     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7848     //                     julong Pn_base[], julong Pm_base[],
7849     //                     julong inv, int len) {
7850     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7851     //   julong *Pa, *Pb, *Pn, *Pm;
7852     //   julong Ra, Rb, Rn, Rm;
7853 
7854     //   int i;
7855 
7856     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7857 
7858     //   for (i = 0; i < len; i++) {
7859     //     int j;
7860 
7861     //     Pa = Pa_base;
7862     //     Pb = Pb_base + i;
7863     //     Pm = Pm_base;
7864     //     Pn = Pn_base + i;
7865 
7866     //     Ra = *Pa;
7867     //     Rb = *Pb;
7868     //     Rm = *Pm;
7869     //     Rn = *Pn;
7870 
7871     //     int iters = i;
7872     //     for (j = 0; iters--; j++) {
7873     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7874     //       MACC(Ra, Rb, t0, t1, t2);
7875     //       Ra = *++Pa;
7876     //       Rb = *--Pb;
7877     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7878     //       MACC(Rm, Rn, t0, t1, t2);
7879     //       Rm = *++Pm;
7880     //       Rn = *--Pn;
7881     //     }
7882 
7883     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7884     //     MACC(Ra, Rb, t0, t1, t2);
7885     //     *Pm = Rm = t0 * inv;
7886     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7887     //     MACC(Rm, Rn, t0, t1, t2);
7888 
7889     //     assert(t0 == 0, "broken Montgomery multiply");
7890 
7891     //     t0 = t1; t1 = t2; t2 = 0;
7892     //   }
7893 
7894     //   for (i = len; i < 2*len; i++) {
7895     //     int j;
7896 
7897     //     Pa = Pa_base + i-len;
7898     //     Pb = Pb_base + len;
7899     //     Pm = Pm_base + i-len;
7900     //     Pn = Pn_base + len;
7901 
7902     //     Ra = *++Pa;
7903     //     Rb = *--Pb;
7904     //     Rm = *++Pm;
7905     //     Rn = *--Pn;
7906 
7907     //     int iters = len*2-i-1;
7908     //     for (j = i-len+1; iters--; j++) {
7909     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7910     //       MACC(Ra, Rb, t0, t1, t2);
7911     //       Ra = *++Pa;
7912     //       Rb = *--Pb;
7913     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7914     //       MACC(Rm, Rn, t0, t1, t2);
7915     //       Rm = *++Pm;
7916     //       Rn = *--Pn;
7917     //     }
7918 
7919     //     Pm_base[i-len] = t0;
7920     //     t0 = t1; t1 = t2; t2 = 0;
7921     //   }
7922 
7923     //   while (t0)
7924     //     t0 = sub(Pm_base, Pn_base, t0, len);
7925     // }
7926 
7927     /**
7928      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7929      * multiplies than Montgomery multiplication so it should be up to
7930      * 25% faster.  However, its loop control is more complex and it
7931      * may actually run slower on some machines.
7932      *
7933      * Arguments:
7934      *
7935      * Inputs:
7936      *   c_rarg0   - int array elements a
7937      *   c_rarg1   - int array elements n (the modulus)
7938      *   c_rarg2   - int length
7939      *   c_rarg3   - int inv
7940      *   c_rarg4   - int array elements m (the result)
7941      *
7942      */
7943     address generate_square() {
7944       Label argh;
7945       bind(argh);
7946       stop("MontgomeryMultiply total_allocation must be <= 8192");
7947 
7948       align(CodeEntryAlignment);
7949       address entry = pc();
7950 
7951       enter();
7952 
7953       // Make room.
7954       cmpw(Rlen, 512);
7955       br(Assembler::HI, argh);
7956       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7957       andr(sp, Ra, -2 * wordSize);
7958 
7959       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7960 
7961       {
7962         // Copy input args, reversing as we go.  We use Ra as a
7963         // temporary variable.
7964         reverse(Ra, Pa_base, Rlen, t0, t1);
7965         reverse(Ra, Pn_base, Rlen, t0, t1);
7966       }
7967 
7968       // Push all callee-saved registers and also Pm_base, which we'll need
7969       // at the end.
7970       save_regs();
7971 
7972       mov(Pm_base, Ra);
7973 
7974       mov(t0, zr);
7975       mov(t1, zr);
7976       mov(t2, zr);
7977 
7978       block_comment("for (int i = 0; i < len; i++) {");
7979       mov(Ri, zr); {
7980         Label loop, end;
7981         bind(loop);
7982         cmp(Ri, Rlen);
7983         br(Assembler::GE, end);
7984 
7985         pre1(Ri);
7986 
7987         block_comment("for (j = (i+1)/2; j; j--) {"); {
7988           add(Rj, Ri, 1);
7989           lsr(Rj, Rj, 1);
7990           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7991         } block_comment("  } // j");
7992 
7993         last_squaring(Ri);
7994 
7995         block_comment("  for (j = i/2; j; j--) {"); {
7996           lsr(Rj, Ri, 1);
7997           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7998         } block_comment("  } // j");
7999 
8000         post1_squaring();
8001         add(Ri, Ri, 1);
8002         cmp(Ri, Rlen);
8003         br(Assembler::LT, loop);
8004 
8005         bind(end);
8006         block_comment("} // i");
8007       }
8008 
8009       block_comment("for (int i = len; i < 2*len; i++) {");
8010       mov(Ri, Rlen); {
8011         Label loop, end;
8012         bind(loop);
8013         cmp(Ri, Rlen, Assembler::LSL, 1);
8014         br(Assembler::GE, end);
8015 
8016         pre2(Ri, Rlen);
8017 
8018         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
8019           lsl(Rj, Rlen, 1);
8020           sub(Rj, Rj, Ri);
8021           sub(Rj, Rj, 1);
8022           lsr(Rj, Rj, 1);
8023           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8024         } block_comment("  } // j");
8025 
8026         last_squaring(Ri);
8027 
8028         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
8029           lsl(Rj, Rlen, 1);
8030           sub(Rj, Rj, Ri);
8031           lsr(Rj, Rj, 1);
8032           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8033         } block_comment("  } // j");
8034 
8035         post2(Ri, Rlen);
8036         add(Ri, Ri, 1);
8037         cmp(Ri, Rlen, Assembler::LSL, 1);
8038 
8039         br(Assembler::LT, loop);
8040         bind(end);
8041         block_comment("} // i");
8042       }
8043 
8044       normalize(Rlen);
8045 
8046       mov(Ra, Pm_base);  // Save Pm_base in Ra
8047       restore_regs();  // Restore caller's Pm_base
8048 
8049       // Copy our result into caller's Pm_base
8050       reverse(Pm_base, Ra, Rlen, t0, t1);
8051 
8052       leave();
8053       ret(lr);
8054 
8055       return entry;
8056     }
8057     // In C, approximately:
8058 
8059     // void
8060     // montgomery_square(julong Pa_base[], julong Pn_base[],
8061     //                   julong Pm_base[], julong inv, int len) {
8062     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8063     //   julong *Pa, *Pb, *Pn, *Pm;
8064     //   julong Ra, Rb, Rn, Rm;
8065 
8066     //   int i;
8067 
8068     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8069 
8070     //   for (i = 0; i < len; i++) {
8071     //     int j;
8072 
8073     //     Pa = Pa_base;
8074     //     Pb = Pa_base + i;
8075     //     Pm = Pm_base;
8076     //     Pn = Pn_base + i;
8077 
8078     //     Ra = *Pa;
8079     //     Rb = *Pb;
8080     //     Rm = *Pm;
8081     //     Rn = *Pn;
8082 
8083     //     int iters = (i+1)/2;
8084     //     for (j = 0; iters--; j++) {
8085     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8086     //       MACC2(Ra, Rb, t0, t1, t2);
8087     //       Ra = *++Pa;
8088     //       Rb = *--Pb;
8089     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8090     //       MACC(Rm, Rn, t0, t1, t2);
8091     //       Rm = *++Pm;
8092     //       Rn = *--Pn;
8093     //     }
8094     //     if ((i & 1) == 0) {
8095     //       assert(Ra == Pa_base[j], "must be");
8096     //       MACC(Ra, Ra, t0, t1, t2);
8097     //     }
8098     //     iters = i/2;
8099     //     assert(iters == i-j, "must be");
8100     //     for (; iters--; j++) {
8101     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8102     //       MACC(Rm, Rn, t0, t1, t2);
8103     //       Rm = *++Pm;
8104     //       Rn = *--Pn;
8105     //     }
8106 
8107     //     *Pm = Rm = t0 * inv;
8108     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8109     //     MACC(Rm, Rn, t0, t1, t2);
8110 
8111     //     assert(t0 == 0, "broken Montgomery multiply");
8112 
8113     //     t0 = t1; t1 = t2; t2 = 0;
8114     //   }
8115 
8116     //   for (i = len; i < 2*len; i++) {
8117     //     int start = i-len+1;
8118     //     int end = start + (len - start)/2;
8119     //     int j;
8120 
8121     //     Pa = Pa_base + i-len;
8122     //     Pb = Pa_base + len;
8123     //     Pm = Pm_base + i-len;
8124     //     Pn = Pn_base + len;
8125 
8126     //     Ra = *++Pa;
8127     //     Rb = *--Pb;
8128     //     Rm = *++Pm;
8129     //     Rn = *--Pn;
8130 
8131     //     int iters = (2*len-i-1)/2;
8132     //     assert(iters == end-start, "must be");
8133     //     for (j = start; iters--; j++) {
8134     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8135     //       MACC2(Ra, Rb, t0, t1, t2);
8136     //       Ra = *++Pa;
8137     //       Rb = *--Pb;
8138     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8139     //       MACC(Rm, Rn, t0, t1, t2);
8140     //       Rm = *++Pm;
8141     //       Rn = *--Pn;
8142     //     }
8143     //     if ((i & 1) == 0) {
8144     //       assert(Ra == Pa_base[j], "must be");
8145     //       MACC(Ra, Ra, t0, t1, t2);
8146     //     }
8147     //     iters =  (2*len-i)/2;
8148     //     assert(iters == len-j, "must be");
8149     //     for (; iters--; j++) {
8150     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8151     //       MACC(Rm, Rn, t0, t1, t2);
8152     //       Rm = *++Pm;
8153     //       Rn = *--Pn;
8154     //     }
8155     //     Pm_base[i-len] = t0;
8156     //     t0 = t1; t1 = t2; t2 = 0;
8157     //   }
8158 
8159     //   while (t0)
8160     //     t0 = sub(Pm_base, Pn_base, t0, len);
8161     // }
8162   };
8163 
8164 
8165   // Initialization
8166   void generate_initial_stubs() {
8167     // Generate initial stubs and initialize the entry points
8168 
8169     // Entry points that exist on all platforms. Note: This is code
8170     // that could be shared among different platforms - however the
8171     // benefit seems to be smaller than the disadvantage of having a
8172     // much more complicated generator structure. See also comment in
8173     // stubRoutines.hpp.
8174 
8175     StubRoutines::_forward_exception_entry = generate_forward_exception();
8176 
8177     StubRoutines::_call_stub_entry =
8178       generate_call_stub(StubRoutines::_call_stub_return_address);
8179 
8180     // is referenced by megamorphic call
8181     StubRoutines::_catch_exception_entry = generate_catch_exception();
8182 
8183     // Initialize table for copy memory (arraycopy) check.
8184     if (UnsafeMemoryAccess::_table == nullptr) {
8185       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
8186     }
8187 
8188     if (UseCRC32Intrinsics) {
8189       // Set the table address before generating the stubs that use it
8190       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8191       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8192     }
8193 
8194     if (UseCRC32CIntrinsics) {
8195       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8196     }
8197 
8198     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8199       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8200     }
8201 
8202     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8203       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8204     }
8205 
8206     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
8207         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
8208       StubRoutines::_hf2f = generate_float16ToFloat();
8209       StubRoutines::_f2hf = generate_floatToFloat16();
8210     }
8211   }
8212 
8213   void generate_continuation_stubs() {
8214     // Continuation stubs:
8215     StubRoutines::_cont_thaw          = generate_cont_thaw();
8216     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8217     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8218   }
8219 
8220   void generate_final_stubs() {
8221     // support for verify_oop (must happen after universe_init)
8222     if (VerifyOops) {
8223       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8224     }
8225 
8226     // arraycopy stubs used by compilers
8227     generate_arraycopy_stubs();
8228 
8229     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8230     if (bs_nm != nullptr) {
8231       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
8232     }
8233 
8234     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8235 
8236     if (UsePoly1305Intrinsics) {
8237       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
8238     }
8239 
8240 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8241 
8242     generate_atomic_entry_points();
8243 
8244 #endif // LINUX
8245 
8246 #ifdef COMPILER2
8247     if (UseSecondarySupersTable) {
8248       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
8249       if (! InlineSecondarySupersTest) {
8250         for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
8251           StubRoutines::_lookup_secondary_supers_table_stubs[slot]
8252             = generate_lookup_secondary_supers_table_stub(slot);
8253         }
8254       }
8255     }
8256 #endif
8257 
8258     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
8259 
8260     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
8261   }
8262 
8263   void generate_compiler_stubs() {
8264 #if COMPILER2_OR_JVMCI
8265 
8266     if (UseSVE == 0) {
8267       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
8268     }
8269 
8270     // array equals stub for large arrays.
8271     if (!UseSimpleArrayEquals) {
8272       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8273     }
8274 
8275     // byte_array_inflate stub for large arrays.
8276     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8277 
8278     // countPositives stub for large arrays.
8279     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8280 
8281     generate_compare_long_strings();
8282 
8283     generate_string_indexof_stubs();
8284 
8285 #ifdef COMPILER2
8286     if (UseMultiplyToLenIntrinsic) {
8287       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8288     }
8289 
8290     if (UseSquareToLenIntrinsic) {
8291       StubRoutines::_squareToLen = generate_squareToLen();
8292     }
8293 
8294     if (UseMulAddIntrinsic) {
8295       StubRoutines::_mulAdd = generate_mulAdd();
8296     }
8297 
8298     if (UseSIMDForBigIntegerShiftIntrinsics) {
8299       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8300       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8301     }
8302 
8303     if (UseMontgomeryMultiplyIntrinsic) {
8304       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
8305       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8306       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8307     }
8308 
8309     if (UseMontgomerySquareIntrinsic) {
8310       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
8311       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8312       // We use generate_multiply() rather than generate_square()
8313       // because it's faster for the sizes of modulus we care about.
8314       StubRoutines::_montgomerySquare = g.generate_multiply();
8315     }
8316 #endif // COMPILER2
8317 
8318     if (UseChaCha20Intrinsics) {
8319       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
8320     }
8321 
8322     if (UseBASE64Intrinsics) {
8323         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8324         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8325     }
8326 
8327     // data cache line writeback
8328     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8329     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8330 
8331     if (UseAESIntrinsics) {
8332       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8333       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8334       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8335       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8336       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8337     }
8338     if (UseGHASHIntrinsics) {
8339       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8340       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8341     }
8342     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8343       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8344     }
8345 
8346     if (UseMD5Intrinsics) {
8347       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
8348       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
8349     }
8350     if (UseSHA1Intrinsics) {
8351       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
8352       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
8353     }
8354     if (UseSHA256Intrinsics) {
8355       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
8356       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
8357     }
8358     if (UseSHA512Intrinsics) {
8359       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
8360       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
8361     }
8362     if (UseSHA3Intrinsics) {
8363       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
8364       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
8365     }
8366 
8367     // generate Adler32 intrinsics code
8368     if (UseAdler32Intrinsics) {
8369       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8370     }
8371 #endif // COMPILER2_OR_JVMCI
8372   }
8373 
8374  public:
8375   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
8376     switch(kind) {
8377     case Initial_stubs:
8378       generate_initial_stubs();
8379       break;
8380      case Continuation_stubs:
8381       generate_continuation_stubs();
8382       break;
8383     case Compiler_stubs:
8384       generate_compiler_stubs();
8385       break;
8386     case Final_stubs:
8387       generate_final_stubs();
8388       break;
8389     default:
8390       fatal("unexpected stubs kind: %d", kind);
8391       break;
8392     };
8393   }
8394 }; // end class declaration
8395 
8396 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
8397   StubGenerator g(code, kind);
8398 }
8399 
8400 
8401 #if defined (LINUX)
8402 
8403 // Define pointers to atomic stubs and initialize them to point to the
8404 // code in atomic_aarch64.S.
8405 
8406 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8407   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8408     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8409   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8410     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
8411 
8412 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8413 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8414 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8415 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8416 DEFAULT_ATOMIC_OP(xchg, 4, )
8417 DEFAULT_ATOMIC_OP(xchg, 8, )
8418 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8419 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8420 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8421 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8422 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8423 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8424 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8425 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8426 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8427 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8428 
8429 #undef DEFAULT_ATOMIC_OP
8430 
8431 #endif // LINUX