1 /*
   2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "asm/register.hpp"
  29 #include "atomic_aarch64.hpp"
  30 #include "code/SCCache.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "prims/upcallLinker.hpp"
  45 #include "runtime/arguments.hpp"
  46 #include "runtime/atomic.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/frame.inline.hpp"
  50 #include "runtime/handles.inline.hpp"
  51 #include "runtime/javaThread.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/stubCodeGenerator.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "utilities/align.hpp"
  56 #include "utilities/checkedCast.hpp"
  57 #include "utilities/debug.hpp"
  58 #include "utilities/globalDefinitions.hpp"
  59 #include "utilities/intpow.hpp"
  60 #include "utilities/powerOfTwo.hpp"
  61 #ifdef COMPILER2
  62 #include "opto/runtime.hpp"
  63 #endif
  64 #if INCLUDE_ZGC
  65 #include "gc/z/zThreadLocalData.hpp"
  66 #endif
  67 
  68 // Declaration and definition of StubGenerator (no .hpp file).
  69 // For a more detailed description of the stub routine structure
  70 // see the comment in stubRoutines.hpp
  71 
  72 #undef __
  73 #define __ _masm->
  74 
  75 #ifdef PRODUCT
  76 #define BLOCK_COMMENT(str) /* nothing */
  77 #else
  78 #define BLOCK_COMMENT(str) __ block_comment(str)
  79 #endif
  80 
  81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  82 
  83 // Stub Code definitions
  84 
  85 class StubGenerator: public StubCodeGenerator {
  86  private:
  87 
  88 #ifdef PRODUCT
  89 #define inc_counter_np(counter) ((void)0)
  90 #else
  91   void inc_counter_np_(uint& counter) {
  92     __ incrementw(ExternalAddress((address)&counter));
  93   }
  94 #define inc_counter_np(counter) \
  95   BLOCK_COMMENT("inc_counter " #counter); \
  96   inc_counter_np_(counter);
  97 #endif
  98 
  99   // Call stubs are used to call Java from C
 100   //
 101   // Arguments:
 102   //    c_rarg0:   call wrapper address                   address
 103   //    c_rarg1:   result                                 address
 104   //    c_rarg2:   result type                            BasicType
 105   //    c_rarg3:   method                                 Method*
 106   //    c_rarg4:   (interpreter) entry point              address
 107   //    c_rarg5:   parameters                             intptr_t*
 108   //    c_rarg6:   parameter size (in words)              int
 109   //    c_rarg7:   thread                                 Thread*
 110   //
 111   // There is no return from the stub itself as any Java result
 112   // is written to result
 113   //
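  // Viewed from C++, the stub therefore behaves like a function with
  // roughly this signature (a sketch derived from the argument list
  // above; the actual typedef is CallStub in stubRoutines.hpp):
  //
  //   void call_stub(address   call_wrapper,
  //                  address   result,
  //                  BasicType result_type,
  //                  Method*   method,
  //                  address   entry_point,
  //                  intptr_t* parameters,
  //                  int       parameter_size_in_words,
  //                  Thread*   thread);
  //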
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, then install sp (r31)
  // into fp.
 117   //
 118   // we save r0-r7, which accounts for all the c arguments.
 119   //
 120   // TODO: strictly do we need to save them all? they are treated as
 121   // volatile by C so could we omit saving the ones we are going to
 122   // place in global registers (thread? method?) or those we only use
 123   // during setup of the Java call?
 124   //
 125   // we don't need to save r8 which C uses as an indirect result location
 126   // return register.
 127   //
 128   // we don't need to save r9-r15 which both C and Java treat as
 129   // volatile
 130   //
  // we don't need to save r16-r18 because Java does not use them
 132   //
 133   // we save r19-r28 which Java uses as scratch registers and C
 134   // expects to be callee-save
 135   //
 136   // we save the bottom 64 bits of each value stored in v8-v15; it is
 137   // the responsibility of the caller to preserve larger values.
 138   //
 139   // so the stub frame looks like this when we enter Java code
 140   //
 141   //     [ return_from_Java     ] <--- sp
 142   //     [ argument word n      ]
 143   //      ...
 144   // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ] <--- sp_after_call
  // -26 [ saved v15            ]
 147   // -25 [ saved v14            ]
 148   // -24 [ saved v13            ]
 149   // -23 [ saved v12            ]
 150   // -22 [ saved v11            ]
 151   // -21 [ saved v10            ]
 152   // -20 [ saved v9             ]
 153   // -19 [ saved v8             ]
 154   // -18 [ saved r28            ]
 155   // -17 [ saved r27            ]
 156   // -16 [ saved r26            ]
 157   // -15 [ saved r25            ]
 158   // -14 [ saved r24            ]
 159   // -13 [ saved r23            ]
 160   // -12 [ saved r22            ]
 161   // -11 [ saved r21            ]
 162   // -10 [ saved r20            ]
 163   //  -9 [ saved r19            ]
 164   //  -8 [ call wrapper    (r0) ]
 165   //  -7 [ result          (r1) ]
 166   //  -6 [ result type     (r2) ]
 167   //  -5 [ method          (r3) ]
 168   //  -4 [ entry point     (r4) ]
 169   //  -3 [ parameters      (r5) ]
 170   //  -2 [ parameter size  (r6) ]
 171   //  -1 [ thread (r7)          ]
 172   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 173   //   1 [ saved lr       (r30) ]
 174 
 175   // Call stub stack layout word offsets from fp
 176   enum call_stub_layout {
 177     sp_after_call_off  = -28,
 178 
 179     fpcr_off           = sp_after_call_off,
 180     d15_off            = -26,
 181     d13_off            = -24,
 182     d11_off            = -22,
 183     d9_off             = -20,
 184 
 185     r28_off            = -18,
 186     r26_off            = -16,
 187     r24_off            = -14,
 188     r22_off            = -12,
 189     r20_off            = -10,
 190     call_wrapper_off   =  -8,
 191     result_off         =  -7,
 192     result_type_off    =  -6,
 193     method_off         =  -5,
 194     entry_point_off    =  -4,
 195     parameter_size_off =  -2,
 196     thread_off         =  -1,
 197     fp_f               =   0,
 198     retaddr_off        =   1,
 199   };
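
  // As a worked example of the table above (with wordSize == 8):
  // sp_after_call and the saved FPCR are at [fp - 224], the saved
  // v15/v14 pair at [fp - 208], the saved r28/r27 pair at [fp - 144],
  // the call wrapper at [fp - 64], thread at [fp - 8], the saved fp at
  // [fp + 0] and the saved lr at [fp + 8].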
 200 
 201   address generate_call_stub(address& return_address) {
 202     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 203            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 204            "adjust this code");
 205 
 206     StubGenStubId stub_id = StubGenStubId::call_stub_id;
 207     StubCodeMark mark(this, stub_id);
 208     address start = __ pc();
 209 
 210     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
 211 
 212     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
 213     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 214     const Address result        (rfp, result_off         * wordSize);
 215     const Address result_type   (rfp, result_type_off    * wordSize);
 216     const Address method        (rfp, method_off         * wordSize);
 217     const Address entry_point   (rfp, entry_point_off    * wordSize);
 218     const Address parameter_size(rfp, parameter_size_off * wordSize);
 219 
 220     const Address thread        (rfp, thread_off         * wordSize);
 221 
 222     const Address d15_save      (rfp, d15_off * wordSize);
 223     const Address d13_save      (rfp, d13_off * wordSize);
 224     const Address d11_save      (rfp, d11_off * wordSize);
 225     const Address d9_save       (rfp, d9_off * wordSize);
 226 
 227     const Address r28_save      (rfp, r28_off * wordSize);
 228     const Address r26_save      (rfp, r26_off * wordSize);
 229     const Address r24_save      (rfp, r24_off * wordSize);
 230     const Address r22_save      (rfp, r22_off * wordSize);
 231     const Address r20_save      (rfp, r20_off * wordSize);
 232 
 233     // stub code
 234 
 235     address aarch64_entry = __ pc();
 236 
 237     // set up frame and move sp to end of save area
 238     __ enter();
 239     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 240 
 241     // save register parameters and Java scratch/global registers
 242     // n.b. we save thread even though it gets installed in
 243     // rthread because we want to sanity check rthread later
 244     __ str(c_rarg7,  thread);
 245     __ strw(c_rarg6, parameter_size);
 246     __ stp(c_rarg4, c_rarg5,  entry_point);
 247     __ stp(c_rarg2, c_rarg3,  result_type);
 248     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 249 
 250     __ stp(r20, r19,   r20_save);
 251     __ stp(r22, r21,   r22_save);
 252     __ stp(r24, r23,   r24_save);
 253     __ stp(r26, r25,   r26_save);
 254     __ stp(r28, r27,   r28_save);
 255 
 256     __ stpd(v9,  v8,   d9_save);
 257     __ stpd(v11, v10,  d11_save);
 258     __ stpd(v13, v12,  d13_save);
 259     __ stpd(v15, v14,  d15_save);
 260 
 261     __ get_fpcr(rscratch1);
 262     __ str(rscratch1, fpcr_save);
 263     // Set FPCR to the state we need. We do want Round to Nearest. We
 264     // don't want non-IEEE rounding modes or floating-point traps.
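    // (For reference: in the AArch64 FPCR, RMode occupies bits 23:22,
    // FZ is bit 24 and DN is bit 25, while bits 12:8 are the IXE, UFE,
    // OFE, DZE and IOE trap-enable bits cleared below.)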
 265     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
 266     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
 267     __ set_fpcr(rscratch1);
 268 
 269     // install Java thread in global register now we have saved
 270     // whatever value it held
 271     __ mov(rthread, c_rarg7);
 272     // And method
 273     __ mov(rmethod, c_rarg3);
 274 
 275     // set up the heapbase register
 276     __ reinit_heapbase();
 277 
 278 #ifdef ASSERT
 279     // make sure we have no pending exceptions
 280     {
 281       Label L;
 282       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 283       __ cmp(rscratch1, (u1)NULL_WORD);
 284       __ br(Assembler::EQ, L);
 285       __ stop("StubRoutines::call_stub: entered with pending exception");
 286       __ BIND(L);
 287     }
 288 #endif
 289     // pass parameters if any
 290     __ mov(esp, sp);
 291     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 292     __ andr(sp, rscratch1, -2 * wordSize);
 293 
 294     BLOCK_COMMENT("pass parameters if any");
 295     Label parameters_done;
 296     // parameter count is still in c_rarg6
 297     // and parameter pointer identifying param 1 is in c_rarg5
 298     __ cbzw(c_rarg6, parameters_done);
 299 
 300     address loop = __ pc();
 301     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 302     __ subsw(c_rarg6, c_rarg6, 1);
 303     __ push(rscratch1);
 304     __ br(Assembler::GT, loop);
 305 
 306     __ BIND(parameters_done);
 307 
    // call Java entry -- passing Method* and current sp
 309     //      rmethod: Method*
 310     //      r19_sender_sp: sender sp
 311     BLOCK_COMMENT("call Java function");
 312     __ mov(r19_sender_sp, sp);
 313     __ blr(c_rarg4);
 314 
 315     // we do this here because the notify will already have been done
 316     // if we get to the next instruction via an exception
 317     //
 318     // n.b. adding this instruction here affects the calculation of
 319     // whether or not a routine returns to the call stub (used when
 320     // doing stack walks) since the normal test is to check the return
 321     // pc against the address saved below. so we may need to allow for
 322     // this extra instruction in the check.
 323 
 324     // save current address for use by exception handling code
 325 
 326     return_address = __ pc();
 327 
 328     // store result depending on type (everything that is not
 329     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 330     // n.b. this assumes Java returns an integral result in r0
 331     // and a floating result in j_farg0
 332     __ ldr(j_rarg2, result);
 333     Label is_long, is_float, is_double, exit;
 334     __ ldr(j_rarg1, result_type);
 335     __ cmp(j_rarg1, (u1)T_OBJECT);
 336     __ br(Assembler::EQ, is_long);
 337     __ cmp(j_rarg1, (u1)T_LONG);
 338     __ br(Assembler::EQ, is_long);
 339     __ cmp(j_rarg1, (u1)T_FLOAT);
 340     __ br(Assembler::EQ, is_float);
 341     __ cmp(j_rarg1, (u1)T_DOUBLE);
 342     __ br(Assembler::EQ, is_double);
 343 
 344     // handle T_INT case
 345     __ strw(r0, Address(j_rarg2));
 346 
 347     __ BIND(exit);
 348 
 349     // pop parameters
 350     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 351 
 352 #ifdef ASSERT
 353     // verify that threads correspond
 354     {
 355       Label L, S;
 356       __ ldr(rscratch1, thread);
 357       __ cmp(rthread, rscratch1);
 358       __ br(Assembler::NE, S);
 359       __ get_thread(rscratch1);
 360       __ cmp(rthread, rscratch1);
 361       __ br(Assembler::EQ, L);
 362       __ BIND(S);
 363       __ stop("StubRoutines::call_stub: threads must correspond");
 364       __ BIND(L);
 365     }
 366 #endif
 367 
 368     __ pop_cont_fastpath(rthread);
 369 
 370     // restore callee-save registers
 371     __ ldpd(v15, v14,  d15_save);
 372     __ ldpd(v13, v12,  d13_save);
 373     __ ldpd(v11, v10,  d11_save);
 374     __ ldpd(v9,  v8,   d9_save);
 375 
 376     __ ldp(r28, r27,   r28_save);
 377     __ ldp(r26, r25,   r26_save);
 378     __ ldp(r24, r23,   r24_save);
 379     __ ldp(r22, r21,   r22_save);
 380     __ ldp(r20, r19,   r20_save);
 381 
 382     // restore fpcr
 383     __ ldr(rscratch1,  fpcr_save);
 384     __ set_fpcr(rscratch1);
 385 
 386     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 387     __ ldrw(c_rarg2, result_type);
 388     __ ldr(c_rarg3,  method);
 389     __ ldp(c_rarg4, c_rarg5,  entry_point);
 390     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 391 
 392     // leave frame and return to caller
 393     __ leave();
 394     __ ret(lr);
 395 
 396     // handle return types different from T_INT
 397 
 398     __ BIND(is_long);
 399     __ str(r0, Address(j_rarg2, 0));
 400     __ br(Assembler::AL, exit);
 401 
 402     __ BIND(is_float);
 403     __ strs(j_farg0, Address(j_rarg2, 0));
 404     __ br(Assembler::AL, exit);
 405 
 406     __ BIND(is_double);
 407     __ strd(j_farg0, Address(j_rarg2, 0));
 408     __ br(Assembler::AL, exit);
 409 
 410     return start;
 411   }
 412 
 413   // Return point for a Java call if there's an exception thrown in
 414   // Java code.  The exception is caught and transformed into a
 415   // pending exception stored in JavaThread that can be tested from
 416   // within the VM.
 417   //
 418   // Note: Usually the parameters are removed by the callee. In case
 419   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => we need to set up
  // sp.
 422   //
 423   // r0: exception oop
 424 
 425   address generate_catch_exception() {
 426     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
 427     StubCodeMark mark(this, stub_id);
 428     address start = __ pc();
 429 
 430     // same as in generate_call_stub():
 431     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 432     const Address thread        (rfp, thread_off         * wordSize);
 433 
 434 #ifdef ASSERT
 435     // verify that threads correspond
 436     {
 437       Label L, S;
 438       __ ldr(rscratch1, thread);
 439       __ cmp(rthread, rscratch1);
 440       __ br(Assembler::NE, S);
 441       __ get_thread(rscratch1);
 442       __ cmp(rthread, rscratch1);
 443       __ br(Assembler::EQ, L);
 444       __ bind(S);
 445       __ stop("StubRoutines::catch_exception: threads must correspond");
 446       __ bind(L);
 447     }
 448 #endif
 449 
 450     // set pending exception
 451     __ verify_oop(r0);
 452 
 453     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 454     __ mov(rscratch1, (address)__FILE__);
 455     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 456     __ movw(rscratch1, (int)__LINE__);
 457     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 458 
 459     // complete return to VM
 460     assert(StubRoutines::_call_stub_return_address != nullptr,
 461            "_call_stub_return_address must have been generated before");
 462     __ b(StubRoutines::_call_stub_return_address);
 463 
 464     return start;
 465   }
 466 
 467   // Continuation point for runtime calls returning with a pending
 468   // exception.  The pending exception check happened in the runtime
 469   // or native call stub.  The pending exception in Thread is
 470   // converted into a Java-level exception.
 471   //
 472   // Contract with Java-level exception handlers:
 473   // r0: exception
 474   // r3: throwing pc
 475   //
 476   // NOTE: At entry of this stub, exception-pc must be in LR !!
 477 
 478   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 480 
 481   address generate_forward_exception() {
 482     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
 483     StubCodeMark mark(this, stub_id);
 484     address start = __ pc();
 485 
 486     // Upon entry, LR points to the return address returning into
 487     // Java (interpreted or compiled) code; i.e., the return address
 488     // becomes the throwing pc.
 489     //
 490     // Arguments pushed before the runtime call are still on the stack
 491     // but the exception handler will reset the stack pointer ->
 492     // ignore them.  A potential result in registers can be ignored as
 493     // well.
 494 
 495 #ifdef ASSERT
 496     // make sure this code is only executed if there is a pending exception
 497     {
 498       Label L;
 499       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 500       __ cbnz(rscratch1, L);
 501       __ stop("StubRoutines::forward exception: no pending exception (1)");
 502       __ bind(L);
 503     }
 504 #endif
 505 
 506     // compute exception handler into r19
 507 
 508     // call the VM to find the handler address associated with the
 509     // caller address. pass thread in r0 and caller pc (ret address)
 510     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 511     // the stack.
 512     __ mov(c_rarg1, lr);
 513     // lr will be trashed by the VM call so we move it to R19
 514     // (callee-saved) because we also need to pass it to the handler
 515     // returned by this call.
 516     __ mov(r19, lr);
 517     BLOCK_COMMENT("call exception_handler_for_return_address");
 518     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 519                          SharedRuntime::exception_handler_for_return_address),
 520                     rthread, c_rarg1);
 521     // Reinitialize the ptrue predicate register, in case the external runtime
 522     // call clobbers ptrue reg, as we may return to SVE compiled code.
 523     __ reinitialize_ptrue();
 524 
 525     // we should not really care that lr is no longer the callee
 526     // address. we saved the value the handler needs in r19 so we can
 527     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 529     // the PC for the frame above the handler belongs to a compiled
 530     // Java method. So, we restore lr here to satisfy that assert.
 531     __ mov(lr, r19);
 532     // setup r0 & r3 & clear pending exception
 533     __ mov(r3, r19);
 534     __ mov(r19, r0);
 535     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 536     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 537 
 538 #ifdef ASSERT
 539     // make sure exception is set
 540     {
 541       Label L;
 542       __ cbnz(r0, L);
 543       __ stop("StubRoutines::forward exception: no pending exception (2)");
 544       __ bind(L);
 545     }
 546 #endif
 547 
 548     // continue at exception handler
 549     // r0: exception
 550     // r3: throwing pc
 551     // r19: exception handler
 552     __ verify_oop(r0);
 553     __ br(r19);
 554 
 555     return start;
 556   }
 557 
 558   // Non-destructive plausibility checks for oops
 559   //
 560   // Arguments:
 561   //    r0: oop to verify
 562   //    rscratch1: error message
 563   //
 564   // Stack after saving c_rarg3:
 565   //    [tos + 0]: saved c_rarg3
 566   //    [tos + 1]: saved c_rarg2
 567   //    [tos + 2]: saved lr
 568   //    [tos + 3]: saved rscratch2
 569   //    [tos + 4]: saved r0
 570   //    [tos + 5]: saved rscratch1
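  //
  // Note: the expected caller is MacroAssembler::verify_oop (see
  // macroAssembler_aarch64.cpp), which is assumed to place the oop in
  // r0 and the address of the error message in rscratch1 before
  // calling this stub.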
 571   address generate_verify_oop() {
 572     StubGenStubId stub_id = StubGenStubId::verify_oop_id;
 573     StubCodeMark mark(this, stub_id);
 574     address start = __ pc();
 575 
 576     Label exit, error;
 577 
 578     // save c_rarg2 and c_rarg3
 579     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 580 
 581     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 582     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 583     __ ldr(c_rarg3, Address(c_rarg2));
 584     __ add(c_rarg3, c_rarg3, 1);
 585     __ str(c_rarg3, Address(c_rarg2));
 586 
 587     // object is in r0
 588     // make sure object is 'reasonable'
 589     __ cbz(r0, exit); // if obj is null it is OK
 590 
 591     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 592     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 593 
 594     // return if everything seems ok
 595     __ bind(exit);
 596 
 597     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 598     __ ret(lr);
 599 
 600     // handle errors
 601     __ bind(error);
 602     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 603 
 604     __ push(RegSet::range(r0, r29), sp);
 605     // debug(char* msg, int64_t pc, int64_t regs[])
 606     __ mov(c_rarg0, rscratch1);      // pass address of error message
 607     __ mov(c_rarg1, lr);             // pass return address
 608     __ mov(c_rarg2, sp);             // pass address of regs on stack
 609 #ifndef PRODUCT
 610     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 611 #endif
 612     BLOCK_COMMENT("call MacroAssembler::debug");
 613     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 614     __ blr(rscratch1);
 615     __ hlt(0);
 616 
 617     return start;
 618   }
 619 
 620   // Generate indices for iota vector.
 621   address generate_iota_indices(StubGenStubId stub_id) {
 622     __ align(CodeEntryAlignment);
 623     StubCodeMark mark(this, stub_id);
 624     address start = __ pc();
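    // Each 16-byte row below is an index vector for one element size.
    // For example, the two "H" words 0x0003000200010000 and
    // 0x0007000600050004 decode (as little-endian 16-bit lanes) to the
    // indices 0..7; the trailing rows hold the leading indices encoded
    // as 32-bit floats and 64-bit doubles.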
 625     // B
 626     __ emit_data64(0x0706050403020100, relocInfo::none);
 627     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 628     // H
 629     __ emit_data64(0x0003000200010000, relocInfo::none);
 630     __ emit_data64(0x0007000600050004, relocInfo::none);
 631     // S
 632     __ emit_data64(0x0000000100000000, relocInfo::none);
 633     __ emit_data64(0x0000000300000002, relocInfo::none);
 634     // D
 635     __ emit_data64(0x0000000000000000, relocInfo::none);
 636     __ emit_data64(0x0000000000000001, relocInfo::none);
 637     // S - FP
 638     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 639     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 640     // D - FP
 641     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 642     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 643     return start;
 644   }
 645 
 646   // The inner part of zero_words().  This is the bulk operation,
 647   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 648   // caller is responsible for zeroing the last few words.
 649   //
 650   // Inputs:
 651   // r10: the HeapWord-aligned base address of an array to zero.
 652   // r11: the count in HeapWords, r11 > 0.
 653   //
 654   // Returns r10 and r11, adjusted for the caller to clear.
 655   // r10: the base address of the tail of words left to clear.
 656   // r11: the number of words in the tail.
 657   //      r11 < MacroAssembler::zero_words_block_size.
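  //
  // As a worked example (assuming MacroAssembler::zero_words_block_size
  // is 8 and the DC ZVA path is not taken): a call with r11 == 19
  // zeroes 16 words, advances r10 by 16 words, and returns with
  // r11 == 3 for the caller to finish.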
 658 
 659   address generate_zero_blocks() {
 660     Label done;
 661     Label base_aligned;
 662 
 663     Register base = r10, cnt = r11;
 664 
 665     __ align(CodeEntryAlignment);
 666     StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
 667     StubCodeMark mark(this, stub_id);
 668     address start = __ pc();
 669 
 670     if (UseBlockZeroing) {
 671       int zva_length = VM_Version::zva_length();
 672 
 673       // Ensure ZVA length can be divided by 16. This is required by
 674       // the subsequent operations.
 675       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 676 
 677       __ tbz(base, 3, base_aligned);
 678       __ str(zr, Address(__ post(base, 8)));
 679       __ sub(cnt, cnt, 1);
 680       __ bind(base_aligned);
 681 
 682       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 683       // alignment.
 684       Label small;
 685       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 686       __ subs(rscratch1, cnt, low_limit >> 3);
 687       __ br(Assembler::LT, small);
 688       __ zero_dcache_blocks(base, cnt);
 689       __ bind(small);
 690     }
 691 
 692     {
 693       // Number of stp instructions we'll unroll
 694       const int unroll =
 695         MacroAssembler::zero_words_block_size / 2;
 696       // Clear the remaining blocks.
 697       Label loop;
 698       __ subs(cnt, cnt, unroll * 2);
 699       __ br(Assembler::LT, done);
 700       __ bind(loop);
 701       for (int i = 0; i < unroll; i++)
 702         __ stp(zr, zr, __ post(base, 16));
 703       __ subs(cnt, cnt, unroll * 2);
 704       __ br(Assembler::GE, loop);
 705       __ bind(done);
 706       __ add(cnt, cnt, unroll * 2);
 707     }
 708 
 709     __ ret(lr);
 710 
 711     return start;
 712   }
 713 
 714 
 715   typedef enum {
 716     copy_forwards = 1,
 717     copy_backwards = -1
 718   } copy_direction;
 719 
 720   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 721   // for arraycopy stubs.
 722   class ArrayCopyBarrierSetHelper : StackObj {
 723     BarrierSetAssembler* _bs_asm;
 724     MacroAssembler* _masm;
 725     DecoratorSet _decorators;
 726     BasicType _type;
 727     Register _gct1;
 728     Register _gct2;
 729     Register _gct3;
 730     FloatRegister _gcvt1;
 731     FloatRegister _gcvt2;
 732     FloatRegister _gcvt3;
 733 
 734   public:
 735     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 736                               DecoratorSet decorators,
 737                               BasicType type,
 738                               Register gct1,
 739                               Register gct2,
 740                               Register gct3,
 741                               FloatRegister gcvt1,
 742                               FloatRegister gcvt2,
 743                               FloatRegister gcvt3)
 744       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 745         _masm(masm),
 746         _decorators(decorators),
 747         _type(type),
 748         _gct1(gct1),
 749         _gct2(gct2),
 750         _gct3(gct3),
 751         _gcvt1(gcvt1),
 752         _gcvt2(gcvt2),
 753         _gcvt3(gcvt3) {
 754     }
 755 
 756     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 757       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 758                             dst1, dst2, src,
 759                             _gct1, _gct2, _gcvt1);
 760     }
 761 
 762     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 763       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 764                              dst, src1, src2,
 765                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 766     }
 767 
 768     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 769       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 770                             dst1, dst2, src,
 771                             _gct1);
 772     }
 773 
 774     void copy_store_at_16(Address dst, Register src1, Register src2) {
 775       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 776                              dst, src1, src2,
 777                              _gct1, _gct2, _gct3);
 778     }
 779 
 780     void copy_load_at_8(Register dst, Address src) {
 781       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 782                             dst, noreg, src,
 783                             _gct1);
 784     }
 785 
 786     void copy_store_at_8(Address dst, Register src) {
 787       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 788                              dst, src, noreg,
 789                              _gct1, _gct2, _gct3);
 790     }
 791   };
 792 
 793   // Bulk copy of blocks of 8 words.
 794   //
 795   // count is a count of words.
 796   //
 797   // Precondition: count >= 8
 798   //
 799   // Postconditions:
 800   //
 801   // The least significant bit of count contains the remaining count
 802   // of words to copy.  The rest of count is trash.
 803   //
 804   // s and d are adjusted to point to the remaining words to copy
 805   //
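  // For example, a call with count == 23 copies 22 words (16 in the
  // main loop and drain, then a 4-word and a 2-word tail block) and
  // returns with bit 0 of count set, leaving one word for the caller.
  //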
 806   void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
 807     BasicType type;
 808     copy_direction direction;
 809 
 810     switch (stub_id) {
 811     case copy_byte_f_id:
 812       direction = copy_forwards;
 813       type = T_BYTE;
 814       break;
 815     case copy_byte_b_id:
 816       direction = copy_backwards;
 817       type = T_BYTE;
 818       break;
 819     case copy_oop_f_id:
 820       direction = copy_forwards;
 821       type = T_OBJECT;
 822       break;
 823     case copy_oop_b_id:
 824       direction = copy_backwards;
 825       type = T_OBJECT;
 826       break;
 827     case copy_oop_uninit_f_id:
 828       direction = copy_forwards;
 829       type = T_OBJECT;
 830       break;
 831     case copy_oop_uninit_b_id:
 832       direction = copy_backwards;
 833       type = T_OBJECT;
 834       break;
 835     default:
 836       ShouldNotReachHere();
 837     }
 838 
 839     int unit = wordSize * direction;
 840     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 841 
 842     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 843       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 844     const Register stride = r14;
 845     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 846     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 847     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 848 
 849     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 850     assert_different_registers(s, d, count, rscratch1, rscratch2);
 851 
 852     Label again, drain;
 853 
 854     __ align(CodeEntryAlignment);
 855 
 856     StubCodeMark mark(this, stub_id);
 857 
 858     __ bind(start);
 859 
 860     Label unaligned_copy_long;
 861     if (AvoidUnalignedAccesses) {
 862       __ tbnz(d, 3, unaligned_copy_long);
 863     }
 864 
 865     if (direction == copy_forwards) {
 866       __ sub(s, s, bias);
 867       __ sub(d, d, bias);
 868     }
 869 
 870 #ifdef ASSERT
 871     // Make sure we are never given < 8 words
 872     {
 873       Label L;
 874       __ cmp(count, (u1)8);
 875       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 877       __ bind(L);
 878     }
 879 #endif
 880 
 881     // Fill 8 registers
 882     if (UseSIMDForMemoryOps) {
 883       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 884       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 885     } else {
 886       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 887       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 888       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 889       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 890     }
 891 
 892     __ subs(count, count, 16);
 893     __ br(Assembler::LO, drain);
 894 
 895     int prefetch = PrefetchCopyIntervalInBytes;
 896     bool use_stride = false;
 897     if (direction == copy_backwards) {
 898        use_stride = prefetch > 256;
 899        prefetch = -prefetch;
 900        if (use_stride) __ mov(stride, prefetch);
 901     }
 902 
 903     __ bind(again);
 904 
 905     if (PrefetchCopyIntervalInBytes > 0)
 906       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 907 
 908     if (UseSIMDForMemoryOps) {
 909       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 910       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 911       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 912       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 913     } else {
 914       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 915       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 916       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 917       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 918       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 919       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 920       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 921       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 922     }
 923 
 924     __ subs(count, count, 8);
 925     __ br(Assembler::HS, again);
 926 
 927     // Drain
 928     __ bind(drain);
 929     if (UseSIMDForMemoryOps) {
 930       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 931       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 932     } else {
 933       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 934       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 935       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 936       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 937     }
 938 
 939     {
 940       Label L1, L2;
 941       __ tbz(count, exact_log2(4), L1);
 942       if (UseSIMDForMemoryOps) {
 943         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 944         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 945       } else {
 946         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 947         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 948         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 949         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 950       }
 951       __ bind(L1);
 952 
 953       if (direction == copy_forwards) {
 954         __ add(s, s, bias);
 955         __ add(d, d, bias);
 956       }
 957 
 958       __ tbz(count, 1, L2);
 959       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 960       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 961       __ bind(L2);
 962     }
 963 
 964     __ ret(lr);
 965 
 966     if (AvoidUnalignedAccesses) {
 967       Label drain, again;
 968       // Register order for storing. Order is different for backward copy.
 969 
 970       __ bind(unaligned_copy_long);
 971 
      // source address is even (16-byte) aligned, target odd (8-byte) aligned
 973       //
 974       // when forward copying word pairs we read long pairs at offsets
 975       // {0, 2, 4, 6} (in long words). when backwards copying we read
 976       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 977       // address by -2 in the forwards case so we can compute the
 978       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 979       // or -1.
 980       //
 981       // when forward copying we need to store 1 word, 3 pairs and
 982       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
 984       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 985       //
      // When backwards copying we need to store 1 word, 3 pairs and
 987       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 988       // offsets {1, 3, 5, 7, 8} * unit.
 989 
 990       if (direction == copy_forwards) {
 991         __ sub(s, s, 16);
 992         __ sub(d, d, 8);
 993       }
 994 
 995       // Fill 8 registers
 996       //
 997       // for forwards copy s was offset by -16 from the original input
 998       // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
1000       // and so on for each successive 64 byte block when s is updated
1001       //
1002       // t0 at offset 0,  t1 at offset 8
1003       // t2 at offset 16, t3 at offset 24
1004       // t4 at offset 32, t5 at offset 40
1005       // t6 at offset 48, t7 at offset 56
1006 
1007       // for backwards copy s was not offset so the register contents
1008       // are at these offsets into the preceding 64 byte block
1009       // relative to that original input and so on for each successive
1010       // preceding 64 byte block when s is updated. this explains the
1011       // slightly counter-intuitive looking pattern of register usage
1012       // in the stp instructions for backwards copy.
1013       //
1014       // t0 at offset -16, t1 at offset -8
1015       // t2 at offset -32, t3 at offset -24
1016       // t4 at offset -48, t5 at offset -40
1017       // t6 at offset -64, t7 at offset -56
1018 
1019       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1020       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1021       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1022       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1023 
1024       __ subs(count, count, 16);
1025       __ br(Assembler::LO, drain);
1026 
1027       int prefetch = PrefetchCopyIntervalInBytes;
1028       bool use_stride = false;
1029       if (direction == copy_backwards) {
1030          use_stride = prefetch > 256;
1031          prefetch = -prefetch;
1032          if (use_stride) __ mov(stride, prefetch);
1033       }
1034 
1035       __ bind(again);
1036 
1037       if (PrefetchCopyIntervalInBytes > 0)
1038         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1039 
1040       if (direction == copy_forwards) {
1041        // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
1043        // offsets
1044        //
1045        // t0 at offset 0
1046        // t1 at offset 8,  t2 at offset 16
1047        // t3 at offset 24, t4 at offset 32
1048        // t5 at offset 40, t6 at offset 48
1049        // t7 at offset 56
1050 
1051         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1052         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1053         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1054         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1055         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1056         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1057         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1058         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1059         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1060       } else {
1061        // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
1063        // offsets
1064        //
1065        // t1 at offset -8
1066        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
1068        // t7 at offset -56, t4 at offset -48
1069        //                   t6 at offset -64
1070        //
1071        // note that this matches the offsets previously noted for the
1072        // loads
1073 
1074         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1075         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1076         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1077         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1078         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1079         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1080         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1081         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1082         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1083       }
1084 
1085       __ subs(count, count, 8);
1086       __ br(Assembler::HS, again);
1087 
1088       // Drain
1089       //
1090       // this uses the same pattern of offsets and register arguments
1091       // as above
1092       __ bind(drain);
1093       if (direction == copy_forwards) {
1094         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1095         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1096         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1097         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1098         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1099       } else {
1100         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1101         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1102         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1103         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1104         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1105       }
1106       // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
1108       // bits 2 and 1 in the count are the tell-tale for whether we
1109       // have each such subblock
1110       {
1111         Label L1, L2;
1112         __ tbz(count, exact_log2(4), L1);
1113        // this is the same as above but copying only 4 longs hence
1114        // with only one intervening stp between the str instructions
1115        // but note that the offsets and registers still follow the
1116        // same pattern
1117         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1118         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1119         if (direction == copy_forwards) {
1120           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1121           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1122           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1123         } else {
1124           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1125           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1126           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1127         }
1128         __ bind(L1);
1129 
1130         __ tbz(count, 1, L2);
1131        // this is the same as above but copying only 2 longs hence
1132        // there is no intervening stp between the str instructions
1133        // but note that the offset and register patterns are still
1134        // the same
1135         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1136         if (direction == copy_forwards) {
1137           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1138           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1139         } else {
1140           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1141           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1142         }
1143         __ bind(L2);
1144 
1145        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1147 
1148        if (direction == copy_forwards) {
1149          __ add(s, s, 16);
1150          __ add(d, d, 8);
1151        }
1152 
1153       }
1154 
1155       __ ret(lr);
1156       }
1157   }
1158 
1159   // Small copy: less than 16 bytes.
1160   //
1161   // NB: Ignores all of the bits of count which represent more than 15
1162   // bytes, so a caller doesn't have to mask them.
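  //
  // For example, in a byte copy (step == 1) with the low four bits of
  // count equal to 13 (0b1101), the word, int and byte tests below copy
  // 8 + 4 + 1 bytes.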
1163 
1164   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1165     bool is_backwards = step < 0;
1166     size_t granularity = uabs(step);
1167     int direction = is_backwards ? -1 : 1;
1168 
1169     Label Lword, Lint, Lshort, Lbyte;
1170 
1171     assert(granularity
1172            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1173 
1174     const Register t0 = r3;
1175     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1176     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1177 
1178     // ??? I don't know if this bit-test-and-branch is the right thing
1179     // to do.  It does a lot of jumping, resulting in several
1180     // mispredicted branches.  It might make more sense to do this
1181     // with something like Duff's device with a single computed branch.
1182 
1183     __ tbz(count, 3 - exact_log2(granularity), Lword);
1184     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1185     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1186     __ bind(Lword);
1187 
1188     if (granularity <= sizeof (jint)) {
1189       __ tbz(count, 2 - exact_log2(granularity), Lint);
1190       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1191       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1192       __ bind(Lint);
1193     }
1194 
1195     if (granularity <= sizeof (jshort)) {
1196       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1197       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1198       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1199       __ bind(Lshort);
1200     }
1201 
1202     if (granularity <= sizeof (jbyte)) {
1203       __ tbz(count, 0, Lbyte);
1204       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1205       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1206       __ bind(Lbyte);
1207     }
1208   }
1209 
1210   Label copy_f, copy_b;
1211   Label copy_obj_f, copy_obj_b;
1212   Label copy_obj_uninit_f, copy_obj_uninit_b;
1213 
1214   // All-singing all-dancing memory copy.
1215   //
1216   // Copy count units of memory from s to d.  The size of a unit is
1217   // step, which can be positive or negative depending on the direction
1218   // of copy.  If is_aligned is false, we align the source address.
1219   //
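  // A typical use from the arraycopy stubs below looks roughly like
  // this (a sketch, not an exact call site):
  //
  //   copy_memory(decorators, T_BYTE, /*is_aligned*/false, s, d, count,  1); // forwards, 1-byte units
  //   copy_memory(decorators, T_BYTE, /*is_aligned*/false, s, d, count, -1); // backwards, for overlapping copies
  //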
1220 
1221   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1222                    Register s, Register d, Register count, int step) {
1223     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1224     bool is_backwards = step < 0;
1225     unsigned int granularity = uabs(step);
1226     const Register t0 = r3, t1 = r4;
1227 
    // Copies of <= 80 (or 96 when using SIMD) bytes are done inline. Direction doesn't matter
    // because we always load all the data before writing anything.
1230     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1231     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1232     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1233     const Register send = r17, dend = r16;
1234     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1235     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1236     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1237 
1238     if (PrefetchCopyIntervalInBytes > 0)
1239       __ prfm(Address(s, 0), PLDL1KEEP);
1240     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1241     __ br(Assembler::HI, copy_big);
1242 
1243     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1244     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1245 
1246     __ cmp(count, u1(16/granularity));
1247     __ br(Assembler::LS, copy16);
1248 
1249     __ cmp(count, u1(64/granularity));
1250     __ br(Assembler::HI, copy80);
1251 
1252     __ cmp(count, u1(32/granularity));
1253     __ br(Assembler::LS, copy32);
1254 
1255     // 33..64 bytes
1256     if (UseSIMDForMemoryOps) {
1257       bs.copy_load_at_32(v0, v1, Address(s, 0));
1258       bs.copy_load_at_32(v2, v3, Address(send, -32));
1259       bs.copy_store_at_32(Address(d, 0), v0, v1);
1260       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1261     } else {
1262       bs.copy_load_at_16(t0, t1, Address(s, 0));
1263       bs.copy_load_at_16(t2, t3, Address(s, 16));
1264       bs.copy_load_at_16(t4, t5, Address(send, -32));
1265       bs.copy_load_at_16(t6, t7, Address(send, -16));
1266 
1267       bs.copy_store_at_16(Address(d, 0), t0, t1);
1268       bs.copy_store_at_16(Address(d, 16), t2, t3);
1269       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1270       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1271     }
1272     __ b(finish);
1273 
1274     // 17..32 bytes
1275     __ bind(copy32);
1276     bs.copy_load_at_16(t0, t1, Address(s, 0));
1277     bs.copy_load_at_16(t6, t7, Address(send, -16));
1278 
1279     bs.copy_store_at_16(Address(d, 0), t0, t1);
1280     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1281     __ b(finish);
1282 
1283     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1285     __ bind(copy80);
1286     if (UseSIMDForMemoryOps) {
1287       bs.copy_load_at_32(v0, v1, Address(s, 0));
1288       bs.copy_load_at_32(v2, v3, Address(s, 32));
1289       // Unaligned pointers can be an issue for copying.
1290       // The issue has more chances to happen when granularity of data is
1291       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
1292       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1293       // The most performance drop has been seen for the range 65-80 bytes.
1294       // For such cases using the pair of ldp/stp instead of the third pair of
1295       // ldpq/stpq fixes the performance issue.
1296       if (granularity < sizeof (jint)) {
1297         Label copy96;
1298         __ cmp(count, u1(80/granularity));
1299         __ br(Assembler::HI, copy96);
1300         bs.copy_load_at_16(t0, t1, Address(send, -16));
1301 
1302         bs.copy_store_at_32(Address(d, 0), v0, v1);
1303         bs.copy_store_at_32(Address(d, 32), v2, v3);
1304 
1305         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1306         __ b(finish);
1307 
1308         __ bind(copy96);
1309       }
1310       bs.copy_load_at_32(v4, v5, Address(send, -32));
1311 
1312       bs.copy_store_at_32(Address(d, 0), v0, v1);
1313       bs.copy_store_at_32(Address(d, 32), v2, v3);
1314 
1315       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1316     } else {
1317       bs.copy_load_at_16(t0, t1, Address(s, 0));
1318       bs.copy_load_at_16(t2, t3, Address(s, 16));
1319       bs.copy_load_at_16(t4, t5, Address(s, 32));
1320       bs.copy_load_at_16(t6, t7, Address(s, 48));
1321       bs.copy_load_at_16(t8, t9, Address(send, -16));
1322 
1323       bs.copy_store_at_16(Address(d, 0), t0, t1);
1324       bs.copy_store_at_16(Address(d, 16), t2, t3);
1325       bs.copy_store_at_16(Address(d, 32), t4, t5);
1326       bs.copy_store_at_16(Address(d, 48), t6, t7);
1327       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1328     }
1329     __ b(finish);
1330 
1331     // 0..16 bytes
1332     __ bind(copy16);
1333     __ cmp(count, u1(8/granularity));
1334     __ br(Assembler::LO, copy8);
1335 
1336     // 8..16 bytes
1337     bs.copy_load_at_8(t0, Address(s, 0));
1338     bs.copy_load_at_8(t1, Address(send, -8));
1339     bs.copy_store_at_8(Address(d, 0), t0);
1340     bs.copy_store_at_8(Address(dend, -8), t1);
1341     __ b(finish);
1342 
1343     if (granularity < 8) {
1344       // 4..7 bytes
1345       __ bind(copy8);
1346       __ tbz(count, 2 - exact_log2(granularity), copy4);
1347       __ ldrw(t0, Address(s, 0));
1348       __ ldrw(t1, Address(send, -4));
1349       __ strw(t0, Address(d, 0));
1350       __ strw(t1, Address(dend, -4));
1351       __ b(finish);
1352       if (granularity < 4) {
1353         // 0..3 bytes
1354         __ bind(copy4);
1355         __ cbz(count, finish); // get rid of 0 case
1356         if (granularity == 2) {
1357           __ ldrh(t0, Address(s, 0));
1358           __ strh(t0, Address(d, 0));
1359         } else { // granularity == 1
1360           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1361           // the first and last byte.
1362           // Handle the 3 byte case by loading and storing base + count/2
1363           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
1365           // byte 3 times.
1366           __ lsr(count, count, 1);
1367           __ ldrb(t0, Address(s, 0));
1368           __ ldrb(t1, Address(send, -1));
1369           __ ldrb(t2, Address(s, count));
1370           __ strb(t0, Address(d, 0));
1371           __ strb(t1, Address(dend, -1));
1372           __ strb(t2, Address(d, count));
1373         }
1374         __ b(finish);
1375       }
1376     }
1377 
1378     __ bind(copy_big);
1379     if (is_backwards) {
1380       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1381       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1382     }
1383 
    // Now that we've got the small case out of the way we can align the
    // source address on a 2-word boundary.
1386 
1387     // Here we will materialize a count in r15, which is used by copy_memory_small
1388     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1389     // Up until here, we have used t9, which aliases r15, but from here on, that register
1390     // can not be used as a temp register, as it contains the count.
1391 
1392     Label aligned;
1393 
1394     if (is_aligned) {
1395       // We may have to adjust by 1 word to get s 2-word-aligned.
1396       __ tbz(s, exact_log2(wordSize), aligned);
1397       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1398       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1399       __ sub(count, count, wordSize/granularity);
1400     } else {
1401       if (is_backwards) {
1402         __ andr(r15, s, 2 * wordSize - 1);
1403       } else {
1404         __ neg(r15, s);
1405         __ andr(r15, r15, 2 * wordSize - 1);
1406       }
1407       // r15 is the byte adjustment needed to align s.
1408       __ cbz(r15, aligned);
1409       int shift = exact_log2(granularity);
1410       if (shift > 0) {
1411         __ lsr(r15, r15, shift);
1412       }
1413       __ sub(count, count, r15);
1414 
1415 #if 0
1416       // ?? This code is only correct for a disjoint copy.  It may or
1417       // may not make sense to use it in that case.
1418 
1419       // Copy the first pair; s and d may not be aligned.
1420       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1421       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1422 
1423       // Align s and d, adjust count
1424       if (is_backwards) {
1425         __ sub(s, s, r15);
1426         __ sub(d, d, r15);
1427       } else {
1428         __ add(s, s, r15);
1429         __ add(d, d, r15);
1430       }
1431 #else
1432       copy_memory_small(decorators, type, s, d, r15, step);
1433 #endif
1434     }
1435 
1436     __ bind(aligned);
1437 
1438     // s is now 2-word-aligned.
1439 
1440     // We have a count of units and some trailing bytes. Adjust the
1441     // count and do a bulk copy of words. If the shift is zero
1442     // perform a move instead to benefit from zero latency moves.
1443     int shift = exact_log2(wordSize/granularity);
1444     if (shift > 0) {
1445       __ lsr(r15, count, shift);
1446     } else {
1447       __ mov(r15, count);
1448     }
1449     if (direction == copy_forwards) {
1450       if (type != T_OBJECT) {
1451         __ bl(copy_f);
1452       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1453         __ bl(copy_obj_uninit_f);
1454       } else {
1455         __ bl(copy_obj_f);
1456       }
1457     } else {
1458       if (type != T_OBJECT) {
1459         __ bl(copy_b);
1460       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1461         __ bl(copy_obj_uninit_b);
1462       } else {
1463         __ bl(copy_obj_b);
1464       }
1465     }
1466 
1467     // And the tail.
1468     copy_memory_small(decorators, type, s, d, count, step);
1469 
1470     if (granularity >= 8) __ bind(copy8);
1471     if (granularity >= 4) __ bind(copy4);
1472     __ bind(finish);
1473   }
1474 
1475 
1476   void clobber_registers() {
1477 #ifdef ASSERT
1478     RegSet clobbered
1479       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1480     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1481     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1482     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1483       __ mov(*it, rscratch1);
1484     }
1485 #endif
1486 
1487   }
1488 
1489   // Scan over array at a for count oops, verifying each one.
1490   // Preserves a and count, clobbers rscratch1 and rscratch2.
1491   void verify_oop_array (int size, Register a, Register count, Register temp) {
1492     Label loop, end;
1493     __ mov(rscratch1, a);
1494     __ mov(rscratch2, zr);
1495     __ bind(loop);
1496     __ cmp(rscratch2, count);
1497     __ br(Assembler::HS, end);
1498     if (size == wordSize) {
1499       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1500       __ verify_oop(temp);
1501     } else {
1502       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1503       __ decode_heap_oop(temp); // calls verify_oop
1504     }
1505     __ add(rscratch2, rscratch2, 1);
1506     __ b(loop);
1507     __ bind(end);
1508   }
1509 
1510   // Arguments:
1511   //   stub_id - is used to name the stub and identify all details of
1512   //             how to perform the copy.
1513   //
1514   //   entry - is assigned to the stub's post push entry point unless
1515   //           it is null
1516   //
1517   // Inputs:
1518   //   c_rarg0   - source array address
1519   //   c_rarg1   - destination array address
1520   //   c_rarg2   - element count, treated as ssize_t, can be zero
1521   //
1522   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1523   // the hardware handle it.  The two dwords within qwords that span
1524   // cache line boundaries will still be loaded and stored atomically.
1525   //
1526   // Side Effects: entry is set to the (post push) entry point so it
1527   //               can be used by the corresponding conjoint copy
1528   //               method
1529   //
1530   address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
1531     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1532     RegSet saved_reg = RegSet::of(s, d, count);
1533     int size;
1534     bool aligned;
1535     bool is_oop;
1536     bool dest_uninitialized;
1537     switch (stub_id) {
1538     case jbyte_disjoint_arraycopy_id:
1539       size = sizeof(jbyte);
1540       aligned = false;
1541       is_oop = false;
1542       dest_uninitialized = false;
1543       break;
1544     case arrayof_jbyte_disjoint_arraycopy_id:
1545       size = sizeof(jbyte);
1546       aligned = true;
1547       is_oop = false;
1548       dest_uninitialized = false;
1549       break;
1550     case jshort_disjoint_arraycopy_id:
1551       size = sizeof(jshort);
1552       aligned = false;
1553       is_oop = false;
1554       dest_uninitialized = false;
1555       break;
1556     case arrayof_jshort_disjoint_arraycopy_id:
1557       size = sizeof(jshort);
1558       aligned = true;
1559       is_oop = false;
1560       dest_uninitialized = false;
1561       break;
1562     case jint_disjoint_arraycopy_id:
1563       size = sizeof(jint);
1564       aligned = false;
1565       is_oop = false;
1566       dest_uninitialized = false;
1567       break;
1568     case arrayof_jint_disjoint_arraycopy_id:
1569       size = sizeof(jint);
1570       aligned = true;
1571       is_oop = false;
1572       dest_uninitialized = false;
1573       break;
1574     case jlong_disjoint_arraycopy_id:
1575       // since this is always aligned we can (should!) use the same
1576       // stub as for case arrayof_jlong_disjoint_arraycopy
1577       ShouldNotReachHere();
1578       break;
1579     case arrayof_jlong_disjoint_arraycopy_id:
1580       size = sizeof(jlong);
1581       aligned = true;
1582       is_oop = false;
1583       dest_uninitialized = false;
1584       break;
1585     case oop_disjoint_arraycopy_id:
1586       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1587       aligned = !UseCompressedOops;
1588       is_oop = true;
1589       dest_uninitialized = false;
1590       break;
1591     case arrayof_oop_disjoint_arraycopy_id:
1592       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1593       aligned = !UseCompressedOops;
1594       is_oop = true;
1595       dest_uninitialized = false;
1596       break;
1597     case oop_disjoint_arraycopy_uninit_id:
1598       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1599       aligned = !UseCompressedOops;
1600       is_oop = true;
1601       dest_uninitialized = true;
1602       break;
1603     case arrayof_oop_disjoint_arraycopy_uninit_id:
1604       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1605       aligned = !UseCompressedOops;
1606       is_oop = true;
1607       dest_uninitialized = true;
1608       break;
1609     default:
1610       ShouldNotReachHere();
1611       break;
1612     }
1613 
1614     __ align(CodeEntryAlignment);
1615     StubCodeMark mark(this, stub_id);
1616     address start = __ pc();
1617     __ enter();
1618 
1619     if (entry != nullptr) {
1620       *entry = __ pc();
1621       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1622       BLOCK_COMMENT("Entry:");
1623     }
1624 
1625     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1626     if (dest_uninitialized) {
1627       decorators |= IS_DEST_UNINITIALIZED;
1628     }
1629     if (aligned) {
1630       decorators |= ARRAYCOPY_ALIGNED;
1631     }
1632 
1633     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1634     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1635 
1636     if (is_oop) {
1637       // save regs before copy_memory
1638       __ push(RegSet::of(d, count), sp);
1639     }
1640     {
1641       // UnsafeMemoryAccess page error: continue after unsafe access
1642       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1643       UnsafeMemoryAccessMark umam(this, add_entry, true);
1644       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1645     }
1646 
1647     if (is_oop) {
1648       __ pop(RegSet::of(d, count), sp);
1649       if (VerifyOops)
1650         verify_oop_array(size, d, count, r16);
1651     }
1652 
1653     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1654 
1655     __ leave();
1656     __ mov(r0, zr); // return 0
1657     __ ret(lr);
1658     return start;
1659   }
1660 
1661   // Arguments:
1662   //   stub_id - is used to name the stub and identify all details of
1663   //             how to perform the copy.
1664   //
  //   nooverlap_target - identifies the (post push) entry for the
1666   //             corresponding disjoint copy routine which can be
1667   //             jumped to if the ranges do not actually overlap
1668   //
1669   //   entry - is assigned to the stub's post push entry point unless
1670   //           it is null
1671   //
1672   //
1673   // Inputs:
1674   //   c_rarg0   - source array address
1675   //   c_rarg1   - destination array address
1676   //   c_rarg2   - element count, treated as ssize_t, can be zero
1677   //
1678   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1679   // the hardware handle it.  The two dwords within qwords that span
1680   // cache line boundaries will still be loaded and stored atomically.
1681   //
1682   // Side Effects:
1683   //   entry is set to the no-overlap entry point so it can be used by
1684   //   some other conjoint copy method
1685   //
1686   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
1687     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1688     RegSet saved_regs = RegSet::of(s, d, count);
1689     int size;
1690     bool aligned;
1691     bool is_oop;
1692     bool dest_uninitialized;
1693     switch (stub_id) {
1694     case jbyte_arraycopy_id:
1695       size = sizeof(jbyte);
1696       aligned = false;
1697       is_oop = false;
1698       dest_uninitialized = false;
1699       break;
1700     case arrayof_jbyte_arraycopy_id:
1701       size = sizeof(jbyte);
1702       aligned = true;
1703       is_oop = false;
1704       dest_uninitialized = false;
1705       break;
1706     case jshort_arraycopy_id:
1707       size = sizeof(jshort);
1708       aligned = false;
1709       is_oop = false;
1710       dest_uninitialized = false;
1711       break;
1712     case arrayof_jshort_arraycopy_id:
1713       size = sizeof(jshort);
1714       aligned = true;
1715       is_oop = false;
1716       dest_uninitialized = false;
1717       break;
1718     case jint_arraycopy_id:
1719       size = sizeof(jint);
1720       aligned = false;
1721       is_oop = false;
1722       dest_uninitialized = false;
1723       break;
1724     case arrayof_jint_arraycopy_id:
1725       size = sizeof(jint);
1726       aligned = true;
1727       is_oop = false;
1728       dest_uninitialized = false;
1729       break;
1730     case jlong_arraycopy_id:
      // since this is always aligned we can (should!) use the same
      // stub as for case arrayof_jlong_arraycopy
1733       ShouldNotReachHere();
1734       break;
1735     case arrayof_jlong_arraycopy_id:
1736       size = sizeof(jlong);
1737       aligned = true;
1738       is_oop = false;
1739       dest_uninitialized = false;
1740       break;
1741     case oop_arraycopy_id:
1742       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1743       aligned = !UseCompressedOops;
1744       is_oop = true;
1745       dest_uninitialized = false;
1746       break;
1747     case arrayof_oop_arraycopy_id:
1748       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1749       aligned = !UseCompressedOops;
1750       is_oop = true;
1751       dest_uninitialized = false;
1752       break;
1753     case oop_arraycopy_uninit_id:
1754       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1755       aligned = !UseCompressedOops;
1756       is_oop = true;
1757       dest_uninitialized = true;
1758       break;
1759     case arrayof_oop_arraycopy_uninit_id:
1760       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1761       aligned = !UseCompressedOops;
1762       is_oop = true;
1763       dest_uninitialized = true;
1764       break;
1765     default:
1766       ShouldNotReachHere();
1767     }
1768 
1769     StubCodeMark mark(this, stub_id);
1770     address start = __ pc();
1771     __ enter();
1772 
1773     if (entry != nullptr) {
1774       *entry = __ pc();
1775       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1776       BLOCK_COMMENT("Entry:");
1777     }
1778 
1779     // use fwd copy when (d-s) above_equal (count*size)
1780     __ sub(rscratch1, d, s);
1781     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1782     __ br(Assembler::HS, nooverlap_target);
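    // Illustrative check: with an unsigned compare, (d - s) >= count * size
    // covers both d >= s + count * size and d < s, i.e. every case where a
    // forward (disjoint) copy is safe.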
1783 
1784     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1785     if (dest_uninitialized) {
1786       decorators |= IS_DEST_UNINITIALIZED;
1787     }
1788     if (aligned) {
1789       decorators |= ARRAYCOPY_ALIGNED;
1790     }
1791 
1792     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1793     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1794 
1795     if (is_oop) {
1796       // save regs before copy_memory
1797       __ push(RegSet::of(d, count), sp);
1798     }
1799     {
1800       // UnsafeMemoryAccess page error: continue after unsafe access
1801       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1802       UnsafeMemoryAccessMark umam(this, add_entry, true);
1803       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1804     }
1805     if (is_oop) {
1806       __ pop(RegSet::of(d, count), sp);
1807       if (VerifyOops)
1808         verify_oop_array(size, d, count, r16);
1809     }
1810     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1811     __ leave();
1812     __ mov(r0, zr); // return 0
1813     __ ret(lr);
1814     return start;
1815   }
1816 
1817   // Helper for generating a dynamic type check.
1818   // Smashes rscratch1, rscratch2.
1819   void generate_type_check(Register sub_klass,
1820                            Register super_check_offset,
1821                            Register super_klass,
1822                            Register temp1,
1823                            Register temp2,
1824                            Register result,
1825                            Label& L_success) {
1826     assert_different_registers(sub_klass, super_check_offset, super_klass);
1827 
1828     BLOCK_COMMENT("type_check:");
1829 
1830     Label L_miss;
1831 
1832     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1833                                      super_check_offset);
1834     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
1835 
1836     // Fall through on failure!
1837     __ BIND(L_miss);
1838   }
1839 
1840   //
1841   //  Generate checkcasting array copy stub
1842   //
1843   //  Input:
1844   //    c_rarg0   - source array address
1845   //    c_rarg1   - destination array address
1846   //    c_rarg2   - element count, treated as ssize_t, can be zero
1847   //    c_rarg3   - size_t ckoff (super_check_offset)
1848   //    c_rarg4   - oop ckval (super_klass)
1849   //
1850   //  Output:
1851   //    r0 ==  0  -  success
1852   //    r0 == -1^K - failure, where K is partial transfer count
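  //    e.g. if 2 elements were stored before a type-check failure,
  //    r0 == ~2 == -3, and the caller can resume or report from element 2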
1853   //
1854   address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
1855     bool dest_uninitialized;
1856     switch (stub_id) {
1857     case checkcast_arraycopy_id:
1858       dest_uninitialized = false;
1859       break;
1860     case checkcast_arraycopy_uninit_id:
1861       dest_uninitialized = true;
1862       break;
1863     default:
1864       ShouldNotReachHere();
1865     }
1866 
1867     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1868 
1869     // Input registers (after setup_arg_regs)
1870     const Register from        = c_rarg0;   // source array address
1871     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1873     const Register ckoff       = c_rarg3;   // super_check_offset
1874     const Register ckval       = c_rarg4;   // super_klass
1875 
1876     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1877     RegSet wb_post_saved_regs = RegSet::of(count);
1878 
1879     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1880     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
1882     const Register start_to    = r20;       // destination array start address
1883     const Register r19_klass   = r19;       // oop._klass
1884 
1885     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1886     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1887 
1888     //---------------------------------------------------------------
1889     // Assembler stub will be used for this call to arraycopy
1890     // if the two arrays are subtypes of Object[] but the
1891     // destination array type is not equal to or a supertype
1892     // of the source type.  Each element must be separately
1893     // checked.
1894 
1895     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1896                                copied_oop, r19_klass, count_save);
1897 
1898     __ align(CodeEntryAlignment);
1899     StubCodeMark mark(this, stub_id);
1900     address start = __ pc();
1901 
1902     __ enter(); // required for proper stackwalking of RuntimeStub frame
1903 
1904 #ifdef ASSERT
1905     // caller guarantees that the arrays really are different
1906     // otherwise, we would have to make conjoint checks
1907     { Label L;
1908       __ b(L);                  // conjoint check not yet implemented
1909       __ stop("checkcast_copy within a single array");
1910       __ bind(L);
1911     }
1912 #endif //ASSERT
1913 
1914     // Caller of this entry point must set up the argument registers.
1915     if (entry != nullptr) {
1916       *entry = __ pc();
1917       BLOCK_COMMENT("Entry:");
1918     }
1919 
1920      // Empty array:  Nothing to do.
1921     __ cbz(count, L_done);
1922     __ push(RegSet::of(r19, r20, r21, r22), sp);
1923 
1924 #ifdef ASSERT
1925     BLOCK_COMMENT("assert consistent ckoff/ckval");
1926     // The ckoff and ckval must be mutually consistent,
1927     // even though caller generates both.
1928     { Label L;
1929       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1930       __ ldrw(start_to, Address(ckval, sco_offset));
1931       __ cmpw(ckoff, start_to);
1932       __ br(Assembler::EQ, L);
1933       __ stop("super_check_offset inconsistent");
1934       __ bind(L);
1935     }
1936 #endif //ASSERT
1937 
1938     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1939     bool is_oop = true;
1940     int element_size = UseCompressedOops ? 4 : 8;
1941     if (dest_uninitialized) {
1942       decorators |= IS_DEST_UNINITIALIZED;
1943     }
1944 
1945     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1946     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1947 
1948     // save the original count
1949     __ mov(count_save, count);
1950 
1951     // Copy from low to high addresses
1952     __ mov(start_to, to);              // Save destination array start address
1953     __ b(L_load_element);
1954 
1955     // ======== begin loop ========
1956     // (Loop is rotated; its entry is L_load_element.)
1957     // Loop control:
1958     //   for (; count != 0; count--) {
1959     //     copied_oop = load_heap_oop(from++);
1960     //     ... generate_type_check ...;
1961     //     store_heap_oop(to++, copied_oop);
1962     //   }
1963     __ align(OptoLoopAlignment);
1964 
1965     __ BIND(L_store_element);
1966     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1967                       __ post(to, element_size), copied_oop, noreg,
1968                       gct1, gct2, gct3);
1969     __ sub(count, count, 1);
1970     __ cbz(count, L_do_card_marks);
1971 
1972     // ======== loop entry is here ========
1973     __ BIND(L_load_element);
1974     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1975                      copied_oop, noreg, __ post(from, element_size),
1976                      gct1);
1977     __ cbz(copied_oop, L_store_element);
1978 
1979     __ load_klass(r19_klass, copied_oop);// query the object klass
1980 
1981     BLOCK_COMMENT("type_check:");
1982     generate_type_check(/*sub_klass*/r19_klass,
1983                         /*super_check_offset*/ckoff,
1984                         /*super_klass*/ckval,
1985                         /*r_array_base*/gct1,
1986                         /*temp2*/gct2,
1987                         /*result*/r10, L_store_element);
1988 
1989     // Fall through on failure!
1990 
1991     // ======== end loop ========
1992 
1993     // It was a real error; we must depend on the caller to finish the job.
1994     // Register count = remaining oops, count_orig = total oops.
1995     // Emit GC store barriers for the oops we have copied and report
1996     // their number to the caller.
1997 
1998     __ subs(count, count_save, count);     // K = partially copied oop count
1999     __ eon(count, count, zr);              // report (-1^K) to caller
2000     __ br(Assembler::EQ, L_done_pop);
2001 
2002     __ BIND(L_do_card_marks);
2003     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
2004 
2005     __ bind(L_done_pop);
2006     __ pop(RegSet::of(r19, r20, r21, r22), sp);
2007     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2008 
2009     __ bind(L_done);
2010     __ mov(r0, count);
2011     __ leave();
2012     __ ret(lr);
2013 
2014     return start;
2015   }
2016 
2017   // Perform range checks on the proposed arraycopy.
2018   // Kills temp, but nothing else.
2019   // Also, clean the sign bits of src_pos and dst_pos.
2020   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2021                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
2023                               Register dst_pos, // destination position (c_rarg3)
2024                               Register length,
2025                               Register temp,
2026                               Label& L_failed) {
2027     BLOCK_COMMENT("arraycopy_range_checks:");
2028 
2029     assert_different_registers(rscratch1, temp);
2030 
2031     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2032     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2033     __ addw(temp, length, src_pos);
2034     __ cmpw(temp, rscratch1);
2035     __ br(Assembler::HI, L_failed);
2036 
2037     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2038     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2039     __ addw(temp, length, dst_pos);
2040     __ cmpw(temp, rscratch1);
2041     __ br(Assembler::HI, L_failed);
2042 
2043     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
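    // (a 32-bit mov to the same register zero-extends, clearing bits 63:32)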
2044     __ movw(src_pos, src_pos);
2045     __ movw(dst_pos, dst_pos);
2046 
2047     BLOCK_COMMENT("arraycopy_range_checks done");
2048   }
2049 
2050   // These stubs get called from some dumb test routine.
2051   // I'll write them properly when they're called from
2052   // something that's actually doing something.
2053   static void fake_arraycopy_stub(address src, address dst, int count) {
2054     assert(count == 0, "huh?");
2055   }
2056 
2057 
2058   //
2059   //  Generate 'unsafe' array copy stub
2060   //  Though just as safe as the other stubs, it takes an unscaled
2061   //  size_t argument instead of an element count.
2062   //
2063   //  Input:
2064   //    c_rarg0   - source array address
2065   //    c_rarg1   - destination array address
2066   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2067   //
2068   // Examines the alignment of the operands and dispatches
2069   // to a long, int, short, or byte copy loop.
2070   //
2071   address generate_unsafe_copy(address byte_copy_entry,
2072                                address short_copy_entry,
2073                                address int_copy_entry,
2074                                address long_copy_entry) {
2075     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
2076 
2077     Label L_long_aligned, L_int_aligned, L_short_aligned;
2078     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2079 
2080     __ align(CodeEntryAlignment);
2081     StubCodeMark mark(this, stub_id);
2082     address start = __ pc();
2083     __ enter(); // required for proper stackwalking of RuntimeStub frame
2084 
2085     // bump this on entry, not on exit:
2086     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2087 
2088     __ orr(rscratch1, s, d);
2089     __ orr(rscratch1, rscratch1, count);
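    // Dispatch sketch: the low bits of (s | d | count) give the widest element
    // size that keeps every access aligned:
    //   ((s | d | count) & 7) == 0  -> long copy
    //   ((s | d | count) & 3) == 0  -> int copy
    //   ((s | d | count) & 1) == 0  -> short copy
    //   otherwise                   -> byte copy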
2090 
2091     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2092     __ cbz(rscratch1, L_long_aligned);
2093     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2094     __ cbz(rscratch1, L_int_aligned);
2095     __ tbz(rscratch1, 0, L_short_aligned);
2096     __ b(RuntimeAddress(byte_copy_entry));
2097 
2098     __ BIND(L_short_aligned);
2099     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2100     __ b(RuntimeAddress(short_copy_entry));
2101     __ BIND(L_int_aligned);
2102     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2103     __ b(RuntimeAddress(int_copy_entry));
2104     __ BIND(L_long_aligned);
2105     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2106     __ b(RuntimeAddress(long_copy_entry));
2107 
2108     return start;
2109   }
2110 
2111   //
2112   //  Generate generic array copy stubs
2113   //
2114   //  Input:
2115   //    c_rarg0    -  src oop
2116   //    c_rarg1    -  src_pos (32-bits)
2117   //    c_rarg2    -  dst oop
2118   //    c_rarg3    -  dst_pos (32-bits)
2119   //    c_rarg4    -  element count (32-bits)
2120   //
2121   //  Output:
2122   //    r0 ==  0  -  success
2123   //    r0 == -1^K - failure, where K is partial transfer count
2124   //
2125   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2126                                 address int_copy_entry, address oop_copy_entry,
2127                                 address long_copy_entry, address checkcast_copy_entry) {
2128     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
2129 
2130     Label L_failed, L_objArray;
2131     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2132 
2133     // Input registers
2134     const Register src        = c_rarg0;  // source array oop
2135     const Register src_pos    = c_rarg1;  // source position
2136     const Register dst        = c_rarg2;  // destination array oop
2137     const Register dst_pos    = c_rarg3;  // destination position
2138     const Register length     = c_rarg4;
2139 
2140 
2141     // Registers used as temps
2142     const Register dst_klass  = c_rarg5;
2143 
2144     __ align(CodeEntryAlignment);
2145 
2146     StubCodeMark mark(this, stub_id);
2147 
2148     address start = __ pc();
2149 
2150     __ enter(); // required for proper stackwalking of RuntimeStub frame
2151 
2152     // bump this on entry, not on exit:
2153     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2154 
2155     //-----------------------------------------------------------------------
2156     // Assembler stub will be used for this call to arraycopy
2157     // if the following conditions are met:
2158     //
2159     // (1) src and dst must not be null.
2160     // (2) src_pos must not be negative.
2161     // (3) dst_pos must not be negative.
2162     // (4) length  must not be negative.
2163     // (5) src klass and dst klass should be the same and not null.
2164     // (6) src and dst should be arrays.
2165     // (7) src_pos + length must not exceed length of src.
2166     // (8) dst_pos + length must not exceed length of dst.
2167     //
2168 
2169     //  if (src == nullptr) return -1;
2170     __ cbz(src, L_failed);
2171 
2172     //  if (src_pos < 0) return -1;
2173     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2174 
2175     //  if (dst == nullptr) return -1;
2176     __ cbz(dst, L_failed);
2177 
2178     //  if (dst_pos < 0) return -1;
2179     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2180 
2181     // registers used as temp
2182     const Register scratch_length    = r16; // elements count to copy
2183     const Register scratch_src_klass = r17; // array klass
2184     const Register lh                = r15; // layout helper
2185 
2186     //  if (length < 0) return -1;
2187     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2188     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2189 
2190     __ load_klass(scratch_src_klass, src);
2191 #ifdef ASSERT
2192     //  assert(src->klass() != nullptr);
2193     {
2194       BLOCK_COMMENT("assert klasses not null {");
2195       Label L1, L2;
2196       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2197       __ bind(L1);
2198       __ stop("broken null klass");
2199       __ bind(L2);
2200       __ load_klass(rscratch1, dst);
2201       __ cbz(rscratch1, L1);     // this would be broken also
2202       BLOCK_COMMENT("} assert klasses not null done");
2203     }
2204 #endif
2205 
2206     // Load layout helper (32-bits)
2207     //
2208     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2209     // 32        30    24            16              8     2                 0
2210     //
2211     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2212     //
2213 
2214     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2215 
2216     // Handle objArrays completely differently...
2217     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2218     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2219     __ movw(rscratch1, objArray_lh);
2220     __ eorw(rscratch2, lh, rscratch1);
2221     __ cbzw(rscratch2, L_objArray);
2222 
2223     //  if (src->klass() != dst->klass()) return -1;
2224     __ load_klass(rscratch2, dst);
2225     __ eor(rscratch2, rscratch2, scratch_src_klass);
2226     __ cbnz(rscratch2, L_failed);
2227 
2228     //  if (!src->is_Array()) return -1;
2229     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2230 
2231     // At this point, it is known to be a typeArray (array_tag 0x3).
2232 #ifdef ASSERT
2233     {
2234       BLOCK_COMMENT("assert primitive array {");
2235       Label L;
2236       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2237       __ cmpw(lh, rscratch2);
2238       __ br(Assembler::GE, L);
2239       __ stop("must be a primitive array");
2240       __ bind(L);
2241       BLOCK_COMMENT("} assert primitive array done");
2242     }
2243 #endif
2244 
2245     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2246                            rscratch2, L_failed);
2247 
2248     // TypeArrayKlass
2249     //
2250     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2251     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2252     //
2253 
2254     const Register rscratch1_offset = rscratch1;    // array offset
2255     const Register r15_elsize = lh; // element size
2256 
2257     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2258            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2259     __ add(src, src, rscratch1_offset);           // src array offset
2260     __ add(dst, dst, rscratch1_offset);           // dst array offset
2261     BLOCK_COMMENT("choose copy loop based on element size");
2262 
2263     // next registers should be set before the jump to corresponding stub
2264     const Register from     = c_rarg0;  // source array address
2265     const Register to       = c_rarg1;  // destination array address
2266     const Register count    = c_rarg2;  // elements count
2267 
    // 'from', 'to' and 'count' must be set in this order, since they alias
    // 'src', 'src_pos' and 'dst' (c_rarg0..c_rarg2) respectively.
2270 
2271     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2272 
2273     // The possible values of elsize are 0-3, i.e. exact_log2(element
2274     // size in bytes).  We do a simple bitwise binary search.
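    // Sketch: bit 1 of elsize splits {byte, short} from {int, long}; bit 0
    // then picks within the pair (00 byte, 01 short, 10 int, 11 long).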
2275   __ BIND(L_copy_bytes);
2276     __ tbnz(r15_elsize, 1, L_copy_ints);
2277     __ tbnz(r15_elsize, 0, L_copy_shorts);
2278     __ lea(from, Address(src, src_pos));// src_addr
2279     __ lea(to,   Address(dst, dst_pos));// dst_addr
2280     __ movw(count, scratch_length); // length
2281     __ b(RuntimeAddress(byte_copy_entry));
2282 
2283   __ BIND(L_copy_shorts);
2284     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2285     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2286     __ movw(count, scratch_length); // length
2287     __ b(RuntimeAddress(short_copy_entry));
2288 
2289   __ BIND(L_copy_ints);
2290     __ tbnz(r15_elsize, 0, L_copy_longs);
2291     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2292     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2293     __ movw(count, scratch_length); // length
2294     __ b(RuntimeAddress(int_copy_entry));
2295 
2296   __ BIND(L_copy_longs);
2297 #ifdef ASSERT
2298     {
2299       BLOCK_COMMENT("assert long copy {");
2300       Label L;
2301       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2302       __ cmpw(r15_elsize, LogBytesPerLong);
2303       __ br(Assembler::EQ, L);
2304       __ stop("must be long copy, but elsize is wrong");
2305       __ bind(L);
2306       BLOCK_COMMENT("} assert long copy done");
2307     }
2308 #endif
2309     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2310     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2311     __ movw(count, scratch_length); // length
2312     __ b(RuntimeAddress(long_copy_entry));
2313 
2314     // ObjArrayKlass
2315   __ BIND(L_objArray);
2316     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2317 
2318     Label L_plain_copy, L_checkcast_copy;
2319     //  test array classes for subtyping
2320     __ load_klass(r15, dst);
2321     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2322     __ br(Assembler::NE, L_checkcast_copy);
2323 
2324     // Identically typed arrays can be copied without element-wise checks.
2325     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2326                            rscratch2, L_failed);
2327 
2328     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2329     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2330     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2331     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2332     __ movw(count, scratch_length); // length
2333   __ BIND(L_plain_copy);
2334     __ b(RuntimeAddress(oop_copy_entry));
2335 
2336   __ BIND(L_checkcast_copy);
2337     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2338     {
2339       // Before looking at dst.length, make sure dst is also an objArray.
2340       __ ldrw(rscratch1, Address(r15, lh_offset));
2341       __ movw(rscratch2, objArray_lh);
2342       __ eorw(rscratch1, rscratch1, rscratch2);
2343       __ cbnzw(rscratch1, L_failed);
2344 
2345       // It is safe to examine both src.length and dst.length.
2346       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2347                              r15, L_failed);
2348 
2349       __ load_klass(dst_klass, dst); // reload
2350 
2351       // Marshal the base address arguments now, freeing registers.
2352       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2353       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2354       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2355       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2356       __ movw(count, length);           // length (reloaded)
2357       Register sco_temp = c_rarg3;      // this register is free now
2358       assert_different_registers(from, to, count, sco_temp,
2359                                  dst_klass, scratch_src_klass);
2360       // assert_clean_int(count, sco_temp);
2361 
2362       // Generate the type check.
2363       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2364       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2365 
2366       // Smashes rscratch1, rscratch2
2367       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2368                           L_plain_copy);
2369 
2370       // Fetch destination element klass from the ObjArrayKlass header.
2371       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2372       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2373       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2374 
2375       // the checkcast_copy loop needs two extra arguments:
2376       assert(c_rarg3 == sco_temp, "#3 already in place");
2377       // Set up arguments for checkcast_copy_entry.
2378       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2379       __ b(RuntimeAddress(checkcast_copy_entry));
2380     }
2381 
2382   __ BIND(L_failed);
2383     __ mov(r0, -1);
2384     __ leave();   // required for proper stackwalking of RuntimeStub frame
2385     __ ret(lr);
2386 
2387     return start;
2388   }
2389 
2390   //
2391   // Generate stub for array fill. If "aligned" is true, the
2392   // "to" address is assumed to be heapword aligned.
2393   //
2394   // Arguments for generated stub:
2395   //   to:    c_rarg0
2396   //   value: c_rarg1
2397   //   count: c_rarg2 treated as signed
2398   //
2399   address generate_fill(StubGenStubId stub_id) {
2400     BasicType t;
2401     bool aligned;
2402 
2403     switch (stub_id) {
2404     case jbyte_fill_id:
2405       t = T_BYTE;
2406       aligned = false;
2407       break;
2408     case jshort_fill_id:
2409       t = T_SHORT;
2410       aligned = false;
2411       break;
2412     case jint_fill_id:
2413       t = T_INT;
2414       aligned = false;
2415       break;
2416     case arrayof_jbyte_fill_id:
2417       t = T_BYTE;
2418       aligned = true;
2419       break;
2420     case arrayof_jshort_fill_id:
2421       t = T_SHORT;
2422       aligned = true;
2423       break;
2424     case arrayof_jint_fill_id:
2425       t = T_INT;
2426       aligned = true;
2427       break;
2428     default:
2429       ShouldNotReachHere();
2430     };
2431 
2432     __ align(CodeEntryAlignment);
2433     StubCodeMark mark(this, stub_id);
2434     address start = __ pc();
2435 
2436     BLOCK_COMMENT("Entry:");
2437 
    const Register to        = c_rarg0;  // destination array address
2439     const Register value     = c_rarg1;  // value
2440     const Register count     = c_rarg2;  // elements count
2441 
2442     const Register bz_base = r10;        // base for block_zero routine
2443     const Register cnt_words = r11;      // temp register
2444 
2445     __ enter();
2446 
2447     Label L_fill_elements, L_exit1;
2448 
2449     int shift = -1;
2450     switch (t) {
2451       case T_BYTE:
2452         shift = 0;
2453         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2454         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2455         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2456         __ br(Assembler::LO, L_fill_elements);
2457         break;
2458       case T_SHORT:
2459         shift = 1;
2460         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2461         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2462         __ br(Assembler::LO, L_fill_elements);
2463         break;
2464       case T_INT:
2465         shift = 2;
2466         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2467         __ br(Assembler::LO, L_fill_elements);
2468         break;
2469       default: ShouldNotReachHere();
2470     }
2471 
2472     // Align source address at 8 bytes address boundary.
2473     Label L_skip_align1, L_skip_align2, L_skip_align4;
2474     if (!aligned) {
2475       switch (t) {
2476         case T_BYTE:
2477           // One byte misalignment happens only for byte arrays.
2478           __ tbz(to, 0, L_skip_align1);
2479           __ strb(value, Address(__ post(to, 1)));
2480           __ subw(count, count, 1);
2481           __ bind(L_skip_align1);
2482           // Fallthrough
2483         case T_SHORT:
2484           // Two bytes misalignment happens only for byte and short (char) arrays.
2485           __ tbz(to, 1, L_skip_align2);
2486           __ strh(value, Address(__ post(to, 2)));
2487           __ subw(count, count, 2 >> shift);
2488           __ bind(L_skip_align2);
2489           // Fallthrough
2490         case T_INT:
2491           // Align to 8 bytes, we know we are 4 byte aligned to start.
2492           __ tbz(to, 2, L_skip_align4);
2493           __ strw(value, Address(__ post(to, 4)));
2494           __ subw(count, count, 4 >> shift);
2495           __ bind(L_skip_align4);
2496           break;
2497         default: ShouldNotReachHere();
2498       }
2499     }
2500 
2501     //
2502     //  Fill large chunks
2503     //
2504     __ lsrw(cnt_words, count, 3 - shift); // number of words
2505     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2506     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2507     if (UseBlockZeroing) {
2508       Label non_block_zeroing, rest;
2509       // If the fill value is zero we can use the fast zero_words().
2510       __ cbnz(value, non_block_zeroing);
2511       __ mov(bz_base, to);
2512       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2513       address tpc = __ zero_words(bz_base, cnt_words);
2514       if (tpc == nullptr) {
2515         fatal("CodeCache is full at generate_fill");
2516       }
2517       __ b(rest);
2518       __ bind(non_block_zeroing);
2519       __ fill_words(to, cnt_words, value);
2520       __ bind(rest);
2521     } else {
2522       __ fill_words(to, cnt_words, value);
2523     }
2524 
2525     // Remaining count is less than 8 bytes. Fill it by a single store.
2526     // Note that the total length is no less than 8 bytes.
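    // e.g. a byte fill with 3 bytes left: 'to' is advanced to the end and the
    // 8-byte store rewrites [end - 8, end), overlapping 5 bytes already
    // written above with the same value.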
2527     if (t == T_BYTE || t == T_SHORT) {
2528       Label L_exit1;
2529       __ cbzw(count, L_exit1);
2530       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2531       __ str(value, Address(to, -8));    // overwrite some elements
2532       __ bind(L_exit1);
2533       __ leave();
2534       __ ret(lr);
2535     }
2536 
    // Handle fills of less than 8 bytes.
2538     Label L_fill_2, L_fill_4, L_exit2;
2539     __ bind(L_fill_elements);
2540     switch (t) {
2541       case T_BYTE:
2542         __ tbz(count, 0, L_fill_2);
2543         __ strb(value, Address(__ post(to, 1)));
2544         __ bind(L_fill_2);
2545         __ tbz(count, 1, L_fill_4);
2546         __ strh(value, Address(__ post(to, 2)));
2547         __ bind(L_fill_4);
2548         __ tbz(count, 2, L_exit2);
2549         __ strw(value, Address(to));
2550         break;
2551       case T_SHORT:
2552         __ tbz(count, 0, L_fill_4);
2553         __ strh(value, Address(__ post(to, 2)));
2554         __ bind(L_fill_4);
2555         __ tbz(count, 1, L_exit2);
2556         __ strw(value, Address(to));
2557         break;
2558       case T_INT:
2559         __ cbzw(count, L_exit2);
2560         __ strw(value, Address(to));
2561         break;
2562       default: ShouldNotReachHere();
2563     }
2564     __ bind(L_exit2);
2565     __ leave();
2566     __ ret(lr);
2567     return start;
2568   }
2569 
2570   address generate_data_cache_writeback() {
2571     const Register line        = c_rarg0;  // address of line to write back
2572 
2573     __ align(CodeEntryAlignment);
2574 
2575     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
2576     StubCodeMark mark(this, stub_id);
2577 
2578     address start = __ pc();
2579     __ enter();
2580     __ cache_wb(Address(line, 0));
2581     __ leave();
2582     __ ret(lr);
2583 
2584     return start;
2585   }
2586 
2587   address generate_data_cache_writeback_sync() {
2588     const Register is_pre     = c_rarg0;  // pre or post sync
2589 
2590     __ align(CodeEntryAlignment);
2591 
2592     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
2593     StubCodeMark mark(this, stub_id);
2594 
2595     // pre wbsync is a no-op
    // post wbsync issues a memory barrier (the AArch64 analogue of an sfence)
2597 
2598     Label skip;
2599     address start = __ pc();
2600     __ enter();
2601     __ cbnz(is_pre, skip);
2602     __ cache_wbsync(false);
2603     __ bind(skip);
2604     __ leave();
2605     __ ret(lr);
2606 
2607     return start;
2608   }
2609 
2610   void generate_arraycopy_stubs() {
2611     address entry;
2612     address entry_jbyte_arraycopy;
2613     address entry_jshort_arraycopy;
2614     address entry_jint_arraycopy;
2615     address entry_oop_arraycopy;
2616     address entry_jlong_arraycopy;
2617     address entry_checkcast_arraycopy;
2618 
2619     generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
2620     generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
2621 
2622     generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
2623     generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
2624 
2625     generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
2626     generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
2627 
2628     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2629 
2630     //*** jbyte
2631     // Always need aligned and unaligned versions
2632     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
2633     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
2634     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
2635     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
2636 
2637     //*** jshort
2638     // Always need aligned and unaligned versions
2639     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
2640     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
2641     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
2642     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
2643 
2644     //*** jint
2645     // Aligned versions
2646     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
2647     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
2648     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2649     // entry_jint_arraycopy always points to the unaligned version
2650     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
2651     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
2652 
2653     //*** jlong
2654     // It is always aligned
2655     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
2656     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
2657     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2658     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2659 
2660     //*** oops
2661     {
2662       // With compressed oops we need unaligned versions; notice that
2663       // we overwrite entry_oop_arraycopy.
2664       bool aligned = !UseCompressedOops;
2665 
2666       StubRoutines::_arrayof_oop_disjoint_arraycopy
2667         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
2668       StubRoutines::_arrayof_oop_arraycopy
2669         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
2670       // Aligned versions without pre-barriers
2671       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2672         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
2673       StubRoutines::_arrayof_oop_arraycopy_uninit
2674         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
2675     }
2676 
2677     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2678     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2679     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2680     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2681 
2682     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
2683     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
2684 
2685     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
2686                                                               entry_jshort_arraycopy,
2687                                                               entry_jint_arraycopy,
2688                                                               entry_jlong_arraycopy);
2689 
2690     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
2691                                                                entry_jshort_arraycopy,
2692                                                                entry_jint_arraycopy,
2693                                                                entry_oop_arraycopy,
2694                                                                entry_jlong_arraycopy,
2695                                                                entry_checkcast_arraycopy);
2696 
2697     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
2698     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
2699     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
2700     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
2701     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
2702     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
2703   }
2704 
2705   void generate_math_stubs() { Unimplemented(); }
2706 
2707   // Arguments:
2708   //
2709   // Inputs:
2710   //   c_rarg0   - source byte array address
2711   //   c_rarg1   - destination byte array address
2712   //   c_rarg2   - K (key) in little endian int array
2713   //
2714   address generate_aescrypt_encryptBlock() {
2715     __ align(CodeEntryAlignment);
2716     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
2717     StubCodeMark mark(this, stub_id);
2718 
2719     const Register from        = c_rarg0;  // source array address
2720     const Register to          = c_rarg1;  // destination array address
2721     const Register key         = c_rarg2;  // key array address
2722     const Register keylen      = rscratch1;
2723 
2724     address start = __ pc();
2725     __ enter();
2726 
2727     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2728 
2729     __ aesenc_loadkeys(key, keylen);
2730     __ aesecb_encrypt(from, to, keylen);
2731 
2732     __ mov(r0, 0);
2733 
2734     __ leave();
2735     __ ret(lr);
2736 
2737     return start;
2738   }
2739 
2740   // Arguments:
2741   //
2742   // Inputs:
2743   //   c_rarg0   - source byte array address
2744   //   c_rarg1   - destination byte array address
2745   //   c_rarg2   - K (key) in little endian int array
2746   //
2747   address generate_aescrypt_decryptBlock() {
2748     assert(UseAES, "need AES cryptographic extension support");
2749     __ align(CodeEntryAlignment);
2750     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
2751     StubCodeMark mark(this, stub_id);
2752     Label L_doLast;
2753 
2754     const Register from        = c_rarg0;  // source array address
2755     const Register to          = c_rarg1;  // destination array address
2756     const Register key         = c_rarg2;  // key array address
2757     const Register keylen      = rscratch1;
2758 
2759     address start = __ pc();
2760     __ enter(); // required for proper stackwalking of RuntimeStub frame
2761 
2762     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2763 
2764     __ aesecb_decrypt(from, to, key, keylen);
2765 
2766     __ mov(r0, 0);
2767 
2768     __ leave();
2769     __ ret(lr);
2770 
2771     return start;
2772   }
2773 
2774   // Arguments:
2775   //
2776   // Inputs:
2777   //   c_rarg0   - source byte array address
2778   //   c_rarg1   - destination byte array address
2779   //   c_rarg2   - K (key) in little endian int array
2780   //   c_rarg3   - r vector byte array address
2781   //   c_rarg4   - input length
2782   //
2783   // Output:
  //   r0        - input length
2785   //
2786   address generate_cipherBlockChaining_encryptAESCrypt() {
2787     assert(UseAES, "need AES cryptographic extension support");
2788     __ align(CodeEntryAlignment);
2789     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
2790     StubCodeMark mark(this, stub_id);
2791 
2792     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2793 
2794     const Register from        = c_rarg0;  // source array address
2795     const Register to          = c_rarg1;  // destination array address
2796     const Register key         = c_rarg2;  // key array address
2797     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2798                                            // and left with the results of the last encryption block
2799     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2800     const Register keylen      = rscratch1;
2801 
2802     address start = __ pc();
2803 
2804       __ enter();
2805 
2806       __ movw(rscratch2, len_reg);
2807 
2808       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2809 
2810       __ ld1(v0, __ T16B, rvec);
2811 
2812       __ cmpw(keylen, 52);
2813       __ br(Assembler::CC, L_loadkeys_44);
2814       __ br(Assembler::EQ, L_loadkeys_52);
2815 
2816       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2817       __ rev32(v17, __ T16B, v17);
2818       __ rev32(v18, __ T16B, v18);
2819     __ BIND(L_loadkeys_52);
2820       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2821       __ rev32(v19, __ T16B, v19);
2822       __ rev32(v20, __ T16B, v20);
2823     __ BIND(L_loadkeys_44);
2824       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2825       __ rev32(v21, __ T16B, v21);
2826       __ rev32(v22, __ T16B, v22);
2827       __ rev32(v23, __ T16B, v23);
2828       __ rev32(v24, __ T16B, v24);
2829       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2830       __ rev32(v25, __ T16B, v25);
2831       __ rev32(v26, __ T16B, v26);
2832       __ rev32(v27, __ T16B, v27);
2833       __ rev32(v28, __ T16B, v28);
2834       __ ld1(v29, v30, v31, __ T16B, key);
2835       __ rev32(v29, __ T16B, v29);
2836       __ rev32(v30, __ T16B, v30);
2837       __ rev32(v31, __ T16B, v31);
2838 
2839     __ BIND(L_aes_loop);
2840       __ ld1(v1, __ T16B, __ post(from, 16));
2841       __ eor(v0, __ T16B, v0, v1);
2842 
2843       __ br(Assembler::CC, L_rounds_44);
2844       __ br(Assembler::EQ, L_rounds_52);
2845 
2846       __ aese(v0, v17); __ aesmc(v0, v0);
2847       __ aese(v0, v18); __ aesmc(v0, v0);
2848     __ BIND(L_rounds_52);
2849       __ aese(v0, v19); __ aesmc(v0, v0);
2850       __ aese(v0, v20); __ aesmc(v0, v0);
2851     __ BIND(L_rounds_44);
2852       __ aese(v0, v21); __ aesmc(v0, v0);
2853       __ aese(v0, v22); __ aesmc(v0, v0);
2854       __ aese(v0, v23); __ aesmc(v0, v0);
2855       __ aese(v0, v24); __ aesmc(v0, v0);
2856       __ aese(v0, v25); __ aesmc(v0, v0);
2857       __ aese(v0, v26); __ aesmc(v0, v0);
2858       __ aese(v0, v27); __ aesmc(v0, v0);
2859       __ aese(v0, v28); __ aesmc(v0, v0);
2860       __ aese(v0, v29); __ aesmc(v0, v0);
2861       __ aese(v0, v30);
2862       __ eor(v0, __ T16B, v0, v31);
2863 
2864       __ st1(v0, __ T16B, __ post(to, 16));
2865 
2866       __ subw(len_reg, len_reg, 16);
2867       __ cbnzw(len_reg, L_aes_loop);
2868 
2869       __ st1(v0, __ T16B, rvec);
2870 
2871       __ mov(r0, rscratch2);
2872 
2873       __ leave();
2874       __ ret(lr);
2875 
    return start;
2877   }
2878 
2879   // Arguments:
2880   //
2881   // Inputs:
2882   //   c_rarg0   - source byte array address
2883   //   c_rarg1   - destination byte array address
2884   //   c_rarg2   - K (key) in little endian int array
2885   //   c_rarg3   - r vector byte array address
2886   //   c_rarg4   - input length
2887   //
2888   // Output:
2889   //   r0        - input length
2890   //
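  // For reference, CBC decryption computes, for each 16-byte block i,
  //   P[i] = AES_K^-1(C[i]) ^ C[i-1],  with C[-1] = the initial contents of rvec (the IV),
  // and leaves the last ciphertext block back in rvec for chaining.
  //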
2891   address generate_cipherBlockChaining_decryptAESCrypt() {
2892     assert(UseAES, "need AES cryptographic extension support");
2893     __ align(CodeEntryAlignment);
2894     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
2895     StubCodeMark mark(this, stub_id);
2896 
2897     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2898 
2899     const Register from        = c_rarg0;  // source array address
2900     const Register to          = c_rarg1;  // destination array address
2901     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the last input ciphertext block (for chaining)
2904     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2905     const Register keylen      = rscratch1;
2906 
2907     address start = __ pc();
2908 
2909       __ enter();
2910 
2911       __ movw(rscratch2, len_reg);
2912 
2913       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2914 
2915       __ ld1(v2, __ T16B, rvec);
2916 
2917       __ ld1(v31, __ T16B, __ post(key, 16));
2918       __ rev32(v31, __ T16B, v31);
2919 
2920       __ cmpw(keylen, 52);
2921       __ br(Assembler::CC, L_loadkeys_44);
2922       __ br(Assembler::EQ, L_loadkeys_52);
2923 
2924       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2925       __ rev32(v17, __ T16B, v17);
2926       __ rev32(v18, __ T16B, v18);
2927     __ BIND(L_loadkeys_52);
2928       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2929       __ rev32(v19, __ T16B, v19);
2930       __ rev32(v20, __ T16B, v20);
2931     __ BIND(L_loadkeys_44);
2932       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2933       __ rev32(v21, __ T16B, v21);
2934       __ rev32(v22, __ T16B, v22);
2935       __ rev32(v23, __ T16B, v23);
2936       __ rev32(v24, __ T16B, v24);
2937       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2938       __ rev32(v25, __ T16B, v25);
2939       __ rev32(v26, __ T16B, v26);
2940       __ rev32(v27, __ T16B, v27);
2941       __ rev32(v28, __ T16B, v28);
2942       __ ld1(v29, v30, __ T16B, key);
2943       __ rev32(v29, __ T16B, v29);
2944       __ rev32(v30, __ T16B, v30);
2945 
2946     __ BIND(L_aes_loop);
2947       __ ld1(v0, __ T16B, __ post(from, 16));
2948       __ orr(v1, __ T16B, v0, v0);
2949 
2950       __ br(Assembler::CC, L_rounds_44);
2951       __ br(Assembler::EQ, L_rounds_52);
2952 
2953       __ aesd(v0, v17); __ aesimc(v0, v0);
2954       __ aesd(v0, v18); __ aesimc(v0, v0);
2955     __ BIND(L_rounds_52);
2956       __ aesd(v0, v19); __ aesimc(v0, v0);
2957       __ aesd(v0, v20); __ aesimc(v0, v0);
2958     __ BIND(L_rounds_44);
2959       __ aesd(v0, v21); __ aesimc(v0, v0);
2960       __ aesd(v0, v22); __ aesimc(v0, v0);
2961       __ aesd(v0, v23); __ aesimc(v0, v0);
2962       __ aesd(v0, v24); __ aesimc(v0, v0);
2963       __ aesd(v0, v25); __ aesimc(v0, v0);
2964       __ aesd(v0, v26); __ aesimc(v0, v0);
2965       __ aesd(v0, v27); __ aesimc(v0, v0);
2966       __ aesd(v0, v28); __ aesimc(v0, v0);
2967       __ aesd(v0, v29); __ aesimc(v0, v0);
2968       __ aesd(v0, v30);
2969       __ eor(v0, __ T16B, v0, v31);
2970       __ eor(v0, __ T16B, v0, v2);
2971 
2972       __ st1(v0, __ T16B, __ post(to, 16));
2973       __ orr(v2, __ T16B, v1, v1);
2974 
2975       __ subw(len_reg, len_reg, 16);
2976       __ cbnzw(len_reg, L_aes_loop);
2977 
2978       __ st1(v2, __ T16B, rvec);
2979 
2980       __ mov(r0, rscratch2);
2981 
2982       __ leave();
2983       __ ret(lr);
2984 
2985     return start;
2986   }
2987 
  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: in (128 bits, preserved) and inc (the 64-bit increment, preserved;
  // its lower dword must be zero).
  // The least-significant 64-bit word is held in the upper dword of each vector.
  // Output: result
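  //
  // Reference model (illustrative only), with lo/hi denoting the least/most
  // significant 64-bit halves of the 128-bit big-endian value:
  //   lo' = lo + inc;
  //   hi' = hi + (lo' < inc ? 1 : 0);   // carry out of the low half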
2993   void be_add_128_64(FloatRegister result, FloatRegister in,
2994                      FloatRegister inc, FloatRegister tmp) {
2995     assert_different_registers(result, tmp, inc);
2996 
2997     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
2998                                            // input
2999     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
3000     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
3001                                            // MSD == 0 (must be!) to LSD
3002     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
3003   }
3004 
3005   // CTR AES crypt.
3006   // Arguments:
3007   //
3008   // Inputs:
3009   //   c_rarg0   - source byte array address
3010   //   c_rarg1   - destination byte array address
3011   //   c_rarg2   - K (key) in little endian int array
3012   //   c_rarg3   - counter vector byte array address
3013   //   c_rarg4   - input length
3014   //   c_rarg5   - saved encryptedCounter start
3015   //   c_rarg6   - saved used length
3016   //
3017   // Output:
3018   //   r0       - input length
3019   //
3020   address generate_counterMode_AESCrypt() {
3021     const Register in = c_rarg0;
3022     const Register out = c_rarg1;
3023     const Register key = c_rarg2;
3024     const Register counter = c_rarg3;
3025     const Register saved_len = c_rarg4, len = r10;
3026     const Register saved_encrypted_ctr = c_rarg5;
3027     const Register used_ptr = c_rarg6, used = r12;
3028 
3029     const Register offset = r7;
3030     const Register keylen = r11;
3031 
3032     const unsigned char block_size = 16;
3033     const int bulk_width = 4;
3034     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3035     // performance with larger data sizes, but it also means that the
3036     // fast path isn't used until you have at least 8 blocks, and up
3037     // to 127 bytes of data will be executed on the slow path. For
    // to 127 bytes of data will be processed on the slow path. For
3039     // blocks seems like a sensible compromise.
3040 
3041     // Algorithm:
3042     //
3043     //    if (len == 0) {
3044     //        goto DONE;
3045     //    }
3046     //    int result = len;
3047     //    do {
3048     //        if (used >= blockSize) {
3049     //            if (len >= bulk_width * blockSize) {
3050     //                CTR_large_block();
3051     //                if (len == 0)
3052     //                    goto DONE;
3053     //            }
3054     //            for (;;) {
3055     //                16ByteVector v0 = counter;
3056     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3057     //                used = 0;
3058     //                if (len < blockSize)
3059     //                    break;    /* goto NEXT */
3060     //                16ByteVector v1 = load16Bytes(in, offset);
3061     //                v1 = v1 ^ encryptedCounter;
3062     //                store16Bytes(out, offset);
3063     //                used = blockSize;
3064     //                offset += blockSize;
3065     //                len -= blockSize;
3066     //                if (len == 0)
3067     //                    goto DONE;
3068     //            }
3069     //        }
3070     //      NEXT:
3071     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3072     //        len--;
3073     //    } while (len != 0);
3074     //  DONE:
3075     //    return result;
3076     //
3077     // CTR_large_block()
3078     //    Wide bulk encryption of whole blocks.
3079 
3080     __ align(CodeEntryAlignment);
3081     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
3082     StubCodeMark mark(this, stub_id);
3083     const address start = __ pc();
3084     __ enter();
3085 
3086     Label DONE, CTR_large_block, large_block_return;
3087     __ ldrw(used, Address(used_ptr));
3088     __ cbzw(saved_len, DONE);
3089 
3090     __ mov(len, saved_len);
3091     __ mov(offset, 0);
3092 
3093     // Compute #rounds for AES based on the length of the key array
3094     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3095 
3096     __ aesenc_loadkeys(key, keylen);
3097 
3098     {
3099       Label L_CTR_loop, NEXT;
3100 
3101       __ bind(L_CTR_loop);
3102 
3103       __ cmp(used, block_size);
3104       __ br(__ LO, NEXT);
3105 
3106       // Maybe we have a lot of data
3107       __ subsw(rscratch1, len, bulk_width * block_size);
3108       __ br(__ HS, CTR_large_block);
3109       __ BIND(large_block_return);
3110       __ cbzw(len, DONE);
3111 
3112       // Setup the counter
3113       __ movi(v4, __ T4S, 0);
3114       __ movi(v5, __ T4S, 1);
3115       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3116 
3117       // 128-bit big-endian increment
3118       __ ld1(v0, __ T16B, counter);
3119       __ rev64(v16, __ T16B, v0);
3120       be_add_128_64(v16, v16, v4, /*tmp*/v5);
3121       __ rev64(v16, __ T16B, v16);
3122       __ st1(v16, __ T16B, counter);
3123       // Previous counter value is in v0
3124       // v4 contains { 0, 1 }
3125 
3126       {
3127         // We have fewer than bulk_width blocks of data left. Encrypt
3128         // them one by one until there is less than a full block
3129         // remaining, being careful to save both the encrypted counter
3130         // and the counter.
3131 
3132         Label inner_loop;
3133         __ bind(inner_loop);
3134         // Counter to encrypt is in v0
3135         __ aesecb_encrypt(noreg, noreg, keylen);
3136         __ st1(v0, __ T16B, saved_encrypted_ctr);
3137 
3138         // Do we have a remaining full block?
3139 
3140         __ mov(used, 0);
3141         __ cmp(len, block_size);
3142         __ br(__ LO, NEXT);
3143 
3144         // Yes, we have a full block
3145         __ ldrq(v1, Address(in, offset));
3146         __ eor(v1, __ T16B, v1, v0);
3147         __ strq(v1, Address(out, offset));
3148         __ mov(used, block_size);
3149         __ add(offset, offset, block_size);
3150 
3151         __ subw(len, len, block_size);
3152         __ cbzw(len, DONE);
3153 
3154         // Increment the counter, store it back
3155         __ orr(v0, __ T16B, v16, v16);
3156         __ rev64(v16, __ T16B, v16);
3157         be_add_128_64(v16, v16, v4, /*tmp*/v5);
3158         __ rev64(v16, __ T16B, v16);
3159         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3160 
3161         __ b(inner_loop);
3162       }
3163 
3164       __ BIND(NEXT);
3165 
3166       // Encrypt a single byte, and loop.
3167       // We expect this to be a rare event.
3168       __ ldrb(rscratch1, Address(in, offset));
3169       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3170       __ eor(rscratch1, rscratch1, rscratch2);
3171       __ strb(rscratch1, Address(out, offset));
3172       __ add(offset, offset, 1);
3173       __ add(used, used, 1);
      __ subw(len, len, 1);
3175       __ cbnzw(len, L_CTR_loop);
3176     }
3177 
3178     __ bind(DONE);
3179     __ strw(used, Address(used_ptr));
3180     __ mov(r0, saved_len);
3181 
3182     __ leave(); // required for proper stackwalking of RuntimeStub frame
3183     __ ret(lr);
3184 
3185     // Bulk encryption
3186 
    __ BIND(CTR_large_block);
3188     assert(bulk_width == 4 || bulk_width == 8, "must be");
3189 
3190     if (bulk_width == 8) {
3191       __ sub(sp, sp, 4 * 16);
3192       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3193     }
3194     __ sub(sp, sp, 4 * 16);
3195     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3196     RegSet saved_regs = (RegSet::of(in, out, offset)
3197                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3198     __ push(saved_regs, sp);
3199     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3200     __ add(in, in, offset);
3201     __ add(out, out, offset);
3202 
3203     // Keys should already be loaded into the correct registers
3204 
3205     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3206     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3207 
3208     // AES/CTR loop
3209     {
3210       Label L_CTR_loop;
3211       __ BIND(L_CTR_loop);
3212 
3213       // Setup the counters
3214       __ movi(v8, __ T4S, 0);
3215       __ movi(v9, __ T4S, 1);
3216       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3217 
3218       for (int i = 0; i < bulk_width; i++) {
3219         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3220         __ rev64(v0_ofs, __ T16B, v16);
3221         be_add_128_64(v16, v16, v8, /*tmp*/v9);
3222       }
3223 
3224       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3225 
3226       // Encrypt the counters
3227       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3228 
3229       if (bulk_width == 8) {
3230         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3231       }
3232 
3233       // XOR the encrypted counters with the inputs
3234       for (int i = 0; i < bulk_width; i++) {
3235         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3236         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3237         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3238       }
3239 
3240       // Write the encrypted data
3241       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3242       if (bulk_width == 8) {
3243         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3244       }
3245 
3246       __ subw(len, len, 16 * bulk_width);
3247       __ cbnzw(len, L_CTR_loop);
3248     }
3249 
3250     // Save the counter back where it goes
3251     __ rev64(v16, __ T16B, v16);
3252     __ st1(v16, __ T16B, counter);
3253 
3254     __ pop(saved_regs, sp);
3255 
3256     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3257     if (bulk_width == 8) {
3258       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3259     }
3260 
3261     __ andr(rscratch1, len, -16 * bulk_width);
3262     __ sub(len, len, rscratch1);
3263     __ add(offset, offset, rscratch1);
3264     __ mov(used, 16);
3265     __ strw(used, Address(used_ptr));
3266     __ b(large_block_return);
3267 
3268     return start;
3269   }
3270 
3271   // Vector AES Galois Counter Mode implementation. Parameters:
3272   //
3273   // in = c_rarg0
3274   // len = c_rarg1
3275   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3276   // out = c_rarg3
3277   // key = c_rarg4
3278   // state = c_rarg5 - GHASH.state
3279   // subkeyHtbl = c_rarg6 - powers of H
3280   // counter = c_rarg7 - 16 bytes of CTR
3281   // return - number of processed bytes
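  //
  // For reference, the CTR pass XORs an AES-encrypted counter stream with the
  // input, while the GHASH pass folds each 16-byte ciphertext block c[i] into
  // the hash state in GF(2^128):
  //   state = (state ^ c[i]) * H
  // The wide GHASH routine uses the precomputed powers of H in subkeyHtbl to
  // process several blocks per iteration.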
3282   address generate_galoisCounterMode_AESCrypt() {
3283     address ghash_polynomial = __ pc();
3284     __ emit_int64(0x87);  // The low-order bits of the field
3285                           // polynomial (i.e. p = z^7+z^2+z+1)
3286                           // repeated in the low and high parts of a
3287                           // 128-bit vector
3288     __ emit_int64(0x87);
3289 
3290     __ align(CodeEntryAlignment);
3291     StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id;
3292     StubCodeMark mark(this, stub_id);
3293     address start = __ pc();
3294     __ enter();
3295 
3296     const Register in = c_rarg0;
3297     const Register len = c_rarg1;
3298     const Register ct = c_rarg2;
    const Register out = c_rarg3;
3301 
3302     const Register key = c_rarg4;
3303     const Register state = c_rarg5;
3304 
3305     const Register subkeyHtbl = c_rarg6;
3306 
    const Register counter = c_rarg7;  // 16 bytes of CTR, updated with the
                                       // incremented counter at the end
3308 
3309     const Register keylen = r10;
3310     // Save state before entering routine
3311     __ sub(sp, sp, 4 * 16);
3312     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3313     __ sub(sp, sp, 4 * 16);
3314     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3315 
3317     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3318     __ str(len, __ pre(sp, -2 * wordSize));
3319 
3320     Label DONE;
3321     __ cbz(len, DONE);
3322 
3323     // Compute #rounds for AES based on the length of the key array
3324     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3325 
3326     __ aesenc_loadkeys(key, keylen);
3327     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3328     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3329 
3330     // AES/CTR loop
3331     {
3332       Label L_CTR_loop;
3333       __ BIND(L_CTR_loop);
3334 
3335       // Setup the counters
3336       __ movi(v8, __ T4S, 0);
3337       __ movi(v9, __ T4S, 1);
3338       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3339 
3340       assert(v0->encoding() < v8->encoding(), "");
3341       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3342         FloatRegister f = as_FloatRegister(i);
3343         __ rev32(f, __ T16B, v16);
3344         __ addv(v16, __ T4S, v16, v8);
3345       }
3346 
3347       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3348 
3349       // Encrypt the counters
3350       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3351 
3352       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3353 
3354       // XOR the encrypted counters with the inputs
3355       for (int i = 0; i < 8; i++) {
3356         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3357         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3358         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3359       }
3360       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3361       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3362 
3363       __ subw(len, len, 16 * 8);
3364       __ cbnzw(len, L_CTR_loop);
3365     }
3366 
3367     __ rev32(v16, __ T16B, v16);
3368     __ st1(v16, __ T16B, counter);
3369 
3370     __ ldr(len, Address(sp));
3371     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3372 
3373     // GHASH/CTR loop
3374     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3375                                 len, /*unrolls*/4);
3376 
3377 #ifdef ASSERT
3378     { Label L;
3379       __ cmp(len, (unsigned char)0);
3380       __ br(Assembler::EQ, L);
3381       __ stop("stubGenerator: abort");
3382       __ bind(L);
    }
3384 #endif
3385 
    __ bind(DONE);
3387     // Return the number of bytes processed
3388     __ ldr(r0, __ post(sp, 2 * wordSize));
3389 
3390     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3391     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3392 
3393     __ leave(); // required for proper stackwalking of RuntimeStub frame
3394     __ ret(lr);
    return start;
3396   }
3397 
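  // Caches 64 input bytes in eight 64-bit general-purpose registers.
  // Reference model (illustrative only), viewing the 64 bytes as sixteen
  // little-endian 32-bit words w[0..15]:
  //   gen_loads(base)      ~  uint64_t regs[8]; memcpy(regs, base, 64);
  //   extract_u32(dest, i) ~  dest = (uint32_t)(regs[i / 2] >> (32 * (i % 2)));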
3398   class Cached64Bytes {
3399   private:
3400     MacroAssembler *_masm;
3401     Register _regs[8];
3402 
3403   public:
3404     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3406       auto it = rs.begin();
3407       for (auto &r: _regs) {
3408         r = *it;
3409         ++it;
3410       }
3411     }
3412 
3413     void gen_loads(Register base) {
3414       for (int i = 0; i < 8; i += 2) {
3415         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3416       }
3417     }
3418 
3419     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3420     void extract_u32(Register dest, int i) {
3421       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3422     }
3423   };
3424 
3425   // Utility routines for md5.
3426   // Clobbers r10 and r11.
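  //
  // For reference (RFC 1321), each helper performs one MD5 step
  //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + x[k] + t, s)
  // where x[k] is the k-th cached input word, t is the round constant, and f
  // is the round-specific mixing function:
  //   FF: f = (r2 & r3) | (~r2 & r4)
  //   GG: f = (r2 & r4) | (r3 & ~r4)
  //   HH: f = r2 ^ r3 ^ r4
  //   II: f = r3 ^ (r2 | ~r4)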
3427   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3428               int k, int s, int t) {
3429     Register rscratch3 = r10;
3430     Register rscratch4 = r11;
3431 
3432     __ eorw(rscratch3, r3, r4);
3433     __ movw(rscratch2, t);
3434     __ andw(rscratch3, rscratch3, r2);
3435     __ addw(rscratch4, r1, rscratch2);
3436     reg_cache.extract_u32(rscratch1, k);
3437     __ eorw(rscratch3, rscratch3, r4);
3438     __ addw(rscratch4, rscratch4, rscratch1);
3439     __ addw(rscratch3, rscratch3, rscratch4);
3440     __ rorw(rscratch2, rscratch3, 32 - s);
3441     __ addw(r1, rscratch2, r2);
3442   }
3443 
3444   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3445               int k, int s, int t) {
3446     Register rscratch3 = r10;
3447     Register rscratch4 = r11;
3448 
3449     reg_cache.extract_u32(rscratch1, k);
3450     __ movw(rscratch2, t);
3451     __ addw(rscratch4, r1, rscratch2);
3452     __ addw(rscratch4, rscratch4, rscratch1);
3453     __ bicw(rscratch2, r3, r4);
3454     __ andw(rscratch3, r2, r4);
3455     __ addw(rscratch2, rscratch2, rscratch4);
3456     __ addw(rscratch2, rscratch2, rscratch3);
3457     __ rorw(rscratch2, rscratch2, 32 - s);
3458     __ addw(r1, rscratch2, r2);
3459   }
3460 
3461   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3462               int k, int s, int t) {
3463     Register rscratch3 = r10;
3464     Register rscratch4 = r11;
3465 
3466     __ eorw(rscratch3, r3, r4);
3467     __ movw(rscratch2, t);
3468     __ addw(rscratch4, r1, rscratch2);
3469     reg_cache.extract_u32(rscratch1, k);
3470     __ eorw(rscratch3, rscratch3, r2);
3471     __ addw(rscratch4, rscratch4, rscratch1);
3472     __ addw(rscratch3, rscratch3, rscratch4);
3473     __ rorw(rscratch2, rscratch3, 32 - s);
3474     __ addw(r1, rscratch2, r2);
3475   }
3476 
3477   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3478               int k, int s, int t) {
3479     Register rscratch3 = r10;
3480     Register rscratch4 = r11;
3481 
3482     __ movw(rscratch3, t);
3483     __ ornw(rscratch2, r2, r4);
3484     __ addw(rscratch4, r1, rscratch3);
3485     reg_cache.extract_u32(rscratch1, k);
3486     __ eorw(rscratch3, rscratch2, r3);
3487     __ addw(rscratch4, rscratch4, rscratch1);
3488     __ addw(rscratch3, rscratch3, rscratch4);
3489     __ rorw(rscratch2, rscratch3, 32 - s);
3490     __ addw(r1, rscratch2, r2);
3491   }
3492 
3493   // Arguments:
3494   //
3495   // Inputs:
3496   //   c_rarg0   - byte[]  source+offset
3497   //   c_rarg1   - int[]   SHA.state
3498   //   c_rarg2   - int     offset
3499   //   c_rarg3   - int     limit
3500   //
3501   address generate_md5_implCompress(StubGenStubId stub_id) {
3502     bool multi_block;
3503     switch (stub_id) {
3504     case md5_implCompress_id:
3505       multi_block = false;
3506       break;
3507     case md5_implCompressMB_id:
3508       multi_block = true;
3509       break;
3510     default:
3511       ShouldNotReachHere();
3512     }
3513     __ align(CodeEntryAlignment);
3514 
3515     StubCodeMark mark(this, stub_id);
3516     address start = __ pc();
3517 
3518     Register buf       = c_rarg0;
3519     Register state     = c_rarg1;
3520     Register ofs       = c_rarg2;
3521     Register limit     = c_rarg3;
3522     Register a         = r4;
3523     Register b         = r5;
3524     Register c         = r6;
3525     Register d         = r7;
3526     Register rscratch3 = r10;
3527     Register rscratch4 = r11;
3528 
3529     Register state_regs[2] = { r12, r13 };
3530     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3531     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3532 
3533     __ push(saved_regs, sp);
3534 
3535     __ ldp(state_regs[0], state_regs[1], Address(state));
3536     __ ubfx(a, state_regs[0],  0, 32);
3537     __ ubfx(b, state_regs[0], 32, 32);
3538     __ ubfx(c, state_regs[1],  0, 32);
3539     __ ubfx(d, state_regs[1], 32, 32);
3540 
3541     Label md5_loop;
3542     __ BIND(md5_loop);
3543 
3544     reg_cache.gen_loads(buf);
3545 
3546     // Round 1
3547     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3548     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3549     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3550     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3551     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3552     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3553     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3554     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3555     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3556     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3557     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3558     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3559     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3560     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3561     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3562     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3563 
3564     // Round 2
3565     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3566     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3567     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3568     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3569     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3570     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3571     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3572     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3573     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3574     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3575     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3576     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3577     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3578     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3579     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3580     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3581 
3582     // Round 3
3583     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3584     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3585     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3586     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3587     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3588     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3589     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3590     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3591     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3592     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3593     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3594     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3595     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3596     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3597     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3598     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3599 
3600     // Round 4
3601     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3602     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3603     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3604     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3605     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3606     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3607     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3608     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3609     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3610     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3611     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3612     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3613     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3614     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3615     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3616     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3617 
3618     __ addw(a, state_regs[0], a);
3619     __ ubfx(rscratch2, state_regs[0], 32, 32);
3620     __ addw(b, rscratch2, b);
3621     __ addw(c, state_regs[1], c);
3622     __ ubfx(rscratch4, state_regs[1], 32, 32);
3623     __ addw(d, rscratch4, d);
3624 
3625     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3626     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3627 
3628     if (multi_block) {
3629       __ add(buf, buf, 64);
3630       __ add(ofs, ofs, 64);
3631       __ cmp(ofs, limit);
3632       __ br(Assembler::LE, md5_loop);
3633       __ mov(c_rarg0, ofs); // return ofs
3634     }
3635 
3636     // write hash values back in the correct order
3637     __ stp(state_regs[0], state_regs[1], Address(state));
3638 
3639     __ pop(saved_regs, sp);
3640 
3641     __ ret(lr);
3642 
3643     return start;
3644   }
3645 
3646   // Arguments:
3647   //
3648   // Inputs:
3649   //   c_rarg0   - byte[]  source+offset
3650   //   c_rarg1   - int[]   SHA.state
3651   //   c_rarg2   - int     offset
3652   //   c_rarg3   - int     limit
3653   //
3654   address generate_sha1_implCompress(StubGenStubId stub_id) {
3655     bool multi_block;
3656     switch (stub_id) {
3657     case sha1_implCompress_id:
3658       multi_block = false;
3659       break;
3660     case sha1_implCompressMB_id:
3661       multi_block = true;
3662       break;
3663     default:
3664       ShouldNotReachHere();
3665     }
3666 
3667     __ align(CodeEntryAlignment);
3668 
3669     StubCodeMark mark(this, stub_id);
3670     address start = __ pc();
3671 
3672     Register buf   = c_rarg0;
3673     Register state = c_rarg1;
3674     Register ofs   = c_rarg2;
3675     Register limit = c_rarg3;
3676 
3677     Label keys;
3678     Label sha1_loop;
3679 
3680     // load the keys into v0..v3
3681     __ adr(rscratch1, keys);
3682     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3683     // load 5 words state into v6, v7
3684     __ ldrq(v6, Address(state, 0));
3685     __ ldrs(v7, Address(state, 16));
3686 
3687 
3688     __ BIND(sha1_loop);
3689     // load 64 bytes of data into v16..v19
3690     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3691     __ rev32(v16, __ T16B, v16);
3692     __ rev32(v17, __ T16B, v17);
3693     __ rev32(v18, __ T16B, v18);
3694     __ rev32(v19, __ T16B, v19);
3695 
3696     // do the sha1
3697     __ addv(v4, __ T4S, v16, v0);
3698     __ orr(v20, __ T16B, v6, v6);
3699 
3700     FloatRegister d0 = v16;
3701     FloatRegister d1 = v17;
3702     FloatRegister d2 = v18;
3703     FloatRegister d3 = v19;
3704 
3705     for (int round = 0; round < 20; round++) {
3706       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3707       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3708       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3709       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3710       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3711 
3712       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3713       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3714       __ sha1h(tmp2, __ T4S, v20);
3715       if (round < 5)
3716         __ sha1c(v20, __ T4S, tmp3, tmp4);
3717       else if (round < 10 || round >= 15)
3718         __ sha1p(v20, __ T4S, tmp3, tmp4);
3719       else
3720         __ sha1m(v20, __ T4S, tmp3, tmp4);
3721       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3722 
3723       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3724     }
3725 
3726     __ addv(v7, __ T2S, v7, v21);
3727     __ addv(v6, __ T4S, v6, v20);
3728 
3729     if (multi_block) {
3730       __ add(ofs, ofs, 64);
3731       __ cmp(ofs, limit);
3732       __ br(Assembler::LE, sha1_loop);
3733       __ mov(c_rarg0, ofs); // return ofs
3734     }
3735 
3736     __ strq(v6, Address(state, 0));
3737     __ strs(v7, Address(state, 16));
3738 
3739     __ ret(lr);
3740 
3741     __ bind(keys);
3742     __ emit_int32(0x5a827999);
3743     __ emit_int32(0x6ed9eba1);
3744     __ emit_int32(0x8f1bbcdc);
3745     __ emit_int32(0xca62c1d6);
3746 
3747     return start;
3748   }
3749 
3750 
3751   // Arguments:
3752   //
3753   // Inputs:
3754   //   c_rarg0   - byte[]  source+offset
3755   //   c_rarg1   - int[]   SHA.state
3756   //   c_rarg2   - int     offset
3757   //   c_rarg3   - int     limit
3758   //
3759   address generate_sha256_implCompress(StubGenStubId stub_id) {
3760     bool multi_block;
3761     switch (stub_id) {
3762     case sha256_implCompress_id:
3763       multi_block = false;
3764       break;
3765     case sha256_implCompressMB_id:
3766       multi_block = true;
3767       break;
3768     default:
3769       ShouldNotReachHere();
3770     }
3771 
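    // SHA-256 round constants K[0..63] (FIPS 180-4): the first 32 bits of the
    // fractional parts of the cube roots of the first 64 primes.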
3772     static const uint32_t round_consts[64] = {
3773       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3774       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3775       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3776       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3777       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3778       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3779       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3780       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3781       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3782       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3783       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3784       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3785       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3786       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3787       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3788       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3789     };
3790 
3791     __ align(CodeEntryAlignment);
3792 
3793     StubCodeMark mark(this, stub_id);
3794     address start = __ pc();
3795 
3796     Register buf   = c_rarg0;
3797     Register state = c_rarg1;
3798     Register ofs   = c_rarg2;
3799     Register limit = c_rarg3;
3800 
3801     Label sha1_loop;
3802 
3803     __ stpd(v8, v9, __ pre(sp, -32));
3804     __ stpd(v10, v11, Address(sp, 16));
3805 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7
3813 
3814     // load 16 keys to v16..v31
3815     __ lea(rscratch1, ExternalAddress((address)round_consts));
3816     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3817     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3818     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3819     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3820 
3821     // load 8 words (256 bits) state
3822     __ ldpq(v0, v1, state);
3823 
3824     __ BIND(sha1_loop);
3825     // load 64 bytes of data into v8..v11
3826     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3827     __ rev32(v8, __ T16B, v8);
3828     __ rev32(v9, __ T16B, v9);
3829     __ rev32(v10, __ T16B, v10);
3830     __ rev32(v11, __ T16B, v11);
3831 
3832     __ addv(v6, __ T4S, v8, v16);
3833     __ orr(v2, __ T16B, v0, v0);
3834     __ orr(v3, __ T16B, v1, v1);
3835 
3836     FloatRegister d0 = v8;
3837     FloatRegister d1 = v9;
3838     FloatRegister d2 = v10;
3839     FloatRegister d3 = v11;
3840 
3841 
3842     for (int round = 0; round < 16; round++) {
3843       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3844       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3845       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3846       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3847 
3848       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3849        __ orr(v4, __ T16B, v2, v2);
3850       if (round < 15)
3851         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3852       __ sha256h(v2, __ T4S, v3, tmp2);
3853       __ sha256h2(v3, __ T4S, v4, tmp2);
3854       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3855 
3856       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3857     }
3858 
3859     __ addv(v0, __ T4S, v0, v2);
3860     __ addv(v1, __ T4S, v1, v3);
3861 
3862     if (multi_block) {
3863       __ add(ofs, ofs, 64);
3864       __ cmp(ofs, limit);
3865       __ br(Assembler::LE, sha1_loop);
3866       __ mov(c_rarg0, ofs); // return ofs
3867     }
3868 
3869     __ ldpd(v10, v11, Address(sp, 16));
3870     __ ldpd(v8, v9, __ post(sp, 32));
3871 
3872     __ stpq(v0, v1, state);
3873 
3874     __ ret(lr);
3875 
3876     return start;
3877   }
3878 
  // Double rounds for sha512: each call performs two of the 80 SHA-512 rounds
  // using the Armv8 SHA512 instructions, consuming one round-constant pair
  // (vrc0) and, while dr < 36, prefetching the next pair into vrc1.
3880   void sha512_dround(int dr,
3881                      FloatRegister vi0, FloatRegister vi1,
3882                      FloatRegister vi2, FloatRegister vi3,
3883                      FloatRegister vi4, FloatRegister vrc0,
3884                      FloatRegister vrc1, FloatRegister vin0,
3885                      FloatRegister vin1, FloatRegister vin2,
3886                      FloatRegister vin3, FloatRegister vin4) {
3887       if (dr < 36) {
3888         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3889       }
3890       __ addv(v5, __ T2D, vrc0, vin0);
3891       __ ext(v6, __ T16B, vi2, vi3, 8);
3892       __ ext(v5, __ T16B, v5, v5, 8);
3893       __ ext(v7, __ T16B, vi1, vi2, 8);
3894       __ addv(vi3, __ T2D, vi3, v5);
3895       if (dr < 32) {
3896         __ ext(v5, __ T16B, vin3, vin4, 8);
3897         __ sha512su0(vin0, __ T2D, vin1);
3898       }
3899       __ sha512h(vi3, __ T2D, v6, v7);
3900       if (dr < 32) {
3901         __ sha512su1(vin0, __ T2D, vin2, v5);
3902       }
3903       __ addv(vi4, __ T2D, vi1, vi3);
3904       __ sha512h2(vi3, __ T2D, vi1, vi0);
3905   }
3906 
3907   // Arguments:
3908   //
3909   // Inputs:
3910   //   c_rarg0   - byte[]  source+offset
3911   //   c_rarg1   - int[]   SHA.state
3912   //   c_rarg2   - int     offset
3913   //   c_rarg3   - int     limit
3914   //
3915   address generate_sha512_implCompress(StubGenStubId stub_id) {
3916     bool multi_block;
3917     switch (stub_id) {
3918     case sha512_implCompress_id:
3919       multi_block = false;
3920       break;
3921     case sha512_implCompressMB_id:
3922       multi_block = true;
3923       break;
3924     default:
3925       ShouldNotReachHere();
3926     }
3927 
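    // SHA-512 round constants K[0..79] (FIPS 180-4): the first 64 bits of the
    // fractional parts of the cube roots of the first 80 primes.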
3928     static const uint64_t round_consts[80] = {
3929       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3930       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3931       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3932       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3933       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3934       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3935       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3936       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3937       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3938       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3939       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3940       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3941       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3942       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3943       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3944       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3945       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3946       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3947       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3948       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3949       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3950       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3951       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3952       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3953       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3954       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3955       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3956     };
3957 
3958     __ align(CodeEntryAlignment);
3959 
3960     StubCodeMark mark(this, stub_id);
3961     address start = __ pc();
3962 
3963     Register buf   = c_rarg0;
3964     Register state = c_rarg1;
3965     Register ofs   = c_rarg2;
3966     Register limit = c_rarg3;
3967 
3968     __ stpd(v8, v9, __ pre(sp, -64));
3969     __ stpd(v10, v11, Address(sp, 16));
3970     __ stpd(v12, v13, Address(sp, 32));
3971     __ stpd(v14, v15, Address(sp, 48));
3972 
3973     Label sha512_loop;
3974 
3975     // load state
3976     __ ld1(v8, v9, v10, v11, __ T2D, state);
3977 
3978     // load first 4 round constants
3979     __ lea(rscratch1, ExternalAddress((address)round_consts));
3980     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3981 
3982     __ BIND(sha512_loop);
3983     // load 128B of data into v12..v19
3984     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3985     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3986     __ rev64(v12, __ T16B, v12);
3987     __ rev64(v13, __ T16B, v13);
3988     __ rev64(v14, __ T16B, v14);
3989     __ rev64(v15, __ T16B, v15);
3990     __ rev64(v16, __ T16B, v16);
3991     __ rev64(v17, __ T16B, v17);
3992     __ rev64(v18, __ T16B, v18);
3993     __ rev64(v19, __ T16B, v19);
3994 
3995     __ mov(rscratch2, rscratch1);
3996 
3997     __ mov(v0, __ T16B, v8);
3998     __ mov(v1, __ T16B, v9);
3999     __ mov(v2, __ T16B, v10);
4000     __ mov(v3, __ T16B, v11);
4001 
4002     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4003     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4004     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4005     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4006     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4007     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4008     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4009     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4010     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4011     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4012     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4013     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4014     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4015     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4016     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4017     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4018     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4019     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4020     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4021     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4022     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4023     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4024     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4025     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4026     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4027     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4028     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4029     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4030     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4031     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4032     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4033     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4034     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
4035     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
4036     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
4037     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
4038     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
4039     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
4040     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
4041     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
4042 
4043     __ addv(v8, __ T2D, v8, v0);
4044     __ addv(v9, __ T2D, v9, v1);
4045     __ addv(v10, __ T2D, v10, v2);
4046     __ addv(v11, __ T2D, v11, v3);
4047 
4048     if (multi_block) {
4049       __ add(ofs, ofs, 128);
4050       __ cmp(ofs, limit);
4051       __ br(Assembler::LE, sha512_loop);
4052       __ mov(c_rarg0, ofs); // return ofs
4053     }
4054 
4055     __ st1(v8, v9, v10, v11, __ T2D, state);
4056 
4057     __ ldpd(v14, v15, Address(sp, 48));
4058     __ ldpd(v12, v13, Address(sp, 32));
4059     __ ldpd(v10, v11, Address(sp, 16));
4060     __ ldpd(v8, v9, __ post(sp, 64));
4061 
4062     __ ret(lr);
4063 
4064     return start;
4065   }
4066 
4067   // Arguments:
4068   //
4069   // Inputs:
4070   //   c_rarg0   - byte[]  source+offset
4071   //   c_rarg1   - byte[]  SHA.state
4072   //   c_rarg2   - int     block_size
4073   //   c_rarg3   - int     offset
4074   //   c_rarg4   - int     limit
4075   //
4076   address generate_sha3_implCompress(StubGenStubId stub_id) {
4077     bool multi_block;
4078     switch (stub_id) {
4079     case sha3_implCompress_id:
4080       multi_block = false;
4081       break;
4082     case sha3_implCompressMB_id:
4083       multi_block = true;
4084       break;
4085     default:
4086       ShouldNotReachHere();
4087     }
4088 
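    // Keccak-f[1600] iota round constants RC[0..23] (FIPS 202), one per round.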
4089     static const uint64_t round_consts[24] = {
4090       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4091       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4092       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4093       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4094       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4095       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4096       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4097       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4098     };
4099 
4100     __ align(CodeEntryAlignment);
4101 
4102     StubCodeMark mark(this, stub_id);
4103     address start = __ pc();
4104 
4105     Register buf           = c_rarg0;
4106     Register state         = c_rarg1;
4107     Register block_size    = c_rarg2;
4108     Register ofs           = c_rarg3;
4109     Register limit         = c_rarg4;
4110 
4111     Label sha3_loop, rounds24_loop;
4112     Label sha3_512_or_sha3_384, shake128;
4113 
4114     __ stpd(v8, v9, __ pre(sp, -64));
4115     __ stpd(v10, v11, Address(sp, 16));
4116     __ stpd(v12, v13, Address(sp, 32));
4117     __ stpd(v14, v15, Address(sp, 48));
4118 
4119     // load state
4120     __ add(rscratch1, state, 32);
4121     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
4122     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
4123     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4124     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4125     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4126     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4127     __ ld1(v24, __ T1D, rscratch1);
4128 
4129     __ BIND(sha3_loop);
4130 
4131     // 24 keccak rounds
4132     __ movw(rscratch2, 24);
4133 
4134     // load round_constants base
4135     __ lea(rscratch1, ExternalAddress((address) round_consts));
4136 
4137     // load input
4138     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4139     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4140     __ eor(v0, __ T8B, v0, v25);
4141     __ eor(v1, __ T8B, v1, v26);
4142     __ eor(v2, __ T8B, v2, v27);
4143     __ eor(v3, __ T8B, v3, v28);
4144     __ eor(v4, __ T8B, v4, v29);
4145     __ eor(v5, __ T8B, v5, v30);
4146     __ eor(v6, __ T8B, v6, v31);
4147 
4148     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4149     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4150 
4151     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4152     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4153     __ eor(v7, __ T8B, v7, v25);
4154     __ eor(v8, __ T8B, v8, v26);
4155     __ eor(v9, __ T8B, v9, v27);
4156     __ eor(v10, __ T8B, v10, v28);
4157     __ eor(v11, __ T8B, v11, v29);
4158     __ eor(v12, __ T8B, v12, v30);
4159     __ eor(v13, __ T8B, v13, v31);
4160 
4161     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4162     __ eor(v14, __ T8B, v14, v25);
4163     __ eor(v15, __ T8B, v15, v26);
4164     __ eor(v16, __ T8B, v16, v27);
4165 
4166     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4167     __ andw(c_rarg5, block_size, 48);
4168     __ cbzw(c_rarg5, rounds24_loop);
4169 
4170     __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
4172     __ ldrd(v28, __ post(buf, 8));
4173     __ eor(v17, __ T8B, v17, v28);
4174     __ b(rounds24_loop);
4175 
4176     __ BIND(shake128);
4177     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4178     __ eor(v17, __ T8B, v17, v28);
4179     __ eor(v18, __ T8B, v18, v29);
4180     __ eor(v19, __ T8B, v19, v30);
4181     __ eor(v20, __ T8B, v20, v31);
4182     __ b(rounds24_loop); // block_size == 168, SHAKE128
4183 
4184     __ BIND(sha3_512_or_sha3_384);
4185     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4186     __ eor(v7, __ T8B, v7, v25);
4187     __ eor(v8, __ T8B, v8, v26);
4188     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4189 
4190     // SHA3-384
4191     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4192     __ eor(v9,  __ T8B, v9,  v27);
4193     __ eor(v10, __ T8B, v10, v28);
4194     __ eor(v11, __ T8B, v11, v29);
4195     __ eor(v12, __ T8B, v12, v30);
4196 
4197     __ BIND(rounds24_loop);
4198     __ subw(rscratch2, rscratch2, 1);
4199 
4200     __ eor3(v29, __ T16B, v4, v9, v14);
4201     __ eor3(v26, __ T16B, v1, v6, v11);
4202     __ eor3(v28, __ T16B, v3, v8, v13);
4203     __ eor3(v25, __ T16B, v0, v5, v10);
4204     __ eor3(v27, __ T16B, v2, v7, v12);
4205     __ eor3(v29, __ T16B, v29, v19, v24);
4206     __ eor3(v26, __ T16B, v26, v16, v21);
4207     __ eor3(v28, __ T16B, v28, v18, v23);
4208     __ eor3(v25, __ T16B, v25, v15, v20);
4209     __ eor3(v27, __ T16B, v27, v17, v22);
4210 
4211     __ rax1(v30, __ T2D, v29, v26);
4212     __ rax1(v26, __ T2D, v26, v28);
4213     __ rax1(v28, __ T2D, v28, v25);
4214     __ rax1(v25, __ T2D, v25, v27);
4215     __ rax1(v27, __ T2D, v27, v29);
4216 
4217     __ eor(v0, __ T16B, v0, v30);
4218     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4219     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4220     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4221     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4222     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4223     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4224     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4225     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4226     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4227     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4228     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4229     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4230     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4231     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4232     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4233     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4234     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4235     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4236     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4237     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4238     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4239     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4240     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4241     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4242 
4243     __ bcax(v20, __ T16B, v31, v22, v8);
4244     __ bcax(v21, __ T16B, v8,  v23, v22);
4245     __ bcax(v22, __ T16B, v22, v24, v23);
4246     __ bcax(v23, __ T16B, v23, v31, v24);
4247     __ bcax(v24, __ T16B, v24, v8,  v31);
4248 
4249     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4250 
4251     __ bcax(v17, __ T16B, v25, v19, v3);
4252     __ bcax(v18, __ T16B, v3,  v15, v19);
4253     __ bcax(v19, __ T16B, v19, v16, v15);
4254     __ bcax(v15, __ T16B, v15, v25, v16);
4255     __ bcax(v16, __ T16B, v16, v3,  v25);
4256 
4257     __ bcax(v10, __ T16B, v29, v12, v26);
4258     __ bcax(v11, __ T16B, v26, v13, v12);
4259     __ bcax(v12, __ T16B, v12, v14, v13);
4260     __ bcax(v13, __ T16B, v13, v29, v14);
4261     __ bcax(v14, __ T16B, v14, v26, v29);
4262 
4263     __ bcax(v7, __ T16B, v30, v9,  v4);
4264     __ bcax(v8, __ T16B, v4,  v5,  v9);
4265     __ bcax(v9, __ T16B, v9,  v6,  v5);
4266     __ bcax(v5, __ T16B, v5,  v30, v6);
4267     __ bcax(v6, __ T16B, v6,  v4,  v30);
4268 
4269     __ bcax(v3, __ T16B, v27, v0,  v28);
4270     __ bcax(v4, __ T16B, v28, v1,  v0);
4271     __ bcax(v0, __ T16B, v0,  v2,  v1);
4272     __ bcax(v1, __ T16B, v1,  v27, v2);
4273     __ bcax(v2, __ T16B, v2,  v28, v27);
4274 
4275     __ eor(v0, __ T16B, v0, v31);
4276 
4277     __ cbnzw(rscratch2, rounds24_loop);
4278 
4279     if (multi_block) {
4280       __ add(ofs, ofs, block_size);
4281       __ cmp(ofs, limit);
4282       __ br(Assembler::LE, sha3_loop);
4283       __ mov(c_rarg0, ofs); // return ofs
4284     }
4285 
4286     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4287     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4288     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4289     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4290     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4291     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4292     __ st1(v24, __ T1D, state);
4293 
4294     __ ldpd(v14, v15, Address(sp, 48));
4295     __ ldpd(v12, v13, Address(sp, 32));
4296     __ ldpd(v10, v11, Address(sp, 16));
4297     __ ldpd(v8, v9, __ post(sp, 64));
4298 
4299     __ ret(lr);
4300 
4301     return start;
4302   }
4303 
4304   /**
4305    *  Arguments:
4306    *
4307    * Inputs:
4308    *   c_rarg0   - int crc
4309    *   c_rarg1   - byte* buf
4310    *   c_rarg2   - int length
4311    *
4312    * Output:
   *       r0    - int crc result
4314    */
4315   address generate_updateBytesCRC32() {
4316     assert(UseCRC32Intrinsics, "what are we doing here?");
4317 
4318     __ align(CodeEntryAlignment);
4319     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
4320     StubCodeMark mark(this, stub_id);
4321 
4322     address start = __ pc();
4323 
4324     const Register crc   = c_rarg0;  // crc
4325     const Register buf   = c_rarg1;  // source java byte array address
4326     const Register len   = c_rarg2;  // length
4327     const Register table0 = c_rarg3; // crc_table address
4328     const Register table1 = c_rarg4;
4329     const Register table2 = c_rarg5;
4330     const Register table3 = c_rarg6;
4331     const Register tmp3 = c_rarg7;
4332 
4333     BLOCK_COMMENT("Entry:");
4334     __ enter(); // required for proper stackwalking of RuntimeStub frame
4335 
4336     __ kernel_crc32(crc, buf, len,
4337               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4338 
4339     __ leave(); // required for proper stackwalking of RuntimeStub frame
4340     __ ret(lr);
4341 
4342     return start;
4343   }
4344 
4345   // ChaCha20 block function.  This version parallelizes 4 quarter
4346   // round operations at a time.  It uses 16 SIMD registers to
4347   // produce 4 blocks of key stream.
4348   //
4349   // state (int[16]) = c_rarg0
4350   // keystream (byte[256]) = c_rarg1
4351   // return - number of bytes of keystream (always 256)
4352   //
4353   // In this approach, we load the 512-bit start state sequentially into
4354   // 4 128-bit vectors.  We then make 4 4-vector copies of that starting
4355   // state, with each successive set of 4 vectors having a +1 added into
4356   // the first 32-bit lane of the 4th vector in that group (the counter).
4357   // By doing this, we can perform the block function on 4 512-bit blocks
4358   // within one run of this intrinsic.
4359   // The alignment of the data across the 4-vector group is such that at
4360   // the start it is already aligned for the first round of each two-round
4361   // loop iteration.  In other words, the corresponding lanes of each vector
4362   // will contain the values needed for that quarter round operation (e.g.
4363   // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.).
4364   // In between each full round, a lane shift must occur.  Within a loop
4365   // iteration, between the first and second rounds, the 2nd, 3rd, and 4th
4366   // vectors are rotated left 32, 64 and 96 bits, respectively.  The result
4367   // is effectively a diagonal orientation in columnar form.  After the
4368   // second full round, those registers are left-rotated again, this time
4369   // 96, 64, and 32 bits - returning the vectors to their columnar organization.
4370   // After all 10 iterations, the original state is added to each 4-vector
4371   // working state along with the add mask, and the 4 vector groups are
4372   // sequentially written to the memory dedicated for the output key stream.
4373   //
4374   // For a more detailed explanation, see Goll and Gueron, "Vectorization of
4375   // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology:
4376   // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33
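  //
  // For reference, a hedged C-style sketch of one scalar quarter round (the
  // same operation cc20_quarter_round is assumed to apply to four 32-bit
  // lanes at once); rotl is an assumed 32-bit rotate-left helper:
  //
  //   void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
  //     *a += *b; *d ^= *a; *d = rotl(*d, 16);
  //     *c += *d; *b ^= *c; *b = rotl(*b, 12);
  //     *a += *b; *d ^= *a; *d = rotl(*d, 8);
  //     *c += *d; *b ^= *c; *b = rotl(*b, 7);
  //   }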
4377   address generate_chacha20Block_qrpar() {
4378     Label L_Q_twoRounds, L_Q_cc20_const;
4379     // The constant data is broken into two 128-bit segments to be loaded
4380     // onto SIMD registers.  The first 128 bits are a counter add overlay
4381     // that adds +1/+0/+0/+0 to the vectors holding replicated state[12].
    // The second 128 bits are a table constant used for 8-bit left rotations
    // on 32-bit lanes within a SIMD register.
4384     __ BIND(L_Q_cc20_const);
4385     __ emit_int64(0x0000000000000001UL);
4386     __ emit_int64(0x0000000000000000UL);
4387     __ emit_int64(0x0605040702010003UL);
4388     __ emit_int64(0x0E0D0C0F0A09080BUL);
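    // A hedged note on the second constant above: read as 16 little-endian
    // byte indices it is { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 }, so a
    // tbl lookup with it maps each lane's bytes (b0,b1,b2,b3) to
    // (b3,b0,b1,b2), i.e. it rotates every 32-bit lane left by 8 bits.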
4389 
4390     __ align(CodeEntryAlignment);
4391     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
4392     StubCodeMark mark(this, stub_id);
4393     address start = __ pc();
4394     __ enter();
4395 
4396     const Register state = c_rarg0;
4397     const Register keystream = c_rarg1;
4398     const Register loopCtr = r10;
4399     const Register tmpAddr = r11;
4400 
4401     const FloatRegister aState = v0;
4402     const FloatRegister bState = v1;
4403     const FloatRegister cState = v2;
4404     const FloatRegister dState = v3;
4405     const FloatRegister a1Vec = v4;
4406     const FloatRegister b1Vec = v5;
4407     const FloatRegister c1Vec = v6;
4408     const FloatRegister d1Vec = v7;
4409     // Skip the callee-saved registers v8 - v15
4410     const FloatRegister a2Vec = v16;
4411     const FloatRegister b2Vec = v17;
4412     const FloatRegister c2Vec = v18;
4413     const FloatRegister d2Vec = v19;
4414     const FloatRegister a3Vec = v20;
4415     const FloatRegister b3Vec = v21;
4416     const FloatRegister c3Vec = v22;
4417     const FloatRegister d3Vec = v23;
4418     const FloatRegister a4Vec = v24;
4419     const FloatRegister b4Vec = v25;
4420     const FloatRegister c4Vec = v26;
4421     const FloatRegister d4Vec = v27;
4422     const FloatRegister scratch = v28;
4423     const FloatRegister addMask = v29;
4424     const FloatRegister lrot8Tbl = v30;
4425 
4426     // Load the initial state in the first 4 quadword registers,
4427     // then copy the initial state into the next 4 quadword registers
4428     // that will be used for the working state.
4429     __ ld1(aState, bState, cState, dState, __ T16B, Address(state));
4430 
4431     // Load the index register for 2 constant 128-bit data fields.
4432     // The first represents the +1/+0/+0/+0 add mask.  The second is
4433     // the 8-bit left rotation.
4434     __ adr(tmpAddr, L_Q_cc20_const);
4435     __ ldpq(addMask, lrot8Tbl, Address(tmpAddr));
4436 
4437     __ mov(a1Vec, __ T16B, aState);
4438     __ mov(b1Vec, __ T16B, bState);
4439     __ mov(c1Vec, __ T16B, cState);
4440     __ mov(d1Vec, __ T16B, dState);
4441 
4442     __ mov(a2Vec, __ T16B, aState);
4443     __ mov(b2Vec, __ T16B, bState);
4444     __ mov(c2Vec, __ T16B, cState);
4445     __ addv(d2Vec, __ T4S, d1Vec, addMask);
4446 
4447     __ mov(a3Vec, __ T16B, aState);
4448     __ mov(b3Vec, __ T16B, bState);
4449     __ mov(c3Vec, __ T16B, cState);
4450     __ addv(d3Vec, __ T4S, d2Vec, addMask);
4451 
4452     __ mov(a4Vec, __ T16B, aState);
4453     __ mov(b4Vec, __ T16B, bState);
4454     __ mov(c4Vec, __ T16B, cState);
4455     __ addv(d4Vec, __ T4S, d3Vec, addMask);
4456 
4457     // Set up the 10 iteration loop
4458     __ mov(loopCtr, 10);
4459     __ BIND(L_Q_twoRounds);
4460 
4461     // The first set of operations on the vectors covers the first 4 quarter
4462     // round operations:
4463     //  Qround(state, 0, 4, 8,12)
4464     //  Qround(state, 1, 5, 9,13)
4465     //  Qround(state, 2, 6,10,14)
4466     //  Qround(state, 3, 7,11,15)
4467     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
4468     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
4469     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
4470     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
4471 
4472     // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to
4473     // diagonals. The a1Vec does not need to change orientation.
4474     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true);
4475     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true);
4476     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true);
4477     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true);
4478 
4479     // The second set of operations on the vectors covers the second 4 quarter
4480     // round operations, now acting on the diagonals:
4481     //  Qround(state, 0, 5,10,15)
4482     //  Qround(state, 1, 6,11,12)
4483     //  Qround(state, 2, 7, 8,13)
4484     //  Qround(state, 3, 4, 9,14)
4485     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
4486     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
4487     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
4488     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
4489 
4490     // Before we start the next iteration, we need to perform shuffles
4491     // on the b/c/d vectors to move them back to columnar organizations
4492     // from their current diagonal orientation.
4493     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false);
4494     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false);
4495     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false);
4496     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false);
4497 
4498     // Decrement and iterate
4499     __ sub(loopCtr, loopCtr, 1);
4500     __ cbnz(loopCtr, L_Q_twoRounds);
4501 
4502     // Once the counter reaches zero, we fall out of the loop
4503     // and need to add the initial state back into the working state
    // represented by the a/b/c/d1Vec registers.  This is destructive to the
    // dState register, but we will no longer need it.
4506     __ addv(a1Vec, __ T4S, a1Vec, aState);
4507     __ addv(b1Vec, __ T4S, b1Vec, bState);
4508     __ addv(c1Vec, __ T4S, c1Vec, cState);
4509     __ addv(d1Vec, __ T4S, d1Vec, dState);
4510 
4511     __ addv(a2Vec, __ T4S, a2Vec, aState);
4512     __ addv(b2Vec, __ T4S, b2Vec, bState);
4513     __ addv(c2Vec, __ T4S, c2Vec, cState);
4514     __ addv(dState, __ T4S, dState, addMask);
4515     __ addv(d2Vec, __ T4S, d2Vec, dState);
4516 
4517     __ addv(a3Vec, __ T4S, a3Vec, aState);
4518     __ addv(b3Vec, __ T4S, b3Vec, bState);
4519     __ addv(c3Vec, __ T4S, c3Vec, cState);
4520     __ addv(dState, __ T4S, dState, addMask);
4521     __ addv(d3Vec, __ T4S, d3Vec, dState);
4522 
4523     __ addv(a4Vec, __ T4S, a4Vec, aState);
4524     __ addv(b4Vec, __ T4S, b4Vec, bState);
4525     __ addv(c4Vec, __ T4S, c4Vec, cState);
4526     __ addv(dState, __ T4S, dState, addMask);
4527     __ addv(d4Vec, __ T4S, d4Vec, dState);
4528 
4529     // Write the final state back to the result buffer
4530     __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64));
4531     __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64));
4532     __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64));
4533     __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64));
4534 
4535     __ mov(r0, 256);             // Return length of output keystream
4536     __ leave();
4537     __ ret(lr);
4538 
4539     return start;
4540   }
4541 
4542   /**
4543    *  Arguments:
4544    *
4545    * Inputs:
4546    *   c_rarg0   - int crc
4547    *   c_rarg1   - byte* buf
4548    *   c_rarg2   - int length
4549    *   c_rarg3   - int* table
4550    *
4551    * Output:
4552    *       r0   - int crc result
4553    */
4554   address generate_updateBytesCRC32C() {
4555     assert(UseCRC32CIntrinsics, "what are we doing here?");
4556 
4557     __ align(CodeEntryAlignment);
4558     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
4559     StubCodeMark mark(this, stub_id);
4560 
4561     address start = __ pc();
4562 
4563     const Register crc   = c_rarg0;  // crc
4564     const Register buf   = c_rarg1;  // source java byte array address
4565     const Register len   = c_rarg2;  // length
4566     const Register table0 = c_rarg3; // crc_table address
4567     const Register table1 = c_rarg4;
4568     const Register table2 = c_rarg5;
4569     const Register table3 = c_rarg6;
4570     const Register tmp3 = c_rarg7;
4571 
4572     BLOCK_COMMENT("Entry:");
4573     __ enter(); // required for proper stackwalking of RuntimeStub frame
4574 
4575     __ kernel_crc32c(crc, buf, len,
4576               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4577 
4578     __ leave(); // required for proper stackwalking of RuntimeStub frame
4579     __ ret(lr);
4580 
4581     return start;
4582   }
4583 
4584   /***
4585    *  Arguments:
4586    *
4587    *  Inputs:
4588    *   c_rarg0   - int   adler
4589    *   c_rarg1   - byte* buff
4590    *   c_rarg2   - int   len
4591    *
4592    * Output:
4593    *   c_rarg0   - int adler result
4594    */
4595   address generate_updateBytesAdler32() {
4596     __ align(CodeEntryAlignment);
4597     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
4598     StubCodeMark mark(this, stub_id);
4599     address start = __ pc();
4600 
4601     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4602 
4603     // Aliases
4604     Register adler  = c_rarg0;
4605     Register s1     = c_rarg0;
4606     Register s2     = c_rarg3;
4607     Register buff   = c_rarg1;
4608     Register len    = c_rarg2;
4609     Register nmax  = r4;
4610     Register base  = r5;
4611     Register count = r6;
4612     Register temp0 = rscratch1;
4613     Register temp1 = rscratch2;
4614     FloatRegister vbytes = v0;
4615     FloatRegister vs1acc = v1;
4616     FloatRegister vs2acc = v2;
4617     FloatRegister vtable = v3;
4618 
4619     // Max number of bytes we can process before having to take the mod
4620     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4621     uint64_t BASE = 0xfff1;
4622     uint64_t NMAX = 0x15B0;
4623 
4624     __ mov(base, BASE);
4625     __ mov(nmax, NMAX);
4626 
4627     // Load accumulation coefficients for the upper 16 bits
4628     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4629     __ ld1(vtable, __ T16B, Address(temp0));
4630 
4631     // s1 is initialized to the lower 16 bits of adler
4632     // s2 is initialized to the upper 16 bits of adler
4633     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4634     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4635 
    // The pipelined loop needs at least 16 elements for one iteration.
    // It checks this itself, but it is more efficient to skip straight to the
    // cleanup loop for short inputs.
4638     __ cmp(len, (u1)16);
4639     __ br(Assembler::HS, L_nmax);
4640     __ cbz(len, L_combine);
4641 
4642     __ bind(L_simple_by1_loop);
4643     __ ldrb(temp0, Address(__ post(buff, 1)));
4644     __ add(s1, s1, temp0);
4645     __ add(s2, s2, s1);
4646     __ subs(len, len, 1);
4647     __ br(Assembler::HI, L_simple_by1_loop);
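    // A hedged note on the reductions below: BASE = 65521 and
    // 2^16 mod BASE = 15 (= 65536 - 65521), so x mod BASE can be partially
    // reduced as (x >> 16) * 15 + (x & 0xffff), which is what the
    // lsr/lsl/sub/add sequences compute. A final conditional subtract of
    // BASE (subs + csel) finishes the reduction once the value is known to
    // be below 2 * BASE.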
4648 
4649     // s1 = s1 % BASE
4650     __ subs(temp0, s1, base);
4651     __ csel(s1, temp0, s1, Assembler::HS);
4652 
4653     // s2 = s2 % BASE
4654     __ lsr(temp0, s2, 16);
4655     __ lsl(temp1, temp0, 4);
4656     __ sub(temp1, temp1, temp0);
4657     __ add(s2, temp1, s2, ext::uxth);
4658 
4659     __ subs(temp0, s2, base);
4660     __ csel(s2, temp0, s2, Assembler::HS);
4661 
4662     __ b(L_combine);
4663 
4664     __ bind(L_nmax);
4665     __ subs(len, len, nmax);
4666     __ sub(count, nmax, 16);
4667     __ br(Assembler::LO, L_by16);
4668 
4669     __ bind(L_nmax_loop);
4670 
4671     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4672                                       vbytes, vs1acc, vs2acc, vtable);
4673 
4674     __ subs(count, count, 16);
4675     __ br(Assembler::HS, L_nmax_loop);
4676 
4677     // s1 = s1 % BASE
4678     __ lsr(temp0, s1, 16);
4679     __ lsl(temp1, temp0, 4);
4680     __ sub(temp1, temp1, temp0);
4681     __ add(temp1, temp1, s1, ext::uxth);
4682 
4683     __ lsr(temp0, temp1, 16);
4684     __ lsl(s1, temp0, 4);
4685     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
4687 
4688     __ subs(temp0, s1, base);
4689     __ csel(s1, temp0, s1, Assembler::HS);
4690 
4691     // s2 = s2 % BASE
4692     __ lsr(temp0, s2, 16);
4693     __ lsl(temp1, temp0, 4);
4694     __ sub(temp1, temp1, temp0);
4695     __ add(temp1, temp1, s2, ext::uxth);
4696 
4697     __ lsr(temp0, temp1, 16);
4698     __ lsl(s2, temp0, 4);
4699     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
4701 
4702     __ subs(temp0, s2, base);
4703     __ csel(s2, temp0, s2, Assembler::HS);
4704 
4705     __ subs(len, len, nmax);
4706     __ sub(count, nmax, 16);
4707     __ br(Assembler::HS, L_nmax_loop);
4708 
4709     __ bind(L_by16);
4710     __ adds(len, len, count);
4711     __ br(Assembler::LO, L_by1);
4712 
4713     __ bind(L_by16_loop);
4714 
4715     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4716                                       vbytes, vs1acc, vs2acc, vtable);
4717 
4718     __ subs(len, len, 16);
4719     __ br(Assembler::HS, L_by16_loop);
4720 
4721     __ bind(L_by1);
4722     __ adds(len, len, 15);
4723     __ br(Assembler::LO, L_do_mod);
4724 
4725     __ bind(L_by1_loop);
4726     __ ldrb(temp0, Address(__ post(buff, 1)));
4727     __ add(s1, temp0, s1);
4728     __ add(s2, s2, s1);
4729     __ subs(len, len, 1);
4730     __ br(Assembler::HS, L_by1_loop);
4731 
4732     __ bind(L_do_mod);
4733     // s1 = s1 % BASE
4734     __ lsr(temp0, s1, 16);
4735     __ lsl(temp1, temp0, 4);
4736     __ sub(temp1, temp1, temp0);
4737     __ add(temp1, temp1, s1, ext::uxth);
4738 
4739     __ lsr(temp0, temp1, 16);
4740     __ lsl(s1, temp0, 4);
4741     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
4743 
4744     __ subs(temp0, s1, base);
4745     __ csel(s1, temp0, s1, Assembler::HS);
4746 
4747     // s2 = s2 % BASE
4748     __ lsr(temp0, s2, 16);
4749     __ lsl(temp1, temp0, 4);
4750     __ sub(temp1, temp1, temp0);
4751     __ add(temp1, temp1, s2, ext::uxth);
4752 
4753     __ lsr(temp0, temp1, 16);
4754     __ lsl(s2, temp0, 4);
4755     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
4757 
4758     __ subs(temp0, s2, base);
4759     __ csel(s2, temp0, s2, Assembler::HS);
4760 
4761     // Combine lower bits and higher bits
4762     __ bind(L_combine);
4763     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4764 
4765     __ ret(lr);
4766 
4767     return start;
4768   }
4769 
4770   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4771           Register temp0, Register temp1, FloatRegister vbytes,
4772           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4773     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4774     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4775     // In non-vectorized code, we update s1 and s2 as:
4776     //   s1 <- s1 + b1
4777     //   s2 <- s2 + s1
4778     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
4780     //   ...
4781     //   s1 <- s1 + b16
4782     //   s2 <- s2 + s1
4783     // Putting above assignments together, we have:
4784     //   s1_new = s1 + b1 + b2 + ... + b16
4785     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4786     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4787     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
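    //
    // Equivalently, a hedged C-style sketch of the scalar update applied to
    // each 16-byte chunk b:
    //
    //   for (int i = 0; i < 16; i++) { s1 += b[i]; s2 += s1; }
    //
    // The vector code below computes the same result in one pass: a
    // horizontal byte sum for s1, and 16 * s1_old plus a dot product with
    // (16, 15, ..., 1) (held in vtable) for s2.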
4788     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4789 
4790     // s2 = s2 + s1 * 16
4791     __ add(s2, s2, s1, Assembler::LSL, 4);
4792 
4793     // vs1acc = b1 + b2 + b3 + ... + b16
4794     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4795     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4796     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4797     __ uaddlv(vs1acc, __ T16B, vbytes);
4798     __ uaddlv(vs2acc, __ T8H, vs2acc);
4799 
4800     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4801     __ fmovd(temp0, vs1acc);
4802     __ fmovd(temp1, vs2acc);
4803     __ add(s1, s1, temp0);
4804     __ add(s2, s2, temp1);
4805   }
4806 
4807   /**
4808    *  Arguments:
4809    *
4810    *  Input:
4811    *    c_rarg0   - x address
4812    *    c_rarg1   - x length
4813    *    c_rarg2   - y address
4814    *    c_rarg3   - y length
4815    *    c_rarg4   - z address
4816    */
4817   address generate_multiplyToLen() {
4818     __ align(CodeEntryAlignment);
4819     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
4820     StubCodeMark mark(this, stub_id);
4821 
4822     address start = __ pc();
4823  
4824     if (SCCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) {
4825       return start;
4826     }
4827     const Register x     = r0;
4828     const Register xlen  = r1;
4829     const Register y     = r2;
4830     const Register ylen  = r3;
4831     const Register z     = r4;
4832 
4833     const Register tmp0  = r5;
4834     const Register tmp1  = r10;
4835     const Register tmp2  = r11;
4836     const Register tmp3  = r12;
4837     const Register tmp4  = r13;
4838     const Register tmp5  = r14;
4839     const Register tmp6  = r15;
4840     const Register tmp7  = r16;
4841 
4842     BLOCK_COMMENT("Entry:");
4843     __ enter(); // required for proper stackwalking of RuntimeStub frame
4844     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4845     __ leave(); // required for proper stackwalking of RuntimeStub frame
4846     __ ret(lr);
4847 
4848     SCCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start);
4849     return start;
4850   }
4851 
4852   address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127, described in Java code,
    // works faster than multiply_to_len on some CPUs and slower on others,
    // but multiply_to_len shows slightly better results overall.
4856     __ align(CodeEntryAlignment);
4857     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
4858     StubCodeMark mark(this, stub_id);
4859     address start = __ pc();
4860 
4861     if (SCCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) {
4862       return start;
4863     }
4864     const Register x     = r0;
4865     const Register xlen  = r1;
4866     const Register z     = r2;
4867     const Register y     = r4; // == x
4868     const Register ylen  = r5; // == xlen
4869 
4870     const Register tmp0  = r3;
4871     const Register tmp1  = r10;
4872     const Register tmp2  = r11;
4873     const Register tmp3  = r12;
4874     const Register tmp4  = r13;
4875     const Register tmp5  = r14;
4876     const Register tmp6  = r15;
4877     const Register tmp7  = r16;
4878 
4879     RegSet spilled_regs = RegSet::of(y, ylen);
4880     BLOCK_COMMENT("Entry:");
4881     __ enter();
4882     __ push(spilled_regs, sp);
4883     __ mov(y, x);
4884     __ mov(ylen, xlen);
4885     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4886     __ pop(spilled_regs, sp);
4887     __ leave();
4888     __ ret(lr);
4889 
4890     SCCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start);
4891     return start;
4892   }
4893 
4894   address generate_mulAdd() {
4895     __ align(CodeEntryAlignment);
4896     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
4897     StubCodeMark mark(this, stub_id);
4898 
4899     address start = __ pc();
4900 
4901     if (SCCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) {
4902       return start;
4903     }
4904     const Register out     = r0;
4905     const Register in      = r1;
4906     const Register offset  = r2;
4907     const Register len     = r3;
4908     const Register k       = r4;
4909 
4910     BLOCK_COMMENT("Entry:");
4911     __ enter();
4912     __ mul_add(out, in, offset, len, k);
4913     __ leave();
4914     __ ret(lr);
4915 
4916     SCCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start);
4917     return start;
4918   }
4919 
4920   // Arguments:
4921   //
4922   // Input:
4923   //   c_rarg0   - newArr address
4924   //   c_rarg1   - oldArr address
4925   //   c_rarg2   - newIdx
4926   //   c_rarg3   - shiftCount
4927   //   c_rarg4   - numIter
4928   //
4929   address generate_bigIntegerRightShift() {
4930     __ align(CodeEntryAlignment);
4931     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
4932     StubCodeMark mark(this, stub_id);
4933     address start = __ pc();
4934 
4935     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4936 
4937     Register newArr        = c_rarg0;
4938     Register oldArr        = c_rarg1;
4939     Register newIdx        = c_rarg2;
4940     Register shiftCount    = c_rarg3;
4941     Register numIter       = c_rarg4;
4942     Register idx           = numIter;
4943 
4944     Register newArrCur     = rscratch1;
4945     Register shiftRevCount = rscratch2;
4946     Register oldArrCur     = r13;
4947     Register oldArrNext    = r14;
4948 
4949     FloatRegister oldElem0        = v0;
4950     FloatRegister oldElem1        = v1;
4951     FloatRegister newElem         = v2;
4952     FloatRegister shiftVCount     = v3;
4953     FloatRegister shiftVRevCount  = v4;
4954 
4955     __ cbz(idx, Exit);
4956 
4957     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4958 
4959     // left shift count
4960     __ movw(shiftRevCount, 32);
4961     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4962 
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
4964     __ cmp(numIter, (u1)4);
4965     __ br(Assembler::LT, ShiftThree);
4966 
4967     __ dup(shiftVCount,    __ T4S, shiftCount);
4968     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4969     __ negr(shiftVCount,   __ T4S, shiftVCount);
4970 
4971     __ BIND(ShiftSIMDLoop);
4972 
4973     // Calculate the load addresses
4974     __ sub(idx, idx, 4);
4975     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4976     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4977     __ add(oldArrCur,  oldArrNext, 4);
4978 
4979     // Load 4 words and process
4980     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4981     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4982     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4983     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4984     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4985     __ st1(newElem,   __ T4S,  Address(newArrCur));
4986 
4987     __ cmp(idx, (u1)4);
4988     __ br(Assembler::LT, ShiftTwoLoop);
4989     __ b(ShiftSIMDLoop);
4990 
4991     __ BIND(ShiftTwoLoop);
4992     __ cbz(idx, Exit);
4993     __ cmp(idx, (u1)1);
4994     __ br(Assembler::EQ, ShiftOne);
4995 
4996     // Calculate the load addresses
4997     __ sub(idx, idx, 2);
4998     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4999     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
5000     __ add(oldArrCur,  oldArrNext, 4);
5001 
5002     // Load 2 words and process
5003     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
5004     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
5005     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
5006     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
5007     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
5008     __ st1(newElem,   __ T2S, Address(newArrCur));
5009     __ b(ShiftTwoLoop);
5010 
5011     __ BIND(ShiftThree);
5012     __ tbz(idx, 1, ShiftOne);
5013     __ tbz(idx, 0, ShiftTwo);
5014     __ ldrw(r10,  Address(oldArr, 12));
5015     __ ldrw(r11,  Address(oldArr, 8));
5016     __ lsrvw(r10, r10, shiftCount);
5017     __ lslvw(r11, r11, shiftRevCount);
5018     __ orrw(r12,  r10, r11);
5019     __ strw(r12,  Address(newArr, 8));
5020 
5021     __ BIND(ShiftTwo);
5022     __ ldrw(r10,  Address(oldArr, 8));
5023     __ ldrw(r11,  Address(oldArr, 4));
5024     __ lsrvw(r10, r10, shiftCount);
5025     __ lslvw(r11, r11, shiftRevCount);
5026     __ orrw(r12,  r10, r11);
5027     __ strw(r12,  Address(newArr, 4));
5028 
5029     __ BIND(ShiftOne);
5030     __ ldrw(r10,  Address(oldArr, 4));
5031     __ ldrw(r11,  Address(oldArr));
5032     __ lsrvw(r10, r10, shiftCount);
5033     __ lslvw(r11, r11, shiftRevCount);
5034     __ orrw(r12,  r10, r11);
5035     __ strw(r12,  Address(newArr));
5036 
5037     __ BIND(Exit);
5038     __ ret(lr);
5039 
5040     return start;
5041   }
5042 
5043   // Arguments:
5044   //
5045   // Input:
5046   //   c_rarg0   - newArr address
5047   //   c_rarg1   - oldArr address
5048   //   c_rarg2   - newIdx
5049   //   c_rarg3   - shiftCount
5050   //   c_rarg4   - numIter
5051   //
5052   address generate_bigIntegerLeftShift() {
5053     __ align(CodeEntryAlignment);
5054     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
5055     StubCodeMark mark(this, stub_id);
5056     address start = __ pc();
5057 
5058     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
5059 
5060     Register newArr        = c_rarg0;
5061     Register oldArr        = c_rarg1;
5062     Register newIdx        = c_rarg2;
5063     Register shiftCount    = c_rarg3;
5064     Register numIter       = c_rarg4;
5065 
5066     Register shiftRevCount = rscratch1;
5067     Register oldArrNext    = rscratch2;
5068 
5069     FloatRegister oldElem0        = v0;
5070     FloatRegister oldElem1        = v1;
5071     FloatRegister newElem         = v2;
5072     FloatRegister shiftVCount     = v3;
5073     FloatRegister shiftVRevCount  = v4;
5074 
5075     __ cbz(numIter, Exit);
5076 
5077     __ add(oldArrNext, oldArr, 4);
5078     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
5079 
5080     // right shift count
5081     __ movw(shiftRevCount, 32);
5082     __ subw(shiftRevCount, shiftRevCount, shiftCount);
5083 
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
5085     __ cmp(numIter, (u1)4);
5086     __ br(Assembler::LT, ShiftThree);
5087 
5088     __ dup(shiftVCount,     __ T4S, shiftCount);
5089     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
5090     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
5091 
5092     __ BIND(ShiftSIMDLoop);
5093 
5094     // load 4 words and process
5095     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
5096     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
5097     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
5098     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
5099     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
5100     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
5101     __ sub(numIter,   numIter, 4);
5102 
5103     __ cmp(numIter, (u1)4);
5104     __ br(Assembler::LT, ShiftTwoLoop);
5105     __ b(ShiftSIMDLoop);
5106 
5107     __ BIND(ShiftTwoLoop);
5108     __ cbz(numIter, Exit);
5109     __ cmp(numIter, (u1)1);
5110     __ br(Assembler::EQ, ShiftOne);
5111 
5112     // load 2 words and process
5113     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
5114     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
5115     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
5116     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
5117     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
5118     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
5119     __ sub(numIter,   numIter, 2);
5120     __ b(ShiftTwoLoop);
5121 
5122     __ BIND(ShiftThree);
5123     __ ldrw(r10,  __ post(oldArr, 4));
5124     __ ldrw(r11,  __ post(oldArrNext, 4));
5125     __ lslvw(r10, r10, shiftCount);
5126     __ lsrvw(r11, r11, shiftRevCount);
5127     __ orrw(r12,  r10, r11);
5128     __ strw(r12,  __ post(newArr, 4));
5129     __ tbz(numIter, 1, Exit);
5130     __ tbz(numIter, 0, ShiftOne);
5131 
5132     __ BIND(ShiftTwo);
5133     __ ldrw(r10,  __ post(oldArr, 4));
5134     __ ldrw(r11,  __ post(oldArrNext, 4));
5135     __ lslvw(r10, r10, shiftCount);
5136     __ lsrvw(r11, r11, shiftRevCount);
5137     __ orrw(r12,  r10, r11);
5138     __ strw(r12,  __ post(newArr, 4));
5139 
5140     __ BIND(ShiftOne);
5141     __ ldrw(r10,  Address(oldArr));
5142     __ ldrw(r11,  Address(oldArrNext));
5143     __ lslvw(r10, r10, shiftCount);
5144     __ lsrvw(r11, r11, shiftRevCount);
5145     __ orrw(r12,  r10, r11);
5146     __ strw(r12,  Address(newArr));
5147 
5148     __ BIND(Exit);
5149     __ ret(lr);
5150 
5151     return start;
5152   }
5153 
5154   address generate_count_positives(address &count_positives_long) {
5155     const u1 large_loop_size = 64;
5156     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5157     int dcache_line = VM_Version::dcache_line_size();
5158 
5159     Register ary1 = r1, len = r2, result = r0;
5160 
5161     __ align(CodeEntryAlignment);
5162 
5163     StubGenStubId stub_id = StubGenStubId::count_positives_id;
5164     StubCodeMark mark(this, stub_id);
5165 
5166     address entry = __ pc();
5167 
5168     __ enter();
5169     // precondition: a copy of len is already in result
5170     // __ mov(result, len);
5171 
5172   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
5173         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
5174 
5175   __ cmp(len, (u1)15);
5176   __ br(Assembler::GT, LEN_OVER_15);
  // The only case when execution falls into this code is when the pointer is
  // near the end of a memory page and we have to avoid reading the next page
5179   __ add(ary1, ary1, len);
5180   __ subs(len, len, 8);
5181   __ br(Assembler::GT, LEN_OVER_8);
5182   __ ldr(rscratch2, Address(ary1, -8));
5183   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
5184   __ lsrv(rscratch2, rscratch2, rscratch1);
5185   __ tst(rscratch2, UPPER_BIT_MASK);
5186   __ csel(result, zr, result, Assembler::NE);
5187   __ leave();
5188   __ ret(lr);
5189   __ bind(LEN_OVER_8);
5190   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
5192   __ tst(rscratch2, UPPER_BIT_MASK);
5193   __ br(Assembler::NE, RET_NO_POP);
5194   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
5195   __ lsrv(rscratch1, rscratch1, rscratch2);
5196   __ tst(rscratch1, UPPER_BIT_MASK);
5197   __ bind(RET_NO_POP);
5198   __ csel(result, zr, result, Assembler::NE);
5199   __ leave();
5200   __ ret(lr);
5201 
5202   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
5203   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
5204 
5205   count_positives_long = __ pc(); // 2nd entry point
5206 
5207   __ enter();
5208 
5209   __ bind(LEN_OVER_15);
5210     __ push(spilled_regs, sp);
5211     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
5212     __ cbz(rscratch2, ALIGNED);
5213     __ ldp(tmp6, tmp1, Address(ary1));
5214     __ mov(tmp5, 16);
5215     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
5216     __ add(ary1, ary1, rscratch1);
5217     __ orr(tmp6, tmp6, tmp1);
5218     __ tst(tmp6, UPPER_BIT_MASK);
5219     __ br(Assembler::NE, RET_ADJUST);
5220     __ sub(len, len, rscratch1);
5221 
5222   __ bind(ALIGNED);
5223     __ cmp(len, large_loop_size);
5224     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load in the pre-loop as an early-return check: when an
    // initially aligned large array has negative values in its leading bytes,
    // LARGE_LOOP would otherwise perform 4 loads instead of 1 in the worst
    // case, which is slower. Cases with negative bytes further ahead are not
    // affected much; in fact they get slightly faster due to the early loads
    // and the fewer instructions and branches in LARGE_LOOP.
5231     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5232     __ sub(len, len, 16);
5233     __ orr(tmp6, tmp6, tmp1);
5234     __ tst(tmp6, UPPER_BIT_MASK);
5235     __ br(Assembler::NE, RET_ADJUST_16);
5236     __ cmp(len, large_loop_size);
5237     __ br(Assembler::LT, CHECK_16);
5238 
5239     if (SoftwarePrefetchHintDistance >= 0
5240         && SoftwarePrefetchHintDistance >= dcache_line) {
5241       // initial prefetch
5242       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5243     }
5244   __ bind(LARGE_LOOP);
5245     if (SoftwarePrefetchHintDistance >= 0) {
5246       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5247     }
    // Issue the load instructions first, since this can save a few CPU/memory
    // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
    // (one per ldp), generate 7 orr(...) + 1 andr(...) + 1 cbnz(...), which
    // saves 3 instructions per iteration and has fewer branches. The downside
    // is that this disables the early return, so all 64 bytes are loaded and
    // checked every time.
5253     __ ldp(tmp2, tmp3, Address(ary1));
5254     __ ldp(tmp4, tmp5, Address(ary1, 16));
5255     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5256     __ ldp(tmp6, tmp1, Address(ary1, 48));
5257     __ add(ary1, ary1, large_loop_size);
5258     __ sub(len, len, large_loop_size);
5259     __ orr(tmp2, tmp2, tmp3);
5260     __ orr(tmp4, tmp4, tmp5);
5261     __ orr(rscratch1, rscratch1, rscratch2);
5262     __ orr(tmp6, tmp6, tmp1);
5263     __ orr(tmp2, tmp2, tmp4);
5264     __ orr(rscratch1, rscratch1, tmp6);
5265     __ orr(tmp2, tmp2, rscratch1);
5266     __ tst(tmp2, UPPER_BIT_MASK);
5267     __ br(Assembler::NE, RET_ADJUST_LONG);
5268     __ cmp(len, large_loop_size);
5269     __ br(Assembler::GE, LARGE_LOOP);
5270 
5271   __ bind(CHECK_16); // small 16-byte load pre-loop
5272     __ cmp(len, (u1)16);
5273     __ br(Assembler::LT, POST_LOOP16);
5274 
5275   __ bind(LOOP16); // small 16-byte load loop
5276     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5277     __ sub(len, len, 16);
5278     __ orr(tmp2, tmp2, tmp3);
5279     __ tst(tmp2, UPPER_BIT_MASK);
5280     __ br(Assembler::NE, RET_ADJUST_16);
5281     __ cmp(len, (u1)16);
5282     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5283 
5284   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5285     __ cmp(len, (u1)8);
5286     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5287     __ ldr(tmp3, Address(__ post(ary1, 8)));
5288     __ tst(tmp3, UPPER_BIT_MASK);
5289     __ br(Assembler::NE, RET_ADJUST);
5290     __ sub(len, len, 8);
5291 
5292   __ bind(POST_LOOP16_LOAD_TAIL);
5293     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5294     __ ldr(tmp1, Address(ary1));
5295     __ mov(tmp2, 64);
5296     __ sub(tmp4, tmp2, len, __ LSL, 3);
5297     __ lslv(tmp1, tmp1, tmp4);
5298     __ tst(tmp1, UPPER_BIT_MASK);
5299     __ br(Assembler::NE, RET_ADJUST);
5300     // Fallthrough
5301 
5302   __ bind(RET_LEN);
5303     __ pop(spilled_regs, sp);
5304     __ leave();
5305     __ ret(lr);
5306 
    // The difference result - len is the count of bytes guaranteed to be
    // positive
5309 
5310   __ bind(RET_ADJUST_LONG);
5311     __ add(len, len, (u1)(large_loop_size - 16));
5312   __ bind(RET_ADJUST_16);
5313     __ add(len, len, 16);
5314   __ bind(RET_ADJUST);
5315     __ pop(spilled_regs, sp);
5316     __ leave();
5317     __ sub(result, result, len);
5318     __ ret(lr);
5319 
5320     return entry;
5321   }
5322 
5323   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5324         bool usePrefetch, Label &NOT_EQUAL) {
5325     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5326         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5327         tmp7 = r12, tmp8 = r13;
5328     Label LOOP;
5329 
5330     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5331     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5332     __ bind(LOOP);
5333     if (usePrefetch) {
5334       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5335       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5336     }
5337     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5338     __ eor(tmp1, tmp1, tmp2);
5339     __ eor(tmp3, tmp3, tmp4);
5340     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5341     __ orr(tmp1, tmp1, tmp3);
5342     __ cbnz(tmp1, NOT_EQUAL);
5343     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5344     __ eor(tmp5, tmp5, tmp6);
5345     __ eor(tmp7, tmp7, tmp8);
5346     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5347     __ orr(tmp5, tmp5, tmp7);
5348     __ cbnz(tmp5, NOT_EQUAL);
5349     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5350     __ eor(tmp1, tmp1, tmp2);
5351     __ eor(tmp3, tmp3, tmp4);
5352     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5353     __ orr(tmp1, tmp1, tmp3);
5354     __ cbnz(tmp1, NOT_EQUAL);
5355     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5356     __ eor(tmp5, tmp5, tmp6);
5357     __ sub(cnt1, cnt1, 8 * wordSize);
5358     __ eor(tmp7, tmp7, tmp8);
5359     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
5362     __ subs(tmp6, cnt1, loopThreshold);
5363     __ orr(tmp5, tmp5, tmp7);
5364     __ cbnz(tmp5, NOT_EQUAL);
5365     __ br(__ GE, LOOP);
5366     // post-loop
5367     __ eor(tmp1, tmp1, tmp2);
5368     __ eor(tmp3, tmp3, tmp4);
5369     __ orr(tmp1, tmp1, tmp3);
5370     __ sub(cnt1, cnt1, 2 * wordSize);
5371     __ cbnz(tmp1, NOT_EQUAL);
5372   }
5373 
5374   void generate_large_array_equals_loop_simd(int loopThreshold,
5375         bool usePrefetch, Label &NOT_EQUAL) {
5376     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5377         tmp2 = rscratch2;
5378     Label LOOP;
5379 
5380     __ bind(LOOP);
5381     if (usePrefetch) {
5382       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5383       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5384     }
5385     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5386     __ sub(cnt1, cnt1, 8 * wordSize);
5387     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5388     __ subs(tmp1, cnt1, loopThreshold);
5389     __ eor(v0, __ T16B, v0, v4);
5390     __ eor(v1, __ T16B, v1, v5);
5391     __ eor(v2, __ T16B, v2, v6);
5392     __ eor(v3, __ T16B, v3, v7);
5393     __ orr(v0, __ T16B, v0, v1);
5394     __ orr(v1, __ T16B, v2, v3);
5395     __ orr(v0, __ T16B, v0, v1);
5396     __ umov(tmp1, v0, __ D, 0);
5397     __ umov(tmp2, v0, __ D, 1);
5398     __ orr(tmp1, tmp1, tmp2);
5399     __ cbnz(tmp1, NOT_EQUAL);
5400     __ br(__ GE, LOOP);
5401   }
5402 
5403   // a1 = r1 - array1 address
5404   // a2 = r2 - array2 address
5405   // result = r0 - return value. Already contains "false"
5406   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
5407   // r3-r5 are reserved temporary registers
5408   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
5409   address generate_large_array_equals() {
5410     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5411         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5412         tmp7 = r12, tmp8 = r13;
5413     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5414         SMALL_LOOP, POST_LOOP;
5415     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5416     // calculate if at least 32 prefetched bytes are used
5417     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5418     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5419     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5420     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5421         tmp5, tmp6, tmp7, tmp8);
5422 
5423     __ align(CodeEntryAlignment);
5424 
5425     StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
5426     StubCodeMark mark(this, stub_id);
5427 
5428     address entry = __ pc();
5429     __ enter();
5430     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5431     // also advance pointers to use post-increment instead of pre-increment
5432     __ add(a1, a1, wordSize);
5433     __ add(a2, a2, wordSize);
5434     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so perform an additional 8-byte
      // load, if needed, at least for the 1st address to make it 16-byte
      // aligned.
5440       Label ALIGNED16;
5441       __ tbz(a1, 3, ALIGNED16);
5442       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5443       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5444       __ sub(cnt1, cnt1, wordSize);
5445       __ eor(tmp1, tmp1, tmp2);
5446       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5447       __ bind(ALIGNED16);
5448     }
5449     if (UseSIMDForArrayEquals) {
5450       if (SoftwarePrefetchHintDistance >= 0) {
5451         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5452         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5453         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5454             /* prfm = */ true, NOT_EQUAL);
5455         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5456         __ br(__ LT, TAIL);
5457       }
5458       __ bind(NO_PREFETCH_LARGE_LOOP);
5459       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5460           /* prfm = */ false, NOT_EQUAL);
5461     } else {
5462       __ push(spilled_regs, sp);
5463       if (SoftwarePrefetchHintDistance >= 0) {
5464         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5465         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5466         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5467             /* prfm = */ true, NOT_EQUAL);
5468         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5469         __ br(__ LT, TAIL);
5470       }
5471       __ bind(NO_PREFETCH_LARGE_LOOP);
5472       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5473           /* prfm = */ false, NOT_EQUAL);
5474     }
5475     __ bind(TAIL);
5476       __ cbz(cnt1, EQUAL);
5477       __ subs(cnt1, cnt1, wordSize);
5478       __ br(__ LE, POST_LOOP);
5479     __ bind(SMALL_LOOP);
5480       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5481       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5482       __ subs(cnt1, cnt1, wordSize);
5483       __ eor(tmp1, tmp1, tmp2);
5484       __ cbnz(tmp1, NOT_EQUAL);
5485       __ br(__ GT, SMALL_LOOP);
5486     __ bind(POST_LOOP);
5487       __ ldr(tmp1, Address(a1, cnt1));
5488       __ ldr(tmp2, Address(a2, cnt1));
5489       __ eor(tmp1, tmp1, tmp2);
5490       __ cbnz(tmp1, NOT_EQUAL);
5491     __ bind(EQUAL);
5492       __ mov(result, true);
5493     __ bind(NOT_EQUAL);
5494       if (!UseSIMDForArrayEquals) {
5495         __ pop(spilled_regs, sp);
5496       }
5497     __ bind(NOT_EQUAL_NO_POP);
5498     __ leave();
5499     __ ret(lr);
5500     return entry;
5501   }
5502 
5503   // result = r0 - return value. Contains initial hashcode value on entry.
5504   // ary = r1 - array address
5505   // cnt = r2 - elements count
5506   // Clobbers: v0-v13, rscratch1, rscratch2
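  //
  // A hedged C-style sketch of the hash this stub computes: starting from the
  // incoming value in result, every element (widened to int according to
  // eltype; widen below is an assumed per-type conversion) is folded in with
  // multiplier 31:
  //
  //   int32_t h = result;
  //   for (int i = 0; i < cnt; i++) h = 31 * h + (int32_t)widen(ary[i]);
  //
  // The vectorized code below evaluates the same polynomial using the powers
  // of 31 kept in vpow and vpowm.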
5507   address generate_large_arrays_hashcode(BasicType eltype) {
5508     const Register result = r0, ary = r1, cnt = r2;
5509     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
5510     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
5511     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
5512     const FloatRegister vpowm = v13;
5513 
5514     ARRAYS_HASHCODE_REGISTERS;
5515 
5516     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
5517 
5518     unsigned int vf; // vectorization factor
5519     bool multiply_by_halves;
5520     Assembler::SIMD_Arrangement load_arrangement;
5521     switch (eltype) {
5522     case T_BOOLEAN:
5523     case T_BYTE:
5524       load_arrangement = Assembler::T8B;
5525       multiply_by_halves = true;
5526       vf = 8;
5527       break;
5528     case T_CHAR:
5529     case T_SHORT:
5530       load_arrangement = Assembler::T8H;
5531       multiply_by_halves = true;
5532       vf = 8;
5533       break;
5534     case T_INT:
5535       load_arrangement = Assembler::T4S;
5536       multiply_by_halves = false;
5537       vf = 4;
5538       break;
5539     default:
5540       ShouldNotReachHere();
5541     }
5542 
5543     // Unroll factor
5544     const unsigned uf = 4;
5545 
5546     // Effective vectorization factor
5547     const unsigned evf = vf * uf;
5548 
5549     __ align(CodeEntryAlignment);
5550 
5551     StubGenStubId stub_id;
5552     switch (eltype) {
5553     case T_BOOLEAN:
5554       stub_id = StubGenStubId::large_arrays_hashcode_boolean_id;
5555       break;
5556     case T_BYTE:
5557       stub_id = StubGenStubId::large_arrays_hashcode_byte_id;
5558       break;
5559     case T_CHAR:
5560       stub_id = StubGenStubId::large_arrays_hashcode_char_id;
5561       break;
5562     case T_SHORT:
5563       stub_id = StubGenStubId::large_arrays_hashcode_short_id;
5564       break;
5565     case T_INT:
5566       stub_id = StubGenStubId::large_arrays_hashcode_int_id;
5567       break;
5568     default:
5569       stub_id = StubGenStubId::NO_STUBID;
5570       ShouldNotReachHere();
5571     };
5572 
5573     StubCodeMark mark(this, stub_id);
5574 
5575     address entry = __ pc();
5576     __ enter();
5577 
    // Put the 0th..3rd powers of 31 together into a single SIMD register. The
    // register will be used in the SMALL and LARGE LOOPS' epilogues. The
    // initialization is hoisted here and the register's value must not change
    // throughout both loops.
5581     __ movw(rscratch1, intpow(31U, 3));
5582     __ mov(vpow, Assembler::S, 0, rscratch1);
5583     __ movw(rscratch1, intpow(31U, 2));
5584     __ mov(vpow, Assembler::S, 1, rscratch1);
5585     __ movw(rscratch1, intpow(31U, 1));
5586     __ mov(vpow, Assembler::S, 2, rscratch1);
5587     __ movw(rscratch1, intpow(31U, 0));
5588     __ mov(vpow, Assembler::S, 3, rscratch1);
5589 
5590     __ mov(vmul0, Assembler::T16B, 0);
5591     __ mov(vmul0, Assembler::S, 3, result);
5592 
5593     __ andr(rscratch2, cnt, (uf - 1) * vf);
5594     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
5595 
5596     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
5597     __ mov(vpowm, Assembler::S, 0, rscratch1);
5598 
5599     // SMALL LOOP
5600     __ bind(SMALL_LOOP);
5601 
5602     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
5603     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5604     __ subsw(rscratch2, rscratch2, vf);
5605 
5606     if (load_arrangement == Assembler::T8B) {
5607       // Extend 8B to 8H to be able to use vector multiply
5608       // instructions
5609       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
5610       if (is_signed_subword_type(eltype)) {
5611         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5612       } else {
5613         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5614       }
5615     }
5616 
5617     switch (load_arrangement) {
5618     case Assembler::T4S:
5619       __ addv(vmul0, load_arrangement, vmul0, vdata0);
5620       break;
5621     case Assembler::T8B:
5622     case Assembler::T8H:
5623       assert(is_subword_type(eltype), "subword type expected");
5624       if (is_signed_subword_type(eltype)) {
5625         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5626       } else {
5627         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5628       }
5629       break;
5630     default:
5631       __ should_not_reach_here();
5632     }
5633 
5634     // Process the upper half of a vector
5635     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
5636       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5637       if (is_signed_subword_type(eltype)) {
5638         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5639       } else {
5640         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5641       }
5642     }
5643 
5644     __ br(Assembler::HI, SMALL_LOOP);
5645 
    // SMALL LOOP'S EPILOGUE
5647     __ lsr(rscratch2, cnt, exact_log2(evf));
5648     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
5649 
5650     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
5651     __ addv(vmul0, Assembler::T4S, vmul0);
5652     __ umov(result, vmul0, Assembler::S, 0);
5653 
5654     // TAIL
5655     __ bind(TAIL);
5656 
    // The andr computes cnt % vf. The subtract, shifted left by 3, jumps past
    // vf - 1 - (cnt % vf) pairs of load + madd insns, i.e. only cnt % vf
    // load + madd pairs are executed.
5659     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
5660     __ andr(rscratch2, cnt, vf - 1);
5661     __ bind(TAIL_SHORTCUT);
5662     __ adr(rscratch1, BR_BASE);
5663     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
5664     __ movw(rscratch2, 0x1f);
5665     __ br(rscratch1);
5666 
5667     for (size_t i = 0; i < vf - 1; ++i) {
5668       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
5669                                    eltype);
5670       __ maddw(result, result, rscratch2, rscratch1);
5671     }
5672     __ bind(BR_BASE);
5673 
5674     __ leave();
5675     __ ret(lr);
5676 
5677     // LARGE LOOP
5678     __ bind(LARGE_LOOP_PREHEADER);
5679 
5680     __ lsr(rscratch2, cnt, exact_log2(evf));
5681 
5682     if (multiply_by_halves) {
5683       // 31^4 - multiplier between lower and upper parts of a register
5684       __ movw(rscratch1, intpow(31U, vf / 2));
5685       __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
5687       __ movw(rscratch1, intpow(31U, evf - vf / 2));
5688       __ mov(vpowm, Assembler::S, 0, rscratch1);
5689     } else {
5690       // 31^16
5691       __ movw(rscratch1, intpow(31U, evf));
5692       __ mov(vpowm, Assembler::S, 0, rscratch1);
5693     }
5694 
5695     __ mov(vmul3, Assembler::T16B, 0);
5696     __ mov(vmul2, Assembler::T16B, 0);
5697     __ mov(vmul1, Assembler::T16B, 0);
5698 
5699     __ bind(LARGE_LOOP);
5700 
5701     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
5702     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
5703     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
5704     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5705 
5706     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
5707            Address(__ post(ary, evf * type2aelembytes(eltype))));
5708 
5709     if (load_arrangement == Assembler::T8B) {
5710       // Extend 8B to 8H to be able to use vector multiply
5711       // instructions
5712       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
5713       if (is_signed_subword_type(eltype)) {
5714         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
5715         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
5716         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
5717         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5718       } else {
5719         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
5720         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
5721         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
5722         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5723       }
5724     }
5725 
5726     switch (load_arrangement) {
5727     case Assembler::T4S:
5728       __ addv(vmul3, load_arrangement, vmul3, vdata3);
5729       __ addv(vmul2, load_arrangement, vmul2, vdata2);
5730       __ addv(vmul1, load_arrangement, vmul1, vdata1);
5731       __ addv(vmul0, load_arrangement, vmul0, vdata0);
5732       break;
5733     case Assembler::T8B:
5734     case Assembler::T8H:
5735       assert(is_subword_type(eltype), "subword type expected");
5736       if (is_signed_subword_type(eltype)) {
5737         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
5738         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
5739         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
5740         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5741       } else {
5742         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
5743         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
5744         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
5745         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5746       }
5747       break;
5748     default:
5749       __ should_not_reach_here();
5750     }
5751 
5752     // Process the upper half of a vector
5753     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
5754       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
5755       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
5756       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
5757       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
5758       if (is_signed_subword_type(eltype)) {
5759         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
5760         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
5761         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
5762         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5763       } else {
5764         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
5765         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
5766         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
5767         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5768       }
5769     }
5770 
5771     __ subsw(rscratch2, rscratch2, 1);
5772     __ br(Assembler::HI, LARGE_LOOP);
5773 
5774     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
5775     __ addv(vmul3, Assembler::T4S, vmul3);
5776     __ umov(result, vmul3, Assembler::S, 0);
5777 
5778     __ mov(rscratch2, intpow(31U, vf));
5779 
5780     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
5781     __ addv(vmul2, Assembler::T4S, vmul2);
5782     __ umov(rscratch1, vmul2, Assembler::S, 0);
5783     __ maddw(result, result, rscratch2, rscratch1);
5784 
5785     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
5786     __ addv(vmul1, Assembler::T4S, vmul1);
5787     __ umov(rscratch1, vmul1, Assembler::S, 0);
5788     __ maddw(result, result, rscratch2, rscratch1);
5789 
5790     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
5791     __ addv(vmul0, Assembler::T4S, vmul0);
5792     __ umov(rscratch1, vmul0, Assembler::S, 0);
5793     __ maddw(result, result, rscratch2, rscratch1);
5794 
5795     __ andr(rscratch2, cnt, vf - 1);
5796     __ cbnz(rscratch2, TAIL_SHORTCUT);
5797 
5798     __ leave();
5799     __ ret(lr);
5800 
5801     return entry;
5802   }
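  // For reference, the scalar equivalent of the hash computed by the stub
  // above is the usual 31-based polynomial hash (a documentation-only
  // sketch; 'initial' denotes the incoming value of 'result'):
  //
  //   int h = initial;
  //   for (int i = 0; i < cnt; i++) {
  //     h = 31 * h + (int) ary[i];
  //   }
  //   return h;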
5803 
5804   address generate_dsin_dcos(bool isCos) {
5805     __ align(CodeEntryAlignment);
5806     StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id);
5807     StubCodeMark mark(this, stub_id);
5808     address start = __ pc();
5809     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5810         (address)StubRoutines::aarch64::_two_over_pi,
5811         (address)StubRoutines::aarch64::_pio2,
5812         (address)StubRoutines::aarch64::_dsin_coef,
5813         (address)StubRoutines::aarch64::_dcos_coef);
5814     return start;
5815   }
5816 
5817   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
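  // The 16 Latin1 bytes are widened to UTF-16 by interleaving them with the
  // zero register vtmpZ (zip1/zip2), and are compared against the UTF-16
  // string eight bytes (four characters) at a time; on a mismatch control
  // branches to DIFF1 or DIFF2, where the caller computes the difference.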
5818   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5819       Label &DIFF2) {
5820     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5821     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5822 
5823     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5824     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5825     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5826     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5827 
5828     __ fmovd(tmpL, vtmp3);
5829     __ eor(rscratch2, tmp3, tmpL);
5830     __ cbnz(rscratch2, DIFF2);
5831 
5832     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5833     __ umov(tmpL, vtmp3, __ D, 1);
5834     __ eor(rscratch2, tmpU, tmpL);
5835     __ cbnz(rscratch2, DIFF1);
5836 
5837     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5838     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5839     __ fmovd(tmpL, vtmp);
5840     __ eor(rscratch2, tmp3, tmpL);
5841     __ cbnz(rscratch2, DIFF2);
5842 
5843     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5844     __ umov(tmpL, vtmp, __ D, 1);
5845     __ eor(rscratch2, tmpU, tmpL);
5846     __ cbnz(rscratch2, DIFF1);
5847   }
5848 
5849   // r0  = result
5850   // r1  = str1
5851   // r2  = cnt1
5852   // r3  = str2
5853   // r4  = cnt2
5854   // r10 = tmp1
5855   // r11 = tmp2
5856   address generate_compare_long_string_different_encoding(bool isLU) {
5857     __ align(CodeEntryAlignment);
5858     StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id);
5859     StubCodeMark mark(this, stub_id);
5860     address entry = __ pc();
5861     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5862         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5863         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5864     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5865         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5866     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5867     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5868 
5869     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5870 
5871     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
5874     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5875     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5876     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5877     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5878     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5879     __ eor(rscratch2, tmp1, tmp2);
5880     __ mov(rscratch1, tmp2);
5881     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5882     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5883              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5884     __ push(spilled_regs, sp);
5885     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5886     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5887 
5888     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5889 
5890     if (SoftwarePrefetchHintDistance >= 0) {
5891       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5892       __ br(__ LT, NO_PREFETCH);
5893       __ bind(LARGE_LOOP_PREFETCH);
5894         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5895         __ mov(tmp4, 2);
5896         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5897         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5898           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5899           __ subs(tmp4, tmp4, 1);
5900           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5901           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5902           __ mov(tmp4, 2);
5903         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5904           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5905           __ subs(tmp4, tmp4, 1);
5906           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5907           __ sub(cnt2, cnt2, 64);
5908           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5909           __ br(__ GE, LARGE_LOOP_PREFETCH);
5910     }
5911     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5912     __ bind(NO_PREFETCH);
5913     __ subs(cnt2, cnt2, 16);
5914     __ br(__ LT, TAIL);
5915     __ align(OptoLoopAlignment);
5916     __ bind(SMALL_LOOP); // smaller loop
5917       __ subs(cnt2, cnt2, 16);
5918       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5919       __ br(__ GE, SMALL_LOOP);
5920       __ cmn(cnt2, (u1)16);
5921       __ br(__ EQ, LOAD_LAST);
5922     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5923       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5924       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5925       __ ldr(tmp3, Address(cnt1, -8));
5926       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5927       __ b(LOAD_LAST);
5928     __ bind(DIFF2);
5929       __ mov(tmpU, tmp3);
5930     __ bind(DIFF1);
5931       __ pop(spilled_regs, sp);
5932       __ b(CALCULATE_DIFFERENCE);
5933     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
      // No need to load them again.
5936       __ mov(tmpU, tmp3);
5937       __ pop(spilled_regs, sp);
5938 
5939       // tmp2 points to the address of the last 4 Latin1 characters right now
5940       __ ldrs(vtmp, Address(tmp2));
5941       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5942       __ fmovd(tmpL, vtmp);
5943 
5944       __ eor(rscratch2, tmpU, tmpL);
5945       __ cbz(rscratch2, DONE);
5946 
5947     // Find the first different characters in the longwords and
5948     // compute their difference.
5949     __ bind(CALCULATE_DIFFERENCE);
5950       __ rev(rscratch2, rscratch2);
5951       __ clz(rscratch2, rscratch2);
5952       __ andr(rscratch2, rscratch2, -16);
5953       __ lsrv(tmp1, tmp1, rscratch2);
5954       __ uxthw(tmp1, tmp1);
5955       __ lsrv(rscratch1, rscratch1, rscratch2);
5956       __ uxthw(rscratch1, rscratch1);
5957       __ subw(result, tmp1, rscratch1);
5958     __ bind(DONE);
5959       __ ret(lr);
5960     return entry;
5961   }
5962 
5963   // r0 = input (float16)
5964   // v0 = result (float)
5965   // v1 = temporary float register
5966   address generate_float16ToFloat() {
5967     __ align(CodeEntryAlignment);
5968     StubGenStubId stub_id = StubGenStubId::hf2f_id;
5969     StubCodeMark mark(this, stub_id);
5970     address entry = __ pc();
5971     BLOCK_COMMENT("Entry:");
5972     __ flt16_to_flt(v0, r0, v1);
5973     __ ret(lr);
5974     return entry;
5975   }
5976 
5977   // v0 = input (float)
5978   // r0 = result (float16)
5979   // v1 = temporary float register
5980   address generate_floatToFloat16() {
5981     __ align(CodeEntryAlignment);
5982     StubGenStubId stub_id = StubGenStubId::f2hf_id;
5983     StubCodeMark mark(this, stub_id);
5984     address entry = __ pc();
5985     BLOCK_COMMENT("Entry:");
5986     __ flt_to_flt16(r0, v0, v1);
5987     __ ret(lr);
5988     return entry;
5989   }
5990 
5991   address generate_method_entry_barrier() {
5992     __ align(CodeEntryAlignment);
5993     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
5994     StubCodeMark mark(this, stub_id);
5995 
5996     Label deoptimize_label;
5997 
5998     address start = __ pc();
5999 
6000     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
6001 
6002     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
6003       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
      // We can get here despite the nmethod being good if we have not
      // yet applied our cross-modification fence (or data fence).
6006       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
6007       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
6008       __ ldrw(rscratch2, rscratch2);
6009       __ strw(rscratch2, thread_epoch_addr);
6010       __ isb();
6011       __ membar(__ LoadLoad);
6012     }
6013 
6014     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
6015 
6016     __ enter();
6017     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
6018 
6019     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
6020 
6021     __ push_call_clobbered_registers();
6022 
6023     __ mov(c_rarg0, rscratch2);
6024     __ call_VM_leaf
6025          (CAST_FROM_FN_PTR
6026           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
6027 
6028     __ reset_last_Java_frame(true);
6029 
6030     __ mov(rscratch1, r0);
6031 
6032     __ pop_call_clobbered_registers();
6033 
6034     __ cbnz(rscratch1, deoptimize_label);
6035 
6036     __ leave();
6037     __ ret(lr);
6038 
6039     __ BIND(deoptimize_label);
6040 
6041     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
6042     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
6043 
6044     __ mov(sp, rscratch1);
6045     __ br(rscratch2);
6046 
6047     return start;
6048   }
6049 
6050   // r0  = result
6051   // r1  = str1
6052   // r2  = cnt1
6053   // r3  = str2
6054   // r4  = cnt2
6055   // r10 = tmp1
6056   // r11 = tmp2
6057   address generate_compare_long_string_same_encoding(bool isLL) {
6058     __ align(CodeEntryAlignment);
6059     StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id);
6060     StubCodeMark mark(this, stub_id);
6061     address entry = __ pc();
6062     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
6063         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
6064 
6065     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
6066 
    // Exit the large loop when there are fewer than 64 bytes left to read or
    // when we are about to prefetch memory beyond the array border.
6069     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
6070 
    // The caller has already pre-loaded 8 bytes before jumping to this stub, so compare them directly.
6072     __ eor(rscratch2, tmp1, tmp2);
6073     __ cbnz(rscratch2, CAL_DIFFERENCE);
6074 
6075     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
6076     // update pointers, because of previous read
6077     __ add(str1, str1, wordSize);
6078     __ add(str2, str2, wordSize);
6079     if (SoftwarePrefetchHintDistance >= 0) {
6080       __ align(OptoLoopAlignment);
6081       __ bind(LARGE_LOOP_PREFETCH);
6082         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
6083         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
6084 
6085         for (int i = 0; i < 4; i++) {
6086           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
6087           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
6088           __ cmp(tmp1, tmp2);
6089           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
6090           __ br(Assembler::NE, DIFF);
6091         }
6092         __ sub(cnt2, cnt2, isLL ? 64 : 32);
6093         __ add(str1, str1, 64);
6094         __ add(str2, str2, 64);
6095         __ subs(rscratch2, cnt2, largeLoopExitCondition);
6096         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
6097         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
6098     }
6099 
6100     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
6101     __ br(Assembler::LE, LESS16);
6102     __ align(OptoLoopAlignment);
6103     __ bind(LOOP_COMPARE16);
6104       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
6105       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
6106       __ cmp(tmp1, tmp2);
6107       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
6108       __ br(Assembler::NE, DIFF);
6109       __ sub(cnt2, cnt2, isLL ? 16 : 8);
6110       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
6111       __ br(Assembler::LT, LESS16);
6112 
6113       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
6114       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
6115       __ cmp(tmp1, tmp2);
6116       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
6117       __ br(Assembler::NE, DIFF);
6118       __ sub(cnt2, cnt2, isLL ? 16 : 8);
6119       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
6120       __ br(Assembler::GE, LOOP_COMPARE16);
6121       __ cbz(cnt2, LENGTH_DIFF);
6122 
6123     __ bind(LESS16);
6124       // each 8 compare
6125       __ subs(cnt2, cnt2, isLL ? 8 : 4);
6126       __ br(Assembler::LE, LESS8);
6127       __ ldr(tmp1, Address(__ post(str1, 8)));
6128       __ ldr(tmp2, Address(__ post(str2, 8)));
6129       __ eor(rscratch2, tmp1, tmp2);
6130       __ cbnz(rscratch2, CAL_DIFFERENCE);
6131       __ sub(cnt2, cnt2, isLL ? 8 : 4);
6132 
6133     __ bind(LESS8); // directly load last 8 bytes
6134       if (!isLL) {
6135         __ add(cnt2, cnt2, cnt2);
6136       }
6137       __ ldr(tmp1, Address(str1, cnt2));
6138       __ ldr(tmp2, Address(str2, cnt2));
6139       __ eor(rscratch2, tmp1, tmp2);
6140       __ cbz(rscratch2, LENGTH_DIFF);
6141       __ b(CAL_DIFFERENCE);
6142 
6143     __ bind(DIFF);
6144       __ cmp(tmp1, tmp2);
6145       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
6146       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
6147       // reuse rscratch2 register for the result of eor instruction
6148       __ eor(rscratch2, tmp1, tmp2);
6149 
6150     __ bind(CAL_DIFFERENCE);
6151       __ rev(rscratch2, rscratch2);
6152       __ clz(rscratch2, rscratch2);
6153       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
6154       __ lsrv(tmp1, tmp1, rscratch2);
6155       __ lsrv(tmp2, tmp2, rscratch2);
6156       if (isLL) {
6157         __ uxtbw(tmp1, tmp1);
6158         __ uxtbw(tmp2, tmp2);
6159       } else {
6160         __ uxthw(tmp1, tmp1);
6161         __ uxthw(tmp2, tmp2);
6162       }
6163       __ subw(result, tmp1, tmp2);
6164 
6165     __ bind(LENGTH_DIFF);
6166       __ ret(lr);
6167     return entry;
6168   }
6169 
6170   enum string_compare_mode {
6171     LL,
6172     LU,
6173     UL,
6174     UU,
6175   };
6176 
6177   // The following registers are declared in aarch64.ad
6178   // r0  = result
6179   // r1  = str1
6180   // r2  = cnt1
6181   // r3  = str2
6182   // r4  = cnt2
6183   // r10 = tmp1
6184   // r11 = tmp2
6185   // z0  = ztmp1
6186   // z1  = ztmp2
6187   // p0  = pgtmp1
6188   // p1  = pgtmp2
6189   address generate_compare_long_string_sve(string_compare_mode mode) {
6190     StubGenStubId stub_id;
6191     switch (mode) {
6192       case LL: stub_id = StubGenStubId::compare_long_string_LL_id;  break;
6193       case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break;
6194       case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break;
6195       case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break;
6196       default: ShouldNotReachHere();
6197     }
6198 
6199     __ align(CodeEntryAlignment);
6200     address entry = __ pc();
6201     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
6202              tmp1 = r10, tmp2 = r11;
6203 
6204     Label LOOP, DONE, MISMATCH;
6205     Register vec_len = tmp1;
6206     Register idx = tmp2;
6207     // The minimum of the string lengths has been stored in cnt2.
6208     Register cnt = cnt2;
6209     FloatRegister ztmp1 = z0, ztmp2 = z1;
6210     PRegister pgtmp1 = p0, pgtmp2 = p1;
6211 
6212 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
6213     switch (mode) {                                                            \
6214       case LL:                                                                 \
6215         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
6216         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
6217         break;                                                                 \
6218       case LU:                                                                 \
6219         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
6220         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
6221         break;                                                                 \
6222       case UL:                                                                 \
6223         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
6224         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
6225         break;                                                                 \
6226       case UU:                                                                 \
6227         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
6228         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
6229         break;                                                                 \
6230       default:                                                                 \
6231         ShouldNotReachHere();                                                  \
6232     }
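    // Loop strategy (a sketch): a whilelt-governed predicate covers one full
    // vector per iteration, with a final whilelt covering the partial tail.
    // A mismatch predicate produced by sve_cmp is cropped with sve_brkb and
    // the first differing characters are extracted with sve_lasta.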
6233 
6234     StubCodeMark mark(this, stub_id);
6235 
6236     __ mov(idx, 0);
6237     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
6238 
6239     if (mode == LL) {
6240       __ sve_cntb(vec_len);
6241     } else {
6242       __ sve_cnth(vec_len);
6243     }
6244 
6245     __ sub(rscratch1, cnt, vec_len);
6246 
6247     __ bind(LOOP);
6248 
6249       // main loop
6250       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
6251       __ add(idx, idx, vec_len);
6252       // Compare strings.
6253       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
6254       __ br(__ NE, MISMATCH);
6255       __ cmp(idx, rscratch1);
6256       __ br(__ LT, LOOP);
6257 
6258     // post loop, last iteration
6259     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
6260 
6261     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
6262     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
6263     __ br(__ EQ, DONE);
6264 
6265     __ bind(MISMATCH);
6266 
    // Crop the predicate to locate the first mismatch.
6268     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
6269     // Extract the first different characters of each string.
6270     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
6271     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
6272 
6273     // Compute the difference of the first different characters.
6274     __ sub(result, rscratch1, rscratch2);
6275 
6276     __ bind(DONE);
6277     __ ret(lr);
6278 #undef LOAD_PAIR
6279     return entry;
6280   }
6281 
6282   void generate_compare_long_strings() {
6283     if (UseSVE == 0) {
6284       StubRoutines::aarch64::_compare_long_string_LL
6285           = generate_compare_long_string_same_encoding(true);
6286       StubRoutines::aarch64::_compare_long_string_UU
6287           = generate_compare_long_string_same_encoding(false);
6288       StubRoutines::aarch64::_compare_long_string_LU
6289           = generate_compare_long_string_different_encoding(true);
6290       StubRoutines::aarch64::_compare_long_string_UL
6291           = generate_compare_long_string_different_encoding(false);
6292     } else {
6293       StubRoutines::aarch64::_compare_long_string_LL
6294           = generate_compare_long_string_sve(LL);
6295       StubRoutines::aarch64::_compare_long_string_UU
6296           = generate_compare_long_string_sve(UU);
6297       StubRoutines::aarch64::_compare_long_string_LU
6298           = generate_compare_long_string_sve(LU);
6299       StubRoutines::aarch64::_compare_long_string_UL
6300           = generate_compare_long_string_sve(UL);
6301     }
6302   }
6303 
6304   // R0 = result
6305   // R1 = str2
6306   // R2 = cnt1
6307   // R3 = str1
6308   // R4 = cnt2
6309   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
6310   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) since length >= 8, we can safely keep at least the first register of the
  //    pattern loaded, skipping the initial load (this helps on systems with a
  //    single load pipeline)
  // 2) we use a "fast" algorithm for finding the first pattern character, with
  //    fewer branches (one branch per loaded register instead of one per
  //    character); this is where constants like 0x0101...01, 0x00010001...0001,
  //    0x7f7f...7f and 0x7fff7fff...7fff come from (see the sketch after this
  //    comment)
  // 3) once the first register of the source string has been loaded and
  //    analyzed, it can be reused to search for every occurrence of the first
  //    character, saving a few loads compared with a simpler-but-slower
  //    implementation
  // 4) to avoid a lot of push/pop operations, the code below heavily reuses,
  //    re-initializes and compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations
  //    are issued during loads or branches, so the penalty is minimal
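  // A scalar sketch of the match-detection trick used below for Latin-1 data
  // (the UTF-16 variant uses the 0x0001... and 0x7fff... constants instead):
  //
  //   ch2  = loaded_word ^ (first_char * 0x0101010101010101); // 0x00 bytes mark matches
  //   tmp2 = (ch2 - 0x0101010101010101) & ~(ch2 | 0x7f7f7f7f7f7f7f7f);
  //
  // tmp2 is non-zero iff some byte of ch2 is zero, i.e. iff the first pattern
  // character occurs somewhere in the loaded word.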
6325   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
6326     StubGenStubId stub_id;
6327     if (str1_isL) {
6328       if (str2_isL) {
6329         stub_id = StubGenStubId::string_indexof_linear_ll_id;
6330       } else {
6331         stub_id = StubGenStubId::string_indexof_linear_ul_id;
6332       }
6333     } else {
6334       if (str2_isL) {
6335         ShouldNotReachHere();
6336       } else {
6337         stub_id = StubGenStubId::string_indexof_linear_uu_id;
6338       }
6339     }
6340     __ align(CodeEntryAlignment);
6341     StubCodeMark mark(this, stub_id);
6342     address entry = __ pc();
6343 
6344     int str1_chr_size = str1_isL ? 1 : 2;
6345     int str2_chr_size = str2_isL ? 1 : 2;
6346     int str1_chr_shift = str1_isL ? 0 : 1;
6347     int str2_chr_shift = str2_isL ? 0 : 1;
6348     bool isL = str1_isL && str2_isL;
6349    // parameters
6350     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
6351     // temporary registers
6352     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
6353     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
6354     // redefinitions
6355     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
6356 
6357     __ push(spilled_regs, sp);
6358     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
6359         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
6360         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
6361         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
6362         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
6363         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
6364     // Read whole register from str1. It is safe, because length >=8 here
6365     __ ldr(ch1, Address(str1));
6366     // Read whole register from str2. It is safe, because length >=8 here
6367     __ ldr(ch2, Address(str2));
6368     __ sub(cnt2, cnt2, cnt1);
6369     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
6370     if (str1_isL != str2_isL) {
6371       __ eor(v0, __ T16B, v0, v0);
6372     }
6373     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
6374     __ mul(first, first, tmp1);
6375     // check if we have less than 1 register to check
6376     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
6377     if (str1_isL != str2_isL) {
6378       __ fmovd(v1, ch1);
6379     }
6380     __ br(__ LE, L_SMALL);
6381     __ eor(ch2, first, ch2);
6382     if (str1_isL != str2_isL) {
6383       __ zip1(v1, __ T16B, v1, v0);
6384     }
6385     __ sub(tmp2, ch2, tmp1);
6386     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6387     __ bics(tmp2, tmp2, ch2);
6388     if (str1_isL != str2_isL) {
6389       __ fmovd(ch1, v1);
6390     }
6391     __ br(__ NE, L_HAS_ZERO);
6392     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
6393     __ add(result, result, wordSize/str2_chr_size);
6394     __ add(str2, str2, wordSize);
6395     __ br(__ LT, L_POST_LOOP);
6396     __ BIND(L_LOOP);
6397       __ ldr(ch2, Address(str2));
6398       __ eor(ch2, first, ch2);
6399       __ sub(tmp2, ch2, tmp1);
6400       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6401       __ bics(tmp2, tmp2, ch2);
6402       __ br(__ NE, L_HAS_ZERO);
6403     __ BIND(L_LOOP_PROCEED);
6404       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
6405       __ add(str2, str2, wordSize);
6406       __ add(result, result, wordSize/str2_chr_size);
6407       __ br(__ GE, L_LOOP);
6408     __ BIND(L_POST_LOOP);
6409       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
6410       __ br(__ LE, NOMATCH);
6411       __ ldr(ch2, Address(str2));
6412       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
6413       __ eor(ch2, first, ch2);
6414       __ sub(tmp2, ch2, tmp1);
6415       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6416       __ mov(tmp4, -1); // all bits set
6417       __ b(L_SMALL_PROCEED);
6418     __ align(OptoLoopAlignment);
6419     __ BIND(L_SMALL);
6420       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
6421       __ eor(ch2, first, ch2);
6422       if (str1_isL != str2_isL) {
6423         __ zip1(v1, __ T16B, v1, v0);
6424       }
6425       __ sub(tmp2, ch2, tmp1);
6426       __ mov(tmp4, -1); // all bits set
6427       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6428       if (str1_isL != str2_isL) {
6429         __ fmovd(ch1, v1); // move converted 4 symbols
6430       }
6431     __ BIND(L_SMALL_PROCEED);
6432       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
6433       __ bic(tmp2, tmp2, ch2);
6434       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
6435       __ rbit(tmp2, tmp2);
6436       __ br(__ EQ, NOMATCH);
6437     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
6439       __ cmp(cnt1, u1(wordSize/str2_chr_size));
6440       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
6441       if (str2_isL) { // LL
6442         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
6443         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
6444         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
6445         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6446         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6447       } else {
6448         __ mov(ch2, 0xE); // all bits in byte set except last one
6449         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6450         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6451         __ lslv(tmp2, tmp2, tmp4);
6452         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6453         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6454         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6455         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6456       }
6457       __ cmp(ch1, ch2);
6458       __ mov(tmp4, wordSize/str2_chr_size);
6459       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6460     __ BIND(L_SMALL_CMP_LOOP);
6461       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6462                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6463       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6464                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6465       __ add(tmp4, tmp4, 1);
6466       __ cmp(tmp4, cnt1);
6467       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
6468       __ cmp(first, ch2);
6469       __ br(__ EQ, L_SMALL_CMP_LOOP);
6470     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
6471       __ cbz(tmp2, NOMATCH); // no more matches. exit
6472       __ clz(tmp4, tmp2);
6473       __ add(result, result, 1); // advance index
6474       __ add(str2, str2, str2_chr_size); // advance pointer
6475       __ b(L_SMALL_HAS_ZERO_LOOP);
6476     __ align(OptoLoopAlignment);
6477     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
6478       __ cmp(first, ch2);
6479       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6480       __ b(DONE);
6481     __ align(OptoLoopAlignment);
6482     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
6483       if (str2_isL) { // LL
6484         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
6485         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
6486         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
6487         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6488         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6489       } else {
6490         __ mov(ch2, 0xE); // all bits in byte set except last one
6491         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6492         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6493         __ lslv(tmp2, tmp2, tmp4);
6494         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6495         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6496         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6497         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6498       }
6499       __ cmp(ch1, ch2);
6500       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6501       __ b(DONE);
6502     __ align(OptoLoopAlignment);
6503     __ BIND(L_HAS_ZERO);
6504       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now compress the counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this
      // loop; they are restored on exit, so cnt1 can be reused inside the loop.
6509       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
6510       __ sub(result, result, 1);
6511     __ BIND(L_HAS_ZERO_LOOP);
6512       __ mov(cnt1, wordSize/str2_chr_size);
6513       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6514       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
6515       if (str2_isL) {
6516         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6517         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6518         __ lslv(tmp2, tmp2, tmp4);
6519         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6520         __ add(tmp4, tmp4, 1);
6521         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6522         __ lsl(tmp2, tmp2, 1);
6523         __ mov(tmp4, wordSize/str2_chr_size);
6524       } else {
6525         __ mov(ch2, 0xE);
6526         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6527         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6528         __ lslv(tmp2, tmp2, tmp4);
6529         __ add(tmp4, tmp4, 1);
6530         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6531         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6532         __ lsl(tmp2, tmp2, 1);
6533         __ mov(tmp4, wordSize/str2_chr_size);
6534         __ sub(str2, str2, str2_chr_size);
6535       }
6536       __ cmp(ch1, ch2);
6537       __ mov(tmp4, wordSize/str2_chr_size);
6538       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6539     __ BIND(L_CMP_LOOP);
6540       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6541                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6542       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6543                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6544       __ add(tmp4, tmp4, 1);
6545       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6546       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
6547       __ cmp(cnt1, ch2);
6548       __ br(__ EQ, L_CMP_LOOP);
6549     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
6551       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6552       __ clz(tmp4, tmp2);
6553       __ add(str2, str2, str2_chr_size); // advance pointer
6554       __ b(L_HAS_ZERO_LOOP);
6555     __ align(OptoLoopAlignment);
6556     __ BIND(L_CMP_LOOP_LAST_CMP);
6557       __ cmp(cnt1, ch2);
6558       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6559       __ b(DONE);
6560     __ align(OptoLoopAlignment);
6561     __ BIND(L_CMP_LOOP_LAST_CMP2);
6562       if (str2_isL) {
6563         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6564         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6565         __ lslv(tmp2, tmp2, tmp4);
6566         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6567         __ add(tmp4, tmp4, 1);
6568         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6569         __ lsl(tmp2, tmp2, 1);
6570       } else {
6571         __ mov(ch2, 0xE);
6572         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6573         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6574         __ lslv(tmp2, tmp2, tmp4);
6575         __ add(tmp4, tmp4, 1);
6576         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6577         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6578         __ lsl(tmp2, tmp2, 1);
6579         __ sub(str2, str2, str2_chr_size);
6580       }
6581       __ cmp(ch1, ch2);
6582       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6583       __ b(DONE);
6584     __ align(OptoLoopAlignment);
6585     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. Until the L_HAS_ZERO block the index was
      //    a multiple of wordSize/str2_chr_size. The byte octet was analyzed in
      //    L_HAS_ZERO_LOOP, so result was increased by at most
      //    wordSize/str2_chr_size - 1 and the respective high bits were not
      //    changed. L_LOOP_PROCEED will increase result by the number of
      //    analyzed characters, so we can simply reset the lower bits of result
      //    here: clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
      //    index of the last analyzed substring inside the current octet, so
      //    str2 is at the respective start address and needs to be advanced to
      //    the next octet.
6596       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6597       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6598       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6599       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
6600       __ movw(cnt2, cnt2);
6601       __ b(L_LOOP_PROCEED);
6602     __ align(OptoLoopAlignment);
6603     __ BIND(NOMATCH);
6604       __ mov(result, -1);
6605     __ BIND(DONE);
6606       __ pop(spilled_regs, sp);
6607       __ ret(lr);
6608     return entry;
6609   }
6610 
6611   void generate_string_indexof_stubs() {
6612     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6613     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6614     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6615   }
6616 
6617   void inflate_and_store_2_fp_registers(bool generatePrfm,
6618       FloatRegister src1, FloatRegister src2) {
6619     Register dst = r1;
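    // v0 is pre-zeroed by the caller; interleaving the Latin-1 bytes with
    // zeroes via zip1/zip2 widens each byte to its 16-bit UTF-16 value.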
6620     __ zip1(v1, __ T16B, src1, v0);
6621     __ zip2(v2, __ T16B, src1, v0);
6622     if (generatePrfm) {
6623       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6624     }
6625     __ zip1(v3, __ T16B, src2, v0);
6626     __ zip2(v4, __ T16B, src2, v0);
6627     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6628   }
6629 
6630   // R0 = src
6631   // R1 = dst
6632   // R2 = len
6633   // R3 = len >> 3
6634   // V0 = 0
6635   // v1 = loaded 8 bytes
6636   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6637   address generate_large_byte_array_inflate() {
6638     __ align(CodeEntryAlignment);
6639     StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
6640     StubCodeMark mark(this, stub_id);
6641     address entry = __ pc();
6642     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6643     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6644     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6645 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
6648     __ ldrd(v2, __ post(src, 8));
6649     __ sub(octetCounter, octetCounter, 2);
6650     __ zip1(v1, __ T16B, v1, v0);
6651     __ zip1(v2, __ T16B, v2, v0);
6652     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6653     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6654     __ subs(rscratch1, octetCounter, large_loop_threshold);
6655     __ br(__ LE, LOOP_START);
6656     __ b(LOOP_PRFM_START);
6657     __ bind(LOOP_PRFM);
6658       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6659     __ bind(LOOP_PRFM_START);
6660       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6661       __ sub(octetCounter, octetCounter, 8);
6662       __ subs(rscratch1, octetCounter, large_loop_threshold);
6663       inflate_and_store_2_fp_registers(true, v3, v4);
6664       inflate_and_store_2_fp_registers(true, v5, v6);
6665       __ br(__ GT, LOOP_PRFM);
6666       __ cmp(octetCounter, (u1)8);
6667       __ br(__ LT, DONE);
6668     __ bind(LOOP);
6669       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6670       __ bind(LOOP_START);
6671       __ sub(octetCounter, octetCounter, 8);
6672       __ cmp(octetCounter, (u1)8);
6673       inflate_and_store_2_fp_registers(false, v3, v4);
6674       inflate_and_store_2_fp_registers(false, v5, v6);
6675       __ br(__ GE, LOOP);
6676     __ bind(DONE);
6677       __ ret(lr);
6678     return entry;
6679   }
6680 
6681   /**
6682    *  Arguments:
6683    *
6684    *  Input:
6685    *  c_rarg0   - current state address
6686    *  c_rarg1   - H key address
6687    *  c_rarg2   - data address
6688    *  c_rarg3   - number of blocks
6689    *
6690    *  Output:
6691    *  Updated state at c_rarg0
6692    */
6693   address generate_ghash_processBlocks() {
6694     // Bafflingly, GCM uses little-endian for the byte order, but
6695     // big-endian for the bit order.  For example, the polynomial 1 is
6696     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6697     //
6698     // So, we must either reverse the bytes in each word and do
6699     // everything big-endian or reverse the bits in each byte and do
6700     // it little-endian.  On AArch64 it's more idiomatic to reverse
6701     // the bits in each byte (we have an instruction, RBIT, to do
6702     // that) and keep the data in little-endian bit order through the
6703     // calculation, bit-reversing the inputs and outputs.
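    // Schematically, the per-block recurrence computed below is
    //   state = ((state ^ data_block) * subkeyH) mod P
    // with multiplication in GF(2^128) and P = x^128 + x^7 + x^2 + x + 1;
    // the 0x87 constant emitted below encodes the low-order terms of P.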
6704 
6705     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
6706     StubCodeMark mark(this, stub_id);
6707     __ align(wordSize * 2);
6708     address p = __ pc();
6709     __ emit_int64(0x87);  // The low-order bits of the field
6710                           // polynomial (i.e. p = z^7+z^2+z+1)
6711                           // repeated in the low and high parts of a
6712                           // 128-bit vector
6713     __ emit_int64(0x87);
6714 
6715     __ align(CodeEntryAlignment);
6716     address start = __ pc();
6717 
6718     Register state   = c_rarg0;
6719     Register subkeyH = c_rarg1;
6720     Register data    = c_rarg2;
6721     Register blocks  = c_rarg3;
6722 
6723     FloatRegister vzr = v30;
6724     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6725 
6726     __ ldrq(v24, p);    // The field polynomial
6727 
6728     __ ldrq(v0, Address(state));
6729     __ ldrq(v1, Address(subkeyH));
6730 
6731     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6732     __ rbit(v0, __ T16B, v0);
6733     __ rev64(v1, __ T16B, v1);
6734     __ rbit(v1, __ T16B, v1);
6735 
6736     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6737     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
6738 
6739     {
6740       Label L_ghash_loop;
6741       __ bind(L_ghash_loop);
6742 
6743       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6744                                                  // reversing each byte
6745       __ rbit(v2, __ T16B, v2);
6746       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6747 
6748       // Multiply state in v2 by subkey in v1
6749       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6750                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6751                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6752       // Reduce v7:v5 by the field polynomial
6753       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6754 
6755       __ sub(blocks, blocks, 1);
6756       __ cbnz(blocks, L_ghash_loop);
6757     }
6758 
6759     // The bit-reversed result is at this point in v0
6760     __ rev64(v0, __ T16B, v0);
6761     __ rbit(v0, __ T16B, v0);
6762 
6763     __ st1(v0, __ T16B, state);
6764     __ ret(lr);
6765 
6766     return start;
6767   }
6768 
6769   address generate_ghash_processBlocks_wide() {
6770     address small = generate_ghash_processBlocks();
6771 
6772     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id;
6773     StubCodeMark mark(this, stub_id);
6774     __ align(wordSize * 2);
6775     address p = __ pc();
6776     __ emit_int64(0x87);  // The low-order bits of the field
6777                           // polynomial (i.e. p = z^7+z^2+z+1)
6778                           // repeated in the low and high parts of a
6779                           // 128-bit vector
6780     __ emit_int64(0x87);
6781 
6782     __ align(CodeEntryAlignment);
6783     address start = __ pc();
6784 
6785     Register state   = c_rarg0;
6786     Register subkeyH = c_rarg1;
6787     Register data    = c_rarg2;
6788     Register blocks  = c_rarg3;
6789 
6790     const int unroll = 4;
6791 
6792     __ cmp(blocks, (unsigned char)(unroll * 2));
6793     __ br(__ LT, small);
6794 
6795     if (unroll > 1) {
      // Save the callee-saved SIMD registers before entering the routine
6797       __ sub(sp, sp, 4 * 16);
6798       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6799       __ sub(sp, sp, 4 * 16);
6800       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6801     }
6802 
6803     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6804 
6805     if (unroll > 1) {
6806       // And restore state
6807       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6808       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6809     }
6810 
6811     __ cmp(blocks, (unsigned char)0);
6812     __ br(__ GT, small);
6813 
6814     __ ret(lr);
6815 
6816     return start;
6817   }
6818 
6819   void generate_base64_encode_simdround(Register src, Register dst,
6820         FloatRegister codec, u8 size) {
6821 
6822     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6823     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6824     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6825 
6826     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6827 
6828     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
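    // ld3 de-interleaves the input so that in0/in1/in2 hold the first, second
    // and third byte of each 3-byte group. The shifts below compute the four
    // 6-bit codec indices per group (a sketch of the intended mapping):
    //   ind0 =   b0 >> 2
    //   ind1 = ((b0 & 0x03) << 4) | (b1 >> 4)
    //   ind2 = ((b1 & 0x0f) << 2) | (b2 >> 6)
    //   ind3 =    b2 & 0x3f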
6829 
6830     __ ushr(ind0, arrangement, in0,  2);
6831 
6832     __ ushr(ind1, arrangement, in1,  2);
6833     __ shl(in0,   arrangement, in0,  6);
6834     __ orr(ind1,  arrangement, ind1, in0);
6835     __ ushr(ind1, arrangement, ind1, 2);
6836 
6837     __ ushr(ind2, arrangement, in2,  4);
6838     __ shl(in1,   arrangement, in1,  4);
6839     __ orr(ind2,  arrangement, in1,  ind2);
6840     __ ushr(ind2, arrangement, ind2, 2);
6841 
6842     __ shl(ind3,  arrangement, in2,  2);
6843     __ ushr(ind3, arrangement, ind3, 2);
6844 
6845     __ tbl(out0,  arrangement, codec,  4, ind0);
6846     __ tbl(out1,  arrangement, codec,  4, ind1);
6847     __ tbl(out2,  arrangement, codec,  4, ind2);
6848     __ tbl(out3,  arrangement, codec,  4, ind3);
6849 
6850     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6851   }
6852 
6853    /**
6854    *  Arguments:
6855    *
6856    *  Input:
6857    *  c_rarg0   - src_start
6858    *  c_rarg1   - src_offset
6859    *  c_rarg2   - src_length
6860    *  c_rarg3   - dest_start
6861    *  c_rarg4   - dest_offset
6862    *  c_rarg5   - isURL
6863    *
6864    */
6865   address generate_base64_encodeBlock() {
6866 
6867     static const char toBase64[64] = {
6868       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6869       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6870       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6871       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6872       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6873     };
6874 
6875     static const char toBase64URL[64] = {
6876       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6877       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6878       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6879       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6880       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6881     };
6882 
6883     __ align(CodeEntryAlignment);
6884     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
6885     StubCodeMark mark(this, stub_id);
6886     address start = __ pc();
6887 
6888     Register src   = c_rarg0;  // source array
6889     Register soff  = c_rarg1;  // source start offset
6890     Register send  = c_rarg2;  // source end offset
6891     Register dst   = c_rarg3;  // dest array
6892     Register doff  = c_rarg4;  // position for writing to dest array
6893     Register isURL = c_rarg5;  // Base64 or URL character set
6894 
6895     // c_rarg6 and c_rarg7 are free to use as temps
6896     Register codec  = c_rarg6;
6897     Register length = c_rarg7;
6898 
6899     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6900 
6901     __ add(src, src, soff);
6902     __ add(dst, dst, doff);
6903     __ sub(length, send, soff);
6904 
6905     // load the codec base address
6906     __ lea(codec, ExternalAddress((address) toBase64));
6907     __ cbz(isURL, ProcessData);
6908     __ lea(codec, ExternalAddress((address) toBase64URL));
6909 
6910     __ BIND(ProcessData);
6911 
    // too short to form a SIMD loop; fall back to the scalar 3-byte loop
6913     __ cmp(length, (u1)24);
6914     __ br(Assembler::LT, Process3B);
6915 
6916     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6917 
6918     __ BIND(Process48B);
6919     __ cmp(length, (u1)48);
6920     __ br(Assembler::LT, Process24B);
6921     generate_base64_encode_simdround(src, dst, v0, 16);
6922     __ sub(length, length, 48);
6923     __ b(Process48B);
6924 
6925     __ BIND(Process24B);
6926     __ cmp(length, (u1)24);
6927     __ br(Assembler::LT, SIMDExit);
6928     generate_base64_encode_simdround(src, dst, v0, 8);
6929     __ sub(length, length, 24);
6930 
6931     __ BIND(SIMDExit);
6932     __ cbz(length, Exit);
6933 
6934     __ BIND(Process3B);
6935     //  3 src bytes, 24 bits
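    // Pack the three bytes into r12 as w = (b0 << 16) | (b1 << 8) | b2 and
    // take the four 6-bit groups as codec indices (a sketch):
    //   idx0 = (w >> 18) & 0x3f,  idx1 = (w >> 12) & 0x3f,
    //   idx2 = (w >>  6) & 0x3f,  idx3 =  w        & 0x3f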
6936     __ ldrb(r10, __ post(src, 1));
6937     __ ldrb(r11, __ post(src, 1));
6938     __ ldrb(r12, __ post(src, 1));
6939     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6940     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6941     // codec index
6942     __ ubfmw(r15, r12, 18, 23);
6943     __ ubfmw(r14, r12, 12, 17);
6944     __ ubfmw(r13, r12, 6,  11);
6945     __ andw(r12,  r12, 63);
6946     // get the code based on the codec
6947     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6948     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6949     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6950     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6951     __ strb(r15, __ post(dst, 1));
6952     __ strb(r14, __ post(dst, 1));
6953     __ strb(r13, __ post(dst, 1));
6954     __ strb(r12, __ post(dst, 1));
6955     __ sub(length, length, 3);
6956     __ cbnz(length, Process3B);
6957 
6958     __ BIND(Exit);
6959     __ ret(lr);
6960 
6961     return start;
6962   }
6963 
6964   void generate_base64_decode_simdround(Register src, Register dst,
6965         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6966 
6967     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6968     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6969 
6970     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6971     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6972 
6973     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6974 
6975     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6976 
6977     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6978 
    // We need an unsigned saturating subtract to make sure all input values
    // in the range [0, 63] map to 0 in the higher-half lookup.
6981     __ uqsubv(decH0, __ T16B, in0, v27);
6982     __ uqsubv(decH1, __ T16B, in1, v27);
6983     __ uqsubv(decH2, __ T16B, in2, v27);
6984     __ uqsubv(decH3, __ T16B, in3, v27);
6985 
6986     // lower half lookup
6987     __ tbl(decL0, arrangement, codecL, 4, in0);
6988     __ tbl(decL1, arrangement, codecL, 4, in1);
6989     __ tbl(decL2, arrangement, codecL, 4, in2);
6990     __ tbl(decL3, arrangement, codecL, 4, in3);
6991 
6992     // higher half lookup
6993     __ tbx(decH0, arrangement, codecH, 4, decH0);
6994     __ tbx(decH1, arrangement, codecH, 4, decH1);
6995     __ tbx(decH2, arrangement, codecH, 4, decH2);
6996     __ tbx(decH3, arrangement, codecH, 4, decH3);
6997 
6998     // combine lower and higher
6999     __ orr(decL0, arrangement, decL0, decH0);
7000     __ orr(decL1, arrangement, decL1, decH1);
7001     __ orr(decL2, arrangement, decL2, decH2);
7002     __ orr(decL3, arrangement, decL3, decH3);
7003 
    // check for illegal inputs, i.e. values larger than 63 (the maximum of 6 bits)
7005     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
7006     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
7007     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
7008     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
7009     __ orr(in0, arrangement, decH0, decH1);
7010     __ orr(in1, arrangement, decH2, decH3);
7011     __ orr(in2, arrangement, in0,   in1);
7012     __ umaxv(in3, arrangement, in2);
7013     __ umov(rscratch2, in3, __ B, 0);
7014 
7015     // get the data to output
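    // Re-pack the four 6-bit values d0..d3 of each group into three output
    // bytes (a sketch of the intended mapping):
    //   out0 = ( d0        << 2) | (d1 >> 4)
    //   out1 = ((d1 & 0xf) << 4) | (d2 >> 2)
    //   out2 = ((d2 & 0x3) << 6) |  d3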
7016     __ shl(out0,  arrangement, decL0, 2);
7017     __ ushr(out1, arrangement, decL1, 4);
7018     __ orr(out0,  arrangement, out0,  out1);
7019     __ shl(out1,  arrangement, decL1, 4);
7020     __ ushr(out2, arrangement, decL2, 2);
7021     __ orr(out1,  arrangement, out1,  out2);
7022     __ shl(out2,  arrangement, decL2, 6);
7023     __ orr(out2,  arrangement, out2,  decL3);
7024 
7025     __ cbz(rscratch2, NoIllegalData);
7026 
7027     // handle illegal input
7028     __ umov(r10, in2, __ D, 0);
7029     if (size == 16) {
7030       __ cbnz(r10, ErrorInLowerHalf);
7031 
7032       // illegal input is in higher half, store the lower half now.
7033       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
7034 
7035       __ umov(r10, in2,  __ D, 1);
7036       __ umov(r11, out0, __ D, 1);
7037       __ umov(r12, out1, __ D, 1);
7038       __ umov(r13, out2, __ D, 1);
7039       __ b(StoreLegalData);
7040 
7041       __ BIND(ErrorInLowerHalf);
7042     }
7043     __ umov(r11, out0, __ D, 0);
7044     __ umov(r12, out1, __ D, 0);
7045     __ umov(r13, out2, __ D, 0);
7046 
7047     __ BIND(StoreLegalData);
7048     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
7049     __ strb(r11, __ post(dst, 1));
7050     __ strb(r12, __ post(dst, 1));
7051     __ strb(r13, __ post(dst, 1));
7052     __ lsr(r10, r10, 8);
7053     __ lsr(r11, r11, 8);
7054     __ lsr(r12, r12, 8);
7055     __ lsr(r13, r13, 8);
7056     __ b(StoreLegalData);
7057 
7058     __ BIND(NoIllegalData);
7059     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
7060   }
7061 
7062 
7063    /**
7064    *  Arguments:
7065    *
7066    *  Input:
7067    *  c_rarg0   - src_start
7068    *  c_rarg1   - src_offset
7069    *  c_rarg2   - src_length
7070    *  c_rarg3   - dest_start
7071    *  c_rarg4   - dest_offset
7072    *  c_rarg5   - isURL
7073    *  c_rarg6   - isMIME
7074    *
7075    */
7076   address generate_base64_decodeBlock() {
7077 
7078     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
7079     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
7080     // titled "Base64 decoding".
7081 
    // The non-SIMD lookup tables are mostly copied from the fromBase64 array
    // used in java.util.Base64, except that the trailing character '=' is also
    // treated as an illegal value in this intrinsic. That is,
    // java.util.Base64.fromBase64['='] == -2, while fromBase(URL)64ForNoSIMD['='] == 255 here.
7085     static const uint8_t fromBase64ForNoSIMD[256] = {
7086       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7087       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7088       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
7089        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
7090       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
7091        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
7092       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
7093        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
7094       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7095       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7096       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7097       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7098       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7099       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7100       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7101       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7102     };
7103 
7104     static const uint8_t fromBase64URLForNoSIMD[256] = {
7105       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7106       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7107       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
7108        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
7109       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
7110        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
7111       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
7112        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
7113       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7114       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7115       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7116       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7117       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7118       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7119       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7120       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7121     };
7122 
    // A legal Base64 code value is in the range [0, 127].  We need two table
    // lookups, with tbl and tbx, and combine their results to get the decoded
    // data. The 1st table vector lookup uses tbl, which sets out-of-range
    // indices to 0 in the destination. The 2nd table vector lookup uses tbx,
    // which leaves out-of-range indices unchanged in the destination. Input
    // [64, 126] is mapped to index [65, 127] in the second lookup. The value
    // at index 64 is set to 0, so that we know the decoded data was already
    // obtained by the 1st lookup.
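    //
    // In C, approximately, for each input byte b, with the lower 64 table
    // bytes (codecL) loaded into v0..v3 and the upper 64 bytes (codecH) into
    // v4..v7:
    //   lo  = (b <= 63) ? codecL[b] : 0;        // tbl zeroes out-of-range indices
    //   idx = (b <= 63) ? 0 : b - 63;           // uqsub: saturating subtract of 63
    //   hi  = (idx <= 63) ? codecH[idx] : idx;  // tbx leaves out-of-range unchanged
    //   dec = lo | hi;                          // any result > 63 flags illegal input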
7130     static const uint8_t fromBase64ForSIMD[128] = {
7131       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7132       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7133       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
7134        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
7135         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
7136        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
7137       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
7138        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
7139     };
7140 
7141     static const uint8_t fromBase64URLForSIMD[128] = {
7142       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7143       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
7144       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
7145        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
7146         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
7147        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
7148        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
7149        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
7150     };
7151 
7152     __ align(CodeEntryAlignment);
7153     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
7154     StubCodeMark mark(this, stub_id);
7155     address start = __ pc();
7156 
7157     Register src    = c_rarg0;  // source array
7158     Register soff   = c_rarg1;  // source start offset
7159     Register send   = c_rarg2;  // source end offset
7160     Register dst    = c_rarg3;  // dest array
7161     Register doff   = c_rarg4;  // position for writing to dest array
7162     Register isURL  = c_rarg5;  // Base64 or URL character set
7163     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
7164 
7165     Register length = send;    // reuse send as length of source data to process
7166 
7167     Register simd_codec   = c_rarg6;
7168     Register nosimd_codec = c_rarg7;
7169 
7170     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
7171 
7172     __ enter();
7173 
7174     __ add(src, src, soff);
7175     __ add(dst, dst, doff);
7176 
7177     __ mov(doff, dst);
7178 
7179     __ sub(length, send, soff);
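    // Round the length down to a multiple of 4 by clearing its two low bits;
    // the input is consumed four Base64 characters at a time.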
7180     __ bfm(length, zr, 0, 1);
7181 
7182     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
7183     __ cbz(isURL, ProcessData);
7184     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
7185 
7186     __ BIND(ProcessData);
7187     __ mov(rscratch1, length);
7188     __ cmp(length, (u1)144); // 144 = 80 + 64
7189     __ br(Assembler::LT, Process4B);
7190 
7191     // In the MIME case, the line length cannot be more than 76
7192     // bytes (see RFC 2045). This is too short a block for SIMD
7193     // to be worthwhile, so we use non-SIMD here.
7194     __ movw(rscratch1, 79);
7195 
7196     __ BIND(Process4B);
7197     __ ldrw(r14, __ post(src, 4));
7198     __ ubfxw(r10, r14, 0,  8);
7199     __ ubfxw(r11, r14, 8,  8);
7200     __ ubfxw(r12, r14, 16, 8);
7201     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded value of each character
7203     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
7204     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
7205     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
7206     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
7207     // error detection, 255u indicates an illegal input
7208     __ orrw(r14, r10, r11);
7209     __ orrw(r15, r12, r13);
7210     __ orrw(r14, r14, r15);
7211     __ tbnz(r14, 7, Exit);
7212     // recover the data
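    // In C, approximately (r10..r13 hold the four decoded 6-bit values
    // a, b, c, d of one group):
    //   byte0 = (a << 2) | (b >> 4);
    //   byte1 = (b << 4) | (c >> 2);
    //   byte2 = (c << 6) | d;
    // byte0 and byte1 are assembled in r14 and stored with strh after a
    // rev16 byte swap; byte2 is assembled in r13 and stored with strb.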
7213     __ lslw(r14, r10, 10);
7214     __ bfiw(r14, r11, 4, 6);
7215     __ bfmw(r14, r12, 2, 5);
7216     __ rev16w(r14, r14);
7217     __ bfiw(r13, r12, 6, 2);
7218     __ strh(r14, __ post(dst, 2));
7219     __ strb(r13, __ post(dst, 1));
7220     // non-simd loop
7221     __ subsw(rscratch1, rscratch1, 4);
7222     __ br(Assembler::GT, Process4B);
7223 
    // If we arrive here from the 80-byte pre-processing pass (rscratch1
    // started at 79), rscratch1 == -1; otherwise (rscratch1 started at a
    // multiple of 4), rscratch1 == 0.
7226     __ cbzw(rscratch1, Exit);
7227     __ sub(length, length, 80);
7228 
7229     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
7230     __ cbz(isURL, SIMDEnter);
7231     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
7232 
7233     __ BIND(SIMDEnter);
7234     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
7235     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
7236     __ mov(rscratch1, 63);
7237     __ dup(v27, __ T16B, rscratch1);
7238 
7239     __ BIND(Process64B);
7240     __ cmp(length, (u1)64);
7241     __ br(Assembler::LT, Process32B);
7242     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
7243     __ sub(length, length, 64);
7244     __ b(Process64B);
7245 
7246     __ BIND(Process32B);
7247     __ cmp(length, (u1)32);
7248     __ br(Assembler::LT, SIMDExit);
7249     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
7250     __ sub(length, length, 32);
7251     __ b(Process32B);
7252 
7253     __ BIND(SIMDExit);
7254     __ cbz(length, Exit);
7255     __ movw(rscratch1, length);
7256     __ b(Process4B);
7257 
7258     __ BIND(Exit);
7259     __ sub(c_rarg0, dst, doff);
7260 
7261     __ leave();
7262     __ ret(lr);
7263 
7264     return start;
7265   }
7266 
7267   // Support for spin waits.
7268   address generate_spin_wait() {
7269     __ align(CodeEntryAlignment);
7270     StubGenStubId stub_id = StubGenStubId::spin_wait_id;
7271     StubCodeMark mark(this, stub_id);
7272     address start = __ pc();
7273 
7274     __ spin_wait();
7275     __ ret(lr);
7276 
7277     return start;
7278   }
7279 
7280   void generate_lookup_secondary_supers_table_stub() {
7281     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
7282     StubCodeMark mark(this, stub_id);
7283 
7284     const Register
7285       r_super_klass  = r0,
7286       r_array_base   = r1,
7287       r_array_length = r2,
7288       r_array_index  = r3,
7289       r_sub_klass    = r4,
7290       r_bitmap       = rscratch2,
7291       result         = r5;
7292     const FloatRegister
7293       vtemp          = v0;
7294 
7295     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
7296       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
7297       Label L_success;
7298       __ enter();
7299       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
7300                                              r_array_base, r_array_length, r_array_index,
7301                                              vtemp, result, slot,
7302                                              /*stub_is_near*/true);
7303       __ leave();
7304       __ ret(lr);
7305     }
7306   }
7307 
7308   // Slow path implementation for UseSecondarySupersTable.
7309   address generate_lookup_secondary_supers_table_slow_path_stub() {
7310     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
7311     StubCodeMark mark(this, stub_id);
7312 
7313     address start = __ pc();
7314     const Register
7315       r_super_klass  = r0,        // argument
7316       r_array_base   = r1,        // argument
7317       temp1          = r2,        // temp
7318       r_array_index  = r3,        // argument
7319       r_bitmap       = rscratch2, // argument
7320       result         = r5;        // argument
7321 
7322     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
7323     __ ret(lr);
7324 
7325     return start;
7326   }
7327 
7328 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
7329 
7330   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
7331   //
7332   // If LSE is in use, generate LSE versions of all the stubs. The
7333   // non-LSE versions are in atomic_aarch64.S.
7334 
7335   // class AtomicStubMark records the entry point of a stub and the
7336   // stub pointer which will point to it. The stub pointer is set to
7337   // the entry point when ~AtomicStubMark() is called, which must be
7338   // after ICache::invalidate_range. This ensures safe publication of
7339   // the generated code.
7340   class AtomicStubMark {
7341     address _entry_point;
7342     aarch64_atomic_stub_t *_stub;
7343     MacroAssembler *_masm;
7344   public:
7345     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
7346       _masm = masm;
7347       __ align(32);
7348       _entry_point = __ pc();
7349       _stub = stub;
7350     }
7351     ~AtomicStubMark() {
7352       *_stub = (aarch64_atomic_stub_t)_entry_point;
7353     }
7354   };
7355 
7356   // NB: For memory_order_conservative we need a trailing membar after
7357   // LSE atomic operations but not a leading membar.
7358   //
7359   // We don't need a leading membar because a clause in the Arm ARM
7360   // says:
7361   //
7362   //   Barrier-ordered-before
7363   //
7364   //   Barrier instructions order prior Memory effects before subsequent
7365   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
7369   //   instruction with both Acquire and Release semantics.
7370   //
7371   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
7372   // and Release semantics, therefore we don't need a leading
7373   // barrier. However, there is no corresponding Barrier-ordered-after
7374   // relationship, therefore we need a trailing membar to prevent a
7375   // later store or load from being reordered with the store in an
7376   // atomic instruction.
7377   //
7378   // This was checked by using the herd7 consistency model simulator
7379   // (http://diy.inria.fr/) with this test case:
7380   //
7381   // AArch64 LseCas
7382   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
7383   // P0 | P1;
7384   // LDR W4, [X2] | MOV W3, #0;
7385   // DMB LD       | MOV W4, #1;
7386   // LDR W3, [X1] | CASAL W3, W4, [X1];
7387   //              | DMB ISH;
7388   //              | STR W4, [X2];
7389   // exists
7390   // (0:X3=0 /\ 0:X4=1)
7391   //
7392   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
7393   // with the store to x in P1. Without the DMB in P1 this may happen.
7394   //
7395   // At the time of writing we don't know of any AArch64 hardware that
7396   // reorders stores in this way, but the Reference Manual permits it.
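
  // Each of the stubs generated below follows the standard C calling
  // convention: the memory address arrives in c_rarg0, the operand(s) in
  // c_rarg1 (and c_rarg2 for CAS), and the previous value of the memory
  // location is returned in r0 (w0 for sub-64-bit operations).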
7397 
7398   void gen_cas_entry(Assembler::operand_size size,
7399                      atomic_memory_order order) {
7400     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
7401       exchange_val = c_rarg2;
7402     bool acquire, release;
7403     switch (order) {
7404       case memory_order_relaxed:
7405         acquire = false;
7406         release = false;
7407         break;
7408       case memory_order_release:
7409         acquire = false;
7410         release = true;
7411         break;
7412       default:
7413         acquire = true;
7414         release = true;
7415         break;
7416     }
7417     __ mov(prev, compare_val);
7418     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
7419     if (order == memory_order_conservative) {
7420       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7421     }
7422     if (size == Assembler::xword) {
7423       __ mov(r0, prev);
7424     } else {
7425       __ movw(r0, prev);
7426     }
7427     __ ret(lr);
7428   }
7429 
7430   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
7431     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
7432     // If not relaxed, then default to conservative.  Relaxed is the only
7433     // case we use enough to be worth specializing.
7434     if (order == memory_order_relaxed) {
7435       __ ldadd(size, incr, prev, addr);
7436     } else {
7437       __ ldaddal(size, incr, prev, addr);
7438       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7439     }
7440     if (size == Assembler::xword) {
7441       __ mov(r0, prev);
7442     } else {
7443       __ movw(r0, prev);
7444     }
7445     __ ret(lr);
7446   }
7447 
7448   void gen_swpal_entry(Assembler::operand_size size) {
7449     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
7450     __ swpal(size, incr, prev, addr);
7451     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7452     if (size == Assembler::xword) {
7453       __ mov(r0, prev);
7454     } else {
7455       __ movw(r0, prev);
7456     }
7457     __ ret(lr);
7458   }
7459 
7460   void generate_atomic_entry_points() {
7461     if (! UseLSE) {
7462       return;
7463     }
7464     __ align(CodeEntryAlignment);
7465     StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id;
7466     StubCodeMark mark(this, stub_id);
7467     address first_entry = __ pc();
7468 
7469     // ADD, memory_order_conservative
7470     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
7471     gen_ldadd_entry(Assembler::word, memory_order_conservative);
7472     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
7473     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
7474 
7475     // ADD, memory_order_relaxed
7476     AtomicStubMark mark_fetch_add_4_relaxed
7477       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
7478     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
7479     AtomicStubMark mark_fetch_add_8_relaxed
7480       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
7481     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
7482 
7483     // XCHG, memory_order_conservative
7484     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
7485     gen_swpal_entry(Assembler::word);
7486     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
7487     gen_swpal_entry(Assembler::xword);
7488 
7489     // CAS, memory_order_conservative
7490     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
7491     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
7492     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
7493     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
7494     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
7495     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
7496 
7497     // CAS, memory_order_relaxed
7498     AtomicStubMark mark_cmpxchg_1_relaxed
7499       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
7500     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
7501     AtomicStubMark mark_cmpxchg_4_relaxed
7502       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
7503     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
7504     AtomicStubMark mark_cmpxchg_8_relaxed
7505       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
7506     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
7507 
7508     AtomicStubMark mark_cmpxchg_4_release
7509       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
7510     gen_cas_entry(MacroAssembler::word, memory_order_release);
7511     AtomicStubMark mark_cmpxchg_8_release
7512       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
7513     gen_cas_entry(MacroAssembler::xword, memory_order_release);
7514 
7515     AtomicStubMark mark_cmpxchg_4_seq_cst
7516       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
7517     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
7518     AtomicStubMark mark_cmpxchg_8_seq_cst
7519       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
7520     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
7521 
7522     ICache::invalidate_range(first_entry, __ pc() - first_entry);
7523   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
7525 
7526   address generate_cont_thaw(Continuation::thaw_kind kind) {
7527     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
7528     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
7529 
7530     address start = __ pc();
7531 
7532     if (return_barrier) {
7533       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7534       __ mov(sp, rscratch1);
7535     }
7536     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7537 
7538     if (return_barrier) {
7539       // preserve possible return value from a method returning to the return barrier
7540       __ fmovd(rscratch1, v0);
7541       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7542     }
7543 
7544     __ movw(c_rarg1, (return_barrier ? 1 : 0));
7545     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
7546     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
7547 
7548     if (return_barrier) {
7549       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7550       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7551       __ fmovd(v0, rscratch1);
7552     }
7553     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7554 
7555 
7556     Label thaw_success;
7557     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
7558     __ cbnz(rscratch2, thaw_success);
7559     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
7560     __ br(rscratch1);
7561     __ bind(thaw_success);
7562 
7563     // make room for the thawed frames
7564     __ sub(rscratch1, sp, rscratch2);
7565     __ andr(rscratch1, rscratch1, -16); // align
7566     __ mov(sp, rscratch1);
7567 
7568     if (return_barrier) {
7569       // save original return value -- again
7570       __ fmovd(rscratch1, v0);
7571       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7572     }
7573 
7574     // If we want, we can templatize thaw by kind, and have three different entries
7575     __ movw(c_rarg1, (uint32_t)kind);
7576 
7577     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
7578     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
7579 
7580     if (return_barrier) {
7581       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7582       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7583       __ fmovd(v0, rscratch1);
7584     } else {
7585       __ mov(r0, zr); // return 0 (success) from doYield
7586     }
7587 
7588     // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
7589     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
7590     __ mov(rfp, sp);
7591 
7592     if (return_barrier_exception) {
7593       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
7594       __ authenticate_return_address(c_rarg1);
7595       __ verify_oop(r0);
7596       // save return value containing the exception oop in callee-saved R19
7597       __ mov(r19, r0);
7598 
7599       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
7600 
7601       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
7602       // __ reinitialize_ptrue();
7603 
7604       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7605 
7606       __ mov(r1, r0); // the exception handler
7607       __ mov(r0, r19); // restore return value containing the exception oop
7608       __ verify_oop(r0);
7609 
7610       __ leave();
7611       __ mov(r3, lr);
7612       __ br(r1); // the exception handler
7613     } else {
7614       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7615       __ leave();
7616       __ ret(lr);
7617     }
7618 
7619     return start;
7620   }
7621 
7622   address generate_cont_thaw() {
7623     if (!Continuations::enabled()) return nullptr;
7624 
7625     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
7626     StubCodeMark mark(this, stub_id);
7627     address start = __ pc();
7628     generate_cont_thaw(Continuation::thaw_top);
7629     return start;
7630   }
7631 
7632   address generate_cont_returnBarrier() {
7633     if (!Continuations::enabled()) return nullptr;
7634 
7635     // TODO: will probably need multiple return barriers depending on return type
7636     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
7637     StubCodeMark mark(this, stub_id);
7638     address start = __ pc();
7639 
7640     generate_cont_thaw(Continuation::thaw_return_barrier);
7641 
7642     return start;
7643   }
7644 
7645   address generate_cont_returnBarrier_exception() {
7646     if (!Continuations::enabled()) return nullptr;
7647 
7648     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
7649     StubCodeMark mark(this, stub_id);
7650     address start = __ pc();
7651 
7652     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7653 
7654     return start;
7655   }
7656 
7657   address generate_cont_preempt_stub() {
7658     if (!Continuations::enabled()) return nullptr;
7659     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
7660     StubCodeMark mark(this, stub_id);
7661     address start = __ pc();
7662 
7663     __ reset_last_Java_frame(true);
7664 
7665     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
7666     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
7667     __ mov(sp, rscratch2);
7668 
7669     Label preemption_cancelled;
7670     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
7671     __ cbnz(rscratch1, preemption_cancelled);
7672 
7673     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
7674     SharedRuntime::continuation_enter_cleanup(_masm);
7675     __ leave();
7676     __ ret(lr);
7677 
7678     // We acquired the monitor after freezing the frames so call thaw to continue execution.
7679     __ bind(preemption_cancelled);
7680     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
7681     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
7682     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
7683     __ ldr(rscratch1, Address(rscratch1));
7684     __ br(rscratch1);
7685 
7686     return start;
7687   }
7688 
7689   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
7690   // are represented as long[5], with BITS_PER_LIMB = 26.
7691   // Pack five 26-bit limbs into three 64-bit registers.
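  //
  // In C, approximately:
  //   dest0 = src[0] | (src[1] << 26) | (src[2] << 52);
  //   dest1 = (src[2] >> 12) | (src[3] << 14) | (src[4] << 40);
  //   dest2 = src[4] >> 24;   // at most 2 bits; must be zero when dest2 is noreg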
7692   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
7693     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
7694     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
7695     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
7696     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
7697 
7698     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
7699     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
7700     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
7701     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
7702 
7703     if (dest2->is_valid()) {
7704       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7705     } else {
7706 #ifdef ASSERT
7707       Label OK;
7708       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7709       __ br(__ EQ, OK);
7710       __ stop("high bits of Poly1305 integer should be zero");
7711       __ should_not_reach_here();
7712       __ bind(OK);
7713 #endif
7714     }
7715   }
7716 
7717   // As above, but return only a 128-bit integer, packed into two
7718   // 64-bit registers.
7719   void pack_26(Register dest0, Register dest1, Register src) {
7720     pack_26(dest0, dest1, noreg, src);
7721   }
7722 
7723   // Multiply and multiply-accumulate unsigned 64-bit registers.
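  // In C, approximately:
  //   prod_hi:prod_lo  = (unsigned __int128)n * m;   // wide_mul
  //   sum_hi:sum_lo   += (unsigned __int128)n * m;   // wide_madd (mod 2^128)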
7724   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
7725     __ mul(prod_lo, n, m);
7726     __ umulh(prod_hi, n, m);
7727   }
7728   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
7729     wide_mul(rscratch1, rscratch2, n, m);
7730     __ adds(sum_lo, sum_lo, rscratch1);
7731     __ adc(sum_hi, sum_hi, rscratch2);
7732   }
7733 
7734   // Poly1305, RFC 7539
7735 
7736   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
7737   // description of the tricks used to simplify and accelerate this
7738   // computation.
7739 
7740   address generate_poly1305_processBlocks() {
7741     __ align(CodeEntryAlignment);
7742     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
7743     StubCodeMark mark(this, stub_id);
7744     address start = __ pc();
7745     Label here;
7746     __ enter();
7747     RegSet callee_saved = RegSet::range(r19, r28);
7748     __ push(callee_saved, sp);
7749 
7750     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
7751 
7752     // Arguments
7753     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
7754 
7755     // R_n is the 128-bit randomly-generated key, packed into two
7756     // registers.  The caller passes this key to us as long[5], with
7757     // BITS_PER_LIMB = 26.
7758     const Register R_0 = *++regs, R_1 = *++regs;
7759     pack_26(R_0, R_1, r_start);
7760 
7761     // RR_n is (R_n >> 2) * 5
7762     const Register RR_0 = *++regs, RR_1 = *++regs;
7763     __ lsr(RR_0, R_0, 2);
7764     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
7765     __ lsr(RR_1, R_1, 2);
7766     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
7767 
7768     // U_n is the current checksum
7769     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
7770     pack_26(U_0, U_1, U_2, acc_start);
7771 
7772     static constexpr int BLOCK_LENGTH = 16;
7773     Label DONE, LOOP;
7774 
7775     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7776     __ br(Assembler::LT, DONE); {
7777       __ bind(LOOP);
7778 
7779       // S_n is to be the sum of U_n and the next block of data
7780       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7781       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
7782       __ adds(S_0, U_0, S_0);
7783       __ adcs(S_1, U_1, S_1);
7784       __ adc(S_2, U_2, zr);
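      // Add 2^128: Poly1305 pads each full 16-byte block with a 1 byte
      // (RFC 7539), which lands in the high limb S_2.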
7785       __ add(S_2, S_2, 1);
7786 
7787       const Register U_0HI = *++regs, U_1HI = *++regs;
7788 
7789       // NB: this logic depends on some of the special properties of
7790       // Poly1305 keys. In particular, because we know that the top
7791       // four bits of R_0 and R_1 are zero, we can add together
7792       // partial products without any risk of needing to propagate a
7793       // carry out.
7794       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
7795       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
7796       __ andr(U_2, R_0, 3);
7797       __ mul(U_2, S_2, U_2);
7798 
7799       // Recycle registers S_0, S_1, S_2
7800       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
7801 
7802       // Partial reduction mod 2**130 - 5
7803       __ adds(U_1, U_0HI, U_1);
7804       __ adc(U_2, U_1HI, U_2);
7805       // Sum now in U_2:U_1:U_0.
7806       // Dead: U_0HI, U_1HI.
7807       regs = (regs.remaining() + U_0HI + U_1HI).begin();
7808 
7809       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
7810 
7811       // First, U_2:U_1:U_0 += (U_2 >> 2)
7812       __ lsr(rscratch1, U_2, 2);
7813       __ andr(U_2, U_2, (u8)3);
7814       __ adds(U_0, U_0, rscratch1);
7815       __ adcs(U_1, U_1, zr);
7816       __ adc(U_2, U_2, zr);
7817       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
7818       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
7819       __ adcs(U_1, U_1, zr);
7820       __ adc(U_2, U_2, zr);
7821 
7822       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
7823       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7824       __ br(~ Assembler::LT, LOOP);
7825     }
7826 
7827     // Further reduce modulo 2^130 - 5
7828     __ lsr(rscratch1, U_2, 2);
7829     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
7830     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
7831     __ adcs(U_1, U_1, zr);
7832     __ andr(U_2, U_2, (u1)3);
7833     __ adc(U_2, U_2, zr);
7834 
7835     // Unpack the sum into five 26-bit limbs and write to memory.
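    // In C, approximately (acc_start points to the long[5] accumulator):
    //   acc[0] = U_0 & 0x3ffffff;
    //   acc[1] = (U_0 >> 26) & 0x3ffffff;
    //   acc[2] = (U_0 >> 52) | ((U_1 & 0x3fff) << 12);
    //   acc[3] = (U_1 >> 14) & 0x3ffffff;
    //   acc[4] = (U_1 >> 40) | ((U_2 & 0x7) << 24);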
7836     __ ubfiz(rscratch1, U_0, 0, 26);
7837     __ ubfx(rscratch2, U_0, 26, 26);
7838     __ stp(rscratch1, rscratch2, Address(acc_start));
7839     __ ubfx(rscratch1, U_0, 52, 12);
7840     __ bfi(rscratch1, U_1, 12, 14);
7841     __ ubfx(rscratch2, U_1, 14, 26);
7842     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
7843     __ ubfx(rscratch1, U_1, 40, 24);
7844     __ bfi(rscratch1, U_2, 24, 3);
7845     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
7846 
7847     __ bind(DONE);
7848     __ pop(callee_saved, sp);
7849     __ leave();
7850     __ ret(lr);
7851 
7852     return start;
7853   }
7854 
7855   // exception handler for upcall stubs
7856   address generate_upcall_stub_exception_handler() {
7857     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
7858     StubCodeMark mark(this, stub_id);
7859     address start = __ pc();
7860 
7861     // Native caller has no idea how to handle exceptions,
7862     // so we just crash here. Up to callee to catch exceptions.
7863     __ verify_oop(r0);
7864     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
7865     __ blr(rscratch1);
7866     __ should_not_reach_here();
7867 
7868     return start;
7869   }
7870 
7871   // load Method* target of MethodHandle
7872   // j_rarg0 = jobject receiver
7873   // rmethod = result
7874   address generate_upcall_stub_load_target() {
7875     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
7876     StubCodeMark mark(this, stub_id);
7877     address start = __ pc();
7878 
7879     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
    // Load target method from receiver
7881     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
7882     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
7883     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
7884     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
7885                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
7886                       noreg, noreg);
7887     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
7888 
7889     __ ret(lr);
7890 
7891     return start;
7892   }
7893 
7894 #undef __
7895 #define __ masm->
7896 
7897   class MontgomeryMultiplyGenerator : public MacroAssembler {
7898 
7899     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7900       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7901 
7902     RegSet _toSave;
7903     bool _squaring;
7904 
7905   public:
7906     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7907       : MacroAssembler(as->code()), _squaring(squaring) {
7908 
7909       // Register allocation
7910 
7911       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7912       Pa_base = *regs;       // Argument registers
7913       if (squaring)
7914         Pb_base = Pa_base;
7915       else
7916         Pb_base = *++regs;
7917       Pn_base = *++regs;
      Rlen = *++regs;
7919       inv = *++regs;
7920       Pm_base = *++regs;
7921 
7922                           // Working registers:
7923       Ra =  *++regs;        // The current digit of a, b, n, and m.
7924       Rb =  *++regs;
7925       Rm =  *++regs;
7926       Rn =  *++regs;
7927 
7928       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7929       Pb =  *++regs;
7930       Pm =  *++regs;
7931       Pn =  *++regs;
7932 
7933       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
7935       t2 =  *++regs;
7936 
7937       Ri =  *++regs;        // Inner and outer loop indexes.
7938       Rj =  *++regs;
7939 
7940       Rhi_ab = *++regs;     // Product registers: low and high parts
7941       Rlo_ab = *++regs;     // of a*b and m*n.
7942       Rhi_mn = *++regs;
7943       Rlo_mn = *++regs;
7944 
7945       // r19 and up are callee-saved.
7946       _toSave = RegSet::range(r19, *regs) + Pm_base;
7947     }
7948 
7949   private:
7950     void save_regs() {
7951       push(_toSave, sp);
7952     }
7953 
7954     void restore_regs() {
7955       pop(_toSave, sp);
7956     }
7957 
7958     template <typename T>
7959     void unroll_2(Register count, T block) {
7960       Label loop, end, odd;
7961       tbnz(count, 0, odd);
7962       cbz(count, end);
7963       align(16);
7964       bind(loop);
7965       (this->*block)();
7966       bind(odd);
7967       (this->*block)();
7968       subs(count, count, 2);
7969       br(Assembler::GT, loop);
7970       bind(end);
7971     }
7972 
7973     template <typename T>
7974     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7975       Label loop, end, odd;
7976       tbnz(count, 0, odd);
7977       cbz(count, end);
7978       align(16);
7979       bind(loop);
7980       (this->*block)(d, s, tmp);
7981       bind(odd);
7982       (this->*block)(d, s, tmp);
7983       subs(count, count, 2);
7984       br(Assembler::GT, loop);
7985       bind(end);
7986     }
7987 
7988     void pre1(RegisterOrConstant i) {
7989       block_comment("pre1");
7990       // Pa = Pa_base;
7991       // Pb = Pb_base + i;
7992       // Pm = Pm_base;
7993       // Pn = Pn_base + i;
7994       // Ra = *Pa;
7995       // Rb = *Pb;
7996       // Rm = *Pm;
7997       // Rn = *Pn;
7998       ldr(Ra, Address(Pa_base));
7999       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
8000       ldr(Rm, Address(Pm_base));
8001       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
8002       lea(Pa, Address(Pa_base));
8003       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
8004       lea(Pm, Address(Pm_base));
8005       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
8006 
8007       // Zero the m*n result.
8008       mov(Rhi_mn, zr);
8009       mov(Rlo_mn, zr);
8010     }
8011 
8012     // The core multiply-accumulate step of a Montgomery
8013     // multiplication.  The idea is to schedule operations as a
8014     // pipeline so that instructions with long latencies (loads and
8015     // multiplies) have time to complete before their results are
8016     // used.  This most benefits in-order implementations of the
8017     // architecture but out-of-order ones also benefit.
8018     void step() {
8019       block_comment("step");
8020       // MACC(Ra, Rb, t0, t1, t2);
8021       // Ra = *++Pa;
8022       // Rb = *--Pb;
8023       umulh(Rhi_ab, Ra, Rb);
8024       mul(Rlo_ab, Ra, Rb);
8025       ldr(Ra, pre(Pa, wordSize));
8026       ldr(Rb, pre(Pb, -wordSize));
8027       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
8028                                        // previous iteration.
8029       // MACC(Rm, Rn, t0, t1, t2);
8030       // Rm = *++Pm;
8031       // Rn = *--Pn;
8032       umulh(Rhi_mn, Rm, Rn);
8033       mul(Rlo_mn, Rm, Rn);
8034       ldr(Rm, pre(Pm, wordSize));
8035       ldr(Rn, pre(Pn, -wordSize));
8036       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
8037     }
8038 
8039     void post1() {
8040       block_comment("post1");
8041 
8042       // MACC(Ra, Rb, t0, t1, t2);
8043       // Ra = *++Pa;
8044       // Rb = *--Pb;
8045       umulh(Rhi_ab, Ra, Rb);
8046       mul(Rlo_ab, Ra, Rb);
8047       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
8048       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
8049 
8050       // *Pm = Rm = t0 * inv;
8051       mul(Rm, t0, inv);
8052       str(Rm, Address(Pm));
8053 
8054       // MACC(Rm, Rn, t0, t1, t2);
8055       // t0 = t1; t1 = t2; t2 = 0;
8056       umulh(Rhi_mn, Rm, Rn);
8057 
8058 #ifndef PRODUCT
8059       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
8060       {
8061         mul(Rlo_mn, Rm, Rn);
8062         add(Rlo_mn, t0, Rlo_mn);
8063         Label ok;
8064         cbz(Rlo_mn, ok); {
8065           stop("broken Montgomery multiply");
8066         } bind(ok);
8067       }
8068 #endif
8069       // We have very carefully set things up so that
8070       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
8071       // the lower half of Rm * Rn because we know the result already:
8072       // it must be -t0.  t0 + (-t0) must generate a carry iff
8073       // t0 != 0.  So, rather than do a mul and an adds we just set
8074       // the carry flag iff t0 is nonzero.
8075       //
8076       // mul(Rlo_mn, Rm, Rn);
8077       // adds(zr, t0, Rlo_mn);
8078       subs(zr, t0, 1); // Set carry iff t0 is nonzero
8079       adcs(t0, t1, Rhi_mn);
8080       adc(t1, t2, zr);
8081       mov(t2, zr);
8082     }
8083 
8084     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
8085       block_comment("pre2");
8086       // Pa = Pa_base + i-len;
8087       // Pb = Pb_base + len;
8088       // Pm = Pm_base + i-len;
8089       // Pn = Pn_base + len;
8090 
8091       if (i.is_register()) {
8092         sub(Rj, i.as_register(), len);
8093       } else {
8094         mov(Rj, i.as_constant());
8095         sub(Rj, Rj, len);
8096       }
8097       // Rj == i-len
8098 
8099       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
8100       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
8101       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
8102       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
8103 
8104       // Ra = *++Pa;
8105       // Rb = *--Pb;
8106       // Rm = *++Pm;
8107       // Rn = *--Pn;
8108       ldr(Ra, pre(Pa, wordSize));
8109       ldr(Rb, pre(Pb, -wordSize));
8110       ldr(Rm, pre(Pm, wordSize));
8111       ldr(Rn, pre(Pn, -wordSize));
8112 
8113       mov(Rhi_mn, zr);
8114       mov(Rlo_mn, zr);
8115     }
8116 
8117     void post2(RegisterOrConstant i, RegisterOrConstant len) {
8118       block_comment("post2");
8119       if (i.is_constant()) {
8120         mov(Rj, i.as_constant()-len.as_constant());
8121       } else {
8122         sub(Rj, i.as_register(), len);
8123       }
8124 
8125       adds(t0, t0, Rlo_mn); // The pending m*n, low part
8126 
8127       // As soon as we know the least significant digit of our result,
8128       // store it.
8129       // Pm_base[i-len] = t0;
8130       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
8131 
8132       // t0 = t1; t1 = t2; t2 = 0;
8133       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
8134       adc(t1, t2, zr);
8135       mov(t2, zr);
8136     }
8137 
8138     // A carry in t0 after Montgomery multiplication means that we
8139     // should subtract multiples of n from our result in m.  We'll
8140     // keep doing that until there is no carry.
8141     void normalize(RegisterOrConstant len) {
8142       block_comment("normalize");
8143       // while (t0)
8144       //   t0 = sub(Pm_base, Pn_base, t0, len);
8145       Label loop, post, again;
8146       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
8147       cbz(t0, post); {
8148         bind(again); {
8149           mov(i, zr);
8150           mov(cnt, len);
8151           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
8152           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
8153           subs(zr, zr, zr); // set carry flag, i.e. no borrow
8154           align(16);
8155           bind(loop); {
8156             sbcs(Rm, Rm, Rn);
8157             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
8158             add(i, i, 1);
8159             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
8160             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
8161             sub(cnt, cnt, 1);
8162           } cbnz(cnt, loop);
8163           sbc(t0, t0, zr);
8164         } cbnz(t0, again);
8165       } bind(post);
8166     }
8167 
8168     // Move memory at s to d, reversing words.
8169     //    Increments d to end of copied memory
8170     //    Destroys tmp1, tmp2
8171     //    Preserves len
8172     //    Leaves s pointing to the address which was in d at start
8173     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
8174       assert(tmp1->encoding() < r19->encoding(), "register corruption");
8175       assert(tmp2->encoding() < r19->encoding(), "register corruption");
8176 
8177       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
8178       mov(tmp1, len);
8179       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
8180       sub(s, d, len, ext::uxtw, LogBytesPerWord);
8181     }
8182     // where
8183     void reverse1(Register d, Register s, Register tmp) {
8184       ldr(tmp, pre(s, -wordSize));
8185       ror(tmp, tmp, 32);
8186       str(tmp, post(d, wordSize));
8187     }
8188 
8189     void step_squaring() {
8190       // An extra ACC
8191       step();
8192       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
8193     }
8194 
8195     void last_squaring(RegisterOrConstant i) {
8196       Label dont;
8197       // if ((i & 1) == 0) {
8198       tbnz(i.as_register(), 0, dont); {
8199         // MACC(Ra, Rb, t0, t1, t2);
8200         // Ra = *++Pa;
8201         // Rb = *--Pb;
8202         umulh(Rhi_ab, Ra, Rb);
8203         mul(Rlo_ab, Ra, Rb);
8204         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
8205       } bind(dont);
8206     }
8207 
8208     void extra_step_squaring() {
8209       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
8210 
8211       // MACC(Rm, Rn, t0, t1, t2);
8212       // Rm = *++Pm;
8213       // Rn = *--Pn;
8214       umulh(Rhi_mn, Rm, Rn);
8215       mul(Rlo_mn, Rm, Rn);
8216       ldr(Rm, pre(Pm, wordSize));
8217       ldr(Rn, pre(Pn, -wordSize));
8218     }
8219 
8220     void post1_squaring() {
8221       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
8222 
8223       // *Pm = Rm = t0 * inv;
8224       mul(Rm, t0, inv);
8225       str(Rm, Address(Pm));
8226 
8227       // MACC(Rm, Rn, t0, t1, t2);
8228       // t0 = t1; t1 = t2; t2 = 0;
8229       umulh(Rhi_mn, Rm, Rn);
8230 
8231 #ifndef PRODUCT
8232       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
8233       {
8234         mul(Rlo_mn, Rm, Rn);
8235         add(Rlo_mn, t0, Rlo_mn);
8236         Label ok;
8237         cbz(Rlo_mn, ok); {
8238           stop("broken Montgomery multiply");
8239         } bind(ok);
8240       }
8241 #endif
8242       // We have very carefully set things up so that
8243       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
8244       // the lower half of Rm * Rn because we know the result already:
8245       // it must be -t0.  t0 + (-t0) must generate a carry iff
8246       // t0 != 0.  So, rather than do a mul and an adds we just set
8247       // the carry flag iff t0 is nonzero.
8248       //
8249       // mul(Rlo_mn, Rm, Rn);
8250       // adds(zr, t0, Rlo_mn);
8251       subs(zr, t0, 1); // Set carry iff t0 is nonzero
8252       adcs(t0, t1, Rhi_mn);
8253       adc(t1, t2, zr);
8254       mov(t2, zr);
8255     }
8256 
8257     void acc(Register Rhi, Register Rlo,
8258              Register t0, Register t1, Register t2) {
8259       adds(t0, t0, Rlo);
8260       adcs(t1, t1, Rhi);
8261       adc(t2, t2, zr);
8262     }
8263 
8264   public:
8265     /**
8266      * Fast Montgomery multiplication.  The derivation of the
8267      * algorithm is in A Cryptographic Library for the Motorola
8268      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
8269      *
8270      * Arguments:
8271      *
8272      * Inputs for multiplication:
8273      *   c_rarg0   - int array elements a
8274      *   c_rarg1   - int array elements b
8275      *   c_rarg2   - int array elements n (the modulus)
8276      *   c_rarg3   - int length
8277      *   c_rarg4   - int inv
8278      *   c_rarg5   - int array elements m (the result)
8279      *
8280      * Inputs for squaring:
8281      *   c_rarg0   - int array elements a
8282      *   c_rarg1   - int array elements n (the modulus)
8283      *   c_rarg2   - int length
8284      *   c_rarg3   - int inv
8285      *   c_rarg4   - int array elements m (the result)
8286      *
8287      */
8288     address generate_multiply() {
8289       Label argh, nothing;
8290       bind(argh);
8291       stop("MontgomeryMultiply total_allocation must be <= 8192");
8292 
8293       align(CodeEntryAlignment);
8294       address entry = pc();
8295 
8296       cbzw(Rlen, nothing);
8297 
8298       enter();
8299 
8300       // Make room.
8301       cmpw(Rlen, 512);
8302       br(Assembler::HI, argh);
8303       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8304       andr(sp, Ra, -2 * wordSize);
8305 
8306       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8307 
8308       {
8309         // Copy input args, reversing as we go.  We use Ra as a
8310         // temporary variable.
8311         reverse(Ra, Pa_base, Rlen, t0, t1);
8312         if (!_squaring)
8313           reverse(Ra, Pb_base, Rlen, t0, t1);
8314         reverse(Ra, Pn_base, Rlen, t0, t1);
8315       }
8316 
8317       // Push all call-saved registers and also Pm_base which we'll need
8318       // at the end.
8319       save_regs();
8320 
8321 #ifndef PRODUCT
8322       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
8323       {
8324         ldr(Rn, Address(Pn_base, 0));
8325         mul(Rlo_mn, Rn, inv);
8326         subs(zr, Rlo_mn, -1);
8327         Label ok;
8328         br(EQ, ok); {
8329           stop("broken inverse in Montgomery multiply");
8330         } bind(ok);
8331       }
8332 #endif
8333 
8334       mov(Pm_base, Ra);
8335 
8336       mov(t0, zr);
8337       mov(t1, zr);
8338       mov(t2, zr);
8339 
8340       block_comment("for (int i = 0; i < len; i++) {");
8341       mov(Ri, zr); {
8342         Label loop, end;
8343         cmpw(Ri, Rlen);
8344         br(Assembler::GE, end);
8345 
8346         bind(loop);
8347         pre1(Ri);
8348 
8349         block_comment("  for (j = i; j; j--) {"); {
8350           movw(Rj, Ri);
8351           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
8352         } block_comment("  } // j");
8353 
8354         post1();
8355         addw(Ri, Ri, 1);
8356         cmpw(Ri, Rlen);
8357         br(Assembler::LT, loop);
8358         bind(end);
8359         block_comment("} // i");
8360       }
8361 
8362       block_comment("for (int i = len; i < 2*len; i++) {");
8363       mov(Ri, Rlen); {
8364         Label loop, end;
8365         cmpw(Ri, Rlen, Assembler::LSL, 1);
8366         br(Assembler::GE, end);
8367 
8368         bind(loop);
8369         pre2(Ri, Rlen);
8370 
8371         block_comment("  for (j = len*2-i-1; j; j--) {"); {
8372           lslw(Rj, Rlen, 1);
8373           subw(Rj, Rj, Ri);
8374           subw(Rj, Rj, 1);
8375           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
8376         } block_comment("  } // j");
8377 
8378         post2(Ri, Rlen);
8379         addw(Ri, Ri, 1);
8380         cmpw(Ri, Rlen, Assembler::LSL, 1);
8381         br(Assembler::LT, loop);
8382         bind(end);
8383       }
8384       block_comment("} // i");
8385 
8386       normalize(Rlen);
8387 
8388       mov(Ra, Pm_base);  // Save Pm_base in Ra
8389       restore_regs();  // Restore caller's Pm_base
8390 
8391       // Copy our result into caller's Pm_base
8392       reverse(Pm_base, Ra, Rlen, t0, t1);
8393 
8394       leave();
8395       bind(nothing);
8396       ret(lr);
8397 
8398       return entry;
8399     }
8400     // In C, approximately:
8401 
8402     // void
8403     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
8404     //                     julong Pn_base[], julong Pm_base[],
8405     //                     julong inv, int len) {
8406     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8407     //   julong *Pa, *Pb, *Pn, *Pm;
8408     //   julong Ra, Rb, Rn, Rm;
8409 
8410     //   int i;
8411 
8412     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8413 
8414     //   for (i = 0; i < len; i++) {
8415     //     int j;
8416 
8417     //     Pa = Pa_base;
8418     //     Pb = Pb_base + i;
8419     //     Pm = Pm_base;
8420     //     Pn = Pn_base + i;
8421 
8422     //     Ra = *Pa;
8423     //     Rb = *Pb;
8424     //     Rm = *Pm;
8425     //     Rn = *Pn;
8426 
8427     //     int iters = i;
8428     //     for (j = 0; iters--; j++) {
8429     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8430     //       MACC(Ra, Rb, t0, t1, t2);
8431     //       Ra = *++Pa;
8432     //       Rb = *--Pb;
8433     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8434     //       MACC(Rm, Rn, t0, t1, t2);
8435     //       Rm = *++Pm;
8436     //       Rn = *--Pn;
8437     //     }
8438 
8439     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
8440     //     MACC(Ra, Rb, t0, t1, t2);
8441     //     *Pm = Rm = t0 * inv;
8442     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8443     //     MACC(Rm, Rn, t0, t1, t2);
8444 
8445     //     assert(t0 == 0, "broken Montgomery multiply");
8446 
8447     //     t0 = t1; t1 = t2; t2 = 0;
8448     //   }
8449 
8450     //   for (i = len; i < 2*len; i++) {
8451     //     int j;
8452 
8453     //     Pa = Pa_base + i-len;
8454     //     Pb = Pb_base + len;
8455     //     Pm = Pm_base + i-len;
8456     //     Pn = Pn_base + len;
8457 
8458     //     Ra = *++Pa;
8459     //     Rb = *--Pb;
8460     //     Rm = *++Pm;
8461     //     Rn = *--Pn;
8462 
8463     //     int iters = len*2-i-1;
8464     //     for (j = i-len+1; iters--; j++) {
8465     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8466     //       MACC(Ra, Rb, t0, t1, t2);
8467     //       Ra = *++Pa;
8468     //       Rb = *--Pb;
8469     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8470     //       MACC(Rm, Rn, t0, t1, t2);
8471     //       Rm = *++Pm;
8472     //       Rn = *--Pn;
8473     //     }
8474 
8475     //     Pm_base[i-len] = t0;
8476     //     t0 = t1; t1 = t2; t2 = 0;
8477     //   }
8478 
8479     //   while (t0)
8480     //     t0 = sub(Pm_base, Pn_base, t0, len);
8481     // }
8482 
8483     /**
8484      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
8485      * multiplies than Montgomery multiplication so it should be up to
8486      * 25% faster.  However, its loop control is more complex and it
8487      * may actually run slower on some machines.
8488      *
8489      * Arguments:
8490      *
8491      * Inputs:
8492      *   c_rarg0   - int array elements a
8493      *   c_rarg1   - int array elements n (the modulus)
8494      *   c_rarg2   - int length
8495      *   c_rarg3   - int inv
8496      *   c_rarg4   - int array elements m (the result)
8497      *
8498      */
8499     address generate_square() {
8500       Label argh;
8501       bind(argh);
8502       stop("MontgomeryMultiply total_allocation must be <= 8192");
8503 
8504       align(CodeEntryAlignment);
8505       address entry = pc();
8506 
8507       enter();
8508 
8509       // Make room.
8510       cmpw(Rlen, 512);
8511       br(Assembler::HI, argh);
8512       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8513       andr(sp, Ra, -2 * wordSize);
8514 
8515       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8516 
8517       {
8518         // Copy input args, reversing as we go.  We use Ra as a
8519         // temporary variable.
8520         reverse(Ra, Pa_base, Rlen, t0, t1);
8521         reverse(Ra, Pn_base, Rlen, t0, t1);
8522       }
8523 
8524       // Push all call-saved registers and also Pm_base which we'll need
8525       // at the end.
8526       save_regs();
8527 
8528       mov(Pm_base, Ra);
8529 
8530       mov(t0, zr);
8531       mov(t1, zr);
8532       mov(t2, zr);
8533 
8534       block_comment("for (int i = 0; i < len; i++) {");
8535       mov(Ri, zr); {
8536         Label loop, end;
8537         bind(loop);
8538         cmp(Ri, Rlen);
8539         br(Assembler::GE, end);
8540 
8541         pre1(Ri);
8542 
8543         block_comment("for (j = (i+1)/2; j; j--) {"); {
8544           add(Rj, Ri, 1);
8545           lsr(Rj, Rj, 1);
8546           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8547         } block_comment("  } // j");
8548 
8549         last_squaring(Ri);
8550 
8551         block_comment("  for (j = i/2; j; j--) {"); {
8552           lsr(Rj, Ri, 1);
8553           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8554         } block_comment("  } // j");
8555 
8556         post1_squaring();
8557         add(Ri, Ri, 1);
8558         cmp(Ri, Rlen);
8559         br(Assembler::LT, loop);
8560 
8561         bind(end);
8562         block_comment("} // i");
8563       }
8564 
8565       block_comment("for (int i = len; i < 2*len; i++) {");
8566       mov(Ri, Rlen); {
8567         Label loop, end;
8568         bind(loop);
8569         cmp(Ri, Rlen, Assembler::LSL, 1);
8570         br(Assembler::GE, end);
8571 
8572         pre2(Ri, Rlen);
8573 
8574         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
8575           lsl(Rj, Rlen, 1);
8576           sub(Rj, Rj, Ri);
8577           sub(Rj, Rj, 1);
8578           lsr(Rj, Rj, 1);
8579           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8580         } block_comment("  } // j");
8581 
8582         last_squaring(Ri);
8583 
8584         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
8585           lsl(Rj, Rlen, 1);
8586           sub(Rj, Rj, Ri);
8587           lsr(Rj, Rj, 1);
8588           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8589         } block_comment("  } // j");
8590 
8591         post2(Ri, Rlen);
8592         add(Ri, Ri, 1);
8593         cmp(Ri, Rlen, Assembler::LSL, 1);
8594 
8595         br(Assembler::LT, loop);
8596         bind(end);
8597         block_comment("} // i");
8598       }
8599 
8600       normalize(Rlen);
8601 
8602       mov(Ra, Pm_base);  // Save Pm_base in Ra
8603       restore_regs();  // Restore caller's Pm_base
8604 
8605       // Copy our result into caller's Pm_base
8606       reverse(Pm_base, Ra, Rlen, t0, t1);
8607 
8608       leave();
8609       ret(lr);
8610 
8611       return entry;
8612     }
8613     // In C, approximately:
8614 
8615     // void
8616     // montgomery_square(julong Pa_base[], julong Pn_base[],
8617     //                   julong Pm_base[], julong inv, int len) {
8618     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8619     //   julong *Pa, *Pb, *Pn, *Pm;
8620     //   julong Ra, Rb, Rn, Rm;
8621 
8622     //   int i;
8623 
8624     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8625 
8626     //   for (i = 0; i < len; i++) {
8627     //     int j;
8628 
8629     //     Pa = Pa_base;
8630     //     Pb = Pa_base + i;
8631     //     Pm = Pm_base;
8632     //     Pn = Pn_base + i;
8633 
8634     //     Ra = *Pa;
8635     //     Rb = *Pb;
8636     //     Rm = *Pm;
8637     //     Rn = *Pn;
8638 
8639     //     int iters = (i+1)/2;
8640     //     for (j = 0; iters--; j++) {
8641     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8642     //       MACC2(Ra, Rb, t0, t1, t2);
8643     //       Ra = *++Pa;
8644     //       Rb = *--Pb;
8645     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8646     //       MACC(Rm, Rn, t0, t1, t2);
8647     //       Rm = *++Pm;
8648     //       Rn = *--Pn;
8649     //     }
8650     //     if ((i & 1) == 0) {
8651     //       assert(Ra == Pa_base[j], "must be");
8652     //       MACC(Ra, Ra, t0, t1, t2);
8653     //     }
8654     //     iters = i/2;
8655     //     assert(iters == i-j, "must be");
8656     //     for (; iters--; j++) {
8657     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8658     //       MACC(Rm, Rn, t0, t1, t2);
8659     //       Rm = *++Pm;
8660     //       Rn = *--Pn;
8661     //     }
8662 
8663     //     *Pm = Rm = t0 * inv;
8664     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8665     //     MACC(Rm, Rn, t0, t1, t2);
8666 
8667     //     assert(t0 == 0, "broken Montgomery multiply");
8668 
8669     //     t0 = t1; t1 = t2; t2 = 0;
8670     //   }
8671 
8672     //   for (i = len; i < 2*len; i++) {
8673     //     int start = i-len+1;
8674     //     int end = start + (len - start)/2;
8675     //     int j;
8676 
8677     //     Pa = Pa_base + i-len;
8678     //     Pb = Pa_base + len;
8679     //     Pm = Pm_base + i-len;
8680     //     Pn = Pn_base + len;
8681 
8682     //     Ra = *++Pa;
8683     //     Rb = *--Pb;
8684     //     Rm = *++Pm;
8685     //     Rn = *--Pn;
8686 
8687     //     int iters = (2*len-i-1)/2;
8688     //     assert(iters == end-start, "must be");
8689     //     for (j = start; iters--; j++) {
8690     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8691     //       MACC2(Ra, Rb, t0, t1, t2);
8692     //       Ra = *++Pa;
8693     //       Rb = *--Pb;
8694     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8695     //       MACC(Rm, Rn, t0, t1, t2);
8696     //       Rm = *++Pm;
8697     //       Rn = *--Pn;
8698     //     }
8699     //     if ((i & 1) == 0) {
8700     //       assert(Ra == Pa_base[j], "must be");
8701     //       MACC(Ra, Ra, t0, t1, t2);
8702     //     }
8703     //     iters =  (2*len-i)/2;
8704     //     assert(iters == len-j, "must be");
8705     //     for (; iters--; j++) {
8706     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8707     //       MACC(Rm, Rn, t0, t1, t2);
8708     //       Rm = *++Pm;
8709     //       Rn = *--Pn;
8710     //     }
8711     //     Pm_base[i-len] = t0;
8712     //     t0 = t1; t1 = t2; t2 = 0;
8713     //   }
8714 
8715     //   while (t0)
8716     //     t0 = sub(Pm_base, Pn_base, t0, len);
8717     // }
8718   };
8719 
8720   void generate_vector_math_stubs() {
8721     // Get native vector math stub routine addresses
8722     void* libsleef = nullptr;
8723     char ebuf[1024];
8724     char dll_name[JVM_MAXPATHLEN];
8725     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
8726       libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
8727     }
8728     if (libsleef == nullptr) {
8729       log_info(library)("Failed to load native vector math library, %s!", ebuf);
8730       return;
8731     }
8732     // Method naming convention
8733     //   All the methods are named as <OP><T><N>_<U><suffix>
8734     //   Where:
8735     //     <OP>     is the operation name, e.g. sin
    //     <T>      optionally indicates the element type
    //              "f"/"d" for vector float/double operations
8738     //     <N>      is the number of elements in the vector
8739     //              "2/4" for neon, and "x" for sve
8740     //     <U>      is the precision level
8741     //              "u10/u05" represents 1.0/0.5 ULP error bounds
8742     //               We use "u10" for all operations by default
    //               But for those functions that do not have u10 support, we use "u05" instead
8744     //     <suffix> indicates neon/sve
8745     //              "sve/advsimd" for sve/neon implementations
8746     //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions
    //          cosd2_u10advsimd is the method for computing a 2-element vector double cos using NEON instructions
8748     //
8749     log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
8750 
8751     // Math vector stubs implemented with SVE for scalable vector size.
8752     if (UseSVE > 0) {
8753       for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
8754         int vop = VectorSupport::VECTOR_OP_MATH_START + op;
8755         // Skip "tanh" because there is performance regression
8756         if (vop == VectorSupport::VECTOR_OP_TANH) {
8757           continue;
8758         }
8759 
8760         // The native library does not support u10 level of "hypot".
8761         const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
8762 
8763         snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
8764         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
8765 
8766         snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
8767         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
8768       }
8769     }
8770 
    // Math vector stubs implemented with NEON for 64/128-bit vector sizes.
8772     for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
8773       int vop = VectorSupport::VECTOR_OP_MATH_START + op;
8774       // Skip "tanh" because there is performance regression
8775       if (vop == VectorSupport::VECTOR_OP_TANH) {
8776         continue;
8777       }
8778 
8779       // The native library does not support u10 level of "hypot".
8780       const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
8781 
8782       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
8783       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
8784 
8785       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
8786       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
8787 
8788       snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
8789       StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
8790     }
8791   }
8792 
8793   // Initialization
8794   void generate_initial_stubs() {
    // Generate the initial stubs and initialize the entry points.
8796 
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
8802 
8803     StubRoutines::_forward_exception_entry = generate_forward_exception();
8804 
8805     StubRoutines::_call_stub_entry =
8806       generate_call_stub(StubRoutines::_call_stub_return_address);
8807 
    // Referenced by megamorphic calls.
8809     StubRoutines::_catch_exception_entry = generate_catch_exception();
8810 
8811     // Initialize table for copy memory (arraycopy) check.
8812     if (UnsafeMemoryAccess::_table == nullptr) {
8813       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
8814     }
8815 
8816     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stub that uses it.
8818       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8819       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8820     }
8821 
8822     if (UseCRC32CIntrinsics) {
8823       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8824     }
8825 
8826     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8827       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8828     }
8829 
8830     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8831       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8832     }
8833 
8834     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
8835         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
8836       StubRoutines::_hf2f = generate_float16ToFloat();
8837       StubRoutines::_f2hf = generate_floatToFloat16();
8838     }
8839   }
8840 
8841   void generate_continuation_stubs() {
8842     // Continuation stubs:
8843     StubRoutines::_cont_thaw          = generate_cont_thaw();
8844     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8845     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8846     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
8847   }
8848 
8849   void generate_final_stubs() {
8850     // support for verify_oop (must happen after universe_init)
8851     if (VerifyOops) {
8852       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8853     }
8854 
8855     // arraycopy stubs used by compilers
8856     generate_arraycopy_stubs();
8857 
8858     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8859     if (bs_nm != nullptr) {
8860       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
8861     }
8862 
8863     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8864 
8865     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
8866     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
8867 
8868 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8869 
8870     generate_atomic_entry_points();
8871 
#endif // LINUX && !__ARM_FEATURE_ATOMICS
8873 
8874 #ifdef COMPILER2
8875     if (UseSecondarySupersTable) {
8876       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
8878         generate_lookup_secondary_supers_table_stub();
8879       }
8880     }
8881 #endif
8882 
    StubRoutines::aarch64::set_completed(); // Indicate that the arraycopy and zero_blocks stubs have been generated
8884   }
8885 
8886   void generate_compiler_stubs() {
8887 #if COMPILER2_OR_JVMCI
8888 
8889     if (UseSVE == 0) {
8890       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
8891     }
8892 
8893     // array equals stub for large arrays.
8894     if (!UseSimpleArrayEquals) {
8895       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8896     }
8897 
    // arrays_hashcode stubs for large arrays.
8899     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
8900     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
8901     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
8902     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
8903     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
8904 
8905     // byte_array_inflate stub for large arrays.
8906     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8907 
8908     // countPositives stub for large arrays.
8909     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8910 
8911     generate_compare_long_strings();
8912 
8913     generate_string_indexof_stubs();
8914 
8915 #ifdef COMPILER2
8916     if (UseMultiplyToLenIntrinsic) {
8917       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8918     }
8919 
8920     if (UseSquareToLenIntrinsic) {
8921       StubRoutines::_squareToLen = generate_squareToLen();
8922     }
8923 
8924     if (UseMulAddIntrinsic) {
8925       StubRoutines::_mulAdd = generate_mulAdd();
8926     }
8927 
8928     if (UseSIMDForBigIntegerShiftIntrinsics) {
8929       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8930       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8931     }
8932 
8933     if (UseMontgomeryMultiplyIntrinsic) {
8934       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
8935       StubCodeMark mark(this, stub_id);
8936       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8937       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8938     }
8939 
8940     if (UseMontgomerySquareIntrinsic) {
8941       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
8942       StubCodeMark mark(this, stub_id);
8943       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8944       // We use generate_multiply() rather than generate_square()
8945       // because it's faster for the sizes of modulus we care about.
8946       StubRoutines::_montgomerySquare = g.generate_multiply();
8947     }
8948 
8949     generate_vector_math_stubs();
8950 
8951 #endif // COMPILER2
8952 
8953     if (UseChaCha20Intrinsics) {
8954       StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
8955     }
8956 
8957     if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8960     }
8961 
8962     // data cache line writeback
8963     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8964     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8965 
8966     if (UseAESIntrinsics) {
8967       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8968       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8969       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8970       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8971       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8972     }
8973     if (UseGHASHIntrinsics) {
8974       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8975       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8976     }
8977     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8978       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8979     }
8980 
8981     if (UseMD5Intrinsics) {
8982       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
8983       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
8984     }
8985     if (UseSHA1Intrinsics) {
8986       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
8987       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
8988     }
8989     if (UseSHA256Intrinsics) {
8990       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
8991       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
8992     }
8993     if (UseSHA512Intrinsics) {
8994       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
8995       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
8996     }
8997     if (UseSHA3Intrinsics) {
8998       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
8999       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
9000     }
9001 
9002     if (UsePoly1305Intrinsics) {
9003       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
9004     }
9005 
9006     // generate Adler32 intrinsics code
9007     if (UseAdler32Intrinsics) {
9008       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
9009     }
9010 
9011 #endif // COMPILER2_OR_JVMCI
9012   }
9013 
9014  public:
9015   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
9016     switch(blob_id) {
9017     case initial_id:
9018       generate_initial_stubs();
9019       break;
    case continuation_id:
9021       generate_continuation_stubs();
9022       break;
9023     case compiler_id:
9024       generate_compiler_stubs();
9025       break;
9026     case final_id:
9027       generate_final_stubs();
9028       break;
9029     default:
9030       fatal("unexpected blob id: %d", blob_id);
9031       break;
9032     };
9033   }
9034 }; // end class declaration
9035 
9036 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
9037   StubGenerator g(code, blob_id);
9038 }
9039 
9040 
9041 #if defined (LINUX)
9042 
9043 // Define pointers to atomic stubs and initialize them to point to the
9044 // code in atomic_aarch64.S.
9045 
9046 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
9047   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
9048     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
9049   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
9050     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
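//
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to roughly:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// i.e. each use declares the default implementation from atomic_aarch64.S
// and a function pointer, initialized to it, which the generated atomic
// entry points can later replace.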
9051 
9052 DEFAULT_ATOMIC_OP(fetch_add, 4, )
9053 DEFAULT_ATOMIC_OP(fetch_add, 8, )
9054 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
9055 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
9056 DEFAULT_ATOMIC_OP(xchg, 4, )
9057 DEFAULT_ATOMIC_OP(xchg, 8, )
9058 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
9059 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
9060 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
9061 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
9062 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
9063 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
9064 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
9065 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
9066 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
9067 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
9068 
9069 #undef DEFAULT_ATOMIC_OP
9070 
9071 #endif // LINUX