1 /*
   2  * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "runtime/atomic.hpp"
  45 #include "runtime/continuation.hpp"
  46 #include "runtime/continuationEntry.inline.hpp"
  47 #include "runtime/frame.inline.hpp"
  48 #include "runtime/handles.inline.hpp"
  49 #include "runtime/javaThread.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubCodeGenerator.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "utilities/align.hpp"
  54 #include "utilities/globalDefinitions.hpp"
  55 #include "utilities/powerOfTwo.hpp"
  56 #ifdef COMPILER2
  57 #include "opto/runtime.hpp"
  58 #endif
  59 #if INCLUDE_ZGC
  60 #include "gc/z/zThreadLocalData.hpp"
  61 #endif
  62 
  63 // Declaration and definition of StubGenerator (no .hpp file).
  64 // For a more detailed description of the stub routine structure
  65 // see the comment in stubRoutines.hpp
  66 
  67 #undef __
  68 #define __ _masm->
  69 
  70 #ifdef PRODUCT
  71 #define BLOCK_COMMENT(str) /* nothing */
  72 #else
  73 #define BLOCK_COMMENT(str) __ block_comment(str)
  74 #endif
  75 
  76 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  77 
  78 // Stub Code definitions
  79 
  80 class StubGenerator: public StubCodeGenerator {
  81  private:
  82 
  83 #ifdef PRODUCT
  84 #define inc_counter_np(counter) ((void)0)
  85 #else
  86   void inc_counter_np_(int& counter) {
  87     __ lea(rscratch2, ExternalAddress((address)&counter));
  88     __ ldrw(rscratch1, Address(rscratch2));
  89     __ addw(rscratch1, rscratch1, 1);
  90     __ strw(rscratch1, Address(rscratch2));
  91   }
  92 #define inc_counter_np(counter) \
  93   BLOCK_COMMENT("inc_counter " #counter); \
  94   inc_counter_np_(counter);
  95 #endif
  96 
  97   // Call stubs are used to call Java from C
  98   //
  99   // Arguments:
 100   //    c_rarg0:   call wrapper address                   address
 101   //    c_rarg1:   result                                 address
 102   //    c_rarg2:   result type                            BasicType
 103   //    c_rarg3:   method                                 Method*
 104   //    c_rarg4:   (interpreter) entry point              address
 105   //    c_rarg5:   parameters                             intptr_t*
 106   //    c_rarg6:   parameter size (in words)              int
 107   //    c_rarg7:   thread                                 Thread*
 108   //
 109   // The stub does not return a value: any Java result is written
 110   // through the result address (c_rarg1)
 111   //
 112   // we save r30 (lr) as the return PC at the base of the frame and
 113   // link r29 (fp) below it as the frame pointer, then install the
 114   // new sp (r31) value into fp.
 115   //
 116   // we save r0-r7, which accounts for all the c arguments.
 117   //
 118   // TODO: strictly do we need to save them all? they are treated as
 119   // volatile by C so could we omit saving the ones we are going to
 120   // place in global registers (thread? method?) or those we only use
 121   // during setup of the Java call?
 122   //
 123   // we don't need to save r8 which C uses as an indirect result location
 124   // return register.
 125   //
 126   // we don't need to save r9-r15 which both C and Java treat as
 127   // volatile
 128   //
 129   // we don't need to save r16-18 because Java does not use them
 130   //
 131   // we save r19-r28 which Java uses as scratch registers and C
 132   // expects to be callee-save
 133   //
 134   // we save the bottom 64 bits of each value stored in v8-v15; it is
 135   // the responsibility of the caller to preserve larger values.
 136   //
 137   // so the stub frame looks like this when we enter Java code
 138   //
 139   //     [ return_from_Java     ] <--- sp
 140   //     [ argument word n      ]
 141   //      ...
 142   // -27 [ argument word 1      ]
 143   // -26 [ saved v15            ] <--- sp_after_call
 144   // -25 [ saved v14            ]
 145   // -24 [ saved v13            ]
 146   // -23 [ saved v12            ]
 147   // -22 [ saved v11            ]
 148   // -21 [ saved v10            ]
 149   // -20 [ saved v9             ]
 150   // -19 [ saved v8             ]
 151   // -18 [ saved r28            ]
 152   // -17 [ saved r27            ]
 153   // -16 [ saved r26            ]
 154   // -15 [ saved r25            ]
 155   // -14 [ saved r24            ]
 156   // -13 [ saved r23            ]
 157   // -12 [ saved r22            ]
 158   // -11 [ saved r21            ]
 159   // -10 [ saved r20            ]
 160   //  -9 [ saved r19            ]
 161   //  -8 [ call wrapper    (r0) ]
 162   //  -7 [ result          (r1) ]
 163   //  -6 [ result type     (r2) ]
 164   //  -5 [ method          (r3) ]
 165   //  -4 [ entry point     (r4) ]
 166   //  -3 [ parameters      (r5) ]
 167   //  -2 [ parameter size  (r6) ]
 168   //  -1 [ thread (r7)          ]
 169   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 170   //   1 [ saved lr       (r30) ]
 171 
 172   // Call stub stack layout word offsets from fp
 173   enum call_stub_layout {
 174     sp_after_call_off = -26,
 175 
 176     d15_off            = -26,
 177     d13_off            = -24,
 178     d11_off            = -22,
 179     d9_off             = -20,
 180 
 181     r28_off            = -18,
 182     r26_off            = -16,
 183     r24_off            = -14,
 184     r22_off            = -12,
 185     r20_off            = -10,
 186     call_wrapper_off   =  -8,
 187     result_off         =  -7,
 188     result_type_off    =  -6,
 189     method_off         =  -5,
 190     entry_point_off    =  -4,
 191     parameter_size_off =  -2,
 192     thread_off         =  -1,
 193     fp_f               =   0,
 194     retaddr_off        =   1,
 195   };
 196 
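       // For orientation: a rough sketch of how the C++ side reaches this stub.
       // It is invoked through the CallStub function pointer type (see
       // stubRoutines.hpp and its use in javaCalls.cpp); the argument names
       // below are illustrative only, not the exact VM code:
       //
       //   StubRoutines::call_stub()(
       //       (address)&link,              // c_rarg0: call wrapper
       //       result_val_address,          // c_rarg1: where to store the result
       //       result_type,                 // c_rarg2: BasicType of the result
       //       method(),                    // c_rarg3: Method*
       //       entry_point,                 // c_rarg4: (interpreter) entry point
       //       args->parameters(),          // c_rarg5: parameter words
       //       args->size_of_parameters(),  // c_rarg6: parameter count in words
       //       thread);                     // c_rarg7: current thread
       //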
 197   address generate_call_stub(address& return_address) {
 198     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 199            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 200            "adjust this code");
 201 
 202     StubCodeMark mark(this, "StubRoutines", "call_stub");
 203     address start = __ pc();
 204 
 205     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 206 
 207     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 208     const Address result        (rfp, result_off         * wordSize);
 209     const Address result_type   (rfp, result_type_off    * wordSize);
 210     const Address method        (rfp, method_off         * wordSize);
 211     const Address entry_point   (rfp, entry_point_off    * wordSize);
 212     const Address parameter_size(rfp, parameter_size_off * wordSize);
 213 
 214     const Address thread        (rfp, thread_off         * wordSize);
 215 
 216     const Address d15_save      (rfp, d15_off * wordSize);
 217     const Address d13_save      (rfp, d13_off * wordSize);
 218     const Address d11_save      (rfp, d11_off * wordSize);
 219     const Address d9_save       (rfp, d9_off * wordSize);
 220 
 221     const Address r28_save      (rfp, r28_off * wordSize);
 222     const Address r26_save      (rfp, r26_off * wordSize);
 223     const Address r24_save      (rfp, r24_off * wordSize);
 224     const Address r22_save      (rfp, r22_off * wordSize);
 225     const Address r20_save      (rfp, r20_off * wordSize);
 226 
 227     // stub code
 228 
 229     address aarch64_entry = __ pc();
 230 
 231     // set up frame and move sp to end of save area
 232     __ enter();
 233     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 234 
 235     // save register parameters and Java scratch/global registers
 236     // n.b. we save thread even though it gets installed in
 237     // rthread because we want to sanity check rthread later
 238     __ str(c_rarg7,  thread);
 239     __ strw(c_rarg6, parameter_size);
 240     __ stp(c_rarg4, c_rarg5,  entry_point);
 241     __ stp(c_rarg2, c_rarg3,  result_type);
 242     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 243 
 244     __ stp(r20, r19,   r20_save);
 245     __ stp(r22, r21,   r22_save);
 246     __ stp(r24, r23,   r24_save);
 247     __ stp(r26, r25,   r26_save);
 248     __ stp(r28, r27,   r28_save);
 249 
 250     __ stpd(v9,  v8,   d9_save);
 251     __ stpd(v11, v10,  d11_save);
 252     __ stpd(v13, v12,  d13_save);
 253     __ stpd(v15, v14,  d15_save);
 254 
 255     // install Java thread in global register now we have saved
 256     // whatever value it held
 257     __ mov(rthread, c_rarg7);
 258     // And method
 259     __ mov(rmethod, c_rarg3);
 260 
 261     // set up the heapbase register
 262     __ reinit_heapbase();
 263 
 264 #ifdef ASSERT
 265     // make sure we have no pending exceptions
 266     {
 267       Label L;
 268       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 269       __ cmp(rscratch1, (u1)NULL_WORD);
 270       __ br(Assembler::EQ, L);
 271       __ stop("StubRoutines::call_stub: entered with pending exception");
 272       __ BIND(L);
 273     }
 274 #endif
 275     // pass parameters if any
 276     __ mov(esp, sp);
 277     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 278     __ andr(sp, rscratch1, -2 * wordSize);
 279 
 280     BLOCK_COMMENT("pass parameters if any");
 281     Label parameters_done;
 282     // parameter count is still in c_rarg6
 283     // and parameter pointer identifying param 1 is in c_rarg5
 284     __ cbzw(c_rarg6, parameters_done);
 285 
 286     address loop = __ pc();
 287     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 288     __ subsw(c_rarg6, c_rarg6, 1);
 289     __ push(rscratch1);
 290     __ br(Assembler::GT, loop);
 291 
 292     __ BIND(parameters_done);
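
         // In C-like terms, the parameter loop above amounts to (illustrative
         // sketch; 'nparams' and 'params' stand for c_rarg6 and c_rarg5, and the
         // pushes go through esp, which was just set from sp above):
         //
         //   while (nparams-- > 0) {
         //     *--esp = *params++;   // parameter 1 ends up at the highest address,
         //   }                       // matching the frame diagram above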
 293 
 294     // call Java entry -- passing Method* and current sp
 295     //      rmethod: Method*
 296     //      r19_sender_sp: sender sp
 297     BLOCK_COMMENT("call Java function");
 298     __ mov(r19_sender_sp, sp);
 299     __ blr(c_rarg4);
 300 
 301     // we do this here because the notify will already have been done
 302     // if we get to the next instruction via an exception
 303     //
 304     // n.b. adding this instruction here affects the calculation of
 305     // whether or not a routine returns to the call stub (used when
 306     // doing stack walks) since the normal test is to check the return
 307     // pc against the address saved below. so we may need to allow for
 308     // this extra instruction in the check.
 309 
 310     // save current address for use by exception handling code
 311 
 312     return_address = __ pc();
 313 
 314     // store result depending on type (everything that is not
 315     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 316     // n.b. this assumes Java returns an integral result in r0
 317     // and a floating result in j_farg0
 318     __ ldr(j_rarg2, result);
 319     Label is_long, is_float, is_double, exit;
 320     __ ldr(j_rarg1, result_type);
 321     __ cmp(j_rarg1, (u1)T_OBJECT);
 322     __ br(Assembler::EQ, is_long);
 323     __ cmp(j_rarg1, (u1)T_LONG);
 324     __ br(Assembler::EQ, is_long);
 325     __ cmp(j_rarg1, (u1)T_FLOAT);
 326     __ br(Assembler::EQ, is_float);
 327     __ cmp(j_rarg1, (u1)T_DOUBLE);
 328     __ br(Assembler::EQ, is_double);
 329 
 330     // handle T_INT case
 331     __ strw(r0, Address(j_rarg2));
 332 
 333     __ BIND(exit);
 334 
 335     // pop parameters
 336     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 337 
 338 #ifdef ASSERT
 339     // verify that threads correspond
 340     {
 341       Label L, S;
 342       __ ldr(rscratch1, thread);
 343       __ cmp(rthread, rscratch1);
 344       __ br(Assembler::NE, S);
 345       __ get_thread(rscratch1);
 346       __ cmp(rthread, rscratch1);
 347       __ br(Assembler::EQ, L);
 348       __ BIND(S);
 349       __ stop("StubRoutines::call_stub: threads must correspond");
 350       __ BIND(L);
 351     }
 352 #endif
 353 
 354     __ pop_cont_fastpath(rthread);
 355 
 356     // restore callee-save registers
 357     __ ldpd(v15, v14,  d15_save);
 358     __ ldpd(v13, v12,  d13_save);
 359     __ ldpd(v11, v10,  d11_save);
 360     __ ldpd(v9,  v8,   d9_save);
 361 
 362     __ ldp(r28, r27,   r28_save);
 363     __ ldp(r26, r25,   r26_save);
 364     __ ldp(r24, r23,   r24_save);
 365     __ ldp(r22, r21,   r22_save);
 366     __ ldp(r20, r19,   r20_save);
 367 
 368     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 369     __ ldrw(c_rarg2, result_type);
 370     __ ldr(c_rarg3,  method);
 371     __ ldp(c_rarg4, c_rarg5,  entry_point);
 372     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 373 
 374     // leave frame and return to caller
 375     __ leave();
 376     __ ret(lr);
 377 
 378     // handle return types different from T_INT
 379 
 380     __ BIND(is_long);
 381     __ str(r0, Address(j_rarg2, 0));
 382     __ br(Assembler::AL, exit);
 383 
 384     __ BIND(is_float);
 385     __ strs(j_farg0, Address(j_rarg2, 0));
 386     __ br(Assembler::AL, exit);
 387 
 388     __ BIND(is_double);
 389     __ strd(j_farg0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
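
         // The result_type dispatch and the is_long/is_float/is_double handlers
         // above amount to (sketch only; 'result' is the saved c_rarg1):
         //
         //   switch (result_type) {
         //     case T_OBJECT:
         //     case T_LONG:   *(jlong*)result   = r0;       break;
         //     case T_FLOAT:  *(jfloat*)result  = j_farg0;  break;
         //     case T_DOUBLE: *(jdouble*)result = j_farg0;  break;
         //     default:       *(jint*)result    = (jint)r0; break; // treated as T_INT
         //   }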
 391 
 392     return start;
 393   }
 394 
 395   // Return point for a Java call if there's an exception thrown in
 396   // Java code.  The exception is caught and transformed into a
 397   // pending exception stored in JavaThread that can be tested from
 398   // within the VM.
 399   //
 400   // Note: Usually the parameters are removed by the callee. In case
 401   // of an exception crossing an activation frame boundary, that is
 402   // not the case if the callee is compiled code => need to set up the
 403   // sp.
 404   //
 405   // r0: exception oop
 406 
 407   address generate_catch_exception() {
 408     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 409     address start = __ pc();
 410 
 411     // same as in generate_call_stub():
 412     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 413     const Address thread        (rfp, thread_off         * wordSize);
 414 
 415 #ifdef ASSERT
 416     // verify that threads correspond
 417     {
 418       Label L, S;
 419       __ ldr(rscratch1, thread);
 420       __ cmp(rthread, rscratch1);
 421       __ br(Assembler::NE, S);
 422       __ get_thread(rscratch1);
 423       __ cmp(rthread, rscratch1);
 424       __ br(Assembler::EQ, L);
 425       __ bind(S);
 426       __ stop("StubRoutines::catch_exception: threads must correspond");
 427       __ bind(L);
 428     }
 429 #endif
 430 
 431     // set pending exception
 432     __ verify_oop(r0);
 433 
 434     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 435     __ mov(rscratch1, (address)__FILE__);
 436     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 437     __ movw(rscratch1, (int)__LINE__);
 438     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 439 
 440     // complete return to VM
 441     assert(StubRoutines::_call_stub_return_address != nullptr,
 442            "_call_stub_return_address must have been generated before");
 443     __ b(StubRoutines::_call_stub_return_address);
 444 
 445     return start;
 446   }
 447 
 448   // Continuation point for runtime calls returning with a pending
 449   // exception.  The pending exception check happened in the runtime
 450   // or native call stub.  The pending exception in Thread is
 451   // converted into a Java-level exception.
 452   //
 453   // Contract with Java-level exception handlers:
 454   // r0: exception
 455   // r3: throwing pc
 456   //
 457   // NOTE: At entry of this stub, exception-pc must be in LR !!
 458 
 459   // NOTE: this is always used as a jump target within generated code
 460   // so it just needs to be generated code with no prolog
 461 
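       // In outline, the stub below does (a sketch of the code that follows):
       //
       //   address handler =
       //     SharedRuntime::exception_handler_for_return_address(thread, lr);
       //   r0 = thread->pending_exception();   // exception oop
       //   clear thread->pending_exception
       //   r3 = lr;                            // throwing pc
       //   jump to handler
       //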
 462   address generate_forward_exception() {
 463     StubCodeMark mark(this, "StubRoutines", "forward exception");
 464     address start = __ pc();
 465 
 466     // Upon entry, LR points to the return address returning into
 467     // Java (interpreted or compiled) code; i.e., the return address
 468     // becomes the throwing pc.
 469     //
 470     // Arguments pushed before the runtime call are still on the stack
 471     // but the exception handler will reset the stack pointer ->
 472     // ignore them.  A potential result in registers can be ignored as
 473     // well.
 474 
 475 #ifdef ASSERT
 476     // make sure this code is only executed if there is a pending exception
 477     {
 478       Label L;
 479       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 480       __ cbnz(rscratch1, L);
 481       __ stop("StubRoutines::forward exception: no pending exception (1)");
 482       __ bind(L);
 483     }
 484 #endif
 485 
 486     // compute exception handler into r19
 487 
 488     // call the VM to find the handler address associated with the
 489     // caller address. pass thread in r0 and caller pc (ret address)
 490     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 491     // the stack.
 492     __ mov(c_rarg1, lr);
 493     // lr will be trashed by the VM call so we move it to R19
 494     // (callee-saved) because we also need to pass it to the handler
 495     // returned by this call.
 496     __ mov(r19, lr);
 497     BLOCK_COMMENT("call exception_handler_for_return_address");
 498     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 499                          SharedRuntime::exception_handler_for_return_address),
 500                     rthread, c_rarg1);
 501     // Reinitialize the ptrue predicate register, in case the external runtime
 502     // call clobbers ptrue reg, as we may return to SVE compiled code.
 503     __ reinitialize_ptrue();
 504 
 505     // we should not really care that lr is no longer the callee
 506     // address. we saved the value the handler needs in r19 so we can
 507     // just copy it to r3. however, the C2 handler will push its own
 508     // frame and then call into the VM, and the VM code asserts that
 509     // the PC for the frame above the handler belongs to a compiled
 510     // Java method. So, we restore lr here to satisfy that assert.
 511     __ mov(lr, r19);
 512     // setup r0 & r3 & clear pending exception
 513     __ mov(r3, r19);
 514     __ mov(r19, r0);
 515     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 516     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 517 
 518 #ifdef ASSERT
 519     // make sure exception is set
 520     {
 521       Label L;
 522       __ cbnz(r0, L);
 523       __ stop("StubRoutines::forward exception: no pending exception (2)");
 524       __ bind(L);
 525     }
 526 #endif
 527 
 528     // continue at exception handler
 529     // r0: exception
 530     // r3: throwing pc
 531     // r19: exception handler
 532     __ verify_oop(r0);
 533     __ br(r19);
 534 
 535     return start;
 536   }
 537 
 538   // Non-destructive plausibility checks for oops
 539   //
 540   // Arguments:
 541   //    r0: oop to verify
 542   //    rscratch1: error message
 543   //
 544   // Stack after saving c_rarg3:
 545   //    [tos + 0]: saved c_rarg3
 546   //    [tos + 1]: saved c_rarg2
 547   //    [tos + 2]: saved lr
 548   //    [tos + 3]: saved rscratch2
 549   //    [tos + 4]: saved r0
 550   //    [tos + 5]: saved rscratch1
 551   address generate_verify_oop() {
 552 
 553     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 554     address start = __ pc();
 555 
 556     Label exit, error;
 557 
 558     // save c_rarg2 and c_rarg3
 559     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 560 
 561     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 562     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 563     __ ldr(c_rarg3, Address(c_rarg2));
 564     __ add(c_rarg3, c_rarg3, 1);
 565     __ str(c_rarg3, Address(c_rarg2));
 566 
 567     // object is in r0
 568     // make sure object is 'reasonable'
 569     __ cbz(r0, exit); // if obj is null it is OK
 570 
 571     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 572     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 573 
 574     // return if everything seems ok
 575     __ bind(exit);
 576 
 577     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 578     __ ret(lr);
 579 
 580     // handle errors
 581     __ bind(error);
 582     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 583 
 584     __ push(RegSet::range(r0, r29), sp);
 585     // debug(char* msg, int64_t pc, int64_t regs[])
 586     __ mov(c_rarg0, rscratch1);      // pass address of error message
 587     __ mov(c_rarg1, lr);             // pass return address
 588     __ mov(c_rarg2, sp);             // pass address of regs on stack
 589 #ifndef PRODUCT
 590     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 591 #endif
 592     BLOCK_COMMENT("call MacroAssembler::debug");
 593     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 594     __ blr(rscratch1);
 595     __ hlt(0);
 596 
 597     return start;
 598   }
 599 
 600   // Generate indices for iota vector.
 601   address generate_iota_indices(const char *stub_name) {
 602     __ align(CodeEntryAlignment);
 603     StubCodeMark mark(this, "StubRoutines", stub_name);
 604     address start = __ pc();
 605     // B
 606     __ emit_data64(0x0706050403020100, relocInfo::none);
 607     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 608     // H
 609     __ emit_data64(0x0003000200010000, relocInfo::none);
 610     __ emit_data64(0x0007000600050004, relocInfo::none);
 611     // S
 612     __ emit_data64(0x0000000100000000, relocInfo::none);
 613     __ emit_data64(0x0000000300000002, relocInfo::none);
 614     // D
 615     __ emit_data64(0x0000000000000000, relocInfo::none);
 616     __ emit_data64(0x0000000000000001, relocInfo::none);
 617     // S - FP
 618     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 619     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 620     // D - FP
 621     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 622     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 623     return start;
 624   }
 625 
 626   // The inner part of zero_words().  This is the bulk operation,
 627   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 628   // caller is responsible for zeroing the last few words.
 629   //
 630   // Inputs:
 631   // r10: the HeapWord-aligned base address of an array to zero.
 632   // r11: the count in HeapWords, r11 > 0.
 633   //
 634   // Returns r10 and r11, adjusted for the caller to clear.
 635   // r10: the base address of the tail of words left to clear.
 636   // r11: the number of words in the tail.
 637   //      r11 < MacroAssembler::zero_words_block_size.
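       //
       // Illustratively (a sketch, not the exact instruction sequence):
       //
       //   if UseBlockZeroing:
       //     align r10 to 16 bytes (store one zero word if needed);
       //     if the count is large enough, zero cache-line blocks with DC ZVA;
       //   while (r11 >= zero_words_block_size) {
       //     store zeros;  r10 += block;  r11 -= block;
       //   }
       //   return, leaving the tail described by r10/r11 to the caller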
 638 
 639   address generate_zero_blocks() {
 640     Label done;
 641     Label base_aligned;
 642 
 643     Register base = r10, cnt = r11;
 644 
 645     __ align(CodeEntryAlignment);
 646     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 647     address start = __ pc();
 648 
 649     if (UseBlockZeroing) {
 650       int zva_length = VM_Version::zva_length();
 651 
 652       // Ensure the ZVA length is a multiple of 16. This is required by
 653       // the subsequent operations.
 654       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 655 
 656       __ tbz(base, 3, base_aligned);
 657       __ str(zr, Address(__ post(base, 8)));
 658       __ sub(cnt, cnt, 1);
 659       __ bind(base_aligned);
 660 
 661       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 662       // alignment.
 663       Label small;
 664       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 665       __ subs(rscratch1, cnt, low_limit >> 3);
 666       __ br(Assembler::LT, small);
 667       __ zero_dcache_blocks(base, cnt);
 668       __ bind(small);
 669     }
 670 
 671     {
 672       // Number of stp instructions we'll unroll
 673       const int unroll =
 674         MacroAssembler::zero_words_block_size / 2;
 675       // Clear the remaining blocks.
 676       Label loop;
 677       __ subs(cnt, cnt, unroll * 2);
 678       __ br(Assembler::LT, done);
 679       __ bind(loop);
 680       for (int i = 0; i < unroll; i++)
 681         __ stp(zr, zr, __ post(base, 16));
 682       __ subs(cnt, cnt, unroll * 2);
 683       __ br(Assembler::GE, loop);
 684       __ bind(done);
 685       __ add(cnt, cnt, unroll * 2);
 686     }
 687 
 688     __ ret(lr);
 689 
 690     return start;
 691   }
 692 
 693 
 694   typedef enum {
 695     copy_forwards = 1,
 696     copy_backwards = -1
 697   } copy_direction;
 698 
 699   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 700   // for arraycopy stubs.
 701   class ArrayCopyBarrierSetHelper : StackObj {
 702     BarrierSetAssembler* _bs_asm;
 703     MacroAssembler* _masm;
 704     DecoratorSet _decorators;
 705     BasicType _type;
 706     Register _gct1;
 707     Register _gct2;
 708     Register _gct3;
 709     FloatRegister _gcvt1;
 710     FloatRegister _gcvt2;
 711     FloatRegister _gcvt3;
 712 
 713   public:
 714     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 715                               DecoratorSet decorators,
 716                               BasicType type,
 717                               Register gct1,
 718                               Register gct2,
 719                               Register gct3,
 720                               FloatRegister gcvt1,
 721                               FloatRegister gcvt2,
 722                               FloatRegister gcvt3)
 723       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 724         _masm(masm),
 725         _decorators(decorators),
 726         _type(type),
 727         _gct1(gct1),
 728         _gct2(gct2),
 729         _gct3(gct3),
 730         _gcvt1(gcvt1),
 731         _gcvt2(gcvt2),
 732         _gcvt3(gcvt3) {
 733     }
 734 
 735     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 736       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 737                             dst1, dst2, src,
 738                             _gct1, _gct2, _gcvt1);
 739     }
 740 
 741     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 742       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 743                              dst, src1, src2,
 744                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 745     }
 746 
 747     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 748       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 749                             dst1, dst2, src,
 750                             _gct1);
 751     }
 752 
 753     void copy_store_at_16(Address dst, Register src1, Register src2) {
 754       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 755                              dst, src1, src2,
 756                              _gct1, _gct2, _gct3);
 757     }
 758 
 759     void copy_load_at_8(Register dst, Address src) {
 760       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 761                             dst, noreg, src,
 762                             _gct1);
 763     }
 764 
 765     void copy_store_at_8(Address dst, Register src) {
 766       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 767                              dst, src, noreg,
 768                              _gct1, _gct2, _gct3);
 769     }
 770   };
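
       // Typical use inside the copy stubs below (see generate_copy_longs and
       // copy_memory):
       //
       //   ArrayCopyBarrierSetHelper bs(_masm, decorators, type,
       //                                gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
       //   bs.copy_load_at_16(t0, t1, Address(s, 0));    // barrier-aware 16-byte load
       //   bs.copy_store_at_16(Address(d, 0), t0, t1);   // barrier-aware 16-byte store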
 771 
 772   // Bulk copy of blocks of 8 words.
 773   //
 774   // count is a count of words.
 775   //
 776   // Precondition: count >= 8
 777   //
 778   // Postconditions:
 779   //
 780   // The least significant bit of count contains the remaining count
 781   // of words to copy.  The rest of count is trash.
 782   //
 783   // s and d are adjusted to point to the remaining words to copy
 784   //
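       // In outline, the generated code is (a sketch that ignores prefetching,
       // software pipelining and the AvoidUnalignedAccesses variant):
       //
       //   do {
       //     copy 8 words from s to d via the barrier helper;
       //     s += 8 * direction;  d += 8 * direction;  count -= 8;
       //   } while (count >= 8);
       //   copy an optional 4-word and 2-word tail (bits 2 and 1 of count)
       //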
 785   void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
 786                            copy_direction direction) {
 787     int unit = wordSize * direction;
 788     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 789 
 790     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 791       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 792     const Register stride = r14;
 793     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 794     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 795     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 796 
 797     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 798     assert_different_registers(s, d, count, rscratch1, rscratch2);
 799 
 800     Label again, drain;
 801     const char *stub_name;
 802     if (direction == copy_forwards)
 803       stub_name = "forward_copy_longs";
 804     else
 805       stub_name = "backward_copy_longs";
 806 
 807     __ align(CodeEntryAlignment);
 808 
 809     StubCodeMark mark(this, "StubRoutines", stub_name);
 810 
 811     __ bind(start);
 812 
 813     Label unaligned_copy_long;
 814     if (AvoidUnalignedAccesses) {
 815       __ tbnz(d, 3, unaligned_copy_long);
 816     }
 817 
 818     if (direction == copy_forwards) {
 819       __ sub(s, s, bias);
 820       __ sub(d, d, bias);
 821     }
 822 
 823 #ifdef ASSERT
 824     // Make sure we are never given < 8 words
 825     {
 826       Label L;
 827       __ cmp(count, (u1)8);
 828       __ br(Assembler::GE, L);
 829       __ stop("generate_copy_longs called with < 8 words");
 830       __ bind(L);
 831     }
 832 #endif
 833 
 834     // Fill 8 registers
 835     if (UseSIMDForMemoryOps) {
 836       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 837       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 838     } else {
 839       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 840       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 841       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 842       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 843     }
 844 
 845     __ subs(count, count, 16);
 846     __ br(Assembler::LO, drain);
 847 
 848     int prefetch = PrefetchCopyIntervalInBytes;
 849     bool use_stride = false;
 850     if (direction == copy_backwards) {
 851        use_stride = prefetch > 256;
 852        prefetch = -prefetch;
 853        if (use_stride) __ mov(stride, prefetch);
 854     }
 855 
 856     __ bind(again);
 857 
 858     if (PrefetchCopyIntervalInBytes > 0)
 859       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 860 
 861     if (UseSIMDForMemoryOps) {
 862       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 863       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 864       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 865       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 866     } else {
 867       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 868       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 869       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 870       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 871       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 872       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 873       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 874       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 875     }
 876 
 877     __ subs(count, count, 8);
 878     __ br(Assembler::HS, again);
 879 
 880     // Drain
 881     __ bind(drain);
 882     if (UseSIMDForMemoryOps) {
 883       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 884       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 885     } else {
 886       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 887       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 888       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 889       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 890     }
 891 
 892     {
 893       Label L1, L2;
 894       __ tbz(count, exact_log2(4), L1);
 895       if (UseSIMDForMemoryOps) {
 896         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 897         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 898       } else {
 899         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 900         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 901         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 902         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 903       }
 904       __ bind(L1);
 905 
 906       if (direction == copy_forwards) {
 907         __ add(s, s, bias);
 908         __ add(d, d, bias);
 909       }
 910 
 911       __ tbz(count, 1, L2);
 912       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 913       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 914       __ bind(L2);
 915     }
 916 
 917     __ ret(lr);
 918 
 919     if (AvoidUnalignedAccesses) {
 920       Label drain, again;
 921       // Register order for storing. Order is different for backward copy.
 922 
 923       __ bind(unaligned_copy_long);
 924 
 925       // source address is even (two-word) aligned, target is odd aligned
 926       //
 927       // when forward copying word pairs we read long pairs at offsets
 928       // {0, 2, 4, 6} (in long words). when backwards copying we read
 929       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 930       // address by -2 in the forwards case so we can compute the
 931       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 932       // or -1.
 933       //
 934       // when forward copying we need to store 1 word, 3 pairs and
 935       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 936       // zero offset, we adjust the destination by -1, which means we
 937       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 938       //
 939       // When backwards copying we need to store 1 word, 3 pairs and
 940       // then 1 word at offsets {-1, -3, -5, -7, -8}, i.e. we use
 941       // offsets {1, 3, 5, 7, 8} * unit.
 942 
 943       if (direction == copy_forwards) {
 944         __ sub(s, s, 16);
 945         __ sub(d, d, 8);
 946       }
 947 
 948       // Fill 8 registers
 949       //
 950       // for forwards copy s was offset by -16 from the original input
 951       // value of s so the register contents are at these offsets
 952       // relative to the 64 byte block addressed by that original input
 953       // and so on for each successive 64 byte block when s is updated
 954       //
 955       // t0 at offset 0,  t1 at offset 8
 956       // t2 at offset 16, t3 at offset 24
 957       // t4 at offset 32, t5 at offset 40
 958       // t6 at offset 48, t7 at offset 56
 959 
 960       // for backwards copy s was not offset so the register contents
 961       // are at these offsets into the preceding 64 byte block
 962       // relative to that original input and so on for each successive
 963       // preceding 64 byte block when s is updated. this explains the
 964       // slightly counter-intuitive looking pattern of register usage
 965       // in the stp instructions for backwards copy.
 966       //
 967       // t0 at offset -16, t1 at offset -8
 968       // t2 at offset -32, t3 at offset -24
 969       // t4 at offset -48, t5 at offset -40
 970       // t6 at offset -64, t7 at offset -56
 971 
 972       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 973       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 974       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 975       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 976 
 977       __ subs(count, count, 16);
 978       __ br(Assembler::LO, drain);
 979 
 980       int prefetch = PrefetchCopyIntervalInBytes;
 981       bool use_stride = false;
 982       if (direction == copy_backwards) {
 983          use_stride = prefetch > 256;
 984          prefetch = -prefetch;
 985          if (use_stride) __ mov(stride, prefetch);
 986       }
 987 
 988       __ bind(again);
 989 
 990       if (PrefetchCopyIntervalInBytes > 0)
 991         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 992 
 993       if (direction == copy_forwards) {
 994        // allowing for the offset of -8 the store instructions place
 995        // registers into the target 64 byte block at the following
 996        // offsets
 997        //
 998        // t0 at offset 0
 999        // t1 at offset 8,  t2 at offset 16
1000        // t3 at offset 24, t4 at offset 32
1001        // t5 at offset 40, t6 at offset 48
1002        // t7 at offset 56
1003 
1004         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1005         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1006         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1007         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1008         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1009         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1010         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1011         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1012         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1013       } else {
1014        // d was not offset when we started so the registers are
1015        // written into the 64 byte block preceding d with the following
1016        // offsets
1017        //
1018        // t1 at offset -8
1019        // t3 at offset -24, t0 at offset -16
1020        // t5 at offset -40, t2 at offset -32
1021        // t7 at offset -56, t4 at offset -48
1022        //                   t6 at offset -64
1023        //
1024        // note that this matches the offsets previously noted for the
1025        // loads
1026 
1027         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1028         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1029         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1030         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1031         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1032         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1033         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1034         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1035         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1036       }
1037 
1038       __ subs(count, count, 8);
1039       __ br(Assembler::HS, again);
1040 
1041       // Drain
1042       //
1043       // this uses the same pattern of offsets and register arguments
1044       // as above
1045       __ bind(drain);
1046       if (direction == copy_forwards) {
1047         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1048         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1049         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1050         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1051         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1052       } else {
1053         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1054         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1055         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1056         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1057         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1058       }
1059       // now we need to copy any remaining part block which may
1060       // include a 4 word subblock and/or a 2 word subblock.
1061       // bits 2 and 1 in the count are the tell-tale for whether we
1062       // have each such subblock
1063       {
1064         Label L1, L2;
1065         __ tbz(count, exact_log2(4), L1);
1066        // this is the same as above but copying only 4 longs hence
1067        // with only one intervening stp between the str instructions
1068        // but note that the offsets and registers still follow the
1069        // same pattern
1070         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1071         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1072         if (direction == copy_forwards) {
1073           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1074           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1075           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1076         } else {
1077           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1078           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1079           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1080         }
1081         __ bind(L1);
1082 
1083         __ tbz(count, 1, L2);
1084        // this is the same as above but copying only 2 longs hence
1085        // there is no intervening stp between the str instructions
1086        // but note that the offset and register patterns are still
1087        // the same
1088         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1089         if (direction == copy_forwards) {
1090           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1091           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1092         } else {
1093           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1094           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1095         }
1096         __ bind(L2);
1097 
1098        // for forwards copy we need to re-adjust the offsets we
1099        // applied so that s and d follow the last words written
1100 
1101        if (direction == copy_forwards) {
1102          __ add(s, s, 16);
1103          __ add(d, d, 8);
1104        }
1105 
1106       }
1107 
1108       __ ret(lr);
1109       }
1110   }
1111 
1112   // Small copy: less than 16 bytes.
1113   //
1114   // NB: Ignores all of the bits of count which represent more than 15
1115   // bytes, so a caller doesn't have to mask them.
1116 
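       // Sketch of the approach for step == 1 (byte granularity); for larger
       // granularities the same bit tests are applied to the unit count:
       //
       //   if (count & 8) copy one word (8 bytes);
       //   if (count & 4) copy 4 bytes;
       //   if (count & 2) copy 2 bytes;
       //   if (count & 1) copy 1 byte;
       //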
1117   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1118     bool is_backwards = step < 0;
1119     size_t granularity = uabs(step);
1120     int direction = is_backwards ? -1 : 1;
1121 
1122     Label Lword, Lint, Lshort, Lbyte;
1123 
1124     assert(granularity
1125            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1126 
1127     const Register t0 = r3;
1128     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1129     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1130 
1131     // ??? I don't know if this bit-test-and-branch is the right thing
1132     // to do.  It does a lot of jumping, resulting in several
1133     // mispredicted branches.  It might make more sense to do this
1134     // with something like Duff's device with a single computed branch.
1135 
1136     __ tbz(count, 3 - exact_log2(granularity), Lword);
1137     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1138     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1139     __ bind(Lword);
1140 
1141     if (granularity <= sizeof (jint)) {
1142       __ tbz(count, 2 - exact_log2(granularity), Lint);
1143       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1144       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1145       __ bind(Lint);
1146     }
1147 
1148     if (granularity <= sizeof (jshort)) {
1149       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1150       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1151       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1152       __ bind(Lshort);
1153     }
1154 
1155     if (granularity <= sizeof (jbyte)) {
1156       __ tbz(count, 0, Lbyte);
1157       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1158       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1159       __ bind(Lbyte);
1160     }
1161   }
1162 
1163   Label copy_f, copy_b;
1164   Label copy_obj_f, copy_obj_b;
1165   Label copy_obj_uninit_f, copy_obj_uninit_b;
1166 
1167   // All-singing all-dancing memory copy.
1168   //
1169   // Copy count units of memory from s to d.  The size of a unit is
1170   // step, which can be positive or negative depending on the direction
1171   // of copy.  If is_aligned is false, we align the source address.
1172   //
1173 
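       // The overall shape is (sketch):
       //
       //   if (count * granularity <= 80 (or 96 with SIMD) bytes)
       //     copy everything inline and return;
       //   if copying backwards, point s and d at the end of the arrays;
       //   align s to a 2-word boundary (copy_memory_small handles the lead-in);
       //   bulk-copy 8-word blocks via the copy_f/copy_b style stubs above;
       //   copy the remaining tail with copy_memory_small.
       //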
1174   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1175                    Register s, Register d, Register count, int step) {
1176     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1177     bool is_backwards = step < 0;
1178     unsigned int granularity = uabs(step);
1179     const Register t0 = r3, t1 = r4;
1180 
1181     // For <= 80 (or 96 for SIMD) bytes we copy inline. Direction doesn't matter because we
1182     // always load all the data before writing anything.
1183     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1184     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1185     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1186     const Register send = r17, dend = r16;
1187     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1188     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1189     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1190 
1191     if (PrefetchCopyIntervalInBytes > 0)
1192       __ prfm(Address(s, 0), PLDL1KEEP);
1193     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1194     __ br(Assembler::HI, copy_big);
1195 
1196     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1197     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1198 
1199     __ cmp(count, u1(16/granularity));
1200     __ br(Assembler::LS, copy16);
1201 
1202     __ cmp(count, u1(64/granularity));
1203     __ br(Assembler::HI, copy80);
1204 
1205     __ cmp(count, u1(32/granularity));
1206     __ br(Assembler::LS, copy32);
1207 
1208     // 33..64 bytes
1209     if (UseSIMDForMemoryOps) {
1210       bs.copy_load_at_32(v0, v1, Address(s, 0));
1211       bs.copy_load_at_32(v2, v3, Address(send, -32));
1212       bs.copy_store_at_32(Address(d, 0), v0, v1);
1213       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1214     } else {
1215       bs.copy_load_at_16(t0, t1, Address(s, 0));
1216       bs.copy_load_at_16(t2, t3, Address(s, 16));
1217       bs.copy_load_at_16(t4, t5, Address(send, -32));
1218       bs.copy_load_at_16(t6, t7, Address(send, -16));
1219 
1220       bs.copy_store_at_16(Address(d, 0), t0, t1);
1221       bs.copy_store_at_16(Address(d, 16), t2, t3);
1222       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1223       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1224     }
1225     __ b(finish);
1226 
1227     // 17..32 bytes
1228     __ bind(copy32);
1229     bs.copy_load_at_16(t0, t1, Address(s, 0));
1230     bs.copy_load_at_16(t6, t7, Address(send, -16));
1231 
1232     bs.copy_store_at_16(Address(d, 0), t0, t1);
1233     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1234     __ b(finish);
1235 
1236     // 65..80/96 bytes
1237     // (96 bytes if SIMD because we do 32 bytes per instruction)
1238     __ bind(copy80);
1239     if (UseSIMDForMemoryOps) {
1240       bs.copy_load_at_32(v0, v1, Address(s, 0));
1241       bs.copy_load_at_32(v2, v3, Address(s, 32));
1242       // Unaligned pointers can be an issue for copying.
1243       // The issue is more likely to occur when the granularity of the data is
1244       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1245       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1246       // The biggest performance drop has been seen for the 65-80 byte range.
1247       // For such cases, using a pair of ldp/stp instead of the third pair of
1248       // ldpq/stpq fixes the performance issue.
1249       if (granularity < sizeof (jint)) {
1250         Label copy96;
1251         __ cmp(count, u1(80/granularity));
1252         __ br(Assembler::HI, copy96);
1253         bs.copy_load_at_16(t0, t1, Address(send, -16));
1254 
1255         bs.copy_store_at_32(Address(d, 0), v0, v1);
1256         bs.copy_store_at_32(Address(d, 32), v2, v3);
1257 
1258         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1259         __ b(finish);
1260 
1261         __ bind(copy96);
1262       }
1263       bs.copy_load_at_32(v4, v5, Address(send, -32));
1264 
1265       bs.copy_store_at_32(Address(d, 0), v0, v1);
1266       bs.copy_store_at_32(Address(d, 32), v2, v3);
1267 
1268       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1269     } else {
1270       bs.copy_load_at_16(t0, t1, Address(s, 0));
1271       bs.copy_load_at_16(t2, t3, Address(s, 16));
1272       bs.copy_load_at_16(t4, t5, Address(s, 32));
1273       bs.copy_load_at_16(t6, t7, Address(s, 48));
1274       bs.copy_load_at_16(t8, t9, Address(send, -16));
1275 
1276       bs.copy_store_at_16(Address(d, 0), t0, t1);
1277       bs.copy_store_at_16(Address(d, 16), t2, t3);
1278       bs.copy_store_at_16(Address(d, 32), t4, t5);
1279       bs.copy_store_at_16(Address(d, 48), t6, t7);
1280       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1281     }
1282     __ b(finish);
1283 
1284     // 0..16 bytes
1285     __ bind(copy16);
1286     __ cmp(count, u1(8/granularity));
1287     __ br(Assembler::LO, copy8);
1288 
1289     // 8..16 bytes
1290     bs.copy_load_at_8(t0, Address(s, 0));
1291     bs.copy_load_at_8(t1, Address(send, -8));
1292     bs.copy_store_at_8(Address(d, 0), t0);
1293     bs.copy_store_at_8(Address(dend, -8), t1);
1294     __ b(finish);
1295 
1296     if (granularity < 8) {
1297       // 4..7 bytes
1298       __ bind(copy8);
1299       __ tbz(count, 2 - exact_log2(granularity), copy4);
1300       __ ldrw(t0, Address(s, 0));
1301       __ ldrw(t1, Address(send, -4));
1302       __ strw(t0, Address(d, 0));
1303       __ strw(t1, Address(dend, -4));
1304       __ b(finish);
1305       if (granularity < 4) {
1306         // 0..3 bytes
1307         __ bind(copy4);
1308         __ cbz(count, finish); // get rid of 0 case
1309         if (granularity == 2) {
1310           __ ldrh(t0, Address(s, 0));
1311           __ strh(t0, Address(d, 0));
1312         } else { // granularity == 1
1313           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1314           // the first and last byte.
1315           // Handle the 3 byte case by loading and storing base + count/2
1316           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1317           // This does mean that in the 1 byte case we load/store the same
1318           // byte 3 times.
1319           __ lsr(count, count, 1);
1320           __ ldrb(t0, Address(s, 0));
1321           __ ldrb(t1, Address(send, -1));
1322           __ ldrb(t2, Address(s, count));
1323           __ strb(t0, Address(d, 0));
1324           __ strb(t1, Address(dend, -1));
1325           __ strb(t2, Address(d, count));
1326         }
1327         __ b(finish);
1328       }
1329     }
1330 
1331     __ bind(copy_big);
1332     if (is_backwards) {
1333       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1334       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1335     }
1336 
1337     // Now that we've got the small case out of the way we can align the
1338     // source address on a 2-word boundary.
1339 
1340     // Here we will materialize a count in r15, which is used by copy_memory_small
1341     // and the various generate_copy_longs stubs that we use for 2-word-aligned bulk copies.
1342     // Up until here, we have used t9, which aliases r15, but from here on, that register
1343     // can not be used as a temp register, as it contains the count.
1344 
1345     Label aligned;
1346 
1347     if (is_aligned) {
1348       // We may have to adjust by 1 word to get s 2-word-aligned.
1349       __ tbz(s, exact_log2(wordSize), aligned);
1350       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1351       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1352       __ sub(count, count, wordSize/granularity);
1353     } else {
1354       if (is_backwards) {
1355         __ andr(r15, s, 2 * wordSize - 1);
1356       } else {
1357         __ neg(r15, s);
1358         __ andr(r15, r15, 2 * wordSize - 1);
1359       }
1360       // r15 is the byte adjustment needed to align s.
1361       __ cbz(r15, aligned);
1362       int shift = exact_log2(granularity);
1363       if (shift)  __ lsr(r15, r15, shift);
1364       __ sub(count, count, r15);
1365 
1366 #if 0
1367       // ?? This code is only correct for a disjoint copy.  It may or
1368       // may not make sense to use it in that case.
1369 
1370       // Copy the first pair; s and d may not be aligned.
1371       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1372       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1373 
1374       // Align s and d, adjust count
1375       if (is_backwards) {
1376         __ sub(s, s, r15);
1377         __ sub(d, d, r15);
1378       } else {
1379         __ add(s, s, r15);
1380         __ add(d, d, r15);
1381       }
1382 #else
1383       copy_memory_small(decorators, type, s, d, r15, step);
1384 #endif
1385     }
1386 
1387     __ bind(aligned);
1388 
1389     // s is now 2-word-aligned.
1390 
1391     // We have a count of units and some trailing bytes.  Adjust the
1392     // count and do a bulk copy of words.
1393     __ lsr(r15, count, exact_log2(wordSize/granularity));
1394     if (direction == copy_forwards) {
1395       if (type != T_OBJECT) {
1396         __ bl(copy_f);
1397       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1398         __ bl(copy_obj_uninit_f);
1399       } else {
1400         __ bl(copy_obj_f);
1401       }
1402     } else {
1403       if (type != T_OBJECT) {
1404         __ bl(copy_b);
1405       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1406         __ bl(copy_obj_uninit_b);
1407       } else {
1408         __ bl(copy_obj_b);
1409       }
1410     }
1411 
1412     // And the tail.
1413     copy_memory_small(decorators, type, s, d, count, step);
1414 
1415     if (granularity >= 8) __ bind(copy8);
1416     if (granularity >= 4) __ bind(copy4);
1417     __ bind(finish);
1418   }
1419 
1420 
1421   void clobber_registers() {
1422 #ifdef ASSERT
1423     RegSet clobbered
1424       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1425     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1426     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1427     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1428       __ mov(*it, rscratch1);
1429     }
1430 #endif
1431 
1432   }
1433 
1434   // Scan over array at a for count oops, verifying each one.
1435   // Preserves a and count, clobbers rscratch1 and rscratch2.
1436   void verify_oop_array (int size, Register a, Register count, Register temp) {
1437     Label loop, end;
1438     __ mov(rscratch1, a);
1439     __ mov(rscratch2, zr);
1440     __ bind(loop);
1441     __ cmp(rscratch2, count);
1442     __ br(Assembler::HS, end);
1443     if (size == wordSize) {
1444       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1445       __ verify_oop(temp);
1446     } else {
1447       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1448       __ decode_heap_oop(temp); // calls verify_oop
1449     }
1450     __ add(rscratch2, rscratch2, 1);
1451     __ b(loop);
1452     __ bind(end);
1453   }
1454 
1455   // Arguments:
1456   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1457   //             ignored
1458   //   is_oop  - true => oop array, so generate store check code
1459   //   name    - stub name string
1460   //
1461   // Inputs:
1462   //   c_rarg0   - source array address
1463   //   c_rarg1   - destination array address
1464   //   c_rarg2   - element count, treated as ssize_t, can be zero
1465   //
1466   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1467   // the hardware handle it.  The two dwords within qwords that span
1468   // cache line boundaries will still be loaded and stored atomically.
1469   //
1470   // Side Effects:
1471   //   *entry, if non-null, is set to the no-overlap entry point, which
1472   //   is used by the corresponding generate_conjoint_copy() stub.
1473   //
1474   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1475                                   const char *name, bool dest_uninitialized = false) {
1476     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1477     RegSet saved_reg = RegSet::of(s, d, count);
1478     __ align(CodeEntryAlignment);
1479     StubCodeMark mark(this, "StubRoutines", name);
1480     address start = __ pc();
1481     __ enter();
1482 
1483     if (entry != nullptr) {
1484       *entry = __ pc();
1485       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1486       BLOCK_COMMENT("Entry:");
1487     }
1488 
1489     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1490     if (dest_uninitialized) {
1491       decorators |= IS_DEST_UNINITIALIZED;
1492     }
1493     if (aligned) {
1494       decorators |= ARRAYCOPY_ALIGNED;
1495     }
1496 
1497     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1498     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1499 
1500     if (is_oop) {
1501       // save regs before copy_memory
1502       __ push(RegSet::of(d, count), sp);
1503     }
1504     {
1505       // UnsafeCopyMemory page error: continue after ucm
1506       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1507       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1508       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1509     }
1510 
1511     if (is_oop) {
1512       __ pop(RegSet::of(d, count), sp);
1513       if (VerifyOops)
1514         verify_oop_array(size, d, count, r16);
1515     }
1516 
1517     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1518 
1519     __ leave();
1520     __ mov(r0, zr); // return 0
1521     __ ret(lr);
1522     return start;
1523   }
1524 
1525   // Arguments:
1526   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1527   //             ignored
1528   //   is_oop  - true => oop array, so generate store check code
1529   //   name    - stub name string
1530   //
1531   // Inputs:
1532   //   c_rarg0   - source array address
1533   //   c_rarg1   - destination array address
1534   //   c_rarg2   - element count, treated as ssize_t, can be zero
1535   //
1536   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1537   // the hardware handle it.  The two dwords within qwords that span
1538   // cache line boundaries will still be loaded and stored atomically.
1539   //
1540   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1541                                  address *entry, const char *name,
1542                                  bool dest_uninitialized = false) {
1543     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1544     RegSet saved_regs = RegSet::of(s, d, count);
1545     StubCodeMark mark(this, "StubRoutines", name);
1546     address start = __ pc();
1547     __ enter();
1548 
1549     if (entry != nullptr) {
1550       *entry = __ pc();
1551       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1552       BLOCK_COMMENT("Entry:");
1553     }
1554 
1555     // use fwd copy when (d-s) above_equal (count*size)
1556     __ sub(rscratch1, d, s);
1557     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1558     __ br(Assembler::HS, nooverlap_target);
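         // The unsigned comparison also covers d < s: the subtraction wraps to
         // a large unsigned value, so we still take the forward (no-overlap)
         // copy, which is safe because the destination lies below the source.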
1559 
1560     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1561     if (dest_uninitialized) {
1562       decorators |= IS_DEST_UNINITIALIZED;
1563     }
1564     if (aligned) {
1565       decorators |= ARRAYCOPY_ALIGNED;
1566     }
1567 
1568     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1569     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1570 
1571     if (is_oop) {
1572       // save regs before copy_memory
1573       __ push(RegSet::of(d, count), sp);
1574     }
1575     {
1576       // UnsafeCopyMemory page error: continue after ucm
1577       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1578       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1579       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1580     }
1581     if (is_oop) {
1582       __ pop(RegSet::of(d, count), sp);
1583       if (VerifyOops)
1584         verify_oop_array(size, d, count, r16);
1585     }
1586     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1587     __ leave();
1588     __ mov(r0, zr); // return 0
1589     __ ret(lr);
1590     return start;
1591   }
1592 
1593   // Arguments:
1594   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1595   //             ignored
1596   //   name    - stub name string
1597   //
1598   // Inputs:
1599   //   c_rarg0   - source array address
1600   //   c_rarg1   - destination array address
1601   //   c_rarg2   - element count, treated as ssize_t, can be zero
1602   //
1603   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1604   // we let the hardware handle it.  The one to eight bytes within words,
1605   // dwords or qwords that span cache line boundaries will still be loaded
1606   // and stored atomically.
1607   //
1615   // Side Effects:
1616   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1617   //   used by generate_conjoint_byte_copy().
1618   //
1619   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1620     const bool not_oop = false;
1621     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1622   }
1623 
1624   // Arguments:
1625   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1626   //             ignored
1627   //   name    - stub name string
1628   //
1629   // Inputs:
1630   //   c_rarg0   - source array address
1631   //   c_rarg1   - destination array address
1632   //   c_rarg2   - element count, treated as ssize_t, can be zero
1633   //
1634   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1635   // we let the hardware handle it.  The one to eight bytes within words,
1636   // dwords or qwords that span cache line boundaries will still be loaded
1637   // and stored atomically.
1638   //
1639   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1640                                       address* entry, const char *name) {
1641     const bool not_oop = false;
1642     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1643   }
1644 
1645   // Arguments:
1646   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1647   //             ignored
1648   //   name    - stub name string
1649   //
1650   // Inputs:
1651   //   c_rarg0   - source array address
1652   //   c_rarg1   - destination array address
1653   //   c_rarg2   - element count, treated as ssize_t, can be zero
1654   //
1655   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1656   // let the hardware handle it.  The two or four words within dwords
1657   // or qwords that span cache line boundaries will still be loaded
1658   // and stored atomically.
1659   //
1660   // Side Effects:
1661   //   disjoint_short_copy_entry is set to the no-overlap entry point
1662   //   used by generate_conjoint_short_copy().
1663   //
1664   address generate_disjoint_short_copy(bool aligned,
1665                                        address* entry, const char *name) {
1666     const bool not_oop = false;
1667     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1668   }
1669 
1670   // Arguments:
1671   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1672   //             ignored
1673   //   name    - stub name string
1674   //
1675   // Inputs:
1676   //   c_rarg0   - source array address
1677   //   c_rarg1   - destination array address
1678   //   c_rarg2   - element count, treated as ssize_t, can be zero
1679   //
1680   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1681   // let the hardware handle it.  The two or four words within dwords
1682   // or qwords that span cache line boundaries will still be loaded
1683   // and stored atomically.
1684   //
1685   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1686                                        address *entry, const char *name) {
1687     const bool not_oop = false;
1688     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1689 
1690   }
1691   // Arguments:
1692   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1693   //             ignored
1694   //   name    - stub name string
1695   //
1696   // Inputs:
1697   //   c_rarg0   - source array address
1698   //   c_rarg1   - destination array address
1699   //   c_rarg2   - element count, treated as ssize_t, can be zero
1700   //
1701   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1702   // the hardware handle it.  The two dwords within qwords that span
1703   // cache line boundaries will still be loaded and stored atomically.
1704   //
1705   // Side Effects:
1706   //   disjoint_int_copy_entry is set to the no-overlap entry point
1707   //   used by generate_conjoint_int_copy().
1708   //
1709   address generate_disjoint_int_copy(bool aligned, address *entry,
1710                                          const char *name, bool dest_uninitialized = false) {
1711     const bool not_oop = false;
1712     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1713   }
1714 
1715   // Arguments:
1716   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1717   //             ignored
1718   //   name    - stub name string
1719   //
1720   // Inputs:
1721   //   c_rarg0   - source array address
1722   //   c_rarg1   - destination array address
1723   //   c_rarg2   - element count, treated as ssize_t, can be zero
1724   //
1725   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1726   // the hardware handle it.  The two dwords within qwords that span
1727   // cache line boundaries will still be loaded and stored atomically.
1728   //
1729   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1730                                      address *entry, const char *name,
1731                                      bool dest_uninitialized = false) {
1732     const bool not_oop = false;
1733     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1734   }
1735 
1736 
1737   // Arguments:
1738   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1739   //             ignored
1740   //   name    - stub name string
1741   //
1742   // Inputs:
1743   //   c_rarg0   - source array address
1744   //   c_rarg1   - destination array address
1745   //   c_rarg2   - element count, treated as size_t, can be zero
1746   //
1747   // Side Effects:
1748   //   disjoint_long_copy_entry is set to the no-overlap entry point
1749   //   used by generate_conjoint_long_copy().
1750   //
1751   address generate_disjoint_long_copy(bool aligned, address *entry,
1752                                           const char *name, bool dest_uninitialized = false) {
1753     const bool not_oop = false;
1754     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1755   }
1756 
1757   // Arguments:
1758   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1759   //             ignored
1760   //   name    - stub name string
1761   //
1762   // Inputs:
1763   //   c_rarg0   - source array address
1764   //   c_rarg1   - destination array address
1765   //   c_rarg2   - element count, treated as size_t, can be zero
1766   //
1767   address generate_conjoint_long_copy(bool aligned,
1768                                       address nooverlap_target, address *entry,
1769                                       const char *name, bool dest_uninitialized = false) {
1770     const bool not_oop = false;
1771     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1772   }
1773 
1774   // Arguments:
1775   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1776   //             ignored
1777   //   name    - stub name string
1778   //
1779   // Inputs:
1780   //   c_rarg0   - source array address
1781   //   c_rarg1   - destination array address
1782   //   c_rarg2   - element count, treated as size_t, can be zero
1783   //
1784   // Side Effects:
1785   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1786   //   used by generate_conjoint_oop_copy().
1787   //
1788   address generate_disjoint_oop_copy(bool aligned, address *entry,
1789                                      const char *name, bool dest_uninitialized) {
1790     const bool is_oop = true;
1791     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1792     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1793   }
1794 
1795   // Arguments:
1796   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1797   //             ignored
1798   //   name    - stub name string
1799   //
1800   // Inputs:
1801   //   c_rarg0   - source array address
1802   //   c_rarg1   - destination array address
1803   //   c_rarg2   - element count, treated as size_t, can be zero
1804   //
1805   address generate_conjoint_oop_copy(bool aligned,
1806                                      address nooverlap_target, address *entry,
1807                                      const char *name, bool dest_uninitialized) {
1808     const bool is_oop = true;
1809     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1810     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1811                                   name, dest_uninitialized);
1812   }
1813 
1814 
1815   // Helper for generating a dynamic type check.
1816   // Smashes rscratch1, rscratch2.
1817   void generate_type_check(Register sub_klass,
1818                            Register super_check_offset,
1819                            Register super_klass,
1820                            Label& L_success) {
1821     assert_different_registers(sub_klass, super_check_offset, super_klass);
1822 
1823     BLOCK_COMMENT("type_check:");
1824 
1825     Label L_miss;
1826 
1827     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1828                                      super_check_offset);
1829     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
1830 
1831     // Fall through on failure!
1832     __ BIND(L_miss);
1833   }
1834 
1835   //
1836   //  Generate checkcasting array copy stub
1837   //
1838   //  Input:
1839   //    c_rarg0   - source array address
1840   //    c_rarg1   - destination array address
1841   //    c_rarg2   - element count, treated as ssize_t, can be zero
1842   //    c_rarg3   - size_t ckoff (super_check_offset)
1843   //    c_rarg4   - oop ckval (super_klass)
1844   //
1845   //  Output:
1846   //    r0 ==  0  -  success
1847   //    r0 == -1^K - failure, where K is partial transfer count
1848   //
1849   address generate_checkcast_copy(const char *name, address *entry,
1850                                   bool dest_uninitialized = false) {
1851 
1852     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1853 
1854     // Input registers (after setup_arg_regs)
1855     const Register from        = c_rarg0;   // source array address
1856     const Register to          = c_rarg1;   // destination array address
1857     const Register count       = c_rarg2;   // elements count
1858     const Register ckoff       = c_rarg3;   // super_check_offset
1859     const Register ckval       = c_rarg4;   // super_klass
1860 
1861     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1862     RegSet wb_post_saved_regs = RegSet::of(count);
1863 
1864     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1865     const Register copied_oop  = r22;       // actual oop copied
1866     const Register count_save  = r21;       // orig elements count
1867     const Register start_to    = r20;       // destination array start address
1868     const Register r19_klass   = r19;       // oop._klass
1869 
1870     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1871     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1872 
1873     //---------------------------------------------------------------
1874     // Assembler stub will be used for this call to arraycopy
1875     // if the two arrays are subtypes of Object[] but the
1876     // destination array type is not equal to or a supertype
1877     // of the source type.  Each element must be separately
1878     // checked.
1879 
1880     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1881                                copied_oop, r19_klass, count_save);
1882 
1883     __ align(CodeEntryAlignment);
1884     StubCodeMark mark(this, "StubRoutines", name);
1885     address start = __ pc();
1886 
1887     __ enter(); // required for proper stackwalking of RuntimeStub frame
1888 
1889 #ifdef ASSERT
1890     // caller guarantees that the arrays really are different
1891     // otherwise, we would have to make conjoint checks
1892     { Label L;
1893       __ b(L);                  // conjoint check not yet implemented
1894       __ stop("checkcast_copy within a single array");
1895       __ bind(L);
1896     }
1897 #endif //ASSERT
1898 
1899     // Caller of this entry point must set up the argument registers.
1900     if (entry != nullptr) {
1901       *entry = __ pc();
1902       BLOCK_COMMENT("Entry:");
1903     }
1904 
1905     // Empty array:  Nothing to do.
1906     __ cbz(count, L_done);
1907     __ push(RegSet::of(r19, r20, r21, r22), sp);
1908 
1909 #ifdef ASSERT
1910     BLOCK_COMMENT("assert consistent ckoff/ckval");
1911     // The ckoff and ckval must be mutually consistent,
1912     // even though caller generates both.
1913     { Label L;
1914       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1915       __ ldrw(start_to, Address(ckval, sco_offset));
1916       __ cmpw(ckoff, start_to);
1917       __ br(Assembler::EQ, L);
1918       __ stop("super_check_offset inconsistent");
1919       __ bind(L);
1920     }
1921 #endif //ASSERT
1922 
1923     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1924     bool is_oop = true;
1925     int element_size = UseCompressedOops ? 4 : 8;
1926     if (dest_uninitialized) {
1927       decorators |= IS_DEST_UNINITIALIZED;
1928     }
1929 
1930     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1931     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1932 
1933     // save the original count
1934     __ mov(count_save, count);
1935 
1936     // Copy from low to high addresses
1937     __ mov(start_to, to);              // Save destination array start address
1938     __ b(L_load_element);
1939 
1940     // ======== begin loop ========
1941     // (Loop is rotated; its entry is L_load_element.)
1942     // Loop control:
1943     //   for (; count != 0; count--) {
1944     //     copied_oop = load_heap_oop(from++);
1945     //     ... generate_type_check ...;
1946     //     store_heap_oop(to++, copied_oop);
1947     //   }
1948     __ align(OptoLoopAlignment);
1949 
1950     __ BIND(L_store_element);
1951     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1952                       __ post(to, element_size), copied_oop, noreg,
1953                       gct1, gct2, gct3);
1954     __ sub(count, count, 1);
1955     __ cbz(count, L_do_card_marks);
1956 
1957     // ======== loop entry is here ========
1958     __ BIND(L_load_element);
1959     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1960                      copied_oop, noreg, __ post(from, element_size),
1961                      gct1);
1962     __ cbz(copied_oop, L_store_element);
1963 
1964     __ load_klass(r19_klass, copied_oop); // query the object klass
1965     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1966     // ======== end loop ========
1967 
1968     // It was a real error; we must depend on the caller to finish the job.
1969     // Register count = remaining oops, count_orig = total oops.
1970     // Emit GC store barriers for the oops we have copied and report
1971     // their number to the caller.
1972 
1973     __ subs(count, count_save, count);     // K = partially copied oop count
1974     __ eon(count, count, zr);                   // report (-1^K) to caller
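         // (eon with zr is bitwise NOT, so count now holds ~K == -1^K)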
1975     __ br(Assembler::EQ, L_done_pop);
1976 
1977     __ BIND(L_do_card_marks);
1978     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1979 
1980     __ bind(L_done_pop);
1981     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1982     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1983 
1984     __ bind(L_done);
1985     __ mov(r0, count);
1986     __ leave();
1987     __ ret(lr);
1988 
1989     return start;
1990   }
1991 
1992   // Perform range checks on the proposed arraycopy.
1993   // Kills temp, but nothing else.
1994   // Also, clean the sign bits of src_pos and dst_pos.
1995   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1996                               Register src_pos, // source position (c_rarg1)
1997                               Register dst,     // destination array oop (c_rarg2)
1998                               Register dst_pos, // destination position (c_rarg3)
1999                               Register length,
2000                               Register temp,
2001                               Label& L_failed) {
2002     BLOCK_COMMENT("arraycopy_range_checks:");
2003 
2004     assert_different_registers(rscratch1, temp);
2005 
2006     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2007     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2008     __ addw(temp, length, src_pos);
2009     __ cmpw(temp, rscratch1);
2010     __ br(Assembler::HI, L_failed);
2011 
2012     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2013     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2014     __ addw(temp, length, dst_pos);
2015     __ cmpw(temp, rscratch1);
2016     __ br(Assembler::HI, L_failed);
2017 
2018     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2019     __ movw(src_pos, src_pos);
2020     __ movw(dst_pos, dst_pos);
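         // (a 32-bit register write zero-extends, clearing bits 63:32)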
2021 
2022     BLOCK_COMMENT("arraycopy_range_checks done");
2023   }
2024 
2025   // These stubs get called from some dumb test routine.
2026   // I'll write them properly when they're called from
2027   // something that's actually doing something.
2028   static void fake_arraycopy_stub(address src, address dst, int count) {
2029     assert(count == 0, "huh?");
2030   }
2031 
2032 
2033   //
2034   //  Generate 'unsafe' array copy stub
2035   //  Though just as safe as the other stubs, it takes an unscaled
2036   //  size_t argument instead of an element count.
2037   //
2038   //  Input:
2039   //    c_rarg0   - source array address
2040   //    c_rarg1   - destination array address
2041   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2042   //
2043   // Examines the alignment of the operands and dispatches
2044   // to a long, int, short, or byte copy loop.
2045   //
2046   address generate_unsafe_copy(const char *name,
2047                                address byte_copy_entry,
2048                                address short_copy_entry,
2049                                address int_copy_entry,
2050                                address long_copy_entry) {
2051     Label L_long_aligned, L_int_aligned, L_short_aligned;
2052     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2053 
2054     __ align(CodeEntryAlignment);
2055     StubCodeMark mark(this, "StubRoutines", name);
2056     address start = __ pc();
2057     __ enter(); // required for proper stackwalking of RuntimeStub frame
2058 
2059     // bump this on entry, not on exit:
2060     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2061 
2062     __ orr(rscratch1, s, d);
2063     __ orr(rscratch1, rscratch1, count);
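         // rscratch1 now has a bit set wherever s, d or count has one, so its
         // low bits reflect the coarsest alignment shared by all three values.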
2064 
2065     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2066     __ cbz(rscratch1, L_long_aligned);
2067     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2068     __ cbz(rscratch1, L_int_aligned);
2069     __ tbz(rscratch1, 0, L_short_aligned);
2070     __ b(RuntimeAddress(byte_copy_entry));
2071 
2072     __ BIND(L_short_aligned);
2073     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2074     __ b(RuntimeAddress(short_copy_entry));
2075     __ BIND(L_int_aligned);
2076     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2077     __ b(RuntimeAddress(int_copy_entry));
2078     __ BIND(L_long_aligned);
2079     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2080     __ b(RuntimeAddress(long_copy_entry));
2081 
2082     return start;
2083   }
2084 
2085   //
2086   //  Generate generic array copy stubs
2087   //
2088   //  Input:
2089   //    c_rarg0    -  src oop
2090   //    c_rarg1    -  src_pos (32-bits)
2091   //    c_rarg2    -  dst oop
2092   //    c_rarg3    -  dst_pos (32-bits)
2093   //    c_rarg4    -  element count (32-bits)
2094   //
2095   //  Output:
2096   //    r0 ==  0  -  success
2097   //    r0 == -1^K - failure, where K is partial transfer count
2098   //
2099   address generate_generic_copy(const char *name,
2100                                 address byte_copy_entry, address short_copy_entry,
2101                                 address int_copy_entry, address oop_copy_entry,
2102                                 address long_copy_entry, address checkcast_copy_entry) {
2103 
2104     Label L_failed, L_objArray;
2105     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2106 
2107     // Input registers
2108     const Register src        = c_rarg0;  // source array oop
2109     const Register src_pos    = c_rarg1;  // source position
2110     const Register dst        = c_rarg2;  // destination array oop
2111     const Register dst_pos    = c_rarg3;  // destination position
2112     const Register length     = c_rarg4;
2113 
2114 
2115     // Registers used as temps
2116     const Register dst_klass  = c_rarg5;
2117 
2118     __ align(CodeEntryAlignment);
2119 
2120     StubCodeMark mark(this, "StubRoutines", name);
2121 
2122     address start = __ pc();
2123 
2124     __ enter(); // required for proper stackwalking of RuntimeStub frame
2125 
2126     // bump this on entry, not on exit:
2127     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2128 
2129     //-----------------------------------------------------------------------
2130     // Assembler stub will be used for this call to arraycopy
2131     // if the following conditions are met:
2132     //
2133     // (1) src and dst must not be null.
2134     // (2) src_pos must not be negative.
2135     // (3) dst_pos must not be negative.
2136     // (4) length  must not be negative.
2137     // (5) src klass and dst klass should be the same and not null.
2138     // (6) src and dst should be arrays.
2139     // (7) src_pos + length must not exceed length of src.
2140     // (8) dst_pos + length must not exceed length of dst.
2141     //
2142 
2143     //  if (src == nullptr) return -1;
2144     __ cbz(src, L_failed);
2145 
2146     //  if (src_pos < 0) return -1;
2147     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2148 
2149     //  if (dst == nullptr) return -1;
2150     __ cbz(dst, L_failed);
2151 
2152     //  if (dst_pos < 0) return -1;
2153     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2154 
2155     // registers used as temp
2156     const Register scratch_length    = r16; // elements count to copy
2157     const Register scratch_src_klass = r17; // array klass
2158     const Register lh                = r15; // layout helper
2159 
2160     //  if (length < 0) return -1;
2161     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2162     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2163 
2164     __ load_klass(scratch_src_klass, src);
2165 #ifdef ASSERT
2166     //  assert(src->klass() != nullptr);
2167     {
2168       BLOCK_COMMENT("assert klasses not null {");
2169       Label L1, L2;
2170       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2171       __ bind(L1);
2172       __ stop("broken null klass");
2173       __ bind(L2);
2174       __ load_klass(rscratch1, dst);
2175       __ cbz(rscratch1, L1);     // this would be broken also
2176       BLOCK_COMMENT("} assert klasses not null done");
2177     }
2178 #endif
2179 
2180     // Load layout helper (32-bits)
2181     //
2182     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2183     // 32        30    24            16              8     2                 0
2184     //
2185     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
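         //   (e.g. a jint[] klass has array_tag 0x3, element_type T_INT and
         //    log2_element_size 2 in the fields above)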
2186     //
2187 
2188     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2189 
2190     // Handle objArrays completely differently...
2191     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2192     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2193     __ movw(rscratch1, objArray_lh);
2194     __ eorw(rscratch2, lh, rscratch1);
2195     __ cbzw(rscratch2, L_objArray);
2196 
2197     //  if (src->klass() != dst->klass()) return -1;
2198     __ load_klass(rscratch2, dst);
2199     __ eor(rscratch2, rscratch2, scratch_src_klass);
2200     __ cbnz(rscratch2, L_failed);
2201 
2202     //  if (!src->is_Array()) return -1;
2203     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2204 
2205     // At this point, it is known to be a typeArray (array_tag 0x3).
2206 #ifdef ASSERT
2207     {
2208       BLOCK_COMMENT("assert primitive array {");
2209       Label L;
2210       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2211       __ cmpw(lh, rscratch2);
2212       __ br(Assembler::GE, L);
2213       __ stop("must be a primitive array");
2214       __ bind(L);
2215       BLOCK_COMMENT("} assert primitive array done");
2216     }
2217 #endif
2218 
2219     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2220                            rscratch2, L_failed);
2221 
2222     // TypeArrayKlass
2223     //
2224     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2225     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2226     //
2227 
2228     const Register rscratch1_offset = rscratch1;    // array offset
2229     const Register r15_elsize = lh; // element size
2230 
2231     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2232            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2233     __ add(src, src, rscratch1_offset);           // src array offset
2234     __ add(dst, dst, rscratch1_offset);           // dst array offset
2235     BLOCK_COMMENT("choose copy loop based on element size");
2236 
2237     // next registers should be set before the jump to corresponding stub
2238     const Register from     = c_rarg0;  // source array address
2239     const Register to       = c_rarg1;  // destination array address
2240     const Register count    = c_rarg2;  // elements count
2241 
2242     // 'from', 'to', 'count' registers should be set in such order
2243     // since they are the same as 'src', 'src_pos', 'dst'.
2244 
2245     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2246 
2247     // The possible values of elsize are 0-3, i.e. exact_log2(element
2248     // size in bytes).  We do a simple bitwise binary search.
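         // Bit 1 of elsize separates {byte, short} from {int, long}; bit 0
         // then selects within each pair.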
2249   __ BIND(L_copy_bytes);
2250     __ tbnz(r15_elsize, 1, L_copy_ints);
2251     __ tbnz(r15_elsize, 0, L_copy_shorts);
2252     __ lea(from, Address(src, src_pos));// src_addr
2253     __ lea(to,   Address(dst, dst_pos));// dst_addr
2254     __ movw(count, scratch_length); // length
2255     __ b(RuntimeAddress(byte_copy_entry));
2256 
2257   __ BIND(L_copy_shorts);
2258     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2259     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2260     __ movw(count, scratch_length); // length
2261     __ b(RuntimeAddress(short_copy_entry));
2262 
2263   __ BIND(L_copy_ints);
2264     __ tbnz(r15_elsize, 0, L_copy_longs);
2265     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2266     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2267     __ movw(count, scratch_length); // length
2268     __ b(RuntimeAddress(int_copy_entry));
2269 
2270   __ BIND(L_copy_longs);
2271 #ifdef ASSERT
2272     {
2273       BLOCK_COMMENT("assert long copy {");
2274       Label L;
2275       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2276       __ cmpw(r15_elsize, LogBytesPerLong);
2277       __ br(Assembler::EQ, L);
2278       __ stop("must be long copy, but elsize is wrong");
2279       __ bind(L);
2280       BLOCK_COMMENT("} assert long copy done");
2281     }
2282 #endif
2283     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2284     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2285     __ movw(count, scratch_length); // length
2286     __ b(RuntimeAddress(long_copy_entry));
2287 
2288     // ObjArrayKlass
2289   __ BIND(L_objArray);
2290     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2291 
2292     Label L_plain_copy, L_checkcast_copy;
2293     //  test array classes for subtyping
2294     __ load_klass(r15, dst);
2295     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2296     __ br(Assembler::NE, L_checkcast_copy);
2297 
2298     // Identically typed arrays can be copied without element-wise checks.
2299     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2300                            rscratch2, L_failed);
2301 
2302     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2303     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2304     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2305     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2306     __ movw(count, scratch_length); // length
2307   __ BIND(L_plain_copy);
2308     __ b(RuntimeAddress(oop_copy_entry));
2309 
2310   __ BIND(L_checkcast_copy);
2311     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2312     {
2313       // Before looking at dst.length, make sure dst is also an objArray.
2314       __ ldrw(rscratch1, Address(r15, lh_offset));
2315       __ movw(rscratch2, objArray_lh);
2316       __ eorw(rscratch1, rscratch1, rscratch2);
2317       __ cbnzw(rscratch1, L_failed);
2318 
2319       // It is safe to examine both src.length and dst.length.
2320       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2321                              r15, L_failed);
2322 
2323       __ load_klass(dst_klass, dst); // reload
2324 
2325       // Marshal the base address arguments now, freeing registers.
2326       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2327       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2328       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2329       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2330       __ movw(count, length);           // length (reloaded)
2331       Register sco_temp = c_rarg3;      // this register is free now
2332       assert_different_registers(from, to, count, sco_temp,
2333                                  dst_klass, scratch_src_klass);
2334       // assert_clean_int(count, sco_temp);
2335 
2336       // Generate the type check.
2337       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2338       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2339 
2340       // Smashes rscratch1, rscratch2
2341       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2342 
2343       // Fetch destination element klass from the ObjArrayKlass header.
2344       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2345       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2346       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2347 
2348       // the checkcast_copy loop needs two extra arguments:
2349       assert(c_rarg3 == sco_temp, "#3 already in place");
2350       // Set up arguments for checkcast_copy_entry.
2351       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2352       __ b(RuntimeAddress(checkcast_copy_entry));
2353     }
2354 
2355   __ BIND(L_failed);
2356     __ mov(r0, -1);
2357     __ leave();   // required for proper stackwalking of RuntimeStub frame
2358     __ ret(lr);
2359 
2360     return start;
2361   }
2362 
2363   //
2364   // Generate stub for array fill. If "aligned" is true, the
2365   // "to" address is assumed to be heapword aligned.
2366   //
2367   // Arguments for generated stub:
2368   //   to:    c_rarg0
2369   //   value: c_rarg1
2370   //   count: c_rarg2 treated as signed
2371   //
2372   address generate_fill(BasicType t, bool aligned, const char *name) {
2373     __ align(CodeEntryAlignment);
2374     StubCodeMark mark(this, "StubRoutines", name);
2375     address start = __ pc();
2376 
2377     BLOCK_COMMENT("Entry:");
2378 
2379     const Register to        = c_rarg0;  // source array address
2380     const Register value     = c_rarg1;  // value
2381     const Register count     = c_rarg2;  // elements count
2382 
2383     const Register bz_base = r10;        // base for block_zero routine
2384     const Register cnt_words = r11;      // temp register
2385 
2386     __ enter();
2387 
2388     Label L_fill_elements, L_exit1;
2389 
2390     int shift = -1;
2391     switch (t) {
2392       case T_BYTE:
2393         shift = 0;
2394         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2395         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2396         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
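             // value now has the byte replicated into all 32 low bits,
             // e.g. 0xAB -> 0xABABABAB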
2397         __ br(Assembler::LO, L_fill_elements);
2398         break;
2399       case T_SHORT:
2400         shift = 1;
2401         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2402         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2403         __ br(Assembler::LO, L_fill_elements);
2404         break;
2405       case T_INT:
2406         shift = 2;
2407         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2408         __ br(Assembler::LO, L_fill_elements);
2409         break;
2410       default: ShouldNotReachHere();
2411     }
2412 
2413     // Align source address at 8 bytes address boundary.
2414     Label L_skip_align1, L_skip_align2, L_skip_align4;
2415     if (!aligned) {
2416       switch (t) {
2417         case T_BYTE:
2418           // One-byte misalignment happens only for byte arrays.
2419           __ tbz(to, 0, L_skip_align1);
2420           __ strb(value, Address(__ post(to, 1)));
2421           __ subw(count, count, 1);
2422           __ bind(L_skip_align1);
2423           // Fallthrough
2424         case T_SHORT:
2425           // Two-byte misalignment happens only for byte and short (char) arrays.
2426           __ tbz(to, 1, L_skip_align2);
2427           __ strh(value, Address(__ post(to, 2)));
2428           __ subw(count, count, 2 >> shift);
2429           __ bind(L_skip_align2);
2430           // Fallthrough
2431         case T_INT:
2432           // Align to 8 bytes; we know we are 4-byte aligned to start.
2433           __ tbz(to, 2, L_skip_align4);
2434           __ strw(value, Address(__ post(to, 4)));
2435           __ subw(count, count, 4 >> shift);
2436           __ bind(L_skip_align4);
2437           break;
2438         default: ShouldNotReachHere();
2439       }
2440     }
2441 
2442     //
2443     //  Fill large chunks
2444     //
2445     __ lsrw(cnt_words, count, 3 - shift); // number of words
2446     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
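         // e.g. 0xABABABAB -> 0xABABABABABABABAB, so the word fill below can
         // store eight bytes at a time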
2447     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2448     if (UseBlockZeroing) {
2449       Label non_block_zeroing, rest;
2450       // If the fill value is zero we can use the fast zero_words().
2451       __ cbnz(value, non_block_zeroing);
2452       __ mov(bz_base, to);
2453       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2454       address tpc = __ zero_words(bz_base, cnt_words);
2455       if (tpc == nullptr) {
2456         fatal("CodeCache is full at generate_fill");
2457       }
2458       __ b(rest);
2459       __ bind(non_block_zeroing);
2460       __ fill_words(to, cnt_words, value);
2461       __ bind(rest);
2462     } else {
2463       __ fill_words(to, cnt_words, value);
2464     }
2465 
2466     // Remaining count is less than 8 bytes. Fill it by a single store.
2467     // Note that the total length is no less than 8 bytes.
2468     if (t == T_BYTE || t == T_SHORT) {
2469       Label L_exit1;
2470       __ cbzw(count, L_exit1);
2471       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2472       __ str(value, Address(to, -8));    // overwrite some elements
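           // This store may overlap bytes already written above; that is
           // harmless because they hold the same replicated value.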
2473       __ bind(L_exit1);
2474       __ leave();
2475       __ ret(lr);
2476     }
2477 
2478     // Handle fills of less than 8 bytes.
2479     Label L_fill_2, L_fill_4, L_exit2;
2480     __ bind(L_fill_elements);
2481     switch (t) {
2482       case T_BYTE:
2483         __ tbz(count, 0, L_fill_2);
2484         __ strb(value, Address(__ post(to, 1)));
2485         __ bind(L_fill_2);
2486         __ tbz(count, 1, L_fill_4);
2487         __ strh(value, Address(__ post(to, 2)));
2488         __ bind(L_fill_4);
2489         __ tbz(count, 2, L_exit2);
2490         __ strw(value, Address(to));
2491         break;
2492       case T_SHORT:
2493         __ tbz(count, 0, L_fill_4);
2494         __ strh(value, Address(__ post(to, 2)));
2495         __ bind(L_fill_4);
2496         __ tbz(count, 1, L_exit2);
2497         __ strw(value, Address(to));
2498         break;
2499       case T_INT:
2500         __ cbzw(count, L_exit2);
2501         __ strw(value, Address(to));
2502         break;
2503       default: ShouldNotReachHere();
2504     }
2505     __ bind(L_exit2);
2506     __ leave();
2507     __ ret(lr);
2508     return start;
2509   }
2510 
2511   address generate_data_cache_writeback() {
2512     const Register line        = c_rarg0;  // address of line to write back
2513 
2514     __ align(CodeEntryAlignment);
2515 
2516     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2517 
2518     address start = __ pc();
2519     __ enter();
2520     __ cache_wb(Address(line, 0));
2521     __ leave();
2522     __ ret(lr);
2523 
2524     return start;
2525   }
2526 
2527   address generate_data_cache_writeback_sync() {
2528     const Register is_pre     = c_rarg0;  // pre or post sync
2529 
2530     __ align(CodeEntryAlignment);
2531 
2532     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2533 
2534     // pre wbsync is a no-op
2535     // post wbsync translates to a memory barrier
2536 
2537     Label skip;
2538     address start = __ pc();
2539     __ enter();
2540     __ cbnz(is_pre, skip);
2541     __ cache_wbsync(false);
2542     __ bind(skip);
2543     __ leave();
2544     __ ret(lr);
2545 
2546     return start;
2547   }
2548 
2549   void generate_arraycopy_stubs() {
2550     address entry;
2551     address entry_jbyte_arraycopy;
2552     address entry_jshort_arraycopy;
2553     address entry_jint_arraycopy;
2554     address entry_oop_arraycopy;
2555     address entry_jlong_arraycopy;
2556     address entry_checkcast_arraycopy;
2557 
2558     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards);
2559     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards);
2560 
2561     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards);
2562     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards);
2563 
2564     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards);
2565     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards);
2566 
2567     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2568 
2569     //*** jbyte
2570     // Always need aligned and unaligned versions
2571     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2572                                                                                   "jbyte_disjoint_arraycopy");
2573     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2574                                                                                   &entry_jbyte_arraycopy,
2575                                                                                   "jbyte_arraycopy");
2576     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2577                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2578     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, nullptr,
2579                                                                                   "arrayof_jbyte_arraycopy");
2580 
2581     //*** jshort
2582     // Always need aligned and unaligned versions
2583     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2584                                                                                     "jshort_disjoint_arraycopy");
2585     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2586                                                                                     &entry_jshort_arraycopy,
2587                                                                                     "jshort_arraycopy");
2588     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2589                                                                                     "arrayof_jshort_disjoint_arraycopy");
2590     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2591                                                                                     "arrayof_jshort_arraycopy");
2592 
2593     //*** jint
2594     // Aligned versions
2595     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2596                                                                                 "arrayof_jint_disjoint_arraycopy");
2597     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2598                                                                                 "arrayof_jint_arraycopy");
2599     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2600     // entry_jint_arraycopy always points to the unaligned version
2601     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2602                                                                                 "jint_disjoint_arraycopy");
2603     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2604                                                                                 &entry_jint_arraycopy,
2605                                                                                 "jint_arraycopy");
2606 
2607     //*** jlong
2608     // It is always aligned
2609     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2610                                                                                   "arrayof_jlong_disjoint_arraycopy");
2611     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2612                                                                                   "arrayof_jlong_arraycopy");
2613     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2614     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2615 
2616     //*** oops
2617     {
2618       // With compressed oops we need unaligned versions; notice that
2619       // we overwrite entry_oop_arraycopy.
2620       bool aligned = !UseCompressedOops;
2621 
2622       StubRoutines::_arrayof_oop_disjoint_arraycopy
2623         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2624                                      /*dest_uninitialized*/false);
2625       StubRoutines::_arrayof_oop_arraycopy
2626         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2627                                      /*dest_uninitialized*/false);
2628       // Aligned versions without pre-barriers
2629       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2630         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2631                                      /*dest_uninitialized*/true);
2632       StubRoutines::_arrayof_oop_arraycopy_uninit
2633         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2634                                      /*dest_uninitialized*/true);
2635     }
2636 
2637     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2638     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2639     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2640     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2641 
2642     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2643     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2644                                                                         /*dest_uninitialized*/true);
2645 
2646     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2647                                                               entry_jbyte_arraycopy,
2648                                                               entry_jshort_arraycopy,
2649                                                               entry_jint_arraycopy,
2650                                                               entry_jlong_arraycopy);
2651 
2652     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2653                                                                entry_jbyte_arraycopy,
2654                                                                entry_jshort_arraycopy,
2655                                                                entry_jint_arraycopy,
2656                                                                entry_oop_arraycopy,
2657                                                                entry_jlong_arraycopy,
2658                                                                entry_checkcast_arraycopy);
2659 
2660     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2661     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2662     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2663     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2664     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2665     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2666   }
2667 
2668   void generate_math_stubs() { Unimplemented(); }
2669 
2670   // Arguments:
2671   //
2672   // Inputs:
2673   //   c_rarg0   - source byte array address
2674   //   c_rarg1   - destination byte array address
2675   //   c_rarg2   - K (key) in little endian int array
2676   //
2677   address generate_aescrypt_encryptBlock() {
2678     __ align(CodeEntryAlignment);
2679     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2680 
2681     const Register from        = c_rarg0;  // source array address
2682     const Register to          = c_rarg1;  // destination array address
2683     const Register key         = c_rarg2;  // key array address
2684     const Register keylen      = rscratch1;
2685 
2686     address start = __ pc();
2687     __ enter();
2688 
2689     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2690 
2691     __ aesenc_loadkeys(key, keylen);
2692     __ aesecb_encrypt(from, to, keylen);
2693 
2694     __ mov(r0, 0);
2695 
2696     __ leave();
2697     __ ret(lr);
2698 
2699     return start;
2700   }
2701 
2702   // Arguments:
2703   //
2704   // Inputs:
2705   //   c_rarg0   - source byte array address
2706   //   c_rarg1   - destination byte array address
2707   //   c_rarg2   - K (key) in little endian int array
2708   //
2709   address generate_aescrypt_decryptBlock() {
2710     assert(UseAES, "need AES cryptographic extension support");
2711     __ align(CodeEntryAlignment);
2712     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2713     Label L_doLast;
2714 
2715     const Register from        = c_rarg0;  // source array address
2716     const Register to          = c_rarg1;  // destination array address
2717     const Register key         = c_rarg2;  // key array address
2718     const Register keylen      = rscratch1;
2719 
2720     address start = __ pc();
2721     __ enter(); // required for proper stackwalking of RuntimeStub frame
2722 
2723     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2724 
2725     __ aesecb_decrypt(from, to, key, keylen);
2726 
2727     __ mov(r0, 0);
2728 
2729     __ leave();
2730     __ ret(lr);
2731 
2732     return start;
2733   }
2734 
2735   // Arguments:
2736   //
2737   // Inputs:
2738   //   c_rarg0   - source byte array address
2739   //   c_rarg1   - destination byte array address
2740   //   c_rarg2   - K (key) in little endian int array
2741   //   c_rarg3   - r vector byte array address
2742   //   c_rarg4   - input length
2743   //
2744   // Output:
2745   //   r0        - input length
2746   //
2747   address generate_cipherBlockChaining_encryptAESCrypt() {
2748     assert(UseAES, "need AES cryptographic extension support");
2749     __ align(CodeEntryAlignment);
2750     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2751 
2752     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2753 
2754     const Register from        = c_rarg0;  // source array address
2755     const Register to          = c_rarg1;  // destination array address
2756     const Register key         = c_rarg2;  // key array address
2757     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address,
2758                                            // and left holding the last ciphertext block on exit
2759     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2760     const Register keylen      = rscratch1;
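     // The loop below implements standard CBC chaining: each plaintext block
     // is XORed with the previous ciphertext block (initially the IV from
     // rvec) before being encrypted.  A scalar reference sketch (illustrative
     // only; xor16/encrypt_block are hypothetical helpers):
     //
     //   for (int i = 0; i < len; i += 16) {
     //     xor16(block, in + i, iv);      // block = P_i ^ C_{i-1}
     //     encrypt_block(block, key);     // C_i = E_K(P_i ^ C_{i-1})
     //     memcpy(out + i, block, 16);
     //     memcpy(iv, block, 16);         // chain for the next block
     //   }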
2761 
2762     address start = __ pc();
2763 
2764       __ enter();
2765 
2766       __ movw(rscratch2, len_reg);
2767 
2768       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
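       // keylen is the expanded key length in 32-bit words: 44, 52 or 60 for
       // AES-128, AES-192 and AES-256 respectively, hence the comparison
       // against 52 below.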
2769 
2770       __ ld1(v0, __ T16B, rvec);
2771 
2772       __ cmpw(keylen, 52);
2773       __ br(Assembler::CC, L_loadkeys_44);
2774       __ br(Assembler::EQ, L_loadkeys_52);
2775 
2776       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2777       __ rev32(v17, __ T16B, v17);
2778       __ rev32(v18, __ T16B, v18);
2779     __ BIND(L_loadkeys_52);
2780       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2781       __ rev32(v19, __ T16B, v19);
2782       __ rev32(v20, __ T16B, v20);
2783     __ BIND(L_loadkeys_44);
2784       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2785       __ rev32(v21, __ T16B, v21);
2786       __ rev32(v22, __ T16B, v22);
2787       __ rev32(v23, __ T16B, v23);
2788       __ rev32(v24, __ T16B, v24);
2789       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2790       __ rev32(v25, __ T16B, v25);
2791       __ rev32(v26, __ T16B, v26);
2792       __ rev32(v27, __ T16B, v27);
2793       __ rev32(v28, __ T16B, v28);
2794       __ ld1(v29, v30, v31, __ T16B, key);
2795       __ rev32(v29, __ T16B, v29);
2796       __ rev32(v30, __ T16B, v30);
2797       __ rev32(v31, __ T16B, v31);
2798 
2799     __ BIND(L_aes_loop);
2800       __ ld1(v1, __ T16B, __ post(from, 16));
2801       __ eor(v0, __ T16B, v0, v1);
2802 
2803       __ br(Assembler::CC, L_rounds_44);
2804       __ br(Assembler::EQ, L_rounds_52);
2805 
2806       __ aese(v0, v17); __ aesmc(v0, v0);
2807       __ aese(v0, v18); __ aesmc(v0, v0);
2808     __ BIND(L_rounds_52);
2809       __ aese(v0, v19); __ aesmc(v0, v0);
2810       __ aese(v0, v20); __ aesmc(v0, v0);
2811     __ BIND(L_rounds_44);
2812       __ aese(v0, v21); __ aesmc(v0, v0);
2813       __ aese(v0, v22); __ aesmc(v0, v0);
2814       __ aese(v0, v23); __ aesmc(v0, v0);
2815       __ aese(v0, v24); __ aesmc(v0, v0);
2816       __ aese(v0, v25); __ aesmc(v0, v0);
2817       __ aese(v0, v26); __ aesmc(v0, v0);
2818       __ aese(v0, v27); __ aesmc(v0, v0);
2819       __ aese(v0, v28); __ aesmc(v0, v0);
2820       __ aese(v0, v29); __ aesmc(v0, v0);
2821       __ aese(v0, v30);
2822       __ eor(v0, __ T16B, v0, v31);
2823 
2824       __ st1(v0, __ T16B, __ post(to, 16));
2825 
2826       __ subw(len_reg, len_reg, 16);
2827       __ cbnzw(len_reg, L_aes_loop);
2828 
2829       __ st1(v0, __ T16B, rvec);
2830 
2831       __ mov(r0, rscratch2);
2832 
2833       __ leave();
2834       __ ret(lr);
2835 
2836     return start;
2837   }
2838 
2839   // Arguments:
2840   //
2841   // Inputs:
2842   //   c_rarg0   - source byte array address
2843   //   c_rarg1   - destination byte array address
2844   //   c_rarg2   - K (key) in little endian int array
2845   //   c_rarg3   - r vector byte array address
2846   //   c_rarg4   - input length
2847   //
2848   // Output:
2849   //   r0        - input length
2850   //
2851   address generate_cipherBlockChaining_decryptAESCrypt() {
2852     assert(UseAES, "need AES cryptographic extension support");
2853     __ align(CodeEntryAlignment);
2854     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2855 
2856     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2857 
2858     const Register from        = c_rarg0;  // source array address
2859     const Register to          = c_rarg1;  // destination array address
2860     const Register key         = c_rarg2;  // key array address
2861     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address,
2862                                            // and left holding the last input ciphertext block on exit
2863     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2864     const Register keylen      = rscratch1;
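     // CBC decryption inverts the chaining: each decrypted block is XORed
     // with the previous ciphertext block (initially the IV), and the current
     // ciphertext block becomes the next chaining value.  Reference sketch
     // (illustrative only; xor16/decrypt_block are hypothetical helpers):
     //
     //   for (int i = 0; i < len; i += 16) {
     //     decrypt_block(block, in + i, key);  // block = D_K(C_i)
     //     xor16(out + i, block, iv);          // P_i = D_K(C_i) ^ C_{i-1}
     //     memcpy(iv, in + i, 16);             // C_i becomes the next IV
     //   }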
2865 
2866     address start = __ pc();
2867 
2868       __ enter();
2869 
2870       __ movw(rscratch2, len_reg);
2871 
2872       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2873 
2874       __ ld1(v2, __ T16B, rvec);
2875 
2876       __ ld1(v31, __ T16B, __ post(key, 16));
2877       __ rev32(v31, __ T16B, v31);
2878 
2879       __ cmpw(keylen, 52);
2880       __ br(Assembler::CC, L_loadkeys_44);
2881       __ br(Assembler::EQ, L_loadkeys_52);
2882 
2883       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2884       __ rev32(v17, __ T16B, v17);
2885       __ rev32(v18, __ T16B, v18);
2886     __ BIND(L_loadkeys_52);
2887       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2888       __ rev32(v19, __ T16B, v19);
2889       __ rev32(v20, __ T16B, v20);
2890     __ BIND(L_loadkeys_44);
2891       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2892       __ rev32(v21, __ T16B, v21);
2893       __ rev32(v22, __ T16B, v22);
2894       __ rev32(v23, __ T16B, v23);
2895       __ rev32(v24, __ T16B, v24);
2896       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2897       __ rev32(v25, __ T16B, v25);
2898       __ rev32(v26, __ T16B, v26);
2899       __ rev32(v27, __ T16B, v27);
2900       __ rev32(v28, __ T16B, v28);
2901       __ ld1(v29, v30, __ T16B, key);
2902       __ rev32(v29, __ T16B, v29);
2903       __ rev32(v30, __ T16B, v30);
2904 
2905     __ BIND(L_aes_loop);
2906       __ ld1(v0, __ T16B, __ post(from, 16));
2907       __ orr(v1, __ T16B, v0, v0);
2908 
2909       __ br(Assembler::CC, L_rounds_44);
2910       __ br(Assembler::EQ, L_rounds_52);
2911 
2912       __ aesd(v0, v17); __ aesimc(v0, v0);
2913       __ aesd(v0, v18); __ aesimc(v0, v0);
2914     __ BIND(L_rounds_52);
2915       __ aesd(v0, v19); __ aesimc(v0, v0);
2916       __ aesd(v0, v20); __ aesimc(v0, v0);
2917     __ BIND(L_rounds_44);
2918       __ aesd(v0, v21); __ aesimc(v0, v0);
2919       __ aesd(v0, v22); __ aesimc(v0, v0);
2920       __ aesd(v0, v23); __ aesimc(v0, v0);
2921       __ aesd(v0, v24); __ aesimc(v0, v0);
2922       __ aesd(v0, v25); __ aesimc(v0, v0);
2923       __ aesd(v0, v26); __ aesimc(v0, v0);
2924       __ aesd(v0, v27); __ aesimc(v0, v0);
2925       __ aesd(v0, v28); __ aesimc(v0, v0);
2926       __ aesd(v0, v29); __ aesimc(v0, v0);
2927       __ aesd(v0, v30);
2928       __ eor(v0, __ T16B, v0, v31);
2929       __ eor(v0, __ T16B, v0, v2);
2930 
2931       __ st1(v0, __ T16B, __ post(to, 16));
2932       __ orr(v2, __ T16B, v1, v1);
2933 
2934       __ subw(len_reg, len_reg, 16);
2935       __ cbnzw(len_reg, L_aes_loop);
2936 
2937       __ st1(v2, __ T16B, rvec);
2938 
2939       __ mov(r0, rscratch2);
2940 
2941       __ leave();
2942       __ ret(lr);
2943 
2944     return start;
2945   }
2946 
2947   // CTR AES crypt.
2948   // Arguments:
2949   //
2950   // Inputs:
2951   //   c_rarg0   - source byte array address
2952   //   c_rarg1   - destination byte array address
2953   //   c_rarg2   - K (key) in little endian int array
2954   //   c_rarg3   - counter vector byte array address
2955   //   c_rarg4   - input length
2956   //   c_rarg5   - saved encryptedCounter start
2957   //   c_rarg6   - saved used length
2958   //
2959   // Output:
2960   //   r0       - input length
2961   //
2962   address generate_counterMode_AESCrypt() {
2963     const Register in = c_rarg0;
2964     const Register out = c_rarg1;
2965     const Register key = c_rarg2;
2966     const Register counter = c_rarg3;
2967     const Register saved_len = c_rarg4, len = r10;
2968     const Register saved_encrypted_ctr = c_rarg5;
2969     const Register used_ptr = c_rarg6, used = r12;
2970 
2971     const Register offset = r7;
2972     const Register keylen = r11;
2973 
2974     const unsigned char block_size = 16;
2975     const int bulk_width = 4;
2976     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
2977     // performance with larger data sizes, but it also means that the
2978     // fast path isn't used until you have at least 8 blocks, and up
2979     // to 127 bytes of data will be processed on the slow path. For
2980     // that reason, and also so as not to blow away too much icache, 4
2981     // blocks seems like a sensible compromise.
2982 
2983     // Algorithm:
2984     //
2985     //    if (len == 0) {
2986     //        goto DONE;
2987     //    }
2988     //    int result = len;
2989     //    do {
2990     //        if (used >= blockSize) {
2991     //            if (len >= bulk_width * blockSize) {
2992     //                CTR_large_block();
2993     //                if (len == 0)
2994     //                    goto DONE;
2995     //            }
2996     //            for (;;) {
2997     //                16ByteVector v0 = counter;
2998     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2999     //                used = 0;
3000     //                if (len < blockSize)
3001     //                    break;    /* goto NEXT */
3002     //                16ByteVector v1 = load16Bytes(in, offset);
3003     //                v1 = v1 ^ encryptedCounter;
3004     //                store16Bytes(v1, out, offset);
3005     //                used = blockSize;
3006     //                offset += blockSize;
3007     //                len -= blockSize;
3008     //                if (len == 0)
3009     //                    goto DONE;
3010     //            }
3011     //        }
3012     //      NEXT:
3013     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3014     //        len--;
3015     //    } while (len != 0);
3016     //  DONE:
3017     //    return result;
3018     //
3019     // CTR_large_block()
3020     //    Wide bulk encryption of whole blocks.
3021 
3022     __ align(CodeEntryAlignment);
3023     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3024     const address start = __ pc();
3025     __ enter();
3026 
3027     Label DONE, CTR_large_block, large_block_return;
3028     __ ldrw(used, Address(used_ptr));
3029     __ cbzw(saved_len, DONE);
3030 
3031     __ mov(len, saved_len);
3032     __ mov(offset, 0);
3033 
3034     // Compute #rounds for AES based on the length of the key array
3035     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3036 
3037     __ aesenc_loadkeys(key, keylen);
3038 
3039     {
3040       Label L_CTR_loop, NEXT;
3041 
3042       __ bind(L_CTR_loop);
3043 
3044       __ cmp(used, block_size);
3045       __ br(__ LO, NEXT);
3046 
3047       // Maybe we have a lot of data
3048       __ subsw(rscratch1, len, bulk_width * block_size);
3049       __ br(__ HS, CTR_large_block);
3050       __ BIND(large_block_return);
3051       __ cbzw(len, DONE);
3052 
3053       // Setup the counter
3054       __ movi(v4, __ T4S, 0);
3055       __ movi(v5, __ T4S, 1);
3056       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
3057 
3058       __ ld1(v0, __ T16B, counter); // Load the counter into v0
3059       __ rev32(v16, __ T16B, v0);
3060       __ addv(v16, __ T4S, v16, v4);
3061       __ rev32(v16, __ T16B, v16);
3062       __ st1(v16, __ T16B, counter); // Save the incremented counter back
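       // The counter bytes are big-endian in memory while addv adds on
       // little-endian 32-bit lanes, hence the rev32/addv/rev32 sandwich
       // above.  Conceptually (reference only; load_be32/store_be32 are
       // hypothetical big-endian accessors, and the last 32-bit word wraps
       // without carrying into the rest of the counter):
       //
       //   uint32_t w = load_be32(counter + 12);   // last 32-bit word
       //   store_be32(counter + 12, w + 1);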
3063 
3064       {
3065         // We have fewer than bulk_width blocks of data left. Encrypt
3066         // them one by one until there is less than a full block
3067         // remaining, being careful to save both the encrypted counter
3068         // and the counter.
3069 
3070         Label inner_loop;
3071         __ bind(inner_loop);
3072         // Counter to encrypt is in v0
3073         __ aesecb_encrypt(noreg, noreg, keylen);
3074         __ st1(v0, __ T16B, saved_encrypted_ctr);
3075 
3076         // Do we have a remaining full block?
3077 
3078         __ mov(used, 0);
3079         __ cmp(len, block_size);
3080         __ br(__ LO, NEXT);
3081 
3082         // Yes, we have a full block
3083         __ ldrq(v1, Address(in, offset));
3084         __ eor(v1, __ T16B, v1, v0);
3085         __ strq(v1, Address(out, offset));
3086         __ mov(used, block_size);
3087         __ add(offset, offset, block_size);
3088 
3089         __ subw(len, len, block_size);
3090         __ cbzw(len, DONE);
3091 
3092         // Increment the counter, store it back
3093         __ orr(v0, __ T16B, v16, v16);
3094         __ rev32(v16, __ T16B, v16);
3095         __ addv(v16, __ T4S, v16, v4);
3096         __ rev32(v16, __ T16B, v16);
3097         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3098 
3099         __ b(inner_loop);
3100       }
3101 
3102       __ BIND(NEXT);
3103 
3104       // Encrypt a single byte, and loop.
3105       // We expect this to be a rare event.
3106       __ ldrb(rscratch1, Address(in, offset));
3107       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3108       __ eor(rscratch1, rscratch1, rscratch2);
3109       __ strb(rscratch1, Address(out, offset));
3110       __ add(offset, offset, 1);
3111       __ add(used, used, 1);
3112       __ subw(len, len, 1);
3113       __ cbnzw(len, L_CTR_loop);
3114     }
3115 
3116     __ bind(DONE);
3117     __ strw(used, Address(used_ptr));
3118     __ mov(r0, saved_len);
3119 
3120     __ leave(); // required for proper stackwalking of RuntimeStub frame
3121     __ ret(lr);
3122 
3123     // Bulk encryption
3124 
3125     __ BIND(CTR_large_block);
3126     assert(bulk_width == 4 || bulk_width == 8, "must be");
3127 
3128     if (bulk_width == 8) {
3129       __ sub(sp, sp, 4 * 16);
3130       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3131     }
3132     __ sub(sp, sp, 4 * 16);
3133     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3134     RegSet saved_regs = (RegSet::of(in, out, offset)
3135                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3136     __ push(saved_regs, sp);
3137     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3138     __ add(in, in, offset);
3139     __ add(out, out, offset);
3140 
3141     // Keys should already be loaded into the correct registers
3142 
3143     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3144     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3145 
3146     // AES/CTR loop
3147     {
3148       Label L_CTR_loop;
3149       __ BIND(L_CTR_loop);
3150 
3151       // Setup the counters
3152       __ movi(v8, __ T4S, 0);
3153       __ movi(v9, __ T4S, 1);
3154       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3155 
3156       for (int i = 0; i < bulk_width; i++) {
3157         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3158         __ rev32(v0_ofs, __ T16B, v16);
3159         __ addv(v16, __ T4S, v16, v8);
3160       }
3161 
3162       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3163 
3164       // Encrypt the counters
3165       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3166 
3167       if (bulk_width == 8) {
3168         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3169       }
3170 
3171       // XOR the encrypted counters with the inputs
3172       for (int i = 0; i < bulk_width; i++) {
3173         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3174         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3175         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3176       }
3177 
3178       // Write the encrypted data
3179       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3180       if (bulk_width == 8) {
3181         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3182       }
3183 
3184       __ subw(len, len, 16 * bulk_width);
3185       __ cbnzw(len, L_CTR_loop);
3186     }
3187 
3188     // Save the counter back where it goes
3189     __ rev32(v16, __ T16B, v16);
3190     __ st1(v16, __ T16B, counter);
3191 
3192     __ pop(saved_regs, sp);
3193 
3194     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3195     if (bulk_width == 8) {
3196       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3197     }
3198 
3199     __ andr(rscratch1, len, -16 * bulk_width);
3200     __ sub(len, len, rscratch1);
3201     __ add(offset, offset, rscratch1);
3202     __ mov(used, 16);
3203     __ strw(used, Address(used_ptr));
3204     __ b(large_block_return);
3205 
3206     return start;
3207   }
3208 
3209   // Vector AES Galois Counter Mode implementation. Parameters:
3210   //
3211   // in = c_rarg0
3212   // len = c_rarg1
3213   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3214   // out = c_rarg3
3215   // key = c_rarg4
3216   // state = c_rarg5 - GHASH.state
3217   // subkeyHtbl = c_rarg6 - powers of H
3218   // counter = c_rarg7 - 16 bytes of CTR
3219   // return - number of processed bytes
3220   address generate_galoisCounterMode_AESCrypt() {
3221     address ghash_polynomial = __ pc();
3222     __ emit_int64(0x87);  // The low-order bits of the GHASH field
3223                           // polynomial z^128 + z^7 + z^2 + z + 1
3224                           // (i.e. 0x87), repeated in the low and high
3225                           // halves of a 128-bit vector
3226     __ emit_int64(0x87);
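     // Reduction idea (conceptual; the real code also has to handle GHASH's
     // bit-reflected bit order): a 256-bit carry-less product (hi:lo) is
     // folded back into 128 bits as lo ^ clmul(hi, 0x87), folded once more
     // for the 7-bit carry-out, because x^128 == x^7 + x^2 + x + 1 (mod p).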
3227 
3228     __ align(CodeEntryAlignment);
3229     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3230     address start = __ pc();
3231     __ enter();
3232 
3233     const Register in = c_rarg0;
3234     const Register len = c_rarg1;
3235     const Register ct = c_rarg2;
3236     const Register out = c_rarg3;
3237     // (the counter in c_rarg7 is updated with the incremented counter value at the end)
3238 
3239     const Register key = c_rarg4;
3240     const Register state = c_rarg5;
3241 
3242     const Register subkeyHtbl = c_rarg6;
3243 
3244     const Register counter = c_rarg7;
3245 
3246     const Register keylen = r10;
3247     // Save state before entering routine
3248     __ sub(sp, sp, 4 * 16);
3249     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3250     __ sub(sp, sp, 4 * 16);
3251     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3252 
3253     // __ andr(len, len, -512);
3254     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3255     __ str(len, __ pre(sp, -2 * wordSize));
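     // Only whole multiples of 128 bytes (8 AES blocks) are handled here; the
     // masked length is saved on the stack, reused below as the GHASH byte
     // count, and returned at DONE as the number of bytes processed.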
3256 
3257     Label DONE;
3258     __ cbz(len, DONE);
3259 
3260     // Compute #rounds for AES based on the length of the key array
3261     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3262 
3263     __ aesenc_loadkeys(key, keylen);
3264     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3265     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3266 
3267     // AES/CTR loop
3268     {
3269       Label L_CTR_loop;
3270       __ BIND(L_CTR_loop);
3271 
3272       // Setup the counters
3273       __ movi(v8, __ T4S, 0);
3274       __ movi(v9, __ T4S, 1);
3275       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3276 
3277       assert(v0->encoding() < v8->encoding(), "counter registers v0..v7 must precede v8");
3278       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3279         FloatRegister f = as_FloatRegister(i);
3280         __ rev32(f, __ T16B, v16);
3281         __ addv(v16, __ T4S, v16, v8);
3282       }
3283 
3284       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3285 
3286       // Encrypt the counters
3287       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3288 
3289       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3290 
3291       // XOR the encrypted counters with the inputs
3292       for (int i = 0; i < 8; i++) {
3293         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3294         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3295         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3296       }
3297       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3298       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3299 
3300       __ subw(len, len, 16 * 8);
3301       __ cbnzw(len, L_CTR_loop);
3302     }
3303 
3304     __ rev32(v16, __ T16B, v16);
3305     __ st1(v16, __ T16B, counter);
3306 
3307     __ ldr(len, Address(sp));
3308     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3309 
3310     // GHASH/CTR loop
3311     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3312                                 len, /*unrolls*/4);
3313 
3314 #ifdef ASSERT
3315     { Label L;
3316       __ cmp(len, (unsigned char)0);
3317       __ br(Assembler::EQ, L);
3318       __ stop("stubGenerator: abort");
3319       __ bind(L);
3320     }
3321 #endif
3322
3323     __ bind(DONE);
3324     // Return the number of bytes processed
3325     __ ldr(r0, __ post(sp, 2 * wordSize));
3326 
3327     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3328     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3329 
3330     __ leave(); // required for proper stackwalking of RuntimeStub frame
3331     __ ret(lr);
3332     return start;
3333   }
3334 
3335   class Cached64Bytes {
3336   private:
3337     MacroAssembler *_masm;
3338     Register _regs[8];
3339 
3340   public:
3341     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3342       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3343       auto it = rs.begin();
3344       for (auto &r: _regs) {
3345         r = *it;
3346         ++it;
3347       }
3348     }
3349 
3350     void gen_loads(Register base) {
3351       for (int i = 0; i < 8; i += 2) {
3352         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3353       }
3354     }
3355 
3356     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
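     // Example: with the 64-byte block cached in eight 64-bit registers by
     // gen_loads (little-endian layout), extract_u32(dest, 5) becomes
     // ubfx(dest, _regs[2], 32, 32), i.e. word 5 is the upper half of the
     // third register.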
3357     void extract_u32(Register dest, int i) {
3358       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3359     }
3360   };
3361 
3362   // Utility routines for md5.
3363   // Clobbers r10 and r11.
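   // Each helper performs one MD5 step:
   //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + X[k] + t, s)
   // where f is the round-specific boolean function (the assembly uses
   // algebraically equivalent forms):
   //   FF: f = (b & c) | (~b & d)  ==  ((c ^ d) & b) ^ d
   //   GG: f = (b & d) | (c & ~d)
   //   HH: f = b ^ c ^ d
   //   II: f = c ^ (b | ~d)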
3364   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3365               int k, int s, int t) {
3366     Register rscratch3 = r10;
3367     Register rscratch4 = r11;
3368 
3369     __ eorw(rscratch3, r3, r4);
3370     __ movw(rscratch2, t);
3371     __ andw(rscratch3, rscratch3, r2);
3372     __ addw(rscratch4, r1, rscratch2);
3373     reg_cache.extract_u32(rscratch1, k);
3374     __ eorw(rscratch3, rscratch3, r4);
3375     __ addw(rscratch4, rscratch4, rscratch1);
3376     __ addw(rscratch3, rscratch3, rscratch4);
3377     __ rorw(rscratch2, rscratch3, 32 - s);
3378     __ addw(r1, rscratch2, r2);
3379   }
3380 
3381   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3382               int k, int s, int t) {
3383     Register rscratch3 = r10;
3384     Register rscratch4 = r11;
3385 
3386     __ andw(rscratch3, r2, r4);
3387     __ bicw(rscratch4, r3, r4);
3388     reg_cache.extract_u32(rscratch1, k);
3389     __ movw(rscratch2, t);
3390     __ orrw(rscratch3, rscratch3, rscratch4);
3391     __ addw(rscratch4, r1, rscratch2);
3392     __ addw(rscratch4, rscratch4, rscratch1);
3393     __ addw(rscratch3, rscratch3, rscratch4);
3394     __ rorw(rscratch2, rscratch3, 32 - s);
3395     __ addw(r1, rscratch2, r2);
3396   }
3397 
3398   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3399               int k, int s, int t) {
3400     Register rscratch3 = r10;
3401     Register rscratch4 = r11;
3402 
3403     __ eorw(rscratch3, r3, r4);
3404     __ movw(rscratch2, t);
3405     __ addw(rscratch4, r1, rscratch2);
3406     reg_cache.extract_u32(rscratch1, k);
3407     __ eorw(rscratch3, rscratch3, r2);
3408     __ addw(rscratch4, rscratch4, rscratch1);
3409     __ addw(rscratch3, rscratch3, rscratch4);
3410     __ rorw(rscratch2, rscratch3, 32 - s);
3411     __ addw(r1, rscratch2, r2);
3412   }
3413 
3414   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3415               int k, int s, int t) {
3416     Register rscratch3 = r10;
3417     Register rscratch4 = r11;
3418 
3419     __ movw(rscratch3, t);
3420     __ ornw(rscratch2, r2, r4);
3421     __ addw(rscratch4, r1, rscratch3);
3422     reg_cache.extract_u32(rscratch1, k);
3423     __ eorw(rscratch3, rscratch2, r3);
3424     __ addw(rscratch4, rscratch4, rscratch1);
3425     __ addw(rscratch3, rscratch3, rscratch4);
3426     __ rorw(rscratch2, rscratch3, 32 - s);
3427     __ addw(r1, rscratch2, r2);
3428   }
3429 
3430   // Arguments:
3431   //
3432   // Inputs:
3433   //   c_rarg0   - byte[]  source+offset
3434   //   c_rarg1   - int[]   MD5 state
3435   //   c_rarg2   - int     offset
3436   //   c_rarg3   - int     limit
3437   //
3438   address generate_md5_implCompress(bool multi_block, const char *name) {
3439     __ align(CodeEntryAlignment);
3440     StubCodeMark mark(this, "StubRoutines", name);
3441     address start = __ pc();
3442 
3443     Register buf       = c_rarg0;
3444     Register state     = c_rarg1;
3445     Register ofs       = c_rarg2;
3446     Register limit     = c_rarg3;
3447     Register a         = r4;
3448     Register b         = r5;
3449     Register c         = r6;
3450     Register d         = r7;
3451     Register rscratch3 = r10;
3452     Register rscratch4 = r11;
3453 
3454     Register state_regs[2] = { r12, r13 };
3455     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3456     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3457 
3458     __ push(saved_regs, sp);
3459 
3460     __ ldp(state_regs[0], state_regs[1], Address(state));
3461     __ ubfx(a, state_regs[0],  0, 32);
3462     __ ubfx(b, state_regs[0], 32, 32);
3463     __ ubfx(c, state_regs[1],  0, 32);
3464     __ ubfx(d, state_regs[1], 32, 32);
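     // state_regs[0] packs (a, b) as its low/high 32-bit halves and
     // state_regs[1] packs (c, d); ubfx unpacks them into working registers.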
3465 
3466     Label md5_loop;
3467     __ BIND(md5_loop);
3468 
3469     reg_cache.gen_loads(buf);
3470 
3471     // Round 1
3472     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3473     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3474     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3475     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3476     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3477     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3478     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3479     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3480     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3481     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3482     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3483     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3484     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3485     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3486     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3487     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3488 
3489     // Round 2
3490     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3491     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3492     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3493     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3494     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3495     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3496     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3497     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3498     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3499     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3500     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3501     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3502     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3503     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3504     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3505     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3506 
3507     // Round 3
3508     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3509     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3510     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3511     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3512     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3513     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3514     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3515     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3516     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3517     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3518     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3519     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3520     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3521     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3522     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3523     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3524 
3525     // Round 4
3526     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3527     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3528     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3529     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3530     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3531     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3532     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3533     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3534     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3535     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3536     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3537     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3538     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3539     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3540     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3541     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3542 
3543     __ addw(a, state_regs[0], a);
3544     __ ubfx(rscratch2, state_regs[0], 32, 32);
3545     __ addw(b, rscratch2, b);
3546     __ addw(c, state_regs[1], c);
3547     __ ubfx(rscratch4, state_regs[1], 32, 32);
3548     __ addw(d, rscratch4, d);
3549 
3550     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3551     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3552 
3553     if (multi_block) {
3554       __ add(buf, buf, 64);
3555       __ add(ofs, ofs, 64);
3556       __ cmp(ofs, limit);
3557       __ br(Assembler::LE, md5_loop);
3558       __ mov(c_rarg0, ofs); // return ofs
3559     }
3560 
3561     // write hash values back in the correct order
3562     __ stp(state_regs[0], state_regs[1], Address(state));
3563 
3564     __ pop(saved_regs, sp);
3565 
3566     __ ret(lr);
3567 
3568     return start;
3569   }
3570 
3571   // Arguments:
3572   //
3573   // Inputs:
3574   //   c_rarg0   - byte[]  source+offset
3575   //   c_rarg1   - int[]   SHA.state
3576   //   c_rarg2   - int     offset
3577   //   c_rarg3   - int     limit
3578   //
3579   address generate_sha1_implCompress(bool multi_block, const char *name) {
3580     __ align(CodeEntryAlignment);
3581     StubCodeMark mark(this, "StubRoutines", name);
3582     address start = __ pc();
3583 
3584     Register buf   = c_rarg0;
3585     Register state = c_rarg1;
3586     Register ofs   = c_rarg2;
3587     Register limit = c_rarg3;
3588 
3589     Label keys;
3590     Label sha1_loop;
3591 
3592     // load the keys into v0..v3
3593     __ adr(rscratch1, keys);
3594     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3595     // load 5 words state into v6, v7
3596     __ ldrq(v6, Address(state, 0));
3597     __ ldrs(v7, Address(state, 16));
3598 
3599 
3600     __ BIND(sha1_loop);
3601     // load 64 bytes of data into v16..v19
3602     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3603     __ rev32(v16, __ T16B, v16);
3604     __ rev32(v17, __ T16B, v17);
3605     __ rev32(v18, __ T16B, v18);
3606     __ rev32(v19, __ T16B, v19);
3607 
3608     // do the sha1
3609     __ addv(v4, __ T4S, v16, v0);
3610     __ orr(v20, __ T16B, v6, v6);
3611 
3612     FloatRegister d0 = v16;
3613     FloatRegister d1 = v17;
3614     FloatRegister d2 = v18;
3615     FloatRegister d3 = v19;
3616 
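     // Each loop iteration covers 4 of SHA-1's 80 rounds.  The hardware round
     // instructions select the boolean function per 20-round group:
     //   rounds  0..19: Ch     -> sha1c
     //   rounds 20..39: Parity -> sha1p
     //   rounds 40..59: Maj    -> sha1m
     //   rounds 60..79: Parity -> sha1p
     // Rotating d0..d3 each iteration slides the 16-word schedule window.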
3617     for (int round = 0; round < 20; round++) {
3618       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3619       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3620       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3621       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3622       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3623 
3624       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3625       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3626       __ sha1h(tmp2, __ T4S, v20);
3627       if (round < 5)
3628         __ sha1c(v20, __ T4S, tmp3, tmp4);
3629       else if (round < 10 || round >= 15)
3630         __ sha1p(v20, __ T4S, tmp3, tmp4);
3631       else
3632         __ sha1m(v20, __ T4S, tmp3, tmp4);
3633       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3634 
3635       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3636     }
3637 
3638     __ addv(v7, __ T2S, v7, v21);
3639     __ addv(v6, __ T4S, v6, v20);
3640 
3641     if (multi_block) {
3642       __ add(ofs, ofs, 64);
3643       __ cmp(ofs, limit);
3644       __ br(Assembler::LE, sha1_loop);
3645       __ mov(c_rarg0, ofs); // return ofs
3646     }
3647 
3648     __ strq(v6, Address(state, 0));
3649     __ strs(v7, Address(state, 16));
3650 
3651     __ ret(lr);
3652 
3653     __ bind(keys);
3654     __ emit_int32(0x5a827999);
3655     __ emit_int32(0x6ed9eba1);
3656     __ emit_int32(0x8f1bbcdc);
3657     __ emit_int32(0xca62c1d6);
3658 
3659     return start;
3660   }
3661 
3662 
3663   // Arguments:
3664   //
3665   // Inputs:
3666   //   c_rarg0   - byte[]  source+offset
3667   //   c_rarg1   - int[]   SHA.state
3668   //   c_rarg2   - int     offset
3669   //   c_rarg3   - int     limit
3670   //
3671   address generate_sha256_implCompress(bool multi_block, const char *name) {
3672     static const uint32_t round_consts[64] = {
3673       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3674       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3675       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3676       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3677       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3678       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3679       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3680       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3681       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3682       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3683       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3684       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3685       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3686       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3687       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3688       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3689     };
3690     __ align(CodeEntryAlignment);
3691     StubCodeMark mark(this, "StubRoutines", name);
3692     address start = __ pc();
3693 
3694     Register buf   = c_rarg0;
3695     Register state = c_rarg1;
3696     Register ofs   = c_rarg2;
3697     Register limit = c_rarg3;
3698 
3699     Label sha256_loop;
3700 
3701     __ stpd(v8, v9, __ pre(sp, -32));
3702     __ stpd(v10, v11, Address(sp, 16));
3703 
3704 // dga == v0
3705 // dgb == v1
3706 // dg0 == v2
3707 // dg1 == v3
3708 // dg2 == v4
3709 // t0 == v6
3710 // t1 == v7
3711 
3712     // load 16 keys to v16..v31
3713     __ lea(rscratch1, ExternalAddress((address)round_consts));
3714     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3715     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3716     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3717     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3718 
3719     // load 8 words (256 bits) state
3720     __ ldpq(v0, v1, state);
3721 
3722     __ BIND(sha256_loop);
3723     // load 64 bytes of data into v8..v11
3724     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3725     __ rev32(v8, __ T16B, v8);
3726     __ rev32(v9, __ T16B, v9);
3727     __ rev32(v10, __ T16B, v10);
3728     __ rev32(v11, __ T16B, v11);
3729 
3730     __ addv(v6, __ T4S, v8, v16);
3731     __ orr(v2, __ T16B, v0, v0);
3732     __ orr(v3, __ T16B, v1, v1);
3733 
3734     FloatRegister d0 = v8;
3735     FloatRegister d1 = v9;
3736     FloatRegister d2 = v10;
3737     FloatRegister d3 = v11;
3738 
3739 
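     // Each loop iteration consumes 4 of the 64 rounds: sha256h/sha256h2
     // update the two state halves, sha256su0/sha256su1 extend the message
     // schedule for iterations 0..11, and the addv precomputes the next
     // iteration's schedule-word-plus-round-constant input.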
3740     for (int round = 0; round < 16; round++) {
3741       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3742       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3743       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3744       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3745 
3746       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3747        __ orr(v4, __ T16B, v2, v2);
3748       if (round < 15)
3749         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3750       __ sha256h(v2, __ T4S, v3, tmp2);
3751       __ sha256h2(v3, __ T4S, v4, tmp2);
3752       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3753 
3754       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3755     }
3756 
3757     __ addv(v0, __ T4S, v0, v2);
3758     __ addv(v1, __ T4S, v1, v3);
3759 
3760     if (multi_block) {
3761       __ add(ofs, ofs, 64);
3762       __ cmp(ofs, limit);
3763       __ br(Assembler::LE, sha256_loop);
3764       __ mov(c_rarg0, ofs); // return ofs
3765     }
3766 
3767     __ ldpd(v10, v11, Address(sp, 16));
3768     __ ldpd(v8, v9, __ post(sp, 32));
3769 
3770     __ stpq(v0, v1, state);
3771 
3772     __ ret(lr);
3773 
3774     return start;
3775   }
3776 
3777   // Double rounds for sha512.
3778   void sha512_dround(int dr,
3779                      FloatRegister vi0, FloatRegister vi1,
3780                      FloatRegister vi2, FloatRegister vi3,
3781                      FloatRegister vi4, FloatRegister vrc0,
3782                      FloatRegister vrc1, FloatRegister vin0,
3783                      FloatRegister vin1, FloatRegister vin2,
3784                      FloatRegister vin3, FloatRegister vin4) {
3785       if (dr < 36) {
3786         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3787       }
3788       __ addv(v5, __ T2D, vrc0, vin0);
3789       __ ext(v6, __ T16B, vi2, vi3, 8);
3790       __ ext(v5, __ T16B, v5, v5, 8);
3791       __ ext(v7, __ T16B, vi1, vi2, 8);
3792       __ addv(vi3, __ T2D, vi3, v5);
3793       if (dr < 32) {
3794         __ ext(v5, __ T16B, vin3, vin4, 8);
3795         __ sha512su0(vin0, __ T2D, vin1);
3796       }
3797       __ sha512h(vi3, __ T2D, v6, v7);
3798       if (dr < 32) {
3799         __ sha512su1(vin0, __ T2D, vin2, v5);
3800       }
3801       __ addv(vi4, __ T2D, vi1, vi3);
3802       __ sha512h2(vi3, __ T2D, vi1, vi0);
3803   }
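   // Each call processes 2 of SHA-512's 80 rounds (40 double rounds per
   // block): sha512h/sha512h2 update the state, sha512su0/sha512su1 extend
   // the message schedule while dr < 32, and the next round constant is
   // prefetched while dr < 36.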
3804 
3805   // Arguments:
3806   //
3807   // Inputs:
3808   //   c_rarg0   - byte[]  source+offset
3809   //   c_rarg1   - int[]   SHA.state
3810   //   c_rarg2   - int     offset
3811   //   c_rarg3   - int     limit
3812   //
3813   address generate_sha512_implCompress(bool multi_block, const char *name) {
3814     static const uint64_t round_consts[80] = {
3815       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3816       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3817       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3818       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3819       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3820       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3821       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3822       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3823       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3824       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3825       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3826       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3827       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3828       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3829       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3830       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3831       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3832       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3833       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3834       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3835       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3836       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3837       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3838       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3839       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3840       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3841       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3842     };
3843 
3844     __ align(CodeEntryAlignment);
3845     StubCodeMark mark(this, "StubRoutines", name);
3846     address start = __ pc();
3847 
3848     Register buf   = c_rarg0;
3849     Register state = c_rarg1;
3850     Register ofs   = c_rarg2;
3851     Register limit = c_rarg3;
3852 
3853     __ stpd(v8, v9, __ pre(sp, -64));
3854     __ stpd(v10, v11, Address(sp, 16));
3855     __ stpd(v12, v13, Address(sp, 32));
3856     __ stpd(v14, v15, Address(sp, 48));
3857 
3858     Label sha512_loop;
3859 
3860     // load state
3861     __ ld1(v8, v9, v10, v11, __ T2D, state);
3862 
3863     // load first 4 round constants
3864     __ lea(rscratch1, ExternalAddress((address)round_consts));
3865     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3866 
3867     __ BIND(sha512_loop);
3868     // load 128B of data into v12..v19
3869     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3870     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3871     __ rev64(v12, __ T16B, v12);
3872     __ rev64(v13, __ T16B, v13);
3873     __ rev64(v14, __ T16B, v14);
3874     __ rev64(v15, __ T16B, v15);
3875     __ rev64(v16, __ T16B, v16);
3876     __ rev64(v17, __ T16B, v17);
3877     __ rev64(v18, __ T16B, v18);
3878     __ rev64(v19, __ T16B, v19);
3879 
3880     __ mov(rscratch2, rscratch1);
3881 
3882     __ mov(v0, __ T16B, v8);
3883     __ mov(v1, __ T16B, v9);
3884     __ mov(v2, __ T16B, v10);
3885     __ mov(v3, __ T16B, v11);
3886 
3887     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3888     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3889     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3890     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3891     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3892     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3893     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3894     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3895     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3896     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3897     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3898     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3899     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3900     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3901     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3902     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3903     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3904     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3905     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3906     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3907     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3908     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3909     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3910     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3911     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3912     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3913     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3914     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3915     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3916     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3917     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3918     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3919     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3920     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3921     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3922     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3923     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3924     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3925     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3926     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3927 
3928     __ addv(v8, __ T2D, v8, v0);
3929     __ addv(v9, __ T2D, v9, v1);
3930     __ addv(v10, __ T2D, v10, v2);
3931     __ addv(v11, __ T2D, v11, v3);
3932 
3933     if (multi_block) {
3934       __ add(ofs, ofs, 128);
3935       __ cmp(ofs, limit);
3936       __ br(Assembler::LE, sha512_loop);
3937       __ mov(c_rarg0, ofs); // return ofs
3938     }
3939 
3940     __ st1(v8, v9, v10, v11, __ T2D, state);
3941 
3942     __ ldpd(v14, v15, Address(sp, 48));
3943     __ ldpd(v12, v13, Address(sp, 32));
3944     __ ldpd(v10, v11, Address(sp, 16));
3945     __ ldpd(v8, v9, __ post(sp, 64));
3946 
3947     __ ret(lr);
3948 
3949     return start;
3950   }
3951 
3952   // Arguments:
3953   //
3954   // Inputs:
3955   //   c_rarg0   - byte[]  source+offset
3956   //   c_rarg1   - byte[]  SHA.state
3957   //   c_rarg2   - int     block_size
3958   //   c_rarg3   - int     offset
3959   //   c_rarg4   - int     limit
3960   //
3961   address generate_sha3_implCompress(bool multi_block, const char *name) {
3962     static const uint64_t round_consts[24] = {
3963       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3964       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3965       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3966       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3967       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3968       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3969       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3970       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3971     };
3972 
3973     __ align(CodeEntryAlignment);
3974     StubCodeMark mark(this, "StubRoutines", name);
3975     address start = __ pc();
3976 
3977     Register buf           = c_rarg0;
3978     Register state         = c_rarg1;
3979     Register block_size    = c_rarg2;
3980     Register ofs           = c_rarg3;
3981     Register limit         = c_rarg4;
3982 
3983     Label sha3_loop, rounds24_loop;
3984     Label sha3_512_or_sha3_384, shake128;
3985 
3986     __ stpd(v8, v9, __ pre(sp, -64));
3987     __ stpd(v10, v11, Address(sp, 16));
3988     __ stpd(v12, v13, Address(sp, 32));
3989     __ stpd(v14, v15, Address(sp, 48));
3990 
3991     // load state
3992     __ add(rscratch1, state, 32);
3993     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3994     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3995     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3996     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3997     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3998     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3999     __ ld1(v24, __ T1D, rscratch1);
4000 
4001     __ BIND(sha3_loop);
4002 
4003     // 24 keccak rounds
4004     __ movw(rscratch2, 24);
4005 
4006     // load round_constants base
4007     __ lea(rscratch1, ExternalAddress((address) round_consts));
4008 
4009     // load input
4010     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4011     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4012     __ eor(v0, __ T8B, v0, v25);
4013     __ eor(v1, __ T8B, v1, v26);
4014     __ eor(v2, __ T8B, v2, v27);
4015     __ eor(v3, __ T8B, v3, v28);
4016     __ eor(v4, __ T8B, v4, v29);
4017     __ eor(v5, __ T8B, v5, v30);
4018     __ eor(v6, __ T8B, v6, v31);
4019 
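     // Keccak rate (block_size, in bytes) per variant:
     //   SHA3-512: 72    SHA3-384: 104   SHA3-256/SHAKE256: 136
     //   SHA3-224: 144   SHAKE128: 168
     // The bit tests below dispatch on these values without extra compares.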
4020     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4021     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4022 
4023     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4024     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4025     __ eor(v7, __ T8B, v7, v25);
4026     __ eor(v8, __ T8B, v8, v26);
4027     __ eor(v9, __ T8B, v9, v27);
4028     __ eor(v10, __ T8B, v10, v28);
4029     __ eor(v11, __ T8B, v11, v29);
4030     __ eor(v12, __ T8B, v12, v30);
4031     __ eor(v13, __ T8B, v13, v31);
4032 
4033     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4034     __ eor(v14, __ T8B, v14, v25);
4035     __ eor(v15, __ T8B, v15, v26);
4036     __ eor(v16, __ T8B, v16, v27);
4037 
4038     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4039     __ andw(c_rarg5, block_size, 48);
4040     __ cbzw(c_rarg5, rounds24_loop);
4041 
4042     __ tbnz(block_size, 5, shake128);
4043     // block_size == 144, bit5 == 0, SHA3-224
4044     __ ldrd(v28, __ post(buf, 8));
4045     __ eor(v17, __ T8B, v17, v28);
4046     __ b(rounds24_loop);
4047 
4048     __ BIND(shake128);
4049     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4050     __ eor(v17, __ T8B, v17, v28);
4051     __ eor(v18, __ T8B, v18, v29);
4052     __ eor(v19, __ T8B, v19, v30);
4053     __ eor(v20, __ T8B, v20, v31);
4054     __ b(rounds24_loop); // block_size == 168, SHAKE128
4055 
4056     __ BIND(sha3_512_or_sha3_384);
4057     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4058     __ eor(v7, __ T8B, v7, v25);
4059     __ eor(v8, __ T8B, v8, v26);
4060     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4061 
4062     // SHA3-384
4063     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4064     __ eor(v9,  __ T8B, v9,  v27);
4065     __ eor(v10, __ T8B, v10, v28);
4066     __ eor(v11, __ T8B, v11, v29);
4067     __ eor(v12, __ T8B, v12, v30);
4068 
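     // One Keccak-f[1600] round per iteration, mapped onto the SHA3 AdvSIMD
     // extension:
     //   theta:  eor3 (column parities) + rax1 (rotate-and-xor)
     //   rho/pi: xar  (xor and rotate into the permuted lane positions)
     //   chi:    bcax (x ^ (y & ~z))
     //   iota:   final eor with the round constant loaded via ld1r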
4069     __ BIND(rounds24_loop);
4070     __ subw(rscratch2, rscratch2, 1);
4071 
4072     __ eor3(v29, __ T16B, v4, v9, v14);
4073     __ eor3(v26, __ T16B, v1, v6, v11);
4074     __ eor3(v28, __ T16B, v3, v8, v13);
4075     __ eor3(v25, __ T16B, v0, v5, v10);
4076     __ eor3(v27, __ T16B, v2, v7, v12);
4077     __ eor3(v29, __ T16B, v29, v19, v24);
4078     __ eor3(v26, __ T16B, v26, v16, v21);
4079     __ eor3(v28, __ T16B, v28, v18, v23);
4080     __ eor3(v25, __ T16B, v25, v15, v20);
4081     __ eor3(v27, __ T16B, v27, v17, v22);
4082 
4083     __ rax1(v30, __ T2D, v29, v26);
4084     __ rax1(v26, __ T2D, v26, v28);
4085     __ rax1(v28, __ T2D, v28, v25);
4086     __ rax1(v25, __ T2D, v25, v27);
4087     __ rax1(v27, __ T2D, v27, v29);
4088 
4089     __ eor(v0, __ T16B, v0, v30);
4090     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4091     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4092     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4093     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4094     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4095     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4096     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4097     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4098     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4099     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4100     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4101     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4102     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4103     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4104     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4105     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4106     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4107     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4108     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4109     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4110     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4111     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4112     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4113     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4114 
4115     __ bcax(v20, __ T16B, v31, v22, v8);
4116     __ bcax(v21, __ T16B, v8,  v23, v22);
4117     __ bcax(v22, __ T16B, v22, v24, v23);
4118     __ bcax(v23, __ T16B, v23, v31, v24);
4119     __ bcax(v24, __ T16B, v24, v8,  v31);
4120 
4121     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4122 
4123     __ bcax(v17, __ T16B, v25, v19, v3);
4124     __ bcax(v18, __ T16B, v3,  v15, v19);
4125     __ bcax(v19, __ T16B, v19, v16, v15);
4126     __ bcax(v15, __ T16B, v15, v25, v16);
4127     __ bcax(v16, __ T16B, v16, v3,  v25);
4128 
4129     __ bcax(v10, __ T16B, v29, v12, v26);
4130     __ bcax(v11, __ T16B, v26, v13, v12);
4131     __ bcax(v12, __ T16B, v12, v14, v13);
4132     __ bcax(v13, __ T16B, v13, v29, v14);
4133     __ bcax(v14, __ T16B, v14, v26, v29);
4134 
4135     __ bcax(v7, __ T16B, v30, v9,  v4);
4136     __ bcax(v8, __ T16B, v4,  v5,  v9);
4137     __ bcax(v9, __ T16B, v9,  v6,  v5);
4138     __ bcax(v5, __ T16B, v5,  v30, v6);
4139     __ bcax(v6, __ T16B, v6,  v4,  v30);
4140 
4141     __ bcax(v3, __ T16B, v27, v0,  v28);
4142     __ bcax(v4, __ T16B, v28, v1,  v0);
4143     __ bcax(v0, __ T16B, v0,  v2,  v1);
4144     __ bcax(v1, __ T16B, v1,  v27, v2);
4145     __ bcax(v2, __ T16B, v2,  v28, v27);
4146 
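         // Iota: XOR the round constant into lane (0, 0).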
4147     __ eor(v0, __ T16B, v0, v31);
4148 
4149     __ cbnzw(rscratch2, rounds24_loop);
4150 
4151     if (multi_block) {
4152       __ add(ofs, ofs, block_size);
4153       __ cmp(ofs, limit);
4154       __ br(Assembler::LE, sha3_loop);
4155       __ mov(c_rarg0, ofs); // return ofs
4156     }
4157 
4158     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4159     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4160     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4161     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4162     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4163     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4164     __ st1(v24, __ T1D, state);
4165 
4166     __ ldpd(v14, v15, Address(sp, 48));
4167     __ ldpd(v12, v13, Address(sp, 32));
4168     __ ldpd(v10, v11, Address(sp, 16));
4169     __ ldpd(v8, v9, __ post(sp, 64));
4170 
4171     __ ret(lr);
4172 
4173     return start;
4174   }
4175 
4176   /**
4177    *  Arguments:
4178    *
4179    * Inputs:
4180    *   c_rarg0   - int crc
4181    *   c_rarg1   - byte* buf
4182    *   c_rarg2   - int length
4183    *
4184    * Output:
4185    *       r0   - int crc result
4186    */
4187   address generate_updateBytesCRC32() {
4188     assert(UseCRC32Intrinsics, "what are we doing here?");
4189 
4190     __ align(CodeEntryAlignment);
4191     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4192 
4193     address start = __ pc();
4194 
4195     const Register crc   = c_rarg0;  // crc
4196     const Register buf   = c_rarg1;  // source java byte array address
4197     const Register len   = c_rarg2;  // length
4198     const Register table0 = c_rarg3; // crc_table address
4199     const Register table1 = c_rarg4;
4200     const Register table2 = c_rarg5;
4201     const Register table3 = c_rarg6;
4202     const Register tmp3 = c_rarg7;
4203 
4204     BLOCK_COMMENT("Entry:");
4205     __ enter(); // required for proper stackwalking of RuntimeStub frame
4206 
4207     __ kernel_crc32(crc, buf, len,
4208               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4209 
4210     __ leave(); // required for proper stackwalking of RuntimeStub frame
4211     __ ret(lr);
4212 
4213     return start;
4214   }
4215 
4216   // ChaCha20 block function.  This version parallelizes by loading
4217   // individual 32-bit state elements into vectors for four blocks
4218   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4219   //
4220   // state (int[16]) = c_rarg0
4221   // keystream (byte[1024]) = c_rarg1
4222   // return - number of bytes of keystream (always 256)
4223   address generate_chacha20Block_blockpar() {
4224     Label L_twoRounds, L_cc20_const;
4225     // The constant data is broken into two 128-bit segments to be loaded
4226     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4227     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4228     // The second 128 bits are a table constant used for 8-bit left rotations.
4229     __ BIND(L_cc20_const);
4230     __ emit_int64(0x0000000100000000UL);
4231     __ emit_int64(0x0000000300000002UL);
4232     __ emit_int64(0x0605040702010003UL);
4233     __ emit_int64(0x0E0D0C0F0A09080BUL);
4234 
4235     __ align(CodeEntryAlignment);
4236     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4237     address start = __ pc();
4238     __ enter();
4239 
4240     int i, j;
4241     const Register state = c_rarg0;
4242     const Register keystream = c_rarg1;
4243     const Register loopCtr = r10;
4244     const Register tmpAddr = r11;
4245 
4246     const FloatRegister stateFirst = v0;
4247     const FloatRegister stateSecond = v1;
4248     const FloatRegister stateThird = v2;
4249     const FloatRegister stateFourth = v3;
4250     const FloatRegister origCtrState = v28;
4251     const FloatRegister scratch = v29;
4252     const FloatRegister lrot8Tbl = v30;
4253 
4254     // Organize SIMD registers in an array that facilitates
4255     // putting repetitive opcodes into loop structures.  It is
4256     // important that each grouping of 4 registers is monotonically
4257     // increasing to support the requirements of multi-register
4258     // instructions (e.g. ld4r, st4, etc.)
4259     const FloatRegister workSt[16] = {
4260          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4261         v20, v21, v22, v23, v24, v25, v26, v27
4262     };
4263 
4264     // Load from memory and interlace across 16 SIMD registers,
4265     // with each word from memory broadcast to all lanes of
4266     // each successive SIMD register.
4267     //      Addr(0) -> All lanes in workSt[i]
4268     //      Addr(4) -> All lanes workSt[i + 1], etc.
4269     __ mov(tmpAddr, state);
4270     for (i = 0; i < 16; i += 4) {
4271       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4272           __ post(tmpAddr, 16));
4273     }
4274 
4275     // Pull in constant data.  The first 16 bytes are the add overlay
4276     // which is applied to the vector holding the counter (state[12]).
4277     // The second 16 bytes is the index register for the 8-bit left
4278     // rotation tbl instruction.
4279     __ adr(tmpAddr, L_cc20_const);
4280     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4281     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4282 
4283     // Set up the 10 iteration loop and perform all 8 quarter round ops
4284     __ mov(loopCtr, 10);
4285     __ BIND(L_twoRounds);
4286 
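         // Column rounds: quarter rounds on state words (0,4,8,12), (1,5,9,13),
         // (2,6,10,14) and (3,7,11,15). Each quarter round (a, b, c, d) performs,
         // per 32-bit lane:
         //   a += b; d ^= a; d <<<= 16;
         //   c += d; b ^= c; b <<<= 12;
         //   a += b; d ^= a; d <<<= 8;
         //   c += d; b ^= c; b <<<= 7;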
4287     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4288         scratch, lrot8Tbl);
4289     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4290         scratch, lrot8Tbl);
4291     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4292         scratch, lrot8Tbl);
4293     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4294         scratch, lrot8Tbl);
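         // Diagonal rounds: quarter rounds on state words (0,5,10,15),
         // (1,6,11,12), (2,7,8,13) and (3,4,9,14).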
4295 
4296     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4297         scratch, lrot8Tbl);
4298     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4299         scratch, lrot8Tbl);
4300     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4301         scratch, lrot8Tbl);
4302     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4303         scratch, lrot8Tbl);
4304 
4305     // Decrement and iterate
4306     __ sub(loopCtr, loopCtr, 1);
4307     __ cbnz(loopCtr, L_twoRounds);
4308 
4309     __ mov(tmpAddr, state);
4310 
4311     // Add the starting state back to the post-loop keystream
4312     // state.  We read/interlace the state array from memory into
4313     // 4 registers similar to what we did in the beginning.  Then
4314     // add the counter overlay onto workSt[12] at the end.
4315     for (i = 0; i < 16; i += 4) {
4316       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4317           __ post(tmpAddr, 16));
4318       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4319       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4320       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4321       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4322     }
4323     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4324 
4325     // Write to key stream, storing the same element out of workSt[0..15]
4326     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4327     // for the next element position.
4328     for (i = 0; i < 4; i++) {
4329       for (j = 0; j < 16; j += 4) {
4330         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4331             __ post(keystream, 16));
4332       }
4333     }
4334 
4335     __ mov(r0, 256);             // Return length of output keystream
4336     __ leave();
4337     __ ret(lr);
4338 
4339     return start;
4340   }
4341 
4342   /**
4343    *  Arguments:
4344    *
4345    * Inputs:
4346    *   c_rarg0   - int crc
4347    *   c_rarg1   - byte* buf
4348    *   c_rarg2   - int length
4349    *   c_rarg3   - int* table
4350    *
4351    * Output:
4352    *       r0   - int crc result
4353    */
4354   address generate_updateBytesCRC32C() {
4355     assert(UseCRC32CIntrinsics, "what are we doing here?");
4356 
4357     __ align(CodeEntryAlignment);
4358     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4359 
4360     address start = __ pc();
4361 
4362     const Register crc   = c_rarg0;  // crc
4363     const Register buf   = c_rarg1;  // source java byte array address
4364     const Register len   = c_rarg2;  // length
4365     const Register table0 = c_rarg3; // crc_table address
4366     const Register table1 = c_rarg4;
4367     const Register table2 = c_rarg5;
4368     const Register table3 = c_rarg6;
4369     const Register tmp3 = c_rarg7;
4370 
4371     BLOCK_COMMENT("Entry:");
4372     __ enter(); // required for proper stackwalking of RuntimeStub frame
4373 
4374     __ kernel_crc32c(crc, buf, len,
4375               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4376 
4377     __ leave(); // required for proper stackwalking of RuntimeStub frame
4378     __ ret(lr);
4379 
4380     return start;
4381   }
4382 
4383   /***
4384    *  Arguments:
4385    *
4386    *  Inputs:
4387    *   c_rarg0   - int   adler
4388    *   c_rarg1   - byte* buff
4389    *   c_rarg2   - int   len
4390    *
4391    * Output:
4392    *   c_rarg0   - int adler result
4393    */
4394   address generate_updateBytesAdler32() {
4395     __ align(CodeEntryAlignment);
4396     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4397     address start = __ pc();
4398 
4399     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4400 
4401     // Aliases
4402     Register adler  = c_rarg0;
4403     Register s1     = c_rarg0;
4404     Register s2     = c_rarg3;
4405     Register buff   = c_rarg1;
4406     Register len    = c_rarg2;
4407     Register nmax  = r4;
4408     Register base  = r5;
4409     Register count = r6;
4410     Register temp0 = rscratch1;
4411     Register temp1 = rscratch2;
4412     FloatRegister vbytes = v0;
4413     FloatRegister vs1acc = v1;
4414     FloatRegister vs2acc = v2;
4415     FloatRegister vtable = v3;
4416 
4417     // Max number of bytes we can process before having to take the mod
4418     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4419     uint64_t BASE = 0xfff1;
4420     uint64_t NMAX = 0x15B0;
4421 
4422     __ mov(base, BASE);
4423     __ mov(nmax, NMAX);
4424 
4425     // Load accumulation coefficients for the upper 16 bits
4426     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4427     __ ld1(vtable, __ T16B, Address(temp0));
4428 
4429     // s1 is initialized to the lower 16 bits of adler
4430     // s2 is initialized to the upper 16 bits of adler
4431     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4432     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4433 
4434     // The pipelined loop needs at least 16 elements per iteration.
4435     // The code checks this below, but for short inputs it is more effective to skip straight to the cleanup loop.
4436     __ cmp(len, (u1)16);
4437     __ br(Assembler::HS, L_nmax);
4438     __ cbz(len, L_combine);
4439 
4440     __ bind(L_simple_by1_loop);
4441     __ ldrb(temp0, Address(__ post(buff, 1)));
4442     __ add(s1, s1, temp0);
4443     __ add(s2, s2, s1);
4444     __ subs(len, len, 1);
4445     __ br(Assembler::HI, L_simple_by1_loop);
4446 
4447     // s1 = s1 % BASE
4448     __ subs(temp0, s1, base);
4449     __ csel(s1, temp0, s1, Assembler::HS);
4450 
4451     // s2 = s2 % BASE
4452     __ lsr(temp0, s2, 16);
4453     __ lsl(temp1, temp0, 4);
4454     __ sub(temp1, temp1, temp0);
4455     __ add(s2, temp1, s2, ext::uxth);
4456 
4457     __ subs(temp0, s2, base);
4458     __ csel(s2, temp0, s2, Assembler::HS);
4459 
4460     __ b(L_combine);
4461 
4462     __ bind(L_nmax);
4463     __ subs(len, len, nmax);
4464     __ sub(count, nmax, 16);
4465     __ br(Assembler::LO, L_by16);
4466 
4467     __ bind(L_nmax_loop);
4468 
4469     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4470                                       vbytes, vs1acc, vs2acc, vtable);
4471 
4472     __ subs(count, count, 16);
4473     __ br(Assembler::HS, L_nmax_loop);
4474 
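         // This s % BASE reduction relies on 2^16 mod BASE == 15: fold the upper
         // halfword into the lower one (twice here), then finish with a
         // conditional subtract. Roughly, in C:
         //   s = (s & 0xffff) + (s >> 16) * 15;   // lsr/lsl/sub/add
         //   s = (s & 0xffff) + (s >> 16) * 15;
         //   if (s >= BASE) s -= BASE;            // subs/csel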
4475     // s1 = s1 % BASE
4476     __ lsr(temp0, s1, 16);
4477     __ lsl(temp1, temp0, 4);
4478     __ sub(temp1, temp1, temp0);
4479     __ add(temp1, temp1, s1, ext::uxth);
4480 
4481     __ lsr(temp0, temp1, 16);
4482     __ lsl(s1, temp0, 4);
4483     __ sub(s1, s1, temp0);
4484     __ add(s1, s1, temp1, ext::uxth);
4485 
4486     __ subs(temp0, s1, base);
4487     __ csel(s1, temp0, s1, Assembler::HS);
4488 
4489     // s2 = s2 % BASE
4490     __ lsr(temp0, s2, 16);
4491     __ lsl(temp1, temp0, 4);
4492     __ sub(temp1, temp1, temp0);
4493     __ add(temp1, temp1, s2, ext::uxth);
4494 
4495     __ lsr(temp0, temp1, 16);
4496     __ lsl(s2, temp0, 4);
4497     __ sub(s2, s2, temp0);
4498     __ add(s2, s2, temp1, ext::uxth);
4499 
4500     __ subs(temp0, s2, base);
4501     __ csel(s2, temp0, s2, Assembler::HS);
4502 
4503     __ subs(len, len, nmax);
4504     __ sub(count, nmax, 16);
4505     __ br(Assembler::HS, L_nmax_loop);
4506 
4507     __ bind(L_by16);
4508     __ adds(len, len, count);
4509     __ br(Assembler::LO, L_by1);
4510 
4511     __ bind(L_by16_loop);
4512 
4513     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4514                                       vbytes, vs1acc, vs2acc, vtable);
4515 
4516     __ subs(len, len, 16);
4517     __ br(Assembler::HS, L_by16_loop);
4518 
4519     __ bind(L_by1);
4520     __ adds(len, len, 15);
4521     __ br(Assembler::LO, L_do_mod);
4522 
4523     __ bind(L_by1_loop);
4524     __ ldrb(temp0, Address(__ post(buff, 1)));
4525     __ add(s1, temp0, s1);
4526     __ add(s2, s2, s1);
4527     __ subs(len, len, 1);
4528     __ br(Assembler::HS, L_by1_loop);
4529 
4530     __ bind(L_do_mod);
4531     // s1 = s1 % BASE
4532     __ lsr(temp0, s1, 16);
4533     __ lsl(temp1, temp0, 4);
4534     __ sub(temp1, temp1, temp0);
4535     __ add(temp1, temp1, s1, ext::uxth);
4536 
4537     __ lsr(temp0, temp1, 16);
4538     __ lsl(s1, temp0, 4);
4539     __ sub(s1, s1, temp0);
4540     __ add(s1, s1, temp1, ext::uxth);
4541 
4542     __ subs(temp0, s1, base);
4543     __ csel(s1, temp0, s1, Assembler::HS);
4544 
4545     // s2 = s2 % BASE
4546     __ lsr(temp0, s2, 16);
4547     __ lsl(temp1, temp0, 4);
4548     __ sub(temp1, temp1, temp0);
4549     __ add(temp1, temp1, s2, ext::uxth);
4550 
4551     __ lsr(temp0, temp1, 16);
4552     __ lsl(s2, temp0, 4);
4553     __ sub(s2, s2, temp0);
4554     __ add(s2, s2, temp1, ext::uxth);
4555 
4556     __ subs(temp0, s2, base);
4557     __ csel(s2, temp0, s2, Assembler::HS);
4558 
4559     // Combine lower bits and higher bits
4560     __ bind(L_combine);
4561     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4562 
4563     __ ret(lr);
4564 
4565     return start;
4566   }
4567 
4568   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4569           Register temp0, Register temp1, FloatRegister vbytes,
4570           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4571     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4572     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4573     // In non-vectorized code, we update s1 and s2 as:
4574     //   s1 <- s1 + b1
4575     //   s2 <- s2 + s1
4576     //   s1 <- s1 + b2
4577     //   s2 <- s2 + s1
4578     //   ...
4579     //   s1 <- s1 + b16
4580     //   s2 <- s2 + s1
4581     // Putting above assignments together, we have:
4582     //   s1_new = s1 + b1 + b2 + ... + b16
4583     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4584     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4585     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4586     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4587 
4588     // s2 = s2 + s1 * 16
4589     __ add(s2, s2, s1, Assembler::LSL, 4);
4590 
4591     // vs1acc = b1 + b2 + b3 + ... + b16
4592     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
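         // Roughly: umullv multiplies the low 8 byte lanes of vtable and vbytes,
         // umlalv accumulates the products of the high 8 lanes, and the uaddlv
         // instructions horizontally sum the bytes/halfwords into a single lane.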
4593     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4594     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4595     __ uaddlv(vs1acc, __ T16B, vbytes);
4596     __ uaddlv(vs2acc, __ T8H, vs2acc);
4597 
4598     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4599     __ fmovd(temp0, vs1acc);
4600     __ fmovd(temp1, vs2acc);
4601     __ add(s1, s1, temp0);
4602     __ add(s2, s2, temp1);
4603   }
4604 
4605   /**
4606    *  Arguments:
4607    *
4608    *  Input:
4609    *    c_rarg0   - x address
4610    *    c_rarg1   - x length
4611    *    c_rarg2   - y address
4612    *    c_rarg3   - y length
4613    *    c_rarg4   - z address
4614    *    c_rarg5   - z length
4615    */
4616   address generate_multiplyToLen() {
4617     __ align(CodeEntryAlignment);
4618     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4619 
4620     address start = __ pc();
4621     const Register x     = r0;
4622     const Register xlen  = r1;
4623     const Register y     = r2;
4624     const Register ylen  = r3;
4625     const Register z     = r4;
4626     const Register zlen  = r5;
4627 
4628     const Register tmp1  = r10;
4629     const Register tmp2  = r11;
4630     const Register tmp3  = r12;
4631     const Register tmp4  = r13;
4632     const Register tmp5  = r14;
4633     const Register tmp6  = r15;
4634     const Register tmp7  = r16;
4635 
4636     BLOCK_COMMENT("Entry:");
4637     __ enter(); // required for proper stackwalking of RuntimeStub frame
4638     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4639     __ leave(); // required for proper stackwalking of RuntimeStub frame
4640     __ ret(lr);
4641 
4642     return start;
4643   }
4644 
4645   address generate_squareToLen() {
4646     // The squareToLen algorithm for sizes 1..127, as described in the Java
4647     // code, is faster than multiply_to_len on some CPUs and slower on others,
4648     // but multiply_to_len shows slightly better results overall.
4649     __ align(CodeEntryAlignment);
4650     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4651     address start = __ pc();
4652 
4653     const Register x     = r0;
4654     const Register xlen  = r1;
4655     const Register z     = r2;
4656     const Register zlen  = r3;
4657     const Register y     = r4; // == x
4658     const Register ylen  = r5; // == xlen
4659 
4660     const Register tmp1  = r10;
4661     const Register tmp2  = r11;
4662     const Register tmp3  = r12;
4663     const Register tmp4  = r13;
4664     const Register tmp5  = r14;
4665     const Register tmp6  = r15;
4666     const Register tmp7  = r16;
4667 
4668     RegSet spilled_regs = RegSet::of(y, ylen);
4669     BLOCK_COMMENT("Entry:");
4670     __ enter();
4671     __ push(spilled_regs, sp);
4672     __ mov(y, x);
4673     __ mov(ylen, xlen);
4674     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4675     __ pop(spilled_regs, sp);
4676     __ leave();
4677     __ ret(lr);
4678     return start;
4679   }
4680 
4681   address generate_mulAdd() {
4682     __ align(CodeEntryAlignment);
4683     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4684 
4685     address start = __ pc();
4686 
4687     const Register out     = r0;
4688     const Register in      = r1;
4689     const Register offset  = r2;
4690     const Register len     = r3;
4691     const Register k       = r4;
4692 
4693     BLOCK_COMMENT("Entry:");
4694     __ enter();
4695     __ mul_add(out, in, offset, len, k);
4696     __ leave();
4697     __ ret(lr);
4698 
4699     return start;
4700   }
4701 
4702   // Arguments:
4703   //
4704   // Input:
4705   //   c_rarg0   - newArr address
4706   //   c_rarg1   - oldArr address
4707   //   c_rarg2   - newIdx
4708   //   c_rarg3   - shiftCount
4709   //   c_rarg4   - numIter
4710   //
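       // Per 32-bit element this computes, roughly,
       //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
       //                      | (oldArr[i] << (32 - shiftCount));
       // four and two elements at a time with SIMD, then a scalar tail.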
4711   address generate_bigIntegerRightShift() {
4712     __ align(CodeEntryAlignment);
4713     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4714     address start = __ pc();
4715 
4716     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4717 
4718     Register newArr        = c_rarg0;
4719     Register oldArr        = c_rarg1;
4720     Register newIdx        = c_rarg2;
4721     Register shiftCount    = c_rarg3;
4722     Register numIter       = c_rarg4;
4723     Register idx           = numIter;
4724 
4725     Register newArrCur     = rscratch1;
4726     Register shiftRevCount = rscratch2;
4727     Register oldArrCur     = r13;
4728     Register oldArrNext    = r14;
4729 
4730     FloatRegister oldElem0        = v0;
4731     FloatRegister oldElem1        = v1;
4732     FloatRegister newElem         = v2;
4733     FloatRegister shiftVCount     = v3;
4734     FloatRegister shiftVRevCount  = v4;
4735 
4736     __ cbz(idx, Exit);
4737 
4738     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4739 
4740     // left shift count
4741     __ movw(shiftRevCount, 32);
4742     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4743 
4744     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4745     __ cmp(numIter, (u1)4);
4746     __ br(Assembler::LT, ShiftThree);
4747 
4748     __ dup(shiftVCount,    __ T4S, shiftCount);
4749     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4750     __ negr(shiftVCount,   __ T4S, shiftVCount);
4751 
4752     __ BIND(ShiftSIMDLoop);
4753 
4754     // Calculate the load addresses
4755     __ sub(idx, idx, 4);
4756     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4757     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4758     __ add(oldArrCur,  oldArrNext, 4);
4759 
4760     // Load 4 words and process
4761     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4762     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4763     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4764     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4765     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4766     __ st1(newElem,   __ T4S,  Address(newArrCur));
4767 
4768     __ cmp(idx, (u1)4);
4769     __ br(Assembler::LT, ShiftTwoLoop);
4770     __ b(ShiftSIMDLoop);
4771 
4772     __ BIND(ShiftTwoLoop);
4773     __ cbz(idx, Exit);
4774     __ cmp(idx, (u1)1);
4775     __ br(Assembler::EQ, ShiftOne);
4776 
4777     // Calculate the load addresses
4778     __ sub(idx, idx, 2);
4779     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4780     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4781     __ add(oldArrCur,  oldArrNext, 4);
4782 
4783     // Load 2 words and process
4784     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4785     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4786     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4787     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4788     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4789     __ st1(newElem,   __ T2S, Address(newArrCur));
4790     __ b(ShiftTwoLoop);
4791 
4792     __ BIND(ShiftThree);
4793     __ tbz(idx, 1, ShiftOne);
4794     __ tbz(idx, 0, ShiftTwo);
4795     __ ldrw(r10,  Address(oldArr, 12));
4796     __ ldrw(r11,  Address(oldArr, 8));
4797     __ lsrvw(r10, r10, shiftCount);
4798     __ lslvw(r11, r11, shiftRevCount);
4799     __ orrw(r12,  r10, r11);
4800     __ strw(r12,  Address(newArr, 8));
4801 
4802     __ BIND(ShiftTwo);
4803     __ ldrw(r10,  Address(oldArr, 8));
4804     __ ldrw(r11,  Address(oldArr, 4));
4805     __ lsrvw(r10, r10, shiftCount);
4806     __ lslvw(r11, r11, shiftRevCount);
4807     __ orrw(r12,  r10, r11);
4808     __ strw(r12,  Address(newArr, 4));
4809 
4810     __ BIND(ShiftOne);
4811     __ ldrw(r10,  Address(oldArr, 4));
4812     __ ldrw(r11,  Address(oldArr));
4813     __ lsrvw(r10, r10, shiftCount);
4814     __ lslvw(r11, r11, shiftRevCount);
4815     __ orrw(r12,  r10, r11);
4816     __ strw(r12,  Address(newArr));
4817 
4818     __ BIND(Exit);
4819     __ ret(lr);
4820 
4821     return start;
4822   }
4823 
4824   // Arguments:
4825   //
4826   // Input:
4827   //   c_rarg0   - newArr address
4828   //   c_rarg1   - oldArr address
4829   //   c_rarg2   - newIdx
4830   //   c_rarg3   - shiftCount
4831   //   c_rarg4   - numIter
4832   //
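       // Per 32-bit element this computes, roughly,
       //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
       //                      | (oldArr[i + 1] >>> (32 - shiftCount));
       // four and two elements at a time with SIMD, then a scalar tail.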
4833   address generate_bigIntegerLeftShift() {
4834     __ align(CodeEntryAlignment);
4835     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4836     address start = __ pc();
4837 
4838     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4839 
4840     Register newArr        = c_rarg0;
4841     Register oldArr        = c_rarg1;
4842     Register newIdx        = c_rarg2;
4843     Register shiftCount    = c_rarg3;
4844     Register numIter       = c_rarg4;
4845 
4846     Register shiftRevCount = rscratch1;
4847     Register oldArrNext    = rscratch2;
4848 
4849     FloatRegister oldElem0        = v0;
4850     FloatRegister oldElem1        = v1;
4851     FloatRegister newElem         = v2;
4852     FloatRegister shiftVCount     = v3;
4853     FloatRegister shiftVRevCount  = v4;
4854 
4855     __ cbz(numIter, Exit);
4856 
4857     __ add(oldArrNext, oldArr, 4);
4858     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4859 
4860     // right shift count
4861     __ movw(shiftRevCount, 32);
4862     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4863 
4864     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4865     __ cmp(numIter, (u1)4);
4866     __ br(Assembler::LT, ShiftThree);
4867 
4868     __ dup(shiftVCount,     __ T4S, shiftCount);
4869     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4870     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4871 
4872     __ BIND(ShiftSIMDLoop);
4873 
4874     // load 4 words and process
4875     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4876     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4877     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4878     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4879     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4880     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4881     __ sub(numIter,   numIter, 4);
4882 
4883     __ cmp(numIter, (u1)4);
4884     __ br(Assembler::LT, ShiftTwoLoop);
4885     __ b(ShiftSIMDLoop);
4886 
4887     __ BIND(ShiftTwoLoop);
4888     __ cbz(numIter, Exit);
4889     __ cmp(numIter, (u1)1);
4890     __ br(Assembler::EQ, ShiftOne);
4891 
4892     // load 2 words and process
4893     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4894     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4895     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4896     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4897     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4898     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4899     __ sub(numIter,   numIter, 2);
4900     __ b(ShiftTwoLoop);
4901 
4902     __ BIND(ShiftThree);
4903     __ ldrw(r10,  __ post(oldArr, 4));
4904     __ ldrw(r11,  __ post(oldArrNext, 4));
4905     __ lslvw(r10, r10, shiftCount);
4906     __ lsrvw(r11, r11, shiftRevCount);
4907     __ orrw(r12,  r10, r11);
4908     __ strw(r12,  __ post(newArr, 4));
4909     __ tbz(numIter, 1, Exit);
4910     __ tbz(numIter, 0, ShiftOne);
4911 
4912     __ BIND(ShiftTwo);
4913     __ ldrw(r10,  __ post(oldArr, 4));
4914     __ ldrw(r11,  __ post(oldArrNext, 4));
4915     __ lslvw(r10, r10, shiftCount);
4916     __ lsrvw(r11, r11, shiftRevCount);
4917     __ orrw(r12,  r10, r11);
4918     __ strw(r12,  __ post(newArr, 4));
4919 
4920     __ BIND(ShiftOne);
4921     __ ldrw(r10,  Address(oldArr));
4922     __ ldrw(r11,  Address(oldArrNext));
4923     __ lslvw(r10, r10, shiftCount);
4924     __ lsrvw(r11, r11, shiftRevCount);
4925     __ orrw(r12,  r10, r11);
4926     __ strw(r12,  Address(newArr));
4927 
4928     __ BIND(Exit);
4929     __ ret(lr);
4930 
4931     return start;
4932   }
4933 
4934   address generate_count_positives(address &count_positives_long) {
4935     const u1 large_loop_size = 64;
4936     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
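         // A byte is negative iff its top bit is set, so OR-ing loaded words
         // together and testing against UPPER_BIT_MASK (0x80 in every byte
         // position) detects any negative byte in a block.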
4937     int dcache_line = VM_Version::dcache_line_size();
4938 
4939     Register ary1 = r1, len = r2, result = r0;
4940 
4941     __ align(CodeEntryAlignment);
4942 
4943     StubCodeMark mark(this, "StubRoutines", "count_positives");
4944 
4945     address entry = __ pc();
4946 
4947     __ enter();
4948     // precondition: a copy of len is already in result
4949     // __ mov(result, len);
4950 
4951   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
4952         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4953 
4954   __ cmp(len, (u1)15);
4955   __ br(Assembler::GT, LEN_OVER_15);
4956   // The only case in which execution falls into this code is when the pointer is
4957   // near the end of a memory page and we have to avoid reading the next page.
4958   __ add(ary1, ary1, len);
4959   __ subs(len, len, 8);
4960   __ br(Assembler::GT, LEN_OVER_8);
4961   __ ldr(rscratch2, Address(ary1, -8));
4962   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4963   __ lsrv(rscratch2, rscratch2, rscratch1);
4964   __ tst(rscratch2, UPPER_BIT_MASK);
4965   __ csel(result, zr, result, Assembler::NE);
4966   __ leave();
4967   __ ret(lr);
4968   __ bind(LEN_OVER_8);
4969   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4970   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
4971   __ tst(rscratch2, UPPER_BIT_MASK);
4972   __ br(Assembler::NE, RET_NO_POP);
4973   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4974   __ lsrv(rscratch1, rscratch1, rscratch2);
4975   __ tst(rscratch1, UPPER_BIT_MASK);
4976   __ bind(RET_NO_POP);
4977   __ csel(result, zr, result, Assembler::NE);
4978   __ leave();
4979   __ ret(lr);
4980 
4981   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4982   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4983 
4984   count_positives_long = __ pc(); // 2nd entry point
4985 
4986   __ enter();
4987 
4988   __ bind(LEN_OVER_15);
4989     __ push(spilled_regs, sp);
4990     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4991     __ cbz(rscratch2, ALIGNED);
4992     __ ldp(tmp6, tmp1, Address(ary1));
4993     __ mov(tmp5, 16);
4994     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
4995     __ add(ary1, ary1, rscratch1);
4996     __ orr(tmp6, tmp6, tmp1);
4997     __ tst(tmp6, UPPER_BIT_MASK);
4998     __ br(Assembler::NE, RET_ADJUST);
4999     __ sub(len, len, rscratch1);
5000 
5001   __ bind(ALIGNED);
5002     __ cmp(len, large_loop_size);
5003     __ br(Assembler::LT, CHECK_16);
5004     // Perform a 16-byte load as an early-return pre-loop: if an initially
5005     // aligned large array has negative values in its starting bytes,
5006     // LARGE_LOOP would otherwise do 4 reads instead of 1 (in the worst case),
5007     // which is slower. Cases with negative bytes further ahead are not
5008     // affected much; in fact they get faster due to the early loads and the
5009     // fewer instructions and branches in LARGE_LOOP.
5010     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5011     __ sub(len, len, 16);
5012     __ orr(tmp6, tmp6, tmp1);
5013     __ tst(tmp6, UPPER_BIT_MASK);
5014     __ br(Assembler::NE, RET_ADJUST_16);
5015     __ cmp(len, large_loop_size);
5016     __ br(Assembler::LT, CHECK_16);
5017 
5018     if (SoftwarePrefetchHintDistance >= 0
5019         && SoftwarePrefetchHintDistance >= dcache_line) {
5020       // initial prefetch
5021       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5022     }
5023   __ bind(LARGE_LOOP);
5024     if (SoftwarePrefetchHintDistance >= 0) {
5025       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5026     }
5027     // Issue the load instructions first, since that can save a few CPU/MEM
5028     // cycles. Also, instead of 4 triples of "orr(...); addr(...); cbnz(...);"
5029     // (one per ldp), generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which
5030     // saves 3 instructions and has fewer branches. The downside is that this
5031     // disables early return, so all 64 bytes are loaded and checked every time.
5032     __ ldp(tmp2, tmp3, Address(ary1));
5033     __ ldp(tmp4, tmp5, Address(ary1, 16));
5034     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5035     __ ldp(tmp6, tmp1, Address(ary1, 48));
5036     __ add(ary1, ary1, large_loop_size);
5037     __ sub(len, len, large_loop_size);
5038     __ orr(tmp2, tmp2, tmp3);
5039     __ orr(tmp4, tmp4, tmp5);
5040     __ orr(rscratch1, rscratch1, rscratch2);
5041     __ orr(tmp6, tmp6, tmp1);
5042     __ orr(tmp2, tmp2, tmp4);
5043     __ orr(rscratch1, rscratch1, tmp6);
5044     __ orr(tmp2, tmp2, rscratch1);
5045     __ tst(tmp2, UPPER_BIT_MASK);
5046     __ br(Assembler::NE, RET_ADJUST_LONG);
5047     __ cmp(len, large_loop_size);
5048     __ br(Assembler::GE, LARGE_LOOP);
5049 
5050   __ bind(CHECK_16); // small 16-byte load pre-loop
5051     __ cmp(len, (u1)16);
5052     __ br(Assembler::LT, POST_LOOP16);
5053 
5054   __ bind(LOOP16); // small 16-byte load loop
5055     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5056     __ sub(len, len, 16);
5057     __ orr(tmp2, tmp2, tmp3);
5058     __ tst(tmp2, UPPER_BIT_MASK);
5059     __ br(Assembler::NE, RET_ADJUST_16);
5060     __ cmp(len, (u1)16);
5061     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5062 
5063   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5064     __ cmp(len, (u1)8);
5065     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5066     __ ldr(tmp3, Address(__ post(ary1, 8)));
5067     __ tst(tmp3, UPPER_BIT_MASK);
5068     __ br(Assembler::NE, RET_ADJUST);
5069     __ sub(len, len, 8);
5070 
5071   __ bind(POST_LOOP16_LOAD_TAIL);
5072     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5073     __ ldr(tmp1, Address(ary1));
5074     __ mov(tmp2, 64);
5075     __ sub(tmp4, tmp2, len, __ LSL, 3);
5076     __ lslv(tmp1, tmp1, tmp4);
5077     __ tst(tmp1, UPPER_BIT_MASK);
5078     __ br(Assembler::NE, RET_ADJUST);
5079     // Fallthrough
5080 
5081   __ bind(RET_LEN);
5082     __ pop(spilled_regs, sp);
5083     __ leave();
5084     __ ret(lr);
5085 
5086     // The difference (result - len) is the count of bytes that are
5087     // guaranteed to be positive.
5088 
5089   __ bind(RET_ADJUST_LONG);
5090     __ add(len, len, (u1)(large_loop_size - 16));
5091   __ bind(RET_ADJUST_16);
5092     __ add(len, len, 16);
5093   __ bind(RET_ADJUST);
5094     __ pop(spilled_regs, sp);
5095     __ leave();
5096     __ sub(result, result, len);
5097     __ ret(lr);
5098 
5099     return entry;
5100   }
5101 
5102   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5103         bool usePrefetch, Label &NOT_EQUAL) {
5104     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5105         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5106         tmp7 = r12, tmp8 = r13;
5107     Label LOOP;
5108 
5109     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5110     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5111     __ bind(LOOP);
5112     if (usePrefetch) {
5113       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5114       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5115     }
5116     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5117     __ eor(tmp1, tmp1, tmp2);
5118     __ eor(tmp3, tmp3, tmp4);
5119     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5120     __ orr(tmp1, tmp1, tmp3);
5121     __ cbnz(tmp1, NOT_EQUAL);
5122     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5123     __ eor(tmp5, tmp5, tmp6);
5124     __ eor(tmp7, tmp7, tmp8);
5125     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5126     __ orr(tmp5, tmp5, tmp7);
5127     __ cbnz(tmp5, NOT_EQUAL);
5128     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5129     __ eor(tmp1, tmp1, tmp2);
5130     __ eor(tmp3, tmp3, tmp4);
5131     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5132     __ orr(tmp1, tmp1, tmp3);
5133     __ cbnz(tmp1, NOT_EQUAL);
5134     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5135     __ eor(tmp5, tmp5, tmp6);
5136     __ sub(cnt1, cnt1, 8 * wordSize);
5137     __ eor(tmp7, tmp7, tmp8);
5138     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5139     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5140     // cmp) because subs allows an unlimited range of immediate operands.
5141     __ subs(tmp6, cnt1, loopThreshold);
5142     __ orr(tmp5, tmp5, tmp7);
5143     __ cbnz(tmp5, NOT_EQUAL);
5144     __ br(__ GE, LOOP);
5145     // post-loop
5146     __ eor(tmp1, tmp1, tmp2);
5147     __ eor(tmp3, tmp3, tmp4);
5148     __ orr(tmp1, tmp1, tmp3);
5149     __ sub(cnt1, cnt1, 2 * wordSize);
5150     __ cbnz(tmp1, NOT_EQUAL);
5151   }
5152 
5153   void generate_large_array_equals_loop_simd(int loopThreshold,
5154         bool usePrefetch, Label &NOT_EQUAL) {
5155     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5156         tmp2 = rscratch2;
5157     Label LOOP;
5158 
5159     __ bind(LOOP);
5160     if (usePrefetch) {
5161       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5162       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5163     }
5164     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5165     __ sub(cnt1, cnt1, 8 * wordSize);
5166     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5167     __ subs(tmp1, cnt1, loopThreshold);
5168     __ eor(v0, __ T16B, v0, v4);
5169     __ eor(v1, __ T16B, v1, v5);
5170     __ eor(v2, __ T16B, v2, v6);
5171     __ eor(v3, __ T16B, v3, v7);
5172     __ orr(v0, __ T16B, v0, v1);
5173     __ orr(v1, __ T16B, v2, v3);
5174     __ orr(v0, __ T16B, v0, v1);
5175     __ umov(tmp1, v0, __ D, 0);
5176     __ umov(tmp2, v0, __ D, 1);
5177     __ orr(tmp1, tmp1, tmp2);
5178     __ cbnz(tmp1, NOT_EQUAL);
5179     __ br(__ GE, LOOP);
5180   }
5181 
5182   // a1 = r1 - array1 address
5183   // a2 = r2 - array2 address
5184   // result = r0 - return value. Already contains "false"
5185   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
5186   // r3-r5 are reserved temporary registers
5187   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
5188   address generate_large_array_equals() {
5189     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5190         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5191         tmp7 = r12, tmp8 = r13;
5192     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5193         SMALL_LOOP, POST_LOOP;
5194     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5195     // calculate if at least 32 prefetched bytes are used
5196     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5197     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5198     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5199     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5200         tmp5, tmp6, tmp7, tmp8);
5201 
5202     __ align(CodeEntryAlignment);
5203 
5204     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5205 
5206     address entry = __ pc();
5207     __ enter();
5208     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5209     // also advance pointers to use post-increment instead of pre-increment
5210     __ add(a1, a1, wordSize);
5211     __ add(a2, a2, wordSize);
5212     if (AvoidUnalignedAccesses) {
5213       // Both implementations (SIMD and non-SIMD) use relatively large load
5214       // instructions (ld1/ldp), which carry a huge penalty (up to 2x exec time)
5215       // on some CPUs when the address is not at least 16-byte aligned.
5216       // Arrays are currently 8-byte aligned, so do an additional 8-byte load
5217       // if needed for the first address to make it 16-byte aligned.
5218       Label ALIGNED16;
5219       __ tbz(a1, 3, ALIGNED16);
5220       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5221       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5222       __ sub(cnt1, cnt1, wordSize);
5223       __ eor(tmp1, tmp1, tmp2);
5224       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5225       __ bind(ALIGNED16);
5226     }
5227     if (UseSIMDForArrayEquals) {
5228       if (SoftwarePrefetchHintDistance >= 0) {
5229         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5230         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5231         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5232             /* prfm = */ true, NOT_EQUAL);
5233         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5234         __ br(__ LT, TAIL);
5235       }
5236       __ bind(NO_PREFETCH_LARGE_LOOP);
5237       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5238           /* prfm = */ false, NOT_EQUAL);
5239     } else {
5240       __ push(spilled_regs, sp);
5241       if (SoftwarePrefetchHintDistance >= 0) {
5242         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5243         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5244         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5245             /* prfm = */ true, NOT_EQUAL);
5246         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5247         __ br(__ LT, TAIL);
5248       }
5249       __ bind(NO_PREFETCH_LARGE_LOOP);
5250       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5251           /* prfm = */ false, NOT_EQUAL);
5252     }
5253     __ bind(TAIL);
5254       __ cbz(cnt1, EQUAL);
5255       __ subs(cnt1, cnt1, wordSize);
5256       __ br(__ LE, POST_LOOP);
5257     __ bind(SMALL_LOOP);
5258       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5259       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5260       __ subs(cnt1, cnt1, wordSize);
5261       __ eor(tmp1, tmp1, tmp2);
5262       __ cbnz(tmp1, NOT_EQUAL);
5263       __ br(__ GT, SMALL_LOOP);
5264     __ bind(POST_LOOP);
5265       __ ldr(tmp1, Address(a1, cnt1));
5266       __ ldr(tmp2, Address(a2, cnt1));
5267       __ eor(tmp1, tmp1, tmp2);
5268       __ cbnz(tmp1, NOT_EQUAL);
5269     __ bind(EQUAL);
5270       __ mov(result, true);
5271     __ bind(NOT_EQUAL);
5272       if (!UseSIMDForArrayEquals) {
5273         __ pop(spilled_regs, sp);
5274       }
5275     __ bind(NOT_EQUAL_NO_POP);
5276     __ leave();
5277     __ ret(lr);
5278     return entry;
5279   }
5280 
5281   address generate_dsin_dcos(bool isCos) {
5282     __ align(CodeEntryAlignment);
5283     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5284     address start = __ pc();
5285     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5286         (address)StubRoutines::aarch64::_two_over_pi,
5287         (address)StubRoutines::aarch64::_pio2,
5288         (address)StubRoutines::aarch64::_dsin_coef,
5289         (address)StubRoutines::aarch64::_dcos_coef);
5290     return start;
5291   }
5292 
5293   address generate_dlog() {
5294     __ align(CodeEntryAlignment);
5295     StubCodeMark mark(this, "StubRoutines", "dlog");
5296     address entry = __ pc();
5297     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5298         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5299     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5300     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5301         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5302     return entry;
5303   }
5304 
5305 
5306   // code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
5307   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5308       Label &DIFF2) {
5309     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5310     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5311 
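         // vtmpZ is pre-zeroed by the caller; zip1/zip2 against it interleave a
         // zero byte after each Latin1 byte, widening the Latin1 characters to
         // UTF-16 in-register for the comparison below.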
5312     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5313     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5314     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5315     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5316 
5317     __ fmovd(tmpL, vtmp3);
5318     __ eor(rscratch2, tmp3, tmpL);
5319     __ cbnz(rscratch2, DIFF2);
5320 
5321     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5322     __ umov(tmpL, vtmp3, __ D, 1);
5323     __ eor(rscratch2, tmpU, tmpL);
5324     __ cbnz(rscratch2, DIFF1);
5325 
5326     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5327     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5328     __ fmovd(tmpL, vtmp);
5329     __ eor(rscratch2, tmp3, tmpL);
5330     __ cbnz(rscratch2, DIFF2);
5331 
5332     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5333     __ umov(tmpL, vtmp, __ D, 1);
5334     __ eor(rscratch2, tmpU, tmpL);
5335     __ cbnz(rscratch2, DIFF1);
5336   }
5337 
5338   // r0  = result
5339   // r1  = str1
5340   // r2  = cnt1
5341   // r3  = str2
5342   // r4  = cnt2
5343   // r10 = tmp1
5344   // r11 = tmp2
5345   address generate_compare_long_string_different_encoding(bool isLU) {
5346     __ align(CodeEntryAlignment);
5347     StubCodeMark mark(this, "StubRoutines", isLU
5348         ? "compare_long_string_different_encoding LU"
5349         : "compare_long_string_different_encoding UL");
5350     address entry = __ pc();
5351     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5352         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5353         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5354     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5355         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5356     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5357     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5358 
5359     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5360 
5361     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5362     // cnt2 == amount of characters left to compare
5363     // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
5364     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5365     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5366     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5367     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5368     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5369     __ eor(rscratch2, tmp1, tmp2);
5370     __ mov(rscratch1, tmp2);
5371     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5372     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5373              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5374     __ push(spilled_regs, sp);
5375     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5376     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5377 
5378     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5379 
5380     if (SoftwarePrefetchHintDistance >= 0) {
5381       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5382       __ br(__ LT, NO_PREFETCH);
5383       __ bind(LARGE_LOOP_PREFETCH);
5384         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5385         __ mov(tmp4, 2);
5386         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5387         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5388           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5389           __ subs(tmp4, tmp4, 1);
5390           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5391           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5392           __ mov(tmp4, 2);
5393         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5394           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5395           __ subs(tmp4, tmp4, 1);
5396           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5397           __ sub(cnt2, cnt2, 64);
5398           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5399           __ br(__ GE, LARGE_LOOP_PREFETCH);
5400     }
5401     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5402     __ bind(NO_PREFETCH);
5403     __ subs(cnt2, cnt2, 16);
5404     __ br(__ LT, TAIL);
5405     __ align(OptoLoopAlignment);
5406     __ bind(SMALL_LOOP); // smaller loop
5407       __ subs(cnt2, cnt2, 16);
5408       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5409       __ br(__ GE, SMALL_LOOP);
5410       __ cmn(cnt2, (u1)16);
5411       __ br(__ EQ, LOAD_LAST);
5412     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5413       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5414       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5415       __ ldr(tmp3, Address(cnt1, -8));
5416       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5417       __ b(LOAD_LAST);
5418     __ bind(DIFF2);
5419       __ mov(tmpU, tmp3);
5420     __ bind(DIFF1);
5421       __ pop(spilled_regs, sp);
5422       __ b(CALCULATE_DIFFERENCE);
5423     __ bind(LOAD_LAST);
5424       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5425       // No need to load them again
5426       __ mov(tmpU, tmp3);
5427       __ pop(spilled_regs, sp);
5428 
5429       // tmp2 points to the address of the last 4 Latin1 characters right now
5430       __ ldrs(vtmp, Address(tmp2));
5431       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5432       __ fmovd(tmpL, vtmp);
5433 
5434       __ eor(rscratch2, tmpU, tmpL);
5435       __ cbz(rscratch2, DONE);
5436 
5437     // Find the first different characters in the longwords and
5438     // compute their difference.
5439     __ bind(CALCULATE_DIFFERENCE);
5440       __ rev(rscratch2, rscratch2);
5441       __ clz(rscratch2, rscratch2);
5442       __ andr(rscratch2, rscratch2, -16);
5443       __ lsrv(tmp1, tmp1, rscratch2);
5444       __ uxthw(tmp1, tmp1);
5445       __ lsrv(rscratch1, rscratch1, rscratch2);
5446       __ uxthw(rscratch1, rscratch1);
5447       __ subw(result, tmp1, rscratch1);
5448     __ bind(DONE);
5449       __ ret(lr);
5450     return entry;
5451   }
5452 
5453   address generate_method_entry_barrier() {
5454     __ align(CodeEntryAlignment);
5455     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5456 
5457     Label deoptimize_label;
5458 
5459     address start = __ pc();
5460 
5461     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5462 
5463     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5464       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5465       // We can get here despite the nmethod being good, if we have not
5466       // yet applied our cross modification fence (or data fence).
5467       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5468       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5469       __ ldrw(rscratch2, rscratch2);
5470       __ strw(rscratch2, thread_epoch_addr);
5471       __ isb();
5472       __ membar(__ LoadLoad);
5473     }
5474 
5475     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5476 
5477     __ enter();
5478     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5479 
5480     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5481 
5482     __ push_call_clobbered_registers();
5483 
5484     __ mov(c_rarg0, rscratch2);
5485     __ call_VM_leaf
5486          (CAST_FROM_FN_PTR
5487           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5488 
5489     __ reset_last_Java_frame(true);
5490 
5491     __ mov(rscratch1, r0);
5492 
5493     __ pop_call_clobbered_registers();
5494 
5495     __ cbnz(rscratch1, deoptimize_label);
5496 
5497     __ leave();
5498     __ ret(lr);
5499 
5500     __ BIND(deoptimize_label);
5501 
5502     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5503     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5504 
5505     __ mov(sp, rscratch1);
5506     __ br(rscratch2);
5507 
5508     return start;
5509   }
5510 
5511   // r0  = result
5512   // r1  = str1
5513   // r2  = cnt1
5514   // r3  = str2
5515   // r4  = cnt2
5516   // r10 = tmp1
5517   // r11 = tmp2
5518   address generate_compare_long_string_same_encoding(bool isLL) {
5519     __ align(CodeEntryAlignment);
5520     StubCodeMark mark(this, "StubRoutines", isLL
5521         ? "compare_long_string_same_encoding LL"
5522         : "compare_long_string_same_encoding UU");
5523     address entry = __ pc();
5524     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5525         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5526 
5527     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5528 
5529     // exit from the large loop when fewer than 64 bytes are left to read or we
5530     // are about to prefetch memory beyond the array bounds
5531     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5532 
5533     // The caller has already loaded the first 8 bytes before jumping to the stub, so compare them directly.
5534     __ eor(rscratch2, tmp1, tmp2);
5535     __ cbnz(rscratch2, CAL_DIFFERENCE);
5536 
5537     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5538     // update pointers, because of previous read
5539     __ add(str1, str1, wordSize);
5540     __ add(str2, str2, wordSize);
5541     if (SoftwarePrefetchHintDistance >= 0) {
5542       __ align(OptoLoopAlignment);
5543       __ bind(LARGE_LOOP_PREFETCH);
5544         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5545         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5546 
5547         for (int i = 0; i < 4; i++) {
5548           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5549           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5550           __ cmp(tmp1, tmp2);
5551           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5552           __ br(Assembler::NE, DIFF);
5553         }
5554         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5555         __ add(str1, str1, 64);
5556         __ add(str2, str2, 64);
5557         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5558         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5559         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5560     }
5561 
5562     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5563     __ br(Assembler::LE, LESS16);
5564     __ align(OptoLoopAlignment);
5565     __ bind(LOOP_COMPARE16);
5566       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5567       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5568       __ cmp(tmp1, tmp2);
5569       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5570       __ br(Assembler::NE, DIFF);
5571       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5572       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5573       __ br(Assembler::LT, LESS16);
5574 
5575       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5576       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5577       __ cmp(tmp1, tmp2);
5578       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5579       __ br(Assembler::NE, DIFF);
5580       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5581       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5582       __ br(Assembler::GE, LOOP_COMPARE16);
5583       __ cbz(cnt2, LENGTH_DIFF);
5584 
5585     __ bind(LESS16);
5586       // compare 8 bytes at a time
5587       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5588       __ br(Assembler::LE, LESS8);
5589       __ ldr(tmp1, Address(__ post(str1, 8)));
5590       __ ldr(tmp2, Address(__ post(str2, 8)));
5591       __ eor(rscratch2, tmp1, tmp2);
5592       __ cbnz(rscratch2, CAL_DIFFERENCE);
5593       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5594 
5595     __ bind(LESS8); // directly load last 8 bytes
5596       if (!isLL) {
5597         __ add(cnt2, cnt2, cnt2);
5598       }
5599       __ ldr(tmp1, Address(str1, cnt2));
5600       __ ldr(tmp2, Address(str2, cnt2));
5601       __ eor(rscratch2, tmp1, tmp2);
5602       __ cbz(rscratch2, LENGTH_DIFF);
5603       __ b(CAL_DIFFERENCE);
5604 
5605     __ bind(DIFF);
5606       __ cmp(tmp1, tmp2);
5607       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5608       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5609       // reuse rscratch2 register for the result of eor instruction
5610       __ eor(rscratch2, tmp1, tmp2);
5611 
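         // The block below computes the signed difference of the first differing
         // characters. In scalar terms it is, in effect, the following sketch, where
         // "diff" is the XOR of the two mismatching 8-byte chunks held in tmp1/tmp2
         // (kept in rscratch2):
         //
         //   int shift = count_leading_zeros(byte_reverse(diff)) & (isLL ? ~7 : ~15);
         //   result    = ((chunk1 >> shift) & char_mask) - ((chunk2 >> shift) & char_mask);
         //
         // with char_mask being 0xff for LL and 0xffff for UU.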
5612     __ bind(CAL_DIFFERENCE);
5613       __ rev(rscratch2, rscratch2);
5614       __ clz(rscratch2, rscratch2);
5615       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5616       __ lsrv(tmp1, tmp1, rscratch2);
5617       __ lsrv(tmp2, tmp2, rscratch2);
5618       if (isLL) {
5619         __ uxtbw(tmp1, tmp1);
5620         __ uxtbw(tmp2, tmp2);
5621       } else {
5622         __ uxthw(tmp1, tmp1);
5623         __ uxthw(tmp2, tmp2);
5624       }
5625       __ subw(result, tmp1, tmp2);
5626 
5627     __ bind(LENGTH_DIFF);
5628       __ ret(lr);
5629     return entry;
5630   }
5631 
5632   enum string_compare_mode {
5633     LL,
5634     LU,
5635     UL,
5636     UU,
5637   };
5638 
5639   // The following registers are declared in aarch64.ad
5640   // r0  = result
5641   // r1  = str1
5642   // r2  = cnt1
5643   // r3  = str2
5644   // r4  = cnt2
5645   // r10 = tmp1
5646   // r11 = tmp2
5647   // z0  = ztmp1
5648   // z1  = ztmp2
5649   // p0  = pgtmp1
5650   // p1  = pgtmp2
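       // Roughly, the stub below generates the following predicate-driven loop
       // (a sketch only; VL is the SVE vector length in characters):
       //
       //   idx = 0;
       //   pg  = whilelt(idx, cnt);            // lanes idx .. cnt-1 are active
       //   do {
       //     load VL chars of str1 and str2 under pg (zero-extending for LU/UL);
       //     idx += VL;
       //     if (any active lane differs) goto MISMATCH;
       //   } while (idx < cnt - VL);
       //   pg = whilelt(idx, cnt);             // predicate covering the tail
       //   compare the tail; if it matches, return with result as set up by the
       //   caller, otherwise fall into MISMATCH and return the difference of the
       //   first differing characters.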
5651   address generate_compare_long_string_sve(string_compare_mode mode) {
5652     __ align(CodeEntryAlignment);
5653     address entry = __ pc();
5654     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5655              tmp1 = r10, tmp2 = r11;
5656 
5657     Label LOOP, DONE, MISMATCH;
5658     Register vec_len = tmp1;
5659     Register idx = tmp2;
5660     // The minimum of the string lengths has been stored in cnt2.
5661     Register cnt = cnt2;
5662     FloatRegister ztmp1 = z0, ztmp2 = z1;
5663     PRegister pgtmp1 = p0, pgtmp2 = p1;
5664 
5665 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5666     switch (mode) {                                                            \
5667       case LL:                                                                 \
5668         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5669         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5670         break;                                                                 \
5671       case LU:                                                                 \
5672         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5673         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5674         break;                                                                 \
5675       case UL:                                                                 \
5676         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5677         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5678         break;                                                                 \
5679       case UU:                                                                 \
5680         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5681         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5682         break;                                                                 \
5683       default:                                                                 \
5684         ShouldNotReachHere();                                                  \
5685     }
5686 
5687     const char* stubname;
5688     switch (mode) {
5689       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5690       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5691       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5692       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5693       default: ShouldNotReachHere();
5694     }
5695 
5696     StubCodeMark mark(this, "StubRoutines", stubname);
5697 
5698     __ mov(idx, 0);
5699     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5700 
5701     if (mode == LL) {
5702       __ sve_cntb(vec_len);
5703     } else {
5704       __ sve_cnth(vec_len);
5705     }
5706 
5707     __ sub(rscratch1, cnt, vec_len);
5708 
5709     __ bind(LOOP);
5710 
5711       // main loop
5712       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5713       __ add(idx, idx, vec_len);
5714       // Compare strings.
5715       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5716       __ br(__ NE, MISMATCH);
5717       __ cmp(idx, rscratch1);
5718       __ br(__ LT, LOOP);
5719 
5720     // post loop, last iteration
5721     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5722 
5723     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5724     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5725     __ br(__ EQ, DONE);
5726 
5727     __ bind(MISMATCH);
5728 
5729     // Restrict the predicate to the lanes before the first mismatch so that
         // sve_lasta below extracts the first differing characters.
5730     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5731     // Extract the first different characters of each string.
5732     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5733     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5734 
5735     // Compute the difference of the first different characters.
5736     __ sub(result, rscratch1, rscratch2);
5737 
5738     __ bind(DONE);
5739     __ ret(lr);
5740 #undef LOAD_PAIR
5741     return entry;
5742   }
5743 
5744   void generate_compare_long_strings() {
5745     if (UseSVE == 0) {
5746       StubRoutines::aarch64::_compare_long_string_LL
5747           = generate_compare_long_string_same_encoding(true);
5748       StubRoutines::aarch64::_compare_long_string_UU
5749           = generate_compare_long_string_same_encoding(false);
5750       StubRoutines::aarch64::_compare_long_string_LU
5751           = generate_compare_long_string_different_encoding(true);
5752       StubRoutines::aarch64::_compare_long_string_UL
5753           = generate_compare_long_string_different_encoding(false);
5754     } else {
5755       StubRoutines::aarch64::_compare_long_string_LL
5756           = generate_compare_long_string_sve(LL);
5757       StubRoutines::aarch64::_compare_long_string_UU
5758           = generate_compare_long_string_sve(UU);
5759       StubRoutines::aarch64::_compare_long_string_LU
5760           = generate_compare_long_string_sve(LU);
5761       StubRoutines::aarch64::_compare_long_string_UL
5762           = generate_compare_long_string_sve(UL);
5763     }
5764   }
5765 
5766   // R0 = result
5767   // R1 = str2
5768   // R2 = cnt1
5769   // R3 = str1
5770   // R4 = cnt2
5771   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
5772   //
5773   // This generic linear code uses a few additional ideas that make it faster:
5774   // 1) since the pattern length is >= 8, its first register-sized chunk can be
5775   //    kept resident, skipping a reload on every iteration (this helps on
5776   //    systems with a single load pipeline)
5777   // 2) the first pattern character is located with a "fast" per-register scan
5778   //    (one branch per loaded register instead of one branch per character);
5779   //    this is where the constants 0x0101...01, 0x00010001...0001,
5780   //    0x7f7f...7f and 0x7fff7fff...7fff come from (see the sketch below)
5781   // 3) once the first register of the source string has been loaded and
5782   //    analyzed, it can be reused to search for every occurrence of the first
5783   //    pattern character, saving a few loads compared with a
5784   //    simpler-but-slower implementation
5785   // 4) to avoid lots of push/pop operations, the code below aggressively
5786   //    reuses, re-initializes and compresses register values; this makes the
       //    code larger and a bit less readable, but most of the extra operations
       //    are issued in the shadow of loads or branches, so the penalty is small
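       // The per-register scan mentioned in (2) is essentially the classic SWAR
       // zero-byte test, shown here for the byte (LL) case as a rough sketch:
       //
       //   uint64_t x    = chunk ^ (first_char * 0x0101010101010101ULL); // 0x00 where matched
       //   uint64_t hits = (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
       //   if (hits != 0) { /* some byte of this chunk equals first_char */ }
       //
       // For the UU case the same test is performed on 16-bit lanes with the
       // 0x0001...0001 / 0x7fff...7fff constants.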
5787   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5788     const char* stubName = str1_isL
5789         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5790         : "indexof_linear_uu";
5791     __ align(CodeEntryAlignment);
5792     StubCodeMark mark(this, "StubRoutines", stubName);
5793     address entry = __ pc();
5794 
5795     int str1_chr_size = str1_isL ? 1 : 2;
5796     int str2_chr_size = str2_isL ? 1 : 2;
5797     int str1_chr_shift = str1_isL ? 0 : 1;
5798     int str2_chr_shift = str2_isL ? 0 : 1;
5799     bool isL = str1_isL && str2_isL;
5800     // parameters
5801     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5802     // temporary registers
5803     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5804     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5805     // redefinitions
5806     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5807 
5808     __ push(spilled_regs, sp);
5809     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5810         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5811         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5812         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5813         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5814         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5815     // Read a whole register from str1. This is safe because length >= 8 here.
5816     __ ldr(ch1, Address(str1));
5817     // Read a whole register from str2. This is safe because length >= 8 here.
5818     __ ldr(ch2, Address(str2));
5819     __ sub(cnt2, cnt2, cnt1);
5820     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5821     if (str1_isL != str2_isL) {
5822       __ eor(v0, __ T16B, v0, v0);
5823     }
5824     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5825     __ mul(first, first, tmp1);
5826     // check whether fewer than one full register of characters remains
5827     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5828     if (str1_isL != str2_isL) {
5829       __ fmovd(v1, ch1);
5830     }
5831     __ br(__ LE, L_SMALL);
5832     __ eor(ch2, first, ch2);
5833     if (str1_isL != str2_isL) {
5834       __ zip1(v1, __ T16B, v1, v0);
5835     }
5836     __ sub(tmp2, ch2, tmp1);
5837     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5838     __ bics(tmp2, tmp2, ch2);
5839     if (str1_isL != str2_isL) {
5840       __ fmovd(ch1, v1);
5841     }
5842     __ br(__ NE, L_HAS_ZERO);
5843     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5844     __ add(result, result, wordSize/str2_chr_size);
5845     __ add(str2, str2, wordSize);
5846     __ br(__ LT, L_POST_LOOP);
5847     __ BIND(L_LOOP);
5848       __ ldr(ch2, Address(str2));
5849       __ eor(ch2, first, ch2);
5850       __ sub(tmp2, ch2, tmp1);
5851       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5852       __ bics(tmp2, tmp2, ch2);
5853       __ br(__ NE, L_HAS_ZERO);
5854     __ BIND(L_LOOP_PROCEED);
5855       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5856       __ add(str2, str2, wordSize);
5857       __ add(result, result, wordSize/str2_chr_size);
5858       __ br(__ GE, L_LOOP);
5859     __ BIND(L_POST_LOOP);
5860       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5861       __ br(__ LE, NOMATCH);
5862       __ ldr(ch2, Address(str2));
5863       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5864       __ eor(ch2, first, ch2);
5865       __ sub(tmp2, ch2, tmp1);
5866       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5867       __ mov(tmp4, -1); // all bits set
5868       __ b(L_SMALL_PROCEED);
5869     __ align(OptoLoopAlignment);
5870     __ BIND(L_SMALL);
5871       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5872       __ eor(ch2, first, ch2);
5873       if (str1_isL != str2_isL) {
5874         __ zip1(v1, __ T16B, v1, v0);
5875       }
5876       __ sub(tmp2, ch2, tmp1);
5877       __ mov(tmp4, -1); // all bits set
5878       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5879       if (str1_isL != str2_isL) {
5880         __ fmovd(ch1, v1); // move converted 4 symbols
5881       }
5882     __ BIND(L_SMALL_PROCEED);
5883       __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the unused high bits
5884       __ bic(tmp2, tmp2, ch2);
5885       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5886       __ rbit(tmp2, tmp2);
5887       __ br(__ EQ, NOMATCH);
5888     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5889       __ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
5890       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5891       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5892       if (str2_isL) { // LL
5893         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5894         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5895         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5896         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5897         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5898       } else {
5899         __ mov(ch2, 0xE); // mask to clear the lowest bit of the byte offset (char alignment)
5900         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5901         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5902         __ lslv(tmp2, tmp2, tmp4);
5903         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5904         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5905         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5906         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5907       }
5908       __ cmp(ch1, ch2);
5909       __ mov(tmp4, wordSize/str2_chr_size);
5910       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5911     __ BIND(L_SMALL_CMP_LOOP);
5912       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5913                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5914       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5915                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5916       __ add(tmp4, tmp4, 1);
5917       __ cmp(tmp4, cnt1);
5918       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5919       __ cmp(first, ch2);
5920       __ br(__ EQ, L_SMALL_CMP_LOOP);
5921     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5922       __ cbz(tmp2, NOMATCH); // no more matches. exit
5923       __ clz(tmp4, tmp2);
5924       __ add(result, result, 1); // advance index
5925       __ add(str2, str2, str2_chr_size); // advance pointer
5926       __ b(L_SMALL_HAS_ZERO_LOOP);
5927     __ align(OptoLoopAlignment);
5928     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5929       __ cmp(first, ch2);
5930       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5931       __ b(DONE);
5932     __ align(OptoLoopAlignment);
5933     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5934       if (str2_isL) { // LL
5935         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5936         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5937         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5938         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5939         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5940       } else {
5941         __ mov(ch2, 0xE); // mask to clear the lowest bit of the byte offset (char alignment)
5942         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5943         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5944         __ lslv(tmp2, tmp2, tmp4);
5945         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5946         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5947         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5948         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5949       }
5950       __ cmp(ch1, ch2);
5951       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5952       __ b(DONE);
5953     __ align(OptoLoopAlignment);
5954     __ BIND(L_HAS_ZERO);
5955       __ rbit(tmp2, tmp2);
5956       __ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
5957       // Compress both counters (cnt2 and cnt1) into one register. This is fine
5958       // because both counters are 32-bit and are not changed in this loop; they
5959       // are restored on exit, so cnt1 can be reused inside the loop.
5960       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5961       __ sub(result, result, 1);
5962     __ BIND(L_HAS_ZERO_LOOP);
5963       __ mov(cnt1, wordSize/str2_chr_size);
5964       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5965       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5966       if (str2_isL) {
5967         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5968         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5969         __ lslv(tmp2, tmp2, tmp4);
5970         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5971         __ add(tmp4, tmp4, 1);
5972         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5973         __ lsl(tmp2, tmp2, 1);
5974         __ mov(tmp4, wordSize/str2_chr_size);
5975       } else {
5976         __ mov(ch2, 0xE);
5977         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5978         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5979         __ lslv(tmp2, tmp2, tmp4);
5980         __ add(tmp4, tmp4, 1);
5981         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5982         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5983         __ lsl(tmp2, tmp2, 1);
5984         __ mov(tmp4, wordSize/str2_chr_size);
5985         __ sub(str2, str2, str2_chr_size);
5986       }
5987       __ cmp(ch1, ch2);
5988       __ mov(tmp4, wordSize/str2_chr_size);
5989       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5990     __ BIND(L_CMP_LOOP);
5991       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5992                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5993       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5994                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5995       __ add(tmp4, tmp4, 1);
5996       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5997       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5998       __ cmp(cnt1, ch2);
5999       __ br(__ EQ, L_CMP_LOOP);
6000     __ BIND(L_CMP_LOOP_NOMATCH);
6001       // no match at the current candidate position
6002       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6003       __ clz(tmp4, tmp2);
6004       __ add(str2, str2, str2_chr_size); // advance pointer
6005       __ b(L_HAS_ZERO_LOOP);
6006     __ align(OptoLoopAlignment);
6007     __ BIND(L_CMP_LOOP_LAST_CMP);
6008       __ cmp(cnt1, ch2);
6009       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6010       __ b(DONE);
6011     __ align(OptoLoopAlignment);
6012     __ BIND(L_CMP_LOOP_LAST_CMP2);
6013       if (str2_isL) {
6014         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6015         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6016         __ lslv(tmp2, tmp2, tmp4);
6017         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6018         __ add(tmp4, tmp4, 1);
6019         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6020         __ lsl(tmp2, tmp2, 1);
6021       } else {
6022         __ mov(ch2, 0xE);
6023         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6024         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6025         __ lslv(tmp2, tmp2, tmp4);
6026         __ add(tmp4, tmp4, 1);
6027         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6028         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6029         __ lsl(tmp2, tmp2, 1);
6030         __ sub(str2, str2, str2_chr_size);
6031       }
6032       __ cmp(ch1, ch2);
6033       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6034       __ b(DONE);
6035     __ align(OptoLoopAlignment);
6036     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
6037       // 1) Restore the "result" index. Up to the L_HAS_ZERO block it was a
6038       // multiple of wordSize/str2_chr_size. The octet analyzed in L_HAS_ZERO_LOOP
6039       // increased result by at most wordSize/str2_chr_size - 1, so the higher
6040       // bits are unchanged, and L_LOOP_PROCEED will add the number of analyzed
6041       // characters itself; it is therefore enough to clear the lower bits of
6042       // result here (2 bits for UU/UL, 3 bits for LL).
6043       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
6044       // 3) Advance str2 to the next octet. result & (7 or 3) is the index of the
6045       // last analyzed substring within the current octet, so str2 points at that
6046       // octet's start address and must be advanced to the next octet.
6047       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6048       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6049       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6050       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
6051       __ movw(cnt2, cnt2);
6052       __ b(L_LOOP_PROCEED);
6053     __ align(OptoLoopAlignment);
6054     __ BIND(NOMATCH);
6055       __ mov(result, -1);
6056     __ BIND(DONE);
6057       __ pop(spilled_regs, sp);
6058       __ ret(lr);
6059     return entry;
6060   }
6061 
6062   void generate_string_indexof_stubs() {
6063     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6064     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6065     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6066   }
6067 
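       // Latin-1 bytes are inflated to UTF-16 chars by interleaving them with the
       // zero bytes of v0 (which the caller provides zeroed). Per 16-byte source
       // register this is, in effect (little-endian layout, a sketch):
       //
       //   for (int i = 0; i < 16; i++) {
       //     dst[2 * i]     = src[i]; // low byte  = original Latin-1 byte
       //     dst[2 * i + 1] = 0;      // high byte = 0
       //   }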
6068   void inflate_and_store_2_fp_registers(bool generatePrfm,
6069       FloatRegister src1, FloatRegister src2) {
6070     Register dst = r1;
6071     __ zip1(v1, __ T16B, src1, v0);
6072     __ zip2(v2, __ T16B, src1, v0);
6073     if (generatePrfm) {
6074       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6075     }
6076     __ zip1(v3, __ T16B, src2, v0);
6077     __ zip2(v4, __ T16B, src2, v0);
6078     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6079   }
6080 
6081   // R0 = src
6082   // R1 = dst
6083   // R2 = len
6084   // R3 = len >> 3
6085   // V0 = 0
6086   // v1 = loaded 8 bytes
6087   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6088   address generate_large_byte_array_inflate() {
6089     __ align(CodeEntryAlignment);
6090     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
6091     address entry = __ pc();
6092     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6093     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6094     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6095 
6096     // Do one more 8-byte read so that the address is 16-byte aligned in most
6097     // cases, which also lets us use a single store instruction below.
6098     __ ldrd(v2, __ post(src, 8));
6099     __ sub(octetCounter, octetCounter, 2);
6100     __ zip1(v1, __ T16B, v1, v0);
6101     __ zip1(v2, __ T16B, v2, v0);
6102     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6103     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6104     __ subs(rscratch1, octetCounter, large_loop_threshold);
6105     __ br(__ LE, LOOP_START);
6106     __ b(LOOP_PRFM_START);
6107     __ bind(LOOP_PRFM);
6108       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6109     __ bind(LOOP_PRFM_START);
6110       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6111       __ sub(octetCounter, octetCounter, 8);
6112       __ subs(rscratch1, octetCounter, large_loop_threshold);
6113       inflate_and_store_2_fp_registers(true, v3, v4);
6114       inflate_and_store_2_fp_registers(true, v5, v6);
6115       __ br(__ GT, LOOP_PRFM);
6116       __ cmp(octetCounter, (u1)8);
6117       __ br(__ LT, DONE);
6118     __ bind(LOOP);
6119       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6120       __ bind(LOOP_START);
6121       __ sub(octetCounter, octetCounter, 8);
6122       __ cmp(octetCounter, (u1)8);
6123       inflate_and_store_2_fp_registers(false, v3, v4);
6124       inflate_and_store_2_fp_registers(false, v5, v6);
6125       __ br(__ GE, LOOP);
6126     __ bind(DONE);
6127       __ ret(lr);
6128     return entry;
6129   }
6130 
6131   /**
6132    *  Arguments:
6133    *
6134    *  Input:
6135    *  c_rarg0   - current state address
6136    *  c_rarg1   - H key address
6137    *  c_rarg2   - data address
6138    *  c_rarg3   - number of blocks
6139    *
6140    *  Output:
6141    *  Updated state at c_rarg0
6142    */
6143   address generate_ghash_processBlocks() {
6144     // Bafflingly, GCM uses little-endian for the byte order, but
6145     // big-endian for the bit order.  For example, the polynomial 1 is
6146     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6147     //
6148     // So, we must either reverse the bytes in each word and do
6149     // everything big-endian or reverse the bits in each byte and do
6150     // it little-endian.  On AArch64 it's more idiomatic to reverse
6151     // the bits in each byte (we have an instruction, RBIT, to do
6152     // that) and keep the data in little-endian bit order through the
6153     // calculation, bit-reversing the inputs and outputs.
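     // (RBIT reverses the bits within each byte, e.g. 0x80 -> 0x01 and 0xC2 -> 0x43,
     // while REV64 reverses the byte order within each 64-bit lane; together they
     // convert between GCM's reflected representation and the bit order used by
     // the carry-less multiplication below.)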
6154 
6155     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6156     __ align(wordSize * 2);
6157     address p = __ pc();
6158     __ emit_int64(0x87);  // The low-order bits of the field
6159                           // polynomial (i.e. p = z^7+z^2+z+1)
6160                           // repeated in the low and high parts of a
6161                           // 128-bit vector
6162     __ emit_int64(0x87);
6163 
6164     __ align(CodeEntryAlignment);
6165     address start = __ pc();
6166 
6167     Register state   = c_rarg0;
6168     Register subkeyH = c_rarg1;
6169     Register data    = c_rarg2;
6170     Register blocks  = c_rarg3;
6171 
6172     FloatRegister vzr = v30;
6173     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6174 
6175     __ ldrq(v24, p);    // The field polynomial
6176 
6177     __ ldrq(v0, Address(state));
6178     __ ldrq(v1, Address(subkeyH));
6179 
6180     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6181     __ rbit(v0, __ T16B, v0);
6182     __ rev64(v1, __ T16B, v1);
6183     __ rbit(v1, __ T16B, v1);
6184 
6185     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6186     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
6187 
6188     {
6189       Label L_ghash_loop;
6190       __ bind(L_ghash_loop);
6191 
6192       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6193                                                  // reversing each byte
6194       __ rbit(v2, __ T16B, v2);
6195       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6196 
6197       // Multiply state in v2 by subkey in v1
6198       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6199                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6200                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6201       // Reduce v7:v5 by the field polynomial
6202       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6203 
6204       __ sub(blocks, blocks, 1);
6205       __ cbnz(blocks, L_ghash_loop);
6206     }
6207 
6208     // The bit-reversed result is at this point in v0
6209     __ rev64(v0, __ T16B, v0);
6210     __ rbit(v0, __ T16B, v0);
6211 
6212     __ st1(v0, __ T16B, state);
6213     __ ret(lr);
6214 
6215     return start;
6216   }
6217 
6218   address generate_ghash_processBlocks_wide() {
6219     address small = generate_ghash_processBlocks();
6220 
6221     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6222     __ align(wordSize * 2);
6223     address p = __ pc();
6224     __ emit_int64(0x87);  // The low-order bits of the field
6225                           // polynomial (i.e. p = z^7+z^2+z+1)
6226                           // repeated in the low and high parts of a
6227                           // 128-bit vector
6228     __ emit_int64(0x87);
6229 
6230     __ align(CodeEntryAlignment);
6231     address start = __ pc();
6232 
6233     Register state   = c_rarg0;
6234     Register subkeyH = c_rarg1;
6235     Register data    = c_rarg2;
6236     Register blocks  = c_rarg3;
6237 
6238     const int unroll = 4;
6239 
6240     __ cmp(blocks, (unsigned char)(unroll * 2));
6241     __ br(__ LT, small);
6242 
6243     if (unroll > 1) {
6244       // Save the callee-saved SIMD registers (v8-v15) before entering the routine
6245       __ sub(sp, sp, 4 * 16);
6246       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6247       __ sub(sp, sp, 4 * 16);
6248       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6249     }
6250 
6251     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6252 
6253     if (unroll > 1) {
6254       // And restore state
6255       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6256       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6257     }
6258 
6259     __ cmp(blocks, (unsigned char)0);
6260     __ br(__ GT, small);
6261 
6262     __ ret(lr);
6263 
6264     return start;
6265   }
6266 
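       // Each SIMD round below consumes 3 * size input bytes and produces
       // 4 * size output characters. Per 3-byte group (b0, b1, b2) the four
       // 6-bit codec indices are, in effect:
       //
       //   ind0 =   b0 >> 2;
       //   ind1 = ((b0 & 0x03) << 4) | (b1 >> 4);
       //   ind2 = ((b1 & 0x0f) << 2) | (b2 >> 6);
       //   ind3 =   b2 & 0x3f;
       //
       // For example, the ASCII bytes "Man" yield the indices 19, 22, 5, 46,
       // which encode to "TWFu".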
6267   void generate_base64_encode_simdround(Register src, Register dst,
6268         FloatRegister codec, u8 size) {
6269 
6270     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6271     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6272     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6273 
6274     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6275 
6276     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6277 
6278     __ ushr(ind0, arrangement, in0,  2);
6279 
6280     __ ushr(ind1, arrangement, in1,  2);
6281     __ shl(in0,   arrangement, in0,  6);
6282     __ orr(ind1,  arrangement, ind1, in0);
6283     __ ushr(ind1, arrangement, ind1, 2);
6284 
6285     __ ushr(ind2, arrangement, in2,  4);
6286     __ shl(in1,   arrangement, in1,  4);
6287     __ orr(ind2,  arrangement, in1,  ind2);
6288     __ ushr(ind2, arrangement, ind2, 2);
6289 
6290     __ shl(ind3,  arrangement, in2,  2);
6291     __ ushr(ind3, arrangement, ind3, 2);
6292 
6293     __ tbl(out0,  arrangement, codec,  4, ind0);
6294     __ tbl(out1,  arrangement, codec,  4, ind1);
6295     __ tbl(out2,  arrangement, codec,  4, ind2);
6296     __ tbl(out3,  arrangement, codec,  4, ind3);
6297 
6298     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6299   }
6300 
6301   /**
6302    *  Arguments:
6303    *
6304    *  Input:
6305    *  c_rarg0   - src_start
6306    *  c_rarg1   - src_offset
6307    *  c_rarg2   - src_length
6308    *  c_rarg3   - dest_start
6309    *  c_rarg4   - dest_offset
6310    *  c_rarg5   - isURL
6311    *
6312    */
6313   address generate_base64_encodeBlock() {
6314 
6315     static const char toBase64[64] = {
6316       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6317       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6318       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6319       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6320       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6321     };
6322 
6323     static const char toBase64URL[64] = {
6324       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6325       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6326       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6327       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6328       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6329     };
6330 
6331     __ align(CodeEntryAlignment);
6332     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6333     address start = __ pc();
6334 
6335     Register src   = c_rarg0;  // source array
6336     Register soff  = c_rarg1;  // source start offset
6337     Register send  = c_rarg2;  // source end offset
6338     Register dst   = c_rarg3;  // dest array
6339     Register doff  = c_rarg4;  // position for writing to dest array
6340     Register isURL = c_rarg5;  // Base64 or URL character set
6341 
6342     // c_rarg6 and c_rarg7 are free to use as temps
6343     Register codec  = c_rarg6;
6344     Register length = c_rarg7;
6345 
6346     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6347 
6348     __ add(src, src, soff);
6349     __ add(dst, dst, doff);
6350     __ sub(length, send, soff);
6351 
6352     // load the codec base address
6353     __ lea(codec, ExternalAddress((address) toBase64));
6354     __ cbz(isURL, ProcessData);
6355     __ lea(codec, ExternalAddress((address) toBase64URL));
6356 
6357     __ BIND(ProcessData);
6358 
6359     // too short to form a SIMD loop; fall back to the scalar path
6360     __ cmp(length, (u1)24);
6361     __ br(Assembler::LT, Process3B);
6362 
6363     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6364 
6365     __ BIND(Process48B);
6366     __ cmp(length, (u1)48);
6367     __ br(Assembler::LT, Process24B);
6368     generate_base64_encode_simdround(src, dst, v0, 16);
6369     __ sub(length, length, 48);
6370     __ b(Process48B);
6371 
6372     __ BIND(Process24B);
6373     __ cmp(length, (u1)24);
6374     __ br(Assembler::LT, SIMDExit);
6375     generate_base64_encode_simdround(src, dst, v0, 8);
6376     __ sub(length, length, 24);
6377 
6378     __ BIND(SIMDExit);
6379     __ cbz(length, Exit);
6380 
6381     __ BIND(Process3B);
6382     //  3 src bytes, 24 bits
6383     __ ldrb(r10, __ post(src, 1));
6384     __ ldrb(r11, __ post(src, 1));
6385     __ ldrb(r12, __ post(src, 1));
6386     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6387     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6388     // codec index
6389     __ ubfmw(r15, r12, 18, 23);
6390     __ ubfmw(r14, r12, 12, 17);
6391     __ ubfmw(r13, r12, 6,  11);
6392     __ andw(r12,  r12, 63);
6393     // get the code based on the codec
6394     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6395     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6396     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6397     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6398     __ strb(r15, __ post(dst, 1));
6399     __ strb(r14, __ post(dst, 1));
6400     __ strb(r13, __ post(dst, 1));
6401     __ strb(r12, __ post(dst, 1));
6402     __ sub(length, length, 3);
6403     __ cbnz(length, Process3B);
6404 
6405     __ BIND(Exit);
6406     __ ret(lr);
6407 
6408     return start;
6409   }
6410 
6411   void generate_base64_decode_simdround(Register src, Register dst,
6412         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6413 
6414     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6415     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6416 
6417     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6418     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6419 
6420     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6421 
6422     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6423 
6424     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6425 
6426     // Use an unsigned saturating subtract so that every input value in the
6427     // range [0, 63] yields index 0 for the higher-half lookup.
6428     __ uqsubv(decH0, __ T16B, in0, v27);
6429     __ uqsubv(decH1, __ T16B, in1, v27);
6430     __ uqsubv(decH2, __ T16B, in2, v27);
6431     __ uqsubv(decH3, __ T16B, in3, v27);
6432 
6433     // lower half lookup
6434     __ tbl(decL0, arrangement, codecL, 4, in0);
6435     __ tbl(decL1, arrangement, codecL, 4, in1);
6436     __ tbl(decL2, arrangement, codecL, 4, in2);
6437     __ tbl(decL3, arrangement, codecL, 4, in3);
6438 
6439     // higher half lookup
6440     __ tbx(decH0, arrangement, codecH, 4, decH0);
6441     __ tbx(decH1, arrangement, codecH, 4, decH1);
6442     __ tbx(decH2, arrangement, codecH, 4, decH2);
6443     __ tbx(decH3, arrangement, codecH, 4, decH3);
6444 
6445     // combine lower and higher
6446     __ orr(decL0, arrangement, decL0, decH0);
6447     __ orr(decL1, arrangement, decL1, decH1);
6448     __ orr(decL2, arrangement, decL2, decH2);
6449     __ orr(decL3, arrangement, decL3, decH3);
6450 
6451     // check for illegal inputs: any value larger than 63 (the 6-bit maximum)
6452     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
6453     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
6454     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
6455     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
6456     __ orr(in0, arrangement, decH0, decH1);
6457     __ orr(in1, arrangement, decH2, decH3);
6458     __ orr(in2, arrangement, in0,   in1);
6459     __ umaxv(in3, arrangement, in2);
6460     __ umov(rscratch2, in3, __ B, 0);
6461 
6462     // get the data to output
6463     __ shl(out0,  arrangement, decL0, 2);
6464     __ ushr(out1, arrangement, decL1, 4);
6465     __ orr(out0,  arrangement, out0,  out1);
6466     __ shl(out1,  arrangement, decL1, 4);
6467     __ ushr(out2, arrangement, decL2, 2);
6468     __ orr(out1,  arrangement, out1,  out2);
6469     __ shl(out2,  arrangement, decL2, 6);
6470     __ orr(out2,  arrangement, out2,  decL3);
6471 
6472     __ cbz(rscratch2, NoIllegalData);
6473 
6474     // handle illegal input
6475     __ umov(r10, in2, __ D, 0);
6476     if (size == 16) {
6477       __ cbnz(r10, ErrorInLowerHalf);
6478 
6479       // illegal input is in higher half, store the lower half now.
6480       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6481 
6482       __ umov(r10, in2,  __ D, 1);
6483       __ umov(r11, out0, __ D, 1);
6484       __ umov(r12, out1, __ D, 1);
6485       __ umov(r13, out2, __ D, 1);
6486       __ b(StoreLegalData);
6487 
6488       __ BIND(ErrorInLowerHalf);
6489     }
6490     __ umov(r11, out0, __ D, 0);
6491     __ umov(r12, out1, __ D, 0);
6492     __ umov(r13, out2, __ D, 0);
6493 
6494     __ BIND(StoreLegalData);
6495     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6496     __ strb(r11, __ post(dst, 1));
6497     __ strb(r12, __ post(dst, 1));
6498     __ strb(r13, __ post(dst, 1));
6499     __ lsr(r10, r10, 8);
6500     __ lsr(r11, r11, 8);
6501     __ lsr(r12, r12, 8);
6502     __ lsr(r13, r13, 8);
6503     __ b(StoreLegalData);
6504 
6505     __ BIND(NoIllegalData);
6506     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6507   }
6508 
6509 
6510   /**
6511    *  Arguments:
6512    *
6513    *  Input:
6514    *  c_rarg0   - src_start
6515    *  c_rarg1   - src_offset
6516    *  c_rarg2   - src_length
6517    *  c_rarg3   - dest_start
6518    *  c_rarg4   - dest_offset
6519    *  c_rarg5   - isURL
6520    *  c_rarg6   - isMIME
6521    *
6522    */
6523   address generate_base64_decodeBlock() {
6524 
6525     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6526     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6527     // titled "Base64 decoding".
6528 
6529     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used
6530     // in java.util.Base64, except that the trailing character '=' is also treated
6531     // as an illegal value in this intrinsic: java.util.Base64.fromBase64['='] == -2,
         // while fromBase(URL)64ForNoSIMD['='] == 255 here.
6532     static const uint8_t fromBase64ForNoSIMD[256] = {
6533       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6534       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6535       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6536        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6537       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6538        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6539       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6540        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6541       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6542       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6543       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6544       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6545       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6546       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6547       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6548       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6549     };
6550 
6551     static const uint8_t fromBase64URLForNoSIMD[256] = {
6552       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6553       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6554       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6555        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6556       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6557        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6558       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6559        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6560       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6561       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6562       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6563       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6564       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6565       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6566       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6567       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6568     };
6569 
6570     // A legal Base64 code value is in the range [0, 127]. Two table lookups
6571     // with tbl/tbx are combined to obtain the decoded data. The first lookup
6572     // uses tbl, so out-of-range indices are set to 0 in the destination. The
6573     // second lookup uses tbx, so out-of-range indices leave the destination
6574     // unchanged. Inputs [64..126] map to indices [65, 127] of the table in the
6575     // second lookup, and the table entry at index 64 is 0, which signals that
6576     // the decoded value was already obtained by the first lookup.
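     // In scalar terms the two lookups combine roughly as follows (a sketch;
     // "table" is the 128-entry table below and c is one input byte):
     //
     //   lo  = (c <= 63) ? table[c] : 0;            // tbl: out-of-range index -> 0
     //   idx = (c <= 63) ? 0 : c - 63;              // uqsub against 63
     //   hi  = (idx <= 63) ? table[64 + idx] : idx; // tbx: out-of-range -> unchanged
     //   dec = lo | hi;                             // any value > 63 flags illegal input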
6577     static const uint8_t fromBase64ForSIMD[128] = {
6578       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6579       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6580       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6581        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6582         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6583        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6584       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6585        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6586     };
6587 
6588     static const uint8_t fromBase64URLForSIMD[128] = {
6589       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6590       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6591       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6592        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6593         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6594        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6595        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6596        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6597     };
6598 
6599     __ align(CodeEntryAlignment);
6600     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6601     address start = __ pc();
6602 
6603     Register src    = c_rarg0;  // source array
6604     Register soff   = c_rarg1;  // source start offset
6605     Register send   = c_rarg2;  // source end offset
6606     Register dst    = c_rarg3;  // dest array
6607     Register doff   = c_rarg4;  // position for writing to dest array
6608     Register isURL  = c_rarg5;  // Base64 or URL character set
6609     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6610 
6611     Register length = send;    // reuse send as length of source data to process
6612 
6613     Register simd_codec   = c_rarg6;
6614     Register nosimd_codec = c_rarg7;
6615 
6616     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6617 
6618     __ enter();
6619 
6620     __ add(src, src, soff);
6621     __ add(dst, dst, doff);
6622 
6623     __ mov(doff, dst);
6624 
6625     __ sub(length, send, soff);
6626     __ bfm(length, zr, 0, 1);  // clear the low 2 bits: round length down to a multiple of 4
6627 
6628     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6629     __ cbz(isURL, ProcessData);
6630     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6631 
6632     __ BIND(ProcessData);
6633     __ mov(rscratch1, length);
6634     __ cmp(length, (u1)144); // 144 = 80 + 64
6635     __ br(Assembler::LT, Process4B);
6636 
6637     // In the MIME case, the line length cannot be more than 76
6638     // bytes (see RFC 2045). This is too short a block for SIMD
6639     // to be worthwhile, so we use non-SIMD here.
6640     __ movw(rscratch1, 79);
6641 
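     // The scalar loop below decodes 4 input characters into 3 output bytes per
     // iteration; the lsl/bfi/bfm/rev16 sequence is, in effect (d0..d3 being the
     // four decoded 6-bit values):
     //
     //   out[0] = (d0 << 2) | (d1 >> 4);
     //   out[1] = (d1 << 4) | (d2 >> 2);
     //   out[2] = (d2 << 6) |  d3;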
6642     __ BIND(Process4B);
6643     __ ldrw(r14, __ post(src, 4));
6644     __ ubfxw(r10, r14, 0,  8);
6645     __ ubfxw(r11, r14, 8,  8);
6646     __ ubfxw(r12, r14, 16, 8);
6647     __ ubfxw(r13, r14, 24, 8);
6648     // look up the decoded values
6649     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6650     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6651     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6652     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6653     // error detection, 255u indicates an illegal input
6654     __ orrw(r14, r10, r11);
6655     __ orrw(r15, r12, r13);
6656     __ orrw(r14, r14, r15);
6657     __ tbnz(r14, 7, Exit);
6658     // recover the data
6659     __ lslw(r14, r10, 10);
6660     __ bfiw(r14, r11, 4, 6);
6661     __ bfmw(r14, r12, 2, 5);
6662     __ rev16w(r14, r14);
6663     __ bfiw(r13, r12, 6, 2);
6664     __ strh(r14, __ post(dst, 2));
6665     __ strb(r13, __ post(dst, 1));
6666     // non-simd loop
6667     __ subsw(rscratch1, rscratch1, 4);
6668     __ br(Assembler::GT, Process4B);
6669 
6670     // if we got here from the 80-byte pre-processing pass, rscratch1 == -1;
6671     // otherwise (the whole input was handled by this scalar loop), rscratch1 == 0.
6672     __ cbzw(rscratch1, Exit);
6673     __ sub(length, length, 80);
6674 
6675     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6676     __ cbz(isURL, SIMDEnter);
6677     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6678 
6679     __ BIND(SIMDEnter);
6680     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6681     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6682     __ mov(rscratch1, 63);
6683     __ dup(v27, __ T16B, rscratch1);
6684 
6685     __ BIND(Process64B);
6686     __ cmp(length, (u1)64);
6687     __ br(Assembler::LT, Process32B);
6688     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6689     __ sub(length, length, 64);
6690     __ b(Process64B);
6691 
6692     __ BIND(Process32B);
6693     __ cmp(length, (u1)32);
6694     __ br(Assembler::LT, SIMDExit);
6695     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6696     __ sub(length, length, 32);
6697     __ b(Process32B);
6698 
6699     __ BIND(SIMDExit);
6700     __ cbz(length, Exit);
6701     __ movw(rscratch1, length);
6702     __ b(Process4B);
6703 
6704     __ BIND(Exit);
6705     __ sub(c_rarg0, dst, doff);
6706 
6707     __ leave();
6708     __ ret(lr);
6709 
6710     return start;
6711   }
6712 
6713   // Support for spin waits.
6714   address generate_spin_wait() {
6715     __ align(CodeEntryAlignment);
6716     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6717     address start = __ pc();
6718 
6719     __ spin_wait();
6720     __ ret(lr);
6721 
6722     return start;
6723   }
6724 
6725 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6726 
6727   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6728   //
6729   // If LSE is in use, generate LSE versions of all the stubs. The
6730   // non-LSE versions are in atomic_aarch64.S.
6731 
6732   // class AtomicStubMark records the entry point of a stub and the
6733   // stub pointer which will point to it. The stub pointer is set to
6734   // the entry point when ~AtomicStubMark() is called, which must be
6735   // after ICache::invalidate_range. This ensures safe publication of
6736   // the generated code.
6737   class AtomicStubMark {
6738     address _entry_point;
6739     aarch64_atomic_stub_t *_stub;
6740     MacroAssembler *_masm;
6741   public:
6742     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6743       _masm = masm;
6744       __ align(32);
6745       _entry_point = __ pc();
6746       _stub = stub;
6747     }
6748     ~AtomicStubMark() {
6749       *_stub = (aarch64_atomic_stub_t)_entry_point;
6750     }
6751   };
6752 
6753   // NB: For memory_order_conservative we need a trailing membar after
6754   // LSE atomic operations but not a leading membar.
6755   //
6756   // We don't need a leading membar because a clause in the Arm ARM
6757   // says:
6758   //
6759   //   Barrier-ordered-before
6760   //
6761   //   Barrier instructions order prior Memory effects before subsequent
6762   //   Memory effects generated by the same Observer. A read or a write
6763   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
6764   //   Observer if and only if RW1 appears in program order before RW2
6765   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6766   //   instruction with both Acquire and Release semantics.
6767   //
6768   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6769   // and Release semantics, therefore we don't need a leading
6770   // barrier. However, there is no corresponding Barrier-ordered-after
6771   // relationship, therefore we need a trailing membar to prevent a
6772   // later store or load from being reordered with the store in an
6773   // atomic instruction.
6774   //
6775   // This was checked by using the herd7 consistency model simulator
6776   // (http://diy.inria.fr/) with this test case:
6777   //
6778   // AArch64 LseCas
6779   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6780   // P0 | P1;
6781   // LDR W4, [X2] | MOV W3, #0;
6782   // DMB LD       | MOV W4, #1;
6783   // LDR W3, [X1] | CASAL W3, W4, [X1];
6784   //              | DMB ISH;
6785   //              | STR W4, [X2];
6786   // exists
6787   // (0:X3=0 /\ 0:X4=1)
6788   //
6789   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6790   // with the store to x in P1. Without the DMB in P1 this may happen.
6791   //
6792   // At the time of writing we don't know of any AArch64 hardware that
6793   // reorders stores in this way, but the Reference Manual permits it.
6794 
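       // As an illustration, for a 32-bit conservative CAS the generator below
       // emits, in essence (exact encodings are left to lse_cas/membar):
       //
       //   mov   w3, w1          // prev = compare_val
       //   casal w3, w2, [x0]    // acquire + release CAS
       //   dmb   ish             // trailing barrier, conservative order only
       //   mov   w0, w3          // return the previous value
       //   ret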
6795   void gen_cas_entry(Assembler::operand_size size,
6796                      atomic_memory_order order) {
6797     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6798       exchange_val = c_rarg2;
6799     bool acquire, release;
6800     switch (order) {
6801       case memory_order_relaxed:
6802         acquire = false;
6803         release = false;
6804         break;
6805       case memory_order_release:
6806         acquire = false;
6807         release = true;
6808         break;
6809       default:
6810         acquire = true;
6811         release = true;
6812         break;
6813     }
6814     __ mov(prev, compare_val);
6815     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6816     if (order == memory_order_conservative) {
6817       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6818     }
6819     if (size == Assembler::xword) {
6820       __ mov(r0, prev);
6821     } else {
6822       __ movw(r0, prev);
6823     }
6824     __ ret(lr);
6825   }
6826 
6827   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6828     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6829     // If not relaxed, then default to conservative.  Relaxed is the only
6830     // case we use enough to be worth specializing.
6831     if (order == memory_order_relaxed) {
6832       __ ldadd(size, incr, prev, addr);
6833     } else {
6834       __ ldaddal(size, incr, prev, addr);
6835       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6836     }
6837     if (size == Assembler::xword) {
6838       __ mov(r0, prev);
6839     } else {
6840       __ movw(r0, prev);
6841     }
6842     __ ret(lr);
6843   }
6844 
6845   void gen_swpal_entry(Assembler::operand_size size) {
6846     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6847     __ swpal(size, incr, prev, addr);
6848     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6849     if (size == Assembler::xword) {
6850       __ mov(r0, prev);
6851     } else {
6852       __ movw(r0, prev);
6853     }
6854     __ ret(lr);
6855   }
6856 
6857   void generate_atomic_entry_points() {
6858     if (! UseLSE) {
6859       return;
6860     }
6861 
6862     __ align(CodeEntryAlignment);
6863     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6864     address first_entry = __ pc();
6865 
6866     // ADD, memory_order_conservative
6867     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6868     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6869     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6870     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6871 
6872     // ADD, memory_order_relaxed
6873     AtomicStubMark mark_fetch_add_4_relaxed
6874       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6875     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6876     AtomicStubMark mark_fetch_add_8_relaxed
6877       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6878     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6879 
6880     // XCHG, memory_order_conservative
6881     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6882     gen_swpal_entry(Assembler::word);
6883     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6884     gen_swpal_entry(Assembler::xword);
6885 
6886     // CAS, memory_order_conservative
6887     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6888     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6889     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6890     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6891     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6892     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6893 
6894     // CAS, memory_order_relaxed
6895     AtomicStubMark mark_cmpxchg_1_relaxed
6896       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6897     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6898     AtomicStubMark mark_cmpxchg_4_relaxed
6899       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6900     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6901     AtomicStubMark mark_cmpxchg_8_relaxed
6902       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6903     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6904 
6905     AtomicStubMark mark_cmpxchg_4_release
6906       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6907     gen_cas_entry(MacroAssembler::word, memory_order_release);
6908     AtomicStubMark mark_cmpxchg_8_release
6909       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6910     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6911 
6912     AtomicStubMark mark_cmpxchg_4_seq_cst
6913       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6914     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6915     AtomicStubMark mark_cmpxchg_8_seq_cst
6916       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6917     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6918 
6919     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6920   }
6921 #endif // LINUX
6922 
6923   address generate_cont_thaw(Continuation::thaw_kind kind) {
6924     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
6925     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
6926 
6927     address start = __ pc();
6928 
6929     if (return_barrier) {
6930       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6931       __ mov(sp, rscratch1);
6932     }
6933     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6934 
6935     if (return_barrier) {
6936       // preserve possible return value from a method returning to the return barrier
6937       __ fmovd(rscratch1, v0);
6938       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6939     }
6940 
6941     __ movw(c_rarg1, (return_barrier ? 1 : 0));
6942     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
6943     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
6944 
6945     if (return_barrier) {
6946       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6947       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6948       __ fmovd(v0, rscratch1);
6949     }
6950     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6951 
6952 
6953     Label thaw_success;
6954     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
6955     __ cbnz(rscratch2, thaw_success);
6956     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
6957     __ br(rscratch1);
6958     __ bind(thaw_success);
6959 
6960     // make room for the thawed frames
6961     __ sub(rscratch1, sp, rscratch2);
6962     __ andr(rscratch1, rscratch1, -16); // align
6963     __ mov(sp, rscratch1);
6964 
6965     if (return_barrier) {
6966       // save original return value -- again
6967       __ fmovd(rscratch1, v0);
6968       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6969     }
6970 
6971     // If we want, we can templatize thaw by kind, and have three different entries
6972     __ movw(c_rarg1, (uint32_t)kind);
6973 
6974     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
6975     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
6976 
6977     if (return_barrier) {
6978       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6979       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6980       __ fmovd(v0, rscratch1);
6981     } else {
6982       __ mov(r0, zr); // return 0 (success) from doYield
6983     }
6984 
6985     // we're now on the yield frame (which is at an address above us because sp has been pushed down)
6986     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
6987     __ mov(rfp, sp);
6988 
6989     if (return_barrier_exception) {
6990       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
6991       __ verify_oop(r0);
6992       __ mov(r19, r0); // save return value containing the exception oop in callee-saved R19
6993 
6994       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
6995 
6996       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
6997       // __ reinitialize_ptrue();
6998 
6999       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7000 
7001       __ mov(r1, r0); // the exception handler
7002       __ mov(r0, r19); // restore return value containing the exception oop
7003       __ verify_oop(r0);
7004 
7005       __ leave();
7006       __ mov(r3, lr);
7007       __ br(r1); // the exception handler
7008     } else {
7009       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7010       __ leave();
7011       __ ret(lr);
7012     }
7013 
7014     return start;
7015   }
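       // In pseudocode, the code emitted above does approximately the
       // following (an informal sketch, not the authoritative thaw algorithm):
       //
       //   if (return_barrier) { sp = thread->cont_entry; push(v0, r0); }
       //   size = Continuation::prepare_thaw(thread, return_barrier);
       //   if (return_barrier) pop(v0, r0);
       //   if (size == 0) goto StubRoutines::throw_StackOverflowError_entry;
       //   sp = align_down(sp - size, 16);
       //   if (return_barrier) push(v0, r0);
       //   yield_sp = thaw(thread, kind);
       //   if (return_barrier) pop(v0, r0); else r0 = 0;  // doYield reports success
       //   sp = yield_sp - 2 * wordSize;                  // now at the rfp spill slot
       //   rfp = sp;
       //   if (return_barrier_exception)
       //     jump to the handler from exception_handler_for_return_address
       //     (r0 = exception oop, r3 = exception pc);
       //   else { leave(); ret(); }       // "return" into the topmost thawed frame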
7016 
7017   address generate_cont_thaw() {
7018     if (!Continuations::enabled()) return nullptr;
7019 
7020     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
7021     address start = __ pc();
7022     generate_cont_thaw(Continuation::thaw_top);
7023     return start;
7024   }
7025 
7026   address generate_cont_returnBarrier() {
7027     if (!Continuations::enabled()) return nullptr;
7028 
7029     // TODO: will probably need multiple return barriers depending on return type
7030     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
7031     address start = __ pc();
7032 
7033     generate_cont_thaw(Continuation::thaw_return_barrier);
7034 
7035     return start;
7036   }
7037 
7038   address generate_cont_returnBarrier_exception() {
7039     if (!Continuations::enabled()) return nullptr;
7040 
7041     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
7042     address start = __ pc();
7043 
7044     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7045 
7046     return start;
7047   }
7048 
7049 #if INCLUDE_JFR
7050 
7051   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
7052     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7053     __ mov(c_rarg0, thread);
7054   }
7055 
7056   // The handle is dereferenced through a load barrier.
7057   static void jfr_epilogue(MacroAssembler* _masm) {
7058     __ reset_last_Java_frame(true);
7059     __ resolve_global_jobject(r0, rscratch1, rscratch2);
7060   }
7061 
7062   // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
7063   // It returns a jobject handle to the event writer.
7064   // The handle is dereferenced and the return value is the event writer oop.
7065   static RuntimeStub* generate_jfr_write_checkpoint() {
7066     enum layout {
7067       rbp_off,
7068       rbpH_off,
7069       return_off,
7070       return_off2,
7071       framesize // inclusive of return address
7072     };
7073 
7074     int insts_size = 1024;
7075     int locs_size = 64;
7076     CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
7077     OopMapSet* oop_maps = new OopMapSet();
7078     MacroAssembler* masm = new MacroAssembler(&code);
7079     MacroAssembler* _masm = masm;
7080 
7081     address start = __ pc();
7082     __ enter();
7083     int frame_complete = __ pc() - start;
7084     address the_pc = __ pc();
7085     jfr_prologue(the_pc, _masm, rthread);
7086     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
7087     jfr_epilogue(_masm);
7088     __ leave();
7089     __ ret(lr);
7090 
7091     OopMap* map = new OopMap(framesize, 1); // rfp
7092     oop_maps->add_gc_map(the_pc - start, map);
7093 
7094     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7095       RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
7096                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7097                                     oop_maps, false);
7098     return stub;
7099   }
7100 
7101 #endif // INCLUDE_JFR
7102 
7103   // Continuation point for throwing of implicit exceptions that are
7104   // not handled in the current activation. Fabricates an exception
7105   // oop and initiates normal exception dispatching in this
7106   // frame. Since we need to preserve callee-saved values (currently
7107   // only for C2, but done for C1 as well) we need a callee-saved oop
7108   // map and therefore have to make these stubs into RuntimeStubs
7109   // rather than BufferBlobs.  If the compiler needs all registers to
7110   // be preserved between the fault point and the exception handler
7111   // then it must assume responsibility for that in
7112   // AbstractCompiler::continuation_for_implicit_null_exception or
7113   // continuation_for_implicit_division_by_zero_exception. All other
7114   // implicit exceptions (e.g., NullPointerException or
7115   // AbstractMethodError on entry) are either at call sites or
7116   // otherwise assume that stack unwinding will be initiated, so
7117   // caller saved registers were assumed volatile in the compiler.
7118 
7119 #undef __
7120 #define __ masm->
7121 
7122   address generate_throw_exception(const char* name,
7123                                    address runtime_entry,
7124                                    Register arg1 = noreg,
7125                                    Register arg2 = noreg) {
7126     // Information about frame layout at time of blocking runtime call.
7127     // Note that we only have to preserve callee-saved registers since
7128     // the compilers are responsible for supplying a continuation point
7129     // if they expect all registers to be preserved.
7130     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
7131     enum layout {
7132       rfp_off = 0,
7133       rfp_off2,
7134       return_off,
7135       return_off2,
7136       framesize // inclusive of return address
7137     };
7138 
7139     int insts_size = 512;
7140     int locs_size  = 64;
7141 
7142     CodeBuffer code(name, insts_size, locs_size);
7143     OopMapSet* oop_maps  = new OopMapSet();
7144     MacroAssembler* masm = new MacroAssembler(&code);
7145 
7146     address start = __ pc();
7147 
7148     // This is an inlined and slightly modified version of call_VM
7149     // which has the ability to fetch the return PC out of
7150     // thread-local storage and also sets up last_Java_sp slightly
7151     // differently than the real call_VM
7152 
7153     __ enter(); // Save FP and LR before call
7154 
7155     assert(is_even(framesize/2), "sp not 16-byte aligned");
7156 
7157     // lr and fp are already in place
7158     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
7159 
7160     int frame_complete = __ pc() - start;
7161 
7162     // Set up last_Java_sp and last_Java_fp
7163     address the_pc = __ pc();
7164     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7165 
7166     // Call runtime
7167     if (arg1 != noreg) {
7168       assert(arg2 != c_rarg1, "clobbered");
7169       __ mov(c_rarg1, arg1);
7170     }
7171     if (arg2 != noreg) {
7172       __ mov(c_rarg2, arg2);
7173     }
7174     __ mov(c_rarg0, rthread);
7175     BLOCK_COMMENT("call runtime_entry");
7176     __ mov(rscratch1, runtime_entry);
7177     __ blr(rscratch1);
7178 
7179     // Generate oop map
7180     OopMap* map = new OopMap(framesize, 0);
7181 
7182     oop_maps->add_gc_map(the_pc - start, map);
7183 
7184     __ reset_last_Java_frame(true);
7185 
7186     // Reinitialize the ptrue predicate register, in case the external runtime
7187     // call clobbers ptrue reg, as we may return to SVE compiled code.
7188     __ reinitialize_ptrue();
7189 
7190     __ leave();
7191 
7192     // check for pending exceptions
7193 #ifdef ASSERT
7194     Label L;
7195     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
7196     __ cbnz(rscratch1, L);
7197     __ should_not_reach_here();
7198     __ bind(L);
7199 #endif // ASSERT
7200     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7201 
7202     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7203     RuntimeStub* stub =
7204       RuntimeStub::new_runtime_stub(name,
7205                                     &code,
7206                                     frame_complete,
7207                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7208                                     oop_maps, false);
7209     return stub->entry_point();
7210   }
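       // In pseudocode, each stub generated above does approximately the
       // following (an informal sketch of the code emitted above):
       //
       //   enter();                             // save rfp/lr, build the frame
       //   set_last_Java_frame(sp, rfp, pc);
       //   runtime_entry(thread, arg1, arg2);   // installs the pending exception
       //   reset_last_Java_frame();
       //   reinitialize_ptrue();                // SVE predicate may be clobbered
       //   leave();
       //   assert(thread->pending_exception() != nullptr);
       //   jump to StubRoutines::forward_exception_entry();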
7211 
7212   class MontgomeryMultiplyGenerator : public MacroAssembler {
7213 
7214     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7215       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7216 
7217     RegSet _toSave;
7218     bool _squaring;
7219 
7220   public:
7221     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7222       : MacroAssembler(as->code()), _squaring(squaring) {
7223 
7224       // Register allocation
7225 
7226       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7227       Pa_base = *regs;       // Argument registers
7228       if (squaring)
7229         Pb_base = Pa_base;
7230       else
7231         Pb_base = *++regs;
7232       Pn_base = *++regs;
7233       Rlen = *++regs;
7234       inv = *++regs;
7235       Pm_base = *++regs;
7236 
7237                           // Working registers:
7238       Ra =  *++regs;        // The current digit of a, b, n, and m.
7239       Rb =  *++regs;
7240       Rm =  *++regs;
7241       Rn =  *++regs;
7242 
7243       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7244       Pb =  *++regs;
7245       Pm =  *++regs;
7246       Pn =  *++regs;
7247 
7248       t0 =  *++regs;        // Three registers which form a
7249       t1 =  *++regs;        // triple-precision accumulator.
7250       t2 =  *++regs;
7251 
7252       Ri =  *++regs;        // Inner and outer loop indexes.
7253       Rj =  *++regs;
7254 
7255       Rhi_ab = *++regs;     // Product registers: low and high parts
7256       Rlo_ab = *++regs;     // of a*b and m*n.
7257       Rhi_mn = *++regs;
7258       Rlo_mn = *++regs;
7259 
7260       // r19 and up are callee-saved.
7261       _toSave = RegSet::range(r19, *regs) + Pm_base;
7262     }
7263 
7264   private:
7265     void save_regs() {
7266       push(_toSave, sp);
7267     }
7268 
7269     void restore_regs() {
7270       pop(_toSave, sp);
7271     }
7272 
7273     template <typename T>
7274     void unroll_2(Register count, T block) {
7275       Label loop, end, odd;
7276       tbnz(count, 0, odd);
7277       cbz(count, end);
7278       align(16);
7279       bind(loop);
7280       (this->*block)();
7281       bind(odd);
7282       (this->*block)();
7283       subs(count, count, 2);
7284       br(Assembler::GT, loop);
7285       bind(end);
7286     }
7287 
7288     template <typename T>
7289     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7290       Label loop, end, odd;
7291       tbnz(count, 0, odd);
7292       cbz(count, end);
7293       align(16);
7294       bind(loop);
7295       (this->*block)(d, s, tmp);
7296       bind(odd);
7297       (this->*block)(d, s, tmp);
7298       subs(count, count, 2);
7299       br(Assembler::GT, loop);
7300       bind(end);
7301     }
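         // Both unroll_2 variants above execute 'block' exactly 'count'
         // times (for count >= 0), approximately equivalent to
         //
         //   for (int i = 0; i < count; i++) block();
         //
         // with the body unrolled two copies per iteration; the 'odd' entry
         // point peels one call when count is odd.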
7302 
7303     void pre1(RegisterOrConstant i) {
7304       block_comment("pre1");
7305       // Pa = Pa_base;
7306       // Pb = Pb_base + i;
7307       // Pm = Pm_base;
7308       // Pn = Pn_base + i;
7309       // Ra = *Pa;
7310       // Rb = *Pb;
7311       // Rm = *Pm;
7312       // Rn = *Pn;
7313       ldr(Ra, Address(Pa_base));
7314       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7315       ldr(Rm, Address(Pm_base));
7316       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7317       lea(Pa, Address(Pa_base));
7318       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7319       lea(Pm, Address(Pm_base));
7320       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7321 
7322       // Zero the m*n result.
7323       mov(Rhi_mn, zr);
7324       mov(Rlo_mn, zr);
7325     }
7326 
7327     // The core multiply-accumulate step of a Montgomery
7328     // multiplication.  The idea is to schedule operations as a
7329     // pipeline so that instructions with long latencies (loads and
7330     // multiplies) have time to complete before their results are
7331     // used.  This most benefits in-order implementations of the
7332     // architecture but out-of-order ones also benefit.
7333     void step() {
7334       block_comment("step");
7335       // MACC(Ra, Rb, t0, t1, t2);
7336       // Ra = *++Pa;
7337       // Rb = *--Pb;
7338       umulh(Rhi_ab, Ra, Rb);
7339       mul(Rlo_ab, Ra, Rb);
7340       ldr(Ra, pre(Pa, wordSize));
7341       ldr(Rb, pre(Pb, -wordSize));
7342       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7343                                        // previous iteration.
7344       // MACC(Rm, Rn, t0, t1, t2);
7345       // Rm = *++Pm;
7346       // Rn = *--Pn;
7347       umulh(Rhi_mn, Rm, Rn);
7348       mul(Rlo_mn, Rm, Rn);
7349       ldr(Rm, pre(Pm, wordSize));
7350       ldr(Rn, pre(Pn, -wordSize));
7351       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7352     }
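         // A note on notation: in the comments here and in the C sketches
         // below, MACC(A, B, t0, t1, t2) stands, approximately, for a
         // 64x64->128-bit multiply-accumulate into the triple-precision
         // value t2:t1:t0:
         //
         //   unsigned __int128 p = (unsigned __int128)A * B;
         //   t2:t1:t0 += p;   // see acc() below
         //
         // MACC2 accumulates the same product twice (cf. step_squaring()).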
7353 
7354     void post1() {
7355       block_comment("post1");
7356 
7357       // MACC(Ra, Rb, t0, t1, t2);
7358       // Ra = *++Pa;
7359       // Rb = *--Pb;
7360       umulh(Rhi_ab, Ra, Rb);
7361       mul(Rlo_ab, Ra, Rb);
7362       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7363       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7364 
7365       // *Pm = Rm = t0 * inv;
7366       mul(Rm, t0, inv);
7367       str(Rm, Address(Pm));
7368 
7369       // MACC(Rm, Rn, t0, t1, t2);
7370       // t0 = t1; t1 = t2; t2 = 0;
7371       umulh(Rhi_mn, Rm, Rn);
7372 
7373 #ifndef PRODUCT
7374       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7375       {
7376         mul(Rlo_mn, Rm, Rn);
7377         add(Rlo_mn, t0, Rlo_mn);
7378         Label ok;
7379         cbz(Rlo_mn, ok); {
7380           stop("broken Montgomery multiply");
7381         } bind(ok);
7382       }
7383 #endif
7384       // We have very carefully set things up so that
7385       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7386       // the lower half of Rm * Rn because we know the result already:
7387       // it must be -t0.  t0 + (-t0) must generate a carry iff
7388       // t0 != 0.  So, rather than do a mul and an adds we just set
7389       // the carry flag iff t0 is nonzero.
7390       //
7391       // mul(Rlo_mn, Rm, Rn);
7392       // adds(zr, t0, Rlo_mn);
7393       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7394       adcs(t0, t1, Rhi_mn);
7395       adc(t1, t2, zr);
7396       mov(t2, zr);
7397     }
7398 
7399     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7400       block_comment("pre2");
7401       // Pa = Pa_base + i-len;
7402       // Pb = Pb_base + len;
7403       // Pm = Pm_base + i-len;
7404       // Pn = Pn_base + len;
7405 
7406       if (i.is_register()) {
7407         sub(Rj, i.as_register(), len);
7408       } else {
7409         mov(Rj, i.as_constant());
7410         sub(Rj, Rj, len);
7411       }
7412       // Rj == i-len
7413 
7414       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7415       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7416       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7417       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7418 
7419       // Ra = *++Pa;
7420       // Rb = *--Pb;
7421       // Rm = *++Pm;
7422       // Rn = *--Pn;
7423       ldr(Ra, pre(Pa, wordSize));
7424       ldr(Rb, pre(Pb, -wordSize));
7425       ldr(Rm, pre(Pm, wordSize));
7426       ldr(Rn, pre(Pn, -wordSize));
7427 
7428       mov(Rhi_mn, zr);
7429       mov(Rlo_mn, zr);
7430     }
7431 
7432     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7433       block_comment("post2");
7434       if (i.is_constant()) {
7435         mov(Rj, i.as_constant()-len.as_constant());
7436       } else {
7437         sub(Rj, i.as_register(), len);
7438       }
7439 
7440       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7441 
7442       // As soon as we know the least significant digit of our result,
7443       // store it.
7444       // Pm_base[i-len] = t0;
7445       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7446 
7447       // t0 = t1; t1 = t2; t2 = 0;
7448       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7449       adc(t1, t2, zr);
7450       mov(t2, zr);
7451     }
7452 
7453     // A carry in t0 after Montgomery multiplication means that we
7454     // should subtract multiples of n from our result in m.  We'll
7455     // keep doing that until there is no carry.
7456     void normalize(RegisterOrConstant len) {
7457       block_comment("normalize");
7458       // while (t0)
7459       //   t0 = sub(Pm_base, Pn_base, t0, len);
7460       Label loop, post, again;
7461       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7462       cbz(t0, post); {
7463         bind(again); {
7464           mov(i, zr);
7465           mov(cnt, len);
7466           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7467           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7468           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7469           align(16);
7470           bind(loop); {
7471             sbcs(Rm, Rm, Rn);
7472             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7473             add(i, i, 1);
7474             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7475             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7476             sub(cnt, cnt, 1);
7477           } cbnz(cnt, loop);
7478           sbc(t0, t0, zr);
7479         } cbnz(t0, again);
7480       } bind(post);
7481     }
7482 
7483     // Move memory at s to d, reversing words.
7484     //    Increments d to end of copied memory
7485     //    Destroys tmp1, tmp2
7486     //    Preserves len
7487     //    Leaves s pointing to the address which was in d at start
7488     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7489       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7490       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7491 
7492       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7493       mov(tmp1, len);
7494       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7495       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7496     }
7497     // where
7498     void reverse1(Register d, Register s, Register tmp) {
7499       ldr(tmp, pre(s, -wordSize));
7500       ror(tmp, tmp, 32);
7501       str(tmp, post(d, wordSize));
7502     }
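         // In C, approximately (illustrative sketch; each 64-bit word is
         // copied in reverse order and its two 32-bit halves are swapped
         // by the ror of 32):
         //
         //   julong *src = s + len;
         //   for (int i = 0; i < len; i++) {
         //     julong w = *--src;
         //     d[i] = (w >> 32) | (w << 32);
         //   }
         //   s = d; d += len;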
7503 
7504     void step_squaring() {
7505       // An extra ACC
7506       step();
7507       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7508     }
7509 
7510     void last_squaring(RegisterOrConstant i) {
7511       Label dont;
7512       // if ((i & 1) == 0) {
7513       tbnz(i.as_register(), 0, dont); {
7514         // MACC(Ra, Rb, t0, t1, t2);
7515         // Ra = *++Pa;
7516         // Rb = *--Pb;
7517         umulh(Rhi_ab, Ra, Rb);
7518         mul(Rlo_ab, Ra, Rb);
7519         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7520       } bind(dont);
7521     }
7522 
7523     void extra_step_squaring() {
7524       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7525 
7526       // MACC(Rm, Rn, t0, t1, t2);
7527       // Rm = *++Pm;
7528       // Rn = *--Pn;
7529       umulh(Rhi_mn, Rm, Rn);
7530       mul(Rlo_mn, Rm, Rn);
7531       ldr(Rm, pre(Pm, wordSize));
7532       ldr(Rn, pre(Pn, -wordSize));
7533     }
7534 
7535     void post1_squaring() {
7536       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7537 
7538       // *Pm = Rm = t0 * inv;
7539       mul(Rm, t0, inv);
7540       str(Rm, Address(Pm));
7541 
7542       // MACC(Rm, Rn, t0, t1, t2);
7543       // t0 = t1; t1 = t2; t2 = 0;
7544       umulh(Rhi_mn, Rm, Rn);
7545 
7546 #ifndef PRODUCT
7547       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7548       {
7549         mul(Rlo_mn, Rm, Rn);
7550         add(Rlo_mn, t0, Rlo_mn);
7551         Label ok;
7552         cbz(Rlo_mn, ok); {
7553           stop("broken Montgomery multiply");
7554         } bind(ok);
7555       }
7556 #endif
7557       // We have very carefully set things up so that
7558       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7559       // the lower half of Rm * Rn because we know the result already:
7560       // it must be -t0.  t0 + (-t0) must generate a carry iff
7561       // t0 != 0.  So, rather than do a mul and an adds we just set
7562       // the carry flag iff t0 is nonzero.
7563       //
7564       // mul(Rlo_mn, Rm, Rn);
7565       // adds(zr, t0, Rlo_mn);
7566       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7567       adcs(t0, t1, Rhi_mn);
7568       adc(t1, t2, zr);
7569       mov(t2, zr);
7570     }
7571 
7572     void acc(Register Rhi, Register Rlo,
7573              Register t0, Register t1, Register t2) {
7574       adds(t0, t0, Rlo);
7575       adcs(t1, t1, Rhi);
7576       adc(t2, t2, zr);
7577     }
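         // In C, approximately: t2:t1:t0 += Rhi:Rlo, i.e. a 128-bit value
         // added into the 192-bit triple-precision accumulator.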
7578 
7579   public:
7580     /**
7581      * Fast Montgomery multiplication.  The derivation of the
7582      * algorithm is in A Cryptographic Library for the Motorola
7583      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7584      *
7585      * Arguments:
7586      *
7587      * Inputs for multiplication:
7588      *   c_rarg0   - int array elements a
7589      *   c_rarg1   - int array elements b
7590      *   c_rarg2   - int array elements n (the modulus)
7591      *   c_rarg3   - int length
7592      *   c_rarg4   - int inv
7593      *   c_rarg5   - int array elements m (the result)
7594      *
7595      * Inputs for squaring:
7596      *   c_rarg0   - int array elements a
7597      *   c_rarg1   - int array elements n (the modulus)
7598      *   c_rarg2   - int length
7599      *   c_rarg3   - int inv
7600      *   c_rarg4   - int array elements m (the result)
7601      *
7602      */
7603     address generate_multiply() {
7604       Label argh, nothing;
7605       bind(argh);
7606       stop("MontgomeryMultiply total_allocation must be <= 8192");
7607 
7608       align(CodeEntryAlignment);
7609       address entry = pc();
7610 
7611       cbzw(Rlen, nothing);
7612 
7613       enter();
7614 
7615       // Make room.
7616       cmpw(Rlen, 512);
7617       br(Assembler::HI, argh);
7618       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7619       andr(sp, Ra, -2 * wordSize);
7620 
7621       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7622 
7623       {
7624         // Copy input args, reversing as we go.  We use Ra as a
7625         // temporary variable.
7626         reverse(Ra, Pa_base, Rlen, t0, t1);
7627         if (!_squaring)
7628           reverse(Ra, Pb_base, Rlen, t0, t1);
7629         reverse(Ra, Pn_base, Rlen, t0, t1);
7630       }
7631 
7632       // Push all call-saved registers and also Pm_base which we'll need
7633       // at the end.
7634       save_regs();
7635 
7636 #ifndef PRODUCT
7637       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7638       {
7639         ldr(Rn, Address(Pn_base, 0));
7640         mul(Rlo_mn, Rn, inv);
7641         subs(zr, Rlo_mn, -1);
7642         Label ok;
7643         br(EQ, ok); {
7644           stop("broken inverse in Montgomery multiply");
7645         } bind(ok);
7646       }
7647 #endif
7648 
7649       mov(Pm_base, Ra);
7650 
7651       mov(t0, zr);
7652       mov(t1, zr);
7653       mov(t2, zr);
7654 
7655       block_comment("for (int i = 0; i < len; i++) {");
7656       mov(Ri, zr); {
7657         Label loop, end;
7658         cmpw(Ri, Rlen);
7659         br(Assembler::GE, end);
7660 
7661         bind(loop);
7662         pre1(Ri);
7663 
7664         block_comment("  for (j = i; j; j--) {"); {
7665           movw(Rj, Ri);
7666           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7667         } block_comment("  } // j");
7668 
7669         post1();
7670         addw(Ri, Ri, 1);
7671         cmpw(Ri, Rlen);
7672         br(Assembler::LT, loop);
7673         bind(end);
7674         block_comment("} // i");
7675       }
7676 
7677       block_comment("for (int i = len; i < 2*len; i++) {");
7678       mov(Ri, Rlen); {
7679         Label loop, end;
7680         cmpw(Ri, Rlen, Assembler::LSL, 1);
7681         br(Assembler::GE, end);
7682 
7683         bind(loop);
7684         pre2(Ri, Rlen);
7685 
7686         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7687           lslw(Rj, Rlen, 1);
7688           subw(Rj, Rj, Ri);
7689           subw(Rj, Rj, 1);
7690           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7691         } block_comment("  } // j");
7692 
7693         post2(Ri, Rlen);
7694         addw(Ri, Ri, 1);
7695         cmpw(Ri, Rlen, Assembler::LSL, 1);
7696         br(Assembler::LT, loop);
7697         bind(end);
7698       }
7699       block_comment("} // i");
7700 
7701       normalize(Rlen);
7702 
7703       mov(Ra, Pm_base);  // Save Pm_base in Ra
7704       restore_regs();  // Restore caller's Pm_base
7705 
7706       // Copy our result into caller's Pm_base
7707       reverse(Pm_base, Ra, Rlen, t0, t1);
7708 
7709       leave();
7710       bind(nothing);
7711       ret(lr);
7712 
7713       return entry;
7714     }
7715     // In C, approximately:
7716 
7717     // void
7718     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7719     //                     julong Pn_base[], julong Pm_base[],
7720     //                     julong inv, int len) {
7721     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7722     //   julong *Pa, *Pb, *Pn, *Pm;
7723     //   julong Ra, Rb, Rn, Rm;
7724 
7725     //   int i;
7726 
7727     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7728 
7729     //   for (i = 0; i < len; i++) {
7730     //     int j;
7731 
7732     //     Pa = Pa_base;
7733     //     Pb = Pb_base + i;
7734     //     Pm = Pm_base;
7735     //     Pn = Pn_base + i;
7736 
7737     //     Ra = *Pa;
7738     //     Rb = *Pb;
7739     //     Rm = *Pm;
7740     //     Rn = *Pn;
7741 
7742     //     int iters = i;
7743     //     for (j = 0; iters--; j++) {
7744     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7745     //       MACC(Ra, Rb, t0, t1, t2);
7746     //       Ra = *++Pa;
7747     //       Rb = *--Pb;
7748     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7749     //       MACC(Rm, Rn, t0, t1, t2);
7750     //       Rm = *++Pm;
7751     //       Rn = *--Pn;
7752     //     }
7753 
7754     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7755     //     MACC(Ra, Rb, t0, t1, t2);
7756     //     *Pm = Rm = t0 * inv;
7757     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7758     //     MACC(Rm, Rn, t0, t1, t2);
7759 
7760     //     assert(t0 == 0, "broken Montgomery multiply");
7761 
7762     //     t0 = t1; t1 = t2; t2 = 0;
7763     //   }
7764 
7765     //   for (i = len; i < 2*len; i++) {
7766     //     int j;
7767 
7768     //     Pa = Pa_base + i-len;
7769     //     Pb = Pb_base + len;
7770     //     Pm = Pm_base + i-len;
7771     //     Pn = Pn_base + len;
7772 
7773     //     Ra = *++Pa;
7774     //     Rb = *--Pb;
7775     //     Rm = *++Pm;
7776     //     Rn = *--Pn;
7777 
7778     //     int iters = len*2-i-1;
7779     //     for (j = i-len+1; iters--; j++) {
7780     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7781     //       MACC(Ra, Rb, t0, t1, t2);
7782     //       Ra = *++Pa;
7783     //       Rb = *--Pb;
7784     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7785     //       MACC(Rm, Rn, t0, t1, t2);
7786     //       Rm = *++Pm;
7787     //       Rn = *--Pn;
7788     //     }
7789 
7790     //     Pm_base[i-len] = t0;
7791     //     t0 = t1; t1 = t2; t2 = 0;
7792     //   }
7793 
7794     //   while (t0)
7795     //     t0 = sub(Pm_base, Pn_base, t0, len);
7796     // }
7797 
7798     /**
7799      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7800      * multiplies than Montgomery multiplication so it should be up to
7801      * 25% faster.  However, its loop control is more complex and it
7802      * may actually run slower on some machines.
7803      *
7804      * Arguments:
7805      *
7806      * Inputs:
7807      *   c_rarg0   - int array elements a
7808      *   c_rarg1   - int array elements n (the modulus)
7809      *   c_rarg2   - int length
7810      *   c_rarg3   - int inv
7811      *   c_rarg4   - int array elements m (the result)
7812      *
7813      */
7814     address generate_square() {
7815       Label argh;
7816       bind(argh);
7817       stop("MontgomeryMultiply total_allocation must be <= 8192");
7818 
7819       align(CodeEntryAlignment);
7820       address entry = pc();
7821 
7822       enter();
7823 
7824       // Make room.
7825       cmpw(Rlen, 512);
7826       br(Assembler::HI, argh);
7827       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7828       andr(sp, Ra, -2 * wordSize);
7829 
7830       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7831 
7832       {
7833         // Copy input args, reversing as we go.  We use Ra as a
7834         // temporary variable.
7835         reverse(Ra, Pa_base, Rlen, t0, t1);
7836         reverse(Ra, Pn_base, Rlen, t0, t1);
7837       }
7838 
7839       // Push all call-saved registers and also Pm_base which we'll need
7840       // at the end.
7841       save_regs();
7842 
7843       mov(Pm_base, Ra);
7844 
7845       mov(t0, zr);
7846       mov(t1, zr);
7847       mov(t2, zr);
7848 
7849       block_comment("for (int i = 0; i < len; i++) {");
7850       mov(Ri, zr); {
7851         Label loop, end;
7852         bind(loop);
7853         cmp(Ri, Rlen);
7854         br(Assembler::GE, end);
7855 
7856         pre1(Ri);
7857 
7858         block_comment("for (j = (i+1)/2; j; j--) {"); {
7859           add(Rj, Ri, 1);
7860           lsr(Rj, Rj, 1);
7861           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7862         } block_comment("  } // j");
7863 
7864         last_squaring(Ri);
7865 
7866         block_comment("  for (j = i/2; j; j--) {"); {
7867           lsr(Rj, Ri, 1);
7868           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7869         } block_comment("  } // j");
7870 
7871         post1_squaring();
7872         add(Ri, Ri, 1);
7873         cmp(Ri, Rlen);
7874         br(Assembler::LT, loop);
7875 
7876         bind(end);
7877         block_comment("} // i");
7878       }
7879 
7880       block_comment("for (int i = len; i < 2*len; i++) {");
7881       mov(Ri, Rlen); {
7882         Label loop, end;
7883         bind(loop);
7884         cmp(Ri, Rlen, Assembler::LSL, 1);
7885         br(Assembler::GE, end);
7886 
7887         pre2(Ri, Rlen);
7888 
7889         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7890           lsl(Rj, Rlen, 1);
7891           sub(Rj, Rj, Ri);
7892           sub(Rj, Rj, 1);
7893           lsr(Rj, Rj, 1);
7894           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7895         } block_comment("  } // j");
7896 
7897         last_squaring(Ri);
7898 
7899         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7900           lsl(Rj, Rlen, 1);
7901           sub(Rj, Rj, Ri);
7902           lsr(Rj, Rj, 1);
7903           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7904         } block_comment("  } // j");
7905 
7906         post2(Ri, Rlen);
7907         add(Ri, Ri, 1);
7908         cmp(Ri, Rlen, Assembler::LSL, 1);
7909 
7910         br(Assembler::LT, loop);
7911         bind(end);
7912         block_comment("} // i");
7913       }
7914 
7915       normalize(Rlen);
7916 
7917       mov(Ra, Pm_base);  // Save Pm_base in Ra
7918       restore_regs();  // Restore caller's Pm_base
7919 
7920       // Copy our result into caller's Pm_base
7921       reverse(Pm_base, Ra, Rlen, t0, t1);
7922 
7923       leave();
7924       ret(lr);
7925 
7926       return entry;
7927     }
7928     // In C, approximately:
7929 
7930     // void
7931     // montgomery_square(julong Pa_base[], julong Pn_base[],
7932     //                   julong Pm_base[], julong inv, int len) {
7933     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7934     //   julong *Pa, *Pb, *Pn, *Pm;
7935     //   julong Ra, Rb, Rn, Rm;
7936 
7937     //   int i;
7938 
7939     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7940 
7941     //   for (i = 0; i < len; i++) {
7942     //     int j;
7943 
7944     //     Pa = Pa_base;
7945     //     Pb = Pa_base + i;
7946     //     Pm = Pm_base;
7947     //     Pn = Pn_base + i;
7948 
7949     //     Ra = *Pa;
7950     //     Rb = *Pb;
7951     //     Rm = *Pm;
7952     //     Rn = *Pn;
7953 
7954     //     int iters = (i+1)/2;
7955     //     for (j = 0; iters--; j++) {
7956     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7957     //       MACC2(Ra, Rb, t0, t1, t2);
7958     //       Ra = *++Pa;
7959     //       Rb = *--Pb;
7960     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7961     //       MACC(Rm, Rn, t0, t1, t2);
7962     //       Rm = *++Pm;
7963     //       Rn = *--Pn;
7964     //     }
7965     //     if ((i & 1) == 0) {
7966     //       assert(Ra == Pa_base[j], "must be");
7967     //       MACC(Ra, Ra, t0, t1, t2);
7968     //     }
7969     //     iters = i/2;
7970     //     assert(iters == i-j, "must be");
7971     //     for (; iters--; j++) {
7972     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7973     //       MACC(Rm, Rn, t0, t1, t2);
7974     //       Rm = *++Pm;
7975     //       Rn = *--Pn;
7976     //     }
7977 
7978     //     *Pm = Rm = t0 * inv;
7979     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7980     //     MACC(Rm, Rn, t0, t1, t2);
7981 
7982     //     assert(t0 == 0, "broken Montgomery multiply");
7983 
7984     //     t0 = t1; t1 = t2; t2 = 0;
7985     //   }
7986 
7987     //   for (i = len; i < 2*len; i++) {
7988     //     int start = i-len+1;
7989     //     int end = start + (len - start)/2;
7990     //     int j;
7991 
7992     //     Pa = Pa_base + i-len;
7993     //     Pb = Pa_base + len;
7994     //     Pm = Pm_base + i-len;
7995     //     Pn = Pn_base + len;
7996 
7997     //     Ra = *++Pa;
7998     //     Rb = *--Pb;
7999     //     Rm = *++Pm;
8000     //     Rn = *--Pn;
8001 
8002     //     int iters = (2*len-i-1)/2;
8003     //     assert(iters == end-start, "must be");
8004     //     for (j = start; iters--; j++) {
8005     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8006     //       MACC2(Ra, Rb, t0, t1, t2);
8007     //       Ra = *++Pa;
8008     //       Rb = *--Pb;
8009     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8010     //       MACC(Rm, Rn, t0, t1, t2);
8011     //       Rm = *++Pm;
8012     //       Rn = *--Pn;
8013     //     }
8014     //     if ((i & 1) == 0) {
8015     //       assert(Ra == Pa_base[j], "must be");
8016     //       MACC(Ra, Ra, t0, t1, t2);
8017     //     }
8018     //     iters =  (2*len-i)/2;
8019     //     assert(iters == len-j, "must be");
8020     //     for (; iters--; j++) {
8021     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8022     //       MACC(Rm, Rn, t0, t1, t2);
8023     //       Rm = *++Pm;
8024     //       Rn = *--Pn;
8025     //     }
8026     //     Pm_base[i-len] = t0;
8027     //     t0 = t1; t1 = t2; t2 = 0;
8028     //   }
8029 
8030     //   while (t0)
8031     //     t0 = sub(Pm_base, Pn_base, t0, len);
8032     // }
8033   };
8034 
8035 
8036   // Initialization
8037   void generate_initial_stubs() {
8038     // Generate initial stubs and initialize the entry points
8039 
8040     // entry points that exist in all platforms. Note: This is code
8041     // that could be shared among different platforms - however the
8042     // benefit seems to be smaller than the disadvantage of having a
8043     // much more complicated generator structure. See also comment in
8044     // stubRoutines.hpp.
8045 
8046     StubRoutines::_forward_exception_entry = generate_forward_exception();
8047 
8048     StubRoutines::_call_stub_entry =
8049       generate_call_stub(StubRoutines::_call_stub_return_address);
8050 
8051     // is referenced by megamorphic call
8052     StubRoutines::_catch_exception_entry = generate_catch_exception();
8053 
8054     // Build this early so it's available for the interpreter.
8055     StubRoutines::_throw_StackOverflowError_entry =
8056       generate_throw_exception("StackOverflowError throw_exception",
8057                                CAST_FROM_FN_PTR(address,
8058                                                 SharedRuntime::throw_StackOverflowError));
8059     StubRoutines::_throw_delayed_StackOverflowError_entry =
8060       generate_throw_exception("delayed StackOverflowError throw_exception",
8061                                CAST_FROM_FN_PTR(address,
8062                                                 SharedRuntime::throw_delayed_StackOverflowError));
8063 
8064     // Initialize table for copy memory (arraycopy) check.
8065     if (UnsafeCopyMemory::_table == nullptr) {
8066       UnsafeCopyMemory::create_table(8);
8067     }
8068 
8069     if (UseCRC32Intrinsics) {
8070       // set table address before stub generation which use it
8071       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8072       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8073     }
8074 
8075     if (UseCRC32CIntrinsics) {
8076       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8077     }
8078 
8079     // Disabled until JDK-8210858 is fixed
8080     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
8081     //   StubRoutines::_dlog = generate_dlog();
8082     // }
8083 
8084     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8085       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8086     }
8087 
8088     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8089       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8090     }
8091   }
8092 
8093   void generate_continuation_stubs() {
8094     // Continuation stubs:
8095     StubRoutines::_cont_thaw          = generate_cont_thaw();
8096     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8097     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8098 
8099     JFR_ONLY(StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();)
8100     JFR_ONLY(StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();)
8101   }
8102 
8103   void generate_final_stubs() {
8104     // support for verify_oop (must happen after universe_init)
8105     if (VerifyOops) {
8106       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8107     }
8108     StubRoutines::_throw_AbstractMethodError_entry =
8109       generate_throw_exception("AbstractMethodError throw_exception",
8110                                CAST_FROM_FN_PTR(address,
8111                                                 SharedRuntime::
8112                                                 throw_AbstractMethodError));
8113 
8114     StubRoutines::_throw_IncompatibleClassChangeError_entry =
8115       generate_throw_exception("IncompatibleClassChangeError throw_exception",
8116                                CAST_FROM_FN_PTR(address,
8117                                                 SharedRuntime::
8118                                                 throw_IncompatibleClassChangeError));
8119 
8120     StubRoutines::_throw_NullPointerException_at_call_entry =
8121       generate_throw_exception("NullPointerException at call throw_exception",
8122                                CAST_FROM_FN_PTR(address,
8123                                                 SharedRuntime::
8124                                                 throw_NullPointerException_at_call));
8125 
8126     // arraycopy stubs used by compilers
8127     generate_arraycopy_stubs();
8128 
8129     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8130     if (bs_nm != nullptr) {
8131       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
8132     }
8133 
8134     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8135 
8136 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8137 
8138     generate_atomic_entry_points();
8139 
8140 #endif // LINUX
8141 
8142     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
8143   }
8144 
8145   void generate_compiler_stubs() {
8146 #if COMPILER2_OR_JVMCI
8147 
8148     if (UseSVE == 0) {
8149       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
8150     }
8151 
8152     // array equals stub for large arrays.
8153     if (!UseSimpleArrayEquals) {
8154       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8155     }
8156 
8157     // byte_array_inflate stub for large arrays.
8158     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8159 
8160     // countPositives stub for large arrays.
8161     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8162 
8163     generate_compare_long_strings();
8164 
8165     generate_string_indexof_stubs();
8166 
8167 #ifdef COMPILER2
8168     if (UseMultiplyToLenIntrinsic) {
8169       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8170     }
8171 
8172     if (UseSquareToLenIntrinsic) {
8173       StubRoutines::_squareToLen = generate_squareToLen();
8174     }
8175 
8176     if (UseMulAddIntrinsic) {
8177       StubRoutines::_mulAdd = generate_mulAdd();
8178     }
8179 
8180     if (UseSIMDForBigIntegerShiftIntrinsics) {
8181       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8182       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8183     }
8184 
8185     if (UseMontgomeryMultiplyIntrinsic) {
8186       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
8187       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8188       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8189     }
8190 
8191     if (UseMontgomerySquareIntrinsic) {
8192       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
8193       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8194       // We use generate_multiply() rather than generate_square()
8195       // because it's faster for the sizes of modulus we care about.
8196       StubRoutines::_montgomerySquare = g.generate_multiply();
8197     }
8198 #endif // COMPILER2
8199 
8200     if (UseChaCha20Intrinsics) {
8201       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
8202     }
8203 
8204     if (UseBASE64Intrinsics) {
8205         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8206         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8207     }
8208 
8209     // data cache line writeback
8210     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8211     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8212 
8213     if (UseAESIntrinsics) {
8214       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8215       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8216       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8217       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8218       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8219     }
8220     if (UseGHASHIntrinsics) {
8221       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8222       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8223     }
8224     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8225       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8226     }
8227 
8228     if (UseMD5Intrinsics) {
8229       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
8230       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
8231     }
8232     if (UseSHA1Intrinsics) {
8233       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
8234       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
8235     }
8236     if (UseSHA256Intrinsics) {
8237       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
8238       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
8239     }
8240     if (UseSHA512Intrinsics) {
8241       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
8242       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
8243     }
8244     if (UseSHA3Intrinsics) {
8245       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
8246       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
8247     }
8248 
8249     // generate Adler32 intrinsics code
8250     if (UseAdler32Intrinsics) {
8251       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8252     }
8253 #endif // COMPILER2_OR_JVMCI
8254   }
8255 
8256  public:
8257   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
8258     switch(kind) {
8259     case Initial_stubs:
8260       generate_initial_stubs();
8261       break;
8262      case Continuation_stubs:
8263       generate_continuation_stubs();
8264       break;
8265     case Compiler_stubs:
8266       generate_compiler_stubs();
8267       break;
8268     case Final_stubs:
8269       generate_final_stubs();
8270       break;
8271     default:
8272       fatal("unexpected stubs kind: %d", kind);
8273       break;
8274     };
8275   }
8276 }; // end class declaration
8277 
8278 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
8279   StubGenerator g(code, kind);
8280 }
8281 
8282 
8283 #if defined (LINUX)
8284 
8285 // Define pointers to atomic stubs and initialize them to point to the
8286 // code in atomic_aarch64.S.
8287 
8288 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8289   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8290     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8291   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8292     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
8293 
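     // For example, DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) expands,
     // approximately, to:
     //
     //   extern "C" uint64_t aarch64_atomic_cmpxchg_4_relaxed_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_relaxed_impl
     //     = aarch64_atomic_cmpxchg_4_relaxed_default_impl;
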
8294 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8295 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8296 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8297 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8298 DEFAULT_ATOMIC_OP(xchg, 4, )
8299 DEFAULT_ATOMIC_OP(xchg, 8, )
8300 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8301 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8302 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8303 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8304 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8305 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8306 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8307 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8308 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8309 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8310 
8311 #undef DEFAULT_ATOMIC_OP
8312 
8313 #endif // LINUX