1 /*
   2  * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "prims/upcallLinker.hpp"
  45 #include "runtime/atomic.hpp"
  46 #include "runtime/continuation.hpp"
  47 #include "runtime/continuationEntry.inline.hpp"
  48 #include "runtime/frame.inline.hpp"
  49 #include "runtime/handles.inline.hpp"
  50 #include "runtime/javaThread.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/stubCodeGenerator.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "utilities/align.hpp"
  55 #include "utilities/checkedCast.hpp"
  56 #include "utilities/globalDefinitions.hpp"
  57 #include "utilities/powerOfTwo.hpp"
  58 #ifdef COMPILER2
  59 #include "opto/runtime.hpp"
  60 #endif
  61 #if INCLUDE_ZGC
  62 #include "gc/z/zThreadLocalData.hpp"
  63 #endif
  64 
  65 // Declaration and definition of StubGenerator (no .hpp file).
  66 // For a more detailed description of the stub routine structure
  67 // see the comment in stubRoutines.hpp
  68 
  69 #undef __
  70 #define __ _masm->
  71 
  72 #ifdef PRODUCT
  73 #define BLOCK_COMMENT(str) /* nothing */
  74 #else
  75 #define BLOCK_COMMENT(str) __ block_comment(str)
  76 #endif
  77 
  78 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  79 
  80 // Stub Code definitions
  81 
  82 class StubGenerator: public StubCodeGenerator {
  83  private:
  84 
  85 #ifdef PRODUCT
  86 #define inc_counter_np(counter) ((void)0)
  87 #else
  88   void inc_counter_np_(uint& counter) {
  89     __ lea(rscratch2, ExternalAddress((address)&counter));
  90     __ ldrw(rscratch1, Address(rscratch2));
  91     __ addw(rscratch1, rscratch1, 1);
  92     __ strw(rscratch1, Address(rscratch2));
  93   }
  94 #define inc_counter_np(counter) \
  95   BLOCK_COMMENT("inc_counter " #counter); \
  96   inc_counter_np_(counter);
  97 #endif
  98 
  99   // Call stubs are used to call Java from C
 100   //
 101   // Arguments:
 102   //    c_rarg0:   call wrapper address                   address
 103   //    c_rarg1:   result                                 address
 104   //    c_rarg2:   result type                            BasicType
 105   //    c_rarg3:   method                                 Method*
 106   //    c_rarg4:   (interpreter) entry point              address
 107   //    c_rarg5:   parameters                             intptr_t*
 108   //    c_rarg6:   parameter size (in words)              int
 109   //    c_rarg7:   thread                                 Thread*
 110   //
 111   // There is no return from the stub itself as any Java result
 112   // is written to result
 113   //
 114   // we save r30 (lr) as the return PC at the base of the frame and
 115   // link r29 (fp) below it as the frame pointer, installing sp (r31)
 116   // into fp.
 117   //
 118   // we save r0-r7, which accounts for all the c arguments.
 119   //
 120   // TODO: strictly do we need to save them all? they are treated as
 121   // volatile by C so could we omit saving the ones we are going to
 122   // place in global registers (thread? method?) or those we only use
 123   // during setup of the Java call?
 124   //
 125   // we don't need to save r8 which C uses as an indirect result location
 126   // return register.
 127   //
 128   // we don't need to save r9-r15 which both C and Java treat as
 129   // volatile
 130   //
 131   // we don't need to save r16-r18 because Java does not use them
 132   //
 133   // we save r19-r28 which Java uses as scratch registers and C
 134   // expects to be callee-save
 135   //
 136   // we save the bottom 64 bits of each value stored in v8-v15; it is
 137   // the responsibility of the caller to preserve larger values.
 138   //
 139   // so the stub frame looks like this when we enter Java code
 140   //
 141   //     [ return_from_Java     ] <--- sp
 142   //     [ argument word n      ]
 143   //      ...
 144   // -29 [ argument word 1      ]
 145   // -28 [ saved Floating-point Control Register ]
 146   // -26 [ saved v15            ] <--- sp_after_call
 147   // -25 [ saved v14            ]
 148   // -24 [ saved v13            ]
 149   // -23 [ saved v12            ]
 150   // -22 [ saved v11            ]
 151   // -21 [ saved v10            ]
 152   // -20 [ saved v9             ]
 153   // -19 [ saved v8             ]
 154   // -18 [ saved r28            ]
 155   // -17 [ saved r27            ]
 156   // -16 [ saved r26            ]
 157   // -15 [ saved r25            ]
 158   // -14 [ saved r24            ]
 159   // -13 [ saved r23            ]
 160   // -12 [ saved r22            ]
 161   // -11 [ saved r21            ]
 162   // -10 [ saved r20            ]
 163   //  -9 [ saved r19            ]
 164   //  -8 [ call wrapper    (r0) ]
 165   //  -7 [ result          (r1) ]
 166   //  -6 [ result type     (r2) ]
 167   //  -5 [ method          (r3) ]
 168   //  -4 [ entry point     (r4) ]
 169   //  -3 [ parameters      (r5) ]
 170   //  -2 [ parameter size  (r6) ]
 171   //  -1 [ thread (r7)          ]
 172   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 173   //   1 [ saved lr       (r30) ]
 174 
 175   // Call stub stack layout word offsets from fp
 176   enum call_stub_layout {
 177     sp_after_call_off  = -28,
 178 
 179     fpcr_off           = sp_after_call_off,
 180     d15_off            = -26,
 181     d13_off            = -24,
 182     d11_off            = -22,
 183     d9_off             = -20,
 184 
 185     r28_off            = -18,
 186     r26_off            = -16,
 187     r24_off            = -14,
 188     r22_off            = -12,
 189     r20_off            = -10,
 190     call_wrapper_off   =  -8,
 191     result_off         =  -7,
 192     result_type_off    =  -6,
 193     method_off         =  -5,
 194     entry_point_off    =  -4,
 195     parameter_size_off =  -2,
 196     thread_off         =  -1,
 197     fp_f               =   0,
 198     retaddr_off        =   1,
 199   };
 200 
 201   address generate_call_stub(address& return_address) {
 202     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 203            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 204            "adjust this code");
 205 
 206     StubCodeMark mark(this, "StubRoutines", "call_stub");
 207     address start = __ pc();
 208 
 209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
 210 
 211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
 212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 213     const Address result        (rfp, result_off         * wordSize);
 214     const Address result_type   (rfp, result_type_off    * wordSize);
 215     const Address method        (rfp, method_off         * wordSize);
 216     const Address entry_point   (rfp, entry_point_off    * wordSize);
 217     const Address parameter_size(rfp, parameter_size_off * wordSize);
 218 
 219     const Address thread        (rfp, thread_off         * wordSize);
 220 
 221     const Address d15_save      (rfp, d15_off * wordSize);
 222     const Address d13_save      (rfp, d13_off * wordSize);
 223     const Address d11_save      (rfp, d11_off * wordSize);
 224     const Address d9_save       (rfp, d9_off * wordSize);
 225 
 226     const Address r28_save      (rfp, r28_off * wordSize);
 227     const Address r26_save      (rfp, r26_off * wordSize);
 228     const Address r24_save      (rfp, r24_off * wordSize);
 229     const Address r22_save      (rfp, r22_off * wordSize);
 230     const Address r20_save      (rfp, r20_off * wordSize);
 231 
 232     // stub code
 233 
 234     address aarch64_entry = __ pc();
 235 
 236     // set up frame and move sp to end of save area
 237     __ enter();
 238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 239 
 240     // save register parameters and Java scratch/global registers
 241     // n.b. we save thread even though it gets installed in
 242     // rthread because we want to sanity check rthread later
 243     __ str(c_rarg7,  thread);
 244     __ strw(c_rarg6, parameter_size);
 245     __ stp(c_rarg4, c_rarg5,  entry_point);
 246     __ stp(c_rarg2, c_rarg3,  result_type);
 247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 248 
 249     __ stp(r20, r19,   r20_save);
 250     __ stp(r22, r21,   r22_save);
 251     __ stp(r24, r23,   r24_save);
 252     __ stp(r26, r25,   r26_save);
 253     __ stp(r28, r27,   r28_save);
 254 
 255     __ stpd(v9,  v8,   d9_save);
 256     __ stpd(v11, v10,  d11_save);
 257     __ stpd(v13, v12,  d13_save);
 258     __ stpd(v15, v14,  d15_save);
 259 
 260     __ get_fpcr(rscratch1);
 261     __ str(rscratch1, fpcr_save);
 262     // Set FPCR to the state we need. We do want Round to Nearest. We
 263     // don't want non-IEEE rounding modes or floating-point traps.
 264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
 265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
 266     __ set_fpcr(rscratch1);
 267 
 268     // install Java thread in global register now we have saved
 269     // whatever value it held
 270     __ mov(rthread, c_rarg7);
 271     // And method
 272     __ mov(rmethod, c_rarg3);
 273 
 274     // set up the heapbase register
 275     __ reinit_heapbase();
 276 
 277 #ifdef ASSERT
 278     // make sure we have no pending exceptions
 279     {
 280       Label L;
 281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 282       __ cmp(rscratch1, (u1)NULL_WORD);
 283       __ br(Assembler::EQ, L);
 284       __ stop("StubRoutines::call_stub: entered with pending exception");
 285       __ BIND(L);
 286     }
 287 #endif
 288     // pass parameters if any
 289     __ mov(esp, sp);
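         // Reserve stack space for the incoming Java parameters (c_rarg6 words)
         // and keep sp 16-byte aligned, as AArch64 requires.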
 290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 291     __ andr(sp, rscratch1, -2 * wordSize);
 292 
 293     BLOCK_COMMENT("pass parameters if any");
 294     Label parameters_done;
 295     // parameter count is still in c_rarg6
 296     // and parameter pointer identifying param 1 is in c_rarg5
 297     __ cbzw(c_rarg6, parameters_done);
 298 
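         // Push the Java parameters onto the stack: load each word through
         // c_rarg5 (post-incremented) and push it, counting c_rarg6 down to zero.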
 299     address loop = __ pc();
 300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 301     __ subsw(c_rarg6, c_rarg6, 1);
 302     __ push(rscratch1);
 303     __ br(Assembler::GT, loop);
 304 
 305     __ BIND(parameters_done);
 306 
 307     // call Java entry -- passing the Method* and the current sp
 308     //      rmethod: Method*
 309     //      r19_sender_sp: sender sp
 310     BLOCK_COMMENT("call Java function");
 311     __ mov(r19_sender_sp, sp);
 312     __ blr(c_rarg4);
 313 
 314     // we do this here because the notify will already have been done
 315     // if we get to the next instruction via an exception
 316     //
 317     // n.b. adding this instruction here affects the calculation of
 318     // whether or not a routine returns to the call stub (used when
 319     // doing stack walks) since the normal test is to check the return
 320     // pc against the address saved below. so we may need to allow for
 321     // this extra instruction in the check.
 322 
 323     // save current address for use by exception handling code
 324 
 325     return_address = __ pc();
 326 
 327     // store result depending on type (everything that is not
 328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 329     // n.b. this assumes Java returns an integral result in r0
 330     // and a floating result in j_farg0
 331     // All of j_rargN may be used to return inline type fields so be careful
 332     // not to clobber those.
 333     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
 334     // assignment of Rresult below.
 335     Register Rresult = r14, Rresult_type = r15;
 336     __ ldr(Rresult, result);
 337     Label is_long, is_float, is_double, check_prim, exit;
 338     __ ldr(Rresult_type, result_type);
 339     __ cmp(Rresult_type, (u1)T_OBJECT);
 340     __ br(Assembler::EQ, check_prim);
 341     __ cmp(Rresult_type, (u1)T_LONG);
 342     __ br(Assembler::EQ, is_long);
 343     __ cmp(Rresult_type, (u1)T_FLOAT);
 344     __ br(Assembler::EQ, is_float);
 345     __ cmp(Rresult_type, (u1)T_DOUBLE);
 346     __ br(Assembler::EQ, is_double);
 347 
 348     // handle T_INT case
 349     __ strw(r0, Address(Rresult));
 350 
 351     __ BIND(exit);
 352 
 353     // pop parameters
 354     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 355 
 356 #ifdef ASSERT
 357     // verify that threads correspond
 358     {
 359       Label L, S;
 360       __ ldr(rscratch1, thread);
 361       __ cmp(rthread, rscratch1);
 362       __ br(Assembler::NE, S);
 363       __ get_thread(rscratch1);
 364       __ cmp(rthread, rscratch1);
 365       __ br(Assembler::EQ, L);
 366       __ BIND(S);
 367       __ stop("StubRoutines::call_stub: threads must correspond");
 368       __ BIND(L);
 369     }
 370 #endif
 371 
 372     __ pop_cont_fastpath(rthread);
 373 
 374     // restore callee-save registers
 375     __ ldpd(v15, v14,  d15_save);
 376     __ ldpd(v13, v12,  d13_save);
 377     __ ldpd(v11, v10,  d11_save);
 378     __ ldpd(v9,  v8,   d9_save);
 379 
 380     __ ldp(r28, r27,   r28_save);
 381     __ ldp(r26, r25,   r26_save);
 382     __ ldp(r24, r23,   r24_save);
 383     __ ldp(r22, r21,   r22_save);
 384     __ ldp(r20, r19,   r20_save);
 385 
 386     // restore fpcr
 387     __ ldr(rscratch1,  fpcr_save);
 388     __ set_fpcr(rscratch1);
 389 
 390     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 391     __ ldrw(c_rarg2, result_type);
 392     __ ldr(c_rarg3,  method);
 393     __ ldp(c_rarg4, c_rarg5,  entry_point);
 394     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 395 
 396     // leave frame and return to caller
 397     __ leave();
 398     __ ret(lr);
 399 
 400     // handle return types different from T_INT
 401     __ BIND(check_prim);
 402     if (InlineTypeReturnedAsFields) {
 403       // Check for scalarized return value
 404       __ tbz(r0, 0, is_long);
 405       // Load pack handler address
 406       __ andr(rscratch1, r0, -2);
 407       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
 408       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
 409       __ blr(rscratch1);
 410       __ b(exit);
 411     }
 412 
 413     __ BIND(is_long);
 414     __ str(r0, Address(Rresult, 0));
 415     __ br(Assembler::AL, exit);
 416 
 417     __ BIND(is_float);
 418     __ strs(j_farg0, Address(Rresult, 0));
 419     __ br(Assembler::AL, exit);
 420 
 421     __ BIND(is_double);
 422     __ strd(j_farg0, Address(Rresult, 0));
 423     __ br(Assembler::AL, exit);
 424 
 425     return start;
 426   }
 427 
 428   // Return point for a Java call if there's an exception thrown in
 429   // Java code.  The exception is caught and transformed into a
 430   // pending exception stored in JavaThread that can be tested from
 431   // within the VM.
 432   //
 433   // Note: Usually the parameters are removed by the callee. In case
 434   // of an exception crossing an activation frame boundary, that is
 435   // not the case if the callee is compiled code => need to setup the
 436   // rsp.
 437   //
 438   // r0: exception oop
 439 
 440   address generate_catch_exception() {
 441     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 442     address start = __ pc();
 443 
 444     // same as in generate_call_stub():
 445     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 446     const Address thread        (rfp, thread_off         * wordSize);
 447 
 448 #ifdef ASSERT
 449     // verify that threads correspond
 450     {
 451       Label L, S;
 452       __ ldr(rscratch1, thread);
 453       __ cmp(rthread, rscratch1);
 454       __ br(Assembler::NE, S);
 455       __ get_thread(rscratch1);
 456       __ cmp(rthread, rscratch1);
 457       __ br(Assembler::EQ, L);
 458       __ bind(S);
 459       __ stop("StubRoutines::catch_exception: threads must correspond");
 460       __ bind(L);
 461     }
 462 #endif
 463 
 464     // set pending exception
 465     __ verify_oop(r0);
 466 
 467     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 468     __ mov(rscratch1, (address)__FILE__);
 469     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 470     __ movw(rscratch1, (int)__LINE__);
 471     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 472 
 473     // complete return to VM
 474     assert(StubRoutines::_call_stub_return_address != nullptr,
 475            "_call_stub_return_address must have been generated before");
 476     __ b(StubRoutines::_call_stub_return_address);
 477 
 478     return start;
 479   }
 480 
 481   // Continuation point for runtime calls returning with a pending
 482   // exception.  The pending exception check happened in the runtime
 483   // or native call stub.  The pending exception in Thread is
 484   // converted into a Java-level exception.
 485   //
 486   // Contract with Java-level exception handlers:
 487   // r0: exception
 488   // r3: throwing pc
 489   //
 490   // NOTE: At entry of this stub, exception-pc must be in LR !!
 491 
 492   // NOTE: this is always used as a jump target within generated code
 493   // so it just needs to be generated code with no prolog
 494 
 495   address generate_forward_exception() {
 496     StubCodeMark mark(this, "StubRoutines", "forward exception");
 497     address start = __ pc();
 498 
 499     // Upon entry, LR points to the return address returning into
 500     // Java (interpreted or compiled) code; i.e., the return address
 501     // becomes the throwing pc.
 502     //
 503     // Arguments pushed before the runtime call are still on the stack
 504     // but the exception handler will reset the stack pointer ->
 505     // ignore them.  A potential result in registers can be ignored as
 506     // well.
 507 
 508 #ifdef ASSERT
 509     // make sure this code is only executed if there is a pending exception
 510     {
 511       Label L;
 512       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 513       __ cbnz(rscratch1, L);
 514       __ stop("StubRoutines::forward exception: no pending exception (1)");
 515       __ bind(L);
 516     }
 517 #endif
 518 
 519     // compute exception handler into r19
 520 
 521     // call the VM to find the handler address associated with the
 522     // caller address. pass thread in r0 and caller pc (ret address)
 523     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 524     // the stack.
 525     __ mov(c_rarg1, lr);
 526     // lr will be trashed by the VM call so we move it to R19
 527     // (callee-saved) because we also need to pass it to the handler
 528     // returned by this call.
 529     __ mov(r19, lr);
 530     BLOCK_COMMENT("call exception_handler_for_return_address");
 531     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 532                          SharedRuntime::exception_handler_for_return_address),
 533                     rthread, c_rarg1);
 534     // Reinitialize the ptrue predicate register, in case the external runtime
 535     // call clobbers ptrue reg, as we may return to SVE compiled code.
 536     __ reinitialize_ptrue();
 537 
 538     // we should not really care that lr is no longer the callee
 539     // address. we saved the value the handler needs in r19 so we can
 540     // just copy it to r3. however, the C2 handler will push its own
 541     // frame and then calls into the VM and the VM code asserts that
 542     // the PC for the frame above the handler belongs to a compiled
 543     // Java method. So, we restore lr here to satisfy that assert.
 544     __ mov(lr, r19);
 545     // setup r0 & r3 & clear pending exception
 546     __ mov(r3, r19);
 547     __ mov(r19, r0);
 548     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 549     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 550 
 551 #ifdef ASSERT
 552     // make sure exception is set
 553     {
 554       Label L;
 555       __ cbnz(r0, L);
 556       __ stop("StubRoutines::forward exception: no pending exception (2)");
 557       __ bind(L);
 558     }
 559 #endif
 560 
 561     // continue at exception handler
 562     // r0: exception
 563     // r3: throwing pc
 564     // r19: exception handler
 565     __ verify_oop(r0);
 566     __ br(r19);
 567 
 568     return start;
 569   }
 570 
 571   // Non-destructive plausibility checks for oops
 572   //
 573   // Arguments:
 574   //    r0: oop to verify
 575   //    rscratch1: error message
 576   //
 577   // Stack after saving c_rarg3:
 578   //    [tos + 0]: saved c_rarg3
 579   //    [tos + 1]: saved c_rarg2
 580   //    [tos + 2]: saved lr
 581   //    [tos + 3]: saved rscratch2
 582   //    [tos + 4]: saved r0
 583   //    [tos + 5]: saved rscratch1
 584   address generate_verify_oop() {
 585 
 586     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 587     address start = __ pc();
 588 
 589     Label exit, error;
 590 
 591     // save c_rarg2 and c_rarg3
 592     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 593 
 594     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 595     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 596     __ ldr(c_rarg3, Address(c_rarg2));
 597     __ add(c_rarg3, c_rarg3, 1);
 598     __ str(c_rarg3, Address(c_rarg2));
 599 
 600     // object is in r0
 601     // make sure object is 'reasonable'
 602     __ cbz(r0, exit); // if obj is null it is OK
 603 
 604     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 605     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 606 
 607     // return if everything seems ok
 608     __ bind(exit);
 609 
 610     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 611     __ ret(lr);
 612 
 613     // handle errors
 614     __ bind(error);
 615     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 616 
 617     __ push(RegSet::range(r0, r29), sp);
 618     // debug(char* msg, int64_t pc, int64_t regs[])
 619     __ mov(c_rarg0, rscratch1);      // pass address of error message
 620     __ mov(c_rarg1, lr);             // pass return address
 621     __ mov(c_rarg2, sp);             // pass address of regs on stack
 622 #ifndef PRODUCT
 623     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 624 #endif
 625     BLOCK_COMMENT("call MacroAssembler::debug");
 626     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 627     __ blr(rscratch1);
 628     __ hlt(0);
 629 
 630     return start;
 631   }
 632 
 633   // Generate indices for iota vector.
 634   address generate_iota_indices(const char *stub_name) {
 635     __ align(CodeEntryAlignment);
 636     StubCodeMark mark(this, "StubRoutines", stub_name);
 637     address start = __ pc();
 638     // B
 639     __ emit_data64(0x0706050403020100, relocInfo::none);
 640     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 641     // H
 642     __ emit_data64(0x0003000200010000, relocInfo::none);
 643     __ emit_data64(0x0007000600050004, relocInfo::none);
 644     // S
 645     __ emit_data64(0x0000000100000000, relocInfo::none);
 646     __ emit_data64(0x0000000300000002, relocInfo::none);
 647     // D
 648     __ emit_data64(0x0000000000000000, relocInfo::none);
 649     __ emit_data64(0x0000000000000001, relocInfo::none);
 650     // S - FP
 651     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 652     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 653     // D - FP
 654     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 655     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 656     return start;
 657   }
 658 
 659   // The inner part of zero_words().  This is the bulk operation,
 660   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 661   // caller is responsible for zeroing the last few words.
 662   //
 663   // Inputs:
 664   // r10: the HeapWord-aligned base address of an array to zero.
 665   // r11: the count in HeapWords, r11 > 0.
 666   //
 667   // Returns r10 and r11, adjusted for the caller to clear.
 668   // r10: the base address of the tail of words left to clear.
 669   // r11: the number of words in the tail.
 670   //      r11 < MacroAssembler::zero_words_block_size.
 671 
 672   address generate_zero_blocks() {
 673     Label done;
 674     Label base_aligned;
 675 
 676     Register base = r10, cnt = r11;
 677 
 678     __ align(CodeEntryAlignment);
 679     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 680     address start = __ pc();
 681 
 682     if (UseBlockZeroing) {
 683       int zva_length = VM_Version::zva_length();
 684 
 685       // Ensure ZVA length can be divided by 16. This is required by
 686       // the subsequent operations.
 687       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 688 
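           // base is 8-byte aligned on entry; if bit 3 is set, zero one word so
           // that the block-zeroing code below starts on a 16-byte boundary.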
 689       __ tbz(base, 3, base_aligned);
 690       __ str(zr, Address(__ post(base, 8)));
 691       __ sub(cnt, cnt, 1);
 692       __ bind(base_aligned);
 693 
 694       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 695       // alignment.
 696       Label small;
 697       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 698       __ subs(rscratch1, cnt, low_limit >> 3);
 699       __ br(Assembler::LT, small);
 700       __ zero_dcache_blocks(base, cnt);
 701       __ bind(small);
 702     }
 703 
 704     {
 705       // Number of stp instructions we'll unroll
 706       const int unroll =
 707         MacroAssembler::zero_words_block_size / 2;
 708       // Clear the remaining blocks.
 709       Label loop;
 710       __ subs(cnt, cnt, unroll * 2);
 711       __ br(Assembler::LT, done);
 712       __ bind(loop);
 713       for (int i = 0; i < unroll; i++)
 714         __ stp(zr, zr, __ post(base, 16));
 715       __ subs(cnt, cnt, unroll * 2);
 716       __ br(Assembler::GE, loop);
 717       __ bind(done);
 718       __ add(cnt, cnt, unroll * 2);
 719     }
 720 
 721     __ ret(lr);
 722 
 723     return start;
 724   }
 725 
 726 
 727   typedef enum {
 728     copy_forwards = 1,
 729     copy_backwards = -1
 730   } copy_direction;
 731 
 732   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 733   // for arraycopy stubs.
 734   class ArrayCopyBarrierSetHelper : StackObj {
 735     BarrierSetAssembler* _bs_asm;
 736     MacroAssembler* _masm;
 737     DecoratorSet _decorators;
 738     BasicType _type;
 739     Register _gct1;
 740     Register _gct2;
 741     Register _gct3;
 742     FloatRegister _gcvt1;
 743     FloatRegister _gcvt2;
 744     FloatRegister _gcvt3;
 745 
 746   public:
 747     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 748                               DecoratorSet decorators,
 749                               BasicType type,
 750                               Register gct1,
 751                               Register gct2,
 752                               Register gct3,
 753                               FloatRegister gcvt1,
 754                               FloatRegister gcvt2,
 755                               FloatRegister gcvt3)
 756       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 757         _masm(masm),
 758         _decorators(decorators),
 759         _type(type),
 760         _gct1(gct1),
 761         _gct2(gct2),
 762         _gct3(gct3),
 763         _gcvt1(gcvt1),
 764         _gcvt2(gcvt2),
 765         _gcvt3(gcvt3) {
 766     }
 767 
 768     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 769       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 770                             dst1, dst2, src,
 771                             _gct1, _gct2, _gcvt1);
 772     }
 773 
 774     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 775       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 776                              dst, src1, src2,
 777                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 778     }
 779 
 780     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 781       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 782                             dst1, dst2, src,
 783                             _gct1);
 784     }
 785 
 786     void copy_store_at_16(Address dst, Register src1, Register src2) {
 787       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 788                              dst, src1, src2,
 789                              _gct1, _gct2, _gct3);
 790     }
 791 
 792     void copy_load_at_8(Register dst, Address src) {
 793       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 794                             dst, noreg, src,
 795                             _gct1);
 796     }
 797 
 798     void copy_store_at_8(Address dst, Register src) {
 799       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 800                              dst, src, noreg,
 801                              _gct1, _gct2, _gct3);
 802     }
 803   };
 804 
 805   // Bulk copy of blocks of 8 words.
 806   //
 807   // count is a count of words.
 808   //
 809   // Precondition: count >= 8
 810   //
 811   // Postconditions:
 812   //
 813   // The least significant bit of count contains the remaining count
 814   // of words to copy.  The rest of count is trash.
 815   //
 816   // s and d are adjusted to point to the remaining words to copy
 817   //
 818   void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
 819                            copy_direction direction) {
 820     int unit = wordSize * direction;
 821     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 822 
 823     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 824       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 825     const Register stride = r14;
 826     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 827     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 828     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 829 
 830     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 831     assert_different_registers(s, d, count, rscratch1, rscratch2);
 832 
 833     Label again, drain;
 834     const char *stub_name;
 835     if (direction == copy_forwards)
 836       stub_name = "forward_copy_longs";
 837     else
 838       stub_name = "backward_copy_longs";
 839 
 840     __ align(CodeEntryAlignment);
 841 
 842     StubCodeMark mark(this, "StubRoutines", stub_name);
 843 
 844     __ bind(start);
 845 
 846     Label unaligned_copy_long;
 847     if (AvoidUnalignedAccesses) {
 848       __ tbnz(d, 3, unaligned_copy_long);
 849     }
 850 
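         // For a forward copy, bias s and d downwards so that the fixed positive
         // offsets used by the loads and stores below address the block starting
         // at the original s and d.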
 851     if (direction == copy_forwards) {
 852       __ sub(s, s, bias);
 853       __ sub(d, d, bias);
 854     }
 855 
 856 #ifdef ASSERT
 857     // Make sure we are never given < 8 words
 858     {
 859       Label L;
 860       __ cmp(count, (u1)8);
 861       __ br(Assembler::GE, L);
 862       __ stop("genrate_copy_longs called with < 8 words");
 863       __ bind(L);
 864     }
 865 #endif
 866 
 867     // Fill 8 registers
 868     if (UseSIMDForMemoryOps) {
 869       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 870       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 871     } else {
 872       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 873       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 874       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 875       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 876     }
 877 
 878     __ subs(count, count, 16);
 879     __ br(Assembler::LO, drain);
 880 
 881     int prefetch = PrefetchCopyIntervalInBytes;
 882     bool use_stride = false;
 883     if (direction == copy_backwards) {
 884        use_stride = prefetch > 256;
 885        prefetch = -prefetch;
 886        if (use_stride) __ mov(stride, prefetch);
 887     }
 888 
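         // Main loop: each iteration stores the 8 words already held in registers
         // and loads the next 8, staying one block ahead, until fewer than 8 words
         // remain to be loaded.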
 889     __ bind(again);
 890 
 891     if (PrefetchCopyIntervalInBytes > 0)
 892       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 893 
 894     if (UseSIMDForMemoryOps) {
 895       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 896       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 897       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 898       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 899     } else {
 900       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 901       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 902       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 903       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 904       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 905       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 906       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 907       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 908     }
 909 
 910     __ subs(count, count, 8);
 911     __ br(Assembler::HS, again);
 912 
 913     // Drain
 914     __ bind(drain);
 915     if (UseSIMDForMemoryOps) {
 916       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 917       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 918     } else {
 919       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 920       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 921       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 922       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 923     }
 924 
 925     {
 926       Label L1, L2;
 927       __ tbz(count, exact_log2(4), L1);
 928       if (UseSIMDForMemoryOps) {
 929         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 930         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 931       } else {
 932         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 933         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 934         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 935         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 936       }
 937       __ bind(L1);
 938 
 939       if (direction == copy_forwards) {
 940         __ add(s, s, bias);
 941         __ add(d, d, bias);
 942       }
 943 
 944       __ tbz(count, 1, L2);
 945       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 946       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 947       __ bind(L2);
 948     }
 949 
 950     __ ret(lr);
 951 
 952     if (AvoidUnalignedAccesses) {
 953       Label drain, again;
 954       // Register order for storing. Order is different for backward copy.
 955 
 956       __ bind(unaligned_copy_long);
 957 
 958       // source address is even-word (16-byte) aligned, target is odd-word aligned
 959       //
 960       // when forward copying word pairs we read long pairs at offsets
 961       // {0, 2, 4, 6} (in long words). when backwards copying we read
 962       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 963       // address by -2 in the forwards case so we can compute the
 964       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 965       // or -1.
 966       //
 967       // when forward copying we need to store 1 word, 3 pairs and
 968       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 969       // zero offset we adjust the destination by -1, which means we
 970       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 971       //
 972       // When backwards copying we need to store 1 word, 3 pairs and
 973       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 974       // offsets {1, 3, 5, 7, 8} * unit.
 975 
 976       if (direction == copy_forwards) {
 977         __ sub(s, s, 16);
 978         __ sub(d, d, 8);
 979       }
 980 
 981       // Fill 8 registers
 982       //
 983       // for forwards copy s was offset by -16 from the original input
 984       // value of s so the register contents are at these offsets
 985       // relative to the 64 bit block addressed by that original input
 986       // and so on for each successive 64 byte block when s is updated
 987       //
 988       // t0 at offset 0,  t1 at offset 8
 989       // t2 at offset 16, t3 at offset 24
 990       // t4 at offset 32, t5 at offset 40
 991       // t6 at offset 48, t7 at offset 56
 992 
 993       // for backwards copy s was not offset so the register contents
 994       // are at these offsets into the preceding 64 byte block
 995       // relative to that original input and so on for each successive
 996       // preceding 64 byte block when s is updated. this explains the
 997       // slightly counter-intuitive looking pattern of register usage
 998       // in the stp instructions for backwards copy.
 999       //
1000       // t0 at offset -16, t1 at offset -8
1001       // t2 at offset -32, t3 at offset -24
1002       // t4 at offset -48, t5 at offset -40
1003       // t6 at offset -64, t7 at offset -56
1004 
1005       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1006       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1007       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1008       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1009 
1010       __ subs(count, count, 16);
1011       __ br(Assembler::LO, drain);
1012 
1013       int prefetch = PrefetchCopyIntervalInBytes;
1014       bool use_stride = false;
1015       if (direction == copy_backwards) {
1016          use_stride = prefetch > 256;
1017          prefetch = -prefetch;
1018          if (use_stride) __ mov(stride, prefetch);
1019       }
1020 
1021       __ bind(again);
1022 
1023       if (PrefetchCopyIntervalInBytes > 0)
1024         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1025 
1026       if (direction == copy_forwards) {
1027        // allowing for the offset of -8 the store instructions place
1028        // registers into the target 64 bit block at the following
1029        // offsets
1030        //
1031        // t0 at offset 0
1032        // t1 at offset 8,  t2 at offset 16
1033        // t3 at offset 24, t4 at offset 32
1034        // t5 at offset 40, t6 at offset 48
1035        // t7 at offset 56
1036 
1037         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1038         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1039         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1040         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1041         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1042         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1043         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1044         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1045         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1046       } else {
1047        // d was not offset when we started so the registers are
1048        // written into the 64 bit block preceding d with the following
1049        // offsets
1050        //
1051        // t1 at offset -8
1052        // t3 at offset -24, t0 at offset -16
1053        // t5 at offset -40, t2 at offset -32
1054        // t7 at offset -56, t4 at offset -48
1055        //                   t6 at offset -64
1056        //
1057        // note that this matches the offsets previously noted for the
1058        // loads
1059 
1060         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1061         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1062         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1063         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1064         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1065         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1066         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1067         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1068         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1069       }
1070 
1071       __ subs(count, count, 8);
1072       __ br(Assembler::HS, again);
1073 
1074       // Drain
1075       //
1076       // this uses the same pattern of offsets and register arguments
1077       // as above
1078       __ bind(drain);
1079       if (direction == copy_forwards) {
1080         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1081         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1082         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1083         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1084         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1085       } else {
1086         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1087         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1088         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1089         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1090         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1091       }
1092       // now we need to copy any remaining part block which may
1093       // include a 4 word subblock and/or a 2 word subblock.
1094       // bits 2 and 1 in the count are the tell-tale for whether we
1095       // have each such subblock
1096       {
1097         Label L1, L2;
1098         __ tbz(count, exact_log2(4), L1);
1099        // this is the same as above but copying only 4 longs hence
1100        // with only one intervening stp between the str instructions
1101        // but note that the offsets and registers still follow the
1102        // same pattern
1103         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1104         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1105         if (direction == copy_forwards) {
1106           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1107           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1108           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1109         } else {
1110           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1111           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1112           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1113         }
1114         __ bind(L1);
1115 
1116         __ tbz(count, 1, L2);
1117        // this is the same as above but copying only 2 longs hence
1118        // there is no intervening stp between the str instructions
1119        // but note that the offset and register patterns are still
1120        // the same
1121         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1122         if (direction == copy_forwards) {
1123           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1124           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1125         } else {
1126           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1127           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1128         }
1129         __ bind(L2);
1130 
1131        // for forwards copy we need to re-adjust the offsets we
1132        // applied so that s and d follow the last words written
1133 
1134        if (direction == copy_forwards) {
1135          __ add(s, s, 16);
1136          __ add(d, d, 8);
1137        }
1138 
1139       }
1140 
1141       __ ret(lr);
1142       }
1143   }
1144 
1145   // Small copy: less than 16 bytes.
1146   //
1147   // NB: Ignores all of the bits of count which represent more than 15
1148   // bytes, so a caller doesn't have to mask them.
1149 
1150   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1151     bool is_backwards = step < 0;
1152     size_t granularity = uabs(step);
1153     int direction = is_backwards ? -1 : 1;
1154 
1155     Label Lword, Lint, Lshort, Lbyte;
1156 
1157     assert(granularity
1158            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1159 
1160     const Register t0 = r3;
1161     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1162     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1163 
1164     // ??? I don't know if this bit-test-and-branch is the right thing
1165     // to do.  It does a lot of jumping, resulting in several
1166     // mispredicted branches.  It might make more sense to do this
1167     // with something like Duff's device with a single computed branch.
1168 
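         // Copy at most one word, one int, one short and one byte, as selected by
         // the low bits of the element count; tests that cannot apply at this
         // granularity are compiled out.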
1169     __ tbz(count, 3 - exact_log2(granularity), Lword);
1170     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1171     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1172     __ bind(Lword);
1173 
1174     if (granularity <= sizeof (jint)) {
1175       __ tbz(count, 2 - exact_log2(granularity), Lint);
1176       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1177       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1178       __ bind(Lint);
1179     }
1180 
1181     if (granularity <= sizeof (jshort)) {
1182       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1183       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1184       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1185       __ bind(Lshort);
1186     }
1187 
1188     if (granularity <= sizeof (jbyte)) {
1189       __ tbz(count, 0, Lbyte);
1190       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1191       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1192       __ bind(Lbyte);
1193     }
1194   }
1195 
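       // Entry points of the bulk word-copy stubs built by generate_copy_longs;
       // copy_memory() branches to these with bl for the main part of a copy.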
1196   Label copy_f, copy_b;
1197   Label copy_obj_f, copy_obj_b;
1198   Label copy_obj_uninit_f, copy_obj_uninit_b;
1199 
1200   // All-singing all-dancing memory copy.
1201   //
1202   // Copy count units of memory from s to d.  The size of a unit is
1203   // step, which can be positive or negative depending on the direction
1204   // of copy.  If is_aligned is false, we align the source address.
1205   //
1206 
1207   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1208                    Register s, Register d, Register count, int step) {
1209     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1210     bool is_backwards = step < 0;
1211     unsigned int granularity = uabs(step);
1212     const Register t0 = r3, t1 = r4;
1213 
1214     // Copies of <= 80 bytes (or 96 with SIMD) are done inline. Direction doesn't matter
1215     // because we always load all the data before writing anything.
1216     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1217     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1218     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1219     const Register send = r17, dend = r16;
1220     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1221     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1222     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1223 
1224     if (PrefetchCopyIntervalInBytes > 0)
1225       __ prfm(Address(s, 0), PLDL1KEEP);
1226     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1227     __ br(Assembler::HI, copy_big);
1228 
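         // send and dend point just past the last source and destination elements;
         // the small cases below use them to copy the tail with negative offsets.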
1229     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1230     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1231 
1232     __ cmp(count, u1(16/granularity));
1233     __ br(Assembler::LS, copy16);
1234 
1235     __ cmp(count, u1(64/granularity));
1236     __ br(Assembler::HI, copy80);
1237 
1238     __ cmp(count, u1(32/granularity));
1239     __ br(Assembler::LS, copy32);
1240 
1241     // 33..64 bytes
1242     if (UseSIMDForMemoryOps) {
1243       bs.copy_load_at_32(v0, v1, Address(s, 0));
1244       bs.copy_load_at_32(v2, v3, Address(send, -32));
1245       bs.copy_store_at_32(Address(d, 0), v0, v1);
1246       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1247     } else {
1248       bs.copy_load_at_16(t0, t1, Address(s, 0));
1249       bs.copy_load_at_16(t2, t3, Address(s, 16));
1250       bs.copy_load_at_16(t4, t5, Address(send, -32));
1251       bs.copy_load_at_16(t6, t7, Address(send, -16));
1252 
1253       bs.copy_store_at_16(Address(d, 0), t0, t1);
1254       bs.copy_store_at_16(Address(d, 16), t2, t3);
1255       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1256       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1257     }
1258     __ b(finish);
1259 
1260     // 17..32 bytes
1261     __ bind(copy32);
1262     bs.copy_load_at_16(t0, t1, Address(s, 0));
1263     bs.copy_load_at_16(t6, t7, Address(send, -16));
1264 
1265     bs.copy_store_at_16(Address(d, 0), t0, t1);
1266     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1267     __ b(finish);
1268 
1269     // 65..80/96 bytes
1270     // (96 bytes if SIMD because we do 32 bytes per instruction)
1271     __ bind(copy80);
1272     if (UseSIMDForMemoryOps) {
1273       bs.copy_load_at_32(v0, v1, Address(s, 0));
1274       bs.copy_load_at_32(v2, v3, Address(s, 32));
1275       // Unaligned pointers can be an issue for copying.
1276       // The issue is more likely when the granularity of the data is
1277       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1278       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1279       // The biggest performance drop has been seen for the 65-80 byte range.
1280       // For such cases using a pair of ldp/stp instead of the third pair of
1281       // ldpq/stpq fixes the performance issue.
1282       if (granularity < sizeof (jint)) {
1283         Label copy96;
1284         __ cmp(count, u1(80/granularity));
1285         __ br(Assembler::HI, copy96);
1286         bs.copy_load_at_16(t0, t1, Address(send, -16));
1287 
1288         bs.copy_store_at_32(Address(d, 0), v0, v1);
1289         bs.copy_store_at_32(Address(d, 32), v2, v3);
1290 
1291         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1292         __ b(finish);
1293 
1294         __ bind(copy96);
1295       }
1296       bs.copy_load_at_32(v4, v5, Address(send, -32));
1297 
1298       bs.copy_store_at_32(Address(d, 0), v0, v1);
1299       bs.copy_store_at_32(Address(d, 32), v2, v3);
1300 
1301       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1302     } else {
1303       bs.copy_load_at_16(t0, t1, Address(s, 0));
1304       bs.copy_load_at_16(t2, t3, Address(s, 16));
1305       bs.copy_load_at_16(t4, t5, Address(s, 32));
1306       bs.copy_load_at_16(t6, t7, Address(s, 48));
1307       bs.copy_load_at_16(t8, t9, Address(send, -16));
1308 
1309       bs.copy_store_at_16(Address(d, 0), t0, t1);
1310       bs.copy_store_at_16(Address(d, 16), t2, t3);
1311       bs.copy_store_at_16(Address(d, 32), t4, t5);
1312       bs.copy_store_at_16(Address(d, 48), t6, t7);
1313       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1314     }
1315     __ b(finish);
1316 
1317     // 0..16 bytes
1318     __ bind(copy16);
1319     __ cmp(count, u1(8/granularity));
1320     __ br(Assembler::LO, copy8);
1321 
1322     // 8..16 bytes
1323     bs.copy_load_at_8(t0, Address(s, 0));
1324     bs.copy_load_at_8(t1, Address(send, -8));
1325     bs.copy_store_at_8(Address(d, 0), t0);
1326     bs.copy_store_at_8(Address(dend, -8), t1);
1327     __ b(finish);
1328 
1329     if (granularity < 8) {
1330       // 4..7 bytes
1331       __ bind(copy8);
1332       __ tbz(count, 2 - exact_log2(granularity), copy4);
1333       __ ldrw(t0, Address(s, 0));
1334       __ ldrw(t1, Address(send, -4));
1335       __ strw(t0, Address(d, 0));
1336       __ strw(t1, Address(dend, -4));
1337       __ b(finish);
1338       if (granularity < 4) {
1339         // 0..3 bytes
1340         __ bind(copy4);
1341         __ cbz(count, finish); // get rid of 0 case
1342         if (granularity == 2) {
1343           __ ldrh(t0, Address(s, 0));
1344           __ strh(t0, Address(d, 0));
1345         } else { // granularity == 1
1346           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1347           // the first and last byte.
1348           // Handle the 3 byte case by loading and storing base + count/2
1349           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1350           // This does mean that in the 1 byte case we load/store the same
1351           // byte 3 times.
1352           __ lsr(count, count, 1);
1353           __ ldrb(t0, Address(s, 0));
1354           __ ldrb(t1, Address(send, -1));
1355           __ ldrb(t2, Address(s, count));
1356           __ strb(t0, Address(d, 0));
1357           __ strb(t1, Address(dend, -1));
1358           __ strb(t2, Address(d, count));
1359         }
1360         __ b(finish);
1361       }
1362     }
1363 
1364     __ bind(copy_big);
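         // For a backwards copy the incoming s and d are the low addresses;
         // advance them past the end of the data so the negative-step code can
         // work downwards.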
1365     if (is_backwards) {
1366       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1367       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1368     }
1369 
1370     // Now we've got the small case out of the way we can align the
1371     // source address on a 2-word boundary.
1372 
1373     // Here we will materialize a count in r15, which is used by copy_memory_small
1374     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1375     // Up until here, we have used t9, which aliases r15, but from here on, that register
1376     // can not be used as a temp register, as it contains the count.
1377 
1378     Label aligned;
1379 
1380     if (is_aligned) {
1381       // We may have to adjust by 1 word to get s 2-word-aligned.
1382       __ tbz(s, exact_log2(wordSize), aligned);
1383       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1384       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1385       __ sub(count, count, wordSize/granularity);
1386     } else {
1387       if (is_backwards) {
1388         __ andr(r15, s, 2 * wordSize - 1);
1389       } else {
1390         __ neg(r15, s);
1391         __ andr(r15, r15, 2 * wordSize - 1);
1392       }
1393       // r15 is the byte adjustment needed to align s.
1394       __ cbz(r15, aligned);
1395       int shift = exact_log2(granularity);
1396       if (shift)  __ lsr(r15, r15, shift);
1397       __ sub(count, count, r15);
1398 
1399 #if 0
1400       // ?? This code is only correct for a disjoint copy.  It may or
1401       // may not make sense to use it in that case.
1402 
1403       // Copy the first pair; s and d may not be aligned.
1404       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1405       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1406 
1407       // Align s and d, adjust count
1408       if (is_backwards) {
1409         __ sub(s, s, r15);
1410         __ sub(d, d, r15);
1411       } else {
1412         __ add(s, s, r15);
1413         __ add(d, d, r15);
1414       }
1415 #else
1416       copy_memory_small(decorators, type, s, d, r15, step);
1417 #endif
1418     }
1419 
1420     __ bind(aligned);
1421 
1422     // s is now 2-word-aligned.
1423 
1424     // We have a count of units and some trailing bytes.  Adjust the
1425     // count and do a bulk copy of words.
1426     __ lsr(r15, count, exact_log2(wordSize/granularity));
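         // r15 = count / (wordSize/granularity): the number of whole 8-byte words
         // for the bulk word-copy stubs; the leftover elements are picked up by
         // the tail copy after the call.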
1427     if (direction == copy_forwards) {
1428       if (type != T_OBJECT) {
1429         __ bl(copy_f);
1430       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1431         __ bl(copy_obj_uninit_f);
1432       } else {
1433         __ bl(copy_obj_f);
1434       }
1435     } else {
1436       if (type != T_OBJECT) {
1437         __ bl(copy_b);
1438       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1439         __ bl(copy_obj_uninit_b);
1440       } else {
1441         __ bl(copy_obj_b);
1442       }
1443     }
1444 
1445     // And the tail.
1446     copy_memory_small(decorators, type, s, d, count, step);
1447 
1448     if (granularity >= 8) __ bind(copy8);
1449     if (granularity >= 4) __ bind(copy4);
1450     __ bind(finish);
1451   }
1452 
1453 
1454   void clobber_registers() {
1455 #ifdef ASSERT
1456     RegSet clobbered
1457       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1458     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1459     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1460     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1461       __ mov(*it, rscratch1);
1462     }
1463 #endif
1464 
1465   }
1466 
1467   // Scan over array at a for count oops, verifying each one.
1468   // Preserves a and count; clobbers temp, rscratch1 and rscratch2.
1469   void verify_oop_array (int size, Register a, Register count, Register temp) {
1470     Label loop, end;
1471     __ mov(rscratch1, a);
1472     __ mov(rscratch2, zr);
1473     __ bind(loop);
1474     __ cmp(rscratch2, count);
1475     __ br(Assembler::HS, end);
1476     if (size == wordSize) {
1477       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1478       __ verify_oop(temp);
1479     } else {
1480       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1481       __ decode_heap_oop(temp); // calls verify_oop
1482     }
1483     __ add(rscratch2, rscratch2, 1);
1484     __ b(loop);
1485     __ bind(end);
1486   }
1487 
1488   // Arguments:
1489   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1490   //             ignored
1491   //   is_oop  - true => oop array, so generate store check code
1492   //   name    - stub name string
1493   //
1494   // Inputs:
1495   //   c_rarg0   - source array address
1496   //   c_rarg1   - destination array address
1497   //   c_rarg2   - element count, treated as ssize_t, can be zero
1498   //
1499   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1500   // the hardware handle it.  The two dwords within qwords that span
1501   // cache line boundaries will still be loaded and stored atomically.
1502   //
1503   // Side Effects:
1504   //   disjoint_int_copy_entry is set to the no-overlap entry point
1505   //   used by generate_conjoint_int_oop_copy().
1506   //
1507   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1508                                   const char *name, bool dest_uninitialized = false) {
1509     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1510     RegSet saved_reg = RegSet::of(s, d, count);
1511     __ align(CodeEntryAlignment);
1512     StubCodeMark mark(this, "StubRoutines", name);
1513     address start = __ pc();
1514     __ enter();
1515 
1516     if (entry != nullptr) {
1517       *entry = __ pc();
1518       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1519       BLOCK_COMMENT("Entry:");
1520     }
1521 
1522     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1523     if (dest_uninitialized) {
1524       decorators |= IS_DEST_UNINITIALIZED;
1525     }
1526     if (aligned) {
1527       decorators |= ARRAYCOPY_ALIGNED;
1528     }
1529 
1530     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1531     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1532 
1533     if (is_oop) {
1534       // save regs before copy_memory
1535       __ push(RegSet::of(d, count), sp);
1536     }
1537     {
1538       // UnsafeCopyMemory page error: continue after ucm
1539       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1540       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1541       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1542     }
1543 
1544     if (is_oop) {
1545       __ pop(RegSet::of(d, count), sp);
1546       if (VerifyOops)
1547         verify_oop_array(size, d, count, r16);
1548     }
1549 
1550     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1551 
1552     __ leave();
1553     __ mov(r0, zr); // return 0
1554     __ ret(lr);
1555     return start;
1556   }
1557 
1558   // Arguments:
1559   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1560   //             ignored
1561   //   is_oop  - true => oop array, so generate store check code
1562   //   name    - stub name string
1563   //
1564   // Inputs:
1565   //   c_rarg0   - source array address
1566   //   c_rarg1   - destination array address
1567   //   c_rarg2   - element count, treated as ssize_t, can be zero
1568   //
1569   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1570   // the hardware handle it.  The two dwords within qwords that span
1571   // cache line boundaries will still be loaded and stored atomically.
1572   //
1573   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1574                                  address *entry, const char *name,
1575                                  bool dest_uninitialized = false) {
1576     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1577     RegSet saved_regs = RegSet::of(s, d, count);
1578     StubCodeMark mark(this, "StubRoutines", name);
1579     address start = __ pc();
1580     __ enter();
1581 
1582     if (entry != nullptr) {
1583       *entry = __ pc();
1584       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1585       BLOCK_COMMENT("Entry:");
1586     }
1587 
1588     // use fwd copy when (d-s) above_equal (count*size)
1589     __ sub(rscratch1, d, s);
1590     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1591     __ br(Assembler::HS, nooverlap_target);
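         // The comparison is unsigned, so d < s wraps to a large value and also
         // takes the forward path; only a genuinely overlapping destination above
         // the source falls through to this backward copy.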
1592 
1593     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1594     if (dest_uninitialized) {
1595       decorators |= IS_DEST_UNINITIALIZED;
1596     }
1597     if (aligned) {
1598       decorators |= ARRAYCOPY_ALIGNED;
1599     }
1600 
1601     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1602     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1603 
1604     if (is_oop) {
1605       // save regs before copy_memory
1606       __ push(RegSet::of(d, count), sp);
1607     }
1608     {
1609       // UnsafeCopyMemory page error: continue after ucm
1610       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1611       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1612       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1613     }
1614     if (is_oop) {
1615       __ pop(RegSet::of(d, count), sp);
1616       if (VerifyOops)
1617         verify_oop_array(size, d, count, r16);
1618     }
1619     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1620     __ leave();
1621     __ mov(r0, zr); // return 0
1622     __ ret(lr);
1623     return start;
1624   }
1625 
1626   // Arguments:
1627   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1628   //             ignored
1629   //   name    - stub name string
1630   //
1631   // Inputs:
1632   //   c_rarg0   - source array address
1633   //   c_rarg1   - destination array address
1634   //   c_rarg2   - element count, treated as ssize_t, can be zero
1635   //
1636   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1637   // we let the hardware handle it.  The one to eight bytes within words,
1638   // dwords or qwords that span cache line boundaries will still be loaded
1639   // and stored atomically.
1640   //
1641   // Side Effects:
1649   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1650   //   used by generate_conjoint_byte_copy().
1651   //
1652   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1653     const bool not_oop = false;
1654     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1655   }
1656 
1657   // Arguments:
1658   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1659   //             ignored
1660   //   name    - stub name string
1661   //
1662   // Inputs:
1663   //   c_rarg0   - source array address
1664   //   c_rarg1   - destination array address
1665   //   c_rarg2   - element count, treated as ssize_t, can be zero
1666   //
1667   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1668   // we let the hardware handle it.  The one to eight bytes within words,
1669   // dwords or qwords that span cache line boundaries will still be loaded
1670   // and stored atomically.
1671   //
1672   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1673                                       address* entry, const char *name) {
1674     const bool not_oop = false;
1675     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1676   }
1677 
1678   // Arguments:
1679   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1680   //             ignored
1681   //   name    - stub name string
1682   //
1683   // Inputs:
1684   //   c_rarg0   - source array address
1685   //   c_rarg1   - destination array address
1686   //   c_rarg2   - element count, treated as ssize_t, can be zero
1687   //
1688   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1689   // let the hardware handle it.  The two or four words within dwords
1690   // or qwords that span cache line boundaries will still be loaded
1691   // and stored atomically.
1692   //
1693   // Side Effects:
1694   //   disjoint_short_copy_entry is set to the no-overlap entry point
1695   //   used by generate_conjoint_short_copy().
1696   //
1697   address generate_disjoint_short_copy(bool aligned,
1698                                        address* entry, const char *name) {
1699     const bool not_oop = false;
1700     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1701   }
1702 
1703   // Arguments:
1704   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1705   //             ignored
1706   //   name    - stub name string
1707   //
1708   // Inputs:
1709   //   c_rarg0   - source array address
1710   //   c_rarg1   - destination array address
1711   //   c_rarg2   - element count, treated as ssize_t, can be zero
1712   //
1713   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1714   // let the hardware handle it.  The two or four words within dwords
1715   // or qwords that span cache line boundaries will still be loaded
1716   // and stored atomically.
1717   //
1718   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1719                                        address *entry, const char *name) {
1720     const bool not_oop = false;
1721     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1722   }
1723
1724   // Arguments:
1725   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1726   //             ignored
1727   //   name    - stub name string
1728   //
1729   // Inputs:
1730   //   c_rarg0   - source array address
1731   //   c_rarg1   - destination array address
1732   //   c_rarg2   - element count, treated as ssize_t, can be zero
1733   //
1734   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1735   // the hardware handle it.  The two dwords within qwords that span
1736   // cache line boundaries will still be loaded and stored atomically.
1737   //
1738   // Side Effects:
1739   //   disjoint_int_copy_entry is set to the no-overlap entry point
1740   //   used by generate_conjoint_int_oop_copy().
1741   //
1742   address generate_disjoint_int_copy(bool aligned, address *entry,
1743                                          const char *name, bool dest_uninitialized = false) {
1744     const bool not_oop = false;
1745     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1746   }
1747 
1748   // Arguments:
1749   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1750   //             ignored
1751   //   name    - stub name string
1752   //
1753   // Inputs:
1754   //   c_rarg0   - source array address
1755   //   c_rarg1   - destination array address
1756   //   c_rarg2   - element count, treated as ssize_t, can be zero
1757   //
1758   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1759   // the hardware handle it.  The two dwords within qwords that span
1760   // cache line boundaries will still be loaded and stored atomically.
1761   //
1762   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1763                                      address *entry, const char *name,
1764                                      bool dest_uninitialized = false) {
1765     const bool not_oop = false;
1766     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1767   }
1768 
1769 
1770   // Arguments:
1771   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1772   //             ignored
1773   //   name    - stub name string
1774   //
1775   // Inputs:
1776   //   c_rarg0   - source array address
1777   //   c_rarg1   - destination array address
1778   //   c_rarg2   - element count, treated as size_t, can be zero
1779   //
1780   // Side Effects:
1781   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1782   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1783   //
1784   address generate_disjoint_long_copy(bool aligned, address *entry,
1785                                           const char *name, bool dest_uninitialized = false) {
1786     const bool not_oop = false;
1787     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1788   }
1789 
1790   // Arguments:
1791   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1792   //             ignored
1793   //   name    - stub name string
1794   //
1795   // Inputs:
1796   //   c_rarg0   - source array address
1797   //   c_rarg1   - destination array address
1798   //   c_rarg2   - element count, treated as size_t, can be zero
1799   //
1800   address generate_conjoint_long_copy(bool aligned,
1801                                       address nooverlap_target, address *entry,
1802                                       const char *name, bool dest_uninitialized = false) {
1803     const bool not_oop = false;
1804     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1805   }
1806 
1807   // Arguments:
1808   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1809   //             ignored
1810   //   name    - stub name string
1811   //
1812   // Inputs:
1813   //   c_rarg0   - source array address
1814   //   c_rarg1   - destination array address
1815   //   c_rarg2   - element count, treated as size_t, can be zero
1816   //
1817   // Side Effects:
1818   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1819   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1820   //
1821   address generate_disjoint_oop_copy(bool aligned, address *entry,
1822                                      const char *name, bool dest_uninitialized) {
1823     const bool is_oop = true;
1824     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1825     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1826   }
1827 
1828   // Arguments:
1829   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1830   //             ignored
1831   //   name    - stub name string
1832   //
1833   // Inputs:
1834   //   c_rarg0   - source array address
1835   //   c_rarg1   - destination array address
1836   //   c_rarg2   - element count, treated as size_t, can be zero
1837   //
1838   address generate_conjoint_oop_copy(bool aligned,
1839                                      address nooverlap_target, address *entry,
1840                                      const char *name, bool dest_uninitialized) {
1841     const bool is_oop = true;
1842     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1843     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1844                                   name, dest_uninitialized);
1845   }
1846 
1847 
1848   // Helper for generating a dynamic type check.
1849   // Smashes rscratch1, rscratch2.
1850   void generate_type_check(Register sub_klass,
1851                            Register super_check_offset,
1852                            Register super_klass,
1853                            Label& L_success) {
1854     assert_different_registers(sub_klass, super_check_offset, super_klass);
1855 
1856     BLOCK_COMMENT("type_check:");
1857 
1858     Label L_miss;
1859 
1860     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1861                                      super_check_offset);
1862     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
1863 
1864     // Fall through on failure!
1865     __ BIND(L_miss);
1866   }
1867 
1868   //
1869   //  Generate checkcasting array copy stub
1870   //
1871   //  Input:
1872   //    c_rarg0   - source array address
1873   //    c_rarg1   - destination array address
1874   //    c_rarg2   - element count, treated as ssize_t, can be zero
1875   //    c_rarg3   - size_t ckoff (super_check_offset)
1876   //    c_rarg4   - oop ckval (super_klass)
1877   //
1878   //  Output:
1879   //    r0 ==  0  -  success
1880   //    r0 == -1^K - failure, where K is partial transfer count
1881   //
1882   address generate_checkcast_copy(const char *name, address *entry,
1883                                   bool dest_uninitialized = false) {
1884 
1885     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1886 
1887     // Input registers (after setup_arg_regs)
1888     const Register from        = c_rarg0;   // source array address
1889     const Register to          = c_rarg1;   // destination array address
1890     const Register count       = c_rarg2;   // elements count
1891     const Register ckoff       = c_rarg3;   // super_check_offset
1892     const Register ckval       = c_rarg4;   // super_klass
1893 
1894     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1895     RegSet wb_post_saved_regs = RegSet::of(count);
1896 
1897     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1898     const Register copied_oop  = r22;       // actual oop copied
1899     const Register count_save  = r21;       // orig elements count
1900     const Register start_to    = r20;       // destination array start address
1901     const Register r19_klass   = r19;       // oop._klass
1902 
1903     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1904     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1905 
1906     //---------------------------------------------------------------
1907     // Assembler stub will be used for this call to arraycopy
1908     // if the two arrays are subtypes of Object[] but the
1909     // destination array type is not equal to or a supertype
1910     // of the source type.  Each element must be separately
1911     // checked.
1912 
1913     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1914                                copied_oop, r19_klass, count_save);
1915 
1916     __ align(CodeEntryAlignment);
1917     StubCodeMark mark(this, "StubRoutines", name);
1918     address start = __ pc();
1919 
1920     __ enter(); // required for proper stackwalking of RuntimeStub frame
1921 
1922 #ifdef ASSERT
1923     // caller guarantees that the arrays really are different
1924     // otherwise, we would have to make conjoint checks
1925     { Label L;
1926       __ b(L);                  // conjoint check not yet implemented
1927       __ stop("checkcast_copy within a single array");
1928       __ bind(L);
1929     }
1930 #endif //ASSERT
1931 
1932     // Caller of this entry point must set up the argument registers.
1933     if (entry != nullptr) {
1934       *entry = __ pc();
1935       BLOCK_COMMENT("Entry:");
1936     }
1937 
1938      // Empty array:  Nothing to do.
1939     __ cbz(count, L_done);
1940     __ push(RegSet::of(r19, r20, r21, r22), sp);
1941 
1942 #ifdef ASSERT
1943     BLOCK_COMMENT("assert consistent ckoff/ckval");
1944     // The ckoff and ckval must be mutually consistent,
1945     // even though caller generates both.
1946     { Label L;
1947       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1948       __ ldrw(start_to, Address(ckval, sco_offset));
1949       __ cmpw(ckoff, start_to);
1950       __ br(Assembler::EQ, L);
1951       __ stop("super_check_offset inconsistent");
1952       __ bind(L);
1953     }
1954 #endif //ASSERT
1955 
1956     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1957     bool is_oop = true;
1958     int element_size = UseCompressedOops ? 4 : 8;
1959     if (dest_uninitialized) {
1960       decorators |= IS_DEST_UNINITIALIZED;
1961     }
1962 
1963     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1964     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1965 
1966     // save the original count
1967     __ mov(count_save, count);
1968 
1969     // Copy from low to high addresses
1970     __ mov(start_to, to);              // Save destination array start address
1971     __ b(L_load_element);
1972 
1973     // ======== begin loop ========
1974     // (Loop is rotated; its entry is L_load_element.)
1975     // Loop control:
1976     //   for (; count != 0; count--) {
1977     //     copied_oop = load_heap_oop(from++);
1978     //     ... generate_type_check ...;
1979     //     store_heap_oop(to++, copied_oop);
1980     //   }
1981     __ align(OptoLoopAlignment);
1982 
1983     __ BIND(L_store_element);
1984     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1985                       __ post(to, element_size), copied_oop, noreg,
1986                       gct1, gct2, gct3);
1987     __ sub(count, count, 1);
1988     __ cbz(count, L_do_card_marks);
1989 
1990     // ======== loop entry is here ========
1991     __ BIND(L_load_element);
1992     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1993                      copied_oop, noreg, __ post(from, element_size),
1994                      gct1);
1995     __ cbz(copied_oop, L_store_element);
1996 
1997     __ load_klass(r19_klass, copied_oop);// query the object klass
1998     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1999     // ======== end loop ========
2000 
2001     // It was a real error; we must depend on the caller to finish the job.
2002     // Register count = remaining oops, count_save = total oops.
2003     // Emit GC store barriers for the oops we have copied and report
2004     // their number to the caller.
2005 
2006     __ subs(count, count_save, count);     // K = partially copied oop count
2007     __ eon(count, count, zr);                   // report (-1^K) to caller
2008     __ br(Assembler::EQ, L_done_pop);
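         // e.g. if K == 2 oops were stored before the type check failed, r0 is
         // returned as ~2 == -3 and the caller recovers K as ~r0; K == 0 skips
         // the card marking entirely.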
2009 
2010     __ BIND(L_do_card_marks);
2011     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
2012 
2013     __ bind(L_done_pop);
2014     __ pop(RegSet::of(r19, r20, r21, r22), sp);
2015     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2016 
2017     __ bind(L_done);
2018     __ mov(r0, count);
2019     __ leave();
2020     __ ret(lr);
2021 
2022     return start;
2023   }
2024 
2025   // Perform range checks on the proposed arraycopy.
2026   // Kills temp, but nothing else.
2027   // Also, clean the sign bits of src_pos and dst_pos.
2028   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2029                               Register src_pos, // source position (c_rarg1)
2030                               Register dst,     // destination array oop (c_rarg2)
2031                               Register dst_pos, // destination position (c_rarg3)
2032                               Register length,
2033                               Register temp,
2034                               Label& L_failed) {
2035     BLOCK_COMMENT("arraycopy_range_checks:");
2036 
2037     assert_different_registers(rscratch1, temp);
2038 
2039     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2040     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2041     __ addw(temp, length, src_pos);
2042     __ cmpw(temp, rscratch1);
2043     __ br(Assembler::HI, L_failed);
2044 
2045     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2046     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2047     __ addw(temp, length, dst_pos);
2048     __ cmpw(temp, rscratch1);
2049     __ br(Assembler::HI, L_failed);
2050 
2051     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2052     __ movw(src_pos, src_pos);
2053     __ movw(dst_pos, dst_pos);
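         // (Writing the 32-bit W form of a register zero-extends into the upper
         // 32 bits, which is what clears the stale high halves here.)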
2054 
2055     BLOCK_COMMENT("arraycopy_range_checks done");
2056   }
2057 
2058   // These stubs get called from some dumb test routine.
2059   // I'll write them properly when they're called from
2060   // something that's actually doing something.
2061   static void fake_arraycopy_stub(address src, address dst, int count) {
2062     assert(count == 0, "huh?");
2063   }
2064 
2065 
2066   //
2067   //  Generate 'unsafe' array copy stub
2068   //  Though just as safe as the other stubs, it takes an unscaled
2069   //  size_t argument instead of an element count.
2070   //
2071   //  Input:
2072   //    c_rarg0   - source array address
2073   //    c_rarg1   - destination array address
2074   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2075   //
2076   // Examines the alignment of the operands and dispatches
2077   // to a long, int, short, or byte copy loop.
2078   //
2079   address generate_unsafe_copy(const char *name,
2080                                address byte_copy_entry,
2081                                address short_copy_entry,
2082                                address int_copy_entry,
2083                                address long_copy_entry) {
2084     Label L_long_aligned, L_int_aligned, L_short_aligned;
2085     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2086 
2087     __ align(CodeEntryAlignment);
2088     StubCodeMark mark(this, "StubRoutines", name);
2089     address start = __ pc();
2090     __ enter(); // required for proper stackwalking of RuntimeStub frame
2091 
2092     // bump this on entry, not on exit:
2093     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2094 
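         // OR the two addresses and the byte count together: the low bits of the
         // result are zero only if all three share that alignment, so we dispatch
         // to the widest element copy that is safe.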
2095     __ orr(rscratch1, s, d);
2096     __ orr(rscratch1, rscratch1, count);
2097 
2098     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2099     __ cbz(rscratch1, L_long_aligned);
2100     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2101     __ cbz(rscratch1, L_int_aligned);
2102     __ tbz(rscratch1, 0, L_short_aligned);
2103     __ b(RuntimeAddress(byte_copy_entry));
2104 
2105     __ BIND(L_short_aligned);
2106     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2107     __ b(RuntimeAddress(short_copy_entry));
2108     __ BIND(L_int_aligned);
2109     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2110     __ b(RuntimeAddress(int_copy_entry));
2111     __ BIND(L_long_aligned);
2112     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2113     __ b(RuntimeAddress(long_copy_entry));
2114 
2115     return start;
2116   }
2117 
2118   //
2119   //  Generate generic array copy stubs
2120   //
2121   //  Input:
2122   //    c_rarg0    -  src oop
2123   //    c_rarg1    -  src_pos (32-bits)
2124   //    c_rarg2    -  dst oop
2125   //    c_rarg3    -  dst_pos (32-bits)
2126   //    c_rarg4    -  element count (32-bits)
2127   //
2128   //  Output:
2129   //    r0 ==  0  -  success
2130   //    r0 == -1^K - failure, where K is partial transfer count
2131   //
2132   address generate_generic_copy(const char *name,
2133                                 address byte_copy_entry, address short_copy_entry,
2134                                 address int_copy_entry, address oop_copy_entry,
2135                                 address long_copy_entry, address checkcast_copy_entry) {
2136 
2137     Label L_failed, L_objArray;
2138     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2139 
2140     // Input registers
2141     const Register src        = c_rarg0;  // source array oop
2142     const Register src_pos    = c_rarg1;  // source position
2143     const Register dst        = c_rarg2;  // destination array oop
2144     const Register dst_pos    = c_rarg3;  // destination position
2145     const Register length     = c_rarg4;
2146 
2147 
2148     // Registers used as temps
2149     const Register dst_klass  = c_rarg5;
2150 
2151     __ align(CodeEntryAlignment);
2152 
2153     StubCodeMark mark(this, "StubRoutines", name);
2154 
2155     address start = __ pc();
2156 
2157     __ enter(); // required for proper stackwalking of RuntimeStub frame
2158 
2159     // bump this on entry, not on exit:
2160     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2161 
2162     //-----------------------------------------------------------------------
2163     // Assembler stub will be used for this call to arraycopy
2164     // if the following conditions are met:
2165     //
2166     // (1) src and dst must not be null.
2167     // (2) src_pos must not be negative.
2168     // (3) dst_pos must not be negative.
2169     // (4) length  must not be negative.
2170     // (5) src klass and dst klass should be the same and not null.
2171     // (6) src and dst should be arrays.
2172     // (7) src_pos + length must not exceed length of src.
2173     // (8) dst_pos + length must not exceed length of dst.
2174     //
2175 
2176     //  if (src == nullptr) return -1;
2177     __ cbz(src, L_failed);
2178 
2179     //  if (src_pos < 0) return -1;
2180     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2181 
2182     //  if (dst == nullptr) return -1;
2183     __ cbz(dst, L_failed);
2184 
2185     //  if (dst_pos < 0) return -1;
2186     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2187 
2188     // registers used as temp
2189     const Register scratch_length    = r16; // elements count to copy
2190     const Register scratch_src_klass = r17; // array klass
2191     const Register lh                = r15; // layout helper
2192 
2193     //  if (length < 0) return -1;
2194     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2195     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2196 
2197     __ load_klass(scratch_src_klass, src);
2198 #ifdef ASSERT
2199     //  assert(src->klass() != nullptr);
2200     {
2201       BLOCK_COMMENT("assert klasses not null {");
2202       Label L1, L2;
2203       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2204       __ bind(L1);
2205       __ stop("broken null klass");
2206       __ bind(L2);
2207       __ load_klass(rscratch1, dst);
2208       __ cbz(rscratch1, L1);     // this would be broken also
2209       BLOCK_COMMENT("} assert klasses not null done");
2210     }
2211 #endif
2212 
2213     // Load layout helper (32-bits)
2214     //
2215     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2216     // 32        30    24            16              8     2                 0
2217     //
2218     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2219     //
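         //   Both array tags have the top bit set, so lh < 0 means "some kind of
         //   array"; for a typeArray the low byte gives log2_element_size
         //   (0 for jbyte[], 3 for jlong[]), which drives the dispatch below.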
2220 
2221     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2222 
2223     // Handle objArrays completely differently...
2224     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2225     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2226     __ movw(rscratch1, objArray_lh);
2227     __ eorw(rscratch2, lh, rscratch1);
2228     __ cbzw(rscratch2, L_objArray);
2229 
2230     //  if (src->klass() != dst->klass()) return -1;
2231     __ load_klass(rscratch2, dst);
2232     __ eor(rscratch2, rscratch2, scratch_src_klass);
2233     __ cbnz(rscratch2, L_failed);
2234 
2235     // Check for flat inline type array -> return -1
2236     __ tst(lh, Klass::_lh_array_tag_flat_value_bit_inplace);
2237     __ br(Assembler::NE, L_failed);
2238 
2239     // Check for null-free (non-flat) inline type array -> return -1
2240     __ tst(lh, Klass::_lh_null_free_array_bit_inplace);
2241     __ br(Assembler::NE, L_failed);
2242 
2243     //  if (!src->is_Array()) return -1;
2244     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2245 
2246     // At this point, it is known to be a typeArray (array_tag 0x3).
2247 #ifdef ASSERT
2248     {
2249       BLOCK_COMMENT("assert primitive array {");
2250       Label L;
2251       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2252       __ cmpw(lh, rscratch2);
2253       __ br(Assembler::GE, L);
2254       __ stop("must be a primitive array");
2255       __ bind(L);
2256       BLOCK_COMMENT("} assert primitive array done");
2257     }
2258 #endif
2259 
2260     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2261                            rscratch2, L_failed);
2262 
2263     // TypeArrayKlass
2264     //
2265     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2266     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2267     //
2268 
2269     const Register rscratch1_offset = rscratch1;    // array offset
2270     const Register r15_elsize = lh; // element size
2271 
2272     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2273            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2274     __ add(src, src, rscratch1_offset);           // src array offset
2275     __ add(dst, dst, rscratch1_offset);           // dst array offset
2276     BLOCK_COMMENT("choose copy loop based on element size");
2277 
2278     // next registers should be set before the jump to corresponding stub
2279     const Register from     = c_rarg0;  // source array address
2280     const Register to       = c_rarg1;  // destination array address
2281     const Register count    = c_rarg2;  // elements count
2282 
2283     // 'from', 'to', 'count' registers should be set in such order
2284     // since they are the same as 'src', 'src_pos', 'dst'.
2285 
2286     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2287 
2288     // The possible values of elsize are 0-3, i.e. exact_log2(element
2289     // size in bytes).  We do a simple bitwise binary search.
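         // elsize bits (b1 b0): 00 -> bytes, 01 -> shorts, 10 -> ints, 11 -> longs;
         // testing bit 1 first and then bit 0 gives the two-level dispatch below.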
2290   __ BIND(L_copy_bytes);
2291     __ tbnz(r15_elsize, 1, L_copy_ints);
2292     __ tbnz(r15_elsize, 0, L_copy_shorts);
2293     __ lea(from, Address(src, src_pos));// src_addr
2294     __ lea(to,   Address(dst, dst_pos));// dst_addr
2295     __ movw(count, scratch_length); // length
2296     __ b(RuntimeAddress(byte_copy_entry));
2297 
2298   __ BIND(L_copy_shorts);
2299     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2300     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2301     __ movw(count, scratch_length); // length
2302     __ b(RuntimeAddress(short_copy_entry));
2303 
2304   __ BIND(L_copy_ints);
2305     __ tbnz(r15_elsize, 0, L_copy_longs);
2306     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2307     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2308     __ movw(count, scratch_length); // length
2309     __ b(RuntimeAddress(int_copy_entry));
2310 
2311   __ BIND(L_copy_longs);
2312 #ifdef ASSERT
2313     {
2314       BLOCK_COMMENT("assert long copy {");
2315       Label L;
2316       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2317       __ cmpw(r15_elsize, LogBytesPerLong);
2318       __ br(Assembler::EQ, L);
2319       __ stop("must be long copy, but elsize is wrong");
2320       __ bind(L);
2321       BLOCK_COMMENT("} assert long copy done");
2322     }
2323 #endif
2324     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2325     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2326     __ movw(count, scratch_length); // length
2327     __ b(RuntimeAddress(long_copy_entry));
2328 
2329     // ObjArrayKlass
2330   __ BIND(L_objArray);
2331     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2332 
2333     Label L_plain_copy, L_checkcast_copy;
2334     //  test array classes for subtyping
2335     __ load_klass(r15, dst);
2336     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2337     __ br(Assembler::NE, L_checkcast_copy);
2338 
2339     // Identically typed arrays can be copied without element-wise checks.
2340     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2341                            rscratch2, L_failed);
2342 
2343     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2344     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2345     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2346     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2347     __ movw(count, scratch_length); // length
2348   __ BIND(L_plain_copy);
2349     __ b(RuntimeAddress(oop_copy_entry));
2350 
2351   __ BIND(L_checkcast_copy);
2352     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2353     {
2354       // Before looking at dst.length, make sure dst is also an objArray.
2355       __ ldrw(rscratch1, Address(r15, lh_offset));
2356       __ movw(rscratch2, objArray_lh);
2357       __ eorw(rscratch1, rscratch1, rscratch2);
2358       __ cbnzw(rscratch1, L_failed);
2359 
2360       // It is safe to examine both src.length and dst.length.
2361       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2362                              r15, L_failed);
2363 
2364       __ load_klass(dst_klass, dst); // reload
2365 
2366       // Marshal the base address arguments now, freeing registers.
2367       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2368       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2369       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2370       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2371       __ movw(count, length);           // length (reloaded)
2372       Register sco_temp = c_rarg3;      // this register is free now
2373       assert_different_registers(from, to, count, sco_temp,
2374                                  dst_klass, scratch_src_klass);
2375       // assert_clean_int(count, sco_temp);
2376 
2377       // Generate the type check.
2378       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2379       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2380 
2381       // Smashes rscratch1, rscratch2
2382       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2383 
2384       // Fetch destination element klass from the ObjArrayKlass header.
2385       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2386       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2387       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2388 
2389       // the checkcast_copy loop needs two extra arguments:
2390       assert(c_rarg3 == sco_temp, "#3 already in place");
2391       // Set up arguments for checkcast_copy_entry.
2392       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2393       __ b(RuntimeAddress(checkcast_copy_entry));
2394     }
2395 
2396   __ BIND(L_failed);
2397     __ mov(r0, -1);
2398     __ leave();   // required for proper stackwalking of RuntimeStub frame
2399     __ ret(lr);
2400 
2401     return start;
2402   }
2403 
2404   //
2405   // Generate stub for array fill. If "aligned" is true, the
2406   // "to" address is assumed to be heapword aligned.
2407   //
2408   // Arguments for generated stub:
2409   //   to:    c_rarg0
2410   //   value: c_rarg1
2411   //   count: c_rarg2 treated as signed
2412   //
2413   address generate_fill(BasicType t, bool aligned, const char *name) {
2414     __ align(CodeEntryAlignment);
2415     StubCodeMark mark(this, "StubRoutines", name);
2416     address start = __ pc();
2417 
2418     BLOCK_COMMENT("Entry:");
2419 
2420     const Register to        = c_rarg0;  // destination array address
2421     const Register value     = c_rarg1;  // value
2422     const Register count     = c_rarg2;  // elements count
2423 
2424     const Register bz_base = r10;        // base for block_zero routine
2425     const Register cnt_words = r11;      // temp register
2426 
2427     __ enter();
2428 
2429     Label L_fill_elements, L_exit1;
2430 
2431     int shift = -1;
2432     switch (t) {
2433       case T_BYTE:
2434         shift = 0;
2435         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2436         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2437         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2438         __ br(Assembler::LO, L_fill_elements);
2439         break;
2440       case T_SHORT:
2441         shift = 1;
2442         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2443         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2444         __ br(Assembler::LO, L_fill_elements);
2445         break;
2446       case T_INT:
2447         shift = 2;
2448         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2449         __ br(Assembler::LO, L_fill_elements);
2450         break;
2451       default: ShouldNotReachHere();
2452     }
2453 
2454     // Align the destination address on an 8-byte boundary.
2455     Label L_skip_align1, L_skip_align2, L_skip_align4;
2456     if (!aligned) {
2457       switch (t) {
2458         case T_BYTE:
2459           // One byte misalignment happens only for byte arrays.
2460           __ tbz(to, 0, L_skip_align1);
2461           __ strb(value, Address(__ post(to, 1)));
2462           __ subw(count, count, 1);
2463           __ bind(L_skip_align1);
2464           // Fallthrough
2465         case T_SHORT:
2466           // Two bytes misalignment happens only for byte and short (char) arrays.
2467           __ tbz(to, 1, L_skip_align2);
2468           __ strh(value, Address(__ post(to, 2)));
2469           __ subw(count, count, 2 >> shift);
2470           __ bind(L_skip_align2);
2471           // Fallthrough
2472         case T_INT:
2473           // Align to 8 bytes, we know we are 4 byte aligned to start.
2474           __ tbz(to, 2, L_skip_align4);
2475           __ strw(value, Address(__ post(to, 4)));
2476           __ subw(count, count, 4 >> shift);
2477           __ bind(L_skip_align4);
2478           break;
2479         default: ShouldNotReachHere();
2480       }
2481     }
2482 
2483     //
2484     //  Fill large chunks
2485     //
2486     __ lsrw(cnt_words, count, 3 - shift); // number of words
2487     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2488     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
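         // cnt_words = number of 8-byte words to fill; value has been replicated
         // across all 64 bits (e.g. a byte fill of 0xAB is now 0xABAB...AB) and
         // count holds the elements left over after the word fill.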
2489     if (UseBlockZeroing) {
2490       Label non_block_zeroing, rest;
2491       // If the fill value is zero we can use the fast zero_words().
2492       __ cbnz(value, non_block_zeroing);
2493       __ mov(bz_base, to);
2494       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2495       address tpc = __ zero_words(bz_base, cnt_words);
2496       if (tpc == nullptr) {
2497         fatal("CodeCache is full at generate_fill");
2498       }
2499       __ b(rest);
2500       __ bind(non_block_zeroing);
2501       __ fill_words(to, cnt_words, value);
2502       __ bind(rest);
2503     } else {
2504       __ fill_words(to, cnt_words, value);
2505     }
2506 
2507     // Remaining count is less than 8 bytes. Fill it by a single store.
2508     // Note that the total length is no less than 8 bytes.
2509     if (t == T_BYTE || t == T_SHORT) {
2510       Label L_exit1;
2511       __ cbzw(count, L_exit1);
2512       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2513       __ str(value, Address(to, -8));    // overwrite some elements
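           // This single unaligned 8-byte store starts before 'to' and re-writes
           // up to 7 elements that were already filled, which is harmless since
           // they receive the same replicated pattern (total length is >= 8 bytes).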
2514       __ bind(L_exit1);
2515       __ leave();
2516       __ ret(lr);
2517     }
2518 
2519     // Handle fills of less than 8 bytes.
2520     Label L_fill_2, L_fill_4, L_exit2;
2521     __ bind(L_fill_elements);
2522     switch (t) {
2523       case T_BYTE:
2524         __ tbz(count, 0, L_fill_2);
2525         __ strb(value, Address(__ post(to, 1)));
2526         __ bind(L_fill_2);
2527         __ tbz(count, 1, L_fill_4);
2528         __ strh(value, Address(__ post(to, 2)));
2529         __ bind(L_fill_4);
2530         __ tbz(count, 2, L_exit2);
2531         __ strw(value, Address(to));
2532         break;
2533       case T_SHORT:
2534         __ tbz(count, 0, L_fill_4);
2535         __ strh(value, Address(__ post(to, 2)));
2536         __ bind(L_fill_4);
2537         __ tbz(count, 1, L_exit2);
2538         __ strw(value, Address(to));
2539         break;
2540       case T_INT:
2541         __ cbzw(count, L_exit2);
2542         __ strw(value, Address(to));
2543         break;
2544       default: ShouldNotReachHere();
2545     }
2546     __ bind(L_exit2);
2547     __ leave();
2548     __ ret(lr);
2549     return start;
2550   }
2551 
2552   address generate_data_cache_writeback() {
2553     const Register line        = c_rarg0;  // address of line to write back
2554 
2555     __ align(CodeEntryAlignment);
2556 
2557     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2558 
2559     address start = __ pc();
2560     __ enter();
2561     __ cache_wb(Address(line, 0));
2562     __ leave();
2563     __ ret(lr);
2564 
2565     return start;
2566   }
2567 
2568   address generate_data_cache_writeback_sync() {
2569     const Register is_pre     = c_rarg0;  // pre or post sync
2570 
2571     __ align(CodeEntryAlignment);
2572 
2573     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2574 
2575     // pre wbsync is a no-op
2576     // post wbsync emits a store/memory barrier (the x86 version uses an sfence)
2577 
2578     Label skip;
2579     address start = __ pc();
2580     __ enter();
2581     __ cbnz(is_pre, skip);
2582     __ cache_wbsync(false);
2583     __ bind(skip);
2584     __ leave();
2585     __ ret(lr);
2586 
2587     return start;
2588   }
2589 
2590   void generate_arraycopy_stubs() {
2591     address entry;
2592     address entry_jbyte_arraycopy;
2593     address entry_jshort_arraycopy;
2594     address entry_jint_arraycopy;
2595     address entry_oop_arraycopy;
2596     address entry_jlong_arraycopy;
2597     address entry_checkcast_arraycopy;
2598 
2599     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards);
2600     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards);
2601 
2602     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards);
2603     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards);
2604 
2605     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards);
2606     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards);
2607 
2608     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2609 
2610     //*** jbyte
2611     // Always need aligned and unaligned versions
2612     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2613                                                                                   "jbyte_disjoint_arraycopy");
2614     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2615                                                                                   &entry_jbyte_arraycopy,
2616                                                                                   "jbyte_arraycopy");
2617     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2618                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2619     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, nullptr,
2620                                                                                   "arrayof_jbyte_arraycopy");
2621 
2622     //*** jshort
2623     // Always need aligned and unaligned versions
2624     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2625                                                                                     "jshort_disjoint_arraycopy");
2626     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2627                                                                                     &entry_jshort_arraycopy,
2628                                                                                     "jshort_arraycopy");
2629     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2630                                                                                     "arrayof_jshort_disjoint_arraycopy");
2631     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2632                                                                                     "arrayof_jshort_arraycopy");
2633 
2634     //*** jint
2635     // Aligned versions
2636     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2637                                                                                 "arrayof_jint_disjoint_arraycopy");
2638     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2639                                                                                 "arrayof_jint_arraycopy");
2640     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2641     // entry_jint_arraycopy always points to the unaligned version
2642     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2643                                                                                 "jint_disjoint_arraycopy");
2644     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2645                                                                                 &entry_jint_arraycopy,
2646                                                                                 "jint_arraycopy");
2647 
2648     //*** jlong
2649     // It is always aligned
2650     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2651                                                                                   "arrayof_jlong_disjoint_arraycopy");
2652     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2653                                                                                   "arrayof_jlong_arraycopy");
2654     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2655     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2656 
2657     //*** oops
2658     {
2659       // With compressed oops we need unaligned versions; notice that
2660       // we overwrite entry_oop_arraycopy.
2661       bool aligned = !UseCompressedOops;
2662 
2663       StubRoutines::_arrayof_oop_disjoint_arraycopy
2664         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2665                                      /*dest_uninitialized*/false);
2666       StubRoutines::_arrayof_oop_arraycopy
2667         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2668                                      /*dest_uninitialized*/false);
2669       // Aligned versions without pre-barriers
2670       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2671         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2672                                      /*dest_uninitialized*/true);
2673       StubRoutines::_arrayof_oop_arraycopy_uninit
2674         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2675                                      /*dest_uninitialized*/true);
2676     }
2677 
2678     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2679     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2680     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2681     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2682 
2683     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2684     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2685                                                                         /*dest_uninitialized*/true);
2686 
2687     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2688                                                               entry_jbyte_arraycopy,
2689                                                               entry_jshort_arraycopy,
2690                                                               entry_jint_arraycopy,
2691                                                               entry_jlong_arraycopy);
2692 
2693     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2694                                                                entry_jbyte_arraycopy,
2695                                                                entry_jshort_arraycopy,
2696                                                                entry_jint_arraycopy,
2697                                                                entry_oop_arraycopy,
2698                                                                entry_jlong_arraycopy,
2699                                                                entry_checkcast_arraycopy);
2700 
2701     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2702     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2703     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2704     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2705     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2706     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2707   }
2708 
2709   void generate_math_stubs() { Unimplemented(); }
2710 
2711   // Arguments:
2712   //
2713   // Inputs:
2714   //   c_rarg0   - source byte array address
2715   //   c_rarg1   - destination byte array address
2716   //   c_rarg2   - K (key) in little endian int array
2717   //
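  // The expanded key K is the int[] produced by the Java AES key schedule;
  // its length in ints (read into keylen below) is 44, 52 or 60 for
  // AES-128, AES-192 and AES-256.  A rough sketch of the correspondence
  // (standard AES arithmetic, not code from this stub):
  //
  //   rounds   = keylen / 4 - 1;      // 10, 12 or 14
  //   key_bits = (keylen - 28) * 8;   // 128, 192 or 256
  //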
2718   address generate_aescrypt_encryptBlock() {
2719     __ align(CodeEntryAlignment);
2720     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2721 
2722     const Register from        = c_rarg0;  // source array address
2723     const Register to          = c_rarg1;  // destination array address
2724     const Register key         = c_rarg2;  // key array address
2725     const Register keylen      = rscratch1;
2726 
2727     address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame
2729 
2730     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2731 
2732     __ aesenc_loadkeys(key, keylen);
2733     __ aesecb_encrypt(from, to, keylen);
2734 
2735     __ mov(r0, 0);
2736 
2737     __ leave();
2738     __ ret(lr);
2739 
2740     return start;
2741   }
2742 
2743   // Arguments:
2744   //
2745   // Inputs:
2746   //   c_rarg0   - source byte array address
2747   //   c_rarg1   - destination byte array address
2748   //   c_rarg2   - K (key) in little endian int array
2749   //
2750   address generate_aescrypt_decryptBlock() {
2751     assert(UseAES, "need AES cryptographic extension support");
2752     __ align(CodeEntryAlignment);
2753     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2754     Label L_doLast;
2755 
2756     const Register from        = c_rarg0;  // source array address
2757     const Register to          = c_rarg1;  // destination array address
2758     const Register key         = c_rarg2;  // key array address
2759     const Register keylen      = rscratch1;
2760 
2761     address start = __ pc();
2762     __ enter(); // required for proper stackwalking of RuntimeStub frame
2763 
2764     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2765 
2766     __ aesecb_decrypt(from, to, key, keylen);
2767 
2768     __ mov(r0, 0);
2769 
2770     __ leave();
2771     __ ret(lr);
2772 
2773     return start;
2774   }
2775 
2776   // Arguments:
2777   //
2778   // Inputs:
2779   //   c_rarg0   - source byte array address
2780   //   c_rarg1   - destination byte array address
2781   //   c_rarg2   - K (key) in little endian int array
2782   //   c_rarg3   - r vector byte array address
2783   //   c_rarg4   - input length
2784   //
2785   // Output:
  //   r0        - input length
2787   //
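  // For reference, the CBC encryption recurrence implemented below, as
  // scalar pseudocode (illustrative only; the names are not from the
  // generated code):
  //
  //   r = rvec;                        // 16-byte chaining value (IV at first)
  //   for (i = 0; i < len; i += 16) {
  //     r = AES_encrypt(in[i..i+15] ^ r, K);
  //     out[i..i+15] = r;
  //   }
  //   rvec = r;                        // saved for the next invocation
  //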
2788   address generate_cipherBlockChaining_encryptAESCrypt() {
2789     assert(UseAES, "need AES cryptographic extension support");
2790     __ align(CodeEntryAlignment);
2791     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2792 
2793     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2794 
2795     const Register from        = c_rarg0;  // source array address
2796     const Register to          = c_rarg1;  // destination array address
2797     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
                                           // and left holding the last ciphertext block (the new r vector)
2800     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2801     const Register keylen      = rscratch1;
2802 
2803     address start = __ pc();
2804 
2805       __ enter();
2806 
2807       __ movw(rscratch2, len_reg);
2808 
2809       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2810 
2811       __ ld1(v0, __ T16B, rvec);
2812 
2813       __ cmpw(keylen, 52);
2814       __ br(Assembler::CC, L_loadkeys_44);
2815       __ br(Assembler::EQ, L_loadkeys_52);
2816 
2817       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2818       __ rev32(v17, __ T16B, v17);
2819       __ rev32(v18, __ T16B, v18);
2820     __ BIND(L_loadkeys_52);
2821       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2822       __ rev32(v19, __ T16B, v19);
2823       __ rev32(v20, __ T16B, v20);
2824     __ BIND(L_loadkeys_44);
2825       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2826       __ rev32(v21, __ T16B, v21);
2827       __ rev32(v22, __ T16B, v22);
2828       __ rev32(v23, __ T16B, v23);
2829       __ rev32(v24, __ T16B, v24);
2830       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2831       __ rev32(v25, __ T16B, v25);
2832       __ rev32(v26, __ T16B, v26);
2833       __ rev32(v27, __ T16B, v27);
2834       __ rev32(v28, __ T16B, v28);
2835       __ ld1(v29, v30, v31, __ T16B, key);
2836       __ rev32(v29, __ T16B, v29);
2837       __ rev32(v30, __ T16B, v30);
2838       __ rev32(v31, __ T16B, v31);
2839 
2840     __ BIND(L_aes_loop);
2841       __ ld1(v1, __ T16B, __ post(from, 16));
2842       __ eor(v0, __ T16B, v0, v1);
2843 
2844       __ br(Assembler::CC, L_rounds_44);
2845       __ br(Assembler::EQ, L_rounds_52);
2846 
2847       __ aese(v0, v17); __ aesmc(v0, v0);
2848       __ aese(v0, v18); __ aesmc(v0, v0);
2849     __ BIND(L_rounds_52);
2850       __ aese(v0, v19); __ aesmc(v0, v0);
2851       __ aese(v0, v20); __ aesmc(v0, v0);
2852     __ BIND(L_rounds_44);
2853       __ aese(v0, v21); __ aesmc(v0, v0);
2854       __ aese(v0, v22); __ aesmc(v0, v0);
2855       __ aese(v0, v23); __ aesmc(v0, v0);
2856       __ aese(v0, v24); __ aesmc(v0, v0);
2857       __ aese(v0, v25); __ aesmc(v0, v0);
2858       __ aese(v0, v26); __ aesmc(v0, v0);
2859       __ aese(v0, v27); __ aesmc(v0, v0);
2860       __ aese(v0, v28); __ aesmc(v0, v0);
2861       __ aese(v0, v29); __ aesmc(v0, v0);
2862       __ aese(v0, v30);
2863       __ eor(v0, __ T16B, v0, v31);
2864 
2865       __ st1(v0, __ T16B, __ post(to, 16));
2866 
2867       __ subw(len_reg, len_reg, 16);
2868       __ cbnzw(len_reg, L_aes_loop);
2869 
2870       __ st1(v0, __ T16B, rvec);
2871 
2872       __ mov(r0, rscratch2);
2873 
2874       __ leave();
2875       __ ret(lr);
2876 
2877       return start;
2878   }
2879 
2880   // Arguments:
2881   //
2882   // Inputs:
2883   //   c_rarg0   - source byte array address
2884   //   c_rarg1   - destination byte array address
2885   //   c_rarg2   - K (key) in little endian int array
2886   //   c_rarg3   - r vector byte array address
2887   //   c_rarg4   - input length
2888   //
2889   // Output:
2890   //   r0        - input length
2891   //
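  // For reference, the CBC decryption recurrence implemented below, as
  // scalar pseudocode (illustrative only):
  //
  //   r = rvec;                        // previous ciphertext block (IV at first)
  //   for (i = 0; i < len; i += 16) {
  //     c = in[i..i+15];
  //     out[i..i+15] = AES_decrypt(c, K) ^ r;
  //     r = c;
  //   }
  //   rvec = r;                        // last ciphertext block, saved for the next invocation
  //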
2892   address generate_cipherBlockChaining_decryptAESCrypt() {
2893     assert(UseAES, "need AES cryptographic extension support");
2894     __ align(CodeEntryAlignment);
2895     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2896 
2897     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2898 
2899     const Register from        = c_rarg0;  // source array address
2900     const Register to          = c_rarg1;  // destination array address
2901     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
                                           // and left holding the last ciphertext block processed
2904     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2905     const Register keylen      = rscratch1;
2906 
2907     address start = __ pc();
2908 
2909       __ enter();
2910 
2911       __ movw(rscratch2, len_reg);
2912 
2913       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2914 
2915       __ ld1(v2, __ T16B, rvec);
2916 
2917       __ ld1(v31, __ T16B, __ post(key, 16));
2918       __ rev32(v31, __ T16B, v31);
2919 
2920       __ cmpw(keylen, 52);
2921       __ br(Assembler::CC, L_loadkeys_44);
2922       __ br(Assembler::EQ, L_loadkeys_52);
2923 
2924       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2925       __ rev32(v17, __ T16B, v17);
2926       __ rev32(v18, __ T16B, v18);
2927     __ BIND(L_loadkeys_52);
2928       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2929       __ rev32(v19, __ T16B, v19);
2930       __ rev32(v20, __ T16B, v20);
2931     __ BIND(L_loadkeys_44);
2932       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2933       __ rev32(v21, __ T16B, v21);
2934       __ rev32(v22, __ T16B, v22);
2935       __ rev32(v23, __ T16B, v23);
2936       __ rev32(v24, __ T16B, v24);
2937       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2938       __ rev32(v25, __ T16B, v25);
2939       __ rev32(v26, __ T16B, v26);
2940       __ rev32(v27, __ T16B, v27);
2941       __ rev32(v28, __ T16B, v28);
2942       __ ld1(v29, v30, __ T16B, key);
2943       __ rev32(v29, __ T16B, v29);
2944       __ rev32(v30, __ T16B, v30);
2945 
2946     __ BIND(L_aes_loop);
2947       __ ld1(v0, __ T16B, __ post(from, 16));
2948       __ orr(v1, __ T16B, v0, v0);
2949 
2950       __ br(Assembler::CC, L_rounds_44);
2951       __ br(Assembler::EQ, L_rounds_52);
2952 
2953       __ aesd(v0, v17); __ aesimc(v0, v0);
2954       __ aesd(v0, v18); __ aesimc(v0, v0);
2955     __ BIND(L_rounds_52);
2956       __ aesd(v0, v19); __ aesimc(v0, v0);
2957       __ aesd(v0, v20); __ aesimc(v0, v0);
2958     __ BIND(L_rounds_44);
2959       __ aesd(v0, v21); __ aesimc(v0, v0);
2960       __ aesd(v0, v22); __ aesimc(v0, v0);
2961       __ aesd(v0, v23); __ aesimc(v0, v0);
2962       __ aesd(v0, v24); __ aesimc(v0, v0);
2963       __ aesd(v0, v25); __ aesimc(v0, v0);
2964       __ aesd(v0, v26); __ aesimc(v0, v0);
2965       __ aesd(v0, v27); __ aesimc(v0, v0);
2966       __ aesd(v0, v28); __ aesimc(v0, v0);
2967       __ aesd(v0, v29); __ aesimc(v0, v0);
2968       __ aesd(v0, v30);
2969       __ eor(v0, __ T16B, v0, v31);
2970       __ eor(v0, __ T16B, v0, v2);
2971 
2972       __ st1(v0, __ T16B, __ post(to, 16));
2973       __ orr(v2, __ T16B, v1, v1);
2974 
2975       __ subw(len_reg, len_reg, 16);
2976       __ cbnzw(len_reg, L_aes_loop);
2977 
2978       __ st1(v2, __ T16B, rvec);
2979 
2980       __ mov(r0, rscratch2);
2981 
2982       __ leave();
2983       __ ret(lr);
2984 
2985     return start;
2986   }
2987 
2988   // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: in, the 128-bit addend; it is preserved.
2990   // The least-significant 64-bit word is in the upper dword of each vector.
2991   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
2992   // Output: result
2993   void be_add_128_64(FloatRegister result, FloatRegister in,
2994                      FloatRegister inc, FloatRegister tmp) {
2995     assert_different_registers(result, tmp, inc);
2996 
2997     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
2998                                            // input
    __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
3000     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
3001                                            // MSD == 0 (must be!) to LSD
3002     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
3003   }
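
  // For reference, a scalar sketch of the computation above (illustrative
  // only).  The low 64 bits of the big-endian value sit in lane 1 (the
  // upper dword), the high 64 bits in lane 0:
  //
  //   uint64_t lo = in.lane[1] + inc.lane[1];   // addv, low half
  //   uint64_t hi = in.lane[0];                 // inc.lane[0] == 0
  //   if (lo < inc.lane[1]) hi += 1;            // carry, via cm/ext/subv
  //   result = { hi, lo };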
3004 
3005   // CTR AES crypt.
3006   // Arguments:
3007   //
3008   // Inputs:
3009   //   c_rarg0   - source byte array address
3010   //   c_rarg1   - destination byte array address
3011   //   c_rarg2   - K (key) in little endian int array
3012   //   c_rarg3   - counter vector byte array address
3013   //   c_rarg4   - input length
3014   //   c_rarg5   - saved encryptedCounter start
3015   //   c_rarg6   - saved used length
3016   //
3017   // Output:
3018   //   r0       - input length
3019   //
3020   address generate_counterMode_AESCrypt() {
3021     const Register in = c_rarg0;
3022     const Register out = c_rarg1;
3023     const Register key = c_rarg2;
3024     const Register counter = c_rarg3;
3025     const Register saved_len = c_rarg4, len = r10;
3026     const Register saved_encrypted_ctr = c_rarg5;
3027     const Register used_ptr = c_rarg6, used = r12;
3028 
3029     const Register offset = r7;
3030     const Register keylen = r11;
3031 
3032     const unsigned char block_size = 16;
3033     const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly faster
    // performance with larger data sizes, but it also means that the
    // fast path isn't used until there are at least 8 blocks, and up
    // to 127 bytes of data will be processed on the slow path. For
    // that reason, and also so as not to blow away too much icache, 4
    // blocks seems like a sensible compromise.
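    //
    // For example (illustrative arithmetic only): with bulk_width == 8 the
    // bulk path handles multiples of 128 bytes, so a 200-byte request does
    // 128 bytes in the bulk loop and 72 block-by-block; with bulk_width == 4
    // it does 192 bytes in the bulk loop and only 8 block-by-block.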
3040 
3041     // Algorithm:
3042     //
3043     //    if (len == 0) {
3044     //        goto DONE;
3045     //    }
3046     //    int result = len;
3047     //    do {
3048     //        if (used >= blockSize) {
3049     //            if (len >= bulk_width * blockSize) {
3050     //                CTR_large_block();
3051     //                if (len == 0)
3052     //                    goto DONE;
3053     //            }
3054     //            for (;;) {
3055     //                16ByteVector v0 = counter;
3056     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3057     //                used = 0;
3058     //                if (len < blockSize)
3059     //                    break;    /* goto NEXT */
3060     //                16ByteVector v1 = load16Bytes(in, offset);
3061     //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(v1, out, offset);
3063     //                used = blockSize;
3064     //                offset += blockSize;
3065     //                len -= blockSize;
3066     //                if (len == 0)
3067     //                    goto DONE;
3068     //            }
3069     //        }
3070     //      NEXT:
3071     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3072     //        len--;
3073     //    } while (len != 0);
3074     //  DONE:
3075     //    return result;
3076     //
3077     // CTR_large_block()
3078     //    Wide bulk encryption of whole blocks.
3079 
3080     __ align(CodeEntryAlignment);
3081     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3082     const address start = __ pc();
3083     __ enter();
3084 
3085     Label DONE, CTR_large_block, large_block_return;
3086     __ ldrw(used, Address(used_ptr));
3087     __ cbzw(saved_len, DONE);
3088 
3089     __ mov(len, saved_len);
3090     __ mov(offset, 0);
3091 
3092     // Compute #rounds for AES based on the length of the key array
3093     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3094 
3095     __ aesenc_loadkeys(key, keylen);
3096 
3097     {
3098       Label L_CTR_loop, NEXT;
3099 
3100       __ bind(L_CTR_loop);
3101 
3102       __ cmp(used, block_size);
3103       __ br(__ LO, NEXT);
3104 
3105       // Maybe we have a lot of data
3106       __ subsw(rscratch1, len, bulk_width * block_size);
3107       __ br(__ HS, CTR_large_block);
3108       __ BIND(large_block_return);
3109       __ cbzw(len, DONE);
3110 
3111       // Setup the counter
3112       __ movi(v4, __ T4S, 0);
3113       __ movi(v5, __ T4S, 1);
3114       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3115 
3116       // 128-bit big-endian increment
3117       __ ld1(v0, __ T16B, counter);
3118       __ rev64(v16, __ T16B, v0);
3119       be_add_128_64(v16, v16, v4, /*tmp*/v5);
3120       __ rev64(v16, __ T16B, v16);
3121       __ st1(v16, __ T16B, counter);
3122       // Previous counter value is in v0
3123       // v4 contains { 0, 1 }
3124 
3125       {
3126         // We have fewer than bulk_width blocks of data left. Encrypt
3127         // them one by one until there is less than a full block
3128         // remaining, being careful to save both the encrypted counter
3129         // and the counter.
3130 
3131         Label inner_loop;
3132         __ bind(inner_loop);
3133         // Counter to encrypt is in v0
3134         __ aesecb_encrypt(noreg, noreg, keylen);
3135         __ st1(v0, __ T16B, saved_encrypted_ctr);
3136 
3137         // Do we have a remaining full block?
3138 
3139         __ mov(used, 0);
3140         __ cmp(len, block_size);
3141         __ br(__ LO, NEXT);
3142 
3143         // Yes, we have a full block
3144         __ ldrq(v1, Address(in, offset));
3145         __ eor(v1, __ T16B, v1, v0);
3146         __ strq(v1, Address(out, offset));
3147         __ mov(used, block_size);
3148         __ add(offset, offset, block_size);
3149 
3150         __ subw(len, len, block_size);
3151         __ cbzw(len, DONE);
3152 
3153         // Increment the counter, store it back
3154         __ orr(v0, __ T16B, v16, v16);
3155         __ rev64(v16, __ T16B, v16);
3156         be_add_128_64(v16, v16, v4, /*tmp*/v5);
3157         __ rev64(v16, __ T16B, v16);
3158         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3159 
3160         __ b(inner_loop);
3161       }
3162 
3163       __ BIND(NEXT);
3164 
3165       // Encrypt a single byte, and loop.
3166       // We expect this to be a rare event.
3167       __ ldrb(rscratch1, Address(in, offset));
3168       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3169       __ eor(rscratch1, rscratch1, rscratch2);
3170       __ strb(rscratch1, Address(out, offset));
3171       __ add(offset, offset, 1);
3172       __ add(used, used, 1);
      __ subw(len, len, 1);
3174       __ cbnzw(len, L_CTR_loop);
3175     }
3176 
3177     __ bind(DONE);
3178     __ strw(used, Address(used_ptr));
3179     __ mov(r0, saved_len);
3180 
3181     __ leave(); // required for proper stackwalking of RuntimeStub frame
3182     __ ret(lr);
3183 
3184     // Bulk encryption
3185 
    __ BIND(CTR_large_block);
3187     assert(bulk_width == 4 || bulk_width == 8, "must be");
3188 
3189     if (bulk_width == 8) {
3190       __ sub(sp, sp, 4 * 16);
3191       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3192     }
3193     __ sub(sp, sp, 4 * 16);
3194     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3195     RegSet saved_regs = (RegSet::of(in, out, offset)
3196                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3197     __ push(saved_regs, sp);
3198     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3199     __ add(in, in, offset);
3200     __ add(out, out, offset);
3201 
3202     // Keys should already be loaded into the correct registers
3203 
3204     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3205     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3206 
3207     // AES/CTR loop
3208     {
3209       Label L_CTR_loop;
3210       __ BIND(L_CTR_loop);
3211 
3212       // Setup the counters
3213       __ movi(v8, __ T4S, 0);
3214       __ movi(v9, __ T4S, 1);
3215       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3216 
3217       for (int i = 0; i < bulk_width; i++) {
3218         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3219         __ rev64(v0_ofs, __ T16B, v16);
3220         be_add_128_64(v16, v16, v8, /*tmp*/v9);
3221       }
3222 
3223       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3224 
3225       // Encrypt the counters
3226       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3227 
3228       if (bulk_width == 8) {
3229         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3230       }
3231 
3232       // XOR the encrypted counters with the inputs
3233       for (int i = 0; i < bulk_width; i++) {
3234         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3235         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3236         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3237       }
3238 
3239       // Write the encrypted data
3240       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3241       if (bulk_width == 8) {
3242         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3243       }
3244 
3245       __ subw(len, len, 16 * bulk_width);
3246       __ cbnzw(len, L_CTR_loop);
3247     }
3248 
3249     // Save the counter back where it goes
3250     __ rev64(v16, __ T16B, v16);
3251     __ st1(v16, __ T16B, counter);
3252 
3253     __ pop(saved_regs, sp);
3254 
3255     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3256     if (bulk_width == 8) {
3257       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3258     }
3259 
3260     __ andr(rscratch1, len, -16 * bulk_width);
3261     __ sub(len, len, rscratch1);
3262     __ add(offset, offset, rscratch1);
3263     __ mov(used, 16);
3264     __ strw(used, Address(used_ptr));
3265     __ b(large_block_return);
3266 
3267     return start;
3268   }
3269 
3270   // Vector AES Galois Counter Mode implementation. Parameters:
3271   //
3272   // in = c_rarg0
3273   // len = c_rarg1
  // ct = c_rarg2 - ciphertext that ghash will read (out for encrypt, in for decrypt)
3275   // out = c_rarg3
3276   // key = c_rarg4
3277   // state = c_rarg5 - GHASH.state
3278   // subkeyHtbl = c_rarg6 - powers of H
3279   // counter = c_rarg7 - 16 bytes of CTR
3280   // return - number of processed bytes
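  //
  // A rough sketch of the bulk GCM step performed here (illustrative only,
  // not the generated code): CTR-mode encryption followed by a GHASH
  // accumulation over the ciphertext,
  //
  //   for each 16-byte block i:
  //     out[i]  = in[i] ^ AES_encrypt(counter, key);
  //     counter = inc32(counter);          // 32-bit big-endian increment
  //     state   = (state ^ ct[i]) * H;     // multiply in GF(2^128)
  //
  // where ct is the ciphertext stream (out when encrypting, in when
  // decrypting) and the precomputed powers of H live in subkeyHtbl.  The
  // stub below runs the CTR pass first and the GHASH pass second.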
3281   address generate_galoisCounterMode_AESCrypt() {
3282     address ghash_polynomial = __ pc();
3283     __ emit_int64(0x87);  // The low-order bits of the field
3284                           // polynomial (i.e. p = z^7+z^2+z+1)
3285                           // repeated in the low and high parts of a
3286                           // 128-bit vector
3287     __ emit_int64(0x87);
3288 
3289     __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3291     address start = __ pc();
3292     __ enter();
3293 
3294     const Register in = c_rarg0;
3295     const Register len = c_rarg1;
3296     const Register ct = c_rarg2;
    const Register out = c_rarg3;
3299 
3300     const Register key = c_rarg4;
3301     const Register state = c_rarg5;
3302 
3303     const Register subkeyHtbl = c_rarg6;
3304 
    const Register counter = c_rarg7;  // 16 bytes of CTR, updated with the
                                       // incremented counter at the end
3306 
3307     const Register keylen = r10;
3308     // Save state before entering routine
3309     __ sub(sp, sp, 4 * 16);
3310     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3311     __ sub(sp, sp, 4 * 16);
3312     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3313 
3314     // __ andr(len, len, -512);
3315     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3316     __ str(len, __ pre(sp, -2 * wordSize));
3317 
3318     Label DONE;
3319     __ cbz(len, DONE);
3320 
3321     // Compute #rounds for AES based on the length of the key array
3322     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3323 
3324     __ aesenc_loadkeys(key, keylen);
3325     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3326     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3327 
3328     // AES/CTR loop
3329     {
3330       Label L_CTR_loop;
3331       __ BIND(L_CTR_loop);
3332 
3333       // Setup the counters
3334       __ movi(v8, __ T4S, 0);
3335       __ movi(v9, __ T4S, 1);
3336       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3337 
3338       assert(v0->encoding() < v8->encoding(), "");
3339       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3340         FloatRegister f = as_FloatRegister(i);
3341         __ rev32(f, __ T16B, v16);
3342         __ addv(v16, __ T4S, v16, v8);
3343       }
3344 
3345       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3346 
3347       // Encrypt the counters
3348       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3349 
3350       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3351 
3352       // XOR the encrypted counters with the inputs
3353       for (int i = 0; i < 8; i++) {
3354         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3355         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3356         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3357       }
3358       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3359       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3360 
3361       __ subw(len, len, 16 * 8);
3362       __ cbnzw(len, L_CTR_loop);
3363     }
3364 
3365     __ rev32(v16, __ T16B, v16);
3366     __ st1(v16, __ T16B, counter);
3367 
3368     __ ldr(len, Address(sp));
3369     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3370 
3371     // GHASH/CTR loop
3372     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3373                                 len, /*unrolls*/4);
3374 
3375 #ifdef ASSERT
3376     { Label L;
3377       __ cmp(len, (unsigned char)0);
3378       __ br(Assembler::EQ, L);
3379       __ stop("stubGenerator: abort");
3380       __ bind(L);
    }
3382 #endif
3383 
    __ bind(DONE);
3385     // Return the number of bytes processed
3386     __ ldr(r0, __ post(sp, 2 * wordSize));
3387 
3388     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3389     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3390 
3391     __ leave(); // required for proper stackwalking of RuntimeStub frame
3392     __ ret(lr);
    return start;
3394   }
3395 
3396   class Cached64Bytes {
3397   private:
3398     MacroAssembler *_masm;
3399     Register _regs[8];
3400 
3401   public:
3402     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "%u registers are used to cache 64 bytes (16 4-byte words)", rs.size());
3404       auto it = rs.begin();
3405       for (auto &r: _regs) {
3406         r = *it;
3407         ++it;
3408       }
3409     }
3410 
3411     void gen_loads(Register base) {
3412       for (int i = 0; i < 8; i += 2) {
3413         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3414       }
3415     }
3416 
3417     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3418     void extract_u32(Register dest, int i) {
3419       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3420     }
3421   };
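
  // Cached64Bytes keeps one 64-byte MD5 block in eight 64-bit registers:
  // _regs[j] holds bytes 8*j .. 8*j+7 of the block (little-endian), so the
  // i-th 32-bit message word x[i] lives in _regs[i / 2] at bit offset
  // 32 * (i % 2).  For example (illustrative only):
  //
  //   reg_cache.extract_u32(rscratch1, 5);   // rscratch1 = x[5] = bytes 20..23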
3422 
  // Utility routines for MD5.
  // Clobbers r10 and r11.
3425   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3426               int k, int s, int t) {
3427     Register rscratch3 = r10;
3428     Register rscratch4 = r11;
3429 
3430     __ eorw(rscratch3, r3, r4);
3431     __ movw(rscratch2, t);
3432     __ andw(rscratch3, rscratch3, r2);
3433     __ addw(rscratch4, r1, rscratch2);
3434     reg_cache.extract_u32(rscratch1, k);
3435     __ eorw(rscratch3, rscratch3, r4);
3436     __ addw(rscratch4, rscratch4, rscratch1);
3437     __ addw(rscratch3, rscratch3, rscratch4);
3438     __ rorw(rscratch2, rscratch3, 32 - s);
3439     __ addw(r1, rscratch2, r2);
3440   }
3441 
3442   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3443               int k, int s, int t) {
3444     Register rscratch3 = r10;
3445     Register rscratch4 = r11;
3446 
3447     __ andw(rscratch3, r2, r4);
3448     __ bicw(rscratch4, r3, r4);
3449     reg_cache.extract_u32(rscratch1, k);
3450     __ movw(rscratch2, t);
3451     __ orrw(rscratch3, rscratch3, rscratch4);
3452     __ addw(rscratch4, r1, rscratch2);
3453     __ addw(rscratch4, rscratch4, rscratch1);
3454     __ addw(rscratch3, rscratch3, rscratch4);
3455     __ rorw(rscratch2, rscratch3, 32 - s);
3456     __ addw(r1, rscratch2, r2);
3457   }
3458 
3459   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3460               int k, int s, int t) {
3461     Register rscratch3 = r10;
3462     Register rscratch4 = r11;
3463 
3464     __ eorw(rscratch3, r3, r4);
3465     __ movw(rscratch2, t);
3466     __ addw(rscratch4, r1, rscratch2);
3467     reg_cache.extract_u32(rscratch1, k);
3468     __ eorw(rscratch3, rscratch3, r2);
3469     __ addw(rscratch4, rscratch4, rscratch1);
3470     __ addw(rscratch3, rscratch3, rscratch4);
3471     __ rorw(rscratch2, rscratch3, 32 - s);
3472     __ addw(r1, rscratch2, r2);
3473   }
3474 
3475   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3476               int k, int s, int t) {
3477     Register rscratch3 = r10;
3478     Register rscratch4 = r11;
3479 
3480     __ movw(rscratch3, t);
3481     __ ornw(rscratch2, r2, r4);
3482     __ addw(rscratch4, r1, rscratch3);
3483     reg_cache.extract_u32(rscratch1, k);
3484     __ eorw(rscratch3, rscratch2, r3);
3485     __ addw(rscratch4, rscratch4, rscratch1);
3486     __ addw(rscratch3, rscratch3, rscratch4);
3487     __ rorw(rscratch2, rscratch3, 32 - s);
3488     __ addw(r1, rscratch2, r2);
3489   }
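
  // For reference, each helper above implements one step of the textbook
  // MD5 round (RFC 1321), a = b + rotl32(a + f(b, c, d) + x[k] + t, s),
  // where the boolean functions are
  //
  //   FF: f = (b & c) | (~b & d)    (computed above as ((c ^ d) & b) ^ d)
  //   GG: f = (b & d) | (c & ~d)
  //   HH: f = b ^ c ^ d
  //   II: f = c ^ (b | ~d)
  //
  // and the left-rotate by s is done with rorw(., 32 - s).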
3490 
3491   // Arguments:
3492   //
3493   // Inputs:
3494   //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   MD5.state
3496   //   c_rarg2   - int     offset
3497   //   c_rarg3   - int     limit
3498   //
3499   address generate_md5_implCompress(bool multi_block, const char *name) {
3500     __ align(CodeEntryAlignment);
3501     StubCodeMark mark(this, "StubRoutines", name);
3502     address start = __ pc();
3503 
3504     Register buf       = c_rarg0;
3505     Register state     = c_rarg1;
3506     Register ofs       = c_rarg2;
3507     Register limit     = c_rarg3;
3508     Register a         = r4;
3509     Register b         = r5;
3510     Register c         = r6;
3511     Register d         = r7;
3512     Register rscratch3 = r10;
3513     Register rscratch4 = r11;
3514 
3515     Register state_regs[2] = { r12, r13 };
3516     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3517     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3518 
3519     __ push(saved_regs, sp);
3520 
3521     __ ldp(state_regs[0], state_regs[1], Address(state));
3522     __ ubfx(a, state_regs[0],  0, 32);
3523     __ ubfx(b, state_regs[0], 32, 32);
3524     __ ubfx(c, state_regs[1],  0, 32);
3525     __ ubfx(d, state_regs[1], 32, 32);
3526 
3527     Label md5_loop;
3528     __ BIND(md5_loop);
3529 
3530     reg_cache.gen_loads(buf);
3531 
3532     // Round 1
3533     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3534     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3535     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3536     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3537     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3538     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3539     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3540     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3541     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3542     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3543     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3544     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3545     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3546     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3547     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3548     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3549 
3550     // Round 2
3551     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3552     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3553     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3554     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3555     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3556     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3557     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3558     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3559     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3560     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3561     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3562     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3563     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3564     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3565     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3566     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3567 
3568     // Round 3
3569     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3570     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3571     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3572     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3573     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3574     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3575     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3576     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3577     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3578     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3579     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3580     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3581     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3582     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3583     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3584     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3585 
3586     // Round 4
3587     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3588     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3589     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3590     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3591     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3592     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3593     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3594     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3595     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3596     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3597     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3598     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3599     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3600     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3601     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3602     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3603 
3604     __ addw(a, state_regs[0], a);
3605     __ ubfx(rscratch2, state_regs[0], 32, 32);
3606     __ addw(b, rscratch2, b);
3607     __ addw(c, state_regs[1], c);
3608     __ ubfx(rscratch4, state_regs[1], 32, 32);
3609     __ addw(d, rscratch4, d);
3610 
3611     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3612     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3613 
3614     if (multi_block) {
3615       __ add(buf, buf, 64);
3616       __ add(ofs, ofs, 64);
3617       __ cmp(ofs, limit);
3618       __ br(Assembler::LE, md5_loop);
3619       __ mov(c_rarg0, ofs); // return ofs
3620     }
3621 
3622     // write hash values back in the correct order
3623     __ stp(state_regs[0], state_regs[1], Address(state));
3624 
3625     __ pop(saved_regs, sp);
3626 
3627     __ ret(lr);
3628 
3629     return start;
3630   }
3631 
3632   // Arguments:
3633   //
3634   // Inputs:
3635   //   c_rarg0   - byte[]  source+offset
3636   //   c_rarg1   - int[]   SHA.state
3637   //   c_rarg2   - int     offset
3638   //   c_rarg3   - int     limit
3639   //
3640   address generate_sha1_implCompress(bool multi_block, const char *name) {
3641     __ align(CodeEntryAlignment);
3642     StubCodeMark mark(this, "StubRoutines", name);
3643     address start = __ pc();
3644 
3645     Register buf   = c_rarg0;
3646     Register state = c_rarg1;
3647     Register ofs   = c_rarg2;
3648     Register limit = c_rarg3;
3649 
3650     Label keys;
3651     Label sha1_loop;
3652 
3653     // load the keys into v0..v3
3654     __ adr(rscratch1, keys);
3655     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3656     // load 5 words state into v6, v7
3657     __ ldrq(v6, Address(state, 0));
3658     __ ldrs(v7, Address(state, 16));
3659 
3660 
3661     __ BIND(sha1_loop);
3662     // load 64 bytes of data into v16..v19
3663     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3664     __ rev32(v16, __ T16B, v16);
3665     __ rev32(v17, __ T16B, v17);
3666     __ rev32(v18, __ T16B, v18);
3667     __ rev32(v19, __ T16B, v19);
3668 
3669     // do the sha1
3670     __ addv(v4, __ T4S, v16, v0);
3671     __ orr(v20, __ T16B, v6, v6);
3672 
3673     FloatRegister d0 = v16;
3674     FloatRegister d1 = v17;
3675     FloatRegister d2 = v18;
3676     FloatRegister d3 = v19;
3677 
3678     for (int round = 0; round < 20; round++) {
3679       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3680       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3681       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3682       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3683       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3684 
3685       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3686       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3687       __ sha1h(tmp2, __ T4S, v20);
3688       if (round < 5)
3689         __ sha1c(v20, __ T4S, tmp3, tmp4);
3690       else if (round < 10 || round >= 15)
3691         __ sha1p(v20, __ T4S, tmp3, tmp4);
3692       else
3693         __ sha1m(v20, __ T4S, tmp3, tmp4);
3694       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3695 
3696       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3697     }
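
    // Each pass of the loop above retires four of SHA-1's 80 rounds
    // (20 x 4).  The instruction choice follows the SHA-1 round functions:
    // sha1c (Ch) for rounds 0-19, sha1p (Parity) for rounds 20-39 and
    // 60-79, and sha1m (Maj) for rounds 40-59.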
3698 
3699     __ addv(v7, __ T2S, v7, v21);
3700     __ addv(v6, __ T4S, v6, v20);
3701 
3702     if (multi_block) {
3703       __ add(ofs, ofs, 64);
3704       __ cmp(ofs, limit);
3705       __ br(Assembler::LE, sha1_loop);
3706       __ mov(c_rarg0, ofs); // return ofs
3707     }
3708 
3709     __ strq(v6, Address(state, 0));
3710     __ strs(v7, Address(state, 16));
3711 
3712     __ ret(lr);
3713 
3714     __ bind(keys);
3715     __ emit_int32(0x5a827999);
3716     __ emit_int32(0x6ed9eba1);
3717     __ emit_int32(0x8f1bbcdc);
3718     __ emit_int32(0xca62c1d6);
3719 
3720     return start;
3721   }
3722 
3723 
3724   // Arguments:
3725   //
3726   // Inputs:
3727   //   c_rarg0   - byte[]  source+offset
3728   //   c_rarg1   - int[]   SHA.state
3729   //   c_rarg2   - int     offset
3730   //   c_rarg3   - int     limit
3731   //
3732   address generate_sha256_implCompress(bool multi_block, const char *name) {
3733     static const uint32_t round_consts[64] = {
3734       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3735       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3736       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3737       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3738       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3739       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3740       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3741       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3742       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3743       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3744       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3745       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3746       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3747       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3748       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3749       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3750     };
3751     __ align(CodeEntryAlignment);
3752     StubCodeMark mark(this, "StubRoutines", name);
3753     address start = __ pc();
3754 
3755     Register buf   = c_rarg0;
3756     Register state = c_rarg1;
3757     Register ofs   = c_rarg2;
3758     Register limit = c_rarg3;
3759 
    Label sha256_loop;
3761 
3762     __ stpd(v8, v9, __ pre(sp, -32));
3763     __ stpd(v10, v11, Address(sp, 16));
3764 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7
3772 
3773     // load 16 keys to v16..v31
3774     __ lea(rscratch1, ExternalAddress((address)round_consts));
3775     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3776     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3777     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3778     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3779 
3780     // load 8 words (256 bits) state
3781     __ ldpq(v0, v1, state);
3782 
    __ BIND(sha256_loop);
3784     // load 64 bytes of data into v8..v11
3785     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3786     __ rev32(v8, __ T16B, v8);
3787     __ rev32(v9, __ T16B, v9);
3788     __ rev32(v10, __ T16B, v10);
3789     __ rev32(v11, __ T16B, v11);
3790 
3791     __ addv(v6, __ T4S, v8, v16);
3792     __ orr(v2, __ T16B, v0, v0);
3793     __ orr(v3, __ T16B, v1, v1);
3794 
3795     FloatRegister d0 = v8;
3796     FloatRegister d1 = v9;
3797     FloatRegister d2 = v10;
3798     FloatRegister d3 = v11;
3799 
3800 
3801     for (int round = 0; round < 16; round++) {
3802       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3803       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3804       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3805       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3806 
3807       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3808        __ orr(v4, __ T16B, v2, v2);
3809       if (round < 15)
3810         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3811       __ sha256h(v2, __ T4S, v3, tmp2);
3812       __ sha256h2(v3, __ T4S, v4, tmp2);
3813       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3814 
3815       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3816     }
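
    // Each pass of the loop above retires four of SHA-256's 64 rounds
    // (16 x 4).  sha256h/sha256h2 advance the two halves of the working
    // state, while sha256su0/sha256su1 (first 12 passes) extend the 16
    // loaded message words to the full 64-word schedule.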
3817 
3818     __ addv(v0, __ T4S, v0, v2);
3819     __ addv(v1, __ T4S, v1, v3);
3820 
3821     if (multi_block) {
3822       __ add(ofs, ofs, 64);
3823       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
3825       __ mov(c_rarg0, ofs); // return ofs
3826     }
3827 
3828     __ ldpd(v10, v11, Address(sp, 16));
3829     __ ldpd(v8, v9, __ post(sp, 32));
3830 
3831     __ stpq(v0, v1, state);
3832 
3833     __ ret(lr);
3834 
3835     return start;
3836   }
3837 
3838   // Double rounds for sha512.
3839   void sha512_dround(int dr,
3840                      FloatRegister vi0, FloatRegister vi1,
3841                      FloatRegister vi2, FloatRegister vi3,
3842                      FloatRegister vi4, FloatRegister vrc0,
3843                      FloatRegister vrc1, FloatRegister vin0,
3844                      FloatRegister vin1, FloatRegister vin2,
3845                      FloatRegister vin3, FloatRegister vin4) {
3846       if (dr < 36) {
3847         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3848       }
3849       __ addv(v5, __ T2D, vrc0, vin0);
3850       __ ext(v6, __ T16B, vi2, vi3, 8);
3851       __ ext(v5, __ T16B, v5, v5, 8);
3852       __ ext(v7, __ T16B, vi1, vi2, 8);
3853       __ addv(vi3, __ T2D, vi3, v5);
3854       if (dr < 32) {
3855         __ ext(v5, __ T16B, vin3, vin4, 8);
3856         __ sha512su0(vin0, __ T2D, vin1);
3857       }
3858       __ sha512h(vi3, __ T2D, v6, v7);
3859       if (dr < 32) {
3860         __ sha512su1(vin0, __ T2D, vin2, v5);
3861       }
3862       __ addv(vi4, __ T2D, vi1, vi3);
3863       __ sha512h2(vi3, __ T2D, vi1, vi0);
3864   }
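
  // The compression loop below issues 40 of these double rounds to cover
  // SHA-512's 80 rounds (each sha512h/sha512h2 pair retires two).  Schedule
  // extension (sha512su0/sha512su1) is only needed while fresh message words
  // are still required (dr < 32), and a new round-constant pair is fetched
  // while any remain (dr < 36): 4 pairs preloaded + 36 fetched = 80 constants.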
3865 
3866   // Arguments:
3867   //
3868   // Inputs:
3869   //   c_rarg0   - byte[]  source+offset
3870   //   c_rarg1   - int[]   SHA.state
3871   //   c_rarg2   - int     offset
3872   //   c_rarg3   - int     limit
3873   //
3874   address generate_sha512_implCompress(bool multi_block, const char *name) {
3875     static const uint64_t round_consts[80] = {
3876       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3877       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3878       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3879       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3880       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3881       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3882       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3883       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3884       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3885       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3886       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3887       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3888       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3889       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3890       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3891       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3892       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3893       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3894       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3895       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3896       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3897       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3898       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3899       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3900       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3901       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3902       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3903     };
3904 
3905     __ align(CodeEntryAlignment);
3906     StubCodeMark mark(this, "StubRoutines", name);
3907     address start = __ pc();
3908 
3909     Register buf   = c_rarg0;
3910     Register state = c_rarg1;
3911     Register ofs   = c_rarg2;
3912     Register limit = c_rarg3;
3913 
3914     __ stpd(v8, v9, __ pre(sp, -64));
3915     __ stpd(v10, v11, Address(sp, 16));
3916     __ stpd(v12, v13, Address(sp, 32));
3917     __ stpd(v14, v15, Address(sp, 48));
3918 
3919     Label sha512_loop;
3920 
3921     // load state
3922     __ ld1(v8, v9, v10, v11, __ T2D, state);
3923 
3924     // load first 4 round constants
3925     __ lea(rscratch1, ExternalAddress((address)round_consts));
3926     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3927 
3928     __ BIND(sha512_loop);
3929     // load 128B of data into v12..v19
3930     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3931     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3932     __ rev64(v12, __ T16B, v12);
3933     __ rev64(v13, __ T16B, v13);
3934     __ rev64(v14, __ T16B, v14);
3935     __ rev64(v15, __ T16B, v15);
3936     __ rev64(v16, __ T16B, v16);
3937     __ rev64(v17, __ T16B, v17);
3938     __ rev64(v18, __ T16B, v18);
3939     __ rev64(v19, __ T16B, v19);
3940 
3941     __ mov(rscratch2, rscratch1);
3942 
3943     __ mov(v0, __ T16B, v8);
3944     __ mov(v1, __ T16B, v9);
3945     __ mov(v2, __ T16B, v10);
3946     __ mov(v3, __ T16B, v11);
3947 
3948     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3949     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3950     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3951     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3952     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3953     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3954     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3955     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3956     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3957     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3958     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3959     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3960     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3961     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3962     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3963     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3964     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3965     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3966     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3967     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3968     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3969     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3970     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3971     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3972     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3973     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3974     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3975     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3976     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3977     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3978     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3979     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3980     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3981     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3982     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3983     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3984     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3985     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3986     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3987     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3988 
3989     __ addv(v8, __ T2D, v8, v0);
3990     __ addv(v9, __ T2D, v9, v1);
3991     __ addv(v10, __ T2D, v10, v2);
3992     __ addv(v11, __ T2D, v11, v3);
3993 
3994     if (multi_block) {
3995       __ add(ofs, ofs, 128);
3996       __ cmp(ofs, limit);
3997       __ br(Assembler::LE, sha512_loop);
3998       __ mov(c_rarg0, ofs); // return ofs
3999     }
4000 
4001     __ st1(v8, v9, v10, v11, __ T2D, state);
4002 
4003     __ ldpd(v14, v15, Address(sp, 48));
4004     __ ldpd(v12, v13, Address(sp, 32));
4005     __ ldpd(v10, v11, Address(sp, 16));
4006     __ ldpd(v8, v9, __ post(sp, 64));
4007 
4008     __ ret(lr);
4009 
4010     return start;
4011   }
4012 
4013   // Arguments:
4014   //
4015   // Inputs:
4016   //   c_rarg0   - byte[]  source+offset
4017   //   c_rarg1   - byte[]  SHA.state
4018   //   c_rarg2   - int     block_size
4019   //   c_rarg3   - int     offset
4020   //   c_rarg4   - int     limit
4021   //
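  // block_size is the sponge rate in bytes and selects the variant; the code
  // below decodes it with single-bit tests (bit 7 first, then bits 4 and 5).
  // For reference:
  //
  //   rate  72 -> SHA3-512
  //   rate 104 -> SHA3-384
  //   rate 136 -> SHA3-256 or SHAKE256
  //   rate 144 -> SHA3-224
  //   rate 168 -> SHAKE128
  //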
4022   address generate_sha3_implCompress(bool multi_block, const char *name) {
4023     static const uint64_t round_consts[24] = {
4024       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4025       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4026       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4027       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4028       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4029       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4030       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4031       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4032     };
4033 
4034     __ align(CodeEntryAlignment);
4035     StubCodeMark mark(this, "StubRoutines", name);
4036     address start = __ pc();
4037 
4038     Register buf           = c_rarg0;
4039     Register state         = c_rarg1;
4040     Register block_size    = c_rarg2;
4041     Register ofs           = c_rarg3;
4042     Register limit         = c_rarg4;
4043 
4044     Label sha3_loop, rounds24_loop;
4045     Label sha3_512_or_sha3_384, shake128;
4046 
4047     __ stpd(v8, v9, __ pre(sp, -64));
4048     __ stpd(v10, v11, Address(sp, 16));
4049     __ stpd(v12, v13, Address(sp, 32));
4050     __ stpd(v14, v15, Address(sp, 48));
4051 
4052     // load state
4053     __ add(rscratch1, state, 32);
4054     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
4055     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
4056     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4057     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4058     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4059     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4060     __ ld1(v24, __ T1D, rscratch1);
4061 
4062     __ BIND(sha3_loop);
4063 
4064     // 24 keccak rounds
4065     __ movw(rscratch2, 24);
4066 
4067     // load round_constants base
4068     __ lea(rscratch1, ExternalAddress((address) round_consts));
4069 
4070     // load input
4071     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4072     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4073     __ eor(v0, __ T8B, v0, v25);
4074     __ eor(v1, __ T8B, v1, v26);
4075     __ eor(v2, __ T8B, v2, v27);
4076     __ eor(v3, __ T8B, v3, v28);
4077     __ eor(v4, __ T8B, v4, v29);
4078     __ eor(v5, __ T8B, v5, v30);
4079     __ eor(v6, __ T8B, v6, v31);
4080 
4081     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4082     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4083 
4084     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4085     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4086     __ eor(v7, __ T8B, v7, v25);
4087     __ eor(v8, __ T8B, v8, v26);
4088     __ eor(v9, __ T8B, v9, v27);
4089     __ eor(v10, __ T8B, v10, v28);
4090     __ eor(v11, __ T8B, v11, v29);
4091     __ eor(v12, __ T8B, v12, v30);
4092     __ eor(v13, __ T8B, v13, v31);
4093 
4094     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4095     __ eor(v14, __ T8B, v14, v25);
4096     __ eor(v15, __ T8B, v15, v26);
4097     __ eor(v16, __ T8B, v16, v27);
4098 
4099     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4100     __ andw(c_rarg5, block_size, 48);
4101     __ cbzw(c_rarg5, rounds24_loop);
4102 
4103     __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
4105     __ ldrd(v28, __ post(buf, 8));
4106     __ eor(v17, __ T8B, v17, v28);
4107     __ b(rounds24_loop);
4108 
4109     __ BIND(shake128);
4110     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4111     __ eor(v17, __ T8B, v17, v28);
4112     __ eor(v18, __ T8B, v18, v29);
4113     __ eor(v19, __ T8B, v19, v30);
4114     __ eor(v20, __ T8B, v20, v31);
4115     __ b(rounds24_loop); // block_size == 168, SHAKE128
4116 
4117     __ BIND(sha3_512_or_sha3_384);
4118     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4119     __ eor(v7, __ T8B, v7, v25);
4120     __ eor(v8, __ T8B, v8, v26);
4121     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4122 
4123     // SHA3-384
4124     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4125     __ eor(v9,  __ T8B, v9,  v27);
4126     __ eor(v10, __ T8B, v10, v28);
4127     __ eor(v11, __ T8B, v11, v29);
4128     __ eor(v12, __ T8B, v12, v30);
4129 
4130     __ BIND(rounds24_loop);
4131     __ subw(rscratch2, rscratch2, 1);
4132 
4133     __ eor3(v29, __ T16B, v4, v9, v14);
4134     __ eor3(v26, __ T16B, v1, v6, v11);
4135     __ eor3(v28, __ T16B, v3, v8, v13);
4136     __ eor3(v25, __ T16B, v0, v5, v10);
4137     __ eor3(v27, __ T16B, v2, v7, v12);
4138     __ eor3(v29, __ T16B, v29, v19, v24);
4139     __ eor3(v26, __ T16B, v26, v16, v21);
4140     __ eor3(v28, __ T16B, v28, v18, v23);
4141     __ eor3(v25, __ T16B, v25, v15, v20);
4142     __ eor3(v27, __ T16B, v27, v17, v22);
4143 
4144     __ rax1(v30, __ T2D, v29, v26);
4145     __ rax1(v26, __ T2D, v26, v28);
4146     __ rax1(v28, __ T2D, v28, v25);
4147     __ rax1(v25, __ T2D, v25, v27);
4148     __ rax1(v27, __ T2D, v27, v29);
4149 
4150     __ eor(v0, __ T16B, v0, v30);
4151     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4152     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4153     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4154     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4155     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4156     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4157     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4158     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4159     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4160     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4161     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4162     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4163     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4164     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4165     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4166     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4167     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4168     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4169     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4170     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4171     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4172     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4173     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4174     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4175 
4176     __ bcax(v20, __ T16B, v31, v22, v8);
4177     __ bcax(v21, __ T16B, v8,  v23, v22);
4178     __ bcax(v22, __ T16B, v22, v24, v23);
4179     __ bcax(v23, __ T16B, v23, v31, v24);
4180     __ bcax(v24, __ T16B, v24, v8,  v31);
4181 
4182     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4183 
4184     __ bcax(v17, __ T16B, v25, v19, v3);
4185     __ bcax(v18, __ T16B, v3,  v15, v19);
4186     __ bcax(v19, __ T16B, v19, v16, v15);
4187     __ bcax(v15, __ T16B, v15, v25, v16);
4188     __ bcax(v16, __ T16B, v16, v3,  v25);
4189 
4190     __ bcax(v10, __ T16B, v29, v12, v26);
4191     __ bcax(v11, __ T16B, v26, v13, v12);
4192     __ bcax(v12, __ T16B, v12, v14, v13);
4193     __ bcax(v13, __ T16B, v13, v29, v14);
4194     __ bcax(v14, __ T16B, v14, v26, v29);
4195 
4196     __ bcax(v7, __ T16B, v30, v9,  v4);
4197     __ bcax(v8, __ T16B, v4,  v5,  v9);
4198     __ bcax(v9, __ T16B, v9,  v6,  v5);
4199     __ bcax(v5, __ T16B, v5,  v30, v6);
4200     __ bcax(v6, __ T16B, v6,  v4,  v30);
4201 
4202     __ bcax(v3, __ T16B, v27, v0,  v28);
4203     __ bcax(v4, __ T16B, v28, v1,  v0);
4204     __ bcax(v0, __ T16B, v0,  v2,  v1);
4205     __ bcax(v1, __ T16B, v1,  v27, v2);
4206     __ bcax(v2, __ T16B, v2,  v28, v27);
4207 
4208     __ eor(v0, __ T16B, v0, v31);
4209 
4210     __ cbnzw(rscratch2, rounds24_loop);
4211 
4212     if (multi_block) {
4213       __ add(ofs, ofs, block_size);
4214       __ cmp(ofs, limit);
4215       __ br(Assembler::LE, sha3_loop);
4216       __ mov(c_rarg0, ofs); // return ofs
4217     }
4218 
4219     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4220     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4221     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4222     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4223     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4224     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4225     __ st1(v24, __ T1D, state);
4226 
4227     __ ldpd(v14, v15, Address(sp, 48));
4228     __ ldpd(v12, v13, Address(sp, 32));
4229     __ ldpd(v10, v11, Address(sp, 16));
4230     __ ldpd(v8, v9, __ post(sp, 64));
4231 
4232     __ ret(lr);
4233 
4234     return start;
4235   }
4236 
4237   /**
4238    *  Arguments:
4239    *
4240    * Inputs:
4241    *   c_rarg0   - int crc
4242    *   c_rarg1   - byte* buf
4243    *   c_rarg2   - int length
4244    *
4245    * Output:
4246    *       r0   - int crc result
4247    */
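  // For reference, a minimal byte-at-a-time sketch of the CRC-32 update this
  // stub accelerates (reflected polynomial 0xEDB88320, as used by
  // java.util.zip.CRC32). This only illustrates the math; the table layout,
  // pre/post inversion and unrolling are implementation details of the
  // kernel_crc32 macro assembler routine.
  //
  //   static uint32_t crc32_ref(uint32_t crc, const uint8_t* buf, size_t len) {
  //     crc = ~crc;
  //     while (len--) {
  //       crc ^= *buf++;
  //       for (int k = 0; k < 8; k++)
  //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
  //     }
  //     return ~crc;
  //   }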
4248   address generate_updateBytesCRC32() {
4249     assert(UseCRC32Intrinsics, "what are we doing here?");
4250 
4251     __ align(CodeEntryAlignment);
4252     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4253 
4254     address start = __ pc();
4255 
4256     const Register crc   = c_rarg0;  // crc
4257     const Register buf   = c_rarg1;  // source java byte array address
4258     const Register len   = c_rarg2;  // length
4259     const Register table0 = c_rarg3; // crc_table address
4260     const Register table1 = c_rarg4;
4261     const Register table2 = c_rarg5;
4262     const Register table3 = c_rarg6;
4263     const Register tmp3 = c_rarg7;
4264 
4265     BLOCK_COMMENT("Entry:");
4266     __ enter(); // required for proper stackwalking of RuntimeStub frame
4267 
4268     __ kernel_crc32(crc, buf, len,
4269               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4270 
4271     __ leave(); // required for proper stackwalking of RuntimeStub frame
4272     __ ret(lr);
4273 
4274     return start;
4275   }
4276 
4277   // ChaCha20 block function.  This version parallelizes by loading
4278   // individual 32-bit state elements into vectors for four blocks
4279   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4280   //
4281   // state (int[16]) = c_rarg0
4282   // keystream (byte[1024]) = c_rarg1
4283   // return - number of bytes of keystream (always 256)
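  // For reference, the ChaCha20 quarter round (RFC 8439) that
  // cc20_quarter_round applies lane-wise to four blocks at once, shown here
  // in scalar form on 32-bit words (illustration only):
  //
  //   a += b; d ^= a; d = rotl32(d, 16);
  //   c += d; b ^= c; b = rotl32(b, 12);
  //   a += b; d ^= a; d = rotl32(d, 8);   // the 8-bit rotate uses the tbl table below
  //   c += d; b ^= c; b = rotl32(b, 7);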
4284   address generate_chacha20Block_blockpar() {
4285     Label L_twoRounds, L_cc20_const;
4286     // The constant data is broken into two 128-bit segments to be loaded
4287     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4288     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4289     // The second 128 bits are a table constant used for 8-bit left rotations.
4290     __ BIND(L_cc20_const);
4291     __ emit_int64(0x0000000100000000UL);
4292     __ emit_int64(0x0000000300000002UL);
4293     __ emit_int64(0x0605040702010003UL);
4294     __ emit_int64(0x0E0D0C0F0A09080BUL);
4295 
4296     __ align(CodeEntryAlignment);
4297     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4298     address start = __ pc();
4299     __ enter();
4300 
4301     int i, j;
4302     const Register state = c_rarg0;
4303     const Register keystream = c_rarg1;
4304     const Register loopCtr = r10;
4305     const Register tmpAddr = r11;
4306 
4307     const FloatRegister stateFirst = v0;
4308     const FloatRegister stateSecond = v1;
4309     const FloatRegister stateThird = v2;
4310     const FloatRegister stateFourth = v3;
4311     const FloatRegister origCtrState = v28;
4312     const FloatRegister scratch = v29;
4313     const FloatRegister lrot8Tbl = v30;
4314 
4315     // Organize SIMD registers in an array that facilitates
4316     // putting repetitive opcodes into loop structures.  It is
4317     // important that each grouping of 4 registers is monotonically
4318     // increasing to support the requirements of multi-register
4319     // instructions (e.g. ld4r, st4, etc.)
4320     const FloatRegister workSt[16] = {
4321          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4322         v20, v21, v22, v23, v24, v25, v26, v27
4323     };
4324 
4325     // Load from memory and interlace across 16 SIMD registers,
4326     // with each word from memory being broadcast to all lanes of
4327     // each successive SIMD register.
4328     //      Addr(0) -> All lanes in workSt[i]
4329     //      Addr(4) -> All lanes in workSt[i + 1], etc.
4330     __ mov(tmpAddr, state);
4331     for (i = 0; i < 16; i += 4) {
4332       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4333           __ post(tmpAddr, 16));
4334     }
4335 
4336     // Pull in constant data.  The first 16 bytes are the add overlay
4337     // which is applied to the vector holding the counter (state[12]).
4338     // The second 16 bytes are the lookup table used by the tbl
4339     // instruction to perform 8-bit left rotations.
4340     __ adr(tmpAddr, L_cc20_const);
4341     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4342     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4343 
4344     // Set up the 10 iteration loop and perform all 8 quarter round ops
4345     __ mov(loopCtr, 10);
4346     __ BIND(L_twoRounds);
4347 
4348     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4349         scratch, lrot8Tbl);
4350     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4351         scratch, lrot8Tbl);
4352     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4353         scratch, lrot8Tbl);
4354     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4355         scratch, lrot8Tbl);
4356 
4357     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4358         scratch, lrot8Tbl);
4359     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4360         scratch, lrot8Tbl);
4361     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4362         scratch, lrot8Tbl);
4363     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4364         scratch, lrot8Tbl);
4365 
4366     // Decrement and iterate
4367     __ sub(loopCtr, loopCtr, 1);
4368     __ cbnz(loopCtr, L_twoRounds);
4369 
4370     __ mov(tmpAddr, state);
4371 
4372     // Add the starting state back to the post-loop keystream
4373     // state.  We read/interlace the state array from memory into
4374     // 4 registers similar to what we did in the beginning.  Then
4375     // add the counter overlay onto workSt[12] at the end.
4376     for (i = 0; i < 16; i += 4) {
4377       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4378           __ post(tmpAddr, 16));
4379       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4380       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4381       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4382       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4383     }
4384     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4385 
4386     // Write to key stream, storing the same element out of workSt[0..15]
4387     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4388     // for the next element position.
4389     for (i = 0; i < 4; i++) {
4390       for (j = 0; j < 16; j += 4) {
4391         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4392             __ post(keystream, 16));
4393       }
4394     }
4395 
4396     __ mov(r0, 256);             // Return length of output keystream
4397     __ leave();
4398     __ ret(lr);
4399 
4400     return start;
4401   }
4402 
4403   /**
4404    *  Arguments:
4405    *
4406    * Inputs:
4407    *   c_rarg0   - int crc
4408    *   c_rarg1   - byte* buf
4409    *   c_rarg2   - int length
4410    *   c_rarg3   - int* table
4411    *
4412    * Output:
4413    *       r0   - int crc result
4414    */
4415   address generate_updateBytesCRC32C() {
4416     assert(UseCRC32CIntrinsics, "what are we doing here?");
4417 
4418     __ align(CodeEntryAlignment);
4419     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4420 
4421     address start = __ pc();
4422 
4423     const Register crc   = c_rarg0;  // crc
4424     const Register buf   = c_rarg1;  // source java byte array address
4425     const Register len   = c_rarg2;  // length
4426     const Register table0 = c_rarg3; // crc_table address
4427     const Register table1 = c_rarg4;
4428     const Register table2 = c_rarg5;
4429     const Register table3 = c_rarg6;
4430     const Register tmp3 = c_rarg7;
4431 
4432     BLOCK_COMMENT("Entry:");
4433     __ enter(); // required for proper stackwalking of RuntimeStub frame
4434 
4435     __ kernel_crc32c(crc, buf, len,
4436               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4437 
4438     __ leave(); // required for proper stackwalking of RuntimeStub frame
4439     __ ret(lr);
4440 
4441     return start;
4442   }
4443 
4444   /**
4445    *  Arguments:
4446    *
4447    *  Inputs:
4448    *   c_rarg0   - int   adler
4449    *   c_rarg1   - byte* buff
4450    *   c_rarg2   - int   len
4451    *
4452    * Output:
4453    *   c_rarg0   - int adler result
4454    */
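  // For reference, a scalar sketch of the Adler-32 update this stub
  // vectorizes (illustration only; the real code defers the modulo
  // reductions using the NMAX bound described below):
  //
  //   uint32_t s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
  //   for (size_t i = 0; i < len; i++) {
  //     s1 = (s1 + buff[i]) % 65521;   // 65521 == BASE == 0xfff1
  //     s2 = (s2 + s1)      % 65521;
  //   }
  //   return (s2 << 16) | s1;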
4455   address generate_updateBytesAdler32() {
4456     __ align(CodeEntryAlignment);
4457     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4458     address start = __ pc();
4459 
4460     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4461 
4462     // Aliases
4463     Register adler  = c_rarg0;
4464     Register s1     = c_rarg0;
4465     Register s2     = c_rarg3;
4466     Register buff   = c_rarg1;
4467     Register len    = c_rarg2;
4468     Register nmax  = r4;
4469     Register base  = r5;
4470     Register count = r6;
4471     Register temp0 = rscratch1;
4472     Register temp1 = rscratch2;
4473     FloatRegister vbytes = v0;
4474     FloatRegister vs1acc = v1;
4475     FloatRegister vs2acc = v2;
4476     FloatRegister vtable = v3;
4477 
4478     // Max number of bytes we can process before having to take the mod
4479     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4480     uint64_t BASE = 0xfff1;
4481     uint64_t NMAX = 0x15B0;
4482 
4483     __ mov(base, BASE);
4484     __ mov(nmax, NMAX);
4485 
4486     // Load accumulation coefficients for the upper 16 bits
4487     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4488     __ ld1(vtable, __ T16B, Address(temp0));
4489 
4490     // s1 is initialized to the lower 16 bits of adler
4491     // s2 is initialized to the upper 16 bits of adler
4492     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4493     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4494 
4495     // The pipelined loop needs at least 16 elements for 1 iteration
4496     // It does check this, but it is more effective to skip to the cleanup loop
4497     __ cmp(len, (u1)16);
4498     __ br(Assembler::HS, L_nmax);
4499     __ cbz(len, L_combine);
4500 
4501     __ bind(L_simple_by1_loop);
4502     __ ldrb(temp0, Address(__ post(buff, 1)));
4503     __ add(s1, s1, temp0);
4504     __ add(s2, s2, s1);
4505     __ subs(len, len, 1);
4506     __ br(Assembler::HI, L_simple_by1_loop);
4507 
4508     // s1 = s1 % BASE
4509     __ subs(temp0, s1, base);
4510     __ csel(s1, temp0, s1, Assembler::HS);
4511 
4512     // s2 = s2 % BASE
4513     __ lsr(temp0, s2, 16);
4514     __ lsl(temp1, temp0, 4);
4515     __ sub(temp1, temp1, temp0);
4516     __ add(s2, temp1, s2, ext::uxth);
4517 
4518     __ subs(temp0, s2, base);
4519     __ csel(s2, temp0, s2, Assembler::HS);
4520 
4521     __ b(L_combine);
4522 
4523     __ bind(L_nmax);
4524     __ subs(len, len, nmax);
4525     __ sub(count, nmax, 16);
4526     __ br(Assembler::LO, L_by16);
4527 
4528     __ bind(L_nmax_loop);
4529 
4530     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4531                                       vbytes, vs1acc, vs2acc, vtable);
4532 
4533     __ subs(count, count, 16);
4534     __ br(Assembler::HS, L_nmax_loop);
4535 
4536     // s1 = s1 % BASE
4537     __ lsr(temp0, s1, 16);
4538     __ lsl(temp1, temp0, 4);
4539     __ sub(temp1, temp1, temp0);
4540     __ add(temp1, temp1, s1, ext::uxth);
4541 
4542     __ lsr(temp0, temp1, 16);
4543     __ lsl(s1, temp0, 4);
4544     __ sub(s1, s1, temp0);
4545     __ add(s1, s1, temp1, ext::uxth);
4546 
4547     __ subs(temp0, s1, base);
4548     __ csel(s1, temp0, s1, Assembler::HS);
4549 
4550     // s2 = s2 % BASE
4551     __ lsr(temp0, s2, 16);
4552     __ lsl(temp1, temp0, 4);
4553     __ sub(temp1, temp1, temp0);
4554     __ add(temp1, temp1, s2, ext::uxth);
4555 
4556     __ lsr(temp0, temp1, 16);
4557     __ lsl(s2, temp0, 4);
4558     __ sub(s2, s2, temp0);
4559     __ add(s2, s2, temp1, ext::uxth);
4560 
4561     __ subs(temp0, s2, base);
4562     __ csel(s2, temp0, s2, Assembler::HS);
4563 
4564     __ subs(len, len, nmax);
4565     __ sub(count, nmax, 16);
4566     __ br(Assembler::HS, L_nmax_loop);
4567 
4568     __ bind(L_by16);
4569     __ adds(len, len, count);
4570     __ br(Assembler::LO, L_by1);
4571 
4572     __ bind(L_by16_loop);
4573 
4574     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4575                                       vbytes, vs1acc, vs2acc, vtable);
4576 
4577     __ subs(len, len, 16);
4578     __ br(Assembler::HS, L_by16_loop);
4579 
4580     __ bind(L_by1);
4581     __ adds(len, len, 15);
4582     __ br(Assembler::LO, L_do_mod);
4583 
4584     __ bind(L_by1_loop);
4585     __ ldrb(temp0, Address(__ post(buff, 1)));
4586     __ add(s1, temp0, s1);
4587     __ add(s2, s2, s1);
4588     __ subs(len, len, 1);
4589     __ br(Assembler::HS, L_by1_loop);
4590 
4591     __ bind(L_do_mod);
4592     // s1 = s1 % BASE
4593     __ lsr(temp0, s1, 16);
4594     __ lsl(temp1, temp0, 4);
4595     __ sub(temp1, temp1, temp0);
4596     __ add(temp1, temp1, s1, ext::uxth);
4597 
4598     __ lsr(temp0, temp1, 16);
4599     __ lsl(s1, temp0, 4);
4600     __ sub(s1, s1, temp0);
4601     __ add(s1, s1, temp1, ext::uxth);
4602 
4603     __ subs(temp0, s1, base);
4604     __ csel(s1, temp0, s1, Assembler::HS);
4605 
4606     // s2 = s2 % BASE
4607     __ lsr(temp0, s2, 16);
4608     __ lsl(temp1, temp0, 4);
4609     __ sub(temp1, temp1, temp0);
4610     __ add(temp1, temp1, s2, ext::uxth);
4611 
4612     __ lsr(temp0, temp1, 16);
4613     __ lsl(s2, temp0, 4);
4614     __ sub(s2, s2, temp0);
4615     __ add(s2, s2, temp1, ext::uxth);
4616 
4617     __ subs(temp0, s2, base);
4618     __ csel(s2, temp0, s2, Assembler::HS);
4619 
4620     // Combine lower bits and higher bits
4621     __ bind(L_combine);
4622     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4623 
4624     __ ret(lr);
4625 
4626     return start;
4627   }
4628 
4629   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4630           Register temp0, Register temp1, FloatRegister vbytes,
4631           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4632     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4633     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4634     // In non-vectorized code, we update s1 and s2 as:
4635     //   s1 <- s1 + b1
4636     //   s2 <- s2 + s1
4637     //   s1 <- s1 + b2
4638     //   s2 <- s2 + s1
4639     //   ...
4640     //   s1 <- s1 + b16
4641     //   s2 <- s2 + s1
4642     // Putting above assignments together, we have:
4643     //   s1_new = s1 + b1 + b2 + ... + b16
4644     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4645     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4646     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
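    // For example (illustration only), if all 16 bytes are 0x01:
    //   s1_new = s1 + 16
    //   s2_new = s2 + 16 * s1 + (16 + 15 + ... + 1) = s2 + 16 * s1 + 136
    // which is what the instructions below compute: umullv/umlalv form the 16
    // weighted products against the coefficient table {16, 15, ..., 1} in
    // vtable, and the two uaddlv instructions horizontally sum the bytes and
    // the products.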
4647     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4648 
4649     // s2 = s2 + s1 * 16
4650     __ add(s2, s2, s1, Assembler::LSL, 4);
4651 
4652     // vs1acc = b1 + b2 + b3 + ... + b16
4653     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4654     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4655     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4656     __ uaddlv(vs1acc, __ T16B, vbytes);
4657     __ uaddlv(vs2acc, __ T8H, vs2acc);
4658 
4659     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4660     __ fmovd(temp0, vs1acc);
4661     __ fmovd(temp1, vs2acc);
4662     __ add(s1, s1, temp0);
4663     __ add(s2, s2, temp1);
4664   }
4665 
4666   /**
4667    *  Arguments:
4668    *
4669    *  Input:
4670    *    c_rarg0   - x address
4671    *    c_rarg1   - x length
4672    *    c_rarg2   - y address
4673    *    c_rarg3   - y length
4674    *    c_rarg4   - z address
4675    *    c_rarg5   - z length
4676    */
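  // For reference, a minimal scalar sketch of what this stub computes (an
  // illustration of the semantics of BigInteger.implMultiplyToLen, not the
  // register-level algorithm used by multiply_to_len): z = x * y with 32-bit
  // limbs stored most-significant first, z holding xlen + ylen limbs and
  // assumed zero-initialized.
  //
  //   for (int i = xlen - 1; i >= 0; i--) {
  //     uint64_t carry = 0;
  //     for (int j = ylen - 1, k = i + j + 1; j >= 0; j--, k--) {
  //       uint64_t p = (uint64_t)x[i] * y[j] + z[k] + carry;
  //       z[k] = (uint32_t)p;
  //       carry = p >> 32;
  //     }
  //     z[i] = (uint32_t)carry;
  //   }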
4677   address generate_multiplyToLen() {
4678     __ align(CodeEntryAlignment);
4679     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4680 
4681     address start = __ pc();
4682     const Register x     = r0;
4683     const Register xlen  = r1;
4684     const Register y     = r2;
4685     const Register ylen  = r3;
4686     const Register z     = r4;
4687     const Register zlen  = r5;
4688 
4689     const Register tmp1  = r10;
4690     const Register tmp2  = r11;
4691     const Register tmp3  = r12;
4692     const Register tmp4  = r13;
4693     const Register tmp5  = r14;
4694     const Register tmp6  = r15;
4695     const Register tmp7  = r16;
4696 
4697     BLOCK_COMMENT("Entry:");
4698     __ enter(); // required for proper stackwalking of RuntimeStub frame
4699     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4700     __ leave(); // required for proper stackwalking of RuntimeStub frame
4701     __ ret(lr);
4702 
4703     return start;
4704   }
4705 
4706   address generate_squareToLen() {
4707     // The squareToLen algorithm for sizes 1..127, described in the Java code,
4708     // is faster than multiply_to_len on some CPUs and slower on others, but
4709     // multiply_to_len shows slightly better results overall.
4710     __ align(CodeEntryAlignment);
4711     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4712     address start = __ pc();
4713 
4714     const Register x     = r0;
4715     const Register xlen  = r1;
4716     const Register z     = r2;
4717     const Register zlen  = r3;
4718     const Register y     = r4; // == x
4719     const Register ylen  = r5; // == xlen
4720 
4721     const Register tmp1  = r10;
4722     const Register tmp2  = r11;
4723     const Register tmp3  = r12;
4724     const Register tmp4  = r13;
4725     const Register tmp5  = r14;
4726     const Register tmp6  = r15;
4727     const Register tmp7  = r16;
4728 
4729     RegSet spilled_regs = RegSet::of(y, ylen);
4730     BLOCK_COMMENT("Entry:");
4731     __ enter();
4732     __ push(spilled_regs, sp);
4733     __ mov(y, x);
4734     __ mov(ylen, xlen);
4735     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4736     __ pop(spilled_regs, sp);
4737     __ leave();
4738     __ ret(lr);
4739     return start;
4740   }
4741 
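  // mul_add conceptually accumulates in[0 .. len-1] * k into a window of out
  // (32-bit limbs, 64-bit intermediate products) and returns the final carry;
  // the Java counterpart is BigInteger.implMulAdd. The exact offset/indexing
  // convention is an implementation detail of the macro assembler routine.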
4742   address generate_mulAdd() {
4743     __ align(CodeEntryAlignment);
4744     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4745 
4746     address start = __ pc();
4747 
4748     const Register out     = r0;
4749     const Register in      = r1;
4750     const Register offset  = r2;
4751     const Register len     = r3;
4752     const Register k       = r4;
4753 
4754     BLOCK_COMMENT("Entry:");
4755     __ enter();
4756     __ mul_add(out, in, offset, len, k);
4757     __ leave();
4758     __ ret(lr);
4759 
4760     return start;
4761   }
4762 
4763   // Arguments:
4764   //
4765   // Input:
4766   //   c_rarg0   - newArr address
4767   //   c_rarg1   - oldArr address
4768   //   c_rarg2   - newIdx
4769   //   c_rarg3   - shiftCount
4770   //   c_rarg4   - numIter
4771   //
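  // Conceptually, for i in [0, numIter) and logical shifts on 32-bit words:
  //   newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)
  //                      | (oldArr[i]     << (32 - shiftCount))
  // i.e. a word-level right shift of a big-endian int array, as in
  // BigInteger.shiftRightImplWorker. The SIMD loop below handles four words
  // per iteration, with scalar tails for the remaining 1..3 words.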
4772   address generate_bigIntegerRightShift() {
4773     __ align(CodeEntryAlignment);
4774     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4775     address start = __ pc();
4776 
4777     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4778 
4779     Register newArr        = c_rarg0;
4780     Register oldArr        = c_rarg1;
4781     Register newIdx        = c_rarg2;
4782     Register shiftCount    = c_rarg3;
4783     Register numIter       = c_rarg4;
4784     Register idx           = numIter;
4785 
4786     Register newArrCur     = rscratch1;
4787     Register shiftRevCount = rscratch2;
4788     Register oldArrCur     = r13;
4789     Register oldArrNext    = r14;
4790 
4791     FloatRegister oldElem0        = v0;
4792     FloatRegister oldElem1        = v1;
4793     FloatRegister newElem         = v2;
4794     FloatRegister shiftVCount     = v3;
4795     FloatRegister shiftVRevCount  = v4;
4796 
4797     __ cbz(idx, Exit);
4798 
4799     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4800 
4801     // left shift count
4802     __ movw(shiftRevCount, 32);
4803     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4804 
4805     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar path
4806     __ cmp(numIter, (u1)4);
4807     __ br(Assembler::LT, ShiftThree);
4808 
4809     __ dup(shiftVCount,    __ T4S, shiftCount);
4810     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4811     __ negr(shiftVCount,   __ T4S, shiftVCount);
4812 
4813     __ BIND(ShiftSIMDLoop);
4814 
4815     // Calculate the load addresses
4816     __ sub(idx, idx, 4);
4817     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4818     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4819     __ add(oldArrCur,  oldArrNext, 4);
4820 
4821     // Load 4 words and process
4822     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4823     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4824     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4825     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4826     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4827     __ st1(newElem,   __ T4S,  Address(newArrCur));
4828 
4829     __ cmp(idx, (u1)4);
4830     __ br(Assembler::LT, ShiftTwoLoop);
4831     __ b(ShiftSIMDLoop);
4832 
4833     __ BIND(ShiftTwoLoop);
4834     __ cbz(idx, Exit);
4835     __ cmp(idx, (u1)1);
4836     __ br(Assembler::EQ, ShiftOne);
4837 
4838     // Calculate the load addresses
4839     __ sub(idx, idx, 2);
4840     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4841     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4842     __ add(oldArrCur,  oldArrNext, 4);
4843 
4844     // Load 2 words and process
4845     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4846     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4847     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4848     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4849     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4850     __ st1(newElem,   __ T2S, Address(newArrCur));
4851     __ b(ShiftTwoLoop);
4852 
4853     __ BIND(ShiftThree);
4854     __ tbz(idx, 1, ShiftOne);
4855     __ tbz(idx, 0, ShiftTwo);
4856     __ ldrw(r10,  Address(oldArr, 12));
4857     __ ldrw(r11,  Address(oldArr, 8));
4858     __ lsrvw(r10, r10, shiftCount);
4859     __ lslvw(r11, r11, shiftRevCount);
4860     __ orrw(r12,  r10, r11);
4861     __ strw(r12,  Address(newArr, 8));
4862 
4863     __ BIND(ShiftTwo);
4864     __ ldrw(r10,  Address(oldArr, 8));
4865     __ ldrw(r11,  Address(oldArr, 4));
4866     __ lsrvw(r10, r10, shiftCount);
4867     __ lslvw(r11, r11, shiftRevCount);
4868     __ orrw(r12,  r10, r11);
4869     __ strw(r12,  Address(newArr, 4));
4870 
4871     __ BIND(ShiftOne);
4872     __ ldrw(r10,  Address(oldArr, 4));
4873     __ ldrw(r11,  Address(oldArr));
4874     __ lsrvw(r10, r10, shiftCount);
4875     __ lslvw(r11, r11, shiftRevCount);
4876     __ orrw(r12,  r10, r11);
4877     __ strw(r12,  Address(newArr));
4878 
4879     __ BIND(Exit);
4880     __ ret(lr);
4881 
4882     return start;
4883   }
4884 
4885   // Arguments:
4886   //
4887   // Input:
4888   //   c_rarg0   - newArr address
4889   //   c_rarg1   - oldArr address
4890   //   c_rarg2   - newIdx
4891   //   c_rarg3   - shiftCount
4892   //   c_rarg4   - numIter
4893   //
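  // Conceptually, for i in [0, numIter) and logical shifts on 32-bit words:
  //   newArr[newIdx + i] = (oldArr[i]     << shiftCount)
  //                      | (oldArr[i + 1] >> (32 - shiftCount))
  // mirroring BigInteger.shiftLeftImplWorker. The structure matches the right
  // shift stub above, but here the arrays are walked from low to high index.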
4894   address generate_bigIntegerLeftShift() {
4895     __ align(CodeEntryAlignment);
4896     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4897     address start = __ pc();
4898 
4899     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4900 
4901     Register newArr        = c_rarg0;
4902     Register oldArr        = c_rarg1;
4903     Register newIdx        = c_rarg2;
4904     Register shiftCount    = c_rarg3;
4905     Register numIter       = c_rarg4;
4906 
4907     Register shiftRevCount = rscratch1;
4908     Register oldArrNext    = rscratch2;
4909 
4910     FloatRegister oldElem0        = v0;
4911     FloatRegister oldElem1        = v1;
4912     FloatRegister newElem         = v2;
4913     FloatRegister shiftVCount     = v3;
4914     FloatRegister shiftVRevCount  = v4;
4915 
4916     __ cbz(numIter, Exit);
4917 
4918     __ add(oldArrNext, oldArr, 4);
4919     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4920 
4921     // right shift count
4922     __ movw(shiftRevCount, 32);
4923     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4924 
4925     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar path
4926     __ cmp(numIter, (u1)4);
4927     __ br(Assembler::LT, ShiftThree);
4928 
4929     __ dup(shiftVCount,     __ T4S, shiftCount);
4930     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4931     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4932 
4933     __ BIND(ShiftSIMDLoop);
4934 
4935     // load 4 words and process
4936     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4937     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4938     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4939     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4940     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4941     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4942     __ sub(numIter,   numIter, 4);
4943 
4944     __ cmp(numIter, (u1)4);
4945     __ br(Assembler::LT, ShiftTwoLoop);
4946     __ b(ShiftSIMDLoop);
4947 
4948     __ BIND(ShiftTwoLoop);
4949     __ cbz(numIter, Exit);
4950     __ cmp(numIter, (u1)1);
4951     __ br(Assembler::EQ, ShiftOne);
4952 
4953     // load 2 words and process
4954     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4955     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4956     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4957     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4958     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4959     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4960     __ sub(numIter,   numIter, 2);
4961     __ b(ShiftTwoLoop);
4962 
4963     __ BIND(ShiftThree);
4964     __ ldrw(r10,  __ post(oldArr, 4));
4965     __ ldrw(r11,  __ post(oldArrNext, 4));
4966     __ lslvw(r10, r10, shiftCount);
4967     __ lsrvw(r11, r11, shiftRevCount);
4968     __ orrw(r12,  r10, r11);
4969     __ strw(r12,  __ post(newArr, 4));
4970     __ tbz(numIter, 1, Exit);
4971     __ tbz(numIter, 0, ShiftOne);
4972 
4973     __ BIND(ShiftTwo);
4974     __ ldrw(r10,  __ post(oldArr, 4));
4975     __ ldrw(r11,  __ post(oldArrNext, 4));
4976     __ lslvw(r10, r10, shiftCount);
4977     __ lsrvw(r11, r11, shiftRevCount);
4978     __ orrw(r12,  r10, r11);
4979     __ strw(r12,  __ post(newArr, 4));
4980 
4981     __ BIND(ShiftOne);
4982     __ ldrw(r10,  Address(oldArr));
4983     __ ldrw(r11,  Address(oldArrNext));
4984     __ lslvw(r10, r10, shiftCount);
4985     __ lsrvw(r11, r11, shiftRevCount);
4986     __ orrw(r12,  r10, r11);
4987     __ strw(r12,  Address(newArr));
4988 
4989     __ BIND(Exit);
4990     __ ret(lr);
4991 
4992     return start;
4993   }
4994 
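  // Returns, in r0, a count c such that the first c bytes of the array are
  // known to be non-negative. If no byte has its sign bit set, the exact
  // length is returned; otherwise the stub may return a smaller, conservative
  // count, as the RET_ADJUST paths below show.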
4995   address generate_count_positives(address &count_positives_long) {
4996     const u1 large_loop_size = 64;
4997     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4998     int dcache_line = VM_Version::dcache_line_size();
4999 
5000     Register ary1 = r1, len = r2, result = r0;
5001 
5002     __ align(CodeEntryAlignment);
5003 
5004     StubCodeMark mark(this, "StubRoutines", "count_positives");
5005 
5006     address entry = __ pc();
5007 
5008     __ enter();
5009     // precondition: a copy of len is already in result
5010     // __ mov(result, len);
5011 
5012   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
5013         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
5014 
5015   __ cmp(len, (u1)15);
5016   __ br(Assembler::GT, LEN_OVER_15);
5017   // The only case when execution falls into this code is when the pointer is near
5018   // the end of a memory page and we have to avoid reading the next page
5019   __ add(ary1, ary1, len);
5020   __ subs(len, len, 8);
5021   __ br(Assembler::GT, LEN_OVER_8);
5022   __ ldr(rscratch2, Address(ary1, -8));
5023   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
5024   __ lsrv(rscratch2, rscratch2, rscratch1);
5025   __ tst(rscratch2, UPPER_BIT_MASK);
5026   __ csel(result, zr, result, Assembler::NE);
5027   __ leave();
5028   __ ret(lr);
5029   __ bind(LEN_OVER_8);
5030   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
5031   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
5032   __ tst(rscratch2, UPPER_BIT_MASK);
5033   __ br(Assembler::NE, RET_NO_POP);
5034   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
5035   __ lsrv(rscratch1, rscratch1, rscratch2);
5036   __ tst(rscratch1, UPPER_BIT_MASK);
5037   __ bind(RET_NO_POP);
5038   __ csel(result, zr, result, Assembler::NE);
5039   __ leave();
5040   __ ret(lr);
5041 
5042   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
5043   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
5044 
5045   count_positives_long = __ pc(); // 2nd entry point
5046 
5047   __ enter();
5048 
5049   __ bind(LEN_OVER_15);
5050     __ push(spilled_regs, sp);
5051     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
5052     __ cbz(rscratch2, ALIGNED);
5053     __ ldp(tmp6, tmp1, Address(ary1));
5054     __ mov(tmp5, 16);
5055     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
5056     __ add(ary1, ary1, rscratch1);
5057     __ orr(tmp6, tmp6, tmp1);
5058     __ tst(tmp6, UPPER_BIT_MASK);
5059     __ br(Assembler::NE, RET_ADJUST);
5060     __ sub(len, len, rscratch1);
5061 
5062   __ bind(ALIGNED);
5063     __ cmp(len, large_loop_size);
5064     __ br(Assembler::LT, CHECK_16);
5065     // Perform a 16-byte load as an early return in the pre-loop to handle the
5066     // situation when an initially aligned large array has negative values at its
5067     // starting bytes, so LARGE_LOOP would do 4 reads instead of 1 (in the worst
5068     // case), which is slower. Cases with negative bytes further ahead won't be
5069     // affected that much. In fact, it'll be faster due to early loads, fewer
5070     // instructions and fewer branches in LARGE_LOOP.
5071     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5072     __ sub(len, len, 16);
5073     __ orr(tmp6, tmp6, tmp1);
5074     __ tst(tmp6, UPPER_BIT_MASK);
5075     __ br(Assembler::NE, RET_ADJUST_16);
5076     __ cmp(len, large_loop_size);
5077     __ br(Assembler::LT, CHECK_16);
5078 
5079     if (SoftwarePrefetchHintDistance >= 0
5080         && SoftwarePrefetchHintDistance >= dcache_line) {
5081       // initial prefetch
5082       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5083     }
5084   __ bind(LARGE_LOOP);
5085     if (SoftwarePrefetchHintDistance >= 0) {
5086       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5087     }
5088     // Issue the load instructions first, since this can save a few CPU/MEM cycles.
5089     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
5090     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
5091     // 3 instructions and has fewer branches, but this approach disables early
5092     // return, so all 64 bytes are loaded and checked every time.
5093     __ ldp(tmp2, tmp3, Address(ary1));
5094     __ ldp(tmp4, tmp5, Address(ary1, 16));
5095     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5096     __ ldp(tmp6, tmp1, Address(ary1, 48));
5097     __ add(ary1, ary1, large_loop_size);
5098     __ sub(len, len, large_loop_size);
5099     __ orr(tmp2, tmp2, tmp3);
5100     __ orr(tmp4, tmp4, tmp5);
5101     __ orr(rscratch1, rscratch1, rscratch2);
5102     __ orr(tmp6, tmp6, tmp1);
5103     __ orr(tmp2, tmp2, tmp4);
5104     __ orr(rscratch1, rscratch1, tmp6);
5105     __ orr(tmp2, tmp2, rscratch1);
5106     __ tst(tmp2, UPPER_BIT_MASK);
5107     __ br(Assembler::NE, RET_ADJUST_LONG);
5108     __ cmp(len, large_loop_size);
5109     __ br(Assembler::GE, LARGE_LOOP);
5110 
5111   __ bind(CHECK_16); // small 16-byte load pre-loop
5112     __ cmp(len, (u1)16);
5113     __ br(Assembler::LT, POST_LOOP16);
5114 
5115   __ bind(LOOP16); // small 16-byte load loop
5116     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5117     __ sub(len, len, 16);
5118     __ orr(tmp2, tmp2, tmp3);
5119     __ tst(tmp2, UPPER_BIT_MASK);
5120     __ br(Assembler::NE, RET_ADJUST_16);
5121     __ cmp(len, (u1)16);
5122     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5123 
5124   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5125     __ cmp(len, (u1)8);
5126     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5127     __ ldr(tmp3, Address(__ post(ary1, 8)));
5128     __ tst(tmp3, UPPER_BIT_MASK);
5129     __ br(Assembler::NE, RET_ADJUST);
5130     __ sub(len, len, 8);
5131 
5132   __ bind(POST_LOOP16_LOAD_TAIL);
5133     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5134     __ ldr(tmp1, Address(ary1));
5135     __ mov(tmp2, 64);
5136     __ sub(tmp4, tmp2, len, __ LSL, 3);
5137     __ lslv(tmp1, tmp1, tmp4);
5138     __ tst(tmp1, UPPER_BIT_MASK);
5139     __ br(Assembler::NE, RET_ADJUST);
5140     // Fallthrough
5141 
5142   __ bind(RET_LEN);
5143     __ pop(spilled_regs, sp);
5144     __ leave();
5145     __ ret(lr);
5146 
5147     // The difference (result - len) is the count of bytes guaranteed to be
5148     // positive
5149 
5150   __ bind(RET_ADJUST_LONG);
5151     __ add(len, len, (u1)(large_loop_size - 16));
5152   __ bind(RET_ADJUST_16);
5153     __ add(len, len, 16);
5154   __ bind(RET_ADJUST);
5155     __ pop(spilled_regs, sp);
5156     __ leave();
5157     __ sub(result, result, len);
5158     __ ret(lr);
5159 
5160     return entry;
5161   }
5162 
5163   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5164         bool usePrefetch, Label &NOT_EQUAL) {
5165     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5166         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5167         tmp7 = r12, tmp8 = r13;
5168     Label LOOP;
5169 
5170     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5171     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5172     __ bind(LOOP);
5173     if (usePrefetch) {
5174       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5175       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5176     }
5177     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5178     __ eor(tmp1, tmp1, tmp2);
5179     __ eor(tmp3, tmp3, tmp4);
5180     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5181     __ orr(tmp1, tmp1, tmp3);
5182     __ cbnz(tmp1, NOT_EQUAL);
5183     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5184     __ eor(tmp5, tmp5, tmp6);
5185     __ eor(tmp7, tmp7, tmp8);
5186     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5187     __ orr(tmp5, tmp5, tmp7);
5188     __ cbnz(tmp5, NOT_EQUAL);
5189     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5190     __ eor(tmp1, tmp1, tmp2);
5191     __ eor(tmp3, tmp3, tmp4);
5192     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5193     __ orr(tmp1, tmp1, tmp3);
5194     __ cbnz(tmp1, NOT_EQUAL);
5195     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5196     __ eor(tmp5, tmp5, tmp6);
5197     __ sub(cnt1, cnt1, 8 * wordSize);
5198     __ eor(tmp7, tmp7, tmp8);
5199     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5200     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5201     // cmp) because subs allows an unlimited range of immediate operands.
5202     __ subs(tmp6, cnt1, loopThreshold);
5203     __ orr(tmp5, tmp5, tmp7);
5204     __ cbnz(tmp5, NOT_EQUAL);
5205     __ br(__ GE, LOOP);
5206     // post-loop
5207     __ eor(tmp1, tmp1, tmp2);
5208     __ eor(tmp3, tmp3, tmp4);
5209     __ orr(tmp1, tmp1, tmp3);
5210     __ sub(cnt1, cnt1, 2 * wordSize);
5211     __ cbnz(tmp1, NOT_EQUAL);
5212   }
5213 
5214   void generate_large_array_equals_loop_simd(int loopThreshold,
5215         bool usePrefetch, Label &NOT_EQUAL) {
5216     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5217         tmp2 = rscratch2;
5218     Label LOOP;
5219 
5220     __ bind(LOOP);
5221     if (usePrefetch) {
5222       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5223       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5224     }
5225     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5226     __ sub(cnt1, cnt1, 8 * wordSize);
5227     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5228     __ subs(tmp1, cnt1, loopThreshold);
5229     __ eor(v0, __ T16B, v0, v4);
5230     __ eor(v1, __ T16B, v1, v5);
5231     __ eor(v2, __ T16B, v2, v6);
5232     __ eor(v3, __ T16B, v3, v7);
5233     __ orr(v0, __ T16B, v0, v1);
5234     __ orr(v1, __ T16B, v2, v3);
5235     __ orr(v0, __ T16B, v0, v1);
5236     __ umov(tmp1, v0, __ D, 0);
5237     __ umov(tmp2, v0, __ D, 1);
5238     __ orr(tmp1, tmp1, tmp2);
5239     __ cbnz(tmp1, NOT_EQUAL);
5240     __ br(__ GE, LOOP);
5241   }
5242 
5243   // a1 = r1 - array1 address
5244   // a2 = r2 - array2 address
5245   // result = r0 - return value. Already contains "false"
5246   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
5247   // r3-r5 are reserved temporary registers
5248   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
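  // Conceptually this is an equality-only memcmp over the remaining cnt1
  // bytes, compared in word-sized and larger chunks: result is left as
  // "false" on the first mismatch and set to true only when every chunk
  // matches. The first wordSize bytes are handled by the caller (see the
  // cnt1 adjustment below).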
5249   address generate_large_array_equals() {
5250     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5251         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5252         tmp7 = r12, tmp8 = r13;
5253     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5254         SMALL_LOOP, POST_LOOP;
5255     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5256     // calculate if at least 32 prefetched bytes are used
5257     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5258     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5259     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5260     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5261         tmp5, tmp6, tmp7, tmp8);
5262 
5263     __ align(CodeEntryAlignment);
5264 
5265     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5266 
5267     address entry = __ pc();
5268     __ enter();
5269     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5270     // also advance pointers to use post-increment instead of pre-increment
5271     __ add(a1, a1, wordSize);
5272     __ add(a2, a2, wordSize);
5273     if (AvoidUnalignedAccesses) {
5274       // Both implementations (SIMD/non-SIMD) use relatively large load
5275       // instructions (ld1/ldp), which have a huge penalty (up to 2x exec time)
5276       // on some CPUs if the address is not at least 16-byte aligned.
5277       // Arrays are currently only 8-byte aligned, so do an additional 8-byte load
5278       // if needed, at least for the 1st address, to make it 16-byte aligned.
5279       Label ALIGNED16;
5280       __ tbz(a1, 3, ALIGNED16);
5281       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5282       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5283       __ sub(cnt1, cnt1, wordSize);
5284       __ eor(tmp1, tmp1, tmp2);
5285       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5286       __ bind(ALIGNED16);
5287     }
5288     if (UseSIMDForArrayEquals) {
5289       if (SoftwarePrefetchHintDistance >= 0) {
5290         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5291         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5292         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5293             /* prfm = */ true, NOT_EQUAL);
5294         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5295         __ br(__ LT, TAIL);
5296       }
5297       __ bind(NO_PREFETCH_LARGE_LOOP);
5298       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5299           /* prfm = */ false, NOT_EQUAL);
5300     } else {
5301       __ push(spilled_regs, sp);
5302       if (SoftwarePrefetchHintDistance >= 0) {
5303         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5304         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5305         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5306             /* prfm = */ true, NOT_EQUAL);
5307         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5308         __ br(__ LT, TAIL);
5309       }
5310       __ bind(NO_PREFETCH_LARGE_LOOP);
5311       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5312           /* prfm = */ false, NOT_EQUAL);
5313     }
5314     __ bind(TAIL);
5315       __ cbz(cnt1, EQUAL);
5316       __ subs(cnt1, cnt1, wordSize);
5317       __ br(__ LE, POST_LOOP);
5318     __ bind(SMALL_LOOP);
5319       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5320       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5321       __ subs(cnt1, cnt1, wordSize);
5322       __ eor(tmp1, tmp1, tmp2);
5323       __ cbnz(tmp1, NOT_EQUAL);
5324       __ br(__ GT, SMALL_LOOP);
5325     __ bind(POST_LOOP);
5326       __ ldr(tmp1, Address(a1, cnt1));
5327       __ ldr(tmp2, Address(a2, cnt1));
5328       __ eor(tmp1, tmp1, tmp2);
5329       __ cbnz(tmp1, NOT_EQUAL);
5330     __ bind(EQUAL);
5331       __ mov(result, true);
5332     __ bind(NOT_EQUAL);
5333       if (!UseSIMDForArrayEquals) {
5334         __ pop(spilled_regs, sp);
5335       }
5336     __ bind(NOT_EQUAL_NO_POP);
5337     __ leave();
5338     __ ret(lr);
5339     return entry;
5340   }
5341 
5342   address generate_dsin_dcos(bool isCos) {
5343     __ align(CodeEntryAlignment);
5344     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5345     address start = __ pc();
5346     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5347         (address)StubRoutines::aarch64::_two_over_pi,
5348         (address)StubRoutines::aarch64::_pio2,
5349         (address)StubRoutines::aarch64::_dsin_coef,
5350         (address)StubRoutines::aarch64::_dcos_coef);
5351     return start;
5352   }
5353 
5354   address generate_dlog() {
5355     __ align(CodeEntryAlignment);
5356     StubCodeMark mark(this, "StubRoutines", "dlog");
5357     address entry = __ pc();
5358     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5359         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5360     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5361     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5362         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5363     return entry;
5364   }
5365 
5366 
5367   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
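  // The Latin1 side is widened to UTF-16 on the fly: zip1/zip2 interleave the
  // 16 loaded Latin1 bytes with the zero bytes in vtmpZ, producing 32 bytes of
  // UTF-16 code units that are then compared against the UTF-16 string in
  // four 8-byte chunks (DIFF1/DIFF2 record which chunk mismatched).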
5368   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5369       Label &DIFF2) {
5370     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5371     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5372 
5373     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5374     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5375     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5376     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5377 
5378     __ fmovd(tmpL, vtmp3);
5379     __ eor(rscratch2, tmp3, tmpL);
5380     __ cbnz(rscratch2, DIFF2);
5381 
5382     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5383     __ umov(tmpL, vtmp3, __ D, 1);
5384     __ eor(rscratch2, tmpU, tmpL);
5385     __ cbnz(rscratch2, DIFF1);
5386 
5387     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5388     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5389     __ fmovd(tmpL, vtmp);
5390     __ eor(rscratch2, tmp3, tmpL);
5391     __ cbnz(rscratch2, DIFF2);
5392 
5393     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5394     __ umov(tmpL, vtmp, __ D, 1);
5395     __ eor(rscratch2, tmpU, tmpL);
5396     __ cbnz(rscratch2, DIFF1);
5397   }
5398 
5399   // r0  = result
5400   // r1  = str1
5401   // r2  = cnt1
5402   // r3  = str2
5403   // r4  = cnt2
5404   // r10 = tmp1
5405   // r11 = tmp2
5406   address generate_compare_long_string_different_encoding(bool isLU) {
5407     __ align(CodeEntryAlignment);
5408     StubCodeMark mark(this, "StubRoutines", isLU
5409         ? "compare_long_string_different_encoding LU"
5410         : "compare_long_string_different_encoding UL");
5411     address entry = __ pc();
5412     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5413         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5414         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5415     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5416         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5417     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5418     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5419 
5420     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5421 
5422     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5423     // cnt2 == amount of characters left to compare
5424     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
5425     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5426     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5427     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5428     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5429     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols; the last 4 are a special case.
5430     __ eor(rscratch2, tmp1, tmp2);
5431     __ mov(rscratch1, tmp2);
5432     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5433     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5434              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5435     __ push(spilled_regs, sp);
5436     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5437     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5438 
5439     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5440 
5441     if (SoftwarePrefetchHintDistance >= 0) {
5442       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5443       __ br(__ LT, NO_PREFETCH);
5444       __ bind(LARGE_LOOP_PREFETCH);
5445         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5446         __ mov(tmp4, 2);
5447         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5448         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5449           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5450           __ subs(tmp4, tmp4, 1);
5451           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5452           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5453           __ mov(tmp4, 2);
5454         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5455           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5456           __ subs(tmp4, tmp4, 1);
5457           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5458           __ sub(cnt2, cnt2, 64);
5459           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5460           __ br(__ GE, LARGE_LOOP_PREFETCH);
5461     }
5462     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5463     __ bind(NO_PREFETCH);
5464     __ subs(cnt2, cnt2, 16);
5465     __ br(__ LT, TAIL);
5466     __ align(OptoLoopAlignment);
5467     __ bind(SMALL_LOOP); // smaller loop
5468       __ subs(cnt2, cnt2, 16);
5469       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5470       __ br(__ GE, SMALL_LOOP);
5471       __ cmn(cnt2, (u1)16);
5472       __ br(__ EQ, LOAD_LAST);
5473     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5474       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5475       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5476       __ ldr(tmp3, Address(cnt1, -8));
5477       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5478       __ b(LOAD_LAST);
5479     __ bind(DIFF2);
5480       __ mov(tmpU, tmp3);
5481     __ bind(DIFF1);
5482       __ pop(spilled_regs, sp);
5483       __ b(CALCULATE_DIFFERENCE);
5484     __ bind(LOAD_LAST);
5485       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU,
5486       // so there is no need to load them again
5487       __ mov(tmpU, tmp3);
5488       __ pop(spilled_regs, sp);
5489 
5490       // tmp2 points to the address of the last 4 Latin1 characters right now
5491       __ ldrs(vtmp, Address(tmp2));
5492       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5493       __ fmovd(tmpL, vtmp);
5494 
5495       __ eor(rscratch2, tmpU, tmpL);
5496       __ cbz(rscratch2, DONE);
5497 
5498     // Find the first different characters in the longwords and
5499     // compute their difference.
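    // rscratch2 holds the XOR of the two 8-byte (four-character) groups. With
    // little-endian loads the first differing character sits in the lowest
    // non-zero 16 bits, so byte-reversing and counting leading zeros yields
    // its bit offset; masking with -16 rounds that down to a character
    // boundary, and the shifts plus uxthw below extract the two differing
    // characters, whose difference becomes the result.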
5500     __ bind(CALCULATE_DIFFERENCE);
5501       __ rev(rscratch2, rscratch2);
5502       __ clz(rscratch2, rscratch2);
5503       __ andr(rscratch2, rscratch2, -16);
5504       __ lsrv(tmp1, tmp1, rscratch2);
5505       __ uxthw(tmp1, tmp1);
5506       __ lsrv(rscratch1, rscratch1, rscratch2);
5507       __ uxthw(rscratch1, rscratch1);
5508       __ subw(result, tmp1, rscratch1);
5509     __ bind(DONE);
5510       __ ret(lr);
5511     return entry;
5512   }
5513 
5514   address generate_method_entry_barrier() {
5515     __ align(CodeEntryAlignment);
5516     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5517 
5518     Label deoptimize_label;
5519 
5520     address start = __ pc();
5521 
5522     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5523 
5524     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5525       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5526       // We can get here despite the nmethod being good, if we have not
5527       // yet applied our cross modification fence (or data fence).
5528       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5529       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5530       __ ldrw(rscratch2, rscratch2);
5531       __ strw(rscratch2, thread_epoch_addr);
5532       __ isb();
5533       __ membar(__ LoadLoad);
5534     }
5535 
5536     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5537 
5538     __ enter();
5539     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5540 
5541     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5542 
5543     __ push_call_clobbered_registers();
5544 
5545     __ mov(c_rarg0, rscratch2);
5546     __ call_VM_leaf
5547          (CAST_FROM_FN_PTR
5548           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5549 
5550     __ reset_last_Java_frame(true);
5551 
5552     __ mov(rscratch1, r0);
5553 
5554     __ pop_call_clobbered_registers();
5555 
5556     __ cbnz(rscratch1, deoptimize_label);
5557 
5558     __ leave();
5559     __ ret(lr);
5560 
5561     __ BIND(deoptimize_label);
5562 
5563     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5564     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5565 
5566     __ mov(sp, rscratch1);
5567     __ br(rscratch2);
5568 
5569     return start;
5570   }
5571 
5572   // r0  = result
5573   // r1  = str1
5574   // r2  = cnt1
5575   // r3  = str2
5576   // r4  = cnt2
5577   // r10 = tmp1
5578   // r11 = tmp2
5579   address generate_compare_long_string_same_encoding(bool isLL) {
5580     __ align(CodeEntryAlignment);
5581     StubCodeMark mark(this, "StubRoutines", isLL
5582         ? "compare_long_string_same_encoding LL"
5583         : "compare_long_string_same_encoding UU");
5584     address entry = __ pc();
5585     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5586         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5587 
5588     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5589 
5590     // Exit the large loop when fewer than 64 bytes are left to read or we're about
5591     // to prefetch memory beyond the array border
5592     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
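    // For example, with a (CPU-dependent) SoftwarePrefetchHintDistance of 192
    // the main loop below exits once fewer than 192 characters (LL) or 96
    // characters (UU) remain, i.e. 192 bytes in either case.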
5593 
    // 8 bytes from each string were already pre-loaded before jumping to this
    // stub, so do the comparison directly.
5595     __ eor(rscratch2, tmp1, tmp2);
5596     __ cbnz(rscratch2, CAL_DIFFERENCE);
5597 
5598     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5599     // update pointers, because of previous read
5600     __ add(str1, str1, wordSize);
5601     __ add(str2, str2, wordSize);
5602     if (SoftwarePrefetchHintDistance >= 0) {
5603       __ align(OptoLoopAlignment);
5604       __ bind(LARGE_LOOP_PREFETCH);
5605         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5606         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5607 
5608         for (int i = 0; i < 4; i++) {
5609           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5610           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5611           __ cmp(tmp1, tmp2);
5612           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5613           __ br(Assembler::NE, DIFF);
5614         }
5615         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5616         __ add(str1, str1, 64);
5617         __ add(str2, str2, 64);
5618         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5619         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5620         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5621     }
5622 
5623     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5624     __ br(Assembler::LE, LESS16);
5625     __ align(OptoLoopAlignment);
5626     __ bind(LOOP_COMPARE16);
5627       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5628       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5629       __ cmp(tmp1, tmp2);
5630       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5631       __ br(Assembler::NE, DIFF);
5632       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5633       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5634       __ br(Assembler::LT, LESS16);
5635 
5636       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5637       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5638       __ cmp(tmp1, tmp2);
5639       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5640       __ br(Assembler::NE, DIFF);
5641       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5642       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5643       __ br(Assembler::GE, LOOP_COMPARE16);
5644       __ cbz(cnt2, LENGTH_DIFF);
5645 
5646     __ bind(LESS16);
      // compare 8 bytes at a time
5648       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5649       __ br(Assembler::LE, LESS8);
5650       __ ldr(tmp1, Address(__ post(str1, 8)));
5651       __ ldr(tmp2, Address(__ post(str2, 8)));
5652       __ eor(rscratch2, tmp1, tmp2);
5653       __ cbnz(rscratch2, CAL_DIFFERENCE);
5654       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5655 
5656     __ bind(LESS8); // directly load last 8 bytes
5657       if (!isLL) {
5658         __ add(cnt2, cnt2, cnt2);
5659       }
5660       __ ldr(tmp1, Address(str1, cnt2));
5661       __ ldr(tmp2, Address(str2, cnt2));
5662       __ eor(rscratch2, tmp1, tmp2);
5663       __ cbz(rscratch2, LENGTH_DIFF);
5664       __ b(CAL_DIFFERENCE);
5665 
5666     __ bind(DIFF);
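      // Decide whether the mismatch is in the low or the high doubleword pair:
      // re-compare the low pair and, if it is equal, select the high pair instead.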
5667       __ cmp(tmp1, tmp2);
5668       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5669       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5670       // reuse rscratch2 register for the result of eor instruction
5671       __ eor(rscratch2, tmp1, tmp2);
5672 
5673     __ bind(CAL_DIFFERENCE);
5674       __ rev(rscratch2, rscratch2);
5675       __ clz(rscratch2, rscratch2);
5676       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5677       __ lsrv(tmp1, tmp1, rscratch2);
5678       __ lsrv(tmp2, tmp2, rscratch2);
5679       if (isLL) {
5680         __ uxtbw(tmp1, tmp1);
5681         __ uxtbw(tmp2, tmp2);
5682       } else {
5683         __ uxthw(tmp1, tmp1);
5684         __ uxthw(tmp2, tmp2);
5685       }
5686       __ subw(result, tmp1, tmp2);
5687 
5688     __ bind(LENGTH_DIFF);
5689       __ ret(lr);
5690     return entry;
5691   }
5692 
5693   enum string_compare_mode {
5694     LL,
5695     LU,
5696     UL,
5697     UU,
5698   };
5699 
5700   // The following registers are declared in aarch64.ad
5701   // r0  = result
5702   // r1  = str1
5703   // r2  = cnt1
5704   // r3  = str2
5705   // r4  = cnt2
5706   // r10 = tmp1
5707   // r11 = tmp2
5708   // z0  = ztmp1
5709   // z1  = ztmp2
5710   // p0  = pgtmp1
5711   // p1  = pgtmp2
5712   address generate_compare_long_string_sve(string_compare_mode mode) {
5713     __ align(CodeEntryAlignment);
5714     address entry = __ pc();
5715     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5716              tmp1 = r10, tmp2 = r11;
5717 
5718     Label LOOP, DONE, MISMATCH;
5719     Register vec_len = tmp1;
5720     Register idx = tmp2;
5721     // The minimum of the string lengths has been stored in cnt2.
5722     Register cnt = cnt2;
5723     FloatRegister ztmp1 = z0, ztmp2 = z1;
5724     PRegister pgtmp1 = p0, pgtmp2 = p1;
5725 
5726 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5727     switch (mode) {                                                            \
5728       case LL:                                                                 \
5729         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5730         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5731         break;                                                                 \
5732       case LU:                                                                 \
5733         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5734         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5735         break;                                                                 \
5736       case UL:                                                                 \
5737         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5738         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5739         break;                                                                 \
5740       case UU:                                                                 \
5741         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5742         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5743         break;                                                                 \
5744       default:                                                                 \
5745         ShouldNotReachHere();                                                  \
5746     }
5747 
5748     const char* stubname;
5749     switch (mode) {
5750       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5751       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5752       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5753       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5754       default: ShouldNotReachHere();
5755     }
5756 
5757     StubCodeMark mark(this, "StubRoutines", stubname);
5758 
5759     __ mov(idx, 0);
5760     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5761 
5762     if (mode == LL) {
5763       __ sve_cntb(vec_len);
5764     } else {
5765       __ sve_cnth(vec_len);
5766     }
5767 
5768     __ sub(rscratch1, cnt, vec_len);
5769 
5770     __ bind(LOOP);
5771 
5772       // main loop
5773       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5774       __ add(idx, idx, vec_len);
5775       // Compare strings.
5776       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5777       __ br(__ NE, MISMATCH);
5778       __ cmp(idx, rscratch1);
5779       __ br(__ LT, LOOP);
5780 
5781     // post loop, last iteration
5782     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5783 
5784     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5785     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5786     __ br(__ EQ, DONE);
5787 
5788     __ bind(MISMATCH);
5789 
    // Restrict the predicate to the lanes before the first mismatching element.
5791     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5792     // Extract the first different characters of each string.
5793     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5794     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5795 
5796     // Compute the difference of the first different characters.
5797     __ sub(result, rscratch1, rscratch2);
5798 
5799     __ bind(DONE);
5800     __ ret(lr);
5801 #undef LOAD_PAIR
5802     return entry;
5803   }
5804 
5805   void generate_compare_long_strings() {
5806     if (UseSVE == 0) {
5807       StubRoutines::aarch64::_compare_long_string_LL
5808           = generate_compare_long_string_same_encoding(true);
5809       StubRoutines::aarch64::_compare_long_string_UU
5810           = generate_compare_long_string_same_encoding(false);
5811       StubRoutines::aarch64::_compare_long_string_LU
5812           = generate_compare_long_string_different_encoding(true);
5813       StubRoutines::aarch64::_compare_long_string_UL
5814           = generate_compare_long_string_different_encoding(false);
5815     } else {
5816       StubRoutines::aarch64::_compare_long_string_LL
5817           = generate_compare_long_string_sve(LL);
5818       StubRoutines::aarch64::_compare_long_string_UU
5819           = generate_compare_long_string_sve(UU);
5820       StubRoutines::aarch64::_compare_long_string_LU
5821           = generate_compare_long_string_sve(LU);
5822       StubRoutines::aarch64::_compare_long_string_UL
5823           = generate_compare_long_string_sve(UL);
5824     }
5825   }
5826 
5827   // R0 = result
5828   // R1 = str2
5829   // R2 = cnt1
5830   // R3 = str1
5831   // R4 = cnt2
5832   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
5833   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip the initial load (helps on systems with a
  // single load pipeline)
  // 2) we can use the "fast" algorithm of finding a single character to search
  // for the first symbol with fewer branches (1 branch per loaded register
  // instead of a branch per symbol); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be used to search for every occurrence of the 1st character, saving a
  // few loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
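  //
  // A rough scalar sketch (illustrative only, not generated code) of the
  // zero-byte trick behind (2), assuming a Latin-1 str2 (the UTF-16 paths use
  // the 16-bit constants instead):
  //
  //   // bytes equal to first_char become 0x00
  //   uint64_t x   = chunk ^ (first_char * 0x0101010101010101ULL);
  //   uint64_t hit = (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
  //
  // 'hit' is non-zero iff some byte of 'chunk' equals 'first_char'; the lowest
  // set 0x80 bit marks the first match, which the code locates with rbit + clz.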
5848   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5849     const char* stubName = str1_isL
5850         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5851         : "indexof_linear_uu";
5852     __ align(CodeEntryAlignment);
5853     StubCodeMark mark(this, "StubRoutines", stubName);
5854     address entry = __ pc();
5855 
5856     int str1_chr_size = str1_isL ? 1 : 2;
5857     int str2_chr_size = str2_isL ? 1 : 2;
5858     int str1_chr_shift = str1_isL ? 0 : 1;
5859     int str2_chr_shift = str2_isL ? 0 : 1;
5860     bool isL = str1_isL && str2_isL;
    // parameters
5862     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5863     // temporary registers
5864     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5865     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5866     // redefinitions
5867     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5868 
5869     __ push(spilled_regs, sp);
5870     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5871         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5872         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5873         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5874         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5875         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5876     // Read whole register from str1. It is safe, because length >=8 here
5877     __ ldr(ch1, Address(str1));
5878     // Read whole register from str2. It is safe, because length >=8 here
5879     __ ldr(ch2, Address(str2));
5880     __ sub(cnt2, cnt2, cnt1);
5881     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5882     if (str1_isL != str2_isL) {
5883       __ eor(v0, __ T16B, v0, v0);
5884     }
5885     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5886     __ mul(first, first, tmp1);
5887     // check if we have less than 1 register to check
5888     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5889     if (str1_isL != str2_isL) {
5890       __ fmovd(v1, ch1);
5891     }
5892     __ br(__ LE, L_SMALL);
5893     __ eor(ch2, first, ch2);
5894     if (str1_isL != str2_isL) {
5895       __ zip1(v1, __ T16B, v1, v0);
5896     }
5897     __ sub(tmp2, ch2, tmp1);
5898     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5899     __ bics(tmp2, tmp2, ch2);
5900     if (str1_isL != str2_isL) {
5901       __ fmovd(ch1, v1);
5902     }
5903     __ br(__ NE, L_HAS_ZERO);
5904     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5905     __ add(result, result, wordSize/str2_chr_size);
5906     __ add(str2, str2, wordSize);
5907     __ br(__ LT, L_POST_LOOP);
5908     __ BIND(L_LOOP);
5909       __ ldr(ch2, Address(str2));
5910       __ eor(ch2, first, ch2);
5911       __ sub(tmp2, ch2, tmp1);
5912       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5913       __ bics(tmp2, tmp2, ch2);
5914       __ br(__ NE, L_HAS_ZERO);
5915     __ BIND(L_LOOP_PROCEED);
5916       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5917       __ add(str2, str2, wordSize);
5918       __ add(result, result, wordSize/str2_chr_size);
5919       __ br(__ GE, L_LOOP);
5920     __ BIND(L_POST_LOOP);
5921       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5922       __ br(__ LE, NOMATCH);
5923       __ ldr(ch2, Address(str2));
5924       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5925       __ eor(ch2, first, ch2);
5926       __ sub(tmp2, ch2, tmp1);
5927       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5928       __ mov(tmp4, -1); // all bits set
5929       __ b(L_SMALL_PROCEED);
5930     __ align(OptoLoopAlignment);
5931     __ BIND(L_SMALL);
5932       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5933       __ eor(ch2, first, ch2);
5934       if (str1_isL != str2_isL) {
5935         __ zip1(v1, __ T16B, v1, v0);
5936       }
5937       __ sub(tmp2, ch2, tmp1);
5938       __ mov(tmp4, -1); // all bits set
5939       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5940       if (str1_isL != str2_isL) {
5941         __ fmovd(ch1, v1); // move converted 4 symbols
5942       }
5943     __ BIND(L_SMALL_PROCEED);
5944       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5945       __ bic(tmp2, tmp2, ch2);
5946       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5947       __ rbit(tmp2, tmp2);
5948       __ br(__ EQ, NOMATCH);
5949     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long; up to 4 cycles on some CPUs
5951       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5952       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5953       if (str2_isL) { // LL
5954         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5955         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5956         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5957         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5958         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5959       } else {
        __ mov(ch2, 0xE); // mask to round the byte offset down to an even (char) boundary
5961         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5962         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5963         __ lslv(tmp2, tmp2, tmp4);
5964         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5965         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5966         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5967         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5968       }
5969       __ cmp(ch1, ch2);
5970       __ mov(tmp4, wordSize/str2_chr_size);
5971       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5972     __ BIND(L_SMALL_CMP_LOOP);
5973       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5974                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5975       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5976                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5977       __ add(tmp4, tmp4, 1);
5978       __ cmp(tmp4, cnt1);
5979       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5980       __ cmp(first, ch2);
5981       __ br(__ EQ, L_SMALL_CMP_LOOP);
5982     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5983       __ cbz(tmp2, NOMATCH); // no more matches. exit
5984       __ clz(tmp4, tmp2);
5985       __ add(result, result, 1); // advance index
5986       __ add(str2, str2, str2_chr_size); // advance pointer
5987       __ b(L_SMALL_HAS_ZERO_LOOP);
5988     __ align(OptoLoopAlignment);
5989     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5990       __ cmp(first, ch2);
5991       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5992       __ b(DONE);
5993     __ align(OptoLoopAlignment);
5994     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5995       if (str2_isL) { // LL
5996         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5997         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5998         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5999         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6000         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6001       } else {
        __ mov(ch2, 0xE); // mask to round the byte offset down to an even (char) boundary
6003         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6004         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6005         __ lslv(tmp2, tmp2, tmp4);
6006         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6007         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6008         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6009         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6010       }
6011       __ cmp(ch1, ch2);
6012       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6013       __ b(DONE);
6014     __ align(OptoLoopAlignment);
6015     __ BIND(L_HAS_ZERO);
6016       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long; up to 4 cycles on some CPUs
      // Now pack both counters (cnt2 and cnt1) into one register. This is fine
      // because both counters are 32-bit and are not modified in this loop; they
      // are restored on exit, so cnt1 can be reused inside the loop.
6021       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
6022       __ sub(result, result, 1);
6023     __ BIND(L_HAS_ZERO_LOOP);
6024       __ mov(cnt1, wordSize/str2_chr_size);
6025       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6026       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
6027       if (str2_isL) {
6028         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6029         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6030         __ lslv(tmp2, tmp2, tmp4);
6031         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6032         __ add(tmp4, tmp4, 1);
6033         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6034         __ lsl(tmp2, tmp2, 1);
6035         __ mov(tmp4, wordSize/str2_chr_size);
6036       } else {
        __ mov(ch2, 0xE); // mask to round the byte offset down to an even (char) boundary
6038         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6039         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6040         __ lslv(tmp2, tmp2, tmp4);
6041         __ add(tmp4, tmp4, 1);
6042         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6043         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6044         __ lsl(tmp2, tmp2, 1);
6045         __ mov(tmp4, wordSize/str2_chr_size);
6046         __ sub(str2, str2, str2_chr_size);
6047       }
6048       __ cmp(ch1, ch2);
6049       __ mov(tmp4, wordSize/str2_chr_size);
6050       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6051     __ BIND(L_CMP_LOOP);
6052       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6053                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6054       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6055                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6056       __ add(tmp4, tmp4, 1);
6057       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6058       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
6059       __ cmp(cnt1, ch2);
6060       __ br(__ EQ, L_CMP_LOOP);
6061     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
6063       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6064       __ clz(tmp4, tmp2);
6065       __ add(str2, str2, str2_chr_size); // advance pointer
6066       __ b(L_HAS_ZERO_LOOP);
6067     __ align(OptoLoopAlignment);
6068     __ BIND(L_CMP_LOOP_LAST_CMP);
6069       __ cmp(cnt1, ch2);
6070       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6071       __ b(DONE);
6072     __ align(OptoLoopAlignment);
6073     __ BIND(L_CMP_LOOP_LAST_CMP2);
6074       if (str2_isL) {
6075         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6076         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6077         __ lslv(tmp2, tmp2, tmp4);
6078         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6079         __ add(tmp4, tmp4, 1);
6080         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6081         __ lsl(tmp2, tmp2, 1);
6082       } else {
        __ mov(ch2, 0xE); // mask to round the byte offset down to an even (char) boundary
6084         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6085         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6086         __ lslv(tmp2, tmp2, tmp4);
6087         __ add(tmp4, tmp4, 1);
6088         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6089         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6090         __ lsl(tmp2, tmp2, 1);
6091         __ sub(str2, str2, str2_chr_size);
6092       }
6093       __ cmp(ch1, ch2);
6094       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6095       __ b(DONE);
6096     __ align(OptoLoopAlignment);
6097     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. Until the L_HAS_ZERO block the index was a
      // multiple of wordSize/str2_chr_size. The byte octet was analyzed in
      // L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the higher bits were not changed.
      // L_LOOP_PROCEED will increase result by the number of analyzed characters,
      // so we can simply clear the lower bits of result here: 2 bits for UU/UL
      // and 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
      // (UU/UL) is the index of the last analyzed substring inside the current
      // octet, so str2 is at the corresponding start address and must be advanced
      // to the next octet.
6108       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6109       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6110       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6111       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
6112       __ movw(cnt2, cnt2);
6113       __ b(L_LOOP_PROCEED);
6114     __ align(OptoLoopAlignment);
6115     __ BIND(NOMATCH);
6116       __ mov(result, -1);
6117     __ BIND(DONE);
6118       __ pop(spilled_regs, sp);
6119       __ ret(lr);
6120     return entry;
6121   }
6122 
6123   void generate_string_indexof_stubs() {
6124     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6125     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6126     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6127   }
6128 
6129   void inflate_and_store_2_fp_registers(bool generatePrfm,
6130       FloatRegister src1, FloatRegister src2) {
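    // Interleaving the source bytes with the zero bytes in v0 widens each
    // Latin-1 byte to a little-endian 16-bit char: zip1 covers the low 8 bytes
    // of each source register and zip2 the high 8 bytes.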
6131     Register dst = r1;
6132     __ zip1(v1, __ T16B, src1, v0);
6133     __ zip2(v2, __ T16B, src1, v0);
6134     if (generatePrfm) {
6135       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6136     }
6137     __ zip1(v3, __ T16B, src2, v0);
6138     __ zip2(v4, __ T16B, src2, v0);
6139     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6140   }
6141 
6142   // R0 = src
6143   // R1 = dst
6144   // R2 = len
6145   // R3 = len >> 3
6146   // V0 = 0
6147   // v1 = loaded 8 bytes
6148   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6149   address generate_large_byte_array_inflate() {
6150     __ align(CodeEntryAlignment);
6151     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
6152     address entry = __ pc();
6153     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6154     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6155     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6156 
    // Do one more 8-byte read so that the source address becomes 16-byte aligned
    // in most cases; this also lets us store both inflated halves with a single
    // store instruction.
6159     __ ldrd(v2, __ post(src, 8));
6160     __ sub(octetCounter, octetCounter, 2);
6161     __ zip1(v1, __ T16B, v1, v0);
6162     __ zip1(v2, __ T16B, v2, v0);
6163     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6164     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6165     __ subs(rscratch1, octetCounter, large_loop_threshold);
6166     __ br(__ LE, LOOP_START);
6167     __ b(LOOP_PRFM_START);
6168     __ bind(LOOP_PRFM);
6169       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6170     __ bind(LOOP_PRFM_START);
6171       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6172       __ sub(octetCounter, octetCounter, 8);
6173       __ subs(rscratch1, octetCounter, large_loop_threshold);
6174       inflate_and_store_2_fp_registers(true, v3, v4);
6175       inflate_and_store_2_fp_registers(true, v5, v6);
6176       __ br(__ GT, LOOP_PRFM);
6177       __ cmp(octetCounter, (u1)8);
6178       __ br(__ LT, DONE);
6179     __ bind(LOOP);
6180       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6181       __ bind(LOOP_START);
6182       __ sub(octetCounter, octetCounter, 8);
6183       __ cmp(octetCounter, (u1)8);
6184       inflate_and_store_2_fp_registers(false, v3, v4);
6185       inflate_and_store_2_fp_registers(false, v5, v6);
6186       __ br(__ GE, LOOP);
6187     __ bind(DONE);
6188       __ ret(lr);
6189     return entry;
6190   }
6191 
6192   /**
6193    *  Arguments:
6194    *
6195    *  Input:
6196    *  c_rarg0   - current state address
6197    *  c_rarg1   - H key address
6198    *  c_rarg2   - data address
6199    *  c_rarg3   - number of blocks
6200    *
6201    *  Output:
6202    *  Updated state at c_rarg0
6203    */
6204   address generate_ghash_processBlocks() {
6205     // Bafflingly, GCM uses little-endian for the byte order, but
6206     // big-endian for the bit order.  For example, the polynomial 1 is
6207     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6208     //
6209     // So, we must either reverse the bytes in each word and do
6210     // everything big-endian or reverse the bits in each byte and do
6211     // it little-endian.  On AArch64 it's more idiomatic to reverse
6212     // the bits in each byte (we have an instruction, RBIT, to do
6213     // that) and keep the data in little-endian bit order through the
6214     // calculation, bit-reversing the inputs and outputs.
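    //
    // For example (illustrative only): RBIT turns the byte 0x80 into 0x01, so
    // the 16-byte pattern 80 00 ... 00 that GCM uses for the polynomial 1
    // becomes the ordinary little-endian integer 1 once every byte has been
    // bit-reversed.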
6215 
6216     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6217     __ align(wordSize * 2);
6218     address p = __ pc();
6219     __ emit_int64(0x87);  // The low-order bits of the field
6220                           // polynomial (i.e. p = z^7+z^2+z+1)
6221                           // repeated in the low and high parts of a
6222                           // 128-bit vector
6223     __ emit_int64(0x87);
6224 
6225     __ align(CodeEntryAlignment);
6226     address start = __ pc();
6227 
6228     Register state   = c_rarg0;
6229     Register subkeyH = c_rarg1;
6230     Register data    = c_rarg2;
6231     Register blocks  = c_rarg3;
6232 
6233     FloatRegister vzr = v30;
6234     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6235 
6236     __ ldrq(v24, p);    // The field polynomial
6237 
6238     __ ldrq(v0, Address(state));
6239     __ ldrq(v1, Address(subkeyH));
6240 
6241     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6242     __ rbit(v0, __ T16B, v0);
6243     __ rev64(v1, __ T16B, v1);
6244     __ rbit(v1, __ T16B, v1);
6245 
6246     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6247     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
6248 
6249     {
6250       Label L_ghash_loop;
6251       __ bind(L_ghash_loop);
6252 
6253       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6254                                                  // reversing each byte
6255       __ rbit(v2, __ T16B, v2);
6256       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6257 
6258       // Multiply state in v2 by subkey in v1
6259       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6260                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6261                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6262       // Reduce v7:v5 by the field polynomial
6263       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6264 
6265       __ sub(blocks, blocks, 1);
6266       __ cbnz(blocks, L_ghash_loop);
6267     }
6268 
6269     // The bit-reversed result is at this point in v0
6270     __ rev64(v0, __ T16B, v0);
6271     __ rbit(v0, __ T16B, v0);
6272 
6273     __ st1(v0, __ T16B, state);
6274     __ ret(lr);
6275 
6276     return start;
6277   }
6278 
6279   address generate_ghash_processBlocks_wide() {
6280     address small = generate_ghash_processBlocks();
6281 
6282     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6283     __ align(wordSize * 2);
6284     address p = __ pc();
6285     __ emit_int64(0x87);  // The low-order bits of the field
6286                           // polynomial (i.e. p = z^7+z^2+z+1)
6287                           // repeated in the low and high parts of a
6288                           // 128-bit vector
6289     __ emit_int64(0x87);
6290 
6291     __ align(CodeEntryAlignment);
6292     address start = __ pc();
6293 
6294     Register state   = c_rarg0;
6295     Register subkeyH = c_rarg1;
6296     Register data    = c_rarg2;
6297     Register blocks  = c_rarg3;
6298 
6299     const int unroll = 4;
6300 
6301     __ cmp(blocks, (unsigned char)(unroll * 2));
6302     __ br(__ LT, small);
6303 
6304     if (unroll > 1) {
      // Save v8..v15 (callee-saved SIMD registers) before the unrolled
      // GHASH code clobbers them.
6306       __ sub(sp, sp, 4 * 16);
6307       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6308       __ sub(sp, sp, 4 * 16);
6309       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6310     }
6311 
6312     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6313 
6314     if (unroll > 1) {
6315       // And restore state
6316       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6317       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6318     }
6319 
6320     __ cmp(blocks, (unsigned char)0);
6321     __ br(__ GT, small);
6322 
6323     __ ret(lr);
6324 
6325     return start;
6326   }
6327 
6328   void generate_base64_encode_simdround(Register src, Register dst,
6329         FloatRegister codec, u8 size) {
6330 
6331     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6332     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6333     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6334 
6335     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6336 
6337     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6338 
6339     __ ushr(ind0, arrangement, in0,  2);
6340 
6341     __ ushr(ind1, arrangement, in1,  2);
6342     __ shl(in0,   arrangement, in0,  6);
6343     __ orr(ind1,  arrangement, ind1, in0);
6344     __ ushr(ind1, arrangement, ind1, 2);
6345 
6346     __ ushr(ind2, arrangement, in2,  4);
6347     __ shl(in1,   arrangement, in1,  4);
6348     __ orr(ind2,  arrangement, in1,  ind2);
6349     __ ushr(ind2, arrangement, ind2, 2);
6350 
6351     __ shl(ind3,  arrangement, in2,  2);
6352     __ ushr(ind3, arrangement, ind3, 2);
6353 
6354     __ tbl(out0,  arrangement, codec,  4, ind0);
6355     __ tbl(out1,  arrangement, codec,  4, ind1);
6356     __ tbl(out2,  arrangement, codec,  4, ind2);
6357     __ tbl(out3,  arrangement, codec,  4, ind3);
6358 
6359     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6360   }
6361 
6362    /**
6363    *  Arguments:
6364    *
6365    *  Input:
6366    *  c_rarg0   - src_start
6367    *  c_rarg1   - src_offset
6368    *  c_rarg2   - src_length
6369    *  c_rarg3   - dest_start
6370    *  c_rarg4   - dest_offset
6371    *  c_rarg5   - isURL
6372    *
6373    */
6374   address generate_base64_encodeBlock() {
6375 
6376     static const char toBase64[64] = {
6377       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6378       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6379       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6380       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6381       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6382     };
6383 
6384     static const char toBase64URL[64] = {
6385       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6386       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6387       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6388       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6389       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6390     };
6391 
6392     __ align(CodeEntryAlignment);
6393     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6394     address start = __ pc();
6395 
6396     Register src   = c_rarg0;  // source array
6397     Register soff  = c_rarg1;  // source start offset
6398     Register send  = c_rarg2;  // source end offset
6399     Register dst   = c_rarg3;  // dest array
6400     Register doff  = c_rarg4;  // position for writing to dest array
6401     Register isURL = c_rarg5;  // Base64 or URL character set
6402 
6403     // c_rarg6 and c_rarg7 are free to use as temps
6404     Register codec  = c_rarg6;
6405     Register length = c_rarg7;
6406 
6407     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6408 
6409     __ add(src, src, soff);
6410     __ add(dst, dst, doff);
6411     __ sub(length, send, soff);
6412 
6413     // load the codec base address
6414     __ lea(codec, ExternalAddress((address) toBase64));
6415     __ cbz(isURL, ProcessData);
6416     __ lea(codec, ExternalAddress((address) toBase64URL));
6417 
6418     __ BIND(ProcessData);
6419 
    // too short to set up a SIMD loop; fall back to byte-wise processing
6421     __ cmp(length, (u1)24);
6422     __ br(Assembler::LT, Process3B);
6423 
6424     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6425 
6426     __ BIND(Process48B);
6427     __ cmp(length, (u1)48);
6428     __ br(Assembler::LT, Process24B);
6429     generate_base64_encode_simdround(src, dst, v0, 16);
6430     __ sub(length, length, 48);
6431     __ b(Process48B);
6432 
6433     __ BIND(Process24B);
6434     __ cmp(length, (u1)24);
6435     __ br(Assembler::LT, SIMDExit);
6436     generate_base64_encode_simdround(src, dst, v0, 8);
6437     __ sub(length, length, 24);
6438 
6439     __ BIND(SIMDExit);
6440     __ cbz(length, Exit);
6441 
6442     __ BIND(Process3B);
6443     //  3 src bytes, 24 bits
6444     __ ldrb(r10, __ post(src, 1));
6445     __ ldrb(r11, __ post(src, 1));
6446     __ ldrb(r12, __ post(src, 1));
6447     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6448     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6449     // codec index
6450     __ ubfmw(r15, r12, 18, 23);
6451     __ ubfmw(r14, r12, 12, 17);
6452     __ ubfmw(r13, r12, 6,  11);
6453     __ andw(r12,  r12, 63);
6454     // get the code based on the codec
6455     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6456     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6457     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6458     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6459     __ strb(r15, __ post(dst, 1));
6460     __ strb(r14, __ post(dst, 1));
6461     __ strb(r13, __ post(dst, 1));
6462     __ strb(r12, __ post(dst, 1));
6463     __ sub(length, length, 3);
6464     __ cbnz(length, Process3B);
6465 
6466     __ BIND(Exit);
6467     __ ret(lr);
6468 
6469     return start;
6470   }
6471 
6472   void generate_base64_decode_simdround(Register src, Register dst,
6473         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6474 
6475     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6476     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6477 
6478     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6479     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6480 
6481     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6482 
6483     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6484 
6485     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6486 
    // We need an unsigned saturating subtract so that every input value in the
    // range [0, 63] maps to index 0 in the higher-half lookup.
6489     __ uqsubv(decH0, __ T16B, in0, v27);
6490     __ uqsubv(decH1, __ T16B, in1, v27);
6491     __ uqsubv(decH2, __ T16B, in2, v27);
6492     __ uqsubv(decH3, __ T16B, in3, v27);
6493 
6494     // lower half lookup
6495     __ tbl(decL0, arrangement, codecL, 4, in0);
6496     __ tbl(decL1, arrangement, codecL, 4, in1);
6497     __ tbl(decL2, arrangement, codecL, 4, in2);
6498     __ tbl(decL3, arrangement, codecL, 4, in3);
6499 
6500     // higher half lookup
6501     __ tbx(decH0, arrangement, codecH, 4, decH0);
6502     __ tbx(decH1, arrangement, codecH, 4, decH1);
6503     __ tbx(decH2, arrangement, codecH, 4, decH2);
6504     __ tbx(decH3, arrangement, codecH, 4, decH3);
6505 
6506     // combine lower and higher
6507     __ orr(decL0, arrangement, decL0, decH0);
6508     __ orr(decL1, arrangement, decL1, decH1);
6509     __ orr(decL2, arrangement, decL2, decH2);
6510     __ orr(decL3, arrangement, decL3, decH3);
6511 
    // check for illegal inputs: any decoded value larger than 63 (the maximum of 6 bits)
6513     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
6514     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
6515     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
6516     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
6517     __ orr(in0, arrangement, decH0, decH1);
6518     __ orr(in1, arrangement, decH2, decH3);
6519     __ orr(in2, arrangement, in0,   in1);
6520     __ umaxv(in3, arrangement, in2);
6521     __ umov(rscratch2, in3, __ B, 0);
6522 
6523     // get the data to output
6524     __ shl(out0,  arrangement, decL0, 2);
6525     __ ushr(out1, arrangement, decL1, 4);
6526     __ orr(out0,  arrangement, out0,  out1);
6527     __ shl(out1,  arrangement, decL1, 4);
6528     __ ushr(out2, arrangement, decL2, 2);
6529     __ orr(out1,  arrangement, out1,  out2);
6530     __ shl(out2,  arrangement, decL2, 6);
6531     __ orr(out2,  arrangement, out2,  decL3);
6532 
6533     __ cbz(rscratch2, NoIllegalData);
6534 
6535     // handle illegal input
6536     __ umov(r10, in2, __ D, 0);
6537     if (size == 16) {
6538       __ cbnz(r10, ErrorInLowerHalf);
6539 
6540       // illegal input is in higher half, store the lower half now.
6541       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6542 
6543       __ umov(r10, in2,  __ D, 1);
6544       __ umov(r11, out0, __ D, 1);
6545       __ umov(r12, out1, __ D, 1);
6546       __ umov(r13, out2, __ D, 1);
6547       __ b(StoreLegalData);
6548 
6549       __ BIND(ErrorInLowerHalf);
6550     }
6551     __ umov(r11, out0, __ D, 0);
6552     __ umov(r12, out1, __ D, 0);
6553     __ umov(r13, out2, __ D, 0);
6554 
6555     __ BIND(StoreLegalData);
6556     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6557     __ strb(r11, __ post(dst, 1));
6558     __ strb(r12, __ post(dst, 1));
6559     __ strb(r13, __ post(dst, 1));
6560     __ lsr(r10, r10, 8);
6561     __ lsr(r11, r11, 8);
6562     __ lsr(r12, r12, 8);
6563     __ lsr(r13, r13, 8);
6564     __ b(StoreLegalData);
6565 
6566     __ BIND(NoIllegalData);
6567     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6568   }
6569 
6570 
6571    /**
6572    *  Arguments:
6573    *
6574    *  Input:
6575    *  c_rarg0   - src_start
6576    *  c_rarg1   - src_offset
6577    *  c_rarg2   - src_length
6578    *  c_rarg3   - dest_start
6579    *  c_rarg4   - dest_offset
6580    *  c_rarg5   - isURL
6581    *  c_rarg6   - isMIME
6582    *
6583    */
6584   address generate_base64_decodeBlock() {
6585 
6586     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6587     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6588     // titled "Base64 decoding".
6589 
    // The non-SIMD lookup tables are mostly copied from the fromBase64 array used
    // in java.util.Base64, except that the trailing character '=' is also treated
    // as an illegal value in this intrinsic: java.util.Base64.fromBase64['='] = -2,
    // while fromBase(URL)64ForNoSIMD['='] = 255 here.
6593     static const uint8_t fromBase64ForNoSIMD[256] = {
6594       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6595       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6596       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6597        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6598       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6599        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6600       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6601        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6602       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6603       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6604       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6605       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6606       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6607       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6608       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6609       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6610     };
6611 
6612     static const uint8_t fromBase64URLForNoSIMD[256] = {
6613       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6614       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6615       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6616        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6617       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6618        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6619       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6620        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6621       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6622       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6623       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6624       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6625       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6626       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6627       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6628       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6629     };
6630 
    // A legal Base64 code value is in the range [0, 127]. We need two lookups
    // with tbl/tbx and combine the results to get the decoded data. The 1st table
    // vector lookup uses tbl: out-of-range indices are set to 0 in the destination.
    // The 2nd table vector lookup uses tbx: out-of-range indices leave the
    // destination unchanged. Input [64..126] is mapped to index [65, 127] in the
    // second lookup. The value at index 64 is set to 0, so that we know the
    // decoded data was already obtained by the 1st lookup.
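    //
    // Worked example (illustrative only): for the input byte 'A' (65) the tbl
    // lookup is out of range and yields 0, the saturating subtract gives index
    // 65 - 63 = 2, and tbx reads table entry 64 + 2 = 66, whose value is 0,
    // the decoded value of 'A'. For '+' (43) the tbl lookup already returns 62
    // and the second lookup contributes 0, so OR-ing the two halves always
    // yields the decoded value (or a value above 63 for illegal input).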
6638     static const uint8_t fromBase64ForSIMD[128] = {
6639       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6640       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6641       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6642        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6643         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6644        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6645       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6646        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6647     };
6648 
6649     static const uint8_t fromBase64URLForSIMD[128] = {
6650       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6651       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6652       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6653        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6654         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6655        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6656        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6657        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6658     };
6659 
6660     __ align(CodeEntryAlignment);
6661     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6662     address start = __ pc();
6663 
6664     Register src    = c_rarg0;  // source array
6665     Register soff   = c_rarg1;  // source start offset
6666     Register send   = c_rarg2;  // source end offset
6667     Register dst    = c_rarg3;  // dest array
6668     Register doff   = c_rarg4;  // position for writing to dest array
6669     Register isURL  = c_rarg5;  // Base64 or URL character set
6670     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6671 
6672     Register length = send;    // reuse send as length of source data to process
6673 
6674     Register simd_codec   = c_rarg6;
6675     Register nosimd_codec = c_rarg7;
6676 
6677     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6678 
6679     __ enter();
6680 
6681     __ add(src, src, soff);
6682     __ add(dst, dst, doff);
6683 
6684     __ mov(doff, dst);
6685 
6686     __ sub(length, send, soff);
6687     __ bfm(length, zr, 0, 1);
6688 
6689     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6690     __ cbz(isURL, ProcessData);
6691     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6692 
6693     __ BIND(ProcessData);
6694     __ mov(rscratch1, length);
6695     __ cmp(length, (u1)144); // 144 = 80 + 64
6696     __ br(Assembler::LT, Process4B);
6697 
6698     // In the MIME case, the line length cannot be more than 76
6699     // bytes (see RFC 2045). This is too short a block for SIMD
6700     // to be worthwhile, so we use non-SIMD here.
6701     __ movw(rscratch1, 79);
6702 
6703     __ BIND(Process4B);
6704     __ ldrw(r14, __ post(src, 4));
6705     __ ubfxw(r10, r14, 0,  8);
6706     __ ubfxw(r11, r14, 8,  8);
6707     __ ubfxw(r12, r14, 16, 8);
6708     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
6710     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6711     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6712     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6713     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6714     // error detection, 255u indicates an illegal input
6715     __ orrw(r14, r10, r11);
6716     __ orrw(r15, r12, r13);
6717     __ orrw(r14, r14, r15);
6718     __ tbnz(r14, 7, Exit);
6719     // recover the data
6720     __ lslw(r14, r10, 10);
6721     __ bfiw(r14, r11, 4, 6);
6722     __ bfmw(r14, r12, 2, 5);
6723     __ rev16w(r14, r14);
6724     __ bfiw(r13, r12, 6, 2);
6725     __ strh(r14, __ post(dst, 2));
6726     __ strb(r13, __ post(dst, 1));
6727     // non-simd loop
6728     __ subsw(rscratch1, rscratch1, 4);
6729     __ br(Assembler::GT, Process4B);
6730 
    // If we exited the 80-byte pre-processing pass above (where rscratch1
    // started at 79), rscratch1 == -1; otherwise rscratch1 == 0.
6733     __ cbzw(rscratch1, Exit);
6734     __ sub(length, length, 80);
6735 
6736     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6737     __ cbz(isURL, SIMDEnter);
6738     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6739 
6740     __ BIND(SIMDEnter);
6741     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6742     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6743     __ mov(rscratch1, 63);
6744     __ dup(v27, __ T16B, rscratch1);
6745 
6746     __ BIND(Process64B);
6747     __ cmp(length, (u1)64);
6748     __ br(Assembler::LT, Process32B);
6749     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6750     __ sub(length, length, 64);
6751     __ b(Process64B);
6752 
6753     __ BIND(Process32B);
6754     __ cmp(length, (u1)32);
6755     __ br(Assembler::LT, SIMDExit);
6756     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6757     __ sub(length, length, 32);
6758     __ b(Process32B);
6759 
6760     __ BIND(SIMDExit);
6761     __ cbz(length, Exit);
6762     __ movw(rscratch1, length);
6763     __ b(Process4B);
6764 
6765     __ BIND(Exit);
6766     __ sub(c_rarg0, dst, doff);
6767 
6768     __ leave();
6769     __ ret(lr);
6770 
6771     return start;
6772   }
6773 
6774   // Support for spin waits.
6775   address generate_spin_wait() {
6776     __ align(CodeEntryAlignment);
6777     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6778     address start = __ pc();
6779 
6780     __ spin_wait();
6781     __ ret(lr);
6782 
6783     return start;
6784   }
6785 
6786 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6787 
6788   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6789   //
6790   // If LSE is in use, generate LSE versions of all the stubs. The
6791   // non-LSE versions are in atomic_aarch64.S.
6792 
6793   // class AtomicStubMark records the entry point of a stub and the
6794   // stub pointer which will point to it. The stub pointer is set to
6795   // the entry point when ~AtomicStubMark() is called, which must be
6796   // after ICache::invalidate_range. This ensures safe publication of
6797   // the generated code.
6798   class AtomicStubMark {
6799     address _entry_point;
6800     aarch64_atomic_stub_t *_stub;
6801     MacroAssembler *_masm;
6802   public:
6803     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6804       _masm = masm;
6805       __ align(32);
6806       _entry_point = __ pc();
6807       _stub = stub;
6808     }
6809     ~AtomicStubMark() {
6810       *_stub = (aarch64_atomic_stub_t)_entry_point;
6811     }
6812   };
6813 
6814   // NB: For memory_order_conservative we need a trailing membar after
6815   // LSE atomic operations but not a leading membar.
6816   //
6817   // We don't need a leading membar because a clause in the Arm ARM
6818   // says:
6819   //
6820   //   Barrier-ordered-before
6821   //
6822   //   Barrier instructions order prior Memory effects before subsequent
6823   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6827   //   instruction with both Acquire and Release semantics.
6828   //
6829   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6830   // and Release semantics, therefore we don't need a leading
6831   // barrier. However, there is no corresponding Barrier-ordered-after
6832   // relationship, therefore we need a trailing membar to prevent a
6833   // later store or load from being reordered with the store in an
6834   // atomic instruction.
6835   //
6836   // This was checked by using the herd7 consistency model simulator
6837   // (http://diy.inria.fr/) with this test case:
6838   //
6839   // AArch64 LseCas
6840   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6841   // P0 | P1;
6842   // LDR W4, [X2] | MOV W3, #0;
6843   // DMB LD       | MOV W4, #1;
6844   // LDR W3, [X1] | CASAL W3, W4, [X1];
6845   //              | DMB ISH;
6846   //              | STR W4, [X2];
6847   // exists
6848   // (0:X3=0 /\ 0:X4=1)
6849   //
6850   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6851   // with the store to x in P1. Without the DMB in P1 this may happen.
6852   //
6853   // At the time of writing we don't know of any AArch64 hardware that
6854   // reorders stores in this way, but the Reference Manual permits it.
6855 
6856   void gen_cas_entry(Assembler::operand_size size,
6857                      atomic_memory_order order) {
6858     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6859       exchange_val = c_rarg2;
6860     bool acquire, release;
6861     switch (order) {
6862       case memory_order_relaxed:
6863         acquire = false;
6864         release = false;
6865         break;
6866       case memory_order_release:
6867         acquire = false;
6868         release = true;
6869         break;
6870       default:
6871         acquire = true;
6872         release = true;
6873         break;
6874     }
6875     __ mov(prev, compare_val);
6876     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6877     if (order == memory_order_conservative) {
6878       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6879     }
6880     if (size == Assembler::xword) {
6881       __ mov(r0, prev);
6882     } else {
6883       __ movw(r0, prev);
6884     }
6885     __ ret(lr);
6886   }
6887 
6888   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6889     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6890     // If not relaxed, then default to conservative.  Relaxed is the only
6891     // case we use enough to be worth specializing.
6892     if (order == memory_order_relaxed) {
6893       __ ldadd(size, incr, prev, addr);
6894     } else {
6895       __ ldaddal(size, incr, prev, addr);
6896       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6897     }
6898     if (size == Assembler::xword) {
6899       __ mov(r0, prev);
6900     } else {
6901       __ movw(r0, prev);
6902     }
6903     __ ret(lr);
6904   }
6905 
6906   void gen_swpal_entry(Assembler::operand_size size) {
6907     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6908     __ swpal(size, incr, prev, addr);
6909     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6910     if (size == Assembler::xword) {
6911       __ mov(r0, prev);
6912     } else {
6913       __ movw(r0, prev);
6914     }
6915     __ ret(lr);
6916   }
6917 
6918   void generate_atomic_entry_points() {
    if (!UseLSE) {
6920       return;
6921     }
6922 
6923     __ align(CodeEntryAlignment);
6924     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6925     address first_entry = __ pc();
6926 
6927     // ADD, memory_order_conservative
6928     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6929     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6930     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6931     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6932 
6933     // ADD, memory_order_relaxed
6934     AtomicStubMark mark_fetch_add_4_relaxed
6935       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6936     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6937     AtomicStubMark mark_fetch_add_8_relaxed
6938       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6939     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6940 
6941     // XCHG, memory_order_conservative
6942     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6943     gen_swpal_entry(Assembler::word);
6944     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6945     gen_swpal_entry(Assembler::xword);
6946 
6947     // CAS, memory_order_conservative
6948     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6949     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6950     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6951     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6952     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6953     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6954 
6955     // CAS, memory_order_relaxed
6956     AtomicStubMark mark_cmpxchg_1_relaxed
6957       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6958     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6959     AtomicStubMark mark_cmpxchg_4_relaxed
6960       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6961     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6962     AtomicStubMark mark_cmpxchg_8_relaxed
6963       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6964     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6965 
6966     AtomicStubMark mark_cmpxchg_4_release
6967       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6968     gen_cas_entry(MacroAssembler::word, memory_order_release);
6969     AtomicStubMark mark_cmpxchg_8_release
6970       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6971     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6972 
6973     AtomicStubMark mark_cmpxchg_4_seq_cst
6974       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6975     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6976     AtomicStubMark mark_cmpxchg_8_seq_cst
6977       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6978     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6979 
6980     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6981   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
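
  // Roughly, the thaw stub below does the following (illustrative
  // pseudocode only; see Continuation::prepare_thaw and
  // Continuation::thaw_entry for the real logic):
  //
  //   size = prepare_thaw(thread, return_barrier);
  //   if (size == 0) goto throw_StackOverflowError;  // overflow or nothing to thaw
  //   sp -= size;                                    // make room, 16-byte aligned
  //   sp = thaw(thread, kind) - 2 * wordSize;        // rfp spill of the topmost
  //                                                  // thawed frame
  //   return into that frame, or dispatch the pending exception for the
  //   return-barrier-exception kind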
6983 
6984   address generate_cont_thaw(Continuation::thaw_kind kind) {
6985     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
6986     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
6987 
6988     address start = __ pc();
6989 
6990     if (return_barrier) {
6991       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6992       __ mov(sp, rscratch1);
6993     }
6994     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6995 
6996     if (return_barrier) {
6997       // preserve possible return value from a method returning to the return barrier
6998       __ fmovd(rscratch1, v0);
6999       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7000     }
7001 
7002     __ movw(c_rarg1, (return_barrier ? 1 : 0));
7003     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
7004     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
7005 
7006     if (return_barrier) {
7007       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7008       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7009       __ fmovd(v0, rscratch1);
7010     }
    assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");

7014     Label thaw_success;
7015     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
7016     __ cbnz(rscratch2, thaw_success);
7017     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
7018     __ br(rscratch1);
7019     __ bind(thaw_success);
7020 
7021     // make room for the thawed frames
7022     __ sub(rscratch1, sp, rscratch2);
7023     __ andr(rscratch1, rscratch1, -16); // align
7024     __ mov(sp, rscratch1);
7025 
7026     if (return_barrier) {
7027       // save original return value -- again
7028       __ fmovd(rscratch1, v0);
7029       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7030     }
7031 
7032     // If we want, we can templatize thaw by kind, and have three different entries
7033     __ movw(c_rarg1, (uint32_t)kind);
7034 
7035     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
7036     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
7037 
7038     if (return_barrier) {
7039       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7040       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7041       __ fmovd(v0, rscratch1);
7042     } else {
7043       __ mov(r0, zr); // return 0 (success) from doYield
7044     }
7045 
    // we're now on the yield frame (which is at an address above us because the sp has been pushed down)
7047     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
7048     __ mov(rfp, sp);
7049 
7050     if (return_barrier_exception) {
7051       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
7052       __ authenticate_return_address(c_rarg1);
7053       __ verify_oop(r0);
7054       // save return value containing the exception oop in callee-saved R19
7055       __ mov(r19, r0);
7056 
7057       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
7058 
7059       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
7060       // __ reinitialize_ptrue();
7061 
7062       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7063 
7064       __ mov(r1, r0); // the exception handler
7065       __ mov(r0, r19); // restore return value containing the exception oop
7066       __ verify_oop(r0);
7067 
7068       __ leave();
7069       __ mov(r3, lr);
7070       __ br(r1); // the exception handler
7071     } else {
7072       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7073       __ leave();
7074       __ ret(lr);
7075     }
7076 
7077     return start;
7078   }
7079 
7080   address generate_cont_thaw() {
7081     if (!Continuations::enabled()) return nullptr;
7082 
7083     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
7084     address start = __ pc();
7085     generate_cont_thaw(Continuation::thaw_top);
7086     return start;
7087   }
7088 
7089   address generate_cont_returnBarrier() {
7090     if (!Continuations::enabled()) return nullptr;
7091 
7092     // TODO: will probably need multiple return barriers depending on return type
7093     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
7094     address start = __ pc();
7095 
7096     generate_cont_thaw(Continuation::thaw_return_barrier);
7097 
7098     return start;
7099   }
7100 
7101   address generate_cont_returnBarrier_exception() {
7102     if (!Continuations::enabled()) return nullptr;
7103 
7104     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
7105     address start = __ pc();
7106 
7107     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7108 
7109     return start;
7110   }
7111 
7112   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
7113   // are represented as long[5], with BITS_PER_LIMB = 26.
7114   // Pack five 26-bit limbs into three 64-bit registers.
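  //
  // In C, approximately (assuming each limb of src is already reduced
  // to 26 bits):
  //
  //   dest0 = src[0] | (src[1] << 26) | (src[2] << 52);
  //   dest1 = (src[2] >> 12) | (src[3] << 14) | (src[4] << 40);
  //   dest2 = src[4] >> 24;   // at most two significant bits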
7115   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
7116     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
7117     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
7118     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
7119     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
7120 
7121     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
7122     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
7123     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
7124     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
7125 
7126     if (dest2->is_valid()) {
7127       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7128     } else {
7129 #ifdef ASSERT
7130       Label OK;
7131       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7132       __ br(__ EQ, OK);
7133       __ stop("high bits of Poly1305 integer should be zero");
7134       __ should_not_reach_here();
7135       __ bind(OK);
7136 #endif
7137     }
7138   }
7139 
7140   // As above, but return only a 128-bit integer, packed into two
7141   // 64-bit registers.
7142   void pack_26(Register dest0, Register dest1, Register src) {
7143     pack_26(dest0, dest1, noreg, src);
7144   }
7145 
7146   // Multiply and multiply-accumulate unsigned 64-bit registers.
7147   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
7148     __ mul(prod_lo, n, m);
7149     __ umulh(prod_hi, n, m);
7150   }
7151   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
7152     wide_mul(rscratch1, rscratch2, n, m);
7153     __ adds(sum_lo, sum_lo, rscratch1);
7154     __ adc(sum_hi, sum_hi, rscratch2);
7155   }
7156 
7157   // Poly1305, RFC 7539
7158 
7159   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
7160   // description of the tricks used to simplify and accelerate this
7161   // computation.
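  //
  // In C-like pseudocode, the per-block update is approximately:
  //
  //   S = U + block + 2^128;          // the padding bit above the 128-bit block
  //   U = (S * R) mod (2^130 - 5);    // multiply by the key; only partially
  //                                   // reduced inside the loop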
7162 
7163   address generate_poly1305_processBlocks() {
7164     __ align(CodeEntryAlignment);
7165     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
7166     address start = __ pc();
7167     Label here;
7168     __ enter();
7169     RegSet callee_saved = RegSet::range(r19, r28);
7170     __ push(callee_saved, sp);
7171 
7172     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
7173 
7174     // Arguments
7175     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
7176 
7177     // R_n is the 128-bit randomly-generated key, packed into two
7178     // registers.  The caller passes this key to us as long[5], with
7179     // BITS_PER_LIMB = 26.
7180     const Register R_0 = *++regs, R_1 = *++regs;
7181     pack_26(R_0, R_1, r_start);
7182 
7183     // RR_n is (R_n >> 2) * 5
7184     const Register RR_0 = *++regs, RR_1 = *++regs;
7185     __ lsr(RR_0, R_0, 2);
7186     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
7187     __ lsr(RR_1, R_1, 2);
7188     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
7189 
7190     // U_n is the current checksum
7191     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
7192     pack_26(U_0, U_1, U_2, acc_start);
7193 
7194     static constexpr int BLOCK_LENGTH = 16;
7195     Label DONE, LOOP;
7196 
7197     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7198     __ br(Assembler::LT, DONE); {
7199       __ bind(LOOP);
7200 
7201       // S_n is to be the sum of U_n and the next block of data
7202       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7203       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
7204       __ adds(S_0, U_0, S_0);
7205       __ adcs(S_1, U_1, S_1);
7206       __ adc(S_2, U_2, zr);
7207       __ add(S_2, S_2, 1);
7208 
7209       const Register U_0HI = *++regs, U_1HI = *++regs;
7210 
7211       // NB: this logic depends on some of the special properties of
7212       // Poly1305 keys. In particular, because we know that the top
7213       // four bits of R_0 and R_1 are zero, we can add together
7214       // partial products without any risk of needing to propagate a
7215       // carry out.
7216       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
7217       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
7218       __ andr(U_2, R_0, 3);
7219       __ mul(U_2, S_2, U_2);
7220 
7221       // Recycle registers S_0, S_1, S_2
7222       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
7223 
7224       // Partial reduction mod 2**130 - 5
7225       __ adds(U_1, U_0HI, U_1);
7226       __ adc(U_2, U_1HI, U_2);
7227       // Sum now in U_2:U_1:U_0.
7228       // Dead: U_0HI, U_1HI.
7229       regs = (regs.remaining() + U_0HI + U_1HI).begin();
7230 
7231       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
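      // (This works because 2^130 == 5 (mod 2^130 - 5), so the bits of U_2
      // above bit 1 can be folded back in as (U_2 >> 2) * 5.)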
7232 
7233       // First, U_2:U_1:U_0 += (U_2 >> 2)
7234       __ lsr(rscratch1, U_2, 2);
7235       __ andr(U_2, U_2, (u8)3);
7236       __ adds(U_0, U_0, rscratch1);
7237       __ adcs(U_1, U_1, zr);
7238       __ adc(U_2, U_2, zr);
7239       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
7240       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
7241       __ adcs(U_1, U_1, zr);
7242       __ adc(U_2, U_2, zr);
7243 
7244       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
7245       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7246       __ br(~ Assembler::LT, LOOP);
7247     }
7248 
7249     // Further reduce modulo 2^130 - 5
7250     __ lsr(rscratch1, U_2, 2);
7251     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
7252     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
7253     __ adcs(U_1, U_1, zr);
7254     __ andr(U_2, U_2, (u1)3);
7255     __ adc(U_2, U_2, zr);
7256 
7257     // Unpack the sum into five 26-bit limbs and write to memory.
7258     __ ubfiz(rscratch1, U_0, 0, 26);
7259     __ ubfx(rscratch2, U_0, 26, 26);
7260     __ stp(rscratch1, rscratch2, Address(acc_start));
7261     __ ubfx(rscratch1, U_0, 52, 12);
7262     __ bfi(rscratch1, U_1, 12, 14);
7263     __ ubfx(rscratch2, U_1, 14, 26);
7264     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
7265     __ ubfx(rscratch1, U_1, 40, 24);
7266     __ bfi(rscratch1, U_2, 24, 3);
7267     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
7268 
7269     __ bind(DONE);
7270     __ pop(callee_saved, sp);
7271     __ leave();
7272     __ ret(lr);
7273 
7274     return start;
7275   }
7276 
7277 #if INCLUDE_JFR
7278 
7279   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
7280     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7281     __ mov(c_rarg0, thread);
7282   }
7283 
7284   // The handle is dereferenced through a load barrier.
7285   static void jfr_epilogue(MacroAssembler* _masm) {
7286     __ reset_last_Java_frame(true);
7287   }
7288 
  // For c2: c_rarg0 is junk; call into the runtime to write a checkpoint.
7290   // It returns a jobject handle to the event writer.
7291   // The handle is dereferenced and the return value is the event writer oop.
7292   static RuntimeStub* generate_jfr_write_checkpoint() {
7293     enum layout {
7294       rbp_off,
7295       rbpH_off,
7296       return_off,
7297       return_off2,
7298       framesize // inclusive of return address
7299     };
7300 
7301     int insts_size = 1024;
7302     int locs_size = 64;
7303     CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
7304     OopMapSet* oop_maps = new OopMapSet();
7305     MacroAssembler* masm = new MacroAssembler(&code);
7306     MacroAssembler* _masm = masm;
7307 
7308     address start = __ pc();
7309     __ enter();
7310     int frame_complete = __ pc() - start;
7311     address the_pc = __ pc();
7312     jfr_prologue(the_pc, _masm, rthread);
7313     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
7314     jfr_epilogue(_masm);
7315     __ resolve_global_jobject(r0, rscratch1, rscratch2);
7316     __ leave();
7317     __ ret(lr);
7318 
7319     OopMap* map = new OopMap(framesize, 1); // rfp
7320     oop_maps->add_gc_map(the_pc - start, map);
7321 
7322     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7323       RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
7324                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7325                                     oop_maps, false);
7326     return stub;
7327   }
7328 
7329   // For c2: call to return a leased buffer.
7330   static RuntimeStub* generate_jfr_return_lease() {
7331     enum layout {
7332       rbp_off,
7333       rbpH_off,
7334       return_off,
7335       return_off2,
7336       framesize // inclusive of return address
7337     };
7338 
7339     int insts_size = 1024;
7340     int locs_size = 64;
7341     CodeBuffer code("jfr_return_lease", insts_size, locs_size);
7342     OopMapSet* oop_maps = new OopMapSet();
7343     MacroAssembler* masm = new MacroAssembler(&code);
7344     MacroAssembler* _masm = masm;
7345 
7346     address start = __ pc();
7347     __ enter();
7348     int frame_complete = __ pc() - start;
7349     address the_pc = __ pc();
7350     jfr_prologue(the_pc, _masm, rthread);
7351     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
7352     jfr_epilogue(_masm);
7353 
7354     __ leave();
7355     __ ret(lr);
7356 
7357     OopMap* map = new OopMap(framesize, 1); // rfp
7358     oop_maps->add_gc_map(the_pc - start, map);
7359 
7360     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7361       RuntimeStub::new_runtime_stub("jfr_return_lease", &code, frame_complete,
7362                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7363                                     oop_maps, false);
7364     return stub;
7365   }
7366 
7367 #endif // INCLUDE_JFR
7368 
7369   // exception handler for upcall stubs
7370   address generate_upcall_stub_exception_handler() {
7371     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
7372     address start = __ pc();
7373 
    // The native caller has no idea how to handle exceptions,
    // so we just crash here. It is up to the callee to catch exceptions.
7376     __ verify_oop(r0);
7377     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
7378     __ blr(rscratch1);
7379     __ should_not_reach_here();
7380 
7381     return start;
7382   }
7383 
7384   // Continuation point for throwing of implicit exceptions that are
7385   // not handled in the current activation. Fabricates an exception
7386   // oop and initiates normal exception dispatching in this
7387   // frame. Since we need to preserve callee-saved values (currently
7388   // only for C2, but done for C1 as well) we need a callee-saved oop
7389   // map and therefore have to make these stubs into RuntimeStubs
7390   // rather than BufferBlobs.  If the compiler needs all registers to
7391   // be preserved between the fault point and the exception handler
7392   // then it must assume responsibility for that in
7393   // AbstractCompiler::continuation_for_implicit_null_exception or
7394   // continuation_for_implicit_division_by_zero_exception. All other
7395   // implicit exceptions (e.g., NullPointerException or
7396   // AbstractMethodError on entry) are either at call sites or
7397   // otherwise assume that stack unwinding will be initiated, so
7398   // caller saved registers were assumed volatile in the compiler.
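  //
  // Roughly, each generated stub does the following (illustrative
  // pseudocode only; the real code is in generate_throw_exception below):
  //
  //   enter();                              // build a frame
  //   set_last_Java_frame(sp, rfp, pc);
  //   runtime_entry(thread, arg1?, arg2?);  // installs the pending exception
  //   reset_last_Java_frame();
  //   leave();
  //   jump to StubRoutines::forward_exception_entry();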
7399 
7400 #undef __
7401 #define __ masm->
7402 
7403   address generate_throw_exception(const char* name,
7404                                    address runtime_entry,
7405                                    Register arg1 = noreg,
7406                                    Register arg2 = noreg) {
7407     // Information about frame layout at time of blocking runtime call.
7408     // Note that we only have to preserve callee-saved registers since
7409     // the compilers are responsible for supplying a continuation point
7410     // if they expect all registers to be preserved.
7411     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
7412     enum layout {
7413       rfp_off = 0,
7414       rfp_off2,
7415       return_off,
7416       return_off2,
7417       framesize // inclusive of return address
7418     };
7419 
7420     int insts_size = 512;
7421     int locs_size  = 64;
7422 
7423     CodeBuffer code(name, insts_size, locs_size);
7424     OopMapSet* oop_maps  = new OopMapSet();
7425     MacroAssembler* masm = new MacroAssembler(&code);
7426 
7427     address start = __ pc();
7428 
7429     // This is an inlined and slightly modified version of call_VM
7430     // which has the ability to fetch the return PC out of
7431     // thread-local storage and also sets up last_Java_sp slightly
7432     // differently than the real call_VM
7433 
7434     __ enter(); // Save FP and LR before call
7435 
7436     assert(is_even(framesize/2), "sp not 16-byte aligned");
7437 
7438     // lr and fp are already in place
7439     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
7440 
7441     int frame_complete = __ pc() - start;
7442 
7443     // Set up last_Java_sp and last_Java_fp
7444     address the_pc = __ pc();
7445     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7446 
7447     // Call runtime
7448     if (arg1 != noreg) {
7449       assert(arg2 != c_rarg1, "clobbered");
7450       __ mov(c_rarg1, arg1);
7451     }
7452     if (arg2 != noreg) {
7453       __ mov(c_rarg2, arg2);
7454     }
7455     __ mov(c_rarg0, rthread);
7456     BLOCK_COMMENT("call runtime_entry");
7457     __ mov(rscratch1, runtime_entry);
7458     __ blr(rscratch1);
7459 
7460     // Generate oop map
7461     OopMap* map = new OopMap(framesize, 0);
7462 
7463     oop_maps->add_gc_map(the_pc - start, map);
7464 
7465     __ reset_last_Java_frame(true);
7466 
7467     // Reinitialize the ptrue predicate register, in case the external runtime
7468     // call clobbers ptrue reg, as we may return to SVE compiled code.
7469     __ reinitialize_ptrue();
7470 
7471     __ leave();
7472 
7473     // check for pending exceptions
7474 #ifdef ASSERT
7475     Label L;
7476     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
7477     __ cbnz(rscratch1, L);
7478     __ should_not_reach_here();
7479     __ bind(L);
7480 #endif // ASSERT
7481     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7482 
7483     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7484     RuntimeStub* stub =
7485       RuntimeStub::new_runtime_stub(name,
7486                                     &code,
7487                                     frame_complete,
7488                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7489                                     oop_maps, false);
7490     return stub->entry_point();
7491   }
7492 
7493   class MontgomeryMultiplyGenerator : public MacroAssembler {
7494 
7495     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7496       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7497 
7498     RegSet _toSave;
7499     bool _squaring;
7500 
7501   public:
7502     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7503       : MacroAssembler(as->code()), _squaring(squaring) {
7504 
7505       // Register allocation
7506 
7507       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7508       Pa_base = *regs;       // Argument registers
7509       if (squaring)
7510         Pb_base = Pa_base;
7511       else
7512         Pb_base = *++regs;
7513       Pn_base = *++regs;
7514       Rlen= *++regs;
7515       inv = *++regs;
7516       Pm_base = *++regs;
7517 
7518                           // Working registers:
7519       Ra =  *++regs;        // The current digit of a, b, n, and m.
7520       Rb =  *++regs;
7521       Rm =  *++regs;
7522       Rn =  *++regs;
7523 
7524       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7525       Pb =  *++regs;
7526       Pm =  *++regs;
7527       Pn =  *++regs;
7528 
7529       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
7531       t2 =  *++regs;
7532 
7533       Ri =  *++regs;        // Inner and outer loop indexes.
7534       Rj =  *++regs;
7535 
7536       Rhi_ab = *++regs;     // Product registers: low and high parts
7537       Rlo_ab = *++regs;     // of a*b and m*n.
7538       Rhi_mn = *++regs;
7539       Rlo_mn = *++regs;
7540 
7541       // r19 and up are callee-saved.
7542       _toSave = RegSet::range(r19, *regs) + Pm_base;
7543     }
7544 
7545   private:
7546     void save_regs() {
7547       push(_toSave, sp);
7548     }
7549 
7550     void restore_regs() {
7551       pop(_toSave, sp);
7552     }
7553 
7554     template <typename T>
7555     void unroll_2(Register count, T block) {
7556       Label loop, end, odd;
7557       tbnz(count, 0, odd);
7558       cbz(count, end);
7559       align(16);
7560       bind(loop);
7561       (this->*block)();
7562       bind(odd);
7563       (this->*block)();
7564       subs(count, count, 2);
7565       br(Assembler::GT, loop);
7566       bind(end);
7567     }
7568 
7569     template <typename T>
7570     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7571       Label loop, end, odd;
7572       tbnz(count, 0, odd);
7573       cbz(count, end);
7574       align(16);
7575       bind(loop);
7576       (this->*block)(d, s, tmp);
7577       bind(odd);
7578       (this->*block)(d, s, tmp);
7579       subs(count, count, 2);
7580       br(Assembler::GT, loop);
7581       bind(end);
7582     }
7583 
7584     void pre1(RegisterOrConstant i) {
7585       block_comment("pre1");
7586       // Pa = Pa_base;
7587       // Pb = Pb_base + i;
7588       // Pm = Pm_base;
7589       // Pn = Pn_base + i;
7590       // Ra = *Pa;
7591       // Rb = *Pb;
7592       // Rm = *Pm;
7593       // Rn = *Pn;
7594       ldr(Ra, Address(Pa_base));
7595       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7596       ldr(Rm, Address(Pm_base));
7597       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7598       lea(Pa, Address(Pa_base));
7599       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7600       lea(Pm, Address(Pm_base));
7601       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7602 
7603       // Zero the m*n result.
7604       mov(Rhi_mn, zr);
7605       mov(Rlo_mn, zr);
7606     }
7607 
7608     // The core multiply-accumulate step of a Montgomery
7609     // multiplication.  The idea is to schedule operations as a
7610     // pipeline so that instructions with long latencies (loads and
7611     // multiplies) have time to complete before their results are
    // used.  This benefits in-order implementations of the architecture
    // the most, but out-of-order ones also benefit.
7614     void step() {
7615       block_comment("step");
7616       // MACC(Ra, Rb, t0, t1, t2);
7617       // Ra = *++Pa;
7618       // Rb = *--Pb;
7619       umulh(Rhi_ab, Ra, Rb);
7620       mul(Rlo_ab, Ra, Rb);
7621       ldr(Ra, pre(Pa, wordSize));
7622       ldr(Rb, pre(Pb, -wordSize));
7623       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7624                                        // previous iteration.
7625       // MACC(Rm, Rn, t0, t1, t2);
7626       // Rm = *++Pm;
7627       // Rn = *--Pn;
7628       umulh(Rhi_mn, Rm, Rn);
7629       mul(Rlo_mn, Rm, Rn);
7630       ldr(Rm, pre(Pm, wordSize));
7631       ldr(Rn, pre(Pn, -wordSize));
7632       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7633     }
7634 
7635     void post1() {
7636       block_comment("post1");
7637 
7638       // MACC(Ra, Rb, t0, t1, t2);
7639       // Ra = *++Pa;
7640       // Rb = *--Pb;
7641       umulh(Rhi_ab, Ra, Rb);
7642       mul(Rlo_ab, Ra, Rb);
7643       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7644       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7645 
7646       // *Pm = Rm = t0 * inv;
7647       mul(Rm, t0, inv);
7648       str(Rm, Address(Pm));
7649 
7650       // MACC(Rm, Rn, t0, t1, t2);
7651       // t0 = t1; t1 = t2; t2 = 0;
7652       umulh(Rhi_mn, Rm, Rn);
7653 
7654 #ifndef PRODUCT
7655       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7656       {
7657         mul(Rlo_mn, Rm, Rn);
7658         add(Rlo_mn, t0, Rlo_mn);
7659         Label ok;
7660         cbz(Rlo_mn, ok); {
7661           stop("broken Montgomery multiply");
7662         } bind(ok);
7663       }
7664 #endif
7665       // We have very carefully set things up so that
7666       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7667       // the lower half of Rm * Rn because we know the result already:
7668       // it must be -t0.  t0 + (-t0) must generate a carry iff
7669       // t0 != 0.  So, rather than do a mul and an adds we just set
7670       // the carry flag iff t0 is nonzero.
7671       //
7672       // mul(Rlo_mn, Rm, Rn);
7673       // adds(zr, t0, Rlo_mn);
7674       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7675       adcs(t0, t1, Rhi_mn);
7676       adc(t1, t2, zr);
7677       mov(t2, zr);
7678     }
7679 
7680     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7681       block_comment("pre2");
7682       // Pa = Pa_base + i-len;
7683       // Pb = Pb_base + len;
7684       // Pm = Pm_base + i-len;
7685       // Pn = Pn_base + len;
7686 
7687       if (i.is_register()) {
7688         sub(Rj, i.as_register(), len);
7689       } else {
7690         mov(Rj, i.as_constant());
7691         sub(Rj, Rj, len);
7692       }
7693       // Rj == i-len
7694 
7695       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7696       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7697       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7698       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7699 
7700       // Ra = *++Pa;
7701       // Rb = *--Pb;
7702       // Rm = *++Pm;
7703       // Rn = *--Pn;
7704       ldr(Ra, pre(Pa, wordSize));
7705       ldr(Rb, pre(Pb, -wordSize));
7706       ldr(Rm, pre(Pm, wordSize));
7707       ldr(Rn, pre(Pn, -wordSize));
7708 
7709       mov(Rhi_mn, zr);
7710       mov(Rlo_mn, zr);
7711     }
7712 
7713     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7714       block_comment("post2");
7715       if (i.is_constant()) {
7716         mov(Rj, i.as_constant()-len.as_constant());
7717       } else {
7718         sub(Rj, i.as_register(), len);
7719       }
7720 
7721       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7722 
7723       // As soon as we know the least significant digit of our result,
7724       // store it.
7725       // Pm_base[i-len] = t0;
7726       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7727 
7728       // t0 = t1; t1 = t2; t2 = 0;
7729       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7730       adc(t1, t2, zr);
7731       mov(t2, zr);
7732     }
7733 
7734     // A carry in t0 after Montgomery multiplication means that we
7735     // should subtract multiples of n from our result in m.  We'll
7736     // keep doing that until there is no carry.
7737     void normalize(RegisterOrConstant len) {
7738       block_comment("normalize");
7739       // while (t0)
7740       //   t0 = sub(Pm_base, Pn_base, t0, len);
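      //
      // where sub() subtracts Pn from Pm digit by digit with borrow
      // propagation and returns the carry updated by the final borrow.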
7741       Label loop, post, again;
7742       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7743       cbz(t0, post); {
7744         bind(again); {
7745           mov(i, zr);
7746           mov(cnt, len);
7747           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7748           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7749           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7750           align(16);
7751           bind(loop); {
7752             sbcs(Rm, Rm, Rn);
7753             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7754             add(i, i, 1);
7755             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7756             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7757             sub(cnt, cnt, 1);
7758           } cbnz(cnt, loop);
7759           sbc(t0, t0, zr);
7760         } cbnz(t0, again);
7761       } bind(post);
7762     }
7763 
7764     // Move memory at s to d, reversing words.
7765     //    Increments d to end of copied memory
7766     //    Destroys tmp1, tmp2
7767     //    Preserves len
7768     //    Leaves s pointing to the address which was in d at start
7769     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7770       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7771       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7772 
7773       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7774       mov(tmp1, len);
7775       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7776       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7777     }
7778     // where
7779     void reverse1(Register d, Register s, Register tmp) {
7780       ldr(tmp, pre(s, -wordSize));
7781       ror(tmp, tmp, 32);
7782       str(tmp, post(d, wordSize));
7783     }
7784 
7785     void step_squaring() {
7786       // An extra ACC
7787       step();
7788       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7789     }
7790 
7791     void last_squaring(RegisterOrConstant i) {
7792       Label dont;
7793       // if ((i & 1) == 0) {
7794       tbnz(i.as_register(), 0, dont); {
7795         // MACC(Ra, Rb, t0, t1, t2);
7796         // Ra = *++Pa;
7797         // Rb = *--Pb;
7798         umulh(Rhi_ab, Ra, Rb);
7799         mul(Rlo_ab, Ra, Rb);
7800         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7801       } bind(dont);
7802     }
7803 
7804     void extra_step_squaring() {
7805       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7806 
7807       // MACC(Rm, Rn, t0, t1, t2);
7808       // Rm = *++Pm;
7809       // Rn = *--Pn;
7810       umulh(Rhi_mn, Rm, Rn);
7811       mul(Rlo_mn, Rm, Rn);
7812       ldr(Rm, pre(Pm, wordSize));
7813       ldr(Rn, pre(Pn, -wordSize));
7814     }
7815 
7816     void post1_squaring() {
7817       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7818 
7819       // *Pm = Rm = t0 * inv;
7820       mul(Rm, t0, inv);
7821       str(Rm, Address(Pm));
7822 
7823       // MACC(Rm, Rn, t0, t1, t2);
7824       // t0 = t1; t1 = t2; t2 = 0;
7825       umulh(Rhi_mn, Rm, Rn);
7826 
7827 #ifndef PRODUCT
7828       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7829       {
7830         mul(Rlo_mn, Rm, Rn);
7831         add(Rlo_mn, t0, Rlo_mn);
7832         Label ok;
7833         cbz(Rlo_mn, ok); {
7834           stop("broken Montgomery multiply");
7835         } bind(ok);
7836       }
7837 #endif
7838       // We have very carefully set things up so that
7839       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7840       // the lower half of Rm * Rn because we know the result already:
7841       // it must be -t0.  t0 + (-t0) must generate a carry iff
7842       // t0 != 0.  So, rather than do a mul and an adds we just set
7843       // the carry flag iff t0 is nonzero.
7844       //
7845       // mul(Rlo_mn, Rm, Rn);
7846       // adds(zr, t0, Rlo_mn);
7847       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7848       adcs(t0, t1, Rhi_mn);
7849       adc(t1, t2, zr);
7850       mov(t2, zr);
7851     }
7852 
7853     void acc(Register Rhi, Register Rlo,
7854              Register t0, Register t1, Register t2) {
7855       adds(t0, t0, Rlo);
7856       adcs(t1, t1, Rhi);
7857       adc(t2, t2, zr);
7858     }
7859 
7860   public:
7861     /**
7862      * Fast Montgomery multiplication.  The derivation of the
7863      * algorithm is in A Cryptographic Library for the Motorola
7864      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7865      *
7866      * Arguments:
7867      *
7868      * Inputs for multiplication:
7869      *   c_rarg0   - int array elements a
7870      *   c_rarg1   - int array elements b
7871      *   c_rarg2   - int array elements n (the modulus)
7872      *   c_rarg3   - int length
7873      *   c_rarg4   - int inv
7874      *   c_rarg5   - int array elements m (the result)
7875      *
7876      * Inputs for squaring:
7877      *   c_rarg0   - int array elements a
7878      *   c_rarg1   - int array elements n (the modulus)
7879      *   c_rarg2   - int length
7880      *   c_rarg3   - int inv
7881      *   c_rarg4   - int array elements m (the result)
7882      *
7883      */
7884     address generate_multiply() {
7885       Label argh, nothing;
7886       bind(argh);
7887       stop("MontgomeryMultiply total_allocation must be <= 8192");
7888 
7889       align(CodeEntryAlignment);
7890       address entry = pc();
7891 
7892       cbzw(Rlen, nothing);
7893 
7894       enter();
7895 
7896       // Make room.
7897       cmpw(Rlen, 512);
7898       br(Assembler::HI, argh);
7899       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7900       andr(sp, Ra, -2 * wordSize);
7901 
7902       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7903 
7904       {
7905         // Copy input args, reversing as we go.  We use Ra as a
7906         // temporary variable.
7907         reverse(Ra, Pa_base, Rlen, t0, t1);
7908         if (!_squaring)
7909           reverse(Ra, Pb_base, Rlen, t0, t1);
7910         reverse(Ra, Pn_base, Rlen, t0, t1);
7911       }
7912 
7913       // Push all call-saved registers and also Pm_base which we'll need
7914       // at the end.
7915       save_regs();
7916 
7917 #ifndef PRODUCT
7918       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7919       {
7920         ldr(Rn, Address(Pn_base, 0));
7921         mul(Rlo_mn, Rn, inv);
7922         subs(zr, Rlo_mn, -1);
7923         Label ok;
7924         br(EQ, ok); {
7925           stop("broken inverse in Montgomery multiply");
7926         } bind(ok);
7927       }
7928 #endif
7929 
7930       mov(Pm_base, Ra);
7931 
7932       mov(t0, zr);
7933       mov(t1, zr);
7934       mov(t2, zr);
7935 
7936       block_comment("for (int i = 0; i < len; i++) {");
7937       mov(Ri, zr); {
7938         Label loop, end;
7939         cmpw(Ri, Rlen);
7940         br(Assembler::GE, end);
7941 
7942         bind(loop);
7943         pre1(Ri);
7944 
7945         block_comment("  for (j = i; j; j--) {"); {
7946           movw(Rj, Ri);
7947           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7948         } block_comment("  } // j");
7949 
7950         post1();
7951         addw(Ri, Ri, 1);
7952         cmpw(Ri, Rlen);
7953         br(Assembler::LT, loop);
7954         bind(end);
7955         block_comment("} // i");
7956       }
7957 
7958       block_comment("for (int i = len; i < 2*len; i++) {");
7959       mov(Ri, Rlen); {
7960         Label loop, end;
7961         cmpw(Ri, Rlen, Assembler::LSL, 1);
7962         br(Assembler::GE, end);
7963 
7964         bind(loop);
7965         pre2(Ri, Rlen);
7966 
7967         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7968           lslw(Rj, Rlen, 1);
7969           subw(Rj, Rj, Ri);
7970           subw(Rj, Rj, 1);
7971           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7972         } block_comment("  } // j");
7973 
7974         post2(Ri, Rlen);
7975         addw(Ri, Ri, 1);
7976         cmpw(Ri, Rlen, Assembler::LSL, 1);
7977         br(Assembler::LT, loop);
7978         bind(end);
7979       }
7980       block_comment("} // i");
7981 
7982       normalize(Rlen);
7983 
7984       mov(Ra, Pm_base);  // Save Pm_base in Ra
7985       restore_regs();  // Restore caller's Pm_base
7986 
7987       // Copy our result into caller's Pm_base
7988       reverse(Pm_base, Ra, Rlen, t0, t1);
7989 
7990       leave();
7991       bind(nothing);
7992       ret(lr);
7993 
7994       return entry;
7995     }
7996     // In C, approximately:
7997 
7998     // void
7999     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
8000     //                     julong Pn_base[], julong Pm_base[],
8001     //                     julong inv, int len) {
8002     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8003     //   julong *Pa, *Pb, *Pn, *Pm;
8004     //   julong Ra, Rb, Rn, Rm;
8005 
8006     //   int i;
8007 
8008     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8009 
8010     //   for (i = 0; i < len; i++) {
8011     //     int j;
8012 
8013     //     Pa = Pa_base;
8014     //     Pb = Pb_base + i;
8015     //     Pm = Pm_base;
8016     //     Pn = Pn_base + i;
8017 
8018     //     Ra = *Pa;
8019     //     Rb = *Pb;
8020     //     Rm = *Pm;
8021     //     Rn = *Pn;
8022 
8023     //     int iters = i;
8024     //     for (j = 0; iters--; j++) {
8025     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8026     //       MACC(Ra, Rb, t0, t1, t2);
8027     //       Ra = *++Pa;
8028     //       Rb = *--Pb;
8029     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8030     //       MACC(Rm, Rn, t0, t1, t2);
8031     //       Rm = *++Pm;
8032     //       Rn = *--Pn;
8033     //     }
8034 
8035     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
8036     //     MACC(Ra, Rb, t0, t1, t2);
8037     //     *Pm = Rm = t0 * inv;
8038     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8039     //     MACC(Rm, Rn, t0, t1, t2);
8040 
8041     //     assert(t0 == 0, "broken Montgomery multiply");
8042 
8043     //     t0 = t1; t1 = t2; t2 = 0;
8044     //   }
8045 
8046     //   for (i = len; i < 2*len; i++) {
8047     //     int j;
8048 
8049     //     Pa = Pa_base + i-len;
8050     //     Pb = Pb_base + len;
8051     //     Pm = Pm_base + i-len;
8052     //     Pn = Pn_base + len;
8053 
8054     //     Ra = *++Pa;
8055     //     Rb = *--Pb;
8056     //     Rm = *++Pm;
8057     //     Rn = *--Pn;
8058 
8059     //     int iters = len*2-i-1;
8060     //     for (j = i-len+1; iters--; j++) {
8061     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8062     //       MACC(Ra, Rb, t0, t1, t2);
8063     //       Ra = *++Pa;
8064     //       Rb = *--Pb;
8065     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8066     //       MACC(Rm, Rn, t0, t1, t2);
8067     //       Rm = *++Pm;
8068     //       Rn = *--Pn;
8069     //     }
8070 
8071     //     Pm_base[i-len] = t0;
8072     //     t0 = t1; t1 = t2; t2 = 0;
8073     //   }
8074 
8075     //   while (t0)
8076     //     t0 = sub(Pm_base, Pn_base, t0, len);
8077     // }
8078 
8079     /**
8080      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
8081      * multiplies than Montgomery multiplication so it should be up to
8082      * 25% faster.  However, its loop control is more complex and it
8083      * may actually run slower on some machines.
8084      *
8085      * Arguments:
8086      *
8087      * Inputs:
8088      *   c_rarg0   - int array elements a
8089      *   c_rarg1   - int array elements n (the modulus)
8090      *   c_rarg2   - int length
8091      *   c_rarg3   - int inv
8092      *   c_rarg4   - int array elements m (the result)
8093      *
8094      */
8095     address generate_square() {
8096       Label argh;
8097       bind(argh);
8098       stop("MontgomeryMultiply total_allocation must be <= 8192");
8099 
8100       align(CodeEntryAlignment);
8101       address entry = pc();
8102 
8103       enter();
8104 
8105       // Make room.
8106       cmpw(Rlen, 512);
8107       br(Assembler::HI, argh);
8108       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8109       andr(sp, Ra, -2 * wordSize);
8110 
8111       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8112 
8113       {
8114         // Copy input args, reversing as we go.  We use Ra as a
8115         // temporary variable.
8116         reverse(Ra, Pa_base, Rlen, t0, t1);
8117         reverse(Ra, Pn_base, Rlen, t0, t1);
8118       }
8119 
8120       // Push all call-saved registers and also Pm_base which we'll need
8121       // at the end.
8122       save_regs();
8123 
8124       mov(Pm_base, Ra);
8125 
8126       mov(t0, zr);
8127       mov(t1, zr);
8128       mov(t2, zr);
8129 
8130       block_comment("for (int i = 0; i < len; i++) {");
8131       mov(Ri, zr); {
8132         Label loop, end;
8133         bind(loop);
8134         cmp(Ri, Rlen);
8135         br(Assembler::GE, end);
8136 
8137         pre1(Ri);
8138 
8139         block_comment("for (j = (i+1)/2; j; j--) {"); {
8140           add(Rj, Ri, 1);
8141           lsr(Rj, Rj, 1);
8142           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8143         } block_comment("  } // j");
8144 
8145         last_squaring(Ri);
8146 
8147         block_comment("  for (j = i/2; j; j--) {"); {
8148           lsr(Rj, Ri, 1);
8149           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8150         } block_comment("  } // j");
8151 
8152         post1_squaring();
8153         add(Ri, Ri, 1);
8154         cmp(Ri, Rlen);
8155         br(Assembler::LT, loop);
8156 
8157         bind(end);
8158         block_comment("} // i");
8159       }
8160 
8161       block_comment("for (int i = len; i < 2*len; i++) {");
8162       mov(Ri, Rlen); {
8163         Label loop, end;
8164         bind(loop);
8165         cmp(Ri, Rlen, Assembler::LSL, 1);
8166         br(Assembler::GE, end);
8167 
8168         pre2(Ri, Rlen);
8169 
8170         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
8171           lsl(Rj, Rlen, 1);
8172           sub(Rj, Rj, Ri);
8173           sub(Rj, Rj, 1);
8174           lsr(Rj, Rj, 1);
8175           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8176         } block_comment("  } // j");
8177 
8178         last_squaring(Ri);
8179 
8180         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
8181           lsl(Rj, Rlen, 1);
8182           sub(Rj, Rj, Ri);
8183           lsr(Rj, Rj, 1);
8184           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8185         } block_comment("  } // j");
8186 
8187         post2(Ri, Rlen);
8188         add(Ri, Ri, 1);
8189         cmp(Ri, Rlen, Assembler::LSL, 1);
8190 
8191         br(Assembler::LT, loop);
8192         bind(end);
8193         block_comment("} // i");
8194       }
8195 
8196       normalize(Rlen);
8197 
8198       mov(Ra, Pm_base);  // Save Pm_base in Ra
8199       restore_regs();  // Restore caller's Pm_base
8200 
8201       // Copy our result into caller's Pm_base
8202       reverse(Pm_base, Ra, Rlen, t0, t1);
8203 
8204       leave();
8205       ret(lr);
8206 
8207       return entry;
8208     }
8209     // In C, approximately:
8210 
8211     // void
8212     // montgomery_square(julong Pa_base[], julong Pn_base[],
8213     //                   julong Pm_base[], julong inv, int len) {
8214     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8215     //   julong *Pa, *Pb, *Pn, *Pm;
8216     //   julong Ra, Rb, Rn, Rm;
8217 
8218     //   int i;
8219 
8220     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8221 
8222     //   for (i = 0; i < len; i++) {
8223     //     int j;
8224 
8225     //     Pa = Pa_base;
8226     //     Pb = Pa_base + i;
8227     //     Pm = Pm_base;
8228     //     Pn = Pn_base + i;
8229 
8230     //     Ra = *Pa;
8231     //     Rb = *Pb;
8232     //     Rm = *Pm;
8233     //     Rn = *Pn;
8234 
8235     //     int iters = (i+1)/2;
8236     //     for (j = 0; iters--; j++) {
8237     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8238     //       MACC2(Ra, Rb, t0, t1, t2);
8239     //       Ra = *++Pa;
8240     //       Rb = *--Pb;
8241     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8242     //       MACC(Rm, Rn, t0, t1, t2);
8243     //       Rm = *++Pm;
8244     //       Rn = *--Pn;
8245     //     }
8246     //     if ((i & 1) == 0) {
8247     //       assert(Ra == Pa_base[j], "must be");
8248     //       MACC(Ra, Ra, t0, t1, t2);
8249     //     }
8250     //     iters = i/2;
8251     //     assert(iters == i-j, "must be");
8252     //     for (; iters--; j++) {
8253     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8254     //       MACC(Rm, Rn, t0, t1, t2);
8255     //       Rm = *++Pm;
8256     //       Rn = *--Pn;
8257     //     }
8258 
8259     //     *Pm = Rm = t0 * inv;
8260     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8261     //     MACC(Rm, Rn, t0, t1, t2);
8262 
8263     //     assert(t0 == 0, "broken Montgomery multiply");
8264 
8265     //     t0 = t1; t1 = t2; t2 = 0;
8266     //   }
8267 
8268     //   for (i = len; i < 2*len; i++) {
8269     //     int start = i-len+1;
8270     //     int end = start + (len - start)/2;
8271     //     int j;
8272 
8273     //     Pa = Pa_base + i-len;
8274     //     Pb = Pa_base + len;
8275     //     Pm = Pm_base + i-len;
8276     //     Pn = Pn_base + len;
8277 
8278     //     Ra = *++Pa;
8279     //     Rb = *--Pb;
8280     //     Rm = *++Pm;
8281     //     Rn = *--Pn;
8282 
8283     //     int iters = (2*len-i-1)/2;
8284     //     assert(iters == end-start, "must be");
8285     //     for (j = start; iters--; j++) {
8286     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8287     //       MACC2(Ra, Rb, t0, t1, t2);
8288     //       Ra = *++Pa;
8289     //       Rb = *--Pb;
8290     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8291     //       MACC(Rm, Rn, t0, t1, t2);
8292     //       Rm = *++Pm;
8293     //       Rn = *--Pn;
8294     //     }
8295     //     if ((i & 1) == 0) {
8296     //       assert(Ra == Pa_base[j], "must be");
8297     //       MACC(Ra, Ra, t0, t1, t2);
8298     //     }
8299     //     iters =  (2*len-i)/2;
8300     //     assert(iters == len-j, "must be");
8301     //     for (; iters--; j++) {
8302     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8303     //       MACC(Rm, Rn, t0, t1, t2);
8304     //       Rm = *++Pm;
8305     //       Rn = *--Pn;
8306     //     }
8307     //     Pm_base[i-len] = t0;
8308     //     t0 = t1; t1 = t2; t2 = 0;
8309     //   }
8310 
8311     //   while (t0)
8312     //     t0 = sub(Pm_base, Pn_base, t0, len);
8313     // }
8314   };
8315 
8316 
  // Called from the interpreter or compiled code either to load the
  // multiple values of the inline type instance being returned into
  // registers, or to store the returned values into a newly allocated
  // inline type instance.
8321   address generate_return_value_stub(address destination, const char* name, bool has_res) {
    // We need to save all registers the calling convention may use so
    // that the runtime call can read or update those registers. This
    // needs to be in sync with SharedRuntime::java_return_convention().
8325     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
8326     enum layout {
8327       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
8328       j_rarg6_off, j_rarg6_2,
8329       j_rarg5_off, j_rarg5_2,
8330       j_rarg4_off, j_rarg4_2,
8331       j_rarg3_off, j_rarg3_2,
8332       j_rarg2_off, j_rarg2_2,
8333       j_rarg1_off, j_rarg1_2,
8334       j_rarg0_off, j_rarg0_2,
8335 
8336       j_farg7_off, j_farg7_2,
8337       j_farg6_off, j_farg6_2,
8338       j_farg5_off, j_farg5_2,
8339       j_farg4_off, j_farg4_2,
8340       j_farg3_off, j_farg3_2,
8341       j_farg2_off, j_farg2_2,
8342       j_farg1_off, j_farg1_2,
8343       j_farg0_off, j_farg0_2,
8344 
8345       rfp_off, rfp_off2,
8346       return_off, return_off2,
8347 
8348       framesize // inclusive of return address
8349     };
8350 
8351     CodeBuffer code(name, 512, 64);
8352     MacroAssembler* masm = new MacroAssembler(&code);
8353 
8354     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
8355     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
8356     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
8357     int frame_size_in_words = frame_size_in_bytes / wordSize;
8358 
8359     OopMapSet* oop_maps = new OopMapSet();
8360     OopMap* map = new OopMap(frame_size_in_slots, 0);
8361 
8362     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
8363     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
8364     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
8365     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
8366     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
8367     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
8368     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
8369     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
8370 
8371     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
8372     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
8373     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
8374     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
8375     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
8376     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
8377     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
8378     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
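    // Recording where each Java return register is saved allows the runtime
    // to locate, and a GC during the call below to update, any oops held in
    // those registers.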
8379 
8380     address start = __ pc();
8381 
8382     __ enter(); // Save FP and LR before call
8383 
8384     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
8385     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
8386     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
8387     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
8388 
8389     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
8390     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
8391     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
8392     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
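    // The push order above leaves each register at the slot assigned to it in
    // the layout enum: the last pair pushed (j_rarg7/j_rarg6) ends up at the
    // lowest address, matching j_rarg7_off == 0, and the rfp/lr pair saved by
    // enter() lands at rfp_off/return_off.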
8393 
8394     int frame_complete = __ offset();
8395 
8396     // Set up last_Java_sp and last_Java_fp
8397     address the_pc = __ pc();
8398     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
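    // Record the frame anchor so that the runtime, and any stack walk
    // triggered during the call (e.g. by a GC), can traverse this frame.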
8399 
8400     // Call runtime
8401     __ mov(c_rarg1, r0);
8402     __ mov(c_rarg0, rthread);
8403 
8404     __ mov(rscratch1, destination);
8405     __ blr(rscratch1);
8406 
8407     oop_maps->add_gc_map(the_pc - start, map);
8408 
8409     __ reset_last_Java_frame(false);
8410 
8411     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
8412     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
8413     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
8414     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
8415 
8416     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
8417     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
8418     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
8419     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
8420 
8421     __ leave();
8422 
8423     // check for pending exceptions
8424     Label pending;
8425     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
8426     __ cbnz(rscratch1, pending);
8427 
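    // The runtime call passes its result back through the thread (vm_result);
    // fetch it into r0 for the caller, e.g. the newly buffered inline type
    // instance in the store case.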
8428     if (has_res) {
8429       __ get_vm_result(r0, rthread);
8430     }
8431 
8432     __ ret(lr);
8433 
8434     __ bind(pending);
8435     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
8436 
8437     // -------------
8438     // make sure all code is generated
8439     masm->flush();
8440 
8441     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
8442     return stub->entry_point();
8443   }
8444 
8445   // Initialization
8446   void generate_initial_stubs() {
    // Generate the initial stubs and initialize the entry points.

    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
8454 
8455     StubRoutines::_forward_exception_entry = generate_forward_exception();
8456 
8457     StubRoutines::_call_stub_entry =
8458       generate_call_stub(StubRoutines::_call_stub_return_address);
8459 
    // Referenced by megamorphic call sites.
8461     StubRoutines::_catch_exception_entry = generate_catch_exception();
8462 
8463     // Build this early so it's available for the interpreter.
8464     StubRoutines::_throw_StackOverflowError_entry =
8465       generate_throw_exception("StackOverflowError throw_exception",
8466                                CAST_FROM_FN_PTR(address,
8467                                                 SharedRuntime::throw_StackOverflowError));
8468     StubRoutines::_throw_delayed_StackOverflowError_entry =
8469       generate_throw_exception("delayed StackOverflowError throw_exception",
8470                                CAST_FROM_FN_PTR(address,
8471                                                 SharedRuntime::throw_delayed_StackOverflowError));
8472 
8473     // Initialize table for copy memory (arraycopy) check.
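    // The table records the code ranges of the unsafe copy stubs so that a
    // fault raised inside one of them can be recognized and handled.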
8474     if (UnsafeCopyMemory::_table == nullptr) {
8475       UnsafeCopyMemory::create_table(8);
8476     }
8477 
8478     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stub, which uses it.
8480       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8481       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8482     }
8483 
8484     if (UseCRC32CIntrinsics) {
8485       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8486     }
8487 
8488     // Disabled until JDK-8210858 is fixed
8489     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
8490     //   StubRoutines::_dlog = generate_dlog();
8491     // }
8492 
8493     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8494       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8495     }
8496 
8497     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8498       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8499     }
8500 
8501     if (InlineTypeReturnedAsFields) {
8502       StubRoutines::_load_inline_type_fields_in_regs =
8503          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
8504       StubRoutines::_store_inline_type_fields_to_buf =
8505          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
8506     }
8507   }
8508 
8509   void generate_continuation_stubs() {
8510     // Continuation stubs:
8511     StubRoutines::_cont_thaw          = generate_cont_thaw();
8512     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8513     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8514 
8515     JFR_ONLY(generate_jfr_stubs();)
8516   }
8517 
8518 #if INCLUDE_JFR
8519   void generate_jfr_stubs() {
8520     StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();
8521     StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();
8522     StubRoutines::_jfr_return_lease_stub = generate_jfr_return_lease();
8523     StubRoutines::_jfr_return_lease = StubRoutines::_jfr_return_lease_stub->entry_point();
8524   }
8525 #endif // INCLUDE_JFR
8526 
8527   void generate_final_stubs() {
8528     // support for verify_oop (must happen after universe_init)
8529     if (VerifyOops) {
8530       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8531     }
8532     StubRoutines::_throw_AbstractMethodError_entry =
8533       generate_throw_exception("AbstractMethodError throw_exception",
8534                                CAST_FROM_FN_PTR(address,
8535                                                 SharedRuntime::
8536                                                 throw_AbstractMethodError));
8537 
8538     StubRoutines::_throw_IncompatibleClassChangeError_entry =
8539       generate_throw_exception("IncompatibleClassChangeError throw_exception",
8540                                CAST_FROM_FN_PTR(address,
8541                                                 SharedRuntime::
8542                                                 throw_IncompatibleClassChangeError));
8543 
8544     StubRoutines::_throw_NullPointerException_at_call_entry =
8545       generate_throw_exception("NullPointerException at call throw_exception",
8546                                CAST_FROM_FN_PTR(address,
8547                                                 SharedRuntime::
8548                                                 throw_NullPointerException_at_call));
8549 
8550     // arraycopy stubs used by compilers
8551     generate_arraycopy_stubs();
8552 
8553     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8554     if (bs_nm != nullptr) {
8555       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
8556     }
8557 
8558     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8559 
8560     if (UsePoly1305Intrinsics) {
8561       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
8562     }
8563 
8564 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8565 
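    // Without a compile-time guarantee of LSE atomics, generate stubs that
    // select the best atomic instructions for the running CPU; these are
    // installed in place of the *_default_impl fallbacks defined at the end
    // of this file.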
8566     generate_atomic_entry_points();
8567 
#endif // LINUX && !__ARM_FEATURE_ATOMICS
8569 
8570     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
8571 
    StubRoutines::aarch64::set_completed(); // Indicate that the arraycopy and zero_blocks stubs have been generated
8573   }
8574 
8575   void generate_compiler_stubs() {
8576 #if COMPILER2_OR_JVMCI
8577 
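    // Only needed when SVE is not available: SVE can materialize an index
    // vector directly with its INDEX instruction, so no constant table is
    // required there.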
8578     if (UseSVE == 0) {
8579       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
8580     }
8581 
8582     // array equals stub for large arrays.
8583     if (!UseSimpleArrayEquals) {
8584       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8585     }
8586 
8587     // byte_array_inflate stub for large arrays.
8588     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8589 
8590     // countPositives stub for large arrays.
8591     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8592 
8593     generate_compare_long_strings();
8594 
8595     generate_string_indexof_stubs();
8596 
8597 #ifdef COMPILER2
8598     if (UseMultiplyToLenIntrinsic) {
8599       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8600     }
8601 
8602     if (UseSquareToLenIntrinsic) {
8603       StubRoutines::_squareToLen = generate_squareToLen();
8604     }
8605 
8606     if (UseMulAddIntrinsic) {
8607       StubRoutines::_mulAdd = generate_mulAdd();
8608     }
8609 
8610     if (UseSIMDForBigIntegerShiftIntrinsics) {
8611       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8612       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8613     }
8614 
8615     if (UseMontgomeryMultiplyIntrinsic) {
8616       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
8617       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8618       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8619     }
8620 
8621     if (UseMontgomerySquareIntrinsic) {
8622       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
8623       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8624       // We use generate_multiply() rather than generate_square()
8625       // because it's faster for the sizes of modulus we care about.
8626       StubRoutines::_montgomerySquare = g.generate_multiply();
8627     }
8628 #endif // COMPILER2
8629 
8630     if (UseChaCha20Intrinsics) {
8631       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
8632     }
8633 
8634     if (UseBASE64Intrinsics) {
8635         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8636         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8637     }
8638 
8639     // data cache line writeback
8640     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8641     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8642 
8643     if (UseAESIntrinsics) {
8644       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8645       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8646       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8647       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8648       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8649     }
8650     if (UseGHASHIntrinsics) {
8651       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8652       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8653     }
8654     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8655       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8656     }
8657 
8658     if (UseMD5Intrinsics) {
8659       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
8660       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
8661     }
8662     if (UseSHA1Intrinsics) {
8663       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
8664       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
8665     }
8666     if (UseSHA256Intrinsics) {
8667       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
8668       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
8669     }
8670     if (UseSHA512Intrinsics) {
8671       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
8672       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
8673     }
8674     if (UseSHA3Intrinsics) {
8675       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
8676       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
8677     }
8678 
8679     // generate Adler32 intrinsics code
8680     if (UseAdler32Intrinsics) {
8681       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8682     }
8683 #endif // COMPILER2_OR_JVMCI
8684   }
8685 
8686  public:
8687   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
8688     switch(kind) {
8689     case Initial_stubs:
8690       generate_initial_stubs();
8691       break;
    case Continuation_stubs:
8693       generate_continuation_stubs();
8694       break;
8695     case Compiler_stubs:
8696       generate_compiler_stubs();
8697       break;
8698     case Final_stubs:
8699       generate_final_stubs();
8700       break;
8701     default:
8702       fatal("unexpected stubs kind: %d", kind);
8703       break;
8704     };
8705   }
8706 }; // end class declaration
8707 
8708 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
8709   StubGenerator g(code, kind);
8710 }
8711 
8712 
8713 #if defined (LINUX)
8714 
8715 // Define pointers to atomic stubs and initialize them to point to the
8716 // code in atomic_aarch64.S.
8717 
8718 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8719   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8720     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8721   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8722     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
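
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) expands to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_relaxed_impl
//     = aarch64_atomic_fetch_add_4_relaxed_default_impl;
//
// The stubs produced by generate_atomic_entry_points() above may later
// re-point these *_impl pointers at generated code better suited to the
// running CPU.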
8723 
8724 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8725 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8726 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8727 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8728 DEFAULT_ATOMIC_OP(xchg, 4, )
8729 DEFAULT_ATOMIC_OP(xchg, 8, )
8730 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8731 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8732 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8733 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8734 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8735 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8736 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8737 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8738 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8739 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8740 
8741 #undef DEFAULT_ATOMIC_OP
8742 
8743 #endif // LINUX