1 /*
   2  * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "prims/upcallLinker.hpp"
  45 #include "runtime/atomic.hpp"
  46 #include "runtime/continuation.hpp"
  47 #include "runtime/continuationEntry.inline.hpp"
  48 #include "runtime/frame.inline.hpp"
  49 #include "runtime/handles.inline.hpp"
  50 #include "runtime/javaThread.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/stubCodeGenerator.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "utilities/align.hpp"
  55 #include "utilities/checkedCast.hpp"
  56 #include "utilities/globalDefinitions.hpp"
  57 #include "utilities/powerOfTwo.hpp"
  58 #ifdef COMPILER2
  59 #include "opto/runtime.hpp"
  60 #endif
  61 #if INCLUDE_ZGC
  62 #include "gc/z/zThreadLocalData.hpp"
  63 #endif
  64 
  65 // Declaration and definition of StubGenerator (no .hpp file).
  66 // For a more detailed description of the stub routine structure
  67 // see the comment in stubRoutines.hpp
  68 
  69 #undef __
  70 #define __ _masm->
  71 
  72 #ifdef PRODUCT
  73 #define BLOCK_COMMENT(str) /* nothing */
  74 #else
  75 #define BLOCK_COMMENT(str) __ block_comment(str)
  76 #endif
  77 
  78 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  79 
  80 // Stub Code definitions
  81 
  82 class StubGenerator: public StubCodeGenerator {
  83  private:
  84 
  85 #ifdef PRODUCT
  86 #define inc_counter_np(counter) ((void)0)
  87 #else
  88   void inc_counter_np_(uint& counter) {
  89     __ lea(rscratch2, ExternalAddress((address)&counter));
  90     __ ldrw(rscratch1, Address(rscratch2));
  91     __ addw(rscratch1, rscratch1, 1);
  92     __ strw(rscratch1, Address(rscratch2));
  93   }
  94 #define inc_counter_np(counter) \
  95   BLOCK_COMMENT("inc_counter " #counter); \
  96   inc_counter_np_(counter);
  97 #endif
  98 
  99   // Call stubs are used to call Java from C
 100   //
 101   // Arguments:
 102   //    c_rarg0:   call wrapper address                   address
 103   //    c_rarg1:   result                                 address
 104   //    c_rarg2:   result type                            BasicType
 105   //    c_rarg3:   method                                 Method*
 106   //    c_rarg4:   (interpreter) entry point              address
 107   //    c_rarg5:   parameters                             intptr_t*
 108   //    c_rarg6:   parameter size (in words)              int
 109   //    c_rarg7:   thread                                 Thread*
 110   //
 111   // There is no return from the stub itself as any Java result
 112   // is written to result
 113   //
 114   // we save r30 (lr) as the return PC at the base of the frame and
 115   // link r29 (fp) below it as the frame pointer, installing sp (r31)
 116   // into fp.
 117   //
 118   // we save r0-r7, which accounts for all the c arguments.
 119   //
 120   // TODO: strictly do we need to save them all? they are treated as
 121   // volatile by C so could we omit saving the ones we are going to
 122   // place in global registers (thread? method?) or those we only use
 123   // during setup of the Java call?
 124   //
 125   // we don't need to save r8 which C uses as an indirect result location
 126   // return register.
 127   //
 128   // we don't need to save r9-r15 which both C and Java treat as
 129   // volatile
 130   //
 131   // we don't need to save r16-r18 because Java does not use them
 132   //
 133   // we save r19-r28 which Java uses as scratch registers and C
 134   // expects to be callee-save
 135   //
 136   // we save the bottom 64 bits of each value stored in v8-v15; it is
 137   // the responsibility of the caller to preserve larger values.
 138   //
 139   // so the stub frame looks like this when we enter Java code
 140   //
 141   //     [ return_from_Java     ] <--- sp
 142   //     [ argument word n      ]
 143   //      ...
 144   // -29 [ argument word 1      ]
 145   // -28 [ saved Floating-point Control Register ]
 146   // -26 [ saved v15            ] <--- sp_after_call
 147   // -25 [ saved v14            ]
 148   // -24 [ saved v13            ]
 149   // -23 [ saved v12            ]
 150   // -22 [ saved v11            ]
 151   // -21 [ saved v10            ]
 152   // -20 [ saved v9             ]
 153   // -19 [ saved v8             ]
 154   // -18 [ saved r28            ]
 155   // -17 [ saved r27            ]
 156   // -16 [ saved r26            ]
 157   // -15 [ saved r25            ]
 158   // -14 [ saved r24            ]
 159   // -13 [ saved r23            ]
 160   // -12 [ saved r22            ]
 161   // -11 [ saved r21            ]
 162   // -10 [ saved r20            ]
 163   //  -9 [ saved r19            ]
 164   //  -8 [ call wrapper    (r0) ]
 165   //  -7 [ result          (r1) ]
 166   //  -6 [ result type     (r2) ]
 167   //  -5 [ method          (r3) ]
 168   //  -4 [ entry point     (r4) ]
 169   //  -3 [ parameters      (r5) ]
 170   //  -2 [ parameter size  (r6) ]
 171   //  -1 [ thread (r7)          ]
 172   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 173   //   1 [ saved lr       (r30) ]
 174 
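  // For orientation, the C++ side of this contract looks roughly like
  // the sketch below (the authoritative declarations are the CallStub
  // typedef in stubRoutines.hpp and the call site in
  // JavaCalls::call_helper):
  //
  //   typedef void (*CallStub)(address   link,            // call wrapper
  //                            intptr_t* result,
  //                            int       result_type,     // BasicType
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //
  //   StubRoutines::call_stub()((address)&link, result_val_address,
  //                             result_type, method(), entry_point,
  //                             args->parameters(),
  //                             args->size_of_parameters(), CHECK);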
 175   // Call stub stack layout word offsets from fp
 176   enum call_stub_layout {
 177     sp_after_call_off  = -28,
 178 
 179     fpcr_off           = sp_after_call_off,
 180     d15_off            = -26,
 181     d13_off            = -24,
 182     d11_off            = -22,
 183     d9_off             = -20,
 184 
 185     r28_off            = -18,
 186     r26_off            = -16,
 187     r24_off            = -14,
 188     r22_off            = -12,
 189     r20_off            = -10,
 190     call_wrapper_off   =  -8,
 191     result_off         =  -7,
 192     result_type_off    =  -6,
 193     method_off         =  -5,
 194     entry_point_off    =  -4,
 195     parameter_size_off =  -2,
 196     thread_off         =  -1,
 197     fp_f               =   0,
 198     retaddr_off        =   1,
 199   };
 200 
 201   address generate_call_stub(address& return_address) {
 202     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 203            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 204            "adjust this code");
 205 
 206     StubCodeMark mark(this, "StubRoutines", "call_stub");
 207     address start = __ pc();
 208 
 209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
 210 
 211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
 212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 213     const Address result        (rfp, result_off         * wordSize);
 214     const Address result_type   (rfp, result_type_off    * wordSize);
 215     const Address method        (rfp, method_off         * wordSize);
 216     const Address entry_point   (rfp, entry_point_off    * wordSize);
 217     const Address parameter_size(rfp, parameter_size_off * wordSize);
 218 
 219     const Address thread        (rfp, thread_off         * wordSize);
 220 
 221     const Address d15_save      (rfp, d15_off * wordSize);
 222     const Address d13_save      (rfp, d13_off * wordSize);
 223     const Address d11_save      (rfp, d11_off * wordSize);
 224     const Address d9_save       (rfp, d9_off * wordSize);
 225 
 226     const Address r28_save      (rfp, r28_off * wordSize);
 227     const Address r26_save      (rfp, r26_off * wordSize);
 228     const Address r24_save      (rfp, r24_off * wordSize);
 229     const Address r22_save      (rfp, r22_off * wordSize);
 230     const Address r20_save      (rfp, r20_off * wordSize);
 231 
 232     // stub code
 233 
 234     address aarch64_entry = __ pc();
 235 
 236     // set up frame and move sp to end of save area
 237     __ enter();
 238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 239 
 240     // save register parameters and Java scratch/global registers
 241     // n.b. we save thread even though it gets installed in
 242     // rthread because we want to sanity check rthread later
 243     __ str(c_rarg7,  thread);
 244     __ strw(c_rarg6, parameter_size);
 245     __ stp(c_rarg4, c_rarg5,  entry_point);
 246     __ stp(c_rarg2, c_rarg3,  result_type);
 247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 248 
 249     __ stp(r20, r19,   r20_save);
 250     __ stp(r22, r21,   r22_save);
 251     __ stp(r24, r23,   r24_save);
 252     __ stp(r26, r25,   r26_save);
 253     __ stp(r28, r27,   r28_save);
 254 
 255     __ stpd(v9,  v8,   d9_save);
 256     __ stpd(v11, v10,  d11_save);
 257     __ stpd(v13, v12,  d13_save);
 258     __ stpd(v15, v14,  d15_save);
 259 
 260     __ get_fpcr(rscratch1);
 261     __ str(rscratch1, fpcr_save);
 262     // Set FPCR to the state we need. We do want Round to Nearest. We
 263     // don't want non-IEEE rounding modes or floating-point traps.
 264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
 265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
 266     __ set_fpcr(rscratch1);
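    // For reference (FPCR layout in the Arm ARM): bit 25 is DN (default
    // NaN), bit 24 is FZ (flush-to-zero), bits 23:22 are RMode (00 =
    // round to nearest) and bits 12:8 are the IXE/UFE/OFE/DZE/IOE
    // trap-enable bits, so the two bfi instructions above leave the
    // stub running with round-to-nearest and all FP traps disabled.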
 267 
 268     // install Java thread in global register now we have saved
 269     // whatever value it held
 270     __ mov(rthread, c_rarg7);
 271     // And method
 272     __ mov(rmethod, c_rarg3);
 273 
 274     // set up the heapbase register
 275     __ reinit_heapbase();
 276 
 277 #ifdef ASSERT
 278     // make sure we have no pending exceptions
 279     {
 280       Label L;
 281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 282       __ cmp(rscratch1, (u1)NULL_WORD);
 283       __ br(Assembler::EQ, L);
 284       __ stop("StubRoutines::call_stub: entered with pending exception");
 285       __ BIND(L);
 286     }
 287 #endif
 288     // pass parameters if any
 289     __ mov(esp, sp);
 290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 291     __ andr(sp, rscratch1, -2 * wordSize);
 292 
 293     BLOCK_COMMENT("pass parameters if any");
 294     Label parameters_done;
 295     // parameter count is still in c_rarg6
 296     // and parameter pointer identifying param 1 is in c_rarg5
 297     __ cbzw(c_rarg6, parameters_done);
 298 
 299     address loop = __ pc();
 300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 301     __ subsw(c_rarg6, c_rarg6, 1);
 302     __ push(rscratch1);
 303     __ br(Assembler::GT, loop);
 304 
 305     __ BIND(parameters_done);
 306 
 307     // call Java entry -- passing Method* and current sp
 308     //      rmethod: Method*
 309     //      r19_sender_sp: sender sp
 310     BLOCK_COMMENT("call Java function");
 311     __ mov(r19_sender_sp, sp);
 312     __ blr(c_rarg4);
 313 
 314     // we do this here because the notify will already have been done
 315     // if we get to the next instruction via an exception
 316     //
 317     // n.b. adding this instruction here affects the calculation of
 318     // whether or not a routine returns to the call stub (used when
 319     // doing stack walks) since the normal test is to check the return
 320     // pc against the address saved below. so we may need to allow for
 321     // this extra instruction in the check.
 322 
 323     // save current address for use by exception handling code
 324 
 325     return_address = __ pc();
 326 
 327     // store result depending on type (everything that is not
 328     // T_OBJECT, T_PRIMITIVE_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 329     // n.b. this assumes Java returns an integral result in r0
 330     // and a floating result in j_farg0
 331     // All of j_rargN may be used to return inline type fields so be careful
 332     // not to clobber those.
 333     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
 334     // assignment of Rresult below.
 335     Register Rresult = r14, Rresult_type = r15;
 336     __ ldr(Rresult, result);
 337     Label is_long, is_float, is_double, check_prim, exit;
 338     __ ldr(Rresult_type, result_type);
 339     __ cmp(Rresult_type, (u1)T_OBJECT);
 340     __ br(Assembler::EQ, check_prim);
 341     __ cmp(Rresult_type, (u1)T_PRIMITIVE_OBJECT);
 342     __ br(Assembler::EQ, check_prim);
 343     __ cmp(Rresult_type, (u1)T_LONG);
 344     __ br(Assembler::EQ, is_long);
 345     __ cmp(Rresult_type, (u1)T_FLOAT);
 346     __ br(Assembler::EQ, is_float);
 347     __ cmp(Rresult_type, (u1)T_DOUBLE);
 348     __ br(Assembler::EQ, is_double);
 349 
 350     // handle T_INT case
 351     __ strw(r0, Address(Rresult));
 352 
 353     __ BIND(exit);
 354 
 355     // pop parameters
 356     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 357 
 358 #ifdef ASSERT
 359     // verify that threads correspond
 360     {
 361       Label L, S;
 362       __ ldr(rscratch1, thread);
 363       __ cmp(rthread, rscratch1);
 364       __ br(Assembler::NE, S);
 365       __ get_thread(rscratch1);
 366       __ cmp(rthread, rscratch1);
 367       __ br(Assembler::EQ, L);
 368       __ BIND(S);
 369       __ stop("StubRoutines::call_stub: threads must correspond");
 370       __ BIND(L);
 371     }
 372 #endif
 373 
 374     __ pop_cont_fastpath(rthread);
 375 
 376     // restore callee-save registers
 377     __ ldpd(v15, v14,  d15_save);
 378     __ ldpd(v13, v12,  d13_save);
 379     __ ldpd(v11, v10,  d11_save);
 380     __ ldpd(v9,  v8,   d9_save);
 381 
 382     __ ldp(r28, r27,   r28_save);
 383     __ ldp(r26, r25,   r26_save);
 384     __ ldp(r24, r23,   r24_save);
 385     __ ldp(r22, r21,   r22_save);
 386     __ ldp(r20, r19,   r20_save);
 387 
 388     // restore fpcr
 389     __ ldr(rscratch1,  fpcr_save);
 390     __ set_fpcr(rscratch1);
 391 
 392     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 393     __ ldrw(c_rarg2, result_type);
 394     __ ldr(c_rarg3,  method);
 395     __ ldp(c_rarg4, c_rarg5,  entry_point);
 396     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 397 
 398     // leave frame and return to caller
 399     __ leave();
 400     __ ret(lr);
 401 
 402     // handle return types different from T_INT
 403     __ BIND(check_prim);
 404     if (InlineTypeReturnedAsFields) {
 405       // Check for scalarized return value
 406       __ tbz(r0, 0, is_long);
 407       // Load pack handler address
 408       __ andr(rscratch1, r0, -2);
 409       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
 410       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
 411       __ blr(rscratch1);
 412       __ b(exit);
 413     }
 414 
 415     __ BIND(is_long);
 416     __ str(r0, Address(Rresult, 0));
 417     __ br(Assembler::AL, exit);
 418 
 419     __ BIND(is_float);
 420     __ strs(j_farg0, Address(Rresult, 0));
 421     __ br(Assembler::AL, exit);
 422 
 423     __ BIND(is_double);
 424     __ strd(j_farg0, Address(Rresult, 0));
 425     __ br(Assembler::AL, exit);
 426 
 427     return start;
 428   }
 429 
 430   // Return point for a Java call if there's an exception thrown in
 431   // Java code.  The exception is caught and transformed into a
 432   // pending exception stored in JavaThread that can be tested from
 433   // within the VM.
 434   //
 435   // Note: Usually the parameters are removed by the callee. In case
 436   // of an exception crossing an activation frame boundary, that is
 437   // not the case if the callee is compiled code => need to setup the
 438   // rsp.
 439   //
 440   // r0: exception oop
 441 
 442   address generate_catch_exception() {
 443     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 444     address start = __ pc();
 445 
 446     // same as in generate_call_stub():
 447     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 448     const Address thread        (rfp, thread_off         * wordSize);
 449 
 450 #ifdef ASSERT
 451     // verify that threads correspond
 452     {
 453       Label L, S;
 454       __ ldr(rscratch1, thread);
 455       __ cmp(rthread, rscratch1);
 456       __ br(Assembler::NE, S);
 457       __ get_thread(rscratch1);
 458       __ cmp(rthread, rscratch1);
 459       __ br(Assembler::EQ, L);
 460       __ bind(S);
 461       __ stop("StubRoutines::catch_exception: threads must correspond");
 462       __ bind(L);
 463     }
 464 #endif
 465 
 466     // set pending exception
 467     __ verify_oop(r0);
 468 
 469     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 470     __ mov(rscratch1, (address)__FILE__);
 471     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 472     __ movw(rscratch1, (int)__LINE__);
 473     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 474 
 475     // complete return to VM
 476     assert(StubRoutines::_call_stub_return_address != nullptr,
 477            "_call_stub_return_address must have been generated before");
 478     __ b(StubRoutines::_call_stub_return_address);
 479 
 480     return start;
 481   }
 482 
 483   // Continuation point for runtime calls returning with a pending
 484   // exception.  The pending exception check happened in the runtime
 485   // or native call stub.  The pending exception in Thread is
 486   // converted into a Java-level exception.
 487   //
 488   // Contract with Java-level exception handlers:
 489   // r0: exception
 490   // r3: throwing pc
 491   //
 492   // NOTE: At entry of this stub, exception-pc must be in LR !!
 493 
 494   // NOTE: this is always used as a jump target within generated code
 495   // so it just needs to be generated code with no prolog
 496 
 497   address generate_forward_exception() {
 498     StubCodeMark mark(this, "StubRoutines", "forward exception");
 499     address start = __ pc();
 500 
 501     // Upon entry, LR points to the return address returning into
 502     // Java (interpreted or compiled) code; i.e., the return address
 503     // becomes the throwing pc.
 504     //
 505     // Arguments pushed before the runtime call are still on the stack
 506     // but the exception handler will reset the stack pointer ->
 507     // ignore them.  A potential result in registers can be ignored as
 508     // well.
 509 
 510 #ifdef ASSERT
 511     // make sure this code is only executed if there is a pending exception
 512     {
 513       Label L;
 514       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 515       __ cbnz(rscratch1, L);
 516       __ stop("StubRoutines::forward exception: no pending exception (1)");
 517       __ bind(L);
 518     }
 519 #endif
 520 
 521     // compute exception handler into r19
 522 
 523     // call the VM to find the handler address associated with the
 524     // caller address. pass thread in r0 and caller pc (ret address)
 525     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 526     // the stack.
 527     __ mov(c_rarg1, lr);
 528     // lr will be trashed by the VM call so we move it to R19
 529     // (callee-saved) because we also need to pass it to the handler
 530     // returned by this call.
 531     __ mov(r19, lr);
 532     BLOCK_COMMENT("call exception_handler_for_return_address");
 533     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 534                          SharedRuntime::exception_handler_for_return_address),
 535                     rthread, c_rarg1);
 536     // Reinitialize the ptrue predicate register, in case the external runtime
 537     // call clobbers ptrue reg, as we may return to SVE compiled code.
 538     __ reinitialize_ptrue();
 539 
 540     // we should not really care that lr is no longer the callee
 541     // address. we saved the value the handler needs in r19 so we can
 542     // just copy it to r3. however, the C2 handler will push its own
 543     // frame and then call into the VM, and the VM code asserts that
 544     // the PC for the frame above the handler belongs to a compiled
 545     // Java method. So, we restore lr here to satisfy that assert.
 546     __ mov(lr, r19);
 547     // setup r0 & r3 & clear pending exception
 548     __ mov(r3, r19);
 549     __ mov(r19, r0);
 550     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 551     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 552 
 553 #ifdef ASSERT
 554     // make sure exception is set
 555     {
 556       Label L;
 557       __ cbnz(r0, L);
 558       __ stop("StubRoutines::forward exception: no pending exception (2)");
 559       __ bind(L);
 560     }
 561 #endif
 562 
 563     // continue at exception handler
 564     // r0: exception
 565     // r3: throwing pc
 566     // r19: exception handler
 567     __ verify_oop(r0);
 568     __ br(r19);
 569 
 570     return start;
 571   }
 572 
 573   // Non-destructive plausibility checks for oops
 574   //
 575   // Arguments:
 576   //    r0: oop to verify
 577   //    rscratch1: error message
 578   //
 579   // Stack after saving c_rarg3:
 580   //    [tos + 0]: saved c_rarg3
 581   //    [tos + 1]: saved c_rarg2
 582   //    [tos + 2]: saved lr
 583   //    [tos + 3]: saved rscratch2
 584   //    [tos + 4]: saved r0
 585   //    [tos + 5]: saved rscratch1
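  // This stub is not called directly; with -XX:+VerifyOops,
  // MacroAssembler::verify_oop moves the oop into r0 and the address of
  // the error message into rscratch1, then calls here indirectly through
  // StubRoutines::verify_oop_subroutine_entry. A rough sketch of such a
  // call site (see macroAssembler_aarch64.cpp for the real sequence):
  //
  //   __ mov(r0, oop_to_check);
  //   __ movptr(rscratch1, (uintptr_t)(address)error_message);
  //   __ lea(rscratch2, ExternalAddress(
  //              StubRoutines::verify_oop_subroutine_entry_address()));
  //   __ ldr(rscratch2, Address(rscratch2));
  //   __ blr(rscratch2);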
 586   address generate_verify_oop() {
 587 
 588     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 589     address start = __ pc();
 590 
 591     Label exit, error;
 592 
 593     // save c_rarg2 and c_rarg3
 594     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 595 
 596     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 597     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 598     __ ldr(c_rarg3, Address(c_rarg2));
 599     __ add(c_rarg3, c_rarg3, 1);
 600     __ str(c_rarg3, Address(c_rarg2));
 601 
 602     // object is in r0
 603     // make sure object is 'reasonable'
 604     __ cbz(r0, exit); // if obj is null it is OK
 605 
 606     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 607     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 608 
 609     // return if everything seems ok
 610     __ bind(exit);
 611 
 612     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 613     __ ret(lr);
 614 
 615     // handle errors
 616     __ bind(error);
 617     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 618 
 619     __ push(RegSet::range(r0, r29), sp);
 620     // debug(char* msg, int64_t pc, int64_t regs[])
 621     __ mov(c_rarg0, rscratch1);      // pass address of error message
 622     __ mov(c_rarg1, lr);             // pass return address
 623     __ mov(c_rarg2, sp);             // pass address of regs on stack
 624 #ifndef PRODUCT
 625     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 626 #endif
 627     BLOCK_COMMENT("call MacroAssembler::debug");
 628     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 629     __ blr(rscratch1);
 630     __ hlt(0);
 631 
 632     return start;
 633   }
 634 
 635   // Generate indices for iota vector.
 636   address generate_iota_indices(const char *stub_name) {
 637     __ align(CodeEntryAlignment);
 638     StubCodeMark mark(this, "StubRoutines", stub_name);
 639     address start = __ pc();
 640     // B
 641     __ emit_data64(0x0706050403020100, relocInfo::none);
 642     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 643     // H
 644     __ emit_data64(0x0003000200010000, relocInfo::none);
 645     __ emit_data64(0x0007000600050004, relocInfo::none);
 646     // S
 647     __ emit_data64(0x0000000100000000, relocInfo::none);
 648     __ emit_data64(0x0000000300000002, relocInfo::none);
 649     // D
 650     __ emit_data64(0x0000000000000000, relocInfo::none);
 651     __ emit_data64(0x0000000000000001, relocInfo::none);
 652     // S - FP
 653     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 654     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 655     // D - FP
 656     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 657     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 658     return start;
 659   }
 660 
 661   // The inner part of zero_words().  This is the bulk operation,
 662   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 663   // caller is responsible for zeroing the last few words.
 664   //
 665   // Inputs:
 666   // r10: the HeapWord-aligned base address of an array to zero.
 667   // r11: the count in HeapWords, r11 > 0.
 668   //
 669   // Returns r10 and r11, adjusted for the caller to clear.
 670   // r10: the base address of the tail of words left to clear.
 671   // r11: the number of words in the tail.
 672   //      r11 < MacroAssembler::zero_words_block_size.
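  // This stub is the bulk half of MacroAssembler::zero_words, which
  // (roughly, see macroAssembler_aarch64.cpp) does something like the
  // sketch below for large counts and then clears the tail itself:
  //
  //   // base must be in r10 and the word count in r11
  //   __ trampoline_call(RuntimeAddress(StubRoutines::aarch64::zero_blocks()));
  //   // on return r10/r11 describe a tail of fewer than
  //   // zero_words_block_size words, cleared with a few str/stp of zr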
 673 
 674   address generate_zero_blocks() {
 675     Label done;
 676     Label base_aligned;
 677 
 678     Register base = r10, cnt = r11;
 679 
 680     __ align(CodeEntryAlignment);
 681     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 682     address start = __ pc();
 683 
 684     if (UseBlockZeroing) {
 685       int zva_length = VM_Version::zva_length();
 686 
 687       // Ensure ZVA length can be divided by 16. This is required by
 688       // the subsequent operations.
 689       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 690 
 691       __ tbz(base, 3, base_aligned);
 692       __ str(zr, Address(__ post(base, 8)));
 693       __ sub(cnt, cnt, 1);
 694       __ bind(base_aligned);
 695 
 696       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 697       // alignment.
 698       Label small;
 699       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 700       __ subs(rscratch1, cnt, low_limit >> 3);
 701       __ br(Assembler::LT, small);
 702       __ zero_dcache_blocks(base, cnt);
 703       __ bind(small);
 704     }
 705 
 706     {
 707       // Number of stp instructions we'll unroll
 708       const int unroll =
 709         MacroAssembler::zero_words_block_size / 2;
 710       // Clear the remaining blocks.
 711       Label loop;
 712       __ subs(cnt, cnt, unroll * 2);
 713       __ br(Assembler::LT, done);
 714       __ bind(loop);
 715       for (int i = 0; i < unroll; i++)
 716         __ stp(zr, zr, __ post(base, 16));
 717       __ subs(cnt, cnt, unroll * 2);
 718       __ br(Assembler::GE, loop);
 719       __ bind(done);
 720       __ add(cnt, cnt, unroll * 2);
 721     }
 722 
 723     __ ret(lr);
 724 
 725     return start;
 726   }
 727 
 728 
 729   typedef enum {
 730     copy_forwards = 1,
 731     copy_backwards = -1
 732   } copy_direction;
 733 
 734   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 735   // for arraycopy stubs.
 736   class ArrayCopyBarrierSetHelper : StackObj {
 737     BarrierSetAssembler* _bs_asm;
 738     MacroAssembler* _masm;
 739     DecoratorSet _decorators;
 740     BasicType _type;
 741     Register _gct1;
 742     Register _gct2;
 743     Register _gct3;
 744     FloatRegister _gcvt1;
 745     FloatRegister _gcvt2;
 746     FloatRegister _gcvt3;
 747 
 748   public:
 749     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 750                               DecoratorSet decorators,
 751                               BasicType type,
 752                               Register gct1,
 753                               Register gct2,
 754                               Register gct3,
 755                               FloatRegister gcvt1,
 756                               FloatRegister gcvt2,
 757                               FloatRegister gcvt3)
 758       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 759         _masm(masm),
 760         _decorators(decorators),
 761         _type(type),
 762         _gct1(gct1),
 763         _gct2(gct2),
 764         _gct3(gct3),
 765         _gcvt1(gcvt1),
 766         _gcvt2(gcvt2),
 767         _gcvt3(gcvt3) {
 768     }
 769 
 770     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 771       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 772                             dst1, dst2, src,
 773                             _gct1, _gct2, _gcvt1);
 774     }
 775 
 776     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 777       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 778                              dst, src1, src2,
 779                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 780     }
 781 
 782     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 783       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 784                             dst1, dst2, src,
 785                             _gct1);
 786     }
 787 
 788     void copy_store_at_16(Address dst, Register src1, Register src2) {
 789       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 790                              dst, src1, src2,
 791                              _gct1, _gct2, _gct3);
 792     }
 793 
 794     void copy_load_at_8(Register dst, Address src) {
 795       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 796                             dst, noreg, src,
 797                             _gct1);
 798     }
 799 
 800     void copy_store_at_8(Address dst, Register src) {
 801       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 802                              dst, src, noreg,
 803                              _gct1, _gct2, _gct3);
 804     }
 805   };
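  // Typical use, as in generate_copy_longs and copy_memory below: one
  // helper is constructed per stub and every load/store of array payload
  // is routed through it so the active GC's BarrierSetAssembler can
  // instrument the access, along the lines of
  //
  //   ArrayCopyBarrierSetHelper bs(_masm, decorators, type,
  //                                gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  //   bs.copy_load_at_16(t0, t1, Address(s, 0));
  //   bs.copy_store_at_16(Address(d, 0), t0, t1);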
 806 
 807   // Bulk copy of blocks of 8 words.
 808   //
 809   // count is a count of words.
 810   //
 811   // Precondition: count >= 8
 812   //
 813   // Postconditions:
 814   //
 815   // The least significant bit of count contains the remaining count
 816   // of words to copy.  The rest of count is trash.
 817   //
 818   // s and d are adjusted to point to the remaining words to copy
 819   //
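  //
  // Worked example (count = 13 words): the register fill plus drain
  // moves 8 words, the 4-word subblock test (bit 2 of the decremented
  // count) moves 4 more and the 2-word test (bit 1) moves none, so 12
  // words are copied here and bit 0 of count tells the caller that one
  // word remains.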
 820   void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
 821                            copy_direction direction) {
 822     int unit = wordSize * direction;
 823     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 824 
 825     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 826       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 827     const Register stride = r14;
 828     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 829     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 830     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 831 
 832     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 833     assert_different_registers(s, d, count, rscratch1, rscratch2);
 834 
 835     Label again, drain;
 836     const char *stub_name;
 837     if (direction == copy_forwards)
 838       stub_name = "forward_copy_longs";
 839     else
 840       stub_name = "backward_copy_longs";
 841 
 842     __ align(CodeEntryAlignment);
 843 
 844     StubCodeMark mark(this, "StubRoutines", stub_name);
 845 
 846     __ bind(start);
 847 
 848     Label unaligned_copy_long;
 849     if (AvoidUnalignedAccesses) {
 850       __ tbnz(d, 3, unaligned_copy_long);
 851     }
 852 
 853     if (direction == copy_forwards) {
 854       __ sub(s, s, bias);
 855       __ sub(d, d, bias);
 856     }
 857 
 858 #ifdef ASSERT
 859     // Make sure we are never given < 8 words
 860     {
 861       Label L;
 862       __ cmp(count, (u1)8);
 863       __ br(Assembler::GE, L);
 864       __ stop("generate_copy_longs called with < 8 words");
 865       __ bind(L);
 866     }
 867 #endif
 868 
 869     // Fill 8 registers
 870     if (UseSIMDForMemoryOps) {
 871       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 872       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 873     } else {
 874       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 875       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 876       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 877       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 878     }
 879 
 880     __ subs(count, count, 16);
 881     __ br(Assembler::LO, drain);
 882 
 883     int prefetch = PrefetchCopyIntervalInBytes;
 884     bool use_stride = false;
 885     if (direction == copy_backwards) {
 886        use_stride = prefetch > 256;
 887        prefetch = -prefetch;
 888        if (use_stride) __ mov(stride, prefetch);
 889     }
 890 
 891     __ bind(again);
 892 
 893     if (PrefetchCopyIntervalInBytes > 0)
 894       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 895 
 896     if (UseSIMDForMemoryOps) {
 897       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 898       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 899       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 900       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 901     } else {
 902       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 903       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 904       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 905       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 906       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 907       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 908       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 909       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 910     }
 911 
 912     __ subs(count, count, 8);
 913     __ br(Assembler::HS, again);
 914 
 915     // Drain
 916     __ bind(drain);
 917     if (UseSIMDForMemoryOps) {
 918       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 919       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 920     } else {
 921       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 922       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 923       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 924       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 925     }
 926 
 927     {
 928       Label L1, L2;
 929       __ tbz(count, exact_log2(4), L1);
 930       if (UseSIMDForMemoryOps) {
 931         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 932         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 933       } else {
 934         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 935         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 936         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 937         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 938       }
 939       __ bind(L1);
 940 
 941       if (direction == copy_forwards) {
 942         __ add(s, s, bias);
 943         __ add(d, d, bias);
 944       }
 945 
 946       __ tbz(count, 1, L2);
 947       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 948       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 949       __ bind(L2);
 950     }
 951 
 952     __ ret(lr);
 953 
 954     if (AvoidUnalignedAccesses) {
 955       Label drain, again;
 956       // Register order for storing. Order is different for backward copy.
 957 
 958       __ bind(unaligned_copy_long);
 959 
 960       // source address is even aligned, target odd aligned
 961       //
 962       // when forward copying word pairs we read long pairs at offsets
 963       // {0, 2, 4, 6} (in long words). when backwards copying we read
 964       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 965       // address by -2 in the forwards case so we can compute the
 966       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 967       // or -1.
 968       //
 969       // when forward copying we need to store 1 word, 3 pairs and
 970       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 971       // zero offset we adjust the destination by -1, which means we
 972       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 973       //
 974       // When backwards copying we need to store 1 word, 3 pairs and
 975       // then 1 word at offsets {-1, -3, -5, -7, -8}, i.e. we use
 976       // offsets {1, 3, 5, 7, 8} * unit.
 977 
 978       if (direction == copy_forwards) {
 979         __ sub(s, s, 16);
 980         __ sub(d, d, 8);
 981       }
 982 
 983       // Fill 8 registers
 984       //
 985       // for forwards copy s was offset by -16 from the original input
 986       // value of s, so the register contents are at these offsets
 987       // relative to the 64 bit block addressed by that original input,
 988       // and so on for each successive 64 byte block as s is updated
 989       //
 990       // t0 at offset 0,  t1 at offset 8
 991       // t2 at offset 16, t3 at offset 24
 992       // t4 at offset 32, t5 at offset 40
 993       // t6 at offset 48, t7 at offset 56
 994 
 995       // for backwards copy s was not offset, so the register contents
 996       // are at these offsets into the preceding 64 byte block
 997       // relative to that original input, and so on for each successive
 998       // preceding 64 byte block as s is updated. This explains the
 999       // slightly counter-intuitive looking pattern of register usage
1000       // in the stp instructions for backwards copy.
1001       //
1002       // t0 at offset -16, t1 at offset -8
1003       // t2 at offset -32, t3 at offset -24
1004       // t4 at offset -48, t5 at offset -40
1005       // t6 at offset -64, t7 at offset -56
1006 
1007       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1008       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1009       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1010       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1011 
1012       __ subs(count, count, 16);
1013       __ br(Assembler::LO, drain);
1014 
1015       int prefetch = PrefetchCopyIntervalInBytes;
1016       bool use_stride = false;
1017       if (direction == copy_backwards) {
1018          use_stride = prefetch > 256;
1019          prefetch = -prefetch;
1020          if (use_stride) __ mov(stride, prefetch);
1021       }
1022 
1023       __ bind(again);
1024 
1025       if (PrefetchCopyIntervalInBytes > 0)
1026         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1027 
1028       if (direction == copy_forwards) {
1029        // allowing for the offset of -8 the store instructions place
1030        // registers into the target 64 bit block at the following
1031        // offsets
1032        //
1033        // t0 at offset 0
1034        // t1 at offset 8,  t2 at offset 16
1035        // t3 at offset 24, t4 at offset 32
1036        // t5 at offset 40, t6 at offset 48
1037        // t7 at offset 56
1038 
1039         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1040         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1041         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1042         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1043         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1044         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1045         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1046         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1047         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1048       } else {
1049        // d was not offset when we started so the registers are
1050        // written into the 64 bit block preceding d with the following
1051        // offsets
1052        //
1053        // t1 at offset -8
1054        // t3 at offset -24, t0 at offset -16
1055        // t5 at offset -40, t2 at offset -32
1056        // t7 at offset -56, t4 at offset -48
1057        //                   t6 at offset -64
1058        //
1059        // note that this matches the offsets previously noted for the
1060        // loads
1061 
1062         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1063         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1064         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1065         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1066         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1067         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1068         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1069         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1070         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1071       }
1072 
1073       __ subs(count, count, 8);
1074       __ br(Assembler::HS, again);
1075 
1076       // Drain
1077       //
1078       // this uses the same pattern of offsets and register arguments
1079       // as above
1080       __ bind(drain);
1081       if (direction == copy_forwards) {
1082         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1083         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1084         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1085         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1086         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1087       } else {
1088         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1089         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1090         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1091         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1092         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1093       }
1094       // now we need to copy any remaining part block which may
1095       // include a 4 word subblock and/or a 2 word subblock.
1096       // bits 2 and 1 in the count are the tell-tale for whether we
1097       // have each such subblock
1098       {
1099         Label L1, L2;
1100         __ tbz(count, exact_log2(4), L1);
1101        // this is the same as above but copying only 4 longs hence
1102        // with only one intervening stp between the str instructions
1103        // but note that the offsets and registers still follow the
1104        // same pattern
1105         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1106         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1107         if (direction == copy_forwards) {
1108           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1109           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1110           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1111         } else {
1112           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1113           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1114           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1115         }
1116         __ bind(L1);
1117 
1118         __ tbz(count, 1, L2);
1119        // this is the same as above but copying only 2 longs hence
1120        // there is no intervening stp between the str instructions
1121        // but note that the offset and register patterns are still
1122        // the same
1123         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1124         if (direction == copy_forwards) {
1125           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1126           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1127         } else {
1128           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1129           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1130         }
1131         __ bind(L2);
1132 
1133        // for forwards copy we need to re-adjust the offsets we
1134        // applied so that s and d follow the last words written
1135 
1136        if (direction == copy_forwards) {
1137          __ add(s, s, 16);
1138          __ add(d, d, 8);
1139        }
1140 
1141       }
1142 
1143       __ ret(lr);
1144       }
1145   }
1146 
1147   // Small copy: less than 16 bytes.
1148   //
1149   // NB: Ignores all of the bits of count which represent more than 15
1150   // bytes, so a caller doesn't have to mask them.
1151 
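  //
  // For example, a byte copy (step == 1) with count == 11 (0b1011)
  // moves 8 + 2 + 1 bytes, testing bits 3, 2, 1 and 0 of count in turn
  // and ignoring bit 4 and above.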
1152   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1153     bool is_backwards = step < 0;
1154     size_t granularity = uabs(step);
1155     int direction = is_backwards ? -1 : 1;
1156 
1157     Label Lword, Lint, Lshort, Lbyte;
1158 
1159     assert(granularity
1160            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1161 
1162     const Register t0 = r3;
1163     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1164     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1165 
1166     // ??? I don't know if this bit-test-and-branch is the right thing
1167     // to do.  It does a lot of jumping, resulting in several
1168     // mispredicted branches.  It might make more sense to do this
1169     // with something like Duff's device with a single computed branch.
1170 
1171     __ tbz(count, 3 - exact_log2(granularity), Lword);
1172     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1173     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1174     __ bind(Lword);
1175 
1176     if (granularity <= sizeof (jint)) {
1177       __ tbz(count, 2 - exact_log2(granularity), Lint);
1178       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1179       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1180       __ bind(Lint);
1181     }
1182 
1183     if (granularity <= sizeof (jshort)) {
1184       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1185       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1186       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1187       __ bind(Lshort);
1188     }
1189 
1190     if (granularity <= sizeof (jbyte)) {
1191       __ tbz(count, 0, Lbyte);
1192       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1193       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1194       __ bind(Lbyte);
1195     }
1196   }
1197 
1198   Label copy_f, copy_b;
1199   Label copy_obj_f, copy_obj_b;
1200   Label copy_obj_uninit_f, copy_obj_uninit_b;
1201 
1202   // All-singing all-dancing memory copy.
1203   //
1204   // Copy count units of memory from s to d.  The size of a unit is
1205   // step, which can be positive or negative depending on the direction
1206   // of copy.  If is_aligned is false, we align the source address.
1207   //
1208 
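  // Rough shape of the code below (a summary, not additional behaviour):
  // copies of at most 80 bytes (96 with SIMD) are handled inline with a
  // fixed pattern of possibly-overlapping loads and stores; anything
  // larger falls through to copy_big, which aligns s on a 2-word
  // boundary (using copy_memory_small when !is_aligned), bulk-copies
  // whole words through the copy_f/copy_b (or copy_obj_*) stubs emitted
  // by generate_copy_longs, and then finishes the tail with
  // copy_memory_small.
  //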
1209   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1210                    Register s, Register d, Register count, int step) {
1211     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1212     bool is_backwards = step < 0;
1213     unsigned int granularity = uabs(step);
1214     const Register t0 = r3, t1 = r4;
1215 
1216     // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
1217     // load all the data before writing anything
1218     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1219     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1220     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1221     const Register send = r17, dend = r16;
1222     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1223     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1224     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1225 
1226     if (PrefetchCopyIntervalInBytes > 0)
1227       __ prfm(Address(s, 0), PLDL1KEEP);
1228     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1229     __ br(Assembler::HI, copy_big);
1230 
1231     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1232     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1233 
1234     __ cmp(count, u1(16/granularity));
1235     __ br(Assembler::LS, copy16);
1236 
1237     __ cmp(count, u1(64/granularity));
1238     __ br(Assembler::HI, copy80);
1239 
1240     __ cmp(count, u1(32/granularity));
1241     __ br(Assembler::LS, copy32);
1242 
1243     // 33..64 bytes
1244     if (UseSIMDForMemoryOps) {
1245       bs.copy_load_at_32(v0, v1, Address(s, 0));
1246       bs.copy_load_at_32(v2, v3, Address(send, -32));
1247       bs.copy_store_at_32(Address(d, 0), v0, v1);
1248       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1249     } else {
1250       bs.copy_load_at_16(t0, t1, Address(s, 0));
1251       bs.copy_load_at_16(t2, t3, Address(s, 16));
1252       bs.copy_load_at_16(t4, t5, Address(send, -32));
1253       bs.copy_load_at_16(t6, t7, Address(send, -16));
1254 
1255       bs.copy_store_at_16(Address(d, 0), t0, t1);
1256       bs.copy_store_at_16(Address(d, 16), t2, t3);
1257       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1258       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1259     }
1260     __ b(finish);
1261 
1262     // 17..32 bytes
1263     __ bind(copy32);
1264     bs.copy_load_at_16(t0, t1, Address(s, 0));
1265     bs.copy_load_at_16(t6, t7, Address(send, -16));
1266 
1267     bs.copy_store_at_16(Address(d, 0), t0, t1);
1268     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1269     __ b(finish);
1270 
1271     // 65..80/96 bytes
1272     // (96 bytes if SIMD because we do 32 bytes per instruction)
1273     __ bind(copy80);
1274     if (UseSIMDForMemoryOps) {
1275       bs.copy_load_at_32(v0, v1, Address(s, 0));
1276       bs.copy_load_at_32(v2, v3, Address(s, 32));
1277       // Unaligned pointers can be an issue for copying.
1278       // The issue is more likely when the granularity of the data is
1279       // less than 4 (sizeof(jint)): pointers into arrays of jint are at
1280       // least 4 byte aligned, and pointers into arrays of jlong are 8
1281       // byte aligned. The largest performance drop has been seen in the
1282       // 65-80 byte range. For such cases, using a pair of ldp/stp instead
1283       // of the third pair of ldpq/stpq fixes the performance issue.
1284       if (granularity < sizeof (jint)) {
1285         Label copy96;
1286         __ cmp(count, u1(80/granularity));
1287         __ br(Assembler::HI, copy96);
1288         bs.copy_load_at_16(t0, t1, Address(send, -16));
1289 
1290         bs.copy_store_at_32(Address(d, 0), v0, v1);
1291         bs.copy_store_at_32(Address(d, 32), v2, v3);
1292 
1293         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1294         __ b(finish);
1295 
1296         __ bind(copy96);
1297       }
1298       bs.copy_load_at_32(v4, v5, Address(send, -32));
1299 
1300       bs.copy_store_at_32(Address(d, 0), v0, v1);
1301       bs.copy_store_at_32(Address(d, 32), v2, v3);
1302 
1303       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1304     } else {
1305       bs.copy_load_at_16(t0, t1, Address(s, 0));
1306       bs.copy_load_at_16(t2, t3, Address(s, 16));
1307       bs.copy_load_at_16(t4, t5, Address(s, 32));
1308       bs.copy_load_at_16(t6, t7, Address(s, 48));
1309       bs.copy_load_at_16(t8, t9, Address(send, -16));
1310 
1311       bs.copy_store_at_16(Address(d, 0), t0, t1);
1312       bs.copy_store_at_16(Address(d, 16), t2, t3);
1313       bs.copy_store_at_16(Address(d, 32), t4, t5);
1314       bs.copy_store_at_16(Address(d, 48), t6, t7);
1315       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1316     }
1317     __ b(finish);
1318 
1319     // 0..16 bytes
1320     __ bind(copy16);
1321     __ cmp(count, u1(8/granularity));
1322     __ br(Assembler::LO, copy8);
1323 
1324     // 8..16 bytes
1325     bs.copy_load_at_8(t0, Address(s, 0));
1326     bs.copy_load_at_8(t1, Address(send, -8));
1327     bs.copy_store_at_8(Address(d, 0), t0);
1328     bs.copy_store_at_8(Address(dend, -8), t1);
1329     __ b(finish);
1330 
1331     if (granularity < 8) {
1332       // 4..7 bytes
1333       __ bind(copy8);
1334       __ tbz(count, 2 - exact_log2(granularity), copy4);
1335       __ ldrw(t0, Address(s, 0));
1336       __ ldrw(t1, Address(send, -4));
1337       __ strw(t0, Address(d, 0));
1338       __ strw(t1, Address(dend, -4));
1339       __ b(finish);
1340       if (granularity < 4) {
1341         // 0..3 bytes
1342         __ bind(copy4);
1343         __ cbz(count, finish); // get rid of 0 case
1344         if (granularity == 2) {
1345           __ ldrh(t0, Address(s, 0));
1346           __ strh(t0, Address(d, 0));
1347         } else { // granularity == 1
1348           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1349           // the first and last byte.
1350           // Handle the 3 byte case by loading and storing base + count/2
1351           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1352           // This does mean that in the 1 byte case we load/store the same
1353           // byte 3 times.
1354           __ lsr(count, count, 1);
1355           __ ldrb(t0, Address(s, 0));
1356           __ ldrb(t1, Address(send, -1));
1357           __ ldrb(t2, Address(s, count));
1358           __ strb(t0, Address(d, 0));
1359           __ strb(t1, Address(dend, -1));
1360           __ strb(t2, Address(d, count));
1361         }
1362         __ b(finish);
1363       }
1364     }
1365 
1366     __ bind(copy_big);
1367     if (is_backwards) {
1368       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1369       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1370     }
1371 
1372     // Now that we've got the small case out of the way, we can align the
1373     // source address on a 2-word boundary.
1374 
1375     // Here we will materialize a count in r15, which is used by copy_memory_small
1376     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1377     // Up until here, we have used t9, which aliases r15, but from here on, that register
1378     // can not be used as a temp register, as it contains the count.
1379 
1380     Label aligned;
1381 
1382     if (is_aligned) {
1383       // We may have to adjust by 1 word to get s 2-word-aligned.
1384       __ tbz(s, exact_log2(wordSize), aligned);
1385       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1386       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1387       __ sub(count, count, wordSize/granularity);
1388     } else {
1389       if (is_backwards) {
1390         __ andr(r15, s, 2 * wordSize - 1);
1391       } else {
1392         __ neg(r15, s);
1393         __ andr(r15, r15, 2 * wordSize - 1);
1394       }
1395       // r15 is the byte adjustment needed to align s.
1396       __ cbz(r15, aligned);
1397       int shift = exact_log2(granularity);
1398       if (shift)  __ lsr(r15, r15, shift);
1399       __ sub(count, count, r15);
1400 
1401 #if 0
1402       // ?? This code is only correct for a disjoint copy.  Even in the
1403       // disjoint case it may or may not be worth using.
1404 
1405       // Copy the first pair; s and d may not be aligned.
1406       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1407       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1408 
1409       // Align s and d, adjust count
1410       if (is_backwards) {
1411         __ sub(s, s, r15);
1412         __ sub(d, d, r15);
1413       } else {
1414         __ add(s, s, r15);
1415         __ add(d, d, r15);
1416       }
1417 #else
1418       copy_memory_small(decorators, type, s, d, r15, step);
1419 #endif
1420     }
1421 
1422     __ bind(aligned);
1423 
1424     // s is now 2-word-aligned.
1425 
1426     // We have a count of units and some trailing bytes.  Adjust the
1427     // count and do a bulk copy of words.
1428     __ lsr(r15, count, exact_log2(wordSize/granularity));
1429     if (direction == copy_forwards) {
1430       if (type != T_OBJECT) {
1431         __ bl(copy_f);
1432       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1433         __ bl(copy_obj_uninit_f);
1434       } else {
1435         __ bl(copy_obj_f);
1436       }
1437     } else {
1438       if (type != T_OBJECT) {
1439         __ bl(copy_b);
1440       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1441         __ bl(copy_obj_uninit_b);
1442       } else {
1443         __ bl(copy_obj_b);
1444       }
1445     }
1446 
1447     // And the tail.
1448     copy_memory_small(decorators, type, s, d, count, step);
1449 
1450     if (granularity >= 8) __ bind(copy8);
1451     if (granularity >= 4) __ bind(copy4);
1452     __ bind(finish);
1453   }
1454 
1455 
1456   void clobber_registers() {
1457 #ifdef ASSERT
1458     RegSet clobbered
1459       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1460     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1461     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
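         // rscratch1 == 0xdeadbeefdeadbeef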
1462     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1463       __ mov(*it, rscratch1);
1464     }
1465 #endif
1466 
1467   }
1468 
1469   // Scan over array at a for count oops, verifying each one.
1470   // Preserves a and count, clobbers rscratch1 and rscratch2.
1471   void verify_oop_array (int size, Register a, Register count, Register temp) {
1472     Label loop, end;
1473     __ mov(rscratch1, a);
1474     __ mov(rscratch2, zr);
1475     __ bind(loop);
1476     __ cmp(rscratch2, count);
1477     __ br(Assembler::HS, end);
1478     if (size == wordSize) {
1479       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1480       __ verify_oop(temp);
1481     } else {
1482       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1483       __ decode_heap_oop(temp); // calls verify_oop
1484     }
1485     __ add(rscratch2, rscratch2, 1);
1486     __ b(loop);
1487     __ bind(end);
1488   }
1489 
1490   // Arguments:
1491   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1492   //             ignored
1493   //   is_oop  - true => oop array, so generate store check code
1494   //   name    - stub name string
1495   //
1496   // Inputs:
1497   //   c_rarg0   - source array address
1498   //   c_rarg1   - destination array address
1499   //   c_rarg2   - element count, treated as ssize_t, can be zero
1500   //
1501   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1502   // the hardware handle it.  The two dwords within qwords that span
1503   // cache line boundaries will still be loaded and stored atomically.
1504   //
1505   // Side Effects:
1506   //   disjoint_int_copy_entry is set to the no-overlap entry point
1507   //   used by generate_conjoint_int_oop_copy().
1508   //
1509   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1510                                   const char *name, bool dest_uninitialized = false) {
1511     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1512     RegSet saved_reg = RegSet::of(s, d, count);
1513     __ align(CodeEntryAlignment);
1514     StubCodeMark mark(this, "StubRoutines", name);
1515     address start = __ pc();
1516     __ enter();
1517 
1518     if (entry != nullptr) {
1519       *entry = __ pc();
1520       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1521       BLOCK_COMMENT("Entry:");
1522     }
1523 
1524     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1525     if (dest_uninitialized) {
1526       decorators |= IS_DEST_UNINITIALIZED;
1527     }
1528     if (aligned) {
1529       decorators |= ARRAYCOPY_ALIGNED;
1530     }
1531 
1532     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1533     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1534 
1535     if (is_oop) {
1536       // save regs before copy_memory
1537       __ push(RegSet::of(d, count), sp);
1538     }
1539     {
1540       // UnsafeCopyMemory page error: continue after ucm
1541       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1542       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1543       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1544     }
1545 
1546     if (is_oop) {
1547       __ pop(RegSet::of(d, count), sp);
1548       if (VerifyOops)
1549         verify_oop_array(size, d, count, r16);
1550     }
1551 
1552     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1553 
1554     __ leave();
1555     __ mov(r0, zr); // return 0
1556     __ ret(lr);
1557     return start;
1558   }
1559 
1560   // Arguments:
1561   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1562   //             ignored
1563   //   is_oop  - true => oop array, so generate store check code
1564   //   name    - stub name string
1565   //
1566   // Inputs:
1567   //   c_rarg0   - source array address
1568   //   c_rarg1   - destination array address
1569   //   c_rarg2   - element count, treated as ssize_t, can be zero
1570   //
1571   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1572   // the hardware handle it.  The two dwords within qwords that span
1573   // cache line boundaries will still be loaded and stored atomically.
1574   //
1575   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1576                                  address *entry, const char *name,
1577                                  bool dest_uninitialized = false) {
1578     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1579     RegSet saved_regs = RegSet::of(s, d, count);
1580     StubCodeMark mark(this, "StubRoutines", name);
1581     address start = __ pc();
1582     __ enter();
1583 
1584     if (entry != nullptr) {
1585       *entry = __ pc();
1586       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1587       BLOCK_COMMENT("Entry:");
1588     }
1589 
1590     // use fwd copy when (d-s) above_equal (count*size)
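         // If d < s the unsigned subtraction wraps to a large value and the branch
         // is taken: a forward copy is safe whenever d < s or the ranges are disjoint.
         // Only when s <= d < s + count*size do we fall through to the backward copy.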
1591     __ sub(rscratch1, d, s);
1592     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1593     __ br(Assembler::HS, nooverlap_target);
1594 
1595     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1596     if (dest_uninitialized) {
1597       decorators |= IS_DEST_UNINITIALIZED;
1598     }
1599     if (aligned) {
1600       decorators |= ARRAYCOPY_ALIGNED;
1601     }
1602 
1603     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1604     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1605 
1606     if (is_oop) {
1607       // save regs before copy_memory
1608       __ push(RegSet::of(d, count), sp);
1609     }
1610     {
1611       // UnsafeCopyMemory page error: continue after ucm
1612       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1613       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1614       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1615     }
1616     if (is_oop) {
1617       __ pop(RegSet::of(d, count), sp);
1618       if (VerifyOops)
1619         verify_oop_array(size, d, count, r16);
1620     }
1621     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1622     __ leave();
1623     __ mov(r0, zr); // return 0
1624     __ ret(lr);
1625     return start;
1626   }
1627 
1628   // Arguments:
1629   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1630   //             ignored
1631   //   name    - stub name string
1632   //
1633   // Inputs:
1634   //   c_rarg0   - source array address
1635   //   c_rarg1   - destination array address
1636   //   c_rarg2   - element count, treated as ssize_t, can be zero
1637   //
1638   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1639   // we let the hardware handle it.  The one to eight bytes within words,
1640   // dwords or qwords that span cache line boundaries will still be loaded
1641   // and stored atomically.
1642   //
1643   // Side Effects:
1644   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1652   //   used by generate_conjoint_byte_copy().
1653   //
1654   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1655     const bool not_oop = false;
1656     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1657   }
1658 
1659   // Arguments:
1660   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1661   //             ignored
1662   //   name    - stub name string
1663   //
1664   // Inputs:
1665   //   c_rarg0   - source array address
1666   //   c_rarg1   - destination array address
1667   //   c_rarg2   - element count, treated as ssize_t, can be zero
1668   //
1669   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1670   // we let the hardware handle it.  The one to eight bytes within words,
1671   // dwords or qwords that span cache line boundaries will still be loaded
1672   // and stored atomically.
1673   //
1674   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1675                                       address* entry, const char *name) {
1676     const bool not_oop = false;
1677     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1678   }
1679 
1680   // Arguments:
1681   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1682   //             ignored
1683   //   name    - stub name string
1684   //
1685   // Inputs:
1686   //   c_rarg0   - source array address
1687   //   c_rarg1   - destination array address
1688   //   c_rarg2   - element count, treated as ssize_t, can be zero
1689   //
1690   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1691   // let the hardware handle it.  The two or four words within dwords
1692   // or qwords that span cache line boundaries will still be loaded
1693   // and stored atomically.
1694   //
1695   // Side Effects:
1696   //   disjoint_short_copy_entry is set to the no-overlap entry point
1697   //   used by generate_conjoint_short_copy().
1698   //
1699   address generate_disjoint_short_copy(bool aligned,
1700                                        address* entry, const char *name) {
1701     const bool not_oop = false;
1702     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1703   }
1704 
1705   // Arguments:
1706   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1707   //             ignored
1708   //   name    - stub name string
1709   //
1710   // Inputs:
1711   //   c_rarg0   - source array address
1712   //   c_rarg1   - destination array address
1713   //   c_rarg2   - element count, treated as ssize_t, can be zero
1714   //
1715   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1716   // let the hardware handle it.  The two or four words within dwords
1717   // or qwords that span cache line boundaries will still be loaded
1718   // and stored atomically.
1719   //
1720   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1721                                        address *entry, const char *name) {
1722     const bool not_oop = false;
1723     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1724   }
1725 
1726   // Arguments:
1727   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1728   //             ignored
1729   //   name    - stub name string
1730   //
1731   // Inputs:
1732   //   c_rarg0   - source array address
1733   //   c_rarg1   - destination array address
1734   //   c_rarg2   - element count, treated as ssize_t, can be zero
1735   //
1736   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1737   // the hardware handle it.  The two dwords within qwords that span
1738   // cache line boundaries will still be loaded and stored atomically.
1739   //
1740   // Side Effects:
1741   //   disjoint_int_copy_entry is set to the no-overlap entry point
1742   //   used by generate_conjoint_int_oop_copy().
1743   //
1744   address generate_disjoint_int_copy(bool aligned, address *entry,
1745                                          const char *name, bool dest_uninitialized = false) {
1746     const bool not_oop = false;
1747     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1748   }
1749 
1750   // Arguments:
1751   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1752   //             ignored
1753   //   name    - stub name string
1754   //
1755   // Inputs:
1756   //   c_rarg0   - source array address
1757   //   c_rarg1   - destination array address
1758   //   c_rarg2   - element count, treated as ssize_t, can be zero
1759   //
1760   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1761   // the hardware handle it.  The two dwords within qwords that span
1762   // cache line boundaries will still be loaded and stored atomically.
1763   //
1764   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1765                                      address *entry, const char *name,
1766                                      bool dest_uninitialized = false) {
1767     const bool not_oop = false;
1768     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1769   }
1770 
1771 
1772   // Arguments:
1773   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1774   //             ignored
1775   //   name    - stub name string
1776   //
1777   // Inputs:
1778   //   c_rarg0   - source array address
1779   //   c_rarg1   - destination array address
1780   //   c_rarg2   - element count, treated as size_t, can be zero
1781   //
1782   // Side Effects:
1783   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1784   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1785   //
1786   address generate_disjoint_long_copy(bool aligned, address *entry,
1787                                           const char *name, bool dest_uninitialized = false) {
1788     const bool not_oop = false;
1789     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1790   }
1791 
1792   // Arguments:
1793   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1794   //             ignored
1795   //   name    - stub name string
1796   //
1797   // Inputs:
1798   //   c_rarg0   - source array address
1799   //   c_rarg1   - destination array address
1800   //   c_rarg2   - element count, treated as size_t, can be zero
1801   //
1802   address generate_conjoint_long_copy(bool aligned,
1803                                       address nooverlap_target, address *entry,
1804                                       const char *name, bool dest_uninitialized = false) {
1805     const bool not_oop = false;
1806     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1807   }
1808 
1809   // Arguments:
1810   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1811   //             ignored
1812   //   name    - stub name string
1813   //
1814   // Inputs:
1815   //   c_rarg0   - source array address
1816   //   c_rarg1   - destination array address
1817   //   c_rarg2   - element count, treated as size_t, can be zero
1818   //
1819   // Side Effects:
1820   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1821   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1822   //
1823   address generate_disjoint_oop_copy(bool aligned, address *entry,
1824                                      const char *name, bool dest_uninitialized) {
1825     const bool is_oop = true;
1826     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1827     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1828   }
1829 
1830   // Arguments:
1831   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1832   //             ignored
1833   //   name    - stub name string
1834   //
1835   // Inputs:
1836   //   c_rarg0   - source array address
1837   //   c_rarg1   - destination array address
1838   //   c_rarg2   - element count, treated as size_t, can be zero
1839   //
1840   address generate_conjoint_oop_copy(bool aligned,
1841                                      address nooverlap_target, address *entry,
1842                                      const char *name, bool dest_uninitialized) {
1843     const bool is_oop = true;
1844     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1845     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1846                                   name, dest_uninitialized);
1847   }
1848 
1849 
1850   // Helper for generating a dynamic type check.
1851   // Smashes rscratch1, rscratch2.
1852   void generate_type_check(Register sub_klass,
1853                            Register super_check_offset,
1854                            Register super_klass,
1855                            Label& L_success) {
1856     assert_different_registers(sub_klass, super_check_offset, super_klass);
1857 
1858     BLOCK_COMMENT("type_check:");
1859 
1860     Label L_miss;
1861 
1862     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1863                                      super_check_offset);
1864     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
1865 
1866     // Fall through on failure!
1867     __ BIND(L_miss);
1868   }
1869 
1870   //
1871   //  Generate checkcasting array copy stub
1872   //
1873   //  Input:
1874   //    c_rarg0   - source array address
1875   //    c_rarg1   - destination array address
1876   //    c_rarg2   - element count, treated as ssize_t, can be zero
1877   //    c_rarg3   - size_t ckoff (super_check_offset)
1878   //    c_rarg4   - oop ckval (super_klass)
1879   //
1880   //  Output:
1881   //    r0 ==  0  -  success
1882   //    r0 == -1^K - failure, where K is partial transfer count
1883   //
1884   address generate_checkcast_copy(const char *name, address *entry,
1885                                   bool dest_uninitialized = false) {
1886 
1887     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1888 
1889     // Input registers (after setup_arg_regs)
1890     const Register from        = c_rarg0;   // source array address
1891     const Register to          = c_rarg1;   // destination array address
1892     const Register count       = c_rarg2;   // elements count
1893     const Register ckoff       = c_rarg3;   // super_check_offset
1894     const Register ckval       = c_rarg4;   // super_klass
1895 
1896     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1897     RegSet wb_post_saved_regs = RegSet::of(count);
1898 
1899     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1900     const Register copied_oop  = r22;       // actual oop copied
1901     const Register count_save  = r21;       // orig elements count
1902     const Register start_to    = r20;       // destination array start address
1903     const Register r19_klass   = r19;       // oop._klass
1904 
1905     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1906     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1907 
1908     //---------------------------------------------------------------
1909     // Assembler stub will be used for this call to arraycopy
1910     // if the two arrays are subtypes of Object[] but the
1911     // destination array type is not equal to or a supertype
1912     // of the source type.  Each element must be separately
1913     // checked.
1914 
1915     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1916                                copied_oop, r19_klass, count_save);
1917 
1918     __ align(CodeEntryAlignment);
1919     StubCodeMark mark(this, "StubRoutines", name);
1920     address start = __ pc();
1921 
1922     __ enter(); // required for proper stackwalking of RuntimeStub frame
1923 
1924 #ifdef ASSERT
1925     // caller guarantees that the arrays really are different
1926     // otherwise, we would have to make conjoint checks
1927     { Label L;
1928       __ b(L);                  // conjoint check not yet implemented
1929       __ stop("checkcast_copy within a single array");
1930       __ bind(L);
1931     }
1932 #endif //ASSERT
1933 
1934     // Caller of this entry point must set up the argument registers.
1935     if (entry != nullptr) {
1936       *entry = __ pc();
1937       BLOCK_COMMENT("Entry:");
1938     }
1939 
1940     // Empty array:  Nothing to do.
1941     __ cbz(count, L_done);
1942     __ push(RegSet::of(r19, r20, r21, r22), sp);
1943 
1944 #ifdef ASSERT
1945     BLOCK_COMMENT("assert consistent ckoff/ckval");
1946     // The ckoff and ckval must be mutually consistent,
1947     // even though caller generates both.
1948     { Label L;
1949       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1950       __ ldrw(start_to, Address(ckval, sco_offset));
1951       __ cmpw(ckoff, start_to);
1952       __ br(Assembler::EQ, L);
1953       __ stop("super_check_offset inconsistent");
1954       __ bind(L);
1955     }
1956 #endif //ASSERT
1957 
1958     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1959     bool is_oop = true;
1960     int element_size = UseCompressedOops ? 4 : 8;
1961     if (dest_uninitialized) {
1962       decorators |= IS_DEST_UNINITIALIZED;
1963     }
1964 
1965     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1966     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1967 
1968     // save the original count
1969     __ mov(count_save, count);
1970 
1971     // Copy from low to high addresses
1972     __ mov(start_to, to);              // Save destination array start address
1973     __ b(L_load_element);
1974 
1975     // ======== begin loop ========
1976     // (Loop is rotated; its entry is L_load_element.)
1977     // Loop control:
1978     //   for (; count != 0; count--) {
1979     //     copied_oop = load_heap_oop(from++);
1980     //     ... generate_type_check ...;
1981     //     store_heap_oop(to++, copied_oop);
1982     //   }
1983     __ align(OptoLoopAlignment);
1984 
1985     __ BIND(L_store_element);
1986     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1987                       __ post(to, element_size), copied_oop, noreg,
1988                       gct1, gct2, gct3);
1989     __ sub(count, count, 1);
1990     __ cbz(count, L_do_card_marks);
1991 
1992     // ======== loop entry is here ========
1993     __ BIND(L_load_element);
1994     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1995                      copied_oop, noreg, __ post(from, element_size),
1996                      gct1);
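         // A null element needs no type check; store it as-is.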
1997     __ cbz(copied_oop, L_store_element);
1998 
1999     __ load_klass(r19_klass, copied_oop); // query the object klass
2000     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
2001     // ======== end loop ========
2002 
2003     // It was a real error; we must depend on the caller to finish the job.
2004     // Register count = remaining oops, count_save = total oops.
2005     // Emit GC store barriers for the oops we have copied and report
2006     // their number to the caller.
2007 
2008     __ subs(count, count_save, count);     // K = partially copied oop count
2009     __ eon(count, count, zr);                   // report (-1^K) to caller
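         // (eon with zr is bitwise NOT: ~K == -1 ^ K, i.e. -K - 1.)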
2010     __ br(Assembler::EQ, L_done_pop);
2011 
2012     __ BIND(L_do_card_marks);
2013     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
2014 
2015     __ bind(L_done_pop);
2016     __ pop(RegSet::of(r19, r20, r21, r22), sp);
2017     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2018 
2019     __ bind(L_done);
2020     __ mov(r0, count);
2021     __ leave();
2022     __ ret(lr);
2023 
2024     return start;
2025   }
2026 
2027   // Perform range checks on the proposed arraycopy.
2028   // Kills temp, but nothing else.
2029   // Also, clean the sign bits of src_pos and dst_pos.
2030   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2031                               Register src_pos, // source position (c_rarg1)
2032                               Register dst,     // destination array oop (c_rarg2)
2033                               Register dst_pos, // destination position (c_rarg3)
2034                               Register length,
2035                               Register temp,
2036                               Label& L_failed) {
2037     BLOCK_COMMENT("arraycopy_range_checks:");
2038 
2039     assert_different_registers(rscratch1, temp);
2040 
2041     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2042     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2043     __ addw(temp, length, src_pos);
2044     __ cmpw(temp, rscratch1);
2045     __ br(Assembler::HI, L_failed);
2046 
2047     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2048     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2049     __ addw(temp, length, dst_pos);
2050     __ cmpw(temp, rscratch1);
2051     __ br(Assembler::HI, L_failed);
2052 
2053     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2054     __ movw(src_pos, src_pos);
2055     __ movw(dst_pos, dst_pos);
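         // (A 32-bit move to the same register zero-extends, clearing bits 63:32.)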
2056 
2057     BLOCK_COMMENT("arraycopy_range_checks done");
2058   }
2059 
2060   // These stubs get called from some dumb test routine.
2061   // I'll write them properly when they're called from
2062   // something that's actually doing something.
2063   static void fake_arraycopy_stub(address src, address dst, int count) {
2064     assert(count == 0, "huh?");
2065   }
2066 
2067 
2068   //
2069   //  Generate 'unsafe' array copy stub
2070   //  Though just as safe as the other stubs, it takes an unscaled
2071   //  size_t argument instead of an element count.
2072   //
2073   //  Input:
2074   //    c_rarg0   - source array address
2075   //    c_rarg1   - destination array address
2076   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2077   //
2078   // Examines the alignment of the operands and dispatches
2079   // to a long, int, short, or byte copy loop.
2080   //
2081   address generate_unsafe_copy(const char *name,
2082                                address byte_copy_entry,
2083                                address short_copy_entry,
2084                                address int_copy_entry,
2085                                address long_copy_entry) {
2086     Label L_long_aligned, L_int_aligned, L_short_aligned;
2087     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2088 
2089     __ align(CodeEntryAlignment);
2090     StubCodeMark mark(this, "StubRoutines", name);
2091     address start = __ pc();
2092     __ enter(); // required for proper stackwalking of RuntimeStub frame
2093 
2094     // bump this on entry, not on exit:
2095     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2096 
2097     __ orr(rscratch1, s, d);
2098     __ orr(rscratch1, rscratch1, count);
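         // The low bits of (s | d | count) give the coarsest alignment common to
         // the source address, the destination address and the byte count.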
2099 
2100     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2101     __ cbz(rscratch1, L_long_aligned);
2102     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2103     __ cbz(rscratch1, L_int_aligned);
2104     __ tbz(rscratch1, 0, L_short_aligned);
2105     __ b(RuntimeAddress(byte_copy_entry));
2106 
2107     __ BIND(L_short_aligned);
2108     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2109     __ b(RuntimeAddress(short_copy_entry));
2110     __ BIND(L_int_aligned);
2111     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2112     __ b(RuntimeAddress(int_copy_entry));
2113     __ BIND(L_long_aligned);
2114     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2115     __ b(RuntimeAddress(long_copy_entry));
2116 
2117     return start;
2118   }
2119 
2120   //
2121   //  Generate generic array copy stubs
2122   //
2123   //  Input:
2124   //    c_rarg0    -  src oop
2125   //    c_rarg1    -  src_pos (32-bits)
2126   //    c_rarg2    -  dst oop
2127   //    c_rarg3    -  dst_pos (32-bits)
2128   //    c_rarg4    -  element count (32-bits)
2129   //
2130   //  Output:
2131   //    r0 ==  0  -  success
2132   //    r0 == -1^K - failure, where K is partial transfer count
2133   //
2134   address generate_generic_copy(const char *name,
2135                                 address byte_copy_entry, address short_copy_entry,
2136                                 address int_copy_entry, address oop_copy_entry,
2137                                 address long_copy_entry, address checkcast_copy_entry) {
2138 
2139     Label L_failed, L_objArray;
2140     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2141 
2142     // Input registers
2143     const Register src        = c_rarg0;  // source array oop
2144     const Register src_pos    = c_rarg1;  // source position
2145     const Register dst        = c_rarg2;  // destination array oop
2146     const Register dst_pos    = c_rarg3;  // destination position
2147     const Register length     = c_rarg4;
2148 
2149 
2150     // Registers used as temps
2151     const Register dst_klass  = c_rarg5;
2152 
2153     __ align(CodeEntryAlignment);
2154 
2155     StubCodeMark mark(this, "StubRoutines", name);
2156 
2157     address start = __ pc();
2158 
2159     __ enter(); // required for proper stackwalking of RuntimeStub frame
2160 
2161     // bump this on entry, not on exit:
2162     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2163 
2164     //-----------------------------------------------------------------------
2165     // Assembler stub will be used for this call to arraycopy
2166     // if the following conditions are met:
2167     //
2168     // (1) src and dst must not be null.
2169     // (2) src_pos must not be negative.
2170     // (3) dst_pos must not be negative.
2171     // (4) length  must not be negative.
2172     // (5) src klass and dst klass should be the same and not null.
2173     // (6) src and dst should be arrays.
2174     // (7) src_pos + length must not exceed length of src.
2175     // (8) dst_pos + length must not exceed length of dst.
2176     //
2177 
2178     //  if (src == nullptr) return -1;
2179     __ cbz(src, L_failed);
2180 
2181     //  if (src_pos < 0) return -1;
2182     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2183 
2184     //  if (dst == nullptr) return -1;
2185     __ cbz(dst, L_failed);
2186 
2187     //  if (dst_pos < 0) return -1;
2188     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2189 
2190     // registers used as temp
2191     const Register scratch_length    = r16; // elements count to copy
2192     const Register scratch_src_klass = r17; // array klass
2193     const Register lh                = r15; // layout helper
2194 
2195     //  if (length < 0) return -1;
2196     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2197     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2198 
2199     __ load_klass(scratch_src_klass, src);
2200 #ifdef ASSERT
2201     //  assert(src->klass() != nullptr);
2202     {
2203       BLOCK_COMMENT("assert klasses not null {");
2204       Label L1, L2;
2205       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2206       __ bind(L1);
2207       __ stop("broken null klass");
2208       __ bind(L2);
2209       __ load_klass(rscratch1, dst);
2210       __ cbz(rscratch1, L1);     // this would be broken also
2211       BLOCK_COMMENT("} assert klasses not null done");
2212     }
2213 #endif
2214 
2215     // Load layout helper (32-bits)
2216     //
2217     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2218     // 32        30    24            16              8     2                 0
2219     //
2220     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2221     //
2222 
2223     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2224 
2225     // Handle objArrays completely differently...
2226     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
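         // The XOR below is zero exactly when the loaded lh equals objArray_lh.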
2227     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2228     __ movw(rscratch1, objArray_lh);
2229     __ eorw(rscratch2, lh, rscratch1);
2230     __ cbzw(rscratch2, L_objArray);
2231 
2232     //  if (src->klass() != dst->klass()) return -1;
2233     __ load_klass(rscratch2, dst);
2234     __ eor(rscratch2, rscratch2, scratch_src_klass);
2235     __ cbnz(rscratch2, L_failed);
2236 
2237     // Check for flat inline type array -> return -1
2238     __ tst(lh, Klass::_lh_array_tag_flat_value_bit_inplace);
2239     __ br(Assembler::NE, L_failed);
2240 
2241     // Check for null-free (non-flat) inline type array -> handle as object array
2242     __ tst(lh, Klass::_lh_null_free_array_bit_inplace);
2243     __ br(Assembler::NE, L_failed);
2244 
2245     //  if (!src->is_Array()) return -1;
2246     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2247 
2248     // At this point, it is known to be a typeArray (array_tag 0x3).
2249 #ifdef ASSERT
2250     {
2251       BLOCK_COMMENT("assert primitive array {");
2252       Label L;
2253       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2254       __ cmpw(lh, rscratch2);
2255       __ br(Assembler::GE, L);
2256       __ stop("must be a primitive array");
2257       __ bind(L);
2258       BLOCK_COMMENT("} assert primitive array done");
2259     }
2260 #endif
2261 
2262     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2263                            rscratch2, L_failed);
2264 
2265     // TypeArrayKlass
2266     //
2267     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2268     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2269     //
2270 
2271     const Register rscratch1_offset = rscratch1;    // array offset
2272     const Register r15_elsize = lh; // element size
2273 
2274     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2275            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2276     __ add(src, src, rscratch1_offset);           // src array offset
2277     __ add(dst, dst, rscratch1_offset);           // dst array offset
2278     BLOCK_COMMENT("choose copy loop based on element size");
2279 
2280     // next registers should be set before the jump to corresponding stub
2281     const Register from     = c_rarg0;  // source array address
2282     const Register to       = c_rarg1;  // destination array address
2283     const Register count    = c_rarg2;  // elements count
2284 
2285     // 'from', 'to', 'count' registers should be set in such order
2286     // since they are the same as 'src', 'src_pos', 'dst'.
2287 
2288     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2289 
2290     // The possible values of elsize are 0-3, i.e. exact_log2(element
2291     // size in bytes).  We do a simple bitwise binary search.
2292   __ BIND(L_copy_bytes);
2293     __ tbnz(r15_elsize, 1, L_copy_ints);
2294     __ tbnz(r15_elsize, 0, L_copy_shorts);
2295     __ lea(from, Address(src, src_pos));// src_addr
2296     __ lea(to,   Address(dst, dst_pos));// dst_addr
2297     __ movw(count, scratch_length); // length
2298     __ b(RuntimeAddress(byte_copy_entry));
2299 
2300   __ BIND(L_copy_shorts);
2301     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2302     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2303     __ movw(count, scratch_length); // length
2304     __ b(RuntimeAddress(short_copy_entry));
2305 
2306   __ BIND(L_copy_ints);
2307     __ tbnz(r15_elsize, 0, L_copy_longs);
2308     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2309     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2310     __ movw(count, scratch_length); // length
2311     __ b(RuntimeAddress(int_copy_entry));
2312 
2313   __ BIND(L_copy_longs);
2314 #ifdef ASSERT
2315     {
2316       BLOCK_COMMENT("assert long copy {");
2317       Label L;
2318       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2319       __ cmpw(r15_elsize, LogBytesPerLong);
2320       __ br(Assembler::EQ, L);
2321       __ stop("must be long copy, but elsize is wrong");
2322       __ bind(L);
2323       BLOCK_COMMENT("} assert long copy done");
2324     }
2325 #endif
2326     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2327     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2328     __ movw(count, scratch_length); // length
2329     __ b(RuntimeAddress(long_copy_entry));
2330 
2331     // ObjArrayKlass
2332   __ BIND(L_objArray);
2333     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2334 
2335     Label L_plain_copy, L_checkcast_copy;
2336     //  test array classes for subtyping
2337     __ load_klass(r15, dst);
2338     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2339     __ br(Assembler::NE, L_checkcast_copy);
2340 
2341     // Identically typed arrays can be copied without element-wise checks.
2342     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2343                            rscratch2, L_failed);
2344 
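         // Compute the addresses of the first source and destination elements.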
2345     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2346     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2347     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2348     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2349     __ movw(count, scratch_length); // length
2350   __ BIND(L_plain_copy);
2351     __ b(RuntimeAddress(oop_copy_entry));
2352 
2353   __ BIND(L_checkcast_copy);
2354     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2355     {
2356       // Before looking at dst.length, make sure dst is also an objArray.
2357       __ ldrw(rscratch1, Address(r15, lh_offset));
2358       __ movw(rscratch2, objArray_lh);
2359       __ eorw(rscratch1, rscratch1, rscratch2);
2360       __ cbnzw(rscratch1, L_failed);
2361 
2362       // It is safe to examine both src.length and dst.length.
2363       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2364                              r15, L_failed);
2365 
2366       __ load_klass(dst_klass, dst); // reload
2367 
2368       // Marshal the base address arguments now, freeing registers.
2369       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2370       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2371       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2372       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2373       __ movw(count, length);           // length (reloaded)
2374       Register sco_temp = c_rarg3;      // this register is free now
2375       assert_different_registers(from, to, count, sco_temp,
2376                                  dst_klass, scratch_src_klass);
2377       // assert_clean_int(count, sco_temp);
2378 
2379       // Generate the type check.
2380       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2381       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2382 
2383       // Smashes rscratch1, rscratch2
2384       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2385 
2386       // Fetch destination element klass from the ObjArrayKlass header.
2387       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2388       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2389       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2390 
2391       // the checkcast_copy loop needs two extra arguments:
2392       assert(c_rarg3 == sco_temp, "#3 already in place");
2393       // Set up arguments for checkcast_copy_entry.
2394       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2395       __ b(RuntimeAddress(checkcast_copy_entry));
2396     }
2397 
2398   __ BIND(L_failed);
2399     __ mov(r0, -1);
2400     __ leave();   // required for proper stackwalking of RuntimeStub frame
2401     __ ret(lr);
2402 
2403     return start;
2404   }
2405 
2406   //
2407   // Generate stub for array fill. If "aligned" is true, the
2408   // "to" address is assumed to be heapword aligned.
2409   //
2410   // Arguments for generated stub:
2411   //   to:    c_rarg0
2412   //   value: c_rarg1
2413   //   count: c_rarg2 treated as signed
2414   //
2415   address generate_fill(BasicType t, bool aligned, const char *name) {
2416     __ align(CodeEntryAlignment);
2417     StubCodeMark mark(this, "StubRoutines", name);
2418     address start = __ pc();
2419 
2420     BLOCK_COMMENT("Entry:");
2421 
2422     const Register to        = c_rarg0;  // source array address
2423     const Register value     = c_rarg1;  // value
2424     const Register count     = c_rarg2;  // elements count
2425 
2426     const Register bz_base = r10;        // base for block_zero routine
2427     const Register cnt_words = r11;      // temp register
2428 
2429     __ enter();
2430 
2431     Label L_fill_elements, L_exit1;
2432 
2433     int shift = -1;
2434     switch (t) {
2435       case T_BYTE:
2436         shift = 0;
2437         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2438         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2439         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2440         __ br(Assembler::LO, L_fill_elements);
2441         break;
2442       case T_SHORT:
2443         shift = 1;
2444         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2445         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2446         __ br(Assembler::LO, L_fill_elements);
2447         break;
2448       case T_INT:
2449         shift = 2;
2450         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2451         __ br(Assembler::LO, L_fill_elements);
2452         break;
2453       default: ShouldNotReachHere();
2454     }
2455 
2456     // Align source address at 8 bytes address boundary.
2457     Label L_skip_align1, L_skip_align2, L_skip_align4;
2458     if (!aligned) {
2459       switch (t) {
2460         case T_BYTE:
2461           // One byte misalignment happens only for byte arrays.
2462           __ tbz(to, 0, L_skip_align1);
2463           __ strb(value, Address(__ post(to, 1)));
2464           __ subw(count, count, 1);
2465           __ bind(L_skip_align1);
2466           // Fallthrough
2467         case T_SHORT:
2468           // Two bytes misalignment happens only for byte and short (char) arrays.
2469           __ tbz(to, 1, L_skip_align2);
2470           __ strh(value, Address(__ post(to, 2)));
2471           __ subw(count, count, 2 >> shift);
2472           __ bind(L_skip_align2);
2473           // Fallthrough
2474         case T_INT:
2475           // Align to 8 bytes, we know we are 4 byte aligned to start.
2476           __ tbz(to, 2, L_skip_align4);
2477           __ strw(value, Address(__ post(to, 4)));
2478           __ subw(count, count, 4 >> shift);
2479           __ bind(L_skip_align4);
2480           break;
2481         default: ShouldNotReachHere();
2482       }
2483     }
2484 
2485     //
2486     //  Fill large chunks
2487     //
2488     __ lsrw(cnt_words, count, 3 - shift); // number of words
2489     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2490     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
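         // count now holds the tail elements left over after the whole-word fill
         // (strictly less than 8 bytes' worth).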
2491     if (UseBlockZeroing) {
2492       Label non_block_zeroing, rest;
2493       // If the fill value is zero we can use the fast zero_words().
2494       __ cbnz(value, non_block_zeroing);
2495       __ mov(bz_base, to);
2496       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2497       address tpc = __ zero_words(bz_base, cnt_words);
2498       if (tpc == nullptr) {
2499         fatal("CodeCache is full at generate_fill");
2500       }
2501       __ b(rest);
2502       __ bind(non_block_zeroing);
2503       __ fill_words(to, cnt_words, value);
2504       __ bind(rest);
2505     } else {
2506       __ fill_words(to, cnt_words, value);
2507     }
2508 
2509     // Remaining count is less than 8 bytes. Fill it by a single store.
2510     // Note that the total length is no less than 8 bytes.
2511     if (t == T_BYTE || t == T_SHORT) {
2512       Label L_exit1;
2513       __ cbzw(count, L_exit1);
2514       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2515       __ str(value, Address(to, -8));    // overwrite some elements
2516       __ bind(L_exit1);
2517       __ leave();
2518       __ ret(lr);
2519     }
2520 
2521     // Handle copies less than 8 bytes.
2522     Label L_fill_2, L_fill_4, L_exit2;
2523     __ bind(L_fill_elements);
2524     switch (t) {
2525       case T_BYTE:
2526         __ tbz(count, 0, L_fill_2);
2527         __ strb(value, Address(__ post(to, 1)));
2528         __ bind(L_fill_2);
2529         __ tbz(count, 1, L_fill_4);
2530         __ strh(value, Address(__ post(to, 2)));
2531         __ bind(L_fill_4);
2532         __ tbz(count, 2, L_exit2);
2533         __ strw(value, Address(to));
2534         break;
2535       case T_SHORT:
2536         __ tbz(count, 0, L_fill_4);
2537         __ strh(value, Address(__ post(to, 2)));
2538         __ bind(L_fill_4);
2539         __ tbz(count, 1, L_exit2);
2540         __ strw(value, Address(to));
2541         break;
2542       case T_INT:
2543         __ cbzw(count, L_exit2);
2544         __ strw(value, Address(to));
2545         break;
2546       default: ShouldNotReachHere();
2547     }
2548     __ bind(L_exit2);
2549     __ leave();
2550     __ ret(lr);
2551     return start;
2552   }
2553 
2554   address generate_data_cache_writeback() {
2555     const Register line        = c_rarg0;  // address of line to write back
2556 
2557     __ align(CodeEntryAlignment);
2558 
2559     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2560 
2561     address start = __ pc();
2562     __ enter();
2563     __ cache_wb(Address(line, 0));
2564     __ leave();
2565     __ ret(lr);
2566 
2567     return start;
2568   }
2569 
2570   address generate_data_cache_writeback_sync() {
2571     const Register is_pre     = c_rarg0;  // pre or post sync
2572 
2573     __ align(CodeEntryAlignment);
2574 
2575     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2576 
2577     // pre wbsync is a no-op
2578     // post wbsync emits a memory barrier (the AArch64 counterpart of x86 sfence)
2579 
2580     Label skip;
2581     address start = __ pc();
2582     __ enter();
2583     __ cbnz(is_pre, skip);
2584     __ cache_wbsync(false);
2585     __ bind(skip);
2586     __ leave();
2587     __ ret(lr);
2588 
2589     return start;
2590   }
2591 
2592   void generate_arraycopy_stubs() {
2593     address entry;
2594     address entry_jbyte_arraycopy;
2595     address entry_jshort_arraycopy;
2596     address entry_jint_arraycopy;
2597     address entry_oop_arraycopy;
2598     address entry_jlong_arraycopy;
2599     address entry_checkcast_arraycopy;
2600 
2601     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards);
2602     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards);
2603 
2604     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards);
2605     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards);
2606 
2607     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards);
2608     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards);
2609 
2610     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2611 
2612     //*** jbyte
2613     // Always need aligned and unaligned versions
2614     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2615                                                                                   "jbyte_disjoint_arraycopy");
2616     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2617                                                                                   &entry_jbyte_arraycopy,
2618                                                                                   "jbyte_arraycopy");
2619     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2620                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2621     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, nullptr,
2622                                                                                   "arrayof_jbyte_arraycopy");
2623 
2624     //*** jshort
2625     // Always need aligned and unaligned versions
2626     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2627                                                                                     "jshort_disjoint_arraycopy");
2628     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2629                                                                                     &entry_jshort_arraycopy,
2630                                                                                     "jshort_arraycopy");
2631     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2632                                                                                     "arrayof_jshort_disjoint_arraycopy");
2633     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2634                                                                                     "arrayof_jshort_arraycopy");
2635 
2636     //*** jint
2637     // Aligned versions
2638     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2639                                                                                 "arrayof_jint_disjoint_arraycopy");
2640     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2641                                                                                 "arrayof_jint_arraycopy");
2642     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2643     // entry_jint_arraycopy always points to the unaligned version
2644     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2645                                                                                 "jint_disjoint_arraycopy");
2646     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2647                                                                                 &entry_jint_arraycopy,
2648                                                                                 "jint_arraycopy");
2649 
2650     //*** jlong
2651     // It is always aligned
2652     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2653                                                                                   "arrayof_jlong_disjoint_arraycopy");
2654     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2655                                                                                   "arrayof_jlong_arraycopy");
2656     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2657     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2658 
2659     //*** oops
2660     {
2661       // With compressed oops we need unaligned versions; notice that
2662       // we overwrite entry_oop_arraycopy.
2663       bool aligned = !UseCompressedOops;
2664 
2665       StubRoutines::_arrayof_oop_disjoint_arraycopy
2666         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2667                                      /*dest_uninitialized*/false);
2668       StubRoutines::_arrayof_oop_arraycopy
2669         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2670                                      /*dest_uninitialized*/false);
2671       // Aligned versions without pre-barriers
2672       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2673         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2674                                      /*dest_uninitialized*/true);
2675       StubRoutines::_arrayof_oop_arraycopy_uninit
2676         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2677                                      /*dest_uninitialized*/true);
2678     }
2679 
2680     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2681     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2682     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2683     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2684 
2685     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2686     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2687                                                                         /*dest_uninitialized*/true);
2688 
2689     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2690                                                               entry_jbyte_arraycopy,
2691                                                               entry_jshort_arraycopy,
2692                                                               entry_jint_arraycopy,
2693                                                               entry_jlong_arraycopy);
2694 
2695     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2696                                                                entry_jbyte_arraycopy,
2697                                                                entry_jshort_arraycopy,
2698                                                                entry_jint_arraycopy,
2699                                                                entry_oop_arraycopy,
2700                                                                entry_jlong_arraycopy,
2701                                                                entry_checkcast_arraycopy);
2702 
2703     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2704     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2705     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2706     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2707     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2708     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2709   }
2710 
2711   void generate_math_stubs() { Unimplemented(); }
2712 
2713   // Arguments:
2714   //
2715   // Inputs:
2716   //   c_rarg0   - source byte array address
2717   //   c_rarg1   - destination byte array address
2718   //   c_rarg2   - K (key) in little endian int array
2719   //
2720   address generate_aescrypt_encryptBlock() {
2721     __ align(CodeEntryAlignment);
2722     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2723 
2724     const Register from        = c_rarg0;  // source array address
2725     const Register to          = c_rarg1;  // destination array address
2726     const Register key         = c_rarg2;  // key array address
2727     const Register keylen      = rscratch1;
2728 
2729     address start = __ pc();
2730     __ enter(); // required for proper stackwalking of RuntimeStub frame
2731 
2732     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
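         // keylen is the length of the expanded key array in 32-bit words:
         // 44 for AES-128 (10 rounds), 52 for AES-192 (12 rounds) and 60 for
         // AES-256 (14 rounds); aesenc_loadkeys/aesecb_encrypt dispatch on it.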
2733 
2734     __ aesenc_loadkeys(key, keylen);
2735     __ aesecb_encrypt(from, to, keylen);
2736 
2737     __ mov(r0, 0);
2738 
2739     __ leave();
2740     __ ret(lr);
2741 
2742     return start;
2743   }
2744 
2745   // Arguments:
2746   //
2747   // Inputs:
2748   //   c_rarg0   - source byte array address
2749   //   c_rarg1   - destination byte array address
2750   //   c_rarg2   - K (key) in little endian int array
2751   //
2752   address generate_aescrypt_decryptBlock() {
2753     assert(UseAES, "need AES cryptographic extension support");
2754     __ align(CodeEntryAlignment);
2755     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2756     Label L_doLast;
2757 
2758     const Register from        = c_rarg0;  // source array address
2759     const Register to          = c_rarg1;  // destination array address
2760     const Register key         = c_rarg2;  // key array address
2761     const Register keylen      = rscratch1;
2762 
2763     address start = __ pc();
2764     __ enter(); // required for proper stackwalking of RuntimeStub frame
2765 
2766     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2767 
2768     __ aesecb_decrypt(from, to, key, keylen);
2769 
2770     __ mov(r0, 0);
2771 
2772     __ leave();
2773     __ ret(lr);
2774 
2775     return start;
2776   }
2777 
2778   // Arguments:
2779   //
2780   // Inputs:
2781   //   c_rarg0   - source byte array address
2782   //   c_rarg1   - destination byte array address
2783   //   c_rarg2   - K (key) in little endian int array
2784   //   c_rarg3   - r vector byte array address
2785   //   c_rarg4   - input length
2786   //
2787   // Output:
2788   //   r0        - input length
2789   //
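       // For reference, a rough sketch of the CBC encryption performed by the
       // loop below, one 16-byte block at a time (illustrative C-style
       // pseudocode, not the stub itself; E() stands for the AES block
       // encryption done with aese/aesmc, xor_16/copy_16 are made-up helpers):
       //
       //   for (int i = 0; i < len; i += 16) {
       //     xor_16(r, in + i);       // r ^= plaintext block
       //     E(key, r);               // r = AES_encrypt(key, r)
       //     copy_16(out + i, r);     // emit the cipher block
       //   }
       //   // r (the rvec array) is left holding the last cipher block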
2790   address generate_cipherBlockChaining_encryptAESCrypt() {
2791     assert(UseAES, "need AES cryptographic extension support");
2792     __ align(CodeEntryAlignment);
2793     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2794 
2795     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2796 
2797     const Register from        = c_rarg0;  // source array address
2798     const Register to          = c_rarg1;  // destination array address
2799     const Register key         = c_rarg2;  // key array address
2800     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector;
2801                                            // on exit it holds the last cipher block written
2802     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2803     const Register keylen      = rscratch1;
2804 
2805     address start = __ pc();
2806 
2807       __ enter();
2808 
2809       __ movw(rscratch2, len_reg);
2810 
2811       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2812 
2813       __ ld1(v0, __ T16B, rvec);
2814 
2815       __ cmpw(keylen, 52);
2816       __ br(Assembler::CC, L_loadkeys_44);
2817       __ br(Assembler::EQ, L_loadkeys_52);
2818 
2819       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2820       __ rev32(v17, __ T16B, v17);
2821       __ rev32(v18, __ T16B, v18);
2822     __ BIND(L_loadkeys_52);
2823       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2824       __ rev32(v19, __ T16B, v19);
2825       __ rev32(v20, __ T16B, v20);
2826     __ BIND(L_loadkeys_44);
2827       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2828       __ rev32(v21, __ T16B, v21);
2829       __ rev32(v22, __ T16B, v22);
2830       __ rev32(v23, __ T16B, v23);
2831       __ rev32(v24, __ T16B, v24);
2832       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2833       __ rev32(v25, __ T16B, v25);
2834       __ rev32(v26, __ T16B, v26);
2835       __ rev32(v27, __ T16B, v27);
2836       __ rev32(v28, __ T16B, v28);
2837       __ ld1(v29, v30, v31, __ T16B, key);
2838       __ rev32(v29, __ T16B, v29);
2839       __ rev32(v30, __ T16B, v30);
2840       __ rev32(v31, __ T16B, v31);
2841 
2842     __ BIND(L_aes_loop);
2843       __ ld1(v1, __ T16B, __ post(from, 16));
2844       __ eor(v0, __ T16B, v0, v1);
2845 
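           // The condition flags set by the cmpw(keylen, 52) above are still
           // valid here: the vector loads/stores, aese/aesmc, eor and the
           // non-flag-setting subw in this loop do not touch them, so we can
           // re-dispatch on the key length every iteration.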
2846       __ br(Assembler::CC, L_rounds_44);
2847       __ br(Assembler::EQ, L_rounds_52);
2848 
2849       __ aese(v0, v17); __ aesmc(v0, v0);
2850       __ aese(v0, v18); __ aesmc(v0, v0);
2851     __ BIND(L_rounds_52);
2852       __ aese(v0, v19); __ aesmc(v0, v0);
2853       __ aese(v0, v20); __ aesmc(v0, v0);
2854     __ BIND(L_rounds_44);
2855       __ aese(v0, v21); __ aesmc(v0, v0);
2856       __ aese(v0, v22); __ aesmc(v0, v0);
2857       __ aese(v0, v23); __ aesmc(v0, v0);
2858       __ aese(v0, v24); __ aesmc(v0, v0);
2859       __ aese(v0, v25); __ aesmc(v0, v0);
2860       __ aese(v0, v26); __ aesmc(v0, v0);
2861       __ aese(v0, v27); __ aesmc(v0, v0);
2862       __ aese(v0, v28); __ aesmc(v0, v0);
2863       __ aese(v0, v29); __ aesmc(v0, v0);
2864       __ aese(v0, v30);
2865       __ eor(v0, __ T16B, v0, v31);
2866 
2867       __ st1(v0, __ T16B, __ post(to, 16));
2868 
2869       __ subw(len_reg, len_reg, 16);
2870       __ cbnzw(len_reg, L_aes_loop);
2871 
2872       __ st1(v0, __ T16B, rvec);
2873 
2874       __ mov(r0, rscratch2);
2875 
2876       __ leave();
2877       __ ret(lr);
2878 
2879     return start;
2880   }
2881 
2882   // Arguments:
2883   //
2884   // Inputs:
2885   //   c_rarg0   - source byte array address
2886   //   c_rarg1   - destination byte array address
2887   //   c_rarg2   - K (key) in little endian int array
2888   //   c_rarg3   - r vector byte array address
2889   //   c_rarg4   - input length
2890   //
2891   // Output:
2892   //   r0        - input length
2893   //
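       // For reference, a rough sketch of the CBC decryption performed by the
       // loop below (illustrative C-style pseudocode, not the stub itself;
       // D() stands for the AES block decryption done with aesd/aesimc,
       // xor_16/copy_16 are made-up helpers):
       //
       //   for (int i = 0; i < len; i += 16) {
       //     copy_16(c, in + i);      // keep the cipher block (v1 in the code)
       //     D(key, c, p);            // p = AES_decrypt(key, c)
       //     xor_16(p, r);            // p ^= previous cipher block (or the IV)
       //     copy_16(out + i, p);     // emit the plaintext block
       //     copy_16(r, c);           // chain: r = this cipher block
       //   }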
2894   address generate_cipherBlockChaining_decryptAESCrypt() {
2895     assert(UseAES, "need AES cryptographic extension support");
2896     __ align(CodeEntryAlignment);
2897     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2898 
2899     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2900 
2901     const Register from        = c_rarg0;  // source array address
2902     const Register to          = c_rarg1;  // destination array address
2903     const Register key         = c_rarg2;  // key array address
2904     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector;
2905                                            // on exit it holds the last input cipher block
2906     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2907     const Register keylen      = rscratch1;
2908 
2909     address start = __ pc();
2910 
2911       __ enter();
2912 
2913       __ movw(rscratch2, len_reg);
2914 
2915       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2916 
2917       __ ld1(v2, __ T16B, rvec);
2918 
2919       __ ld1(v31, __ T16B, __ post(key, 16));
2920       __ rev32(v31, __ T16B, v31);
2921 
2922       __ cmpw(keylen, 52);
2923       __ br(Assembler::CC, L_loadkeys_44);
2924       __ br(Assembler::EQ, L_loadkeys_52);
2925 
2926       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2927       __ rev32(v17, __ T16B, v17);
2928       __ rev32(v18, __ T16B, v18);
2929     __ BIND(L_loadkeys_52);
2930       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2931       __ rev32(v19, __ T16B, v19);
2932       __ rev32(v20, __ T16B, v20);
2933     __ BIND(L_loadkeys_44);
2934       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2935       __ rev32(v21, __ T16B, v21);
2936       __ rev32(v22, __ T16B, v22);
2937       __ rev32(v23, __ T16B, v23);
2938       __ rev32(v24, __ T16B, v24);
2939       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2940       __ rev32(v25, __ T16B, v25);
2941       __ rev32(v26, __ T16B, v26);
2942       __ rev32(v27, __ T16B, v27);
2943       __ rev32(v28, __ T16B, v28);
2944       __ ld1(v29, v30, __ T16B, key);
2945       __ rev32(v29, __ T16B, v29);
2946       __ rev32(v30, __ T16B, v30);
2947 
2948     __ BIND(L_aes_loop);
2949       __ ld1(v0, __ T16B, __ post(from, 16));
2950       __ orr(v1, __ T16B, v0, v0);
2951 
2952       __ br(Assembler::CC, L_rounds_44);
2953       __ br(Assembler::EQ, L_rounds_52);
2954 
2955       __ aesd(v0, v17); __ aesimc(v0, v0);
2956       __ aesd(v0, v18); __ aesimc(v0, v0);
2957     __ BIND(L_rounds_52);
2958       __ aesd(v0, v19); __ aesimc(v0, v0);
2959       __ aesd(v0, v20); __ aesimc(v0, v0);
2960     __ BIND(L_rounds_44);
2961       __ aesd(v0, v21); __ aesimc(v0, v0);
2962       __ aesd(v0, v22); __ aesimc(v0, v0);
2963       __ aesd(v0, v23); __ aesimc(v0, v0);
2964       __ aesd(v0, v24); __ aesimc(v0, v0);
2965       __ aesd(v0, v25); __ aesimc(v0, v0);
2966       __ aesd(v0, v26); __ aesimc(v0, v0);
2967       __ aesd(v0, v27); __ aesimc(v0, v0);
2968       __ aesd(v0, v28); __ aesimc(v0, v0);
2969       __ aesd(v0, v29); __ aesimc(v0, v0);
2970       __ aesd(v0, v30);
2971       __ eor(v0, __ T16B, v0, v31);
2972       __ eor(v0, __ T16B, v0, v2);
2973 
2974       __ st1(v0, __ T16B, __ post(to, 16));
2975       __ orr(v2, __ T16B, v1, v1);
2976 
2977       __ subw(len_reg, len_reg, 16);
2978       __ cbnzw(len_reg, L_aes_loop);
2979 
2980       __ st1(v2, __ T16B, rvec);
2981 
2982       __ mov(r0, rscratch2);
2983 
2984       __ leave();
2985       __ ret(lr);
2986 
2987     return start;
2988   }
2989 
2990   // Big-endian 128-bit + 64-bit -> 128-bit addition.
2991   // Inputs: in (the 128-bit value) and inc (the 64-bit increment); both are preserved.
2992   // The least-significant 64-bit word is held in the upper dword of each vector.
2993   // The lower dword of inc must be zero.
2994   // Output: result (must not alias inc or tmp).
2995   void be_add_128_64(FloatRegister result, FloatRegister in,
2996                      FloatRegister inc, FloatRegister tmp) {
2997     assert_different_registers(result, tmp, inc);
2998 
2999     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
3000                                            // input
3001     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
3002     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
3003                                            // MSD == 0 (must be!) to LSD
3004     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
3005   }
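       // A minimal C sketch (illustrative only) of the arithmetic the three
       // vector instructions above implement on the two 64-bit halves of the
       // big-endian counter:
       //
       //   static void be_add_128_64_sketch(uint64_t* hi, uint64_t* lo, uint64_t inc) {
       //     uint64_t sum = *lo + inc;
       //     if (sum < inc) {         // unsigned overflow, detected by cm(HI, ...)
       //       *hi += 1;              // subtracting the all-ones lane adds the carry
       //     }
       //     *lo = sum;
       //   }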
3006 
3007   // CTR AES crypt.
3008   // Arguments:
3009   //
3010   // Inputs:
3011   //   c_rarg0   - source byte array address
3012   //   c_rarg1   - destination byte array address
3013   //   c_rarg2   - K (key) in little endian int array
3014   //   c_rarg3   - counter vector byte array address
3015   //   c_rarg4   - input length
3016   //   c_rarg5   - saved encryptedCounter start
3017   //   c_rarg6   - saved used length
3018   //
3019   // Output:
3020   //   r0       - input length
3021   //
3022   address generate_counterMode_AESCrypt() {
3023     const Register in = c_rarg0;
3024     const Register out = c_rarg1;
3025     const Register key = c_rarg2;
3026     const Register counter = c_rarg3;
3027     const Register saved_len = c_rarg4, len = r10;
3028     const Register saved_encrypted_ctr = c_rarg5;
3029     const Register used_ptr = c_rarg6, used = r12;
3030 
3031     const Register offset = r7;
3032     const Register keylen = r11;
3033 
3034     const unsigned char block_size = 16;
3035     const int bulk_width = 4;
3036     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3037     // performance with larger data sizes, but it also means that the
3038     // fast path isn't used until you have at least 8 blocks, and up
3039     // to 127 bytes of data will be processed on the slow path. For
3040     // that reason, and also so as not to blow away too much icache, 4
3041     // blocks seems like a sensible compromise.
3042 
3043     // Algorithm:
3044     //
3045     //    if (len == 0) {
3046     //        goto DONE;
3047     //    }
3048     //    int result = len;
3049     //    do {
3050     //        if (used >= blockSize) {
3051     //            if (len >= bulk_width * blockSize) {
3052     //                CTR_large_block();
3053     //                if (len == 0)
3054     //                    goto DONE;
3055     //            }
3056     //            for (;;) {
3057     //                16ByteVector v0 = counter;
3058     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3059     //                used = 0;
3060     //                if (len < blockSize)
3061     //                    break;    /* goto NEXT */
3062     //                16ByteVector v1 = load16Bytes(in, offset);
3063     //                v1 = v1 ^ encryptedCounter;
3064     //                store16Bytes(v1, out, offset);
3065     //                used = blockSize;
3066     //                offset += blockSize;
3067     //                len -= blockSize;
3068     //                if (len == 0)
3069     //                    goto DONE;
3070     //            }
3071     //        }
3072     //      NEXT:
3073     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3074     //        len--;
3075     //    } while (len != 0);
3076     //  DONE:
3077     //    return result;
3078     //
3079     // CTR_large_block()
3080     //    Wide bulk encryption of whole blocks.
3081 
3082     __ align(CodeEntryAlignment);
3083     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3084     const address start = __ pc();
3085     __ enter();
3086 
3087     Label DONE, CTR_large_block, large_block_return;
3088     __ ldrw(used, Address(used_ptr));
3089     __ cbzw(saved_len, DONE);
3090 
3091     __ mov(len, saved_len);
3092     __ mov(offset, 0);
3093 
3094     // Compute #rounds for AES based on the length of the key array
3095     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3096 
3097     __ aesenc_loadkeys(key, keylen);
3098 
3099     {
3100       Label L_CTR_loop, NEXT;
3101 
3102       __ bind(L_CTR_loop);
3103 
3104       __ cmp(used, block_size);
3105       __ br(__ LO, NEXT);
3106 
3107       // Maybe we have a lot of data
3108       __ subsw(rscratch1, len, bulk_width * block_size);
3109       __ br(__ HS, CTR_large_block);
3110       __ BIND(large_block_return);
3111       __ cbzw(len, DONE);
3112 
3113       // Setup the counter
3114       __ movi(v4, __ T4S, 0);
3115       __ movi(v5, __ T4S, 1);
3116       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3117 
3118       // 128-bit big-endian increment
3119       __ ld1(v0, __ T16B, counter);
3120       __ rev64(v16, __ T16B, v0);
3121       be_add_128_64(v16, v16, v4, /*tmp*/v5);
3122       __ rev64(v16, __ T16B, v16);
3123       __ st1(v16, __ T16B, counter);
3124       // Previous counter value is in v0
3125       // v4 contains { 0, 1 }
3126 
3127       {
3128         // We have fewer than bulk_width blocks of data left. Encrypt
3129         // them one by one until there is less than a full block
3130         // remaining, being careful to save both the encrypted counter
3131         // and the counter.
3132 
3133         Label inner_loop;
3134         __ bind(inner_loop);
3135         // Counter to encrypt is in v0
3136         __ aesecb_encrypt(noreg, noreg, keylen);
3137         __ st1(v0, __ T16B, saved_encrypted_ctr);
3138 
3139         // Do we have a remaining full block?
3140 
3141         __ mov(used, 0);
3142         __ cmp(len, block_size);
3143         __ br(__ LO, NEXT);
3144 
3145         // Yes, we have a full block
3146         __ ldrq(v1, Address(in, offset));
3147         __ eor(v1, __ T16B, v1, v0);
3148         __ strq(v1, Address(out, offset));
3149         __ mov(used, block_size);
3150         __ add(offset, offset, block_size);
3151 
3152         __ subw(len, len, block_size);
3153         __ cbzw(len, DONE);
3154 
3155         // Increment the counter, store it back
3156         __ orr(v0, __ T16B, v16, v16);
3157         __ rev64(v16, __ T16B, v16);
3158         be_add_128_64(v16, v16, v4, /*tmp*/v5);
3159         __ rev64(v16, __ T16B, v16);
3160         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3161 
3162         __ b(inner_loop);
3163       }
3164 
3165       __ BIND(NEXT);
3166 
3167       // Encrypt a single byte, and loop.
3168       // We expect this to be a rare event.
3169       __ ldrb(rscratch1, Address(in, offset));
3170       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3171       __ eor(rscratch1, rscratch1, rscratch2);
3172       __ strb(rscratch1, Address(out, offset));
3173       __ add(offset, offset, 1);
3174       __ add(used, used, 1);
3175       __ subw(len, len, 1);
3176       __ cbnzw(len, L_CTR_loop);
3177     }
3178 
3179     __ bind(DONE);
3180     __ strw(used, Address(used_ptr));
3181     __ mov(r0, saved_len);
3182 
3183     __ leave(); // required for proper stackwalking of RuntimeStub frame
3184     __ ret(lr);
3185 
3186     // Bulk encryption
3187 
3188     __ BIND(CTR_large_block);
3189     assert(bulk_width == 4 || bulk_width == 8, "must be");
3190 
3191     if (bulk_width == 8) {
3192       __ sub(sp, sp, 4 * 16);
3193       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3194     }
3195     __ sub(sp, sp, 4 * 16);
3196     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3197     RegSet saved_regs = (RegSet::of(in, out, offset)
3198                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3199     __ push(saved_regs, sp);
3200     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3201     __ add(in, in, offset);
3202     __ add(out, out, offset);
3203 
3204     // Keys should already be loaded into the correct registers
3205 
3206     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3207     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3208 
3209     // AES/CTR loop
3210     {
3211       Label L_CTR_loop;
3212       __ BIND(L_CTR_loop);
3213 
3214       // Setup the counters
3215       __ movi(v8, __ T4S, 0);
3216       __ movi(v9, __ T4S, 1);
3217       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3218 
3219       for (int i = 0; i < bulk_width; i++) {
3220         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3221         __ rev64(v0_ofs, __ T16B, v16);
3222         be_add_128_64(v16, v16, v8, /*tmp*/v9);
3223       }
3224 
3225       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3226 
3227       // Encrypt the counters
3228       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3229 
3230       if (bulk_width == 8) {
3231         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3232       }
3233 
3234       // XOR the encrypted counters with the inputs
3235       for (int i = 0; i < bulk_width; i++) {
3236         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3237         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3238         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3239       }
3240 
3241       // Write the encrypted data
3242       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3243       if (bulk_width == 8) {
3244         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3245       }
3246 
3247       __ subw(len, len, 16 * bulk_width);
3248       __ cbnzw(len, L_CTR_loop);
3249     }
3250 
3251     // Save the counter back where it goes
3252     __ rev64(v16, __ T16B, v16);
3253     __ st1(v16, __ T16B, counter);
3254 
3255     __ pop(saved_regs, sp);
3256 
3257     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3258     if (bulk_width == 8) {
3259       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3260     }
3261 
3262     __ andr(rscratch1, len, -16 * bulk_width);
3263     __ sub(len, len, rscratch1);
3264     __ add(offset, offset, rscratch1);
3265     __ mov(used, 16);
3266     __ strw(used, Address(used_ptr));
3267     __ b(large_block_return);
3268 
3269     return start;
3270   }
3271 
3272   // Vector AES Galois Counter Mode implementation. Parameters:
3273   //
3274   // in = c_rarg0
3275   // len = c_rarg1
3276   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3277   // out = c_rarg3
3278   // key = c_rarg4
3279   // state = c_rarg5 - GHASH.state
3280   // subkeyHtbl = c_rarg6 - powers of H
3281   // counter = c_rarg7 - 16 bytes of CTR
3282   // return - number of processed bytes
3283   address generate_galoisCounterMode_AESCrypt() {
3284     address ghash_polynomial = __ pc();
3285     __ emit_int64(0x87);  // The low-order bits of the field
3286                           // polynomial (i.e. p = z^7+z^2+z+1)
3287                           // repeated in the low and high parts of a
3288                           // 128-bit vector
3289     __ emit_int64(0x87);
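         // (The full GCM reduction polynomial is x^128 + x^7 + x^2 + x + 1;
         // only its low-order terms, 0x87, are needed by the reduction in
         // ghash_processBlocks_wide.)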
3290 
3291     __ align(CodeEntryAlignment);
3292     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3293     address start = __ pc();
3294     __ enter();
3295 
3296     const Register in = c_rarg0;
3297     const Register len = c_rarg1;
3298     const Register ct = c_rarg2;
3299     const Register out = c_rarg3;
3301 
3302     const Register key = c_rarg4;
3303     const Register state = c_rarg5;
3304 
3305     const Register subkeyHtbl = c_rarg6;
3306 
3307     const Register counter = c_rarg7;  // 16 bytes of CTR, updated with the incremented counter in the end
3308 
3309     const Register keylen = r10;
3310     // Save state before entering routine
3311     __ sub(sp, sp, 4 * 16);
3312     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3313     __ sub(sp, sp, 4 * 16);
3314     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3315 
3316     // __ andr(len, len, -512);
3317     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3318     __ str(len, __ pre(sp, -2 * wordSize));
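         // Only whole groups of 8 blocks (128 bytes) are handled here; the
         // number of bytes actually processed is returned in r0, leaving any
         // remaining tail to the caller.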
3319 
3320     Label DONE;
3321     __ cbz(len, DONE);
3322 
3323     // Compute #rounds for AES based on the length of the key array
3324     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3325 
3326     __ aesenc_loadkeys(key, keylen);
3327     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3328     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3329 
3330     // AES/CTR loop
3331     {
3332       Label L_CTR_loop;
3333       __ BIND(L_CTR_loop);
3334 
3335       // Setup the counters
3336       __ movi(v8, __ T4S, 0);
3337       __ movi(v9, __ T4S, 1);
3338       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3339 
3340       assert(v0->encoding() < v8->encoding(), "AES/CTR counter registers must precede the input registers");
3341       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3342         FloatRegister f = as_FloatRegister(i);
3343         __ rev32(f, __ T16B, v16);
3344         __ addv(v16, __ T4S, v16, v8);
3345       }
3346 
3347       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3348 
3349       // Encrypt the counters
3350       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3351 
3352       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3353 
3354       // XOR the encrypted counters with the inputs
3355       for (int i = 0; i < 8; i++) {
3356         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3357         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3358         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3359       }
3360       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3361       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3362 
3363       __ subw(len, len, 16 * 8);
3364       __ cbnzw(len, L_CTR_loop);
3365     }
3366 
3367     __ rev32(v16, __ T16B, v16);
3368     __ st1(v16, __ T16B, counter);
3369 
3370     __ ldr(len, Address(sp));
3371     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3372 
3373     // GHASH/CTR loop
3374     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3375                                 len, /*unrolls*/4);
3376 
3377 #ifdef ASSERT
3378     { Label L;
3379       __ cmp(len, (unsigned char)0);
3380       __ br(Assembler::EQ, L);
3381       __ stop("stubGenerator: abort");
3382       __ bind(L);
3383     }
3384 #endif
3385 
3386     __ bind(DONE);
3387     // Return the number of bytes processed
3388     __ ldr(r0, __ post(sp, 2 * wordSize));
3389 
3390     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3391     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3392 
3393     __ leave(); // required for proper stackwalking of RuntimeStub frame
3394     __ ret(lr);
3395     return start;
3396   }
3397 
3398   class Cached64Bytes {
3399   private:
3400     MacroAssembler *_masm;
3401     Register _regs[8];
3402 
3403   public:
3404     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3405       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3406       auto it = rs.begin();
3407       for (auto &r: _regs) {
3408         r = *it;
3409         ++it;
3410       }
3411     }
3412 
3413     void gen_loads(Register base) {
3414       for (int i = 0; i < 8; i += 2) {
3415         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3416       }
3417     }
3418 
3419     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3420     void extract_u32(Register dest, int i) {
3421       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
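           // e.g. i == 5 reads bits [32, 64) of _regs[2], i.e. the sixth
           // little-endian 4-byte word of the cached 64-byte block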
3422     }
3423   };
3424 
3425   // Utility routines for md5.
3426   // Clobbers rscratch1, rscratch2, r10 and r11.
3427   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3428               int k, int s, int t) {
3429     Register rscratch3 = r10;
3430     Register rscratch4 = r11;
3431 
3432     __ eorw(rscratch3, r3, r4);
3433     __ movw(rscratch2, t);
3434     __ andw(rscratch3, rscratch3, r2);
3435     __ addw(rscratch4, r1, rscratch2);
3436     reg_cache.extract_u32(rscratch1, k);
3437     __ eorw(rscratch3, rscratch3, r4);
3438     __ addw(rscratch4, rscratch4, rscratch1);
3439     __ addw(rscratch3, rscratch3, rscratch4);
3440     __ rorw(rscratch2, rscratch3, 32 - s);
3441     __ addw(r1, rscratch2, r2);
3442   }
3443 
3444   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3445               int k, int s, int t) {
3446     Register rscratch3 = r10;
3447     Register rscratch4 = r11;
3448 
3449     __ andw(rscratch3, r2, r4);
3450     __ bicw(rscratch4, r3, r4);
3451     reg_cache.extract_u32(rscratch1, k);
3452     __ movw(rscratch2, t);
3453     __ orrw(rscratch3, rscratch3, rscratch4);
3454     __ addw(rscratch4, r1, rscratch2);
3455     __ addw(rscratch4, rscratch4, rscratch1);
3456     __ addw(rscratch3, rscratch3, rscratch4);
3457     __ rorw(rscratch2, rscratch3, 32 - s);
3458     __ addw(r1, rscratch2, r2);
3459   }
3460 
3461   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3462               int k, int s, int t) {
3463     Register rscratch3 = r10;
3464     Register rscratch4 = r11;
3465 
3466     __ eorw(rscratch3, r3, r4);
3467     __ movw(rscratch2, t);
3468     __ addw(rscratch4, r1, rscratch2);
3469     reg_cache.extract_u32(rscratch1, k);
3470     __ eorw(rscratch3, rscratch3, r2);
3471     __ addw(rscratch4, rscratch4, rscratch1);
3472     __ addw(rscratch3, rscratch3, rscratch4);
3473     __ rorw(rscratch2, rscratch3, 32 - s);
3474     __ addw(r1, rscratch2, r2);
3475   }
3476 
3477   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3478               int k, int s, int t) {
3479     Register rscratch3 = r10;
3480     Register rscratch4 = r11;
3481 
3482     __ movw(rscratch3, t);
3483     __ ornw(rscratch2, r2, r4);
3484     __ addw(rscratch4, r1, rscratch3);
3485     reg_cache.extract_u32(rscratch1, k);
3486     __ eorw(rscratch3, rscratch2, r3);
3487     __ addw(rscratch4, rscratch4, rscratch1);
3488     __ addw(rscratch3, rscratch3, rscratch4);
3489     __ rorw(rscratch2, rscratch3, 32 - s);
3490     __ addw(r1, rscratch2, r2);
3491   }
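       // For reference, the classic MD5 step implemented by the four helpers
       // above (x[k] is the k-th input word, t the additive constant, s the
       // rotation amount):
       //
       //   FF: f = (b & c) | (~b & d)
       //   GG: f = (b & d) | (c & ~d)
       //   HH: f = b ^ c ^ d
       //   II: f = c ^ (b | ~d)
       //   a  = b + rotl32(a + f + x[k] + t, s)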
3492 
3493   // Arguments:
3494   //
3495   // Inputs:
3496   //   c_rarg0   - byte[]  source+offset
3497   //   c_rarg1   - int[]   MD5.state
3498   //   c_rarg2   - int     offset
3499   //   c_rarg3   - int     limit
3500   //
3501   address generate_md5_implCompress(bool multi_block, const char *name) {
3502     __ align(CodeEntryAlignment);
3503     StubCodeMark mark(this, "StubRoutines", name);
3504     address start = __ pc();
3505 
3506     Register buf       = c_rarg0;
3507     Register state     = c_rarg1;
3508     Register ofs       = c_rarg2;
3509     Register limit     = c_rarg3;
3510     Register a         = r4;
3511     Register b         = r5;
3512     Register c         = r6;
3513     Register d         = r7;
3514     Register rscratch3 = r10;
3515     Register rscratch4 = r11;
3516 
3517     Register state_regs[2] = { r12, r13 };
3518     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3519     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3520 
3521     __ push(saved_regs, sp);
3522 
3523     __ ldp(state_regs[0], state_regs[1], Address(state));
3524     __ ubfx(a, state_regs[0],  0, 32);
3525     __ ubfx(b, state_regs[0], 32, 32);
3526     __ ubfx(c, state_regs[1],  0, 32);
3527     __ ubfx(d, state_regs[1], 32, 32);
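         // The four 32-bit state words are kept packed in two 64-bit registers:
         // state_regs[0] = a | (b << 32), state_regs[1] = c | (d << 32).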
3528 
3529     Label md5_loop;
3530     __ BIND(md5_loop);
3531 
3532     reg_cache.gen_loads(buf);
3533 
3534     // Round 1
3535     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3536     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3537     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3538     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3539     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3540     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3541     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3542     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3543     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3544     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3545     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3546     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3547     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3548     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3549     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3550     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3551 
3552     // Round 2
3553     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3554     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3555     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3556     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3557     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3558     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3559     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3560     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3561     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3562     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3563     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3564     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3565     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3566     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3567     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3568     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3569 
3570     // Round 3
3571     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3572     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3573     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3574     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3575     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3576     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3577     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3578     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3579     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3580     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3581     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3582     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3583     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3584     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3585     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3586     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3587 
3588     // Round 4
3589     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3590     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3591     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3592     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3593     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3594     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3595     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3596     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3597     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3598     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3599     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3600     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3601     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3602     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3603     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3604     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3605 
3606     __ addw(a, state_regs[0], a);
3607     __ ubfx(rscratch2, state_regs[0], 32, 32);
3608     __ addw(b, rscratch2, b);
3609     __ addw(c, state_regs[1], c);
3610     __ ubfx(rscratch4, state_regs[1], 32, 32);
3611     __ addw(d, rscratch4, d);
3612 
3613     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3614     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3615 
3616     if (multi_block) {
3617       __ add(buf, buf, 64);
3618       __ add(ofs, ofs, 64);
3619       __ cmp(ofs, limit);
3620       __ br(Assembler::LE, md5_loop);
3621       __ mov(c_rarg0, ofs); // return ofs
3622     }
3623 
3624     // write hash values back in the correct order
3625     __ stp(state_regs[0], state_regs[1], Address(state));
3626 
3627     __ pop(saved_regs, sp);
3628 
3629     __ ret(lr);
3630 
3631     return start;
3632   }
3633 
3634   // Arguments:
3635   //
3636   // Inputs:
3637   //   c_rarg0   - byte[]  source+offset
3638   //   c_rarg1   - int[]   SHA.state
3639   //   c_rarg2   - int     offset
3640   //   c_rarg3   - int     limit
3641   //
3642   address generate_sha1_implCompress(bool multi_block, const char *name) {
3643     __ align(CodeEntryAlignment);
3644     StubCodeMark mark(this, "StubRoutines", name);
3645     address start = __ pc();
3646 
3647     Register buf   = c_rarg0;
3648     Register state = c_rarg1;
3649     Register ofs   = c_rarg2;
3650     Register limit = c_rarg3;
3651 
3652     Label keys;
3653     Label sha1_loop;
3654 
3655     // load the keys into v0..v3
3656     __ adr(rscratch1, keys);
3657     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3658     // load 5 words state into v6, v7
3659     __ ldrq(v6, Address(state, 0));
3660     __ ldrs(v7, Address(state, 16));
3661 
3662 
3663     __ BIND(sha1_loop);
3664     // load 64 bytes of data into v16..v19
3665     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3666     __ rev32(v16, __ T16B, v16);
3667     __ rev32(v17, __ T16B, v17);
3668     __ rev32(v18, __ T16B, v18);
3669     __ rev32(v19, __ T16B, v19);
3670 
3671     // do the sha1
3672     __ addv(v4, __ T4S, v16, v0);
3673     __ orr(v20, __ T16B, v6, v6);
3674 
3675     FloatRegister d0 = v16;
3676     FloatRegister d1 = v17;
3677     FloatRegister d2 = v18;
3678     FloatRegister d3 = v19;
3679 
3680     for (int round = 0; round < 20; round++) {
3681       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3682       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3683       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3684       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3685       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3686 
3687       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3688       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3689       __ sha1h(tmp2, __ T4S, v20);
3690       if (round < 5)
3691         __ sha1c(v20, __ T4S, tmp3, tmp4);
3692       else if (round < 10 || round >= 15)
3693         __ sha1p(v20, __ T4S, tmp3, tmp4);
3694       else
3695         __ sha1m(v20, __ T4S, tmp3, tmp4);
3696       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3697 
3698       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3699     }
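         // Each iteration above covers four of the 80 SHA-1 rounds; the round
         // function used for each 20-round group is
         //   Ch(b,c,d)     = (b & c) | (~b & d)           rounds  0-19 (sha1c)
         //   Parity(b,c,d) = b ^ c ^ d                    rounds 20-39, 60-79 (sha1p)
         //   Maj(b,c,d)    = (b & c) | (b & d) | (c & d)  rounds 40-59 (sha1m)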
3700 
3701     __ addv(v7, __ T2S, v7, v21);
3702     __ addv(v6, __ T4S, v6, v20);
3703 
3704     if (multi_block) {
3705       __ add(ofs, ofs, 64);
3706       __ cmp(ofs, limit);
3707       __ br(Assembler::LE, sha1_loop);
3708       __ mov(c_rarg0, ofs); // return ofs
3709     }
3710 
3711     __ strq(v6, Address(state, 0));
3712     __ strs(v7, Address(state, 16));
3713 
3714     __ ret(lr);
3715 
3716     __ bind(keys);
3717     __ emit_int32(0x5a827999);
3718     __ emit_int32(0x6ed9eba1);
3719     __ emit_int32(0x8f1bbcdc);
3720     __ emit_int32(0xca62c1d6);
3721 
3722     return start;
3723   }
3724 
3725 
3726   // Arguments:
3727   //
3728   // Inputs:
3729   //   c_rarg0   - byte[]  source+offset
3730   //   c_rarg1   - int[]   SHA.state
3731   //   c_rarg2   - int     offset
3732   //   c_rarg3   - int     limit
3733   //
3734   address generate_sha256_implCompress(bool multi_block, const char *name) {
3735     static const uint32_t round_consts[64] = {
3736       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3737       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3738       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3739       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3740       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3741       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3742       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3743       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3744       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3745       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3746       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3747       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3748       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3749       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3750       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3751       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3752     };
3753     __ align(CodeEntryAlignment);
3754     StubCodeMark mark(this, "StubRoutines", name);
3755     address start = __ pc();
3756 
3757     Register buf   = c_rarg0;
3758     Register state = c_rarg1;
3759     Register ofs   = c_rarg2;
3760     Register limit = c_rarg3;
3761 
3762     Label sha256_loop;
3763 
3764     __ stpd(v8, v9, __ pre(sp, -32));
3765     __ stpd(v10, v11, Address(sp, 16));
3766 
3767     // dga == v0
3768     // dgb == v1
3769     // dg0 == v2
3770     // dg1 == v3
3771     // dg2 == v4
3772     // t0 == v6
3773     // t1 == v7
3774 
3775     // load 16 keys to v16..v31
3776     __ lea(rscratch1, ExternalAddress((address)round_consts));
3777     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3778     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3779     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3780     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3781 
3782     // load 8 words (256 bits) state
3783     __ ldpq(v0, v1, state);
3784 
3785     __ BIND(sha256_loop);
3786     // load 64 bytes of data into v8..v11
3787     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3788     __ rev32(v8, __ T16B, v8);
3789     __ rev32(v9, __ T16B, v9);
3790     __ rev32(v10, __ T16B, v10);
3791     __ rev32(v11, __ T16B, v11);
3792 
3793     __ addv(v6, __ T4S, v8, v16);
3794     __ orr(v2, __ T16B, v0, v0);
3795     __ orr(v3, __ T16B, v1, v1);
3796 
3797     FloatRegister d0 = v8;
3798     FloatRegister d1 = v9;
3799     FloatRegister d2 = v10;
3800     FloatRegister d3 = v11;
3801 
3802 
3803     for (int round = 0; round < 16; round++) {
3804       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3805       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3806       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3807       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3808 
3809       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3810       __ orr(v4, __ T16B, v2, v2);
3811       if (round < 15)
3812         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3813       __ sha256h(v2, __ T4S, v3, tmp2);
3814       __ sha256h2(v3, __ T4S, v4, tmp2);
3815       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3816 
3817       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3818     }
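         // Each iteration above executes four of the 64 SHA-256 rounds with
         // sha256h/sha256h2; for the first 12 iterations sha256su0/sha256su1
         // also extend the 16-word message schedule to the full 64 words.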
3819 
3820     __ addv(v0, __ T4S, v0, v2);
3821     __ addv(v1, __ T4S, v1, v3);
3822 
3823     if (multi_block) {
3824       __ add(ofs, ofs, 64);
3825       __ cmp(ofs, limit);
3826       __ br(Assembler::LE, sha256_loop);
3827       __ mov(c_rarg0, ofs); // return ofs
3828     }
3829 
3830     __ ldpd(v10, v11, Address(sp, 16));
3831     __ ldpd(v8, v9, __ post(sp, 32));
3832 
3833     __ stpq(v0, v1, state);
3834 
3835     __ ret(lr);
3836 
3837     return start;
3838   }
3839 
3840   // Double rounds for sha512.
3841   void sha512_dround(int dr,
3842                      FloatRegister vi0, FloatRegister vi1,
3843                      FloatRegister vi2, FloatRegister vi3,
3844                      FloatRegister vi4, FloatRegister vrc0,
3845                      FloatRegister vrc1, FloatRegister vin0,
3846                      FloatRegister vin1, FloatRegister vin2,
3847                      FloatRegister vin3, FloatRegister vin4) {
3848       if (dr < 36) {
3849         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3850       }
3851       __ addv(v5, __ T2D, vrc0, vin0);
3852       __ ext(v6, __ T16B, vi2, vi3, 8);
3853       __ ext(v5, __ T16B, v5, v5, 8);
3854       __ ext(v7, __ T16B, vi1, vi2, 8);
3855       __ addv(vi3, __ T2D, vi3, v5);
3856       if (dr < 32) {
3857         __ ext(v5, __ T16B, vin3, vin4, 8);
3858         __ sha512su0(vin0, __ T2D, vin1);
3859       }
3860       __ sha512h(vi3, __ T2D, v6, v7);
3861       if (dr < 32) {
3862         __ sha512su1(vin0, __ T2D, vin2, v5);
3863       }
3864       __ addv(vi4, __ T2D, vi1, vi3);
3865       __ sha512h2(vi3, __ T2D, vi1, vi0);
3866   }
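       // Each sha512_dround call performs two of the 80 SHA-512 rounds with
       // sha512h/sha512h2. While dr < 32 it also extends the message schedule
       // with sha512su0/sha512su1, and while dr < 36 it streams in the next
       // pair of round constants through rscratch2.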
3867 
3868   // Arguments:
3869   //
3870   // Inputs:
3871   //   c_rarg0   - byte[]  source+offset
3872   //   c_rarg1   - int[]   SHA.state
3873   //   c_rarg2   - int     offset
3874   //   c_rarg3   - int     limit
3875   //
3876   address generate_sha512_implCompress(bool multi_block, const char *name) {
3877     static const uint64_t round_consts[80] = {
3878       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3879       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3880       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3881       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3882       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3883       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3884       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3885       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3886       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3887       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3888       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3889       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3890       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3891       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3892       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3893       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3894       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3895       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3896       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3897       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3898       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3899       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3900       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3901       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3902       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3903       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3904       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3905     };
3906 
3907     __ align(CodeEntryAlignment);
3908     StubCodeMark mark(this, "StubRoutines", name);
3909     address start = __ pc();
3910 
3911     Register buf   = c_rarg0;
3912     Register state = c_rarg1;
3913     Register ofs   = c_rarg2;
3914     Register limit = c_rarg3;
3915 
3916     __ stpd(v8, v9, __ pre(sp, -64));
3917     __ stpd(v10, v11, Address(sp, 16));
3918     __ stpd(v12, v13, Address(sp, 32));
3919     __ stpd(v14, v15, Address(sp, 48));
3920 
3921     Label sha512_loop;
3922 
3923     // load state
3924     __ ld1(v8, v9, v10, v11, __ T2D, state);
3925 
3926     // load first 4 round constants
3927     __ lea(rscratch1, ExternalAddress((address)round_consts));
3928     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3929 
3930     __ BIND(sha512_loop);
3931     // load 128B of data into v12..v19
3932     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3933     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3934     __ rev64(v12, __ T16B, v12);
3935     __ rev64(v13, __ T16B, v13);
3936     __ rev64(v14, __ T16B, v14);
3937     __ rev64(v15, __ T16B, v15);
3938     __ rev64(v16, __ T16B, v16);
3939     __ rev64(v17, __ T16B, v17);
3940     __ rev64(v18, __ T16B, v18);
3941     __ rev64(v19, __ T16B, v19);
3942 
3943     __ mov(rscratch2, rscratch1);
3944 
3945     __ mov(v0, __ T16B, v8);
3946     __ mov(v1, __ T16B, v9);
3947     __ mov(v2, __ T16B, v10);
3948     __ mov(v3, __ T16B, v11);
3949 
3950     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3951     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3952     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3953     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3954     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3955     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3956     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3957     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3958     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3959     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3960     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3961     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3962     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3963     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3964     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3965     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3966     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3967     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3968     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3969     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3970     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3971     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3972     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3973     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3974     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3975     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3976     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3977     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3978     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3979     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3980     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3981     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3982     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3983     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3984     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3985     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3986     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3987     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3988     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3989     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3990 
3991     __ addv(v8, __ T2D, v8, v0);
3992     __ addv(v9, __ T2D, v9, v1);
3993     __ addv(v10, __ T2D, v10, v2);
3994     __ addv(v11, __ T2D, v11, v3);
3995 
3996     if (multi_block) {
3997       __ add(ofs, ofs, 128);
3998       __ cmp(ofs, limit);
3999       __ br(Assembler::LE, sha512_loop);
4000       __ mov(c_rarg0, ofs); // return ofs
4001     }
4002 
4003     __ st1(v8, v9, v10, v11, __ T2D, state);
4004 
4005     __ ldpd(v14, v15, Address(sp, 48));
4006     __ ldpd(v12, v13, Address(sp, 32));
4007     __ ldpd(v10, v11, Address(sp, 16));
4008     __ ldpd(v8, v9, __ post(sp, 64));
4009 
4010     __ ret(lr);
4011 
4012     return start;
4013   }
4014 
4015   // Arguments:
4016   //
4017   // Inputs:
4018   //   c_rarg0   - byte[]  source+offset
4019   //   c_rarg1   - byte[]  SHA.state
4020   //   c_rarg2   - int     block_size
4021   //   c_rarg3   - int     offset
4022   //   c_rarg4   - int     limit
4023   //
4024   address generate_sha3_implCompress(bool multi_block, const char *name) {
4025     static const uint64_t round_consts[24] = {
4026       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4027       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4028       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4029       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4030       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4031       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4032       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4033       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4034     };
4035 
4036     __ align(CodeEntryAlignment);
4037     StubCodeMark mark(this, "StubRoutines", name);
4038     address start = __ pc();
4039 
4040     Register buf           = c_rarg0;
4041     Register state         = c_rarg1;
4042     Register block_size    = c_rarg2;
4043     Register ofs           = c_rarg3;
4044     Register limit         = c_rarg4;
4045 
4046     Label sha3_loop, rounds24_loop;
4047     Label sha3_512_or_sha3_384, shake128;
4048 
4049     __ stpd(v8, v9, __ pre(sp, -64));
4050     __ stpd(v10, v11, Address(sp, 16));
4051     __ stpd(v12, v13, Address(sp, 32));
4052     __ stpd(v14, v15, Address(sp, 48));
4053 
4054     // load state
4055     __ add(rscratch1, state, 32);
4056     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
4057     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
4058     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4059     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4060     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4061     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4062     __ ld1(v24, __ T1D, rscratch1);
4063 
4064     __ BIND(sha3_loop);
4065 
4066     // 24 keccak rounds
4067     __ movw(rscratch2, 24);
4068 
4069     // load round_constants base
4070     __ lea(rscratch1, ExternalAddress((address) round_consts));
4071 
4072     // load input
4073     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4074     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4075     __ eor(v0, __ T8B, v0, v25);
4076     __ eor(v1, __ T8B, v1, v26);
4077     __ eor(v2, __ T8B, v2, v27);
4078     __ eor(v3, __ T8B, v3, v28);
4079     __ eor(v4, __ T8B, v4, v29);
4080     __ eor(v5, __ T8B, v5, v30);
4081     __ eor(v6, __ T8B, v6, v31);
4082 
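         // Dispatch on the rate (block_size, in bytes), which identifies the
         // variant: 72 -> SHA3-512, 104 -> SHA3-384, 136 -> SHA3-256/SHAKE256,
         // 144 -> SHA3-224, 168 -> SHAKE128
         // (rate = 200 - 2 * digest_size_in_bytes for the SHA3-* variants).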
4083     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4084     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4085 
4086     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4087     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4088     __ eor(v7, __ T8B, v7, v25);
4089     __ eor(v8, __ T8B, v8, v26);
4090     __ eor(v9, __ T8B, v9, v27);
4091     __ eor(v10, __ T8B, v10, v28);
4092     __ eor(v11, __ T8B, v11, v29);
4093     __ eor(v12, __ T8B, v12, v30);
4094     __ eor(v13, __ T8B, v13, v31);
4095 
4096     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4097     __ eor(v14, __ T8B, v14, v25);
4098     __ eor(v15, __ T8B, v15, v26);
4099     __ eor(v16, __ T8B, v16, v27);
4100 
4101     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4102     __ andw(c_rarg5, block_size, 48);
4103     __ cbzw(c_rarg5, rounds24_loop);
4104 
4105     __ tbnz(block_size, 5, shake128);
4106     // block_size == 144, bit5 == 0, SHA3-224
4107     __ ldrd(v28, __ post(buf, 8));
4108     __ eor(v17, __ T8B, v17, v28);
4109     __ b(rounds24_loop);
4110 
4111     __ BIND(shake128);
4112     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4113     __ eor(v17, __ T8B, v17, v28);
4114     __ eor(v18, __ T8B, v18, v29);
4115     __ eor(v19, __ T8B, v19, v30);
4116     __ eor(v20, __ T8B, v20, v31);
4117     __ b(rounds24_loop); // block_size == 168, SHAKE128
4118 
4119     __ BIND(sha3_512_or_sha3_384);
4120     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4121     __ eor(v7, __ T8B, v7, v25);
4122     __ eor(v8, __ T8B, v8, v26);
4123     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4124 
4125     // SHA3-384
4126     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4127     __ eor(v9,  __ T8B, v9,  v27);
4128     __ eor(v10, __ T8B, v10, v28);
4129     __ eor(v11, __ T8B, v11, v29);
4130     __ eor(v12, __ T8B, v12, v30);
4131 
4132     __ BIND(rounds24_loop);
4133     __ subw(rscratch2, rscratch2, 1);
4134 
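         // One Keccak-f[1600] round using the SHA3 extension instructions:
         //   theta:  the eor3 sequence computes the five column parities and
         //           rax1 combines adjacent parities (one rotated left by 1)
         //           into the D values that get XORed into every lane;
         //   rho/pi: each xar XORs a lane with its D value and rotates it by
         //           its rho constant (written as 64 - n) into its pi position;
         //   chi:    each bcax computes A ^ (B & ~C) along a row;
         //   iota:   ld1r loads the round constant and the final eor applies
         //           it to lane (0, 0).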
4135     __ eor3(v29, __ T16B, v4, v9, v14);
4136     __ eor3(v26, __ T16B, v1, v6, v11);
4137     __ eor3(v28, __ T16B, v3, v8, v13);
4138     __ eor3(v25, __ T16B, v0, v5, v10);
4139     __ eor3(v27, __ T16B, v2, v7, v12);
4140     __ eor3(v29, __ T16B, v29, v19, v24);
4141     __ eor3(v26, __ T16B, v26, v16, v21);
4142     __ eor3(v28, __ T16B, v28, v18, v23);
4143     __ eor3(v25, __ T16B, v25, v15, v20);
4144     __ eor3(v27, __ T16B, v27, v17, v22);
4145 
4146     __ rax1(v30, __ T2D, v29, v26);
4147     __ rax1(v26, __ T2D, v26, v28);
4148     __ rax1(v28, __ T2D, v28, v25);
4149     __ rax1(v25, __ T2D, v25, v27);
4150     __ rax1(v27, __ T2D, v27, v29);
4151 
4152     __ eor(v0, __ T16B, v0, v30);
4153     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4154     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4155     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4156     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4157     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4158     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4159     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4160     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4161     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4162     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4163     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4164     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4165     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4166     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4167     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4168     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4169     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4170     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4171     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4172     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4173     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4174     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4175     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4176     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4177 
4178     __ bcax(v20, __ T16B, v31, v22, v8);
4179     __ bcax(v21, __ T16B, v8,  v23, v22);
4180     __ bcax(v22, __ T16B, v22, v24, v23);
4181     __ bcax(v23, __ T16B, v23, v31, v24);
4182     __ bcax(v24, __ T16B, v24, v8,  v31);
4183 
4184     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4185 
4186     __ bcax(v17, __ T16B, v25, v19, v3);
4187     __ bcax(v18, __ T16B, v3,  v15, v19);
4188     __ bcax(v19, __ T16B, v19, v16, v15);
4189     __ bcax(v15, __ T16B, v15, v25, v16);
4190     __ bcax(v16, __ T16B, v16, v3,  v25);
4191 
4192     __ bcax(v10, __ T16B, v29, v12, v26);
4193     __ bcax(v11, __ T16B, v26, v13, v12);
4194     __ bcax(v12, __ T16B, v12, v14, v13);
4195     __ bcax(v13, __ T16B, v13, v29, v14);
4196     __ bcax(v14, __ T16B, v14, v26, v29);
4197 
4198     __ bcax(v7, __ T16B, v30, v9,  v4);
4199     __ bcax(v8, __ T16B, v4,  v5,  v9);
4200     __ bcax(v9, __ T16B, v9,  v6,  v5);
4201     __ bcax(v5, __ T16B, v5,  v30, v6);
4202     __ bcax(v6, __ T16B, v6,  v4,  v30);
4203 
4204     __ bcax(v3, __ T16B, v27, v0,  v28);
4205     __ bcax(v4, __ T16B, v28, v1,  v0);
4206     __ bcax(v0, __ T16B, v0,  v2,  v1);
4207     __ bcax(v1, __ T16B, v1,  v27, v2);
4208     __ bcax(v2, __ T16B, v2,  v28, v27);
4209 
4210     __ eor(v0, __ T16B, v0, v31);
4211 
4212     __ cbnzw(rscratch2, rounds24_loop);
4213 
4214     if (multi_block) {
4215       __ add(ofs, ofs, block_size);
4216       __ cmp(ofs, limit);
4217       __ br(Assembler::LE, sha3_loop);
4218       __ mov(c_rarg0, ofs); // return ofs
4219     }
4220 
4221     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4222     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4223     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4224     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4225     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4226     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4227     __ st1(v24, __ T1D, state);
4228 
4229     __ ldpd(v14, v15, Address(sp, 48));
4230     __ ldpd(v12, v13, Address(sp, 32));
4231     __ ldpd(v10, v11, Address(sp, 16));
4232     __ ldpd(v8, v9, __ post(sp, 64));
4233 
4234     __ ret(lr);
4235 
4236     return start;
4237   }
4238 
4239   /**
4240    *  Arguments:
4241    *
4242    * Inputs:
4243    *   c_rarg0   - int crc
4244    *   c_rarg1   - byte* buf
4245    *   c_rarg2   - int length
4246    *
4247    * Output:
4248    *        r0   - int crc result
4249    */
4250   address generate_updateBytesCRC32() {
4251     assert(UseCRC32Intrinsics, "what are we doing here?");
4252 
4253     __ align(CodeEntryAlignment);
4254     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4255 
4256     address start = __ pc();
4257 
4258     const Register crc   = c_rarg0;  // crc
4259     const Register buf   = c_rarg1;  // source java byte array address
4260     const Register len   = c_rarg2;  // length
4261     const Register table0 = c_rarg3; // crc_table address
4262     const Register table1 = c_rarg4;
4263     const Register table2 = c_rarg5;
4264     const Register table3 = c_rarg6;
4265     const Register tmp3 = c_rarg7;
4266 
4267     BLOCK_COMMENT("Entry:");
4268     __ enter(); // required for proper stackwalking of RuntimeStub frame
4269 
4270     __ kernel_crc32(crc, buf, len,
4271               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4272 
4273     __ leave(); // required for proper stackwalking of RuntimeStub frame
4274     __ ret(lr);
4275 
4276     return start;
4277   }
4278 
4279   // ChaCha20 block function.  This version parallelizes by loading
4280   // individual 32-bit state elements into vectors for four blocks
4281   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4282   //
4283   // state (int[16]) = c_rarg0
4284   // keystream (byte[1024]) = c_rarg1
4285   // return - number of bytes of keystream (always 256)
4286   address generate_chacha20Block_blockpar() {
4287     Label L_twoRounds, L_cc20_const;
4288     // The constant data is broken into two 128-bit segments to be loaded
4289     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4290     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4291     // The second 128 bits are a table constant used for 8-bit left rotations.
4292     __ BIND(L_cc20_const);
4293     __ emit_int64(0x0000000100000000UL);
4294     __ emit_int64(0x0000000300000002UL);
4295     __ emit_int64(0x0605040702010003UL);
4296     __ emit_int64(0x0E0D0C0F0A09080BUL);
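         // The second constant above is used as the index operand of the tbl
         // instruction inside cc20_quarter_round: result byte i = source byte
         // at index idx[i].  The per-lane indices {3, 0, 1, 2} therefore
         // implement a left rotation by 8 bits of each little-endian 32-bit
         // word:
         //   rotl8(v) == (v << 8) | (v >> 24)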
4297 
4298     __ align(CodeEntryAlignment);
4299     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4300     address start = __ pc();
4301     __ enter();
4302 
4303     int i, j;
4304     const Register state = c_rarg0;
4305     const Register keystream = c_rarg1;
4306     const Register loopCtr = r10;
4307     const Register tmpAddr = r11;
4308 
4309     const FloatRegister stateFirst = v0;
4310     const FloatRegister stateSecond = v1;
4311     const FloatRegister stateThird = v2;
4312     const FloatRegister stateFourth = v3;
4313     const FloatRegister origCtrState = v28;
4314     const FloatRegister scratch = v29;
4315     const FloatRegister lrot8Tbl = v30;
4316 
4317     // Organize SIMD registers in an array that facilitates
4318     // putting repetitive opcodes into loop structures.  It is
4319     // important that each grouping of 4 registers is monotonically
4320     // increasing to support the requirements of multi-register
4321     // instructions (e.g. ld4r, st4, etc.)
4322     const FloatRegister workSt[16] = {
4323          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4324         v20, v21, v22, v23, v24, v25, v26, v27
4325     };
4326 
4327     // Load from memory and interleave across 16 SIMD registers,
4328     // with each successive word from memory broadcast to all lanes
4329     // of the next SIMD register.
4330     //      Addr(0) -> All lanes in workSt[i]
4331     //      Addr(4) -> All lanes workSt[i + 1], etc.
4332     __ mov(tmpAddr, state);
4333     for (i = 0; i < 16; i += 4) {
4334       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4335           __ post(tmpAddr, 16));
4336     }
4337 
4338     // Pull in constant data.  The first 16 bytes are the add overlay
4339     // which is applied to the vector holding the counter (state[12]).
4340     // The second 16 bytes are the index vector for the 8-bit left
4341     // rotation tbl instruction.
4342     __ adr(tmpAddr, L_cc20_const);
4343     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4344     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4345 
4346     // Set up the 10 iteration loop and perform all 8 quarter round ops
4347     __ mov(loopCtr, 10);
4348     __ BIND(L_twoRounds);
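         // Each cc20_quarter_round applies the standard ChaCha20 quarter round
         // (RFC 8439) to four of the 16 state vectors, i.e. for (a, b, c, d):
         //   a += b; d ^= a; d = rotl32(d, 16);
         //   c += d; b ^= c; b = rotl32(b, 12);
         //   a += b; d ^= a; d = rotl32(d, 8);
         //   c += d; b ^= c; b = rotl32(b, 7);
         // The first four calls below are the column rounds, the next four are
         // the diagonal rounds; 10 iterations of this double round = 20 rounds.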
4349 
4350     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4351         scratch, lrot8Tbl);
4352     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4353         scratch, lrot8Tbl);
4354     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4355         scratch, lrot8Tbl);
4356     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4357         scratch, lrot8Tbl);
4358 
4359     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4360         scratch, lrot8Tbl);
4361     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4362         scratch, lrot8Tbl);
4363     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4364         scratch, lrot8Tbl);
4365     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4366         scratch, lrot8Tbl);
4367 
4368     // Decrement and iterate
4369     __ sub(loopCtr, loopCtr, 1);
4370     __ cbnz(loopCtr, L_twoRounds);
4371 
4372     __ mov(tmpAddr, state);
4373 
4374     // Add the starting state back to the post-loop keystream
4375     // state.  We read/interlace the state array from memory into
4376     // 4 registers similar to what we did in the beginning.  Then
4377     // add the counter overlay onto workSt[12] at the end.
4378     for (i = 0; i < 16; i += 4) {
4379       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4380           __ post(tmpAddr, 16));
4381       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4382       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4383       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4384       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4385     }
4386     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4387 
4388     // Write to key stream, storing the same element out of workSt[0..15]
4389     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4390     // for the next element position.
4391     for (i = 0; i < 4; i++) {
4392       for (j = 0; j < 16; j += 4) {
4393         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4394             __ post(keystream, 16));
4395       }
4396     }
4397 
4398     __ mov(r0, 256);             // Return length of output keystream
4399     __ leave();
4400     __ ret(lr);
4401 
4402     return start;
4403   }
4404 
4405   /**
4406    *  Arguments:
4407    *
4408    * Inputs:
4409    *   c_rarg0   - int crc
4410    *   c_rarg1   - byte* buf
4411    *   c_rarg2   - int length
4412    *   c_rarg3   - int* table
4413    *
4414    * Output:
4415    *       r0   - int crc result
4416    */
4417   address generate_updateBytesCRC32C() {
4418     assert(UseCRC32CIntrinsics, "what are we doing here?");
4419 
4420     __ align(CodeEntryAlignment);
4421     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4422 
4423     address start = __ pc();
4424 
4425     const Register crc   = c_rarg0;  // crc
4426     const Register buf   = c_rarg1;  // source java byte array address
4427     const Register len   = c_rarg2;  // length
4428     const Register table0 = c_rarg3; // crc_table address
4429     const Register table1 = c_rarg4;
4430     const Register table2 = c_rarg5;
4431     const Register table3 = c_rarg6;
4432     const Register tmp3 = c_rarg7;
4433 
4434     BLOCK_COMMENT("Entry:");
4435     __ enter(); // required for proper stackwalking of RuntimeStub frame
4436 
4437     __ kernel_crc32c(crc, buf, len,
4438               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4439 
4440     __ leave(); // required for proper stackwalking of RuntimeStub frame
4441     __ ret(lr);
4442 
4443     return start;
4444   }
4445 
4446   /**
4447    *  Arguments:
4448    *
4449    *  Inputs:
4450    *   c_rarg0   - int   adler
4451    *   c_rarg1   - byte* buff
4452    *   c_rarg2   - int   len
4453    *
4454    * Output:
4455    *   c_rarg0   - int adler result
4456    */
4457   address generate_updateBytesAdler32() {
4458     __ align(CodeEntryAlignment);
4459     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4460     address start = __ pc();
4461 
4462     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4463 
4464     // Aliases
4465     Register adler  = c_rarg0;
4466     Register s1     = c_rarg0;
4467     Register s2     = c_rarg3;
4468     Register buff   = c_rarg1;
4469     Register len    = c_rarg2;
4470     Register nmax  = r4;
4471     Register base  = r5;
4472     Register count = r6;
4473     Register temp0 = rscratch1;
4474     Register temp1 = rscratch2;
4475     FloatRegister vbytes = v0;
4476     FloatRegister vs1acc = v1;
4477     FloatRegister vs2acc = v2;
4478     FloatRegister vtable = v3;
4479 
4480     // Max number of bytes we can process before having to take the mod
4481     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4482     uint64_t BASE = 0xfff1;
4483     uint64_t NMAX = 0x15B0;
4484 
4485     __ mov(base, BASE);
4486     __ mov(nmax, NMAX);
4487 
4488     // Load accumulation coefficients for the upper 16 bits
4489     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4490     __ ld1(vtable, __ T16B, Address(temp0));
4491 
4492     // s1 is initialized to the lower 16 bits of adler
4493     // s2 is initialized to the upper 16 bits of adler
4494     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4495     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4496 
4497     // The pipelined loop needs at least 16 elements for one iteration.
4498     // It checks this itself, but it is more efficient to branch to the cleanup loop up front.
4499     __ cmp(len, (u1)16);
4500     __ br(Assembler::HS, L_nmax);
4501     __ cbz(len, L_combine);
4502 
4503     __ bind(L_simple_by1_loop);
4504     __ ldrb(temp0, Address(__ post(buff, 1)));
4505     __ add(s1, s1, temp0);
4506     __ add(s2, s2, s1);
4507     __ subs(len, len, 1);
4508     __ br(Assembler::HI, L_simple_by1_loop);
4509 
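         // The modular reductions in this stub avoid a division by using
         // 2^16 mod BASE == 65536 mod 65521 == 15, so for an accumulator s:
         //   s = (s >> 16) * 15 + (s & 0xffff);   // the lsl-by-4/sub pair forms *15
         // Depending on how large s can have grown, zero, one, or two such
         // steps are applied before a final conditional subtract (subs/csel)
         // completes the reduction to s mod BASE.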
4510     // s1 = s1 % BASE
4511     __ subs(temp0, s1, base);
4512     __ csel(s1, temp0, s1, Assembler::HS);
4513 
4514     // s2 = s2 % BASE
4515     __ lsr(temp0, s2, 16);
4516     __ lsl(temp1, temp0, 4);
4517     __ sub(temp1, temp1, temp0);
4518     __ add(s2, temp1, s2, ext::uxth);
4519 
4520     __ subs(temp0, s2, base);
4521     __ csel(s2, temp0, s2, Assembler::HS);
4522 
4523     __ b(L_combine);
4524 
4525     __ bind(L_nmax);
4526     __ subs(len, len, nmax);
4527     __ sub(count, nmax, 16);
4528     __ br(Assembler::LO, L_by16);
4529 
4530     __ bind(L_nmax_loop);
4531 
4532     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4533                                       vbytes, vs1acc, vs2acc, vtable);
4534 
4535     __ subs(count, count, 16);
4536     __ br(Assembler::HS, L_nmax_loop);
4537 
4538     // s1 = s1 % BASE
4539     __ lsr(temp0, s1, 16);
4540     __ lsl(temp1, temp0, 4);
4541     __ sub(temp1, temp1, temp0);
4542     __ add(temp1, temp1, s1, ext::uxth);
4543 
4544     __ lsr(temp0, temp1, 16);
4545     __ lsl(s1, temp0, 4);
4546     __ sub(s1, s1, temp0);
4547     __ add(s1, s1, temp1, ext::uxth);
4548 
4549     __ subs(temp0, s1, base);
4550     __ csel(s1, temp0, s1, Assembler::HS);
4551 
4552     // s2 = s2 % BASE
4553     __ lsr(temp0, s2, 16);
4554     __ lsl(temp1, temp0, 4);
4555     __ sub(temp1, temp1, temp0);
4556     __ add(temp1, temp1, s2, ext::uxth);
4557 
4558     __ lsr(temp0, temp1, 16);
4559     __ lsl(s2, temp0, 4);
4560     __ sub(s2, s2, temp0);
4561     __ add(s2, s2, temp1, ext::uxth);
4562 
4563     __ subs(temp0, s2, base);
4564     __ csel(s2, temp0, s2, Assembler::HS);
4565 
4566     __ subs(len, len, nmax);
4567     __ sub(count, nmax, 16);
4568     __ br(Assembler::HS, L_nmax_loop);
4569 
4570     __ bind(L_by16);
4571     __ adds(len, len, count);
4572     __ br(Assembler::LO, L_by1);
4573 
4574     __ bind(L_by16_loop);
4575 
4576     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4577                                       vbytes, vs1acc, vs2acc, vtable);
4578 
4579     __ subs(len, len, 16);
4580     __ br(Assembler::HS, L_by16_loop);
4581 
4582     __ bind(L_by1);
4583     __ adds(len, len, 15);
4584     __ br(Assembler::LO, L_do_mod);
4585 
4586     __ bind(L_by1_loop);
4587     __ ldrb(temp0, Address(__ post(buff, 1)));
4588     __ add(s1, temp0, s1);
4589     __ add(s2, s2, s1);
4590     __ subs(len, len, 1);
4591     __ br(Assembler::HS, L_by1_loop);
4592 
4593     __ bind(L_do_mod);
4594     // s1 = s1 % BASE
4595     __ lsr(temp0, s1, 16);
4596     __ lsl(temp1, temp0, 4);
4597     __ sub(temp1, temp1, temp0);
4598     __ add(temp1, temp1, s1, ext::uxth);
4599 
4600     __ lsr(temp0, temp1, 16);
4601     __ lsl(s1, temp0, 4);
4602     __ sub(s1, s1, temp0);
4603     __ add(s1, s1, temp1, ext::uxth);
4604 
4605     __ subs(temp0, s1, base);
4606     __ csel(s1, temp0, s1, Assembler::HS);
4607 
4608     // s2 = s2 % BASE
4609     __ lsr(temp0, s2, 16);
4610     __ lsl(temp1, temp0, 4);
4611     __ sub(temp1, temp1, temp0);
4612     __ add(temp1, temp1, s2, ext::uxth);
4613 
4614     __ lsr(temp0, temp1, 16);
4615     __ lsl(s2, temp0, 4);
4616     __ sub(s2, s2, temp0);
4617     __ add(s2, s2, temp1, ext::uxth);
4618 
4619     __ subs(temp0, s2, base);
4620     __ csel(s2, temp0, s2, Assembler::HS);
4621 
4622     // Combine lower bits and higher bits
4623     __ bind(L_combine);
4624     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4625 
4626     __ ret(lr);
4627 
4628     return start;
4629   }
4630 
4631   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4632           Register temp0, Register temp1, FloatRegister vbytes,
4633           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4634     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4635     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4636     // In non-vectorized code, we update s1 and s2 as:
4637     //   s1 <- s1 + b1
4638     //   s2 <- s2 + s1
4639     //   s1 <- s1 + b2
4640     //   s2 <- s2 + s1
4641     //   ...
4642     //   s1 <- s1 + b16
4643     //   s2 <- s2 + s1
4644     // Putting above assignments together, we have:
4645     //   s1_new = s1 + b1 + b2 + ... + b16
4646     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4647     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4648     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
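         // vtable (loaded by the caller from StubRoutines::aarch64::_adler_table)
         // is expected to hold the byte coefficients {16, 15, ..., 1}, so the
         // umullv/umlalv pair below computes the dot product for the low and
         // high 8 bytes respectively, and the uaddlv instructions perform the
         // horizontal sums.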
4649     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4650 
4651     // s2 = s2 + s1 * 16
4652     __ add(s2, s2, s1, Assembler::LSL, 4);
4653 
4654     // vs1acc = b1 + b2 + b3 + ... + b16
4655     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4656     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4657     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4658     __ uaddlv(vs1acc, __ T16B, vbytes);
4659     __ uaddlv(vs2acc, __ T8H, vs2acc);
4660 
4661     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4662     __ fmovd(temp0, vs1acc);
4663     __ fmovd(temp1, vs2acc);
4664     __ add(s1, s1, temp0);
4665     __ add(s2, s2, temp1);
4666   }
4667 
4668   /**
4669    *  Arguments:
4670    *
4671    *  Input:
4672    *    c_rarg0   - x address
4673    *    c_rarg1   - x length
4674    *    c_rarg2   - y address
4675    *    c_rarg3   - y length
4676    *    c_rarg4   - z address
4677    *    c_rarg5   - z length
4678    */
4679   address generate_multiplyToLen() {
4680     __ align(CodeEntryAlignment);
4681     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4682 
4683     address start = __ pc();
4684     const Register x     = r0;
4685     const Register xlen  = r1;
4686     const Register y     = r2;
4687     const Register ylen  = r3;
4688     const Register z     = r4;
4689     const Register zlen  = r5;
4690 
4691     const Register tmp1  = r10;
4692     const Register tmp2  = r11;
4693     const Register tmp3  = r12;
4694     const Register tmp4  = r13;
4695     const Register tmp5  = r14;
4696     const Register tmp6  = r15;
4697     const Register tmp7  = r16;
4698 
4699     BLOCK_COMMENT("Entry:");
4700     __ enter(); // required for proper stackwalking of RuntimeStub frame
4701     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4702     __ leave(); // required for proper stackwalking of RuntimeStub frame
4703     __ ret(lr);
4704 
4705     return start;
4706   }
4707 
4708   address generate_squareToLen() {
4709     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
4710     // faster than multiply_to_len on some CPUs and slower on others, but
4711     // multiply_to_len shows slightly better results overall.
4712     __ align(CodeEntryAlignment);
4713     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4714     address start = __ pc();
4715 
4716     const Register x     = r0;
4717     const Register xlen  = r1;
4718     const Register z     = r2;
4719     const Register zlen  = r3;
4720     const Register y     = r4; // == x
4721     const Register ylen  = r5; // == xlen
4722 
4723     const Register tmp1  = r10;
4724     const Register tmp2  = r11;
4725     const Register tmp3  = r12;
4726     const Register tmp4  = r13;
4727     const Register tmp5  = r14;
4728     const Register tmp6  = r15;
4729     const Register tmp7  = r16;
4730 
4731     RegSet spilled_regs = RegSet::of(y, ylen);
4732     BLOCK_COMMENT("Entry:");
4733     __ enter();
4734     __ push(spilled_regs, sp);
4735     __ mov(y, x);
4736     __ mov(ylen, xlen);
4737     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4738     __ pop(spilled_regs, sp);
4739     __ leave();
4740     __ ret(lr);
4741     return start;
4742   }
4743 
4744   address generate_mulAdd() {
4745     __ align(CodeEntryAlignment);
4746     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4747 
4748     address start = __ pc();
4749 
4750     const Register out     = r0;
4751     const Register in      = r1;
4752     const Register offset  = r2;
4753     const Register len     = r3;
4754     const Register k       = r4;
4755 
4756     BLOCK_COMMENT("Entry:");
4757     __ enter();
4758     __ mul_add(out, in, offset, len, k);
4759     __ leave();
4760     __ ret(lr);
4761 
4762     return start;
4763   }
4764 
4765   // Arguments:
4766   //
4767   // Input:
4768   //   c_rarg0   - newArr address
4769   //   c_rarg1   - oldArr address
4770   //   c_rarg2   - newIdx
4771   //   c_rarg3   - shiftCount
4772   //   c_rarg4   - numIter
4773   //
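       // A rough scalar sketch of what this stub computes (each output word
       // combines two adjacent input words):
       //   for (int i = 0; i < numIter; i++) {
       //     newArr[newIdx + i] = (oldArr[i] << (32 - shiftCount))
       //                        | (oldArr[i + 1] >>> shiftCount);
       //   }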
4774   address generate_bigIntegerRightShift() {
4775     __ align(CodeEntryAlignment);
4776     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4777     address start = __ pc();
4778 
4779     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4780 
4781     Register newArr        = c_rarg0;
4782     Register oldArr        = c_rarg1;
4783     Register newIdx        = c_rarg2;
4784     Register shiftCount    = c_rarg3;
4785     Register numIter       = c_rarg4;
4786     Register idx           = numIter;
4787 
4788     Register newArrCur     = rscratch1;
4789     Register shiftRevCount = rscratch2;
4790     Register oldArrCur     = r13;
4791     Register oldArrNext    = r14;
4792 
4793     FloatRegister oldElem0        = v0;
4794     FloatRegister oldElem1        = v1;
4795     FloatRegister newElem         = v2;
4796     FloatRegister shiftVCount     = v3;
4797     FloatRegister shiftVRevCount  = v4;
4798 
4799     __ cbz(idx, Exit);
4800 
4801     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4802 
4803     // left shift count
4804     __ movw(shiftRevCount, 32);
4805     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4806 
4807     // numIter is too small for the 4-word SIMD loop; fall back to the scalar code
4808     __ cmp(numIter, (u1)4);
4809     __ br(Assembler::LT, ShiftThree);
4810 
4811     __ dup(shiftVCount,    __ T4S, shiftCount);
4812     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4813     __ negr(shiftVCount,   __ T4S, shiftVCount);
4814 
4815     __ BIND(ShiftSIMDLoop);
4816 
4817     // Calculate the load addresses
4818     __ sub(idx, idx, 4);
4819     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4820     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4821     __ add(oldArrCur,  oldArrNext, 4);
4822 
4823     // Load 4 words and process
4824     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4825     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4826     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4827     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4828     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4829     __ st1(newElem,   __ T4S,  Address(newArrCur));
4830 
4831     __ cmp(idx, (u1)4);
4832     __ br(Assembler::LT, ShiftTwoLoop);
4833     __ b(ShiftSIMDLoop);
4834 
4835     __ BIND(ShiftTwoLoop);
4836     __ cbz(idx, Exit);
4837     __ cmp(idx, (u1)1);
4838     __ br(Assembler::EQ, ShiftOne);
4839 
4840     // Calculate the load addresses
4841     __ sub(idx, idx, 2);
4842     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4843     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4844     __ add(oldArrCur,  oldArrNext, 4);
4845 
4846     // Load 2 words and process
4847     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4848     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4849     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4850     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4851     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4852     __ st1(newElem,   __ T2S, Address(newArrCur));
4853     __ b(ShiftTwoLoop);
4854 
4855     __ BIND(ShiftThree);
4856     __ tbz(idx, 1, ShiftOne);
4857     __ tbz(idx, 0, ShiftTwo);
4858     __ ldrw(r10,  Address(oldArr, 12));
4859     __ ldrw(r11,  Address(oldArr, 8));
4860     __ lsrvw(r10, r10, shiftCount);
4861     __ lslvw(r11, r11, shiftRevCount);
4862     __ orrw(r12,  r10, r11);
4863     __ strw(r12,  Address(newArr, 8));
4864 
4865     __ BIND(ShiftTwo);
4866     __ ldrw(r10,  Address(oldArr, 8));
4867     __ ldrw(r11,  Address(oldArr, 4));
4868     __ lsrvw(r10, r10, shiftCount);
4869     __ lslvw(r11, r11, shiftRevCount);
4870     __ orrw(r12,  r10, r11);
4871     __ strw(r12,  Address(newArr, 4));
4872 
4873     __ BIND(ShiftOne);
4874     __ ldrw(r10,  Address(oldArr, 4));
4875     __ ldrw(r11,  Address(oldArr));
4876     __ lsrvw(r10, r10, shiftCount);
4877     __ lslvw(r11, r11, shiftRevCount);
4878     __ orrw(r12,  r10, r11);
4879     __ strw(r12,  Address(newArr));
4880 
4881     __ BIND(Exit);
4882     __ ret(lr);
4883 
4884     return start;
4885   }
4886 
4887   // Arguments:
4888   //
4889   // Input:
4890   //   c_rarg0   - newArr address
4891   //   c_rarg1   - oldArr address
4892   //   c_rarg2   - newIdx
4893   //   c_rarg3   - shiftCount
4894   //   c_rarg4   - numIter
4895   //
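       // A rough scalar sketch of what this stub computes:
       //   for (int i = 0; i < numIter; i++) {
       //     newArr[newIdx + i] = (oldArr[i] << shiftCount)
       //                        | (oldArr[i + 1] >>> (32 - shiftCount));
       //   }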
4896   address generate_bigIntegerLeftShift() {
4897     __ align(CodeEntryAlignment);
4898     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4899     address start = __ pc();
4900 
4901     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4902 
4903     Register newArr        = c_rarg0;
4904     Register oldArr        = c_rarg1;
4905     Register newIdx        = c_rarg2;
4906     Register shiftCount    = c_rarg3;
4907     Register numIter       = c_rarg4;
4908 
4909     Register shiftRevCount = rscratch1;
4910     Register oldArrNext    = rscratch2;
4911 
4912     FloatRegister oldElem0        = v0;
4913     FloatRegister oldElem1        = v1;
4914     FloatRegister newElem         = v2;
4915     FloatRegister shiftVCount     = v3;
4916     FloatRegister shiftVRevCount  = v4;
4917 
4918     __ cbz(numIter, Exit);
4919 
4920     __ add(oldArrNext, oldArr, 4);
4921     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4922 
4923     // right shift count
4924     __ movw(shiftRevCount, 32);
4925     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4926 
4927     // numIter is too small for the 4-word SIMD loop; fall back to the scalar code
4928     __ cmp(numIter, (u1)4);
4929     __ br(Assembler::LT, ShiftThree);
4930 
4931     __ dup(shiftVCount,     __ T4S, shiftCount);
4932     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4933     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4934 
4935     __ BIND(ShiftSIMDLoop);
4936 
4937     // load 4 words and process
4938     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4939     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4940     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4941     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4942     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4943     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4944     __ sub(numIter,   numIter, 4);
4945 
4946     __ cmp(numIter, (u1)4);
4947     __ br(Assembler::LT, ShiftTwoLoop);
4948     __ b(ShiftSIMDLoop);
4949 
4950     __ BIND(ShiftTwoLoop);
4951     __ cbz(numIter, Exit);
4952     __ cmp(numIter, (u1)1);
4953     __ br(Assembler::EQ, ShiftOne);
4954 
4955     // load 2 words and process
4956     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4957     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4958     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4959     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4960     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4961     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4962     __ sub(numIter,   numIter, 2);
4963     __ b(ShiftTwoLoop);
4964 
4965     __ BIND(ShiftThree);
4966     __ ldrw(r10,  __ post(oldArr, 4));
4967     __ ldrw(r11,  __ post(oldArrNext, 4));
4968     __ lslvw(r10, r10, shiftCount);
4969     __ lsrvw(r11, r11, shiftRevCount);
4970     __ orrw(r12,  r10, r11);
4971     __ strw(r12,  __ post(newArr, 4));
4972     __ tbz(numIter, 1, Exit);
4973     __ tbz(numIter, 0, ShiftOne);
4974 
4975     __ BIND(ShiftTwo);
4976     __ ldrw(r10,  __ post(oldArr, 4));
4977     __ ldrw(r11,  __ post(oldArrNext, 4));
4978     __ lslvw(r10, r10, shiftCount);
4979     __ lsrvw(r11, r11, shiftRevCount);
4980     __ orrw(r12,  r10, r11);
4981     __ strw(r12,  __ post(newArr, 4));
4982 
4983     __ BIND(ShiftOne);
4984     __ ldrw(r10,  Address(oldArr));
4985     __ ldrw(r11,  Address(oldArrNext));
4986     __ lslvw(r10, r10, shiftCount);
4987     __ lsrvw(r11, r11, shiftRevCount);
4988     __ orrw(r12,  r10, r11);
4989     __ strw(r12,  Address(newArr));
4990 
4991     __ BIND(Exit);
4992     __ ret(lr);
4993 
4994     return start;
4995   }
4996 
4997   address generate_count_positives(address &count_positives_long) {
4998     const u1 large_loop_size = 64;
4999     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
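         // A signed byte is negative iff its top bit is set, so OR-ing bytes into
         // a 64-bit word and testing against UPPER_BIT_MASK detects whether any
         // of them is negative:
         //   bool any_negative(uint64_t w) { return (w & 0x8080808080808080ULL) != 0; }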
5000     int dcache_line = VM_Version::dcache_line_size();
5001 
5002     Register ary1 = r1, len = r2, result = r0;
5003 
5004     __ align(CodeEntryAlignment);
5005 
5006     StubCodeMark mark(this, "StubRoutines", "count_positives");
5007 
5008     address entry = __ pc();
5009 
5010     __ enter();
5011     // precondition: a copy of len is already in result
5012     // __ mov(result, len);
5013 
5014   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
5015         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
5016 
5017   __ cmp(len, (u1)15);
5018   __ br(Assembler::GT, LEN_OVER_15);
5019   // The only case when execution falls into this code is when the pointer is near
5020   // the end of a memory page and we have to avoid reading past it
5021   __ add(ary1, ary1, len);
5022   __ subs(len, len, 8);
5023   __ br(Assembler::GT, LEN_OVER_8);
5024   __ ldr(rscratch2, Address(ary1, -8));
5025   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
5026   __ lsrv(rscratch2, rscratch2, rscratch1);
5027   __ tst(rscratch2, UPPER_BIT_MASK);
5028   __ csel(result, zr, result, Assembler::NE);
5029   __ leave();
5030   __ ret(lr);
5031   __ bind(LEN_OVER_8);
5032   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
5033   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
5034   __ tst(rscratch2, UPPER_BIT_MASK);
5035   __ br(Assembler::NE, RET_NO_POP);
5036   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
5037   __ lsrv(rscratch1, rscratch1, rscratch2);
5038   __ tst(rscratch1, UPPER_BIT_MASK);
5039   __ bind(RET_NO_POP);
5040   __ csel(result, zr, result, Assembler::NE);
5041   __ leave();
5042   __ ret(lr);
5043 
5044   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
5045   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
5046 
5047   count_positives_long = __ pc(); // 2nd entry point
5048 
5049   __ enter();
5050 
5051   __ bind(LEN_OVER_15);
5052     __ push(spilled_regs, sp);
5053     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
5054     __ cbz(rscratch2, ALIGNED);
5055     __ ldp(tmp6, tmp1, Address(ary1));
5056     __ mov(tmp5, 16);
5057     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
5058     __ add(ary1, ary1, rscratch1);
5059     __ orr(tmp6, tmp6, tmp1);
5060     __ tst(tmp6, UPPER_BIT_MASK);
5061     __ br(Assembler::NE, RET_ADJUST);
5062     __ sub(len, len, rscratch1);
5063 
5064   __ bind(ALIGNED);
5065     __ cmp(len, large_loop_size);
5066     __ br(Assembler::LT, CHECK_16);
5067     // Perform a 16-byte load as an early-return check in the pre-loop to handle the
5068     // case where an initially aligned large array has negative values in its first
5069     // bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the worst case,
5070     // which is slower. Cases with negative bytes further ahead are barely affected;
5071     // in fact they get faster due to the early loads and the fewer instructions and
5072     // branches in LARGE_LOOP.
5073     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5074     __ sub(len, len, 16);
5075     __ orr(tmp6, tmp6, tmp1);
5076     __ tst(tmp6, UPPER_BIT_MASK);
5077     __ br(Assembler::NE, RET_ADJUST_16);
5078     __ cmp(len, large_loop_size);
5079     __ br(Assembler::LT, CHECK_16);
5080 
5081     if (SoftwarePrefetchHintDistance >= 0
5082         && SoftwarePrefetchHintDistance >= dcache_line) {
5083       // initial prefetch
5084       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5085     }
5086   __ bind(LARGE_LOOP);
5087     if (SoftwarePrefetchHintDistance >= 0) {
5088       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5089     }
5090     // Issue the load instructions first, since that can save a few CPU/memory cycles.
5091     // Also, instead of 4 triples of "orr(...); addr(...); cbnz(...)" (one per ldp), it is
5092     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
5093     // instructions and has fewer branches. The downside is that this approach disables
5094     // early return, so all 64 bytes are loaded and checked every time.
5095     __ ldp(tmp2, tmp3, Address(ary1));
5096     __ ldp(tmp4, tmp5, Address(ary1, 16));
5097     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5098     __ ldp(tmp6, tmp1, Address(ary1, 48));
5099     __ add(ary1, ary1, large_loop_size);
5100     __ sub(len, len, large_loop_size);
5101     __ orr(tmp2, tmp2, tmp3);
5102     __ orr(tmp4, tmp4, tmp5);
5103     __ orr(rscratch1, rscratch1, rscratch2);
5104     __ orr(tmp6, tmp6, tmp1);
5105     __ orr(tmp2, tmp2, tmp4);
5106     __ orr(rscratch1, rscratch1, tmp6);
5107     __ orr(tmp2, tmp2, rscratch1);
5108     __ tst(tmp2, UPPER_BIT_MASK);
5109     __ br(Assembler::NE, RET_ADJUST_LONG);
5110     __ cmp(len, large_loop_size);
5111     __ br(Assembler::GE, LARGE_LOOP);
5112 
5113   __ bind(CHECK_16); // small 16-byte load pre-loop
5114     __ cmp(len, (u1)16);
5115     __ br(Assembler::LT, POST_LOOP16);
5116 
5117   __ bind(LOOP16); // small 16-byte load loop
5118     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5119     __ sub(len, len, 16);
5120     __ orr(tmp2, tmp2, tmp3);
5121     __ tst(tmp2, UPPER_BIT_MASK);
5122     __ br(Assembler::NE, RET_ADJUST_16);
5123     __ cmp(len, (u1)16);
5124     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5125 
5126   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5127     __ cmp(len, (u1)8);
5128     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5129     __ ldr(tmp3, Address(__ post(ary1, 8)));
5130     __ tst(tmp3, UPPER_BIT_MASK);
5131     __ br(Assembler::NE, RET_ADJUST);
5132     __ sub(len, len, 8);
5133 
5134   __ bind(POST_LOOP16_LOAD_TAIL);
5135     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5136     __ ldr(tmp1, Address(ary1));
5137     __ mov(tmp2, 64);
5138     __ sub(tmp4, tmp2, len, __ LSL, 3);
5139     __ lslv(tmp1, tmp1, tmp4);
5140     __ tst(tmp1, UPPER_BIT_MASK);
5141     __ br(Assembler::NE, RET_ADJUST);
5142     // Fallthrough
5143 
5144   __ bind(RET_LEN);
5145     __ pop(spilled_regs, sp);
5146     __ leave();
5147     __ ret(lr);
5148 
5149     // The difference (result - len) is the count of bytes guaranteed
5150     // to be positive
5151 
5152   __ bind(RET_ADJUST_LONG);
5153     __ add(len, len, (u1)(large_loop_size - 16));
5154   __ bind(RET_ADJUST_16);
5155     __ add(len, len, 16);
5156   __ bind(RET_ADJUST);
5157     __ pop(spilled_regs, sp);
5158     __ leave();
5159     __ sub(result, result, len);
5160     __ ret(lr);
5161 
5162     return entry;
5163   }
5164 
5165   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5166         bool usePrefetch, Label &NOT_EQUAL) {
5167     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5168         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5169         tmp7 = r12, tmp8 = r13;
5170     Label LOOP;
5171 
5172     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5173     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5174     __ bind(LOOP);
5175     if (usePrefetch) {
5176       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5177       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5178     }
5179     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5180     __ eor(tmp1, tmp1, tmp2);
5181     __ eor(tmp3, tmp3, tmp4);
5182     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5183     __ orr(tmp1, tmp1, tmp3);
5184     __ cbnz(tmp1, NOT_EQUAL);
5185     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5186     __ eor(tmp5, tmp5, tmp6);
5187     __ eor(tmp7, tmp7, tmp8);
5188     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5189     __ orr(tmp5, tmp5, tmp7);
5190     __ cbnz(tmp5, NOT_EQUAL);
5191     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5192     __ eor(tmp1, tmp1, tmp2);
5193     __ eor(tmp3, tmp3, tmp4);
5194     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5195     __ orr(tmp1, tmp1, tmp3);
5196     __ cbnz(tmp1, NOT_EQUAL);
5197     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5198     __ eor(tmp5, tmp5, tmp6);
5199     __ sub(cnt1, cnt1, 8 * wordSize);
5200     __ eor(tmp7, tmp7, tmp8);
5201     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5202     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5203     // cmp) because subs allows an unlimited range of immediate operands.
5204     __ subs(tmp6, cnt1, loopThreshold);
5205     __ orr(tmp5, tmp5, tmp7);
5206     __ cbnz(tmp5, NOT_EQUAL);
5207     __ br(__ GE, LOOP);
5208     // post-loop
5209     __ eor(tmp1, tmp1, tmp2);
5210     __ eor(tmp3, tmp3, tmp4);
5211     __ orr(tmp1, tmp1, tmp3);
5212     __ sub(cnt1, cnt1, 2 * wordSize);
5213     __ cbnz(tmp1, NOT_EQUAL);
5214   }
5215 
5216   void generate_large_array_equals_loop_simd(int loopThreshold,
5217         bool usePrefetch, Label &NOT_EQUAL) {
5218     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5219         tmp2 = rscratch2;
5220     Label LOOP;
5221 
5222     __ bind(LOOP);
5223     if (usePrefetch) {
5224       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5225       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5226     }
5227     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5228     __ sub(cnt1, cnt1, 8 * wordSize);
5229     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5230     __ subs(tmp1, cnt1, loopThreshold);
5231     __ eor(v0, __ T16B, v0, v4);
5232     __ eor(v1, __ T16B, v1, v5);
5233     __ eor(v2, __ T16B, v2, v6);
5234     __ eor(v3, __ T16B, v3, v7);
5235     __ orr(v0, __ T16B, v0, v1);
5236     __ orr(v1, __ T16B, v2, v3);
5237     __ orr(v0, __ T16B, v0, v1);
5238     __ umov(tmp1, v0, __ D, 0);
5239     __ umov(tmp2, v0, __ D, 1);
5240     __ orr(tmp1, tmp1, tmp2);
5241     __ cbnz(tmp1, NOT_EQUAL);
5242     __ br(__ GE, LOOP);
5243   }
5244 
5245   // a1 = r1 - array1 address
5246   // a2 = r2 - array2 address
5247   // result = r0 - return value. Already contains "false"
5248   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
5249   // r3-r5 are reserved temporary registers
5250   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
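       // A rough scalar sketch of the equality check this stub performs over
       // the remaining bytes (result already contains "false" on entry and the
       // first wordSize bytes were handled by the caller):
       //   for (size_t i = 0; i < cnt; i += wordSize)
       //     if (*(uint64_t*)(a1 + i) != *(uint64_t*)(a2 + i)) return false;
       //   return true;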
5251   address generate_large_array_equals() {
5252     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5253         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5254         tmp7 = r12, tmp8 = r13;
5255     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5256         SMALL_LOOP, POST_LOOP;
5257     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5258     // loop threshold chosen so that at least 32 of the prefetched bytes are used
5259     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5260     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5261     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5262     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5263         tmp5, tmp6, tmp7, tmp8);
5264 
5265     __ align(CodeEntryAlignment);
5266 
5267     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5268 
5269     address entry = __ pc();
5270     __ enter();
5271     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5272     // also advance pointers to use post-increment instead of pre-increment
5273     __ add(a1, a1, wordSize);
5274     __ add(a2, a2, wordSize);
5275     if (AvoidUnalignedAccesses) {
5276       // Both implementations (SIMD/non-SIMD) use relatively large load
5277       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution time)
5278       // on some CPUs when the address is not at least 16-byte aligned.
5279       // Arrays are currently 8-byte aligned, so if necessary we do one additional
5280       // 8-byte load to make at least the first address 16-byte aligned.
5281       Label ALIGNED16;
5282       __ tbz(a1, 3, ALIGNED16);
5283       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5284       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5285       __ sub(cnt1, cnt1, wordSize);
5286       __ eor(tmp1, tmp1, tmp2);
5287       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5288       __ bind(ALIGNED16);
5289     }
5290     if (UseSIMDForArrayEquals) {
5291       if (SoftwarePrefetchHintDistance >= 0) {
5292         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5293         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5294         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5295             /* prfm = */ true, NOT_EQUAL);
5296         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5297         __ br(__ LT, TAIL);
5298       }
5299       __ bind(NO_PREFETCH_LARGE_LOOP);
5300       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5301           /* prfm = */ false, NOT_EQUAL);
5302     } else {
5303       __ push(spilled_regs, sp);
5304       if (SoftwarePrefetchHintDistance >= 0) {
5305         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5306         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5307         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5308             /* prfm = */ true, NOT_EQUAL);
5309         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5310         __ br(__ LT, TAIL);
5311       }
5312       __ bind(NO_PREFETCH_LARGE_LOOP);
5313       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5314           /* prfm = */ false, NOT_EQUAL);
5315     }
5316     __ bind(TAIL);
5317       __ cbz(cnt1, EQUAL);
5318       __ subs(cnt1, cnt1, wordSize);
5319       __ br(__ LE, POST_LOOP);
5320     __ bind(SMALL_LOOP);
5321       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5322       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5323       __ subs(cnt1, cnt1, wordSize);
5324       __ eor(tmp1, tmp1, tmp2);
5325       __ cbnz(tmp1, NOT_EQUAL);
5326       __ br(__ GT, SMALL_LOOP);
5327     __ bind(POST_LOOP);
5328       __ ldr(tmp1, Address(a1, cnt1));
5329       __ ldr(tmp2, Address(a2, cnt1));
5330       __ eor(tmp1, tmp1, tmp2);
5331       __ cbnz(tmp1, NOT_EQUAL);
5332     __ bind(EQUAL);
5333       __ mov(result, true);
5334     __ bind(NOT_EQUAL);
5335       if (!UseSIMDForArrayEquals) {
5336         __ pop(spilled_regs, sp);
5337       }
5338     __ bind(NOT_EQUAL_NO_POP);
5339     __ leave();
5340     __ ret(lr);
5341     return entry;
5342   }
5343 
5344   address generate_dsin_dcos(bool isCos) {
5345     __ align(CodeEntryAlignment);
5346     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5347     address start = __ pc();
5348     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5349         (address)StubRoutines::aarch64::_two_over_pi,
5350         (address)StubRoutines::aarch64::_pio2,
5351         (address)StubRoutines::aarch64::_dsin_coef,
5352         (address)StubRoutines::aarch64::_dcos_coef);
5353     return start;
5354   }
5355 
5356   address generate_dlog() {
5357     __ align(CodeEntryAlignment);
5358     StubCodeMark mark(this, "StubRoutines", "dlog");
5359     address entry = __ pc();
5360     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5361         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5362     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5363     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5364         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5365     return entry;
5366   }
5367 
5368 
5369   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
5370   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5371       Label &DIFF2) {
5372     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5373     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5374 
5375     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5376     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5377     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5378     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
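         // zip1/zip2 interleave the 16 Latin1 bytes with zero bytes from vtmpZ,
         // widening them to little-endian UTF-16 code units that can then be
         // compared directly against 8-byte chunks of the UTF-16 string.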
5379 
5380     __ fmovd(tmpL, vtmp3);
5381     __ eor(rscratch2, tmp3, tmpL);
5382     __ cbnz(rscratch2, DIFF2);
5383 
5384     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5385     __ umov(tmpL, vtmp3, __ D, 1);
5386     __ eor(rscratch2, tmpU, tmpL);
5387     __ cbnz(rscratch2, DIFF1);
5388 
5389     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5390     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5391     __ fmovd(tmpL, vtmp);
5392     __ eor(rscratch2, tmp3, tmpL);
5393     __ cbnz(rscratch2, DIFF2);
5394 
5395     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5396     __ umov(tmpL, vtmp, __ D, 1);
5397     __ eor(rscratch2, tmpU, tmpL);
5398     __ cbnz(rscratch2, DIFF1);
5399   }
5400 
5401   // r0  = result
5402   // r1  = str1
5403   // r2  = cnt1
5404   // r3  = str2
5405   // r4  = cnt2
5406   // r10 = tmp1
5407   // r11 = tmp2
5408   address generate_compare_long_string_different_encoding(bool isLU) {
5409     __ align(CodeEntryAlignment);
5410     StubCodeMark mark(this, "StubRoutines", isLU
5411         ? "compare_long_string_different_encoding LU"
5412         : "compare_long_string_different_encoding UL");
5413     address entry = __ pc();
5414     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5415         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5416         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5417     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5418         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5419     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5420     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5421 
5422     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5423 
5424     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5425     // cnt2 == number of characters left to compare
5426     // Check the already-loaded first 4 characters (vtmp and tmp2 (LU) / tmp1 (UL))
5427     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5428     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5429     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5430     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5431     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5432     __ eor(rscratch2, tmp1, tmp2);
5433     __ mov(rscratch1, tmp2);
5434     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5435     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5436              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5437     __ push(spilled_regs, sp);
5438     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5439     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5440 
5441     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5442 
5443     if (SoftwarePrefetchHintDistance >= 0) {
5444       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5445       __ br(__ LT, NO_PREFETCH);
5446       __ bind(LARGE_LOOP_PREFETCH);
5447         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5448         __ mov(tmp4, 2);
5449         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5450         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5451           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5452           __ subs(tmp4, tmp4, 1);
5453           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5454           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5455           __ mov(tmp4, 2);
5456         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5457           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5458           __ subs(tmp4, tmp4, 1);
5459           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5460           __ sub(cnt2, cnt2, 64);
5461           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5462           __ br(__ GE, LARGE_LOOP_PREFETCH);
5463     }
5464     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5465     __ bind(NO_PREFETCH);
5466     __ subs(cnt2, cnt2, 16);
5467     __ br(__ LT, TAIL);
5468     __ align(OptoLoopAlignment);
5469     __ bind(SMALL_LOOP); // smaller loop
5470       __ subs(cnt2, cnt2, 16);
5471       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5472       __ br(__ GE, SMALL_LOOP);
5473       __ cmn(cnt2, (u1)16);
5474       __ br(__ EQ, LOAD_LAST);
5475     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5476       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5477       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5478       __ ldr(tmp3, Address(cnt1, -8));
5479       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5480       __ b(LOAD_LAST);
5481     __ bind(DIFF2);
5482       __ mov(tmpU, tmp3);
5483     __ bind(DIFF1);
5484       __ pop(spilled_regs, sp);
5485       __ b(CALCULATE_DIFFERENCE);
5486     __ bind(LOAD_LAST);
5487       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5488       // No need to load them again
5489       __ mov(tmpU, tmp3);
5490       __ pop(spilled_regs, sp);
5491 
5492       // tmp2 points to the address of the last 4 Latin1 characters right now
5493       __ ldrs(vtmp, Address(tmp2));
5494       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5495       __ fmovd(tmpL, vtmp);
5496 
5497       __ eor(rscratch2, tmpU, tmpL);
5498       __ cbz(rscratch2, DONE);
5499 
5500     // Find the first different characters in the longwords and
5501     // compute their difference.
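         // rscratch2 holds the XOR of the two loaded words: rev + clz locate the
         // lowest differing byte, andr(..., -16) rounds that bit index down to a
         // 16-bit boundary, and the lsrv/uxthw pairs extract the two differing
         // UTF-16 code units whose difference is the result.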
5502     __ bind(CALCULATE_DIFFERENCE);
5503       __ rev(rscratch2, rscratch2);
5504       __ clz(rscratch2, rscratch2);
5505       __ andr(rscratch2, rscratch2, -16);
5506       __ lsrv(tmp1, tmp1, rscratch2);
5507       __ uxthw(tmp1, tmp1);
5508       __ lsrv(rscratch1, rscratch1, rscratch2);
5509       __ uxthw(rscratch1, rscratch1);
5510       __ subw(result, tmp1, rscratch1);
5511     __ bind(DONE);
5512       __ ret(lr);
5513     return entry;
5514   }
5515 
5516   address generate_method_entry_barrier() {
5517     __ align(CodeEntryAlignment);
5518     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5519 
5520     Label deoptimize_label;
5521 
5522     address start = __ pc();
5523 
5524     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5525 
5526     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5527       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5528       // We can get here despite the nmethod being good, if we have not
5529       // yet applied our cross modification fence (or data fence).
5530       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5531       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5532       __ ldrw(rscratch2, rscratch2);
5533       __ strw(rscratch2, thread_epoch_addr);
5534       __ isb();
5535       __ membar(__ LoadLoad);
5536     }
5537 
5538     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5539 
5540     __ enter();
5541     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5542 
5543     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5544 
5545     __ push_call_clobbered_registers();
5546 
5547     __ mov(c_rarg0, rscratch2);
5548     __ call_VM_leaf
5549          (CAST_FROM_FN_PTR
5550           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5551 
5552     __ reset_last_Java_frame(true);
5553 
5554     __ mov(rscratch1, r0);
5555 
5556     __ pop_call_clobbered_registers();
5557 
5558     __ cbnz(rscratch1, deoptimize_label);
5559 
5560     __ leave();
5561     __ ret(lr);
5562 
5563     __ BIND(deoptimize_label);
5564 
5565     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5566     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5567 
5568     __ mov(sp, rscratch1);
5569     __ br(rscratch2);
5570 
5571     return start;
5572   }
5573 
5574   // r0  = result
5575   // r1  = str1
5576   // r2  = cnt1
5577   // r3  = str2
5578   // r4  = cnt2
5579   // r10 = tmp1
5580   // r11 = tmp2
5581   address generate_compare_long_string_same_encoding(bool isLL) {
5582     __ align(CodeEntryAlignment);
5583     StubCodeMark mark(this, "StubRoutines", isLL
5584         ? "compare_long_string_same_encoding LL"
5585         : "compare_long_string_same_encoding UU");
5586     address entry = __ pc();
5587     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5588         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5589 
5590     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5591 
    // Exit the large loop when fewer than 64 bytes are left to read or when the
    // next prefetch would reach beyond the end of the array.
5594     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5595 
    // The caller pre-loads 8 bytes from each string before jumping to this stub,
    // so compare them directly.
5597     __ eor(rscratch2, tmp1, tmp2);
5598     __ cbnz(rscratch2, CAL_DIFFERENCE);
5599 
5600     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
    // advance the pointers past the bytes already read
5602     __ add(str1, str1, wordSize);
5603     __ add(str2, str2, wordSize);
5604     if (SoftwarePrefetchHintDistance >= 0) {
5605       __ align(OptoLoopAlignment);
5606       __ bind(LARGE_LOOP_PREFETCH);
5607         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5608         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5609 
5610         for (int i = 0; i < 4; i++) {
5611           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5612           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5613           __ cmp(tmp1, tmp2);
5614           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5615           __ br(Assembler::NE, DIFF);
5616         }
5617         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5618         __ add(str1, str1, 64);
5619         __ add(str2, str2, 64);
5620         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5621         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5622         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5623     }
5624 
5625     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5626     __ br(Assembler::LE, LESS16);
5627     __ align(OptoLoopAlignment);
5628     __ bind(LOOP_COMPARE16);
5629       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5630       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5631       __ cmp(tmp1, tmp2);
5632       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5633       __ br(Assembler::NE, DIFF);
5634       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5635       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5636       __ br(Assembler::LT, LESS16);
5637 
5638       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5639       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5640       __ cmp(tmp1, tmp2);
5641       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5642       __ br(Assembler::NE, DIFF);
5643       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5644       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5645       __ br(Assembler::GE, LOOP_COMPARE16);
5646       __ cbz(cnt2, LENGTH_DIFF);
5647 
5648     __ bind(LESS16);
      // compare 8 bytes at a time
5650       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5651       __ br(Assembler::LE, LESS8);
5652       __ ldr(tmp1, Address(__ post(str1, 8)));
5653       __ ldr(tmp2, Address(__ post(str2, 8)));
5654       __ eor(rscratch2, tmp1, tmp2);
5655       __ cbnz(rscratch2, CAL_DIFFERENCE);
5656       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5657 
5658     __ bind(LESS8); // directly load last 8 bytes
5659       if (!isLL) {
5660         __ add(cnt2, cnt2, cnt2);
5661       }
5662       __ ldr(tmp1, Address(str1, cnt2));
5663       __ ldr(tmp2, Address(str2, cnt2));
5664       __ eor(rscratch2, tmp1, tmp2);
5665       __ cbz(rscratch2, LENGTH_DIFF);
5666       __ b(CAL_DIFFERENCE);
5667 
5668     __ bind(DIFF);
5669       __ cmp(tmp1, tmp2);
5670       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5671       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5672       // reuse rscratch2 register for the result of eor instruction
5673       __ eor(rscratch2, tmp1, tmp2);
5674 
5675     __ bind(CAL_DIFFERENCE);
5676       __ rev(rscratch2, rscratch2);
5677       __ clz(rscratch2, rscratch2);
5678       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5679       __ lsrv(tmp1, tmp1, rscratch2);
5680       __ lsrv(tmp2, tmp2, rscratch2);
5681       if (isLL) {
5682         __ uxtbw(tmp1, tmp1);
5683         __ uxtbw(tmp2, tmp2);
5684       } else {
5685         __ uxthw(tmp1, tmp1);
5686         __ uxthw(tmp2, tmp2);
5687       }
5688       __ subw(result, tmp1, tmp2);
5689 
5690     __ bind(LENGTH_DIFF);
5691       __ ret(lr);
5692     return entry;
5693   }
5694 
5695   enum string_compare_mode {
5696     LL,
5697     LU,
5698     UL,
5699     UU,
5700   };
5701 
5702   // The following registers are declared in aarch64.ad
5703   // r0  = result
5704   // r1  = str1
5705   // r2  = cnt1
5706   // r3  = str2
5707   // r4  = cnt2
5708   // r10 = tmp1
5709   // r11 = tmp2
5710   // z0  = ztmp1
5711   // z1  = ztmp2
5712   // p0  = pgtmp1
5713   // p1  = pgtmp2
5714   address generate_compare_long_string_sve(string_compare_mode mode) {
5715     __ align(CodeEntryAlignment);
5716     address entry = __ pc();
5717     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5718              tmp1 = r10, tmp2 = r11;
5719 
5720     Label LOOP, DONE, MISMATCH;
5721     Register vec_len = tmp1;
5722     Register idx = tmp2;
    // The minimum of the two string lengths has been stored in cnt2.
5724     Register cnt = cnt2;
5725     FloatRegister ztmp1 = z0, ztmp2 = z1;
5726     PRegister pgtmp1 = p0, pgtmp2 = p1;
5727 
5728 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5729     switch (mode) {                                                            \
5730       case LL:                                                                 \
5731         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5732         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5733         break;                                                                 \
5734       case LU:                                                                 \
5735         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5736         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5737         break;                                                                 \
5738       case UL:                                                                 \
5739         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5740         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5741         break;                                                                 \
5742       case UU:                                                                 \
5743         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5744         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5745         break;                                                                 \
5746       default:                                                                 \
5747         ShouldNotReachHere();                                                  \
5748     }
5749 
5750     const char* stubname;
5751     switch (mode) {
5752       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5753       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5754       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5755       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5756       default: ShouldNotReachHere();
5757     }
5758 
5759     StubCodeMark mark(this, "StubRoutines", stubname);
5760 
5761     __ mov(idx, 0);
5762     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5763 
5764     if (mode == LL) {
5765       __ sve_cntb(vec_len);
5766     } else {
5767       __ sve_cnth(vec_len);
5768     }
5769 
5770     __ sub(rscratch1, cnt, vec_len);
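    // The loop below is a predicated SVE compare loop. Roughly (a hedged
    // sketch, not the generated code):
    //
    //   idx = 0;
    //   pg  = whilelt(idx, cnt);                  // active lanes for this pass
    //   do {
    //     a = load(str1 + idx, pg);  b = load(str2 + idx, pg);
    //     if (any active lane has a != b) goto MISMATCH;
    //     idx += lanes_per_vector;
    //   } while (idx < cnt - lanes_per_vector);
    //   pg = whilelt(idx, cnt);                   // tail pass, partial predicate
    //   ... one final predicated compare ...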
5771 
5772     __ bind(LOOP);
5773 
5774       // main loop
5775       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5776       __ add(idx, idx, vec_len);
5777       // Compare strings.
5778       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5779       __ br(__ NE, MISMATCH);
5780       __ cmp(idx, rscratch1);
5781       __ br(__ LT, LOOP);
5782 
5783     // post loop, last iteration
5784     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5785 
5786     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5787     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5788     __ br(__ EQ, DONE);
5789 
5790     __ bind(MISMATCH);
5791 
    // Crop the predicate to find the location of the first mismatch.
5793     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5794     // Extract the first different characters of each string.
5795     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5796     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5797 
5798     // Compute the difference of the first different characters.
5799     __ sub(result, rscratch1, rscratch2);
5800 
5801     __ bind(DONE);
5802     __ ret(lr);
5803 #undef LOAD_PAIR
5804     return entry;
5805   }
5806 
5807   void generate_compare_long_strings() {
5808     if (UseSVE == 0) {
5809       StubRoutines::aarch64::_compare_long_string_LL
5810           = generate_compare_long_string_same_encoding(true);
5811       StubRoutines::aarch64::_compare_long_string_UU
5812           = generate_compare_long_string_same_encoding(false);
5813       StubRoutines::aarch64::_compare_long_string_LU
5814           = generate_compare_long_string_different_encoding(true);
5815       StubRoutines::aarch64::_compare_long_string_UL
5816           = generate_compare_long_string_different_encoding(false);
5817     } else {
5818       StubRoutines::aarch64::_compare_long_string_LL
5819           = generate_compare_long_string_sve(LL);
5820       StubRoutines::aarch64::_compare_long_string_UU
5821           = generate_compare_long_string_sve(UU);
5822       StubRoutines::aarch64::_compare_long_string_LU
5823           = generate_compare_long_string_sve(LU);
5824       StubRoutines::aarch64::_compare_long_string_UL
5825           = generate_compare_long_string_sve(UL);
5826     }
5827   }
5828 
5829   // R0 = result
5830   // R1 = str2
5831   // R2 = cnt1
5832   // R3 = str1
5833   // R4 = cnt2
5834   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
5835   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) the first register of the pattern can safely be kept loaded (since
  //    length >= 8), which skips the initial load (helps on systems with a
  //    single load pipeline)
  // 2) a "fast" single-character search is used to find the first symbol,
  //    taking one branch per loaded register instead of one branch per symbol;
  //    this is where constants like 0x0101...01, 0x00010001...0001,
  //    0x7f7f...7f and 0x7fff7fff...7fff come from (a rough sketch follows
  //    below)
  // 3) after the first register of the source string has been loaded and
  //    analyzed, it is reused to search for every occurrence of the first
  //    character, saving a few loads compared to a simpler-but-slower
  //    implementation
  // 4) to avoid lots of push/pop operations, the code below heavily reuses,
  //    re-initializes and compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations
  //    are issued during loads or branches, so the penalty is minimal
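  //
  // A rough C sketch of the single-character search in 2), assuming Latin-1
  // (this is the classic SWAR zero-byte test, not the generated code itself):
  //
  //   uint64_t t   = chunk ^ (0x0101010101010101ull * first_char);
  //   uint64_t hit = (t - 0x0101010101010101ull) & ~t & 0x8080808080808080ull;
  //   // 'hit' has 0x80 in every byte position where 'chunk' contains first_char
  //
  // For UTF-16 the same test is applied per 16-bit unit using the
  // 0x0001...0001 and 0x7fff...7fff constants.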
5850   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5851     const char* stubName = str1_isL
5852         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5853         : "indexof_linear_uu";
5854     __ align(CodeEntryAlignment);
5855     StubCodeMark mark(this, "StubRoutines", stubName);
5856     address entry = __ pc();
5857 
5858     int str1_chr_size = str1_isL ? 1 : 2;
5859     int str2_chr_size = str2_isL ? 1 : 2;
5860     int str1_chr_shift = str1_isL ? 0 : 1;
5861     int str2_chr_shift = str2_isL ? 0 : 1;
5862     bool isL = str1_isL && str2_isL;
    // parameters
5864     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5865     // temporary registers
5866     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5867     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5868     // redefinitions
5869     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5870 
5871     __ push(spilled_regs, sp);
5872     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5873         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5874         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5875         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5876         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5877         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5878     // Read whole register from str1. It is safe, because length >=8 here
5879     __ ldr(ch1, Address(str1));
5880     // Read whole register from str2. It is safe, because length >=8 here
5881     __ ldr(ch2, Address(str2));
5882     __ sub(cnt2, cnt2, cnt1);
5883     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5884     if (str1_isL != str2_isL) {
5885       __ eor(v0, __ T16B, v0, v0);
5886     }
5887     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5888     __ mul(first, first, tmp1);
    // check whether fewer than one register's worth of characters remains
5890     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5891     if (str1_isL != str2_isL) {
5892       __ fmovd(v1, ch1);
5893     }
5894     __ br(__ LE, L_SMALL);
5895     __ eor(ch2, first, ch2);
5896     if (str1_isL != str2_isL) {
5897       __ zip1(v1, __ T16B, v1, v0);
5898     }
5899     __ sub(tmp2, ch2, tmp1);
5900     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5901     __ bics(tmp2, tmp2, ch2);
5902     if (str1_isL != str2_isL) {
5903       __ fmovd(ch1, v1);
5904     }
5905     __ br(__ NE, L_HAS_ZERO);
5906     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5907     __ add(result, result, wordSize/str2_chr_size);
5908     __ add(str2, str2, wordSize);
5909     __ br(__ LT, L_POST_LOOP);
5910     __ BIND(L_LOOP);
5911       __ ldr(ch2, Address(str2));
5912       __ eor(ch2, first, ch2);
5913       __ sub(tmp2, ch2, tmp1);
5914       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5915       __ bics(tmp2, tmp2, ch2);
5916       __ br(__ NE, L_HAS_ZERO);
5917     __ BIND(L_LOOP_PROCEED);
5918       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5919       __ add(str2, str2, wordSize);
5920       __ add(result, result, wordSize/str2_chr_size);
5921       __ br(__ GE, L_LOOP);
5922     __ BIND(L_POST_LOOP);
5923       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5924       __ br(__ LE, NOMATCH);
5925       __ ldr(ch2, Address(str2));
5926       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5927       __ eor(ch2, first, ch2);
5928       __ sub(tmp2, ch2, tmp1);
5929       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5930       __ mov(tmp4, -1); // all bits set
5931       __ b(L_SMALL_PROCEED);
5932     __ align(OptoLoopAlignment);
5933     __ BIND(L_SMALL);
5934       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5935       __ eor(ch2, first, ch2);
5936       if (str1_isL != str2_isL) {
5937         __ zip1(v1, __ T16B, v1, v0);
5938       }
5939       __ sub(tmp2, ch2, tmp1);
5940       __ mov(tmp4, -1); // all bits set
5941       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5942       if (str1_isL != str2_isL) {
        __ fmovd(ch1, v1); // move the 4 converted symbols
5944       }
5945     __ BIND(L_SMALL_PROCEED);
5946       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5947       __ bic(tmp2, tmp2, ch2);
5948       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5949       __ rbit(tmp2, tmp2);
5950       __ br(__ EQ, NOMATCH);
5951     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long latency: up to 4 cycles on some CPUs
5953       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5954       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5955       if (str2_isL) { // LL
5956         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5957         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5958         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5959         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5960         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5961       } else {
5962         __ mov(ch2, 0xE); // all bits in byte set except last one
5963         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5964         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5965         __ lslv(tmp2, tmp2, tmp4);
5966         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5967         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5968         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5969         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5970       }
5971       __ cmp(ch1, ch2);
5972       __ mov(tmp4, wordSize/str2_chr_size);
5973       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5974     __ BIND(L_SMALL_CMP_LOOP);
5975       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5976                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5977       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5978                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5979       __ add(tmp4, tmp4, 1);
5980       __ cmp(tmp4, cnt1);
5981       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5982       __ cmp(first, ch2);
5983       __ br(__ EQ, L_SMALL_CMP_LOOP);
5984     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5985       __ cbz(tmp2, NOMATCH); // no more matches. exit
5986       __ clz(tmp4, tmp2);
5987       __ add(result, result, 1); // advance index
5988       __ add(str2, str2, str2_chr_size); // advance pointer
5989       __ b(L_SMALL_HAS_ZERO_LOOP);
5990     __ align(OptoLoopAlignment);
5991     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5992       __ cmp(first, ch2);
5993       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5994       __ b(DONE);
5995     __ align(OptoLoopAlignment);
5996     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5997       if (str2_isL) { // LL
5998         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5999         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
6000         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
6001         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6002         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6003       } else {
6004         __ mov(ch2, 0xE); // all bits in byte set except last one
6005         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6006         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6007         __ lslv(tmp2, tmp2, tmp4);
6008         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6009         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6010         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6011         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6012       }
6013       __ cmp(ch1, ch2);
6014       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6015       __ b(DONE);
6016     __ align(OptoLoopAlignment);
6017     __ BIND(L_HAS_ZERO);
6018       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long latency: up to 4 cycles on some CPUs
      // Now compress the counters (cnt2 and cnt1) into one register. This is
      // fine because both counters are 32-bit and are not changed in this
      // loop; they are restored on exit. So cnt1 can be reused in this loop.
6023       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
6024       __ sub(result, result, 1);
6025     __ BIND(L_HAS_ZERO_LOOP);
6026       __ mov(cnt1, wordSize/str2_chr_size);
6027       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6028       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
6029       if (str2_isL) {
6030         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6031         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6032         __ lslv(tmp2, tmp2, tmp4);
6033         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6034         __ add(tmp4, tmp4, 1);
6035         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6036         __ lsl(tmp2, tmp2, 1);
6037         __ mov(tmp4, wordSize/str2_chr_size);
6038       } else {
6039         __ mov(ch2, 0xE);
6040         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6041         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6042         __ lslv(tmp2, tmp2, tmp4);
6043         __ add(tmp4, tmp4, 1);
6044         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6045         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6046         __ lsl(tmp2, tmp2, 1);
6047         __ mov(tmp4, wordSize/str2_chr_size);
6048         __ sub(str2, str2, str2_chr_size);
6049       }
6050       __ cmp(ch1, ch2);
6051       __ mov(tmp4, wordSize/str2_chr_size);
6052       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6053     __ BIND(L_CMP_LOOP);
6054       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6055                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6056       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6057                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6058       __ add(tmp4, tmp4, 1);
6059       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6060       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
6061       __ cmp(cnt1, ch2);
6062       __ br(__ EQ, L_CMP_LOOP);
6063     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
6065       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6066       __ clz(tmp4, tmp2);
6067       __ add(str2, str2, str2_chr_size); // advance pointer
6068       __ b(L_HAS_ZERO_LOOP);
6069     __ align(OptoLoopAlignment);
6070     __ BIND(L_CMP_LOOP_LAST_CMP);
6071       __ cmp(cnt1, ch2);
6072       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6073       __ b(DONE);
6074     __ align(OptoLoopAlignment);
6075     __ BIND(L_CMP_LOOP_LAST_CMP2);
6076       if (str2_isL) {
6077         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6078         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6079         __ lslv(tmp2, tmp2, tmp4);
6080         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6081         __ add(tmp4, tmp4, 1);
6082         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6083         __ lsl(tmp2, tmp2, 1);
6084       } else {
6085         __ mov(ch2, 0xE);
6086         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6087         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6088         __ lslv(tmp2, tmp2, tmp4);
6089         __ add(tmp4, tmp4, 1);
6090         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6091         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6092         __ lsl(tmp2, tmp2, 1);
6093         __ sub(str2, str2, str2_chr_size);
6094       }
6095       __ cmp(ch1, ch2);
6096       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6097       __ b(DONE);
6098     __ align(OptoLoopAlignment);
6099     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. The byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the corresponding high bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can simply reset the lower bits of result here:
      // clear the 2 lower bits for UU/UL and 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
      // index of the last analyzed substring inside the current octet, so str2
      // currently points at that substring's start address; advance it to the
      // next octet.
6110       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6111       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6112       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6113       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
6114       __ movw(cnt2, cnt2);
6115       __ b(L_LOOP_PROCEED);
6116     __ align(OptoLoopAlignment);
6117     __ BIND(NOMATCH);
6118       __ mov(result, -1);
6119     __ BIND(DONE);
6120       __ pop(spilled_regs, sp);
6121       __ ret(lr);
6122     return entry;
6123   }
6124 
6125   void generate_string_indexof_stubs() {
6126     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6127     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6128     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6129   }
6130 
6131   void inflate_and_store_2_fp_registers(bool generatePrfm,
6132       FloatRegister src1, FloatRegister src2) {
6133     Register dst = r1;
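    // zip1/zip2 interleave the source bytes with zero bytes (v0 == 0), which
    // widens each Latin-1 byte to a little-endian UTF-16 character. Roughly
    // (a hedged sketch): {b0, b1, ..., b15} -> {b0, 0, b1, 0, ...} for the
    // low half (zip1) and the high half (zip2).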
6134     __ zip1(v1, __ T16B, src1, v0);
6135     __ zip2(v2, __ T16B, src1, v0);
6136     if (generatePrfm) {
6137       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6138     }
6139     __ zip1(v3, __ T16B, src2, v0);
6140     __ zip2(v4, __ T16B, src2, v0);
6141     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6142   }
6143 
6144   // R0 = src
6145   // R1 = dst
6146   // R2 = len
6147   // R3 = len >> 3
6148   // V0 = 0
6149   // v1 = loaded 8 bytes
6150   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6151   address generate_large_byte_array_inflate() {
6152     __ align(CodeEntryAlignment);
6153     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
6154     address entry = __ pc();
6155     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6156     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6157     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6158 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases, and so a single store instruction can be used.
6161     __ ldrd(v2, __ post(src, 8));
6162     __ sub(octetCounter, octetCounter, 2);
6163     __ zip1(v1, __ T16B, v1, v0);
6164     __ zip1(v2, __ T16B, v2, v0);
6165     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6166     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6167     __ subs(rscratch1, octetCounter, large_loop_threshold);
6168     __ br(__ LE, LOOP_START);
6169     __ b(LOOP_PRFM_START);
6170     __ bind(LOOP_PRFM);
6171       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6172     __ bind(LOOP_PRFM_START);
6173       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6174       __ sub(octetCounter, octetCounter, 8);
6175       __ subs(rscratch1, octetCounter, large_loop_threshold);
6176       inflate_and_store_2_fp_registers(true, v3, v4);
6177       inflate_and_store_2_fp_registers(true, v5, v6);
6178       __ br(__ GT, LOOP_PRFM);
6179       __ cmp(octetCounter, (u1)8);
6180       __ br(__ LT, DONE);
6181     __ bind(LOOP);
6182       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6183       __ bind(LOOP_START);
6184       __ sub(octetCounter, octetCounter, 8);
6185       __ cmp(octetCounter, (u1)8);
6186       inflate_and_store_2_fp_registers(false, v3, v4);
6187       inflate_and_store_2_fp_registers(false, v5, v6);
6188       __ br(__ GE, LOOP);
6189     __ bind(DONE);
6190       __ ret(lr);
6191     return entry;
6192   }
6193 
6194   /**
6195    *  Arguments:
6196    *
6197    *  Input:
6198    *  c_rarg0   - current state address
6199    *  c_rarg1   - H key address
6200    *  c_rarg2   - data address
6201    *  c_rarg3   - number of blocks
6202    *
6203    *  Output:
6204    *  Updated state at c_rarg0
6205    */
6206   address generate_ghash_processBlocks() {
6207     // Bafflingly, GCM uses little-endian for the byte order, but
6208     // big-endian for the bit order.  For example, the polynomial 1 is
6209     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6210     //
6211     // So, we must either reverse the bytes in each word and do
6212     // everything big-endian or reverse the bits in each byte and do
6213     // it little-endian.  On AArch64 it's more idiomatic to reverse
6214     // the bits in each byte (we have an instruction, RBIT, to do
6215     // that) and keep the data in little-endian bit order through the
6216     // calculation, bit-reversing the inputs and outputs.
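    //
    // Roughly, per 16-byte block (a hedged sketch of the flow, not the
    // generated code):
    //
    //   S  = rbit(S); H' = rbit(H);           // done once, outside the loop
    //   X' = rbit(load(data)); data += 16;
    //   S  = clmul_reduce(S ^ X', H');        // carry-less multiply + reduction
    //   ...
    //   store(state, rbit(S));                // reverse back on the way out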
6217 
6218     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6219     __ align(wordSize * 2);
6220     address p = __ pc();
6221     __ emit_int64(0x87);  // The low-order bits of the field
6222                           // polynomial (i.e. p = z^7+z^2+z+1)
6223                           // repeated in the low and high parts of a
6224                           // 128-bit vector
6225     __ emit_int64(0x87);
6226 
6227     __ align(CodeEntryAlignment);
6228     address start = __ pc();
6229 
6230     Register state   = c_rarg0;
6231     Register subkeyH = c_rarg1;
6232     Register data    = c_rarg2;
6233     Register blocks  = c_rarg3;
6234 
6235     FloatRegister vzr = v30;
6236     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6237 
6238     __ ldrq(v24, p);    // The field polynomial
6239 
6240     __ ldrq(v0, Address(state));
6241     __ ldrq(v1, Address(subkeyH));
6242 
6243     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6244     __ rbit(v0, __ T16B, v0);
6245     __ rev64(v1, __ T16B, v1);
6246     __ rbit(v1, __ T16B, v1);
6247 
6248     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6249     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
6250 
6251     {
6252       Label L_ghash_loop;
6253       __ bind(L_ghash_loop);
6254 
6255       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6256                                                  // reversing each byte
6257       __ rbit(v2, __ T16B, v2);
6258       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6259 
6260       // Multiply state in v2 by subkey in v1
6261       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6262                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6263                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6264       // Reduce v7:v5 by the field polynomial
6265       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6266 
6267       __ sub(blocks, blocks, 1);
6268       __ cbnz(blocks, L_ghash_loop);
6269     }
6270 
6271     // The bit-reversed result is at this point in v0
6272     __ rev64(v0, __ T16B, v0);
6273     __ rbit(v0, __ T16B, v0);
6274 
6275     __ st1(v0, __ T16B, state);
6276     __ ret(lr);
6277 
6278     return start;
6279   }
6280 
6281   address generate_ghash_processBlocks_wide() {
6282     address small = generate_ghash_processBlocks();
6283 
6284     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6285     __ align(wordSize * 2);
6286     address p = __ pc();
6287     __ emit_int64(0x87);  // The low-order bits of the field
6288                           // polynomial (i.e. p = z^7+z^2+z+1)
6289                           // repeated in the low and high parts of a
6290                           // 128-bit vector
6291     __ emit_int64(0x87);
6292 
6293     __ align(CodeEntryAlignment);
6294     address start = __ pc();
6295 
6296     Register state   = c_rarg0;
6297     Register subkeyH = c_rarg1;
6298     Register data    = c_rarg2;
6299     Register blocks  = c_rarg3;
6300 
6301     const int unroll = 4;
6302 
6303     __ cmp(blocks, (unsigned char)(unroll * 2));
6304     __ br(__ LT, small);
6305 
6306     if (unroll > 1) {
      // Save the callee-saved vector registers before entering the routine
6308       __ sub(sp, sp, 4 * 16);
6309       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6310       __ sub(sp, sp, 4 * 16);
6311       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6312     }
6313 
6314     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6315 
6316     if (unroll > 1) {
6317       // And restore state
6318       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6319       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6320     }
6321 
6322     __ cmp(blocks, (unsigned char)0);
6323     __ br(__ GT, small);
6324 
6325     __ ret(lr);
6326 
6327     return start;
6328   }
6329 
6330   void generate_base64_encode_simdround(Register src, Register dst,
6331         FloatRegister codec, u8 size) {
6332 
6333     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6334     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6335     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6336 
6337     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6338 
6339     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6340 
6341     __ ushr(ind0, arrangement, in0,  2);
6342 
6343     __ ushr(ind1, arrangement, in1,  2);
6344     __ shl(in0,   arrangement, in0,  6);
6345     __ orr(ind1,  arrangement, ind1, in0);
6346     __ ushr(ind1, arrangement, ind1, 2);
6347 
6348     __ ushr(ind2, arrangement, in2,  4);
6349     __ shl(in1,   arrangement, in1,  4);
6350     __ orr(ind2,  arrangement, in1,  ind2);
6351     __ ushr(ind2, arrangement, ind2, 2);
6352 
6353     __ shl(ind3,  arrangement, in2,  2);
6354     __ ushr(ind3, arrangement, ind3, 2);
6355 
6356     __ tbl(out0,  arrangement, codec,  4, ind0);
6357     __ tbl(out1,  arrangement, codec,  4, ind1);
6358     __ tbl(out2,  arrangement, codec,  4, ind2);
6359     __ tbl(out3,  arrangement, codec,  4, ind3);
6360 
6361     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6362   }
6363 
6364    /**
6365    *  Arguments:
6366    *
6367    *  Input:
6368    *  c_rarg0   - src_start
6369    *  c_rarg1   - src_offset
6370    *  c_rarg2   - src_length
6371    *  c_rarg3   - dest_start
6372    *  c_rarg4   - dest_offset
6373    *  c_rarg5   - isURL
6374    *
6375    */
6376   address generate_base64_encodeBlock() {
6377 
6378     static const char toBase64[64] = {
6379       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6380       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6381       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6382       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6383       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6384     };
6385 
6386     static const char toBase64URL[64] = {
6387       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6388       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6389       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6390       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6391       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6392     };
6393 
6394     __ align(CodeEntryAlignment);
6395     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6396     address start = __ pc();
6397 
6398     Register src   = c_rarg0;  // source array
6399     Register soff  = c_rarg1;  // source start offset
6400     Register send  = c_rarg2;  // source end offset
6401     Register dst   = c_rarg3;  // dest array
6402     Register doff  = c_rarg4;  // position for writing to dest array
6403     Register isURL = c_rarg5;  // Base64 or URL character set
6404 
6405     // c_rarg6 and c_rarg7 are free to use as temps
6406     Register codec  = c_rarg6;
6407     Register length = c_rarg7;
6408 
6409     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6410 
6411     __ add(src, src, soff);
6412     __ add(dst, dst, doff);
6413     __ sub(length, send, soff);
6414 
6415     // load the codec base address
6416     __ lea(codec, ExternalAddress((address) toBase64));
6417     __ cbz(isURL, ProcessData);
6418     __ lea(codec, ExternalAddress((address) toBase64URL));
6419 
6420     __ BIND(ProcessData);
6421 
    // too short to form a SIMD loop; fall back to the scalar loop
6423     __ cmp(length, (u1)24);
6424     __ br(Assembler::LT, Process3B);
6425 
6426     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6427 
6428     __ BIND(Process48B);
6429     __ cmp(length, (u1)48);
6430     __ br(Assembler::LT, Process24B);
6431     generate_base64_encode_simdround(src, dst, v0, 16);
6432     __ sub(length, length, 48);
6433     __ b(Process48B);
6434 
6435     __ BIND(Process24B);
6436     __ cmp(length, (u1)24);
6437     __ br(Assembler::LT, SIMDExit);
6438     generate_base64_encode_simdround(src, dst, v0, 8);
6439     __ sub(length, length, 24);
6440 
6441     __ BIND(SIMDExit);
6442     __ cbz(length, Exit);
6443 
6444     __ BIND(Process3B);
6445     //  3 src bytes, 24 bits
6446     __ ldrb(r10, __ post(src, 1));
6447     __ ldrb(r11, __ post(src, 1));
6448     __ ldrb(r12, __ post(src, 1));
6449     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6450     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6451     // codec index
6452     __ ubfmw(r15, r12, 18, 23);
6453     __ ubfmw(r14, r12, 12, 17);
6454     __ ubfmw(r13, r12, 6,  11);
6455     __ andw(r12,  r12, 63);
    // look up the encoded characters in the codec table
6457     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6458     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6459     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6460     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6461     __ strb(r15, __ post(dst, 1));
6462     __ strb(r14, __ post(dst, 1));
6463     __ strb(r13, __ post(dst, 1));
6464     __ strb(r12, __ post(dst, 1));
6465     __ sub(length, length, 3);
6466     __ cbnz(length, Process3B);
6467 
6468     __ BIND(Exit);
6469     __ ret(lr);
6470 
6471     return start;
6472   }
6473 
6474   void generate_base64_decode_simdround(Register src, Register dst,
6475         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6476 
6477     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6478     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6479 
6480     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6481     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6482 
6483     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6484 
6485     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6486 
6487     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6488 
    // Use an unsigned saturating subtract so that all input values in the
    // range [0, 63] yield index 0 in the higher-half lookup.
6491     __ uqsubv(decH0, __ T16B, in0, v27);
6492     __ uqsubv(decH1, __ T16B, in1, v27);
6493     __ uqsubv(decH2, __ T16B, in2, v27);
6494     __ uqsubv(decH3, __ T16B, in3, v27);
6495 
6496     // lower half lookup
6497     __ tbl(decL0, arrangement, codecL, 4, in0);
6498     __ tbl(decL1, arrangement, codecL, 4, in1);
6499     __ tbl(decL2, arrangement, codecL, 4, in2);
6500     __ tbl(decL3, arrangement, codecL, 4, in3);
6501 
6502     // higher half lookup
6503     __ tbx(decH0, arrangement, codecH, 4, decH0);
6504     __ tbx(decH1, arrangement, codecH, 4, decH1);
6505     __ tbx(decH2, arrangement, codecH, 4, decH2);
6506     __ tbx(decH3, arrangement, codecH, 4, decH3);
6507 
6508     // combine lower and higher
6509     __ orr(decL0, arrangement, decL0, decH0);
6510     __ orr(decL1, arrangement, decL1, decH1);
6511     __ orr(decL2, arrangement, decL2, decH2);
6512     __ orr(decL3, arrangement, decL3, decH3);
6513 
    // check for illegal inputs: values larger than 63 (the 6-bit maximum)
6515     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
6516     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
6517     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
6518     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
6519     __ orr(in0, arrangement, decH0, decH1);
6520     __ orr(in1, arrangement, decH2, decH3);
6521     __ orr(in2, arrangement, in0,   in1);
6522     __ umaxv(in3, arrangement, in2);
6523     __ umov(rscratch2, in3, __ B, 0);
6524 
6525     // get the data to output
6526     __ shl(out0,  arrangement, decL0, 2);
6527     __ ushr(out1, arrangement, decL1, 4);
6528     __ orr(out0,  arrangement, out0,  out1);
6529     __ shl(out1,  arrangement, decL1, 4);
6530     __ ushr(out2, arrangement, decL2, 2);
6531     __ orr(out1,  arrangement, out1,  out2);
6532     __ shl(out2,  arrangement, decL2, 6);
6533     __ orr(out2,  arrangement, out2,  decL3);
6534 
6535     __ cbz(rscratch2, NoIllegalData);
6536 
6537     // handle illegal input
6538     __ umov(r10, in2, __ D, 0);
6539     if (size == 16) {
6540       __ cbnz(r10, ErrorInLowerHalf);
6541 
6542       // illegal input is in higher half, store the lower half now.
6543       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6544 
6545       __ umov(r10, in2,  __ D, 1);
6546       __ umov(r11, out0, __ D, 1);
6547       __ umov(r12, out1, __ D, 1);
6548       __ umov(r13, out2, __ D, 1);
6549       __ b(StoreLegalData);
6550 
6551       __ BIND(ErrorInLowerHalf);
6552     }
6553     __ umov(r11, out0, __ D, 0);
6554     __ umov(r12, out1, __ D, 0);
6555     __ umov(r13, out2, __ D, 0);
6556 
6557     __ BIND(StoreLegalData);
6558     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6559     __ strb(r11, __ post(dst, 1));
6560     __ strb(r12, __ post(dst, 1));
6561     __ strb(r13, __ post(dst, 1));
6562     __ lsr(r10, r10, 8);
6563     __ lsr(r11, r11, 8);
6564     __ lsr(r12, r12, 8);
6565     __ lsr(r13, r13, 8);
6566     __ b(StoreLegalData);
6567 
6568     __ BIND(NoIllegalData);
6569     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6570   }
6571 
6572 
6573    /**
6574    *  Arguments:
6575    *
6576    *  Input:
6577    *  c_rarg0   - src_start
6578    *  c_rarg1   - src_offset
6579    *  c_rarg2   - src_length
6580    *  c_rarg3   - dest_start
6581    *  c_rarg4   - dest_offset
6582    *  c_rarg5   - isURL
6583    *  c_rarg6   - isMIME
6584    *
6585    */
6586   address generate_base64_decodeBlock() {
6587 
6588     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6589     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6590     // titled "Base64 decoding".
6591 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an illegal
    // value in this intrinsic: java.util.Base64.fromBase64['='] = -2, while
    // fromBase(URL)64ForNoSIMD['='] = 255 here.
6595     static const uint8_t fromBase64ForNoSIMD[256] = {
6596       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6597       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6598       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6599        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6600       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6601        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6602       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6603        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6604       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6605       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6606       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6607       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6608       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6609       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6610       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6611       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6612     };
6613 
6614     static const uint8_t fromBase64URLForNoSIMD[256] = {
6615       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6616       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6617       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6618        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6619       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6620        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6621       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6622        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6623       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6624       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6625       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6626       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6627       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6628       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6629       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6630       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6631     };
6632 
    // A legal base64 code value is in the range [0, 127].  We need two table
    // lookups with tbl/tbx and combine their results to get the decoded data.
    // The 1st table vector lookup uses tbl: out-of-range indices are set to 0
    // in the destination. The 2nd uses tbx: out-of-range indices leave the
    // destination unchanged. Inputs [64..126] are mapped to indices [65, 127]
    // in the second lookup. The value at index 64 is set to 0, so that we know
    // the decoded data was already obtained by the 1st lookup.
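    //
    // Roughly, per input byte c (a hedged scalar sketch, not the SIMD code):
    //
    //   lo  = (c < 64) ? tableLow[c] : 0;          // tbl: OOR index reads as 0
    //   i   = usat_sub_u8(c, 63);                  // 0 for c <= 63
    //   hi  = (i < 64) ? tableHigh[i] : i;         // tbx: OOR index left unchanged
    //   dec = lo | hi;                             // dec > 63  ==>  illegal input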
6640     static const uint8_t fromBase64ForSIMD[128] = {
6641       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6642       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6643       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6644        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6645         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6646        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6647       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6648        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6649     };
6650 
6651     static const uint8_t fromBase64URLForSIMD[128] = {
6652       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6653       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6654       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6655        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6656         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6657        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6658        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6659        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6660     };
6661 
6662     __ align(CodeEntryAlignment);
6663     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6664     address start = __ pc();
6665 
6666     Register src    = c_rarg0;  // source array
6667     Register soff   = c_rarg1;  // source start offset
6668     Register send   = c_rarg2;  // source end offset
6669     Register dst    = c_rarg3;  // dest array
6670     Register doff   = c_rarg4;  // position for writing to dest array
6671     Register isURL  = c_rarg5;  // Base64 or URL character set
6672     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6673 
6674     Register length = send;    // reuse send as length of source data to process
6675 
6676     Register simd_codec   = c_rarg6;
6677     Register nosimd_codec = c_rarg7;
6678 
6679     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6680 
6681     __ enter();
6682 
6683     __ add(src, src, soff);
6684     __ add(dst, dst, doff);
6685 
6686     __ mov(doff, dst);
6687 
6688     __ sub(length, send, soff);
6689     __ bfm(length, zr, 0, 1);
6690 
6691     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6692     __ cbz(isURL, ProcessData);
6693     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6694 
6695     __ BIND(ProcessData);
6696     __ mov(rscratch1, length);
6697     __ cmp(length, (u1)144); // 144 = 80 + 64
6698     __ br(Assembler::LT, Process4B);
6699 
6700     // In the MIME case, the line length cannot be more than 76
6701     // bytes (see RFC 2045). This is too short a block for SIMD
6702     // to be worthwhile, so we use non-SIMD here.
6703     __ movw(rscratch1, 79);
6704 
6705     __ BIND(Process4B);
6706     __ ldrw(r14, __ post(src, 4));
6707     __ ubfxw(r10, r14, 0,  8);
6708     __ ubfxw(r11, r14, 8,  8);
6709     __ ubfxw(r12, r14, 16, 8);
6710     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
6712     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6713     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6714     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6715     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6716     // error detection, 255u indicates an illegal input
6717     __ orrw(r14, r10, r11);
6718     __ orrw(r15, r12, r13);
6719     __ orrw(r14, r14, r15);
6720     __ tbnz(r14, 7, Exit);
6721     // recover the data
6722     __ lslw(r14, r10, 10);
6723     __ bfiw(r14, r11, 4, 6);
6724     __ bfmw(r14, r12, 2, 5);
6725     __ rev16w(r14, r14);
6726     __ bfiw(r13, r12, 6, 2);
6727     __ strh(r14, __ post(dst, 2));
6728     __ strb(r13, __ post(dst, 1));
6729     // non-simd loop
6730     __ subsw(rscratch1, rscratch1, 4);
6731     __ br(Assembler::GT, Process4B);
6732 
    // If we exit from the 80-byte pre-processing loop (rscratch1 started at 79),
    // rscratch1 == -1; otherwise, rscratch1 == 0.
6735     __ cbzw(rscratch1, Exit);
6736     __ sub(length, length, 80);
6737 
6738     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6739     __ cbz(isURL, SIMDEnter);
6740     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6741 
6742     __ BIND(SIMDEnter);
6743     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6744     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6745     __ mov(rscratch1, 63);
6746     __ dup(v27, __ T16B, rscratch1);
6747 
6748     __ BIND(Process64B);
6749     __ cmp(length, (u1)64);
6750     __ br(Assembler::LT, Process32B);
6751     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6752     __ sub(length, length, 64);
6753     __ b(Process64B);
6754 
6755     __ BIND(Process32B);
6756     __ cmp(length, (u1)32);
6757     __ br(Assembler::LT, SIMDExit);
6758     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6759     __ sub(length, length, 32);
6760     __ b(Process32B);
6761 
6762     __ BIND(SIMDExit);
6763     __ cbz(length, Exit);
6764     __ movw(rscratch1, length);
6765     __ b(Process4B);
6766 
6767     __ BIND(Exit);
6768     __ sub(c_rarg0, dst, doff);
6769 
6770     __ leave();
6771     __ ret(lr);
6772 
6773     return start;
6774   }
6775 
6776   // Support for spin waits.
6777   address generate_spin_wait() {
6778     __ align(CodeEntryAlignment);
6779     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6780     address start = __ pc();
6781 
6782     __ spin_wait();
6783     __ ret(lr);
6784 
6785     return start;
6786   }
6787 
6788 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6789 
6790   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6791   //
6792   // If LSE is in use, generate LSE versions of all the stubs. The
6793   // non-LSE versions are in atomic_aarch64.S.
6794 
6795   // class AtomicStubMark records the entry point of a stub and the
6796   // stub pointer which will point to it. The stub pointer is set to
6797   // the entry point when ~AtomicStubMark() is called, which must be
6798   // after ICache::invalidate_range. This ensures safe publication of
6799   // the generated code.
6800   class AtomicStubMark {
6801     address _entry_point;
6802     aarch64_atomic_stub_t *_stub;
6803     MacroAssembler *_masm;
6804   public:
6805     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6806       _masm = masm;
6807       __ align(32);
6808       _entry_point = __ pc();
6809       _stub = stub;
6810     }
6811     ~AtomicStubMark() {
6812       *_stub = (aarch64_atomic_stub_t)_entry_point;
6813     }
6814   };
6815 
6816   // NB: For memory_order_conservative we need a trailing membar after
6817   // LSE atomic operations but not a leading membar.
6818   //
6819   // We don't need a leading membar because a clause in the Arm ARM
6820   // says:
6821   //
6822   //   Barrier-ordered-before
6823   //
6824   //   Barrier instructions order prior Memory effects before subsequent
6825   //   Memory effects generated by the same Observer. A read or a write
6826   //   RW1 is Barrier-ordered-before a read or a write RW 2 from the same
6827   //   Observer if and only if RW1 appears in program order before RW 2
6828   //   and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
6829   //   instruction with both Acquire and Release semantics.
6830   //
6831   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6832   // and Release semantics, therefore we don't need a leading
6833   // barrier. However, there is no corresponding Barrier-ordered-after
6834   // relationship, therefore we need a trailing membar to prevent a
6835   // later store or load from being reordered with the store in an
6836   // atomic instruction.
6837   //
6838   // This was checked by using the herd7 consistency model simulator
6839   // (http://diy.inria.fr/) with this test case:
6840   //
6841   // AArch64 LseCas
6842   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6843   // P0 | P1;
6844   // LDR W4, [X2] | MOV W3, #0;
6845   // DMB LD       | MOV W4, #1;
6846   // LDR W3, [X1] | CASAL W3, W4, [X1];
6847   //              | DMB ISH;
6848   //              | STR W4, [X2];
6849   // exists
6850   // (0:X3=0 /\ 0:X4=1)
6851   //
6852   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6853   // with the store to x in P1. Without the DMB in P1 this may happen.
6854   //
6855   // At the time of writing we don't know of any AArch64 hardware that
6856   // reorders stores in this way, but the Reference Manual permits it.
6857 
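       // In C-like pseudocode, the conservative xword CAS entry generated
       // below behaves approximately as follows (a sketch only: CASAL/DMB
       // stand for the emitted instructions, and the real stubs use the
       // aarch64_atomic_stub_t signature):
       //
       //   uint64_t cmpxchg_8_conservative(volatile uint64_t *ptr,
       //                                   uint64_t compare_val,
       //                                   uint64_t exchange_val) {
       //     uint64_t prev = compare_val;
       //     CASAL(prev, exchange_val, ptr); // acquire+release compare-and-swap
       //     DMB();                          // trailing barrier, see above
       //     return prev;                    // previous value at *ptr
       //   }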
6858   void gen_cas_entry(Assembler::operand_size size,
6859                      atomic_memory_order order) {
6860     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6861       exchange_val = c_rarg2;
6862     bool acquire, release;
6863     switch (order) {
6864       case memory_order_relaxed:
6865         acquire = false;
6866         release = false;
6867         break;
6868       case memory_order_release:
6869         acquire = false;
6870         release = true;
6871         break;
6872       default:
6873         acquire = true;
6874         release = true;
6875         break;
6876     }
6877     __ mov(prev, compare_val);
6878     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6879     if (order == memory_order_conservative) {
6880       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6881     }
6882     if (size == Assembler::xword) {
6883       __ mov(r0, prev);
6884     } else {
6885       __ movw(r0, prev);
6886     }
6887     __ ret(lr);
6888   }
6889 
6890   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6891     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6892     // If not relaxed, then default to conservative.  Relaxed is the only
6893     // case we use enough to be worth specializing.
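         // (For the xword case: relaxed emits a bare ldadd returning the old
         //  value, while the conservative path emits ldaddal followed by a
         //  trailing dmb.)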
6894     if (order == memory_order_relaxed) {
6895       __ ldadd(size, incr, prev, addr);
6896     } else {
6897       __ ldaddal(size, incr, prev, addr);
6898       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6899     }
6900     if (size == Assembler::xword) {
6901       __ mov(r0, prev);
6902     } else {
6903       __ movw(r0, prev);
6904     }
6905     __ ret(lr);
6906   }
6907 
6908   void gen_swpal_entry(Assembler::operand_size size) {
6909     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6910     __ swpal(size, incr, prev, addr);
6911     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6912     if (size == Assembler::xword) {
6913       __ mov(r0, prev);
6914     } else {
6915       __ movw(r0, prev);
6916     }
6917     __ ret(lr);
6918   }
6919 
6920   void generate_atomic_entry_points() {
6921     if (!UseLSE) {
6922       return;
6923     }
6924 
6925     __ align(CodeEntryAlignment);
6926     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6927     address first_entry = __ pc();
6928 
6929     // ADD, memory_order_conservative
6930     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6931     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6932     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6933     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6934 
6935     // ADD, memory_order_relaxed
6936     AtomicStubMark mark_fetch_add_4_relaxed
6937       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6938     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6939     AtomicStubMark mark_fetch_add_8_relaxed
6940       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6941     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6942 
6943     // XCHG, memory_order_conservative
6944     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6945     gen_swpal_entry(Assembler::word);
6946     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6947     gen_swpal_entry(Assembler::xword);
6948 
6949     // CAS, memory_order_conservative
6950     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6951     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6952     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6953     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6954     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6955     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6956 
6957     // CAS, memory_order_relaxed
6958     AtomicStubMark mark_cmpxchg_1_relaxed
6959       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6960     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6961     AtomicStubMark mark_cmpxchg_4_relaxed
6962       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6963     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6964     AtomicStubMark mark_cmpxchg_8_relaxed
6965       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6966     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6967 
6968     AtomicStubMark mark_cmpxchg_4_release
6969       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6970     gen_cas_entry(MacroAssembler::word, memory_order_release);
6971     AtomicStubMark mark_cmpxchg_8_release
6972       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6973     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6974 
6975     AtomicStubMark mark_cmpxchg_4_seq_cst
6976       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6977     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6978     AtomicStubMark mark_cmpxchg_8_seq_cst
6979       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6980     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6981 
6982     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6983   }
6984 #endif // LINUX && !__ARM_FEATURE_ATOMICS
6985 
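       // Thaw one or more frames of a mounted continuation back onto the
       // stack and continue execution in the topmost thawed frame. 'kind'
       // selects between the top, return-barrier and return-barrier-exception
       // variants wrapped by the three entry points below.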
6986   address generate_cont_thaw(Continuation::thaw_kind kind) {
6987     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
6988     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
6989 
6990     address start = __ pc();
6991 
6992     if (return_barrier) {
6993       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6994       __ mov(sp, rscratch1);
6995     }
6996     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6997 
6998     if (return_barrier) {
6999       // preserve possible return value from a method returning to the return barrier
7000       __ fmovd(rscratch1, v0);
7001       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7002     }
7003 
7004     __ movw(c_rarg1, (return_barrier ? 1 : 0));
7005     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
7006     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
7007 
7008     if (return_barrier) {
7009       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7010       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7011       __ fmovd(v0, rscratch1);
7012     }
7013     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7014 
7015 
7016     Label thaw_success;
7017     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
7018     __ cbnz(rscratch2, thaw_success);
7019     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
7020     __ br(rscratch1);
7021     __ bind(thaw_success);
7022 
7023     // make room for the thawed frames
7024     __ sub(rscratch1, sp, rscratch2);
7025     __ andr(rscratch1, rscratch1, -16); // align
7026     __ mov(sp, rscratch1);
7027 
7028     if (return_barrier) {
7029       // save original return value -- again
7030       __ fmovd(rscratch1, v0);
7031       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7032     }
7033 
7034     // If we wanted to, we could templatize thaw by kind and have three different entries
7035     __ movw(c_rarg1, (uint32_t)kind);
7036 
7037     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
7038     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
7039 
7040     if (return_barrier) {
7041       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7042       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7043       __ fmovd(v0, rscratch1);
7044     } else {
7045       __ mov(r0, zr); // return 0 (success) from doYield
7046     }
7047 
7048     // We're now on the yield frame (which is at an address above us because sp has been pushed down)
7049     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
7050     __ mov(rfp, sp);
7051 
7052     if (return_barrier_exception) {
7053       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
7054       __ authenticate_return_address(c_rarg1);
7055       __ verify_oop(r0);
7056       // save return value containing the exception oop in callee-saved R19
7057       __ mov(r19, r0);
7058 
7059       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
7060 
7061       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
7062       // __ reinitialize_ptrue();
7063 
7064       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7065 
7066       __ mov(r1, r0); // the exception handler
7067       __ mov(r0, r19); // restore return value containing the exception oop
7068       __ verify_oop(r0);
7069 
7070       __ leave();
7071       __ mov(r3, lr);
7072       __ br(r1); // the exception handler
7073     } else {
7074       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7075       __ leave();
7076       __ ret(lr);
7077     }
7078 
7079     return start;
7080   }
7081 
7082   address generate_cont_thaw() {
7083     if (!Continuations::enabled()) return nullptr;
7084 
7085     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
7086     address start = __ pc();
7087     generate_cont_thaw(Continuation::thaw_top);
7088     return start;
7089   }
7090 
7091   address generate_cont_returnBarrier() {
7092     if (!Continuations::enabled()) return nullptr;
7093 
7094     // TODO: will probably need multiple return barriers depending on return type
7095     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
7096     address start = __ pc();
7097 
7098     generate_cont_thaw(Continuation::thaw_return_barrier);
7099 
7100     return start;
7101   }
7102 
7103   address generate_cont_returnBarrier_exception() {
7104     if (!Continuations::enabled()) return nullptr;
7105 
7106     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
7107     address start = __ pc();
7108 
7109     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7110 
7111     return start;
7112   }
7113 
7114   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
7115   // are represented as long[5], with BITS_PER_LIMB = 26.
7116   // Pack five 26-bit limbs into three 64-bit registers.
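       //
       // In C, approximately (a sketch only; dest2 may be omitted when the
       // top two bits are known to be zero):
       //
       //   dest0 =  src[0]        | (src[1] << 26) | (src[2] << 52);
       //   dest1 = (src[2] >> 12) | (src[3] << 14) | (src[4] << 40);
       //   dest2 =  src[4] >> 24;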
7117   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
7118     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
7119     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
7120     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
7121     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
7122 
7123     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
7124     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
7125     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
7126     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
7127 
7128     if (dest2->is_valid()) {
7129       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7130     } else {
7131 #ifdef ASSERT
7132       Label OK;
7133       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7134       __ br(__ EQ, OK);
7135       __ stop("high bits of Poly1305 integer should be zero");
7136       __ should_not_reach_here();
7137       __ bind(OK);
7138 #endif
7139     }
7140   }
7141 
7142   // As above, but return only a 128-bit integer, packed into two
7143   // 64-bit registers.
7144   void pack_26(Register dest0, Register dest1, Register src) {
7145     pack_26(dest0, dest1, noreg, src);
7146   }
7147 
7148   // Multiply and multiply-accumulate unsigned 64-bit registers.
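       // In C, roughly (a sketch using the compiler's 128-bit type):
       //
       //   prod = (unsigned __int128)n * m;             // wide_mul
       //   sum += (unsigned __int128)n * m;             // wide_madd
       //
       // with the low and high 64-bit halves kept in separate registers.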
7149   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
7150     __ mul(prod_lo, n, m);
7151     __ umulh(prod_hi, n, m);
7152   }
7153   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
7154     wide_mul(rscratch1, rscratch2, n, m);
7155     __ adds(sum_lo, sum_lo, rscratch1);
7156     __ adc(sum_hi, sum_hi, rscratch2);
7157   }
7158 
7159   // Poly1305, RFC 7539
7160 
7161   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
7162   // description of the tricks used to simplify and accelerate this
7163   // computation.
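       //
       // In outline (a sketch, not the exact register-level schedule): for
       // each 16-byte block b the accumulator U is updated as
       //
       //   U <- ((U + b + 2^128) * R) mod (2^130 - 5)
       //
       // where R is the clamped 128-bit key. Because the top four bits of
       // each half of R are zero, the high partial products can be folded
       // back in cheaply using RR = (R >> 2) * 5, so only a partial
       // reduction is needed per block; the final reduction is done once
       // after the loop.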
7164 
7165   address generate_poly1305_processBlocks() {
7166     __ align(CodeEntryAlignment);
7167     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
7168     address start = __ pc();
7169     Label here;
7170     __ enter();
7171     RegSet callee_saved = RegSet::range(r19, r28);
7172     __ push(callee_saved, sp);
7173 
7174     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
7175 
7176     // Arguments
7177     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
7178 
7179     // R_n is the 128-bit randomly-generated key, packed into two
7180     // registers.  The caller passes this key to us as long[5], with
7181     // BITS_PER_LIMB = 26.
7182     const Register R_0 = *++regs, R_1 = *++regs;
7183     pack_26(R_0, R_1, r_start);
7184 
7185     // RR_n is (R_n >> 2) * 5
7186     const Register RR_0 = *++regs, RR_1 = *++regs;
7187     __ lsr(RR_0, R_0, 2);
7188     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
7189     __ lsr(RR_1, R_1, 2);
7190     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
7191 
7192     // U_n is the current checksum
7193     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
7194     pack_26(U_0, U_1, U_2, acc_start);
7195 
7196     static constexpr int BLOCK_LENGTH = 16;
7197     Label DONE, LOOP;
7198 
7199     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7200     __ br(Assembler::LT, DONE); {
7201       __ bind(LOOP);
7202 
7203       // S_n is to be the sum of U_n and the next block of data
7204       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7205       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
7206       __ adds(S_0, U_0, S_0);
7207       __ adcs(S_1, U_1, S_1);
7208       __ adc(S_2, U_2, zr);
7209       __ add(S_2, S_2, 1);
7210 
7211       const Register U_0HI = *++regs, U_1HI = *++regs;
7212 
7213       // NB: this logic depends on some of the special properties of
7214       // Poly1305 keys. In particular, because we know that the top
7215       // four bits of R_0 and R_1 are zero, we can add together
7216       // partial products without any risk of needing to propagate a
7217       // carry out.
7218       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
7219       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
7220       __ andr(U_2, R_0, 3);
7221       __ mul(U_2, S_2, U_2);
7222 
7223       // Recycle registers S_0, S_1, S_2
7224       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
7225 
7226       // Partial reduction mod 2**130 - 5
7227       __ adds(U_1, U_0HI, U_1);
7228       __ adc(U_2, U_1HI, U_2);
7229       // Sum now in U_2:U_1:U_0.
7230       // Dead: U_0HI, U_1HI.
7231       regs = (regs.remaining() + U_0HI + U_1HI).begin();
7232 
7233       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
7234 
7235       // First, U_2:U_1:U_0 += (U_2 >> 2)
7236       __ lsr(rscratch1, U_2, 2);
7237       __ andr(U_2, U_2, (u8)3);
7238       __ adds(U_0, U_0, rscratch1);
7239       __ adcs(U_1, U_1, zr);
7240       __ adc(U_2, U_2, zr);
7241       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
7242       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
7243       __ adcs(U_1, U_1, zr);
7244       __ adc(U_2, U_2, zr);
7245 
7246       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
7247       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7248       __ br(~ Assembler::LT, LOOP);
7249     }
7250 
7251     // Further reduce modulo 2^130 - 5
7252     __ lsr(rscratch1, U_2, 2);
7253     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
7254     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
7255     __ adcs(U_1, U_1, zr);
7256     __ andr(U_2, U_2, (u1)3);
7257     __ adc(U_2, U_2, zr);
7258 
7259     // Unpack the sum into five 26-bit limbs and write to memory.
7260     __ ubfiz(rscratch1, U_0, 0, 26);
7261     __ ubfx(rscratch2, U_0, 26, 26);
7262     __ stp(rscratch1, rscratch2, Address(acc_start));
7263     __ ubfx(rscratch1, U_0, 52, 12);
7264     __ bfi(rscratch1, U_1, 12, 14);
7265     __ ubfx(rscratch2, U_1, 14, 26);
7266     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
7267     __ ubfx(rscratch1, U_1, 40, 24);
7268     __ bfi(rscratch1, U_2, 24, 3);
7269     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
7270 
7271     __ bind(DONE);
7272     __ pop(callee_saved, sp);
7273     __ leave();
7274     __ ret(lr);
7275 
7276     return start;
7277   }
7278 
7279 #if INCLUDE_JFR
7280 
7281   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
7282     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7283     __ mov(c_rarg0, thread);
7284   }
7285 
7286   // The handle is dereferenced through a load barrier.
7287   static void jfr_epilogue(MacroAssembler* _masm) {
7288     __ reset_last_Java_frame(true);
7289   }
7290 
7291   // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
7292   // It returns a jobject handle to the event writer.
7293   // The handle is dereferenced and the return value is the event writer oop.
7294   static RuntimeStub* generate_jfr_write_checkpoint() {
7295     enum layout {
7296       rbp_off,
7297       rbpH_off,
7298       return_off,
7299       return_off2,
7300       framesize // inclusive of return address
7301     };
7302 
7303     int insts_size = 1024;
7304     int locs_size = 64;
7305     CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
7306     OopMapSet* oop_maps = new OopMapSet();
7307     MacroAssembler* masm = new MacroAssembler(&code);
7308     MacroAssembler* _masm = masm;
7309 
7310     address start = __ pc();
7311     __ enter();
7312     int frame_complete = __ pc() - start;
7313     address the_pc = __ pc();
7314     jfr_prologue(the_pc, _masm, rthread);
7315     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
7316     jfr_epilogue(_masm);
7317     __ resolve_global_jobject(r0, rscratch1, rscratch2);
7318     __ leave();
7319     __ ret(lr);
7320 
7321     OopMap* map = new OopMap(framesize, 1); // rfp
7322     oop_maps->add_gc_map(the_pc - start, map);
7323 
7324     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7325       RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
7326                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7327                                     oop_maps, false);
7328     return stub;
7329   }
7330 
7331   // For c2: call to return a leased buffer.
7332   static RuntimeStub* generate_jfr_return_lease() {
7333     enum layout {
7334       rbp_off,
7335       rbpH_off,
7336       return_off,
7337       return_off2,
7338       framesize // inclusive of return address
7339     };
7340 
7341     int insts_size = 1024;
7342     int locs_size = 64;
7343     CodeBuffer code("jfr_return_lease", insts_size, locs_size);
7344     OopMapSet* oop_maps = new OopMapSet();
7345     MacroAssembler* masm = new MacroAssembler(&code);
7346     MacroAssembler* _masm = masm;
7347 
7348     address start = __ pc();
7349     __ enter();
7350     int frame_complete = __ pc() - start;
7351     address the_pc = __ pc();
7352     jfr_prologue(the_pc, _masm, rthread);
7353     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::return_lease), 1);
7354     jfr_epilogue(_masm);
7355 
7356     __ leave();
7357     __ ret(lr);
7358 
7359     OopMap* map = new OopMap(framesize, 1); // rfp
7360     oop_maps->add_gc_map(the_pc - start, map);
7361 
7362     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7363       RuntimeStub::new_runtime_stub("jfr_return_lease", &code, frame_complete,
7364                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7365                                     oop_maps, false);
7366     return stub;
7367   }
7368 
7369 #endif // INCLUDE_JFR
7370 
7371   // exception handler for upcall stubs
7372   address generate_upcall_stub_exception_handler() {
7373     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
7374     address start = __ pc();
7375 
7376     // Native caller has no idea how to handle exceptions,
7377     // so we just crash here. Up to callee to catch exceptions.
7378     __ verify_oop(r0);
7379     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
7380     __ blr(rscratch1);
7381     __ should_not_reach_here();
7382 
7383     return start;
7384   }
7385 
7386   // Continuation point for throwing of implicit exceptions that are
7387   // not handled in the current activation. Fabricates an exception
7388   // oop and initiates normal exception dispatching in this
7389   // frame. Since we need to preserve callee-saved values (currently
7390   // only for C2, but done for C1 as well) we need a callee-saved oop
7391   // map and therefore have to make these stubs into RuntimeStubs
7392   // rather than BufferBlobs.  If the compiler needs all registers to
7393   // be preserved between the fault point and the exception handler
7394   // then it must assume responsibility for that in
7395   // AbstractCompiler::continuation_for_implicit_null_exception or
7396   // continuation_for_implicit_division_by_zero_exception. All other
7397   // implicit exceptions (e.g., NullPointerException or
7398   // AbstractMethodError on entry) are either at call sites or
7399   // otherwise assume that stack unwinding will be initiated, so
7400   // caller saved registers were assumed volatile in the compiler.
7401 
7402 #undef __
7403 #define __ masm->
7404 
7405   address generate_throw_exception(const char* name,
7406                                    address runtime_entry,
7407                                    Register arg1 = noreg,
7408                                    Register arg2 = noreg) {
7409     // Information about frame layout at time of blocking runtime call.
7410     // Note that we only have to preserve callee-saved registers since
7411     // the compilers are responsible for supplying a continuation point
7412     // if they expect all registers to be preserved.
7413     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
7414     enum layout {
7415       rfp_off = 0,
7416       rfp_off2,
7417       return_off,
7418       return_off2,
7419       framesize // inclusive of return address
7420     };
7421 
7422     int insts_size = 512;
7423     int locs_size  = 64;
7424 
7425     CodeBuffer code(name, insts_size, locs_size);
7426     OopMapSet* oop_maps  = new OopMapSet();
7427     MacroAssembler* masm = new MacroAssembler(&code);
7428 
7429     address start = __ pc();
7430 
7431     // This is an inlined and slightly modified version of call_VM
7432     // which has the ability to fetch the return PC out of
7433     // thread-local storage and also sets up last_Java_sp slightly
7434     // differently than the real call_VM
7435 
7436     __ enter(); // Save FP and LR before call
7437 
7438     assert(is_even(framesize/2), "sp not 16-byte aligned");
7439 
7440     // lr and fp are already in place
7441     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
7442 
7443     int frame_complete = __ pc() - start;
7444 
7445     // Set up last_Java_sp and last_Java_fp
7446     address the_pc = __ pc();
7447     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7448 
7449     // Call runtime
7450     if (arg1 != noreg) {
7451       assert(arg2 != c_rarg1, "clobbered");
7452       __ mov(c_rarg1, arg1);
7453     }
7454     if (arg2 != noreg) {
7455       __ mov(c_rarg2, arg2);
7456     }
7457     __ mov(c_rarg0, rthread);
7458     BLOCK_COMMENT("call runtime_entry");
7459     __ mov(rscratch1, runtime_entry);
7460     __ blr(rscratch1);
7461 
7462     // Generate oop map
7463     OopMap* map = new OopMap(framesize, 0);
7464 
7465     oop_maps->add_gc_map(the_pc - start, map);
7466 
7467     __ reset_last_Java_frame(true);
7468 
7469     // Reinitialize the ptrue predicate register, in case the external runtime
7470     // call clobbers ptrue reg, as we may return to SVE compiled code.
7471     __ reinitialize_ptrue();
7472 
7473     __ leave();
7474 
7475     // check for pending exceptions
7476 #ifdef ASSERT
7477     Label L;
7478     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
7479     __ cbnz(rscratch1, L);
7480     __ should_not_reach_here();
7481     __ bind(L);
7482 #endif // ASSERT
7483     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7484 
7485     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7486     RuntimeStub* stub =
7487       RuntimeStub::new_runtime_stub(name,
7488                                     &code,
7489                                     frame_complete,
7490                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7491                                     oop_maps, false);
7492     return stub->entry_point();
7493   }
7494 
7495   class MontgomeryMultiplyGenerator : public MacroAssembler {
7496 
7497     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7498       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7499 
7500     RegSet _toSave;
7501     bool _squaring;
7502 
7503   public:
7504     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7505       : MacroAssembler(as->code()), _squaring(squaring) {
7506 
7507       // Register allocation
7508 
7509       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7510       Pa_base = *regs;       // Argument registers
7511       if (squaring)
7512         Pb_base = Pa_base;
7513       else
7514         Pb_base = *++regs;
7515       Pn_base = *++regs;
7516       Rlen= *++regs;
7517       inv = *++regs;
7518       Pm_base = *++regs;
7519 
7520                           // Working registers:
7521       Ra =  *++regs;        // The current digit of a, b, n, and m.
7522       Rb =  *++regs;
7523       Rm =  *++regs;
7524       Rn =  *++regs;
7525 
7526       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7527       Pb =  *++regs;
7528       Pm =  *++regs;
7529       Pn =  *++regs;
7530 
7531       t0 =  *++regs;        // Three registers which form a
7532       t1 =  *++regs;        // triple-precision accumulator.
7533       t2 =  *++regs;
7534 
7535       Ri =  *++regs;        // Inner and outer loop indexes.
7536       Rj =  *++regs;
7537 
7538       Rhi_ab = *++regs;     // Product registers: low and high parts
7539       Rlo_ab = *++regs;     // of a*b and m*n.
7540       Rhi_mn = *++regs;
7541       Rlo_mn = *++regs;
7542 
7543       // r19 and up are callee-saved.
7544       _toSave = RegSet::range(r19, *regs) + Pm_base;
7545     }
7546 
7547   private:
7548     void save_regs() {
7549       push(_toSave, sp);
7550     }
7551 
7552     void restore_regs() {
7553       pop(_toSave, sp);
7554     }
7555 
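         // Invoke 'block' count times, two calls per loop iteration; an odd
         // count is handled by branching straight to the second call (the
         // 'odd' label).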
7556     template <typename T>
7557     void unroll_2(Register count, T block) {
7558       Label loop, end, odd;
7559       tbnz(count, 0, odd);
7560       cbz(count, end);
7561       align(16);
7562       bind(loop);
7563       (this->*block)();
7564       bind(odd);
7565       (this->*block)();
7566       subs(count, count, 2);
7567       br(Assembler::GT, loop);
7568       bind(end);
7569     }
7570 
7571     template <typename T>
7572     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7573       Label loop, end, odd;
7574       tbnz(count, 0, odd);
7575       cbz(count, end);
7576       align(16);
7577       bind(loop);
7578       (this->*block)(d, s, tmp);
7579       bind(odd);
7580       (this->*block)(d, s, tmp);
7581       subs(count, count, 2);
7582       br(Assembler::GT, loop);
7583       bind(end);
7584     }
7585 
7586     void pre1(RegisterOrConstant i) {
7587       block_comment("pre1");
7588       // Pa = Pa_base;
7589       // Pb = Pb_base + i;
7590       // Pm = Pm_base;
7591       // Pn = Pn_base + i;
7592       // Ra = *Pa;
7593       // Rb = *Pb;
7594       // Rm = *Pm;
7595       // Rn = *Pn;
7596       ldr(Ra, Address(Pa_base));
7597       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7598       ldr(Rm, Address(Pm_base));
7599       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7600       lea(Pa, Address(Pa_base));
7601       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7602       lea(Pm, Address(Pm_base));
7603       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7604 
7605       // Zero the m*n result.
7606       mov(Rhi_mn, zr);
7607       mov(Rlo_mn, zr);
7608     }
7609 
7610     // The core multiply-accumulate step of a Montgomery
7611     // multiplication.  The idea is to schedule operations as a
7612     // pipeline so that instructions with long latencies (loads and
7613     // multiplies) have time to complete before their results are
7614     // used.  This most benefits in-order implementations of the
7615     // architecture but out-of-order ones also benefit.
7616     void step() {
7617       block_comment("step");
7618       // MACC(Ra, Rb, t0, t1, t2);
7619       // Ra = *++Pa;
7620       // Rb = *--Pb;
7621       umulh(Rhi_ab, Ra, Rb);
7622       mul(Rlo_ab, Ra, Rb);
7623       ldr(Ra, pre(Pa, wordSize));
7624       ldr(Rb, pre(Pb, -wordSize));
7625       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7626                                        // previous iteration.
7627       // MACC(Rm, Rn, t0, t1, t2);
7628       // Rm = *++Pm;
7629       // Rn = *--Pn;
7630       umulh(Rhi_mn, Rm, Rn);
7631       mul(Rlo_mn, Rm, Rn);
7632       ldr(Rm, pre(Pm, wordSize));
7633       ldr(Rn, pre(Pn, -wordSize));
7634       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7635     }
7636 
7637     void post1() {
7638       block_comment("post1");
7639 
7640       // MACC(Ra, Rb, t0, t1, t2);
7641       // Ra = *++Pa;
7642       // Rb = *--Pb;
7643       umulh(Rhi_ab, Ra, Rb);
7644       mul(Rlo_ab, Ra, Rb);
7645       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7646       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7647 
7648       // *Pm = Rm = t0 * inv;
7649       mul(Rm, t0, inv);
7650       str(Rm, Address(Pm));
7651 
7652       // MACC(Rm, Rn, t0, t1, t2);
7653       // t0 = t1; t1 = t2; t2 = 0;
7654       umulh(Rhi_mn, Rm, Rn);
7655 
7656 #ifndef PRODUCT
7657       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7658       {
7659         mul(Rlo_mn, Rm, Rn);
7660         add(Rlo_mn, t0, Rlo_mn);
7661         Label ok;
7662         cbz(Rlo_mn, ok); {
7663           stop("broken Montgomery multiply");
7664         } bind(ok);
7665       }
7666 #endif
7667       // We have very carefully set things up so that
7668       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7669       // the lower half of Rm * Rn because we know the result already:
7670       // it must be -t0.  t0 + (-t0) must generate a carry iff
7671       // t0 != 0.  So, rather than do a mul and an adds we just set
7672       // the carry flag iff t0 is nonzero.
7673       //
7674       // mul(Rlo_mn, Rm, Rn);
7675       // adds(zr, t0, Rlo_mn);
7676       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7677       adcs(t0, t1, Rhi_mn);
7678       adc(t1, t2, zr);
7679       mov(t2, zr);
7680     }
7681 
7682     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7683       block_comment("pre2");
7684       // Pa = Pa_base + i-len;
7685       // Pb = Pb_base + len;
7686       // Pm = Pm_base + i-len;
7687       // Pn = Pn_base + len;
7688 
7689       if (i.is_register()) {
7690         sub(Rj, i.as_register(), len);
7691       } else {
7692         mov(Rj, i.as_constant());
7693         sub(Rj, Rj, len);
7694       }
7695       // Rj == i-len
7696 
7697       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7698       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7699       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7700       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7701 
7702       // Ra = *++Pa;
7703       // Rb = *--Pb;
7704       // Rm = *++Pm;
7705       // Rn = *--Pn;
7706       ldr(Ra, pre(Pa, wordSize));
7707       ldr(Rb, pre(Pb, -wordSize));
7708       ldr(Rm, pre(Pm, wordSize));
7709       ldr(Rn, pre(Pn, -wordSize));
7710 
7711       mov(Rhi_mn, zr);
7712       mov(Rlo_mn, zr);
7713     }
7714 
7715     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7716       block_comment("post2");
7717       if (i.is_constant()) {
7718         mov(Rj, i.as_constant()-len.as_constant());
7719       } else {
7720         sub(Rj, i.as_register(), len);
7721       }
7722 
7723       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7724 
7725       // As soon as we know the least significant digit of our result,
7726       // store it.
7727       // Pm_base[i-len] = t0;
7728       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7729 
7730       // t0 = t1; t1 = t2; t2 = 0;
7731       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7732       adc(t1, t2, zr);
7733       mov(t2, zr);
7734     }
7735 
7736     // A carry in t0 after Montgomery multiplication means that we
7737     // should subtract multiples of n from our result in m.  We'll
7738     // keep doing that until there is no carry.
7739     void normalize(RegisterOrConstant len) {
7740       block_comment("normalize");
7741       // while (t0)
7742       //   t0 = sub(Pm_base, Pn_base, t0, len);
7743       Label loop, post, again;
7744       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7745       cbz(t0, post); {
7746         bind(again); {
7747           mov(i, zr);
7748           mov(cnt, len);
7749           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7750           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7751           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7752           align(16);
7753           bind(loop); {
7754             sbcs(Rm, Rm, Rn);
7755             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7756             add(i, i, 1);
7757             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7758             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7759             sub(cnt, cnt, 1);
7760           } cbnz(cnt, loop);
7761           sbc(t0, t0, zr);
7762         } cbnz(t0, again);
7763       } bind(post);
7764     }
7765 
7766     // Move memory at s to d, reversing words.
7767     //    Increments d to end of copied memory
7768     //    Destroys tmp1, tmp2
7769     //    Preserves len
7770     //    Leaves s pointing to the address which was in d at start
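         //
         //    In C, approximately:
         //
         //      julong *src = (julong *)s, *dst = (julong *)d;
         //      for (int i = 0; i < len; i++) {
         //        julong w = src[len - 1 - i];    // walk s downward from the top
         //        *dst++ = (w >> 32) | (w << 32); // swap the two 32-bit halves
         //      }
         //      d = dst; s = dst - len;           // s now holds the original d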
7771     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7772       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7773       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7774 
7775       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7776       mov(tmp1, len);
7777       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7778       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7779     }
7780     // where
7781     void reverse1(Register d, Register s, Register tmp) {
7782       ldr(tmp, pre(s, -wordSize));
7783       ror(tmp, tmp, 32);
7784       str(tmp, post(d, wordSize));
7785     }
7786 
7787     void step_squaring() {
7788       // An extra ACC
7789       step();
7790       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7791     }
7792 
7793     void last_squaring(RegisterOrConstant i) {
7794       Label dont;
7795       // if ((i & 1) == 0) {
7796       tbnz(i.as_register(), 0, dont); {
7797         // MACC(Ra, Rb, t0, t1, t2);
7798         // Ra = *++Pa;
7799         // Rb = *--Pb;
7800         umulh(Rhi_ab, Ra, Rb);
7801         mul(Rlo_ab, Ra, Rb);
7802         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7803       } bind(dont);
7804     }
7805 
7806     void extra_step_squaring() {
7807       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7808 
7809       // MACC(Rm, Rn, t0, t1, t2);
7810       // Rm = *++Pm;
7811       // Rn = *--Pn;
7812       umulh(Rhi_mn, Rm, Rn);
7813       mul(Rlo_mn, Rm, Rn);
7814       ldr(Rm, pre(Pm, wordSize));
7815       ldr(Rn, pre(Pn, -wordSize));
7816     }
7817 
7818     void post1_squaring() {
7819       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7820 
7821       // *Pm = Rm = t0 * inv;
7822       mul(Rm, t0, inv);
7823       str(Rm, Address(Pm));
7824 
7825       // MACC(Rm, Rn, t0, t1, t2);
7826       // t0 = t1; t1 = t2; t2 = 0;
7827       umulh(Rhi_mn, Rm, Rn);
7828 
7829 #ifndef PRODUCT
7830       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7831       {
7832         mul(Rlo_mn, Rm, Rn);
7833         add(Rlo_mn, t0, Rlo_mn);
7834         Label ok;
7835         cbz(Rlo_mn, ok); {
7836           stop("broken Montgomery multiply");
7837         } bind(ok);
7838       }
7839 #endif
7840       // We have very carefully set things up so that
7841       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7842       // the lower half of Rm * Rn because we know the result already:
7843       // it must be -t0.  t0 + (-t0) must generate a carry iff
7844       // t0 != 0.  So, rather than do a mul and an adds we just set
7845       // the carry flag iff t0 is nonzero.
7846       //
7847       // mul(Rlo_mn, Rm, Rn);
7848       // adds(zr, t0, Rlo_mn);
7849       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7850       adcs(t0, t1, Rhi_mn);
7851       adc(t1, t2, zr);
7852       mov(t2, zr);
7853     }
7854 
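         // Add the 128-bit value Rhi:Rlo into the triple-precision
         // accumulator t2:t1:t0.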
7855     void acc(Register Rhi, Register Rlo,
7856              Register t0, Register t1, Register t2) {
7857       adds(t0, t0, Rlo);
7858       adcs(t1, t1, Rhi);
7859       adc(t2, t2, zr);
7860     }
7861 
7862   public:
7863     /**
7864      * Fast Montgomery multiplication.  The derivation of the
7865      * algorithm is in A Cryptographic Library for the Motorola
7866      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7867      *
7868      * Arguments:
7869      *
7870      * Inputs for multiplication:
7871      *   c_rarg0   - int array elements a
7872      *   c_rarg1   - int array elements b
7873      *   c_rarg2   - int array elements n (the modulus)
7874      *   c_rarg3   - int length
7875      *   c_rarg4   - int inv
7876      *   c_rarg5   - int array elements m (the result)
7877      *
7878      * Inputs for squaring:
7879      *   c_rarg0   - int array elements a
7880      *   c_rarg1   - int array elements n (the modulus)
7881      *   c_rarg2   - int length
7882      *   c_rarg3   - int inv
7883      *   c_rarg4   - int array elements m (the result)
7884      *
7885      */
7886     address generate_multiply() {
7887       Label argh, nothing;
7888       bind(argh);
7889       stop("MontgomeryMultiply total_allocation must be <= 8192");
7890 
7891       align(CodeEntryAlignment);
7892       address entry = pc();
7893 
7894       cbzw(Rlen, nothing);
7895 
7896       enter();
7897 
7898       // Make room.
7899       cmpw(Rlen, 512);
7900       br(Assembler::HI, argh);
7901       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7902       andr(sp, Ra, -2 * wordSize);
7903 
7904       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7905 
7906       {
7907         // Copy input args, reversing as we go.  We use Ra as a
7908         // temporary variable.
7909         reverse(Ra, Pa_base, Rlen, t0, t1);
7910         if (!_squaring)
7911           reverse(Ra, Pb_base, Rlen, t0, t1);
7912         reverse(Ra, Pn_base, Rlen, t0, t1);
7913       }
7914 
7915       // Push all call-saved registers and also Pm_base which we'll need
7916       // at the end.
7917       save_regs();
7918 
7919 #ifndef PRODUCT
7920       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7921       {
7922         ldr(Rn, Address(Pn_base, 0));
7923         mul(Rlo_mn, Rn, inv);
7924         subs(zr, Rlo_mn, -1);
7925         Label ok;
7926         br(EQ, ok); {
7927           stop("broken inverse in Montgomery multiply");
7928         } bind(ok);
7929       }
7930 #endif
7931 
7932       mov(Pm_base, Ra);
7933 
7934       mov(t0, zr);
7935       mov(t1, zr);
7936       mov(t2, zr);
7937 
7938       block_comment("for (int i = 0; i < len; i++) {");
7939       mov(Ri, zr); {
7940         Label loop, end;
7941         cmpw(Ri, Rlen);
7942         br(Assembler::GE, end);
7943 
7944         bind(loop);
7945         pre1(Ri);
7946 
7947         block_comment("  for (j = i; j; j--) {"); {
7948           movw(Rj, Ri);
7949           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7950         } block_comment("  } // j");
7951 
7952         post1();
7953         addw(Ri, Ri, 1);
7954         cmpw(Ri, Rlen);
7955         br(Assembler::LT, loop);
7956         bind(end);
7957         block_comment("} // i");
7958       }
7959 
7960       block_comment("for (int i = len; i < 2*len; i++) {");
7961       mov(Ri, Rlen); {
7962         Label loop, end;
7963         cmpw(Ri, Rlen, Assembler::LSL, 1);
7964         br(Assembler::GE, end);
7965 
7966         bind(loop);
7967         pre2(Ri, Rlen);
7968 
7969         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7970           lslw(Rj, Rlen, 1);
7971           subw(Rj, Rj, Ri);
7972           subw(Rj, Rj, 1);
7973           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7974         } block_comment("  } // j");
7975 
7976         post2(Ri, Rlen);
7977         addw(Ri, Ri, 1);
7978         cmpw(Ri, Rlen, Assembler::LSL, 1);
7979         br(Assembler::LT, loop);
7980         bind(end);
7981       }
7982       block_comment("} // i");
7983 
7984       normalize(Rlen);
7985 
7986       mov(Ra, Pm_base);  // Save Pm_base in Ra
7987       restore_regs();  // Restore caller's Pm_base
7988 
7989       // Copy our result into caller's Pm_base
7990       reverse(Pm_base, Ra, Rlen, t0, t1);
7991 
7992       leave();
7993       bind(nothing);
7994       ret(lr);
7995 
7996       return entry;
7997     }
7998     // In C, approximately:
7999 
8000     // void
8001     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
8002     //                     julong Pn_base[], julong Pm_base[],
8003     //                     julong inv, int len) {
8004     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8005     //   julong *Pa, *Pb, *Pn, *Pm;
8006     //   julong Ra, Rb, Rn, Rm;
8007 
8008     //   int i;
8009 
8010     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8011 
8012     //   for (i = 0; i < len; i++) {
8013     //     int j;
8014 
8015     //     Pa = Pa_base;
8016     //     Pb = Pb_base + i;
8017     //     Pm = Pm_base;
8018     //     Pn = Pn_base + i;
8019 
8020     //     Ra = *Pa;
8021     //     Rb = *Pb;
8022     //     Rm = *Pm;
8023     //     Rn = *Pn;
8024 
8025     //     int iters = i;
8026     //     for (j = 0; iters--; j++) {
8027     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8028     //       MACC(Ra, Rb, t0, t1, t2);
8029     //       Ra = *++Pa;
8030     //       Rb = *--Pb;
8031     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8032     //       MACC(Rm, Rn, t0, t1, t2);
8033     //       Rm = *++Pm;
8034     //       Rn = *--Pn;
8035     //     }
8036 
8037     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
8038     //     MACC(Ra, Rb, t0, t1, t2);
8039     //     *Pm = Rm = t0 * inv;
8040     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8041     //     MACC(Rm, Rn, t0, t1, t2);
8042 
8043     //     assert(t0 == 0, "broken Montgomery multiply");
8044 
8045     //     t0 = t1; t1 = t2; t2 = 0;
8046     //   }
8047 
8048     //   for (i = len; i < 2*len; i++) {
8049     //     int j;
8050 
8051     //     Pa = Pa_base + i-len;
8052     //     Pb = Pb_base + len;
8053     //     Pm = Pm_base + i-len;
8054     //     Pn = Pn_base + len;
8055 
8056     //     Ra = *++Pa;
8057     //     Rb = *--Pb;
8058     //     Rm = *++Pm;
8059     //     Rn = *--Pn;
8060 
8061     //     int iters = len*2-i-1;
8062     //     for (j = i-len+1; iters--; j++) {
8063     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8064     //       MACC(Ra, Rb, t0, t1, t2);
8065     //       Ra = *++Pa;
8066     //       Rb = *--Pb;
8067     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8068     //       MACC(Rm, Rn, t0, t1, t2);
8069     //       Rm = *++Pm;
8070     //       Rn = *--Pn;
8071     //     }
8072 
8073     //     Pm_base[i-len] = t0;
8074     //     t0 = t1; t1 = t2; t2 = 0;
8075     //   }
8076 
8077     //   while (t0)
8078     //     t0 = sub(Pm_base, Pn_base, t0, len);
8079     // }
8080 
8081     /**
8082      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
8083      * multiplies than Montgomery multiplication so it should be up to
8084      * 25% faster.  However, its loop control is more complex and it
8085      * may actually run slower on some machines.
8086      *
8087      * Arguments:
8088      *
8089      * Inputs:
8090      *   c_rarg0   - int array elements a
8091      *   c_rarg1   - int array elements n (the modulus)
8092      *   c_rarg2   - int length
8093      *   c_rarg3   - int inv
8094      *   c_rarg4   - int array elements m (the result)
8095      *
8096      */
8097     address generate_square() {
8098       Label argh;
8099       bind(argh);
8100       stop("MontgomeryMultiply total_allocation must be <= 8192");
8101 
8102       align(CodeEntryAlignment);
8103       address entry = pc();
8104 
8105       enter();
8106 
8107       // Make room.
8108       cmpw(Rlen, 512);
8109       br(Assembler::HI, argh);
8110       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8111       andr(sp, Ra, -2 * wordSize);
8112 
8113       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8114 
8115       {
8116         // Copy input args, reversing as we go.  We use Ra as a
8117         // temporary variable.
8118         reverse(Ra, Pa_base, Rlen, t0, t1);
8119         reverse(Ra, Pn_base, Rlen, t0, t1);
8120       }
8121 
8122       // Push all call-saved registers and also Pm_base which we'll need
8123       // at the end.
8124       save_regs();
8125 
8126       mov(Pm_base, Ra);
8127 
8128       mov(t0, zr);
8129       mov(t1, zr);
8130       mov(t2, zr);
8131 
8132       block_comment("for (int i = 0; i < len; i++) {");
8133       mov(Ri, zr); {
8134         Label loop, end;
8135         bind(loop);
8136         cmp(Ri, Rlen);
8137         br(Assembler::GE, end);
8138 
8139         pre1(Ri);
8140 
8141         block_comment("for (j = (i+1)/2; j; j--) {"); {
8142           add(Rj, Ri, 1);
8143           lsr(Rj, Rj, 1);
8144           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8145         } block_comment("  } // j");
8146 
8147         last_squaring(Ri);
8148 
8149         block_comment("  for (j = i/2; j; j--) {"); {
8150           lsr(Rj, Ri, 1);
8151           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8152         } block_comment("  } // j");
8153 
8154         post1_squaring();
8155         add(Ri, Ri, 1);
8156         cmp(Ri, Rlen);
8157         br(Assembler::LT, loop);
8158 
8159         bind(end);
8160         block_comment("} // i");
8161       }
8162 
8163       block_comment("for (int i = len; i < 2*len; i++) {");
8164       mov(Ri, Rlen); {
8165         Label loop, end;
8166         bind(loop);
8167         cmp(Ri, Rlen, Assembler::LSL, 1);
8168         br(Assembler::GE, end);
8169 
8170         pre2(Ri, Rlen);
8171 
8172         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
8173           lsl(Rj, Rlen, 1);
8174           sub(Rj, Rj, Ri);
8175           sub(Rj, Rj, 1);
8176           lsr(Rj, Rj, 1);
8177           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8178         } block_comment("  } // j");
8179 
8180         last_squaring(Ri);
8181 
8182         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
8183           lsl(Rj, Rlen, 1);
8184           sub(Rj, Rj, Ri);
8185           lsr(Rj, Rj, 1);
8186           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8187         } block_comment("  } // j");
8188 
8189         post2(Ri, Rlen);
8190         add(Ri, Ri, 1);
8191         cmp(Ri, Rlen, Assembler::LSL, 1);
8192 
8193         br(Assembler::LT, loop);
8194         bind(end);
8195         block_comment("} // i");
8196       }
8197 
8198       normalize(Rlen);
8199 
8200       mov(Ra, Pm_base);  // Save Pm_base in Ra
8201       restore_regs();  // Restore caller's Pm_base
8202 
8203       // Copy our result into caller's Pm_base
8204       reverse(Pm_base, Ra, Rlen, t0, t1);
8205 
8206       leave();
8207       ret(lr);
8208 
8209       return entry;
8210     }
8211     // In C, approximately:
8212 
8213     // void
8214     // montgomery_square(julong Pa_base[], julong Pn_base[],
8215     //                   julong Pm_base[], julong inv, int len) {
8216     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8217     //   julong *Pa, *Pb, *Pn, *Pm;
8218     //   julong Ra, Rb, Rn, Rm;
8219 
8220     //   int i;
8221 
8222     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8223 
8224     //   for (i = 0; i < len; i++) {
8225     //     int j;
8226 
8227     //     Pa = Pa_base;
8228     //     Pb = Pa_base + i;
8229     //     Pm = Pm_base;
8230     //     Pn = Pn_base + i;
8231 
8232     //     Ra = *Pa;
8233     //     Rb = *Pb;
8234     //     Rm = *Pm;
8235     //     Rn = *Pn;
8236 
8237     //     int iters = (i+1)/2;
8238     //     for (j = 0; iters--; j++) {
8239     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8240     //       MACC2(Ra, Rb, t0, t1, t2);
8241     //       Ra = *++Pa;
8242     //       Rb = *--Pb;
8243     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8244     //       MACC(Rm, Rn, t0, t1, t2);
8245     //       Rm = *++Pm;
8246     //       Rn = *--Pn;
8247     //     }
8248     //     if ((i & 1) == 0) {
8249     //       assert(Ra == Pa_base[j], "must be");
8250     //       MACC(Ra, Ra, t0, t1, t2);
8251     //     }
8252     //     iters = i/2;
8253     //     assert(iters == i-j, "must be");
8254     //     for (; iters--; j++) {
8255     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8256     //       MACC(Rm, Rn, t0, t1, t2);
8257     //       Rm = *++Pm;
8258     //       Rn = *--Pn;
8259     //     }
8260 
8261     //     *Pm = Rm = t0 * inv;
8262     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8263     //     MACC(Rm, Rn, t0, t1, t2);
8264 
8265     //     assert(t0 == 0, "broken Montgomery multiply");
8266 
8267     //     t0 = t1; t1 = t2; t2 = 0;
8268     //   }
8269 
8270     //   for (i = len; i < 2*len; i++) {
8271     //     int start = i-len+1;
8272     //     int end = start + (len - start)/2;
8273     //     int j;
8274 
8275     //     Pa = Pa_base + i-len;
8276     //     Pb = Pa_base + len;
8277     //     Pm = Pm_base + i-len;
8278     //     Pn = Pn_base + len;
8279 
8280     //     Ra = *++Pa;
8281     //     Rb = *--Pb;
8282     //     Rm = *++Pm;
8283     //     Rn = *--Pn;
8284 
8285     //     int iters = (2*len-i-1)/2;
8286     //     assert(iters == end-start, "must be");
8287     //     for (j = start; iters--; j++) {
8288     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8289     //       MACC2(Ra, Rb, t0, t1, t2);
8290     //       Ra = *++Pa;
8291     //       Rb = *--Pb;
8292     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8293     //       MACC(Rm, Rn, t0, t1, t2);
8294     //       Rm = *++Pm;
8295     //       Rn = *--Pn;
8296     //     }
8297     //     if ((i & 1) == 0) {
8298     //       assert(Ra == Pa_base[j], "must be");
8299     //       MACC(Ra, Ra, t0, t1, t2);
8300     //     }
8301     //     iters =  (2*len-i)/2;
8302     //     assert(iters == len-j, "must be");
8303     //     for (; iters--; j++) {
8304     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8305     //       MACC(Rm, Rn, t0, t1, t2);
8306     //       Rm = *++Pm;
8307     //       Rn = *--Pn;
8308     //     }
8309     //     Pm_base[i-len] = t0;
8310     //     t0 = t1; t1 = t2; t2 = 0;
8311     //   }
8312 
8313     //   while (t0)
8314     //     t0 = sub(Pm_base, Pn_base, t0, len);
8315     // }
8316   };
8317 
8318 
8319   // Called from the interpreter or compiled code either to load the
8320   // multiple returned values of an inline type instance being returned
8321   // into registers, or to store the returned values into a newly
8322   // allocated inline type instance.
8323   address generate_return_value_stub(address destination, const char* name, bool has_res) {
8324     // We need to save all registers the calling convention may use so
8325     // the runtime calls read or update those registers. This needs to
8326     // be in sync with SharedRuntime::java_return_convention().
8327     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
8328     enum layout {
8329       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
8330       j_rarg6_off, j_rarg6_2,
8331       j_rarg5_off, j_rarg5_2,
8332       j_rarg4_off, j_rarg4_2,
8333       j_rarg3_off, j_rarg3_2,
8334       j_rarg2_off, j_rarg2_2,
8335       j_rarg1_off, j_rarg1_2,
8336       j_rarg0_off, j_rarg0_2,
8337 
8338       j_farg7_off, j_farg7_2,
8339       j_farg6_off, j_farg6_2,
8340       j_farg5_off, j_farg5_2,
8341       j_farg4_off, j_farg4_2,
8342       j_farg3_off, j_farg3_2,
8343       j_farg2_off, j_farg2_2,
8344       j_farg1_off, j_farg1_2,
8345       j_farg0_off, j_farg0_2,
8346 
8347       rfp_off, rfp_off2,
8348       return_off, return_off2,
8349 
8350       framesize // inclusive of return address
8351     };
8352 
8353     CodeBuffer code(name, 512, 64);
8354     MacroAssembler* masm = new MacroAssembler(&code);
8355 
8356     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
8357     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
8358     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
8359     int frame_size_in_words = frame_size_in_bytes / wordSize;
8360 
8361     OopMapSet* oop_maps = new OopMapSet();
8362     OopMap* map = new OopMap(frame_size_in_slots, 0);
8363 
8364     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
8365     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
8366     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
8367     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
8368     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
8369     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
8370     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
8371     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
8372 
8373     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
8374     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
8375     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
8376     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
8377     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
8378     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
8379     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
8380     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
8381 
8382     address start = __ pc();
8383 
8384     __ enter(); // Save FP and LR before call
8385 
8386     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
8387     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
8388     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
8389     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
8390 
8391     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
8392     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
8393     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
8394     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
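    // The push order mirrors the layout enum above: j_rarg7/j_rarg6 are
    // pushed last and therefore end up in the lowest stack slots
    // (j_rarg7_off == 0), so the OopMap slot indices recorded above match
    // where the registers are actually saved.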
8395 
8396     int frame_complete = __ offset();
8397 
8398     // Set up last_Java_sp and last_Java_pc (fp is not needed here)
8399     address the_pc = __ pc();
8400     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
8401 
8402     // Call runtime
8403     __ mov(c_rarg1, r0);
8404     __ mov(c_rarg0, rthread);
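    // Both runtime targets are assumed to take (current thread, value in r0),
    // where r0 holds whatever the Java return convention left there for the
    // inline type being returned (see SharedRuntime::java_return_convention()).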
8405 
8406     __ mov(rscratch1, destination);
8407     __ blr(rscratch1);
8408 
8409     oop_maps->add_gc_map(the_pc - start, map);
8410 
8411     __ reset_last_Java_frame(false);
8412 
8413     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
8414     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
8415     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
8416     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
8417 
8418     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
8419     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
8420     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
8421     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
8422 
8423     __ leave();
8424 
8425     // check for pending exceptions
8426     Label pending;
8427     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
8428     __ cbnz(rscratch1, pending);
8429 
8430     if (has_res) {
8431       __ get_vm_result(r0, rthread);
8432     }
8433 
8434     __ ret(lr);
8435 
8436     __ bind(pending);
8437     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
8438 
8439     // -------------
8440     // make sure all code is generated
8441     masm->flush();
8442 
8443     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
8444     return stub->entry_point();
8445   }
8446 
8447   // Initialization
8448   void generate_initial_stubs() {
8449     // Generates the initial stubs and initializes their entry points
8450 
8451     // Entry points that exist on all platforms. Note: this is code
8452     // that could be shared among different platforms; however, the
8453     // benefit seems to be smaller than the disadvantage of having a
8454     // much more complicated generator structure. See also the comment
8455     // in stubRoutines.hpp.
8456 
8457     StubRoutines::_forward_exception_entry = generate_forward_exception();
8458 
8459     StubRoutines::_call_stub_entry =
8460       generate_call_stub(StubRoutines::_call_stub_return_address);
8461 
8462     // This entry point is referenced by megamorphic calls.
8463     StubRoutines::_catch_exception_entry = generate_catch_exception();
8464 
8465     // Build this early so it's available for the interpreter.
8466     StubRoutines::_throw_StackOverflowError_entry =
8467       generate_throw_exception("StackOverflowError throw_exception",
8468                                CAST_FROM_FN_PTR(address,
8469                                                 SharedRuntime::throw_StackOverflowError));
8470     StubRoutines::_throw_delayed_StackOverflowError_entry =
8471       generate_throw_exception("delayed StackOverflowError throw_exception",
8472                                CAST_FROM_FN_PTR(address,
8473                                                 SharedRuntime::throw_delayed_StackOverflowError));
8474 
8475     // Initialize table for copy memory (arraycopy) check.
8476     if (UnsafeCopyMemory::_table == nullptr) {
8477       UnsafeCopyMemory::create_table(8);
8478     }
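    // The table is assumed to record the PC ranges of copy stubs that touch
    // memory supplied by the user, so that a fault raised inside such a
    // range can be handled instead of crashing the VM.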
8479 
8480     if (UseCRC32Intrinsics) {
8481       // Set the table address before generating the stubs that use it.
8482       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8483       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8484     }
8485 
8486     if (UseCRC32CIntrinsics) {
8487       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8488     }
8489 
8490     // Disabled until JDK-8210858 is fixed
8491     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
8492     //   StubRoutines::_dlog = generate_dlog();
8493     // }
8494 
8495     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8496       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8497     }
8498 
8499     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8500       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8501     }
8502 
8503     if (InlineTypeReturnedAsFields) {
8504       StubRoutines::_load_inline_type_fields_in_regs =
8505          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
8506       StubRoutines::_store_inline_type_fields_to_buf =
8507          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
8508     }
8509   }
8510 
8511   void generate_continuation_stubs() {
8512     // Continuation stubs:
8513     StubRoutines::_cont_thaw          = generate_cont_thaw();
8514     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8515     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8516 
8517     JFR_ONLY(generate_jfr_stubs();)
8518   }
8519 
8520 #if INCLUDE_JFR
8521   void generate_jfr_stubs() {
8522     StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();
8523     StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();
8524     StubRoutines::_jfr_return_lease_stub = generate_jfr_return_lease();
8525     StubRoutines::_jfr_return_lease = StubRoutines::_jfr_return_lease_stub->entry_point();
8526   }
8527 #endif // INCLUDE_JFR
8528 
8529   void generate_final_stubs() {
8530     // support for verify_oop (must happen after universe_init)
8531     if (VerifyOops) {
8532       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8533     }
8534     StubRoutines::_throw_AbstractMethodError_entry =
8535       generate_throw_exception("AbstractMethodError throw_exception",
8536                                CAST_FROM_FN_PTR(address,
8537                                                 SharedRuntime::
8538                                                 throw_AbstractMethodError));
8539 
8540     StubRoutines::_throw_IncompatibleClassChangeError_entry =
8541       generate_throw_exception("IncompatibleClassChangeError throw_exception",
8542                                CAST_FROM_FN_PTR(address,
8543                                                 SharedRuntime::
8544                                                 throw_IncompatibleClassChangeError));
8545 
8546     StubRoutines::_throw_NullPointerException_at_call_entry =
8547       generate_throw_exception("NullPointerException at call throw_exception",
8548                                CAST_FROM_FN_PTR(address,
8549                                                 SharedRuntime::
8550                                                 throw_NullPointerException_at_call));
8551 
8552     // arraycopy stubs used by compilers
8553     generate_arraycopy_stubs();
8554 
8555     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8556     if (bs_nm != nullptr) {
8557       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
8558     }
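    // A method entry barrier is only generated when the current barrier set
    // provides one; it lets the GC intercept compiled-method entry (for
    // example to keep nmethods in step with concurrent collection phases).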
8559 
8560     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8561 
8562     if (UsePoly1305Intrinsics) {
8563       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
8564     }
8565 
8566 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8567 
8568     generate_atomic_entry_points();
8569 
8570 #endif // LINUX && !__ARM_FEATURE_ATOMICS
8571 
8572     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
8573 
8574     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
8575   }
8576 
8577   void generate_compiler_stubs() {
8578 #if COMPILER2_OR_JVMCI
8579 
8580     if (UseSVE == 0) {
8581       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
8582     }
8583 
8584     // array equals stub for large arrays.
8585     if (!UseSimpleArrayEquals) {
8586       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8587     }
8588 
8589     // byte_array_inflate stub for large arrays.
8590     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8591 
8592     // countPositives stub for large arrays.
8593     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8594 
8595     generate_compare_long_strings();
8596 
8597     generate_string_indexof_stubs();
8598 
8599 #ifdef COMPILER2
8600     if (UseMultiplyToLenIntrinsic) {
8601       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8602     }
8603 
8604     if (UseSquareToLenIntrinsic) {
8605       StubRoutines::_squareToLen = generate_squareToLen();
8606     }
8607 
8608     if (UseMulAddIntrinsic) {
8609       StubRoutines::_mulAdd = generate_mulAdd();
8610     }
8611 
8612     if (UseSIMDForBigIntegerShiftIntrinsics) {
8613       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8614       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8615     }
8616 
8617     if (UseMontgomeryMultiplyIntrinsic) {
8618       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
8619       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8620       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8621     }
8622 
8623     if (UseMontgomerySquareIntrinsic) {
8624       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
8625       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8626       // We use generate_multiply() rather than generate_square()
8627       // because it's faster for the sizes of modulus we care about.
8628       StubRoutines::_montgomerySquare = g.generate_multiply();
8629     }
8630 #endif // COMPILER2
8631 
8632     if (UseChaCha20Intrinsics) {
8633       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
8634     }
8635 
8636     if (UseBASE64Intrinsics) {
8637       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8638       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8639     }
8640 
8641     // data cache line writeback
8642     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8643     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8644 
8645     if (UseAESIntrinsics) {
8646       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8647       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8648       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8649       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8650       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8651     }
8652     if (UseGHASHIntrinsics) {
8653       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8654       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8655     }
8656     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8657       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8658     }
8659 
8660     if (UseMD5Intrinsics) {
8661       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
8662       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
8663     }
8664     if (UseSHA1Intrinsics) {
8665       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
8666       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
8667     }
8668     if (UseSHA256Intrinsics) {
8669       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
8670       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
8671     }
8672     if (UseSHA512Intrinsics) {
8673       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
8674       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
8675     }
8676     if (UseSHA3Intrinsics) {
8677       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
8678       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
8679     }
8680 
8681     // generate Adler32 intrinsics code
8682     if (UseAdler32Intrinsics) {
8683       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8684     }
8685 #endif // COMPILER2_OR_JVMCI
8686   }
8687 
8688  public:
8689   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
8690     switch(kind) {
8691     case Initial_stubs:
8692       generate_initial_stubs();
8693       break;
8694     case Continuation_stubs:
8695       generate_continuation_stubs();
8696       break;
8697     case Compiler_stubs:
8698       generate_compiler_stubs();
8699       break;
8700     case Final_stubs:
8701       generate_final_stubs();
8702       break;
8703     default:
8704       fatal("unexpected stubs kind: %d", kind);
8705       break;
8706     }
8707   }
8708 }; // end class declaration
8709 
8710 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
8711   StubGenerator g(code, kind);
8712 }
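// StubGenerator_generate is expected to be called once for each StubsKind as
// the VM starts up (from the StubRoutines initialization code), each call
// emitting its group of stubs into the supplied CodeBuffer.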
8713 
8714 
8715 #if defined (LINUX)
8716 
8717 // Define pointers to atomic stubs and initialize them to point to the
8718 // code in atomic_aarch64.S.
8719 
8720 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8721   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8722     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8723   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8724     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
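// As an illustration (not generated code), DEFAULT_ATOMIC_OP(fetch_add, 4, )
// expands to roughly:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// i.e. each expansion declares the default implementation provided by
// atomic_aarch64.S and defines a dispatch pointer that
// generate_atomic_entry_points() may later repoint at a generated stub.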
8725 
8726 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8727 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8728 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8729 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8730 DEFAULT_ATOMIC_OP(xchg, 4, )
8731 DEFAULT_ATOMIC_OP(xchg, 8, )
8732 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8733 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8734 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8735 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8736 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8737 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8738 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8739 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8740 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8741 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8742 
8743 #undef DEFAULT_ATOMIC_OP
8744 
8745 #endif // LINUX