1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "prims/upcallLinker.hpp"
  45 #include "runtime/atomic.hpp"
  46 #include "runtime/continuation.hpp"
  47 #include "runtime/continuationEntry.inline.hpp"
  48 #include "runtime/frame.inline.hpp"
  49 #include "runtime/handles.inline.hpp"
  50 #include "runtime/javaThread.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/stubCodeGenerator.hpp"
  53 #include "runtime/stubRoutines.hpp"
  54 #include "utilities/align.hpp"
  55 #include "utilities/checkedCast.hpp"
  56 #include "utilities/globalDefinitions.hpp"
  57 #include "utilities/powerOfTwo.hpp"
  58 #ifdef COMPILER2
  59 #include "opto/runtime.hpp"
  60 #endif
  61 #if INCLUDE_ZGC
  62 #include "gc/z/zThreadLocalData.hpp"
  63 #endif
  64 
  65 // Declaration and definition of StubGenerator (no .hpp file).
  66 // For a more detailed description of the stub routine structure
  67 // see the comment in stubRoutines.hpp
  68 
  69 #undef __
  70 #define __ _masm->
  71 
  72 #ifdef PRODUCT
  73 #define BLOCK_COMMENT(str) /* nothing */
  74 #else
  75 #define BLOCK_COMMENT(str) __ block_comment(str)
  76 #endif
  77 
  78 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  79 
  80 // Stub Code definitions
  81 
  82 class StubGenerator: public StubCodeGenerator {
  83  private:
  84 
  85 #ifdef PRODUCT
  86 #define inc_counter_np(counter) ((void)0)
  87 #else
  88   void inc_counter_np_(uint& counter) {
  89     __ incrementw(ExternalAddress((address)&counter));
  90   }
  91 #define inc_counter_np(counter) \
  92   BLOCK_COMMENT("inc_counter " #counter); \
  93   inc_counter_np_(counter);
  94 #endif
  95 
  96   // Call stubs are used to call Java from C
  97   //
  98   // Arguments:
  99   //    c_rarg0:   call wrapper address                   address
 100   //    c_rarg1:   result                                 address
 101   //    c_rarg2:   result type                            BasicType
 102   //    c_rarg3:   method                                 Method*
 103   //    c_rarg4:   (interpreter) entry point              address
 104   //    c_rarg5:   parameters                             intptr_t*
 105   //    c_rarg6:   parameter size (in words)              int
 106   //    c_rarg7:   thread                                 Thread*
 107   //
 108   // There is no return from the stub itself as any Java result
 109   // is written to result
 110   //
 111   // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, installing sp (r31)
 113   // into fp.
 114   //
 115   // we save r0-r7, which accounts for all the c arguments.
 116   //
 117   // TODO: strictly do we need to save them all? they are treated as
 118   // volatile by C so could we omit saving the ones we are going to
 119   // place in global registers (thread? method?) or those we only use
 120   // during setup of the Java call?
 121   //
 122   // we don't need to save r8 which C uses as an indirect result location
 123   // return register.
 124   //
 125   // we don't need to save r9-r15 which both C and Java treat as
 126   // volatile
 127   //
  // we don't need to save r16-r18 because Java does not use them
 129   //
 130   // we save r19-r28 which Java uses as scratch registers and C
 131   // expects to be callee-save
 132   //
 133   // we save the bottom 64 bits of each value stored in v8-v15; it is
 134   // the responsibility of the caller to preserve larger values.
 135   //
 136   // so the stub frame looks like this when we enter Java code
 137   //
 138   //     [ return_from_Java     ] <--- sp
 139   //     [ argument word n      ]
 140   //      ...
 141   // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ] <--- sp_after_call
  // -26 [ saved v15            ]
 144   // -25 [ saved v14            ]
 145   // -24 [ saved v13            ]
 146   // -23 [ saved v12            ]
 147   // -22 [ saved v11            ]
 148   // -21 [ saved v10            ]
 149   // -20 [ saved v9             ]
 150   // -19 [ saved v8             ]
 151   // -18 [ saved r28            ]
 152   // -17 [ saved r27            ]
 153   // -16 [ saved r26            ]
 154   // -15 [ saved r25            ]
 155   // -14 [ saved r24            ]
 156   // -13 [ saved r23            ]
 157   // -12 [ saved r22            ]
 158   // -11 [ saved r21            ]
 159   // -10 [ saved r20            ]
 160   //  -9 [ saved r19            ]
 161   //  -8 [ call wrapper    (r0) ]
 162   //  -7 [ result          (r1) ]
 163   //  -6 [ result type     (r2) ]
 164   //  -5 [ method          (r3) ]
 165   //  -4 [ entry point     (r4) ]
 166   //  -3 [ parameters      (r5) ]
 167   //  -2 [ parameter size  (r6) ]
 168   //  -1 [ thread (r7)          ]
 169   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 170   //   1 [ saved lr       (r30) ]
 171 
 172   // Call stub stack layout word offsets from fp
 173   enum call_stub_layout {
 174     sp_after_call_off  = -28,
 175 
 176     fpcr_off           = sp_after_call_off,
 177     d15_off            = -26,
 178     d13_off            = -24,
 179     d11_off            = -22,
 180     d9_off             = -20,
 181 
 182     r28_off            = -18,
 183     r26_off            = -16,
 184     r24_off            = -14,
 185     r22_off            = -12,
 186     r20_off            = -10,
 187     call_wrapper_off   =  -8,
 188     result_off         =  -7,
 189     result_type_off    =  -6,
 190     method_off         =  -5,
 191     entry_point_off    =  -4,
 192     parameter_size_off =  -2,
 193     thread_off         =  -1,
 194     fp_f               =   0,
 195     retaddr_off        =   1,
 196   };
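  // Each enum value is a word offset from the saved frame pointer (rfp):
  // for example thread_off == -1 puts the thread slot at rfp - wordSize and
  // fpcr_off == -28 puts the FPCR save slot at rfp - 28 * wordSize. The
  // Address constants in generate_call_stub() below are built from exactly
  // this arithmetic.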
 197 
 198   address generate_call_stub(address& return_address) {
 199     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 200            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 201            "adjust this code");
 202 
 203     StubCodeMark mark(this, "StubRoutines", "call_stub");
 204     address start = __ pc();
 205 
 206     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
 207 
 208     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
 209     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 210     const Address result        (rfp, result_off         * wordSize);
 211     const Address result_type   (rfp, result_type_off    * wordSize);
 212     const Address method        (rfp, method_off         * wordSize);
 213     const Address entry_point   (rfp, entry_point_off    * wordSize);
 214     const Address parameter_size(rfp, parameter_size_off * wordSize);
 215 
 216     const Address thread        (rfp, thread_off         * wordSize);
 217 
 218     const Address d15_save      (rfp, d15_off * wordSize);
 219     const Address d13_save      (rfp, d13_off * wordSize);
 220     const Address d11_save      (rfp, d11_off * wordSize);
 221     const Address d9_save       (rfp, d9_off * wordSize);
 222 
 223     const Address r28_save      (rfp, r28_off * wordSize);
 224     const Address r26_save      (rfp, r26_off * wordSize);
 225     const Address r24_save      (rfp, r24_off * wordSize);
 226     const Address r22_save      (rfp, r22_off * wordSize);
 227     const Address r20_save      (rfp, r20_off * wordSize);
 228 
 229     // stub code
 230 
 231     address aarch64_entry = __ pc();
 232 
 233     // set up frame and move sp to end of save area
 234     __ enter();
 235     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 236 
 237     // save register parameters and Java scratch/global registers
 238     // n.b. we save thread even though it gets installed in
 239     // rthread because we want to sanity check rthread later
 240     __ str(c_rarg7,  thread);
 241     __ strw(c_rarg6, parameter_size);
 242     __ stp(c_rarg4, c_rarg5,  entry_point);
 243     __ stp(c_rarg2, c_rarg3,  result_type);
 244     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 245 
 246     __ stp(r20, r19,   r20_save);
 247     __ stp(r22, r21,   r22_save);
 248     __ stp(r24, r23,   r24_save);
 249     __ stp(r26, r25,   r26_save);
 250     __ stp(r28, r27,   r28_save);
 251 
 252     __ stpd(v9,  v8,   d9_save);
 253     __ stpd(v11, v10,  d11_save);
 254     __ stpd(v13, v12,  d13_save);
 255     __ stpd(v15, v14,  d15_save);
 256 
 257     __ get_fpcr(rscratch1);
 258     __ str(rscratch1, fpcr_save);
 259     // Set FPCR to the state we need. We do want Round to Nearest. We
 260     // don't want non-IEEE rounding modes or floating-point traps.
 261     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
 262     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
 263     __ set_fpcr(rscratch1);
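    // For reference: bits 22-25 of FPCR are RMode/FZ/DN and bits 8-12 are the
    // IOE/DZE/OFE/UFE/IXE trap-enable bits, so after the two bfi instructions
    // above we run with round-to-nearest, flush-to-zero off, default-NaN off
    // and the IEEE exception traps disabled.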
 264 
 265     // install Java thread in global register now we have saved
 266     // whatever value it held
 267     __ mov(rthread, c_rarg7);
 268     // And method
 269     __ mov(rmethod, c_rarg3);
 270 
 271     // set up the heapbase register
 272     __ reinit_heapbase();
 273 
 274 #ifdef ASSERT
 275     // make sure we have no pending exceptions
 276     {
 277       Label L;
 278       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 279       __ cmp(rscratch1, (u1)NULL_WORD);
 280       __ br(Assembler::EQ, L);
 281       __ stop("StubRoutines::call_stub: entered with pending exception");
 282       __ BIND(L);
 283     }
 284 #endif
 285     // pass parameters if any
 286     __ mov(esp, sp);
 287     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 288     __ andr(sp, rscratch1, -2 * wordSize);
 289 
 290     BLOCK_COMMENT("pass parameters if any");
 291     Label parameters_done;
 292     // parameter count is still in c_rarg6
 293     // and parameter pointer identifying param 1 is in c_rarg5
 294     __ cbzw(c_rarg6, parameters_done);
 295 
 296     address loop = __ pc();
 297     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 298     __ subsw(c_rarg6, c_rarg6, 1);
 299     __ push(rscratch1);
 300     __ br(Assembler::GT, loop);
 301 
 302     __ BIND(parameters_done);
 303 
    // call Java entry -- passing Method* and current sp
 305     //      rmethod: Method*
 306     //      r19_sender_sp: sender sp
 307     BLOCK_COMMENT("call Java function");
 308     __ mov(r19_sender_sp, sp);
 309     __ blr(c_rarg4);
 310 
 311     // we do this here because the notify will already have been done
 312     // if we get to the next instruction via an exception
 313     //
 314     // n.b. adding this instruction here affects the calculation of
 315     // whether or not a routine returns to the call stub (used when
 316     // doing stack walks) since the normal test is to check the return
 317     // pc against the address saved below. so we may need to allow for
 318     // this extra instruction in the check.
 319 
 320     // save current address for use by exception handling code
 321 
 322     return_address = __ pc();
 323 
 324     // store result depending on type (everything that is not
 325     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 326     // n.b. this assumes Java returns an integral result in r0
 327     // and a floating result in j_farg0
 328     // All of j_rargN may be used to return inline type fields so be careful
 329     // not to clobber those.
 330     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
 331     // assignment of Rresult below.
 332     Register Rresult = r14, Rresult_type = r15;
 333     __ ldr(Rresult, result);
 334     Label is_long, is_float, is_double, check_prim, exit;
 335     __ ldr(Rresult_type, result_type);
 336     __ cmp(Rresult_type, (u1)T_OBJECT);
 337     __ br(Assembler::EQ, check_prim);
 338     __ cmp(Rresult_type, (u1)T_LONG);
 339     __ br(Assembler::EQ, is_long);
 340     __ cmp(Rresult_type, (u1)T_FLOAT);
 341     __ br(Assembler::EQ, is_float);
 342     __ cmp(Rresult_type, (u1)T_DOUBLE);
 343     __ br(Assembler::EQ, is_double);
 344 
 345     // handle T_INT case
 346     __ strw(r0, Address(Rresult));
 347 
 348     __ BIND(exit);
 349 
 350     // pop parameters
 351     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 352 
 353 #ifdef ASSERT
 354     // verify that threads correspond
 355     {
 356       Label L, S;
 357       __ ldr(rscratch1, thread);
 358       __ cmp(rthread, rscratch1);
 359       __ br(Assembler::NE, S);
 360       __ get_thread(rscratch1);
 361       __ cmp(rthread, rscratch1);
 362       __ br(Assembler::EQ, L);
 363       __ BIND(S);
 364       __ stop("StubRoutines::call_stub: threads must correspond");
 365       __ BIND(L);
 366     }
 367 #endif
 368 
 369     __ pop_cont_fastpath(rthread);
 370 
 371     // restore callee-save registers
 372     __ ldpd(v15, v14,  d15_save);
 373     __ ldpd(v13, v12,  d13_save);
 374     __ ldpd(v11, v10,  d11_save);
 375     __ ldpd(v9,  v8,   d9_save);
 376 
 377     __ ldp(r28, r27,   r28_save);
 378     __ ldp(r26, r25,   r26_save);
 379     __ ldp(r24, r23,   r24_save);
 380     __ ldp(r22, r21,   r22_save);
 381     __ ldp(r20, r19,   r20_save);
 382 
 383     // restore fpcr
 384     __ ldr(rscratch1,  fpcr_save);
 385     __ set_fpcr(rscratch1);
 386 
 387     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 388     __ ldrw(c_rarg2, result_type);
 389     __ ldr(c_rarg3,  method);
 390     __ ldp(c_rarg4, c_rarg5,  entry_point);
 391     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 392 
 393     // leave frame and return to caller
 394     __ leave();
 395     __ ret(lr);
 396 
 397     // handle return types different from T_INT
 398     __ BIND(check_prim);
 399     if (InlineTypeReturnedAsFields) {
 400       // Check for scalarized return value
 401       __ tbz(r0, 0, is_long);
 402       // Load pack handler address
 403       __ andr(rscratch1, r0, -2);
 404       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
 405       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
 406       __ blr(rscratch1);
 407       __ b(exit);
 408     }
 409 
 410     __ BIND(is_long);
 411     __ str(r0, Address(Rresult, 0));
 412     __ br(Assembler::AL, exit);
 413 
 414     __ BIND(is_float);
 415     __ strs(j_farg0, Address(Rresult, 0));
 416     __ br(Assembler::AL, exit);
 417 
 418     __ BIND(is_double);
 419     __ strd(j_farg0, Address(Rresult, 0));
 420     __ br(Assembler::AL, exit);
 421 
 422     return start;
 423   }
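  // The call stub is reached from the VM side through StubRoutines::call_stub()
  // (see JavaCalls::call_helper), which supplies the eight arguments described
  // above. The recorded return_address is what lets stack walking and
  // generate_catch_exception() below recognize and return into this entry frame.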
 424 
 425   // Return point for a Java call if there's an exception thrown in
 426   // Java code.  The exception is caught and transformed into a
 427   // pending exception stored in JavaThread that can be tested from
 428   // within the VM.
 429   //
 430   // Note: Usually the parameters are removed by the callee. In case
 431   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up
  // sp.
 434   //
 435   // r0: exception oop
 436 
 437   address generate_catch_exception() {
 438     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 439     address start = __ pc();
 440 
 441     // same as in generate_call_stub():
 442     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 443     const Address thread        (rfp, thread_off         * wordSize);
 444 
 445 #ifdef ASSERT
 446     // verify that threads correspond
 447     {
 448       Label L, S;
 449       __ ldr(rscratch1, thread);
 450       __ cmp(rthread, rscratch1);
 451       __ br(Assembler::NE, S);
 452       __ get_thread(rscratch1);
 453       __ cmp(rthread, rscratch1);
 454       __ br(Assembler::EQ, L);
 455       __ bind(S);
 456       __ stop("StubRoutines::catch_exception: threads must correspond");
 457       __ bind(L);
 458     }
 459 #endif
 460 
 461     // set pending exception
 462     __ verify_oop(r0);
 463 
 464     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 465     __ mov(rscratch1, (address)__FILE__);
 466     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 467     __ movw(rscratch1, (int)__LINE__);
 468     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 469 
 470     // complete return to VM
 471     assert(StubRoutines::_call_stub_return_address != nullptr,
 472            "_call_stub_return_address must have been generated before");
 473     __ b(StubRoutines::_call_stub_return_address);
 474 
 475     return start;
 476   }
 477 
 478   // Continuation point for runtime calls returning with a pending
 479   // exception.  The pending exception check happened in the runtime
 480   // or native call stub.  The pending exception in Thread is
 481   // converted into a Java-level exception.
 482   //
 483   // Contract with Java-level exception handlers:
 484   // r0: exception
 485   // r3: throwing pc
 486   //
 487   // NOTE: At entry of this stub, exception-pc must be in LR !!
 488 
 489   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 491 
 492   address generate_forward_exception() {
 493     StubCodeMark mark(this, "StubRoutines", "forward exception");
 494     address start = __ pc();
 495 
 496     // Upon entry, LR points to the return address returning into
 497     // Java (interpreted or compiled) code; i.e., the return address
 498     // becomes the throwing pc.
 499     //
 500     // Arguments pushed before the runtime call are still on the stack
 501     // but the exception handler will reset the stack pointer ->
 502     // ignore them.  A potential result in registers can be ignored as
 503     // well.
 504 
 505 #ifdef ASSERT
 506     // make sure this code is only executed if there is a pending exception
 507     {
 508       Label L;
 509       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 510       __ cbnz(rscratch1, L);
 511       __ stop("StubRoutines::forward exception: no pending exception (1)");
 512       __ bind(L);
 513     }
 514 #endif
 515 
 516     // compute exception handler into r19
 517 
 518     // call the VM to find the handler address associated with the
 519     // caller address. pass thread in r0 and caller pc (ret address)
 520     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 521     // the stack.
 522     __ mov(c_rarg1, lr);
 523     // lr will be trashed by the VM call so we move it to R19
 524     // (callee-saved) because we also need to pass it to the handler
 525     // returned by this call.
 526     __ mov(r19, lr);
 527     BLOCK_COMMENT("call exception_handler_for_return_address");
 528     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 529                          SharedRuntime::exception_handler_for_return_address),
 530                     rthread, c_rarg1);
 531     // Reinitialize the ptrue predicate register, in case the external runtime
 532     // call clobbers ptrue reg, as we may return to SVE compiled code.
 533     __ reinitialize_ptrue();
 534 
 535     // we should not really care that lr is no longer the callee
 536     // address. we saved the value the handler needs in r19 so we can
 537     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 539     // the PC for the frame above the handler belongs to a compiled
 540     // Java method. So, we restore lr here to satisfy that assert.
 541     __ mov(lr, r19);
 542     // setup r0 & r3 & clear pending exception
 543     __ mov(r3, r19);
 544     __ mov(r19, r0);
 545     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 546     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 547 
 548 #ifdef ASSERT
 549     // make sure exception is set
 550     {
 551       Label L;
 552       __ cbnz(r0, L);
 553       __ stop("StubRoutines::forward exception: no pending exception (2)");
 554       __ bind(L);
 555     }
 556 #endif
 557 
 558     // continue at exception handler
 559     // r0: exception
 560     // r3: throwing pc
 561     // r19: exception handler
 562     __ verify_oop(r0);
 563     __ br(r19);
 564 
 565     return start;
 566   }
 567 
 568   // Non-destructive plausibility checks for oops
 569   //
 570   // Arguments:
 571   //    r0: oop to verify
 572   //    rscratch1: error message
 573   //
 574   // Stack after saving c_rarg3:
 575   //    [tos + 0]: saved c_rarg3
 576   //    [tos + 1]: saved c_rarg2
 577   //    [tos + 2]: saved lr
 578   //    [tos + 3]: saved rscratch2
 579   //    [tos + 4]: saved r0
 580   //    [tos + 5]: saved rscratch1
 581   address generate_verify_oop() {
 582 
 583     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 584     address start = __ pc();
 585 
 586     Label exit, error;
 587 
 588     // save c_rarg2 and c_rarg3
 589     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 590 
 591     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 592     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 593     __ ldr(c_rarg3, Address(c_rarg2));
 594     __ add(c_rarg3, c_rarg3, 1);
 595     __ str(c_rarg3, Address(c_rarg2));
 596 
 597     // object is in r0
 598     // make sure object is 'reasonable'
 599     __ cbz(r0, exit); // if obj is null it is OK
 600 
 601     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 602     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 603 
 604     // return if everything seems ok
 605     __ bind(exit);
 606 
 607     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 608     __ ret(lr);
 609 
 610     // handle errors
 611     __ bind(error);
 612     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 613 
 614     __ push(RegSet::range(r0, r29), sp);
 615     // debug(char* msg, int64_t pc, int64_t regs[])
 616     __ mov(c_rarg0, rscratch1);      // pass address of error message
 617     __ mov(c_rarg1, lr);             // pass return address
 618     __ mov(c_rarg2, sp);             // pass address of regs on stack
 619 #ifndef PRODUCT
 620     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 621 #endif
 622     BLOCK_COMMENT("call MacroAssembler::debug");
 623     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 624     __ blr(rscratch1);
 625     __ hlt(0);
 626 
 627     return start;
 628   }
 629 
 630   // Generate indices for iota vector.
 631   address generate_iota_indices(const char *stub_name) {
 632     __ align(CodeEntryAlignment);
 633     StubCodeMark mark(this, "StubRoutines", stub_name);
 634     address start = __ pc();
 635     // B
 636     __ emit_data64(0x0706050403020100, relocInfo::none);
 637     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 638     // H
 639     __ emit_data64(0x0003000200010000, relocInfo::none);
 640     __ emit_data64(0x0007000600050004, relocInfo::none);
 641     // S
 642     __ emit_data64(0x0000000100000000, relocInfo::none);
 643     __ emit_data64(0x0000000300000002, relocInfo::none);
 644     // D
 645     __ emit_data64(0x0000000000000000, relocInfo::none);
 646     __ emit_data64(0x0000000000000001, relocInfo::none);
 647     // S - FP
 648     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 649     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 650     // D - FP
 651     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 652     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
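    // n.b. emit_data64 lays each value down little-endian, so the first B
    // entry above (0x0706050403020100) reads as bytes 0, 1, ..., 7 in lane
    // order; each table simply supplies ascending lane indices (or lane
    // values, for the FP variants) at the given element size.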
 653     return start;
 654   }
 655 
 656   // The inner part of zero_words().  This is the bulk operation,
 657   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 658   // caller is responsible for zeroing the last few words.
 659   //
 660   // Inputs:
 661   // r10: the HeapWord-aligned base address of an array to zero.
 662   // r11: the count in HeapWords, r11 > 0.
 663   //
 664   // Returns r10 and r11, adjusted for the caller to clear.
 665   // r10: the base address of the tail of words left to clear.
 666   // r11: the number of words in the tail.
 667   //      r11 < MacroAssembler::zero_words_block_size.
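  //
  // Roughly, the generated code behaves like the following sketch (not the
  // exact instruction sequence):
  //
  //   if (UseBlockZeroing) {
  //     if (base is only 8-byte aligned) { *base++ = 0; cnt--; }
  //     if (cnt >= MAX2(2 * zva_length, BlockZeroingLowLimit) / 8)
  //       zero whole cache blocks with DC ZVA (zero_dcache_blocks);
  //   }
  //   while (cnt >= zero_words_block_size) {
  //     store zero_words_block_size words of zero with unrolled stp;
  //     cnt -= zero_words_block_size;
  //   }
  //   return with r10/r11 describing the remaining tail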
 668 
 669   address generate_zero_blocks() {
 670     Label done;
 671     Label base_aligned;
 672 
 673     Register base = r10, cnt = r11;
 674 
 675     __ align(CodeEntryAlignment);
 676     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 677     address start = __ pc();
 678 
 679     if (UseBlockZeroing) {
 680       int zva_length = VM_Version::zva_length();
 681 
 682       // Ensure ZVA length can be divided by 16. This is required by
 683       // the subsequent operations.
 684       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 685 
 686       __ tbz(base, 3, base_aligned);
 687       __ str(zr, Address(__ post(base, 8)));
 688       __ sub(cnt, cnt, 1);
 689       __ bind(base_aligned);
 690 
 691       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 692       // alignment.
 693       Label small;
 694       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 695       __ subs(rscratch1, cnt, low_limit >> 3);
 696       __ br(Assembler::LT, small);
 697       __ zero_dcache_blocks(base, cnt);
 698       __ bind(small);
 699     }
 700 
 701     {
 702       // Number of stp instructions we'll unroll
 703       const int unroll =
 704         MacroAssembler::zero_words_block_size / 2;
 705       // Clear the remaining blocks.
 706       Label loop;
 707       __ subs(cnt, cnt, unroll * 2);
 708       __ br(Assembler::LT, done);
 709       __ bind(loop);
 710       for (int i = 0; i < unroll; i++)
 711         __ stp(zr, zr, __ post(base, 16));
 712       __ subs(cnt, cnt, unroll * 2);
 713       __ br(Assembler::GE, loop);
 714       __ bind(done);
 715       __ add(cnt, cnt, unroll * 2);
 716     }
 717 
 718     __ ret(lr);
 719 
 720     return start;
 721   }
 722 
 723 
 724   typedef enum {
 725     copy_forwards = 1,
 726     copy_backwards = -1
 727   } copy_direction;
 728 
 729   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 730   // for arraycopy stubs.
 731   class ArrayCopyBarrierSetHelper : StackObj {
 732     BarrierSetAssembler* _bs_asm;
 733     MacroAssembler* _masm;
 734     DecoratorSet _decorators;
 735     BasicType _type;
 736     Register _gct1;
 737     Register _gct2;
 738     Register _gct3;
 739     FloatRegister _gcvt1;
 740     FloatRegister _gcvt2;
 741     FloatRegister _gcvt3;
 742 
 743   public:
 744     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 745                               DecoratorSet decorators,
 746                               BasicType type,
 747                               Register gct1,
 748                               Register gct2,
 749                               Register gct3,
 750                               FloatRegister gcvt1,
 751                               FloatRegister gcvt2,
 752                               FloatRegister gcvt3)
 753       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 754         _masm(masm),
 755         _decorators(decorators),
 756         _type(type),
 757         _gct1(gct1),
 758         _gct2(gct2),
 759         _gct3(gct3),
 760         _gcvt1(gcvt1),
 761         _gcvt2(gcvt2),
 762         _gcvt3(gcvt3) {
 763     }
 764 
 765     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 766       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 767                             dst1, dst2, src,
 768                             _gct1, _gct2, _gcvt1);
 769     }
 770 
 771     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 772       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 773                              dst, src1, src2,
 774                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 775     }
 776 
 777     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 778       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 779                             dst1, dst2, src,
 780                             _gct1);
 781     }
 782 
 783     void copy_store_at_16(Address dst, Register src1, Register src2) {
 784       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 785                              dst, src1, src2,
 786                              _gct1, _gct2, _gct3);
 787     }
 788 
 789     void copy_load_at_8(Register dst, Address src) {
 790       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 791                             dst, noreg, src,
 792                             _gct1);
 793     }
 794 
 795     void copy_store_at_8(Address dst, Register src) {
 796       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 797                              dst, src, noreg,
 798                              _gct1, _gct2, _gct3);
 799     }
 800   };
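  // With no GC-specific barrier in play these helpers should reduce to plain
  // ldr/ldp/stp (ldpq/stpq for the 32-byte forms); the indirection exists so
  // that collectors such as ZGC can interpose on oop loads and stores, using
  // the gct*/gcvt* registers as barrier temporaries.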
 801 
 802   // Bulk copy of blocks of 8 words.
 803   //
 804   // count is a count of words.
 805   //
 806   // Precondition: count >= 8
 807   //
 808   // Postconditions:
 809   //
 810   // The least significant bit of count contains the remaining count
 811   // of words to copy.  The rest of count is trash.
 812   //
 813   // s and d are adjusted to point to the remaining words to copy
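  //
  // The main loop is software-pipelined: 8 words are pre-loaded into
  // t0..t7 (or v0..v3 with UseSIMDForMemoryOps), and each iteration stores
  // the previously loaded block while loading the next one, optionally
  // prefetching ahead with prfm. Once fewer than 8 words remain the loop
  // drains by storing the last pre-loaded block, then copies an optional
  // 4-word and 2-word tail.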
 814   //
 815   void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
 816                            copy_direction direction) {
 817     int unit = wordSize * direction;
 818     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 819 
 820     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 821       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 822     const Register stride = r14;
 823     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 824     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 825     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 826 
 827     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 828     assert_different_registers(s, d, count, rscratch1, rscratch2);
 829 
 830     Label again, drain;
 831     const char *stub_name;
 832     if (direction == copy_forwards)
 833       stub_name = "forward_copy_longs";
 834     else
 835       stub_name = "backward_copy_longs";
 836 
 837     __ align(CodeEntryAlignment);
 838 
 839     StubCodeMark mark(this, "StubRoutines", stub_name);
 840 
 841     __ bind(start);
 842 
 843     Label unaligned_copy_long;
 844     if (AvoidUnalignedAccesses) {
 845       __ tbnz(d, 3, unaligned_copy_long);
 846     }
 847 
 848     if (direction == copy_forwards) {
 849       __ sub(s, s, bias);
 850       __ sub(d, d, bias);
 851     }
 852 
 853 #ifdef ASSERT
 854     // Make sure we are never given < 8 words
 855     {
 856       Label L;
 857       __ cmp(count, (u1)8);
 858       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 860       __ bind(L);
 861     }
 862 #endif
 863 
 864     // Fill 8 registers
 865     if (UseSIMDForMemoryOps) {
 866       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 867       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 868     } else {
 869       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 870       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 871       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 872       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 873     }
 874 
 875     __ subs(count, count, 16);
 876     __ br(Assembler::LO, drain);
 877 
 878     int prefetch = PrefetchCopyIntervalInBytes;
 879     bool use_stride = false;
 880     if (direction == copy_backwards) {
 881        use_stride = prefetch > 256;
 882        prefetch = -prefetch;
 883        if (use_stride) __ mov(stride, prefetch);
 884     }
 885 
 886     __ bind(again);
 887 
 888     if (PrefetchCopyIntervalInBytes > 0)
 889       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 890 
 891     if (UseSIMDForMemoryOps) {
 892       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 893       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 894       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 895       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 896     } else {
 897       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 898       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 899       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 900       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 901       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 902       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 903       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 904       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 905     }
 906 
 907     __ subs(count, count, 8);
 908     __ br(Assembler::HS, again);
 909 
 910     // Drain
 911     __ bind(drain);
 912     if (UseSIMDForMemoryOps) {
 913       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 914       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 915     } else {
 916       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 917       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 918       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 919       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 920     }
 921 
 922     {
 923       Label L1, L2;
 924       __ tbz(count, exact_log2(4), L1);
 925       if (UseSIMDForMemoryOps) {
 926         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 927         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 928       } else {
 929         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 930         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 931         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 932         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 933       }
 934       __ bind(L1);
 935 
 936       if (direction == copy_forwards) {
 937         __ add(s, s, bias);
 938         __ add(d, d, bias);
 939       }
 940 
 941       __ tbz(count, 1, L2);
 942       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 943       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 944       __ bind(L2);
 945     }
 946 
 947     __ ret(lr);
 948 
 949     if (AvoidUnalignedAccesses) {
 950       Label drain, again;
 951       // Register order for storing. Order is different for backward copy.
 952 
 953       __ bind(unaligned_copy_long);
 954 
 955       // source address is even aligned, target odd aligned
 956       //
 957       // when forward copying word pairs we read long pairs at offsets
 958       // {0, 2, 4, 6} (in long words). when backwards copying we read
 959       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 960       // address by -2 in the forwards case so we can compute the
 961       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 962       // or -1.
 963       //
 964       // when forward copying we need to store 1 word, 3 pairs and
 965       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset, we adjust the destination by -1, which means we
 967       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 968       //
      // When backwards copying we need to store 1 word, 3 pairs and
 970       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 971       // offsets {1, 3, 5, 7, 8} * unit.
 972 
 973       if (direction == copy_forwards) {
 974         __ sub(s, s, 16);
 975         __ sub(d, d, 8);
 976       }
 977 
 978       // Fill 8 registers
 979       //
 980       // for forwards copy s was offset by -16 from the original input
 981       // value of s so the register contents are at these offsets
 982       // relative to the 64 bit block addressed by that original input
 983       // and so on for each successive 64 byte block when s is updated
 984       //
 985       // t0 at offset 0,  t1 at offset 8
 986       // t2 at offset 16, t3 at offset 24
 987       // t4 at offset 32, t5 at offset 40
 988       // t6 at offset 48, t7 at offset 56
 989 
 990       // for backwards copy s was not offset so the register contents
 991       // are at these offsets into the preceding 64 byte block
 992       // relative to that original input and so on for each successive
 993       // preceding 64 byte block when s is updated. this explains the
 994       // slightly counter-intuitive looking pattern of register usage
 995       // in the stp instructions for backwards copy.
 996       //
 997       // t0 at offset -16, t1 at offset -8
 998       // t2 at offset -32, t3 at offset -24
 999       // t4 at offset -48, t5 at offset -40
1000       // t6 at offset -64, t7 at offset -56
1001 
1002       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1003       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1004       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1005       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1006 
1007       __ subs(count, count, 16);
1008       __ br(Assembler::LO, drain);
1009 
1010       int prefetch = PrefetchCopyIntervalInBytes;
1011       bool use_stride = false;
1012       if (direction == copy_backwards) {
1013          use_stride = prefetch > 256;
1014          prefetch = -prefetch;
1015          if (use_stride) __ mov(stride, prefetch);
1016       }
1017 
1018       __ bind(again);
1019 
1020       if (PrefetchCopyIntervalInBytes > 0)
1021         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1022 
1023       if (direction == copy_forwards) {
1024        // allowing for the offset of -8 the store instructions place
1025        // registers into the target 64 bit block at the following
1026        // offsets
1027        //
1028        // t0 at offset 0
1029        // t1 at offset 8,  t2 at offset 16
1030        // t3 at offset 24, t4 at offset 32
1031        // t5 at offset 40, t6 at offset 48
1032        // t7 at offset 56
1033 
1034         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1035         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1036         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1037         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1038         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1039         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1040         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1041         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1042         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1043       } else {
1044        // d was not offset when we started so the registers are
1045        // written into the 64 bit block preceding d with the following
1046        // offsets
1047        //
1048        // t1 at offset -8
1049        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
1051        // t7 at offset -56, t4 at offset -48
1052        //                   t6 at offset -64
1053        //
1054        // note that this matches the offsets previously noted for the
1055        // loads
1056 
1057         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1058         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1059         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1060         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1061         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1062         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1063         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1064         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1065         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1066       }
1067 
1068       __ subs(count, count, 8);
1069       __ br(Assembler::HS, again);
1070 
1071       // Drain
1072       //
1073       // this uses the same pattern of offsets and register arguments
1074       // as above
1075       __ bind(drain);
1076       if (direction == copy_forwards) {
1077         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1078         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1079         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1080         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1081         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1082       } else {
1083         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1084         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1085         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1086         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1087         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1088       }
1089       // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
1091       // bits 2 and 1 in the count are the tell-tale for whether we
1092       // have each such subblock
1093       {
1094         Label L1, L2;
1095         __ tbz(count, exact_log2(4), L1);
1096        // this is the same as above but copying only 4 longs hence
1097        // with only one intervening stp between the str instructions
1098        // but note that the offsets and registers still follow the
1099        // same pattern
1100         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1101         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1102         if (direction == copy_forwards) {
1103           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1104           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1105           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1106         } else {
1107           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1108           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1109           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1110         }
1111         __ bind(L1);
1112 
1113         __ tbz(count, 1, L2);
1114        // this is the same as above but copying only 2 longs hence
1115        // there is no intervening stp between the str instructions
1116        // but note that the offset and register patterns are still
1117        // the same
1118         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1119         if (direction == copy_forwards) {
1120           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1121           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1122         } else {
1123           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1124           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1125         }
1126         __ bind(L2);
1127 
1128        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1130 
1131        if (direction == copy_forwards) {
1132          __ add(s, s, 16);
1133          __ add(d, d, 8);
1134        }
1135 
1136       }
1137 
1138       __ ret(lr);
1139       }
1140   }
1141 
1142   // Small copy: less than 16 bytes.
1143   //
1144   // NB: Ignores all of the bits of count which represent more than 15
1145   // bytes, so a caller doesn't have to mask them.
1146 
1147   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1148     bool is_backwards = step < 0;
1149     size_t granularity = uabs(step);
1150     int direction = is_backwards ? -1 : 1;
1151 
1152     Label Lword, Lint, Lshort, Lbyte;
1153 
1154     assert(granularity
1155            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1156 
1157     const Register t0 = r3;
1158     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1159     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1160 
1161     // ??? I don't know if this bit-test-and-branch is the right thing
1162     // to do.  It does a lot of jumping, resulting in several
1163     // mispredicted branches.  It might make more sense to do this
1164     // with something like Duff's device with a single computed branch.
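    // For example, a byte copy (granularity == 1) with count == 13 (0b1101)
    // copies 8 + 4 + 1 bytes below; an int copy (granularity == 4) only
    // examines bits 1 and 0 of count, selecting an optional 2-int and
    // 1-int copy.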
1165 
1166     __ tbz(count, 3 - exact_log2(granularity), Lword);
1167     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1168     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1169     __ bind(Lword);
1170 
1171     if (granularity <= sizeof (jint)) {
1172       __ tbz(count, 2 - exact_log2(granularity), Lint);
1173       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1174       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1175       __ bind(Lint);
1176     }
1177 
1178     if (granularity <= sizeof (jshort)) {
1179       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1180       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1181       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1182       __ bind(Lshort);
1183     }
1184 
1185     if (granularity <= sizeof (jbyte)) {
1186       __ tbz(count, 0, Lbyte);
1187       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1188       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1189       __ bind(Lbyte);
1190     }
1191   }
1192 
1193   Label copy_f, copy_b;
1194   Label copy_obj_f, copy_obj_b;
1195   Label copy_obj_uninit_f, copy_obj_uninit_b;
1196 
1197   // All-singing all-dancing memory copy.
1198   //
1199   // Copy count units of memory from s to d.  The size of a unit is
1200   // step, which can be positive or negative depending on the direction
1201   // of copy.  If is_aligned is false, we align the source address.
1202   //
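  // The strategy is, roughly:
  //  - copies of at most 80 bytes (96 with UseSIMDForMemoryOps) are done
  //    inline with (possibly overlapping) loads followed by stores;
  //  - larger copies align the source to 16 bytes, call one of the
  //    copy_f/copy_b bulk stubs (or their object variants) to move 8-word
  //    blocks, and finish the tail with copy_memory_small().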
1203 
1204   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1205                    Register s, Register d, Register count, int step) {
1206     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1207     bool is_backwards = step < 0;
1208     unsigned int granularity = uabs(step);
1209     const Register t0 = r3, t1 = r4;
1210 
    // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't
    // matter because we always load all the data before writing anything.
1213     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1214     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1215     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1216     const Register send = r17, dend = r16;
1217     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1218     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1219     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1220 
1221     if (PrefetchCopyIntervalInBytes > 0)
1222       __ prfm(Address(s, 0), PLDL1KEEP);
1223     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1224     __ br(Assembler::HI, copy_big);
1225 
1226     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1227     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1228 
1229     __ cmp(count, u1(16/granularity));
1230     __ br(Assembler::LS, copy16);
1231 
1232     __ cmp(count, u1(64/granularity));
1233     __ br(Assembler::HI, copy80);
1234 
1235     __ cmp(count, u1(32/granularity));
1236     __ br(Assembler::LS, copy32);
1237 
1238     // 33..64 bytes
1239     if (UseSIMDForMemoryOps) {
1240       bs.copy_load_at_32(v0, v1, Address(s, 0));
1241       bs.copy_load_at_32(v2, v3, Address(send, -32));
1242       bs.copy_store_at_32(Address(d, 0), v0, v1);
1243       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1244     } else {
1245       bs.copy_load_at_16(t0, t1, Address(s, 0));
1246       bs.copy_load_at_16(t2, t3, Address(s, 16));
1247       bs.copy_load_at_16(t4, t5, Address(send, -32));
1248       bs.copy_load_at_16(t6, t7, Address(send, -16));
1249 
1250       bs.copy_store_at_16(Address(d, 0), t0, t1);
1251       bs.copy_store_at_16(Address(d, 16), t2, t3);
1252       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1253       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1254     }
1255     __ b(finish);
1256 
1257     // 17..32 bytes
1258     __ bind(copy32);
1259     bs.copy_load_at_16(t0, t1, Address(s, 0));
1260     bs.copy_load_at_16(t6, t7, Address(send, -16));
1261 
1262     bs.copy_store_at_16(Address(d, 0), t0, t1);
1263     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1264     __ b(finish);
1265 
1266     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1268     __ bind(copy80);
1269     if (UseSIMDForMemoryOps) {
1270       bs.copy_load_at_32(v0, v1, Address(s, 0));
1271       bs.copy_load_at_32(v2, v3, Address(s, 32));
1272       // Unaligned pointers can be an issue for copying.
      // The issue is more likely to arise when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1275       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1276       // The most performance drop has been seen for the range 65-80 bytes.
1277       // For such cases using the pair of ldp/stp instead of the third pair of
1278       // ldpq/stpq fixes the performance issue.
1279       if (granularity < sizeof (jint)) {
1280         Label copy96;
1281         __ cmp(count, u1(80/granularity));
1282         __ br(Assembler::HI, copy96);
1283         bs.copy_load_at_16(t0, t1, Address(send, -16));
1284 
1285         bs.copy_store_at_32(Address(d, 0), v0, v1);
1286         bs.copy_store_at_32(Address(d, 32), v2, v3);
1287 
1288         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1289         __ b(finish);
1290 
1291         __ bind(copy96);
1292       }
1293       bs.copy_load_at_32(v4, v5, Address(send, -32));
1294 
1295       bs.copy_store_at_32(Address(d, 0), v0, v1);
1296       bs.copy_store_at_32(Address(d, 32), v2, v3);
1297 
1298       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1299     } else {
1300       bs.copy_load_at_16(t0, t1, Address(s, 0));
1301       bs.copy_load_at_16(t2, t3, Address(s, 16));
1302       bs.copy_load_at_16(t4, t5, Address(s, 32));
1303       bs.copy_load_at_16(t6, t7, Address(s, 48));
1304       bs.copy_load_at_16(t8, t9, Address(send, -16));
1305 
1306       bs.copy_store_at_16(Address(d, 0), t0, t1);
1307       bs.copy_store_at_16(Address(d, 16), t2, t3);
1308       bs.copy_store_at_16(Address(d, 32), t4, t5);
1309       bs.copy_store_at_16(Address(d, 48), t6, t7);
1310       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1311     }
1312     __ b(finish);
1313 
1314     // 0..16 bytes
1315     __ bind(copy16);
1316     __ cmp(count, u1(8/granularity));
1317     __ br(Assembler::LO, copy8);
1318 
1319     // 8..16 bytes
1320     bs.copy_load_at_8(t0, Address(s, 0));
1321     bs.copy_load_at_8(t1, Address(send, -8));
1322     bs.copy_store_at_8(Address(d, 0), t0);
1323     bs.copy_store_at_8(Address(dend, -8), t1);
1324     __ b(finish);
1325 
1326     if (granularity < 8) {
1327       // 4..7 bytes
1328       __ bind(copy8);
1329       __ tbz(count, 2 - exact_log2(granularity), copy4);
1330       __ ldrw(t0, Address(s, 0));
1331       __ ldrw(t1, Address(send, -4));
1332       __ strw(t0, Address(d, 0));
1333       __ strw(t1, Address(dend, -4));
1334       __ b(finish);
1335       if (granularity < 4) {
1336         // 0..3 bytes
1337         __ bind(copy4);
1338         __ cbz(count, finish); // get rid of 0 case
1339         if (granularity == 2) {
1340           __ ldrh(t0, Address(s, 0));
1341           __ strh(t0, Address(d, 0));
1342         } else { // granularity == 1
1343           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1344           // the first and last byte.
1345           // Handle the 3 byte case by loading and storing base + count/2
1346           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the same
1348           // byte 3 times.
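          // For example: count == 3 loads s[0], s[2] and s[1] and stores
          // them to d[0], d[2] and d[1]; count == 2 copies bytes 0 and 1
          // (writing byte 1 twice); count == 1 copies byte 0 three times.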
1349           __ lsr(count, count, 1);
1350           __ ldrb(t0, Address(s, 0));
1351           __ ldrb(t1, Address(send, -1));
1352           __ ldrb(t2, Address(s, count));
1353           __ strb(t0, Address(d, 0));
1354           __ strb(t1, Address(dend, -1));
1355           __ strb(t2, Address(d, count));
1356         }
1357         __ b(finish);
1358       }
1359     }
1360 
1361     __ bind(copy_big);
1362     if (is_backwards) {
1363       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1364       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1365     }
1366 
    // Now that we've got the small case out of the way, we can align the
1368     // source address on a 2-word boundary.
1369 
1370     // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for the 2-word-aligned bulk copy.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // cannot be used as a temp register, as it contains the count.
1374 
1375     Label aligned;
1376 
1377     if (is_aligned) {
1378       // We may have to adjust by 1 word to get s 2-word-aligned.
1379       __ tbz(s, exact_log2(wordSize), aligned);
1380       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1381       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1382       __ sub(count, count, wordSize/granularity);
1383     } else {
1384       if (is_backwards) {
1385         __ andr(r15, s, 2 * wordSize - 1);
1386       } else {
1387         __ neg(r15, s);
1388         __ andr(r15, r15, 2 * wordSize - 1);
1389       }
1390       // r15 is the byte adjustment needed to align s.
1391       __ cbz(r15, aligned);
1392       int shift = exact_log2(granularity);
1393       if (shift)  __ lsr(r15, r15, shift);
1394       __ sub(count, count, r15);
1395 
1396 #if 0
1397       // ?? This code is only correct for a disjoint copy.  It may or
1398       // may not make sense to use it in that case.
1399 
1400       // Copy the first pair; s and d may not be aligned.
1401       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1402       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1403 
1404       // Align s and d, adjust count
1405       if (is_backwards) {
1406         __ sub(s, s, r15);
1407         __ sub(d, d, r15);
1408       } else {
1409         __ add(s, s, r15);
1410         __ add(d, d, r15);
1411       }
1412 #else
1413       copy_memory_small(decorators, type, s, d, r15, step);
1414 #endif
1415     }
1416 
1417     __ bind(aligned);
1418 
1419     // s is now 2-word-aligned.
1420 
1421     // We have a count of units and some trailing bytes.  Adjust the
1422     // count and do a bulk copy of words.
1423     __ lsr(r15, count, exact_log2(wordSize/granularity));
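    // For example (illustrative): with jshort elements (granularity == 2) and
    // count == 37, r15 = 37 >> 2 == 9 eight-byte words go to the bulk stub;
    // the remaining 37 - 9*4 == 1 element is left for the tail copy below.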
1424     if (direction == copy_forwards) {
1425       if (type != T_OBJECT) {
1426         __ bl(copy_f);
1427       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1428         __ bl(copy_obj_uninit_f);
1429       } else {
1430         __ bl(copy_obj_f);
1431       }
1432     } else {
1433       if (type != T_OBJECT) {
1434         __ bl(copy_b);
1435       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1436         __ bl(copy_obj_uninit_b);
1437       } else {
1438         __ bl(copy_obj_b);
1439       }
1440     }
1441 
1442     // And the tail.
1443     copy_memory_small(decorators, type, s, d, count, step);
1444 
1445     if (granularity >= 8) __ bind(copy8);
1446     if (granularity >= 4) __ bind(copy4);
1447     __ bind(finish);
1448   }
1449 
1450 
1451   void clobber_registers() {
1452 #ifdef ASSERT
1453     RegSet clobbered
1454       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1455     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1456     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1457     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1458       __ mov(*it, rscratch1);
1459     }
1460 #endif
1461 
1462   }
1463 
1464   // Scan over array at a for count oops, verifying each one.
1465   // Preserves a and count, clobbers rscratch1 and rscratch2.
1466   void verify_oop_array (int size, Register a, Register count, Register temp) {
1467     Label loop, end;
1468     __ mov(rscratch1, a);
1469     __ mov(rscratch2, zr);
1470     __ bind(loop);
1471     __ cmp(rscratch2, count);
1472     __ br(Assembler::HS, end);
1473     if (size == wordSize) {
1474       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1475       __ verify_oop(temp);
1476     } else {
1477       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1478       __ decode_heap_oop(temp); // calls verify_oop
1479     }
1480     __ add(rscratch2, rscratch2, 1);
1481     __ b(loop);
1482     __ bind(end);
1483   }
1484 
1485   // Arguments:
1486   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1487   //             ignored
1488   //   is_oop  - true => oop array, so generate store check code
1489   //   name    - stub name string
1490   //
1491   // Inputs:
1492   //   c_rarg0   - source array address
1493   //   c_rarg1   - destination array address
1494   //   c_rarg2   - element count, treated as ssize_t, can be zero
1495   //
1496   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1497   // the hardware handle it.  The two dwords within qwords that span
1498   // cache line boundaries will still be loaded and stored atomically.
1499   //
1500   // Side Effects:
  //   if 'entry' is non-null, it is set to the no-overlap entry point
  //   used by the corresponding conjoint copy stub.
1503   //
1504   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1505                                   const char *name, bool dest_uninitialized = false) {
1506     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1507     RegSet saved_reg = RegSet::of(s, d, count);
1508     __ align(CodeEntryAlignment);
1509     StubCodeMark mark(this, "StubRoutines", name);
1510     address start = __ pc();
1511     __ enter();
1512 
1513     if (entry != nullptr) {
1514       *entry = __ pc();
1515       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1516       BLOCK_COMMENT("Entry:");
1517     }
1518 
1519     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1520     if (dest_uninitialized) {
1521       decorators |= IS_DEST_UNINITIALIZED;
1522     }
1523     if (aligned) {
1524       decorators |= ARRAYCOPY_ALIGNED;
1525     }
1526 
1527     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1528     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1529 
1530     if (is_oop) {
1531       // save regs before copy_memory
1532       __ push(RegSet::of(d, count), sp);
1533     }
1534     {
1535       // UnsafeMemoryAccess page error: continue after unsafe access
1536       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1537       UnsafeMemoryAccessMark umam(this, add_entry, true);
1538       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1539     }
1540 
1541     if (is_oop) {
1542       __ pop(RegSet::of(d, count), sp);
1543       if (VerifyOops)
1544         verify_oop_array(size, d, count, r16);
1545     }
1546 
1547     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1548 
1549     __ leave();
1550     __ mov(r0, zr); // return 0
1551     __ ret(lr);
1552     return start;
1553   }
1554 
1555   // Arguments:
1556   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1557   //             ignored
1558   //   is_oop  - true => oop array, so generate store check code
1559   //   name    - stub name string
1560   //
1561   // Inputs:
1562   //   c_rarg0   - source array address
1563   //   c_rarg1   - destination array address
1564   //   c_rarg2   - element count, treated as ssize_t, can be zero
1565   //
1566   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1567   // the hardware handle it.  The two dwords within qwords that span
1568   // cache line boundaries will still be loaded and stored atomically.
1569   //
1570   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1571                                  address *entry, const char *name,
1572                                  bool dest_uninitialized = false) {
1573     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1574     RegSet saved_regs = RegSet::of(s, d, count);
1575     StubCodeMark mark(this, "StubRoutines", name);
1576     address start = __ pc();
1577     __ enter();
1578 
1579     if (entry != nullptr) {
1580       *entry = __ pc();
1581       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1582       BLOCK_COMMENT("Entry:");
1583     }
1584 
1585     // use fwd copy when (d-s) above_equal (count*size)
1586     __ sub(rscratch1, d, s);
1587     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1588     __ br(Assembler::HS, nooverlap_target);
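    // For example (illustrative): copying 8 jints (size == 4), the branch is
    // taken when d - s >= 32 bytes, i.e. the destination starts at or beyond
    // the end of the source; the unsigned compare also routes d < s (which is
    // always safe to copy forwards) to the no-overlap entry.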
1589 
1590     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1591     if (dest_uninitialized) {
1592       decorators |= IS_DEST_UNINITIALIZED;
1593     }
1594     if (aligned) {
1595       decorators |= ARRAYCOPY_ALIGNED;
1596     }
1597 
1598     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1599     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1600 
1601     if (is_oop) {
1602       // save regs before copy_memory
1603       __ push(RegSet::of(d, count), sp);
1604     }
1605     {
1606       // UnsafeMemoryAccess page error: continue after unsafe access
1607       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1608       UnsafeMemoryAccessMark umam(this, add_entry, true);
1609       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1610     }
1611     if (is_oop) {
1612       __ pop(RegSet::of(d, count), sp);
1613       if (VerifyOops)
1614         verify_oop_array(size, d, count, r16);
1615     }
1616     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1617     __ leave();
1618     __ mov(r0, zr); // return 0
1619     __ ret(lr);
1620     return start;
1621 }
1622 
1623   // Arguments:
1624   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1625   //             ignored
1626   //   name    - stub name string
1627   //
1628   // Inputs:
1629   //   c_rarg0   - source array address
1630   //   c_rarg1   - destination array address
1631   //   c_rarg2   - element count, treated as ssize_t, can be zero
1632   //
1633   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1634   // we let the hardware handle it.  The one to eight bytes within words,
1635   // dwords or qwords that span cache line boundaries will still be loaded
1636   // and stored atomically.
1637   //
1638   // Side Effects:
1646   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1647   //   used by generate_conjoint_byte_copy().
1648   //
1649   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1650     const bool not_oop = false;
1651     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1652   }
1653 
1654   // Arguments:
1655   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1656   //             ignored
1657   //   name    - stub name string
1658   //
1659   // Inputs:
1660   //   c_rarg0   - source array address
1661   //   c_rarg1   - destination array address
1662   //   c_rarg2   - element count, treated as ssize_t, can be zero
1663   //
1664   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1665   // we let the hardware handle it.  The one to eight bytes within words,
1666   // dwords or qwords that span cache line boundaries will still be loaded
1667   // and stored atomically.
1668   //
1669   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1670                                       address* entry, const char *name) {
1671     const bool not_oop = false;
1672     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1673   }
1674 
1675   // Arguments:
1676   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1677   //             ignored
1678   //   name    - stub name string
1679   //
1680   // Inputs:
1681   //   c_rarg0   - source array address
1682   //   c_rarg1   - destination array address
1683   //   c_rarg2   - element count, treated as ssize_t, can be zero
1684   //
1685   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1686   // let the hardware handle it.  The two or four words within dwords
1687   // or qwords that span cache line boundaries will still be loaded
1688   // and stored atomically.
1689   //
1690   // Side Effects:
1691   //   disjoint_short_copy_entry is set to the no-overlap entry point
1692   //   used by generate_conjoint_short_copy().
1693   //
1694   address generate_disjoint_short_copy(bool aligned,
1695                                        address* entry, const char *name) {
1696     const bool not_oop = false;
1697     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1698   }
1699 
1700   // Arguments:
1701   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1702   //             ignored
1703   //   name    - stub name string
1704   //
1705   // Inputs:
1706   //   c_rarg0   - source array address
1707   //   c_rarg1   - destination array address
1708   //   c_rarg2   - element count, treated as ssize_t, can be zero
1709   //
1710   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1711   // let the hardware handle it.  The two or four words within dwords
1712   // or qwords that span cache line boundaries will still be loaded
1713   // and stored atomically.
1714   //
1715   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1716                                        address *entry, const char *name) {
1717     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1722   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1723   //             ignored
1724   //   name    - stub name string
1725   //
1726   // Inputs:
1727   //   c_rarg0   - source array address
1728   //   c_rarg1   - destination array address
1729   //   c_rarg2   - element count, treated as ssize_t, can be zero
1730   //
1731   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1732   // the hardware handle it.  The two dwords within qwords that span
1733   // cache line boundaries will still be loaded and stored atomically.
1734   //
1735   // Side Effects:
1736   //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_copy().
1738   //
1739   address generate_disjoint_int_copy(bool aligned, address *entry,
1740                                          const char *name, bool dest_uninitialized = false) {
1741     const bool not_oop = false;
1742     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1743   }
1744 
1745   // Arguments:
1746   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1747   //             ignored
1748   //   name    - stub name string
1749   //
1750   // Inputs:
1751   //   c_rarg0   - source array address
1752   //   c_rarg1   - destination array address
1753   //   c_rarg2   - element count, treated as ssize_t, can be zero
1754   //
1755   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1756   // the hardware handle it.  The two dwords within qwords that span
1757   // cache line boundaries will still be loaded and stored atomically.
1758   //
1759   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1760                                      address *entry, const char *name,
1761                                      bool dest_uninitialized = false) {
1762     const bool not_oop = false;
1763     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1764   }
1765 
1766 
1767   // Arguments:
1768   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1769   //             ignored
1770   //   name    - stub name string
1771   //
1772   // Inputs:
1773   //   c_rarg0   - source array address
1774   //   c_rarg1   - destination array address
1775   //   c_rarg2   - element count, treated as size_t, can be zero
1776   //
1777   // Side Effects:
  //   disjoint_long_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_long_copy().
1780   //
1781   address generate_disjoint_long_copy(bool aligned, address *entry,
1782                                           const char *name, bool dest_uninitialized = false) {
1783     const bool not_oop = false;
1784     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1785   }
1786 
1787   // Arguments:
1788   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1789   //             ignored
1790   //   name    - stub name string
1791   //
1792   // Inputs:
1793   //   c_rarg0   - source array address
1794   //   c_rarg1   - destination array address
1795   //   c_rarg2   - element count, treated as size_t, can be zero
1796   //
1797   address generate_conjoint_long_copy(bool aligned,
1798                                       address nooverlap_target, address *entry,
1799                                       const char *name, bool dest_uninitialized = false) {
1800     const bool not_oop = false;
1801     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1802   }
1803 
1804   // Arguments:
1805   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1806   //             ignored
1807   //   name    - stub name string
1808   //
1809   // Inputs:
1810   //   c_rarg0   - source array address
1811   //   c_rarg1   - destination array address
1812   //   c_rarg2   - element count, treated as size_t, can be zero
1813   //
1814   // Side Effects:
  //   disjoint_oop_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_oop_copy().
1817   //
1818   address generate_disjoint_oop_copy(bool aligned, address *entry,
1819                                      const char *name, bool dest_uninitialized) {
1820     const bool is_oop = true;
1821     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1822     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1823   }
1824 
1825   // Arguments:
1826   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1827   //             ignored
1828   //   name    - stub name string
1829   //
1830   // Inputs:
1831   //   c_rarg0   - source array address
1832   //   c_rarg1   - destination array address
1833   //   c_rarg2   - element count, treated as size_t, can be zero
1834   //
1835   address generate_conjoint_oop_copy(bool aligned,
1836                                      address nooverlap_target, address *entry,
1837                                      const char *name, bool dest_uninitialized) {
1838     const bool is_oop = true;
1839     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1840     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1841                                   name, dest_uninitialized);
1842   }
1843 
1844 
1845   // Helper for generating a dynamic type check.
1846   // Smashes rscratch1, rscratch2.
1847   void generate_type_check(Register sub_klass,
1848                            Register super_check_offset,
1849                            Register super_klass,
1850                            Label& L_success) {
1851     assert_different_registers(sub_klass, super_check_offset, super_klass);
1852 
1853     BLOCK_COMMENT("type_check:");
1854 
1855     Label L_miss;
1856 
1857     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1858                                      super_check_offset);
1859     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
1860 
1861     // Fall through on failure!
1862     __ BIND(L_miss);
1863   }
1864 
1865   //
1866   //  Generate checkcasting array copy stub
1867   //
1868   //  Input:
1869   //    c_rarg0   - source array address
1870   //    c_rarg1   - destination array address
1871   //    c_rarg2   - element count, treated as ssize_t, can be zero
1872   //    c_rarg3   - size_t ckoff (super_check_offset)
1873   //    c_rarg4   - oop ckval (super_klass)
1874   //
1875   //  Output:
1876   //    r0 ==  0  -  success
1877   //    r0 == -1^K - failure, where K is partial transfer count
1878   //
1879   address generate_checkcast_copy(const char *name, address *entry,
1880                                   bool dest_uninitialized = false) {
1881 
1882     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1883 
1884     // Input registers (after setup_arg_regs)
1885     const Register from        = c_rarg0;   // source array address
1886     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1888     const Register ckoff       = c_rarg3;   // super_check_offset
1889     const Register ckval       = c_rarg4;   // super_klass
1890 
1891     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1892     RegSet wb_post_saved_regs = RegSet::of(count);
1893 
1894     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1895     const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
1897     const Register start_to    = r20;       // destination array start address
1898     const Register r19_klass   = r19;       // oop._klass
1899 
1900     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1901     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1902 
1903     //---------------------------------------------------------------
1904     // Assembler stub will be used for this call to arraycopy
1905     // if the two arrays are subtypes of Object[] but the
1906     // destination array type is not equal to or a supertype
1907     // of the source type.  Each element must be separately
1908     // checked.
1909 
1910     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1911                                copied_oop, r19_klass, count_save);
1912 
1913     __ align(CodeEntryAlignment);
1914     StubCodeMark mark(this, "StubRoutines", name);
1915     address start = __ pc();
1916 
1917     __ enter(); // required for proper stackwalking of RuntimeStub frame
1918 
1919 #ifdef ASSERT
1920     // caller guarantees that the arrays really are different
1921     // otherwise, we would have to make conjoint checks
1922     { Label L;
1923       __ b(L);                  // conjoint check not yet implemented
1924       __ stop("checkcast_copy within a single array");
1925       __ bind(L);
1926     }
1927 #endif //ASSERT
1928 
1929     // Caller of this entry point must set up the argument registers.
1930     if (entry != nullptr) {
1931       *entry = __ pc();
1932       BLOCK_COMMENT("Entry:");
1933     }
1934 
    // Empty array:  Nothing to do.
1936     __ cbz(count, L_done);
1937     __ push(RegSet::of(r19, r20, r21, r22), sp);
1938 
1939 #ifdef ASSERT
1940     BLOCK_COMMENT("assert consistent ckoff/ckval");
1941     // The ckoff and ckval must be mutually consistent,
1942     // even though caller generates both.
1943     { Label L;
1944       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1945       __ ldrw(start_to, Address(ckval, sco_offset));
1946       __ cmpw(ckoff, start_to);
1947       __ br(Assembler::EQ, L);
1948       __ stop("super_check_offset inconsistent");
1949       __ bind(L);
1950     }
1951 #endif //ASSERT
1952 
1953     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1954     bool is_oop = true;
1955     int element_size = UseCompressedOops ? 4 : 8;
1956     if (dest_uninitialized) {
1957       decorators |= IS_DEST_UNINITIALIZED;
1958     }
1959 
1960     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1961     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1962 
1963     // save the original count
1964     __ mov(count_save, count);
1965 
1966     // Copy from low to high addresses
1967     __ mov(start_to, to);              // Save destination array start address
1968     __ b(L_load_element);
1969 
1970     // ======== begin loop ========
1971     // (Loop is rotated; its entry is L_load_element.)
1972     // Loop control:
1973     //   for (; count != 0; count--) {
1974     //     copied_oop = load_heap_oop(from++);
1975     //     ... generate_type_check ...;
1976     //     store_heap_oop(to++, copied_oop);
1977     //   }
1978     __ align(OptoLoopAlignment);
1979 
1980     __ BIND(L_store_element);
1981     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1982                       __ post(to, element_size), copied_oop, noreg,
1983                       gct1, gct2, gct3);
1984     __ sub(count, count, 1);
1985     __ cbz(count, L_do_card_marks);
1986 
1987     // ======== loop entry is here ========
1988     __ BIND(L_load_element);
1989     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1990                      copied_oop, noreg, __ post(from, element_size),
1991                      gct1);
1992     __ cbz(copied_oop, L_store_element);
1993 
1994     __ load_klass(r19_klass, copied_oop);// query the object klass
1995     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1996     // ======== end loop ========
1997 
1998     // It was a real error; we must depend on the caller to finish the job.
1999     // Register count = remaining oops, count_orig = total oops.
2000     // Emit GC store barriers for the oops we have copied and report
2001     // their number to the caller.
2002 
2003     __ subs(count, count_save, count);     // K = partially copied oop count
2004     __ eon(count, count, zr);                   // report (-1^K) to caller
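    // For example (illustrative): if K == 3 oops were copied before the failure,
    // eon yields r0 == ~3 == -4 == -1^3 and the caller recovers K as ~r0. The EQ
    // branch below uses the flags set by the subs above (K == 0 => skip card marks).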
2005     __ br(Assembler::EQ, L_done_pop);
2006 
2007     __ BIND(L_do_card_marks);
2008     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
2009 
2010     __ bind(L_done_pop);
2011     __ pop(RegSet::of(r19, r20, r21, r22), sp);
2012     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2013 
2014     __ bind(L_done);
2015     __ mov(r0, count);
2016     __ leave();
2017     __ ret(lr);
2018 
2019     return start;
2020   }
2021 
2022   // Perform range checks on the proposed arraycopy.
2023   // Kills temp, but nothing else.
2024   // Also, clean the sign bits of src_pos and dst_pos.
2025   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2026                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
2028                               Register dst_pos, // destination position (c_rarg3)
2029                               Register length,
2030                               Register temp,
2031                               Label& L_failed) {
2032     BLOCK_COMMENT("arraycopy_range_checks:");
2033 
2034     assert_different_registers(rscratch1, temp);
2035 
2036     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2037     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2038     __ addw(temp, length, src_pos);
2039     __ cmpw(temp, rscratch1);
2040     __ br(Assembler::HI, L_failed);
2041 
2042     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2043     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2044     __ addw(temp, length, dst_pos);
2045     __ cmpw(temp, rscratch1);
2046     __ br(Assembler::HI, L_failed);
2047 
2048     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2049     __ movw(src_pos, src_pos);
2050     __ movw(dst_pos, dst_pos);
2051 
2052     BLOCK_COMMENT("arraycopy_range_checks done");
2053   }
2054 
2055   // These stubs get called from some dumb test routine.
2056   // I'll write them properly when they're called from
2057   // something that's actually doing something.
2058   static void fake_arraycopy_stub(address src, address dst, int count) {
2059     assert(count == 0, "huh?");
2060   }
2061 
2062 
2063   //
2064   //  Generate 'unsafe' array copy stub
2065   //  Though just as safe as the other stubs, it takes an unscaled
2066   //  size_t argument instead of an element count.
2067   //
2068   //  Input:
2069   //    c_rarg0   - source array address
2070   //    c_rarg1   - destination array address
2071   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2072   //
2073   // Examines the alignment of the operands and dispatches
2074   // to a long, int, short, or byte copy loop.
2075   //
2076   address generate_unsafe_copy(const char *name,
2077                                address byte_copy_entry,
2078                                address short_copy_entry,
2079                                address int_copy_entry,
2080                                address long_copy_entry) {
2081     Label L_long_aligned, L_int_aligned, L_short_aligned;
2082     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2083 
2084     __ align(CodeEntryAlignment);
2085     StubCodeMark mark(this, "StubRoutines", name);
2086     address start = __ pc();
2087     __ enter(); // required for proper stackwalking of RuntimeStub frame
2088 
2089     // bump this on entry, not on exit:
2090     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2091 
2092     __ orr(rscratch1, s, d);
2093     __ orr(rscratch1, rscratch1, count);
2094 
2095     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2096     __ cbz(rscratch1, L_long_aligned);
2097     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2098     __ cbz(rscratch1, L_int_aligned);
2099     __ tbz(rscratch1, 0, L_short_aligned);
2100     __ b(RuntimeAddress(byte_copy_entry));
2101 
2102     __ BIND(L_short_aligned);
2103     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2104     __ b(RuntimeAddress(short_copy_entry));
2105     __ BIND(L_int_aligned);
2106     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2107     __ b(RuntimeAddress(int_copy_entry));
2108     __ BIND(L_long_aligned);
2109     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2110     __ b(RuntimeAddress(long_copy_entry));
2111 
2112     return start;
2113   }
2114 
2115   //
2116   //  Generate generic array copy stubs
2117   //
2118   //  Input:
2119   //    c_rarg0    -  src oop
2120   //    c_rarg1    -  src_pos (32-bits)
2121   //    c_rarg2    -  dst oop
2122   //    c_rarg3    -  dst_pos (32-bits)
2123   //    c_rarg4    -  element count (32-bits)
2124   //
2125   //  Output:
2126   //    r0 ==  0  -  success
2127   //    r0 == -1^K - failure, where K is partial transfer count
2128   //
2129   address generate_generic_copy(const char *name,
2130                                 address byte_copy_entry, address short_copy_entry,
2131                                 address int_copy_entry, address oop_copy_entry,
2132                                 address long_copy_entry, address checkcast_copy_entry) {
2133 
2134     Label L_failed, L_objArray;
2135     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2136 
2137     // Input registers
2138     const Register src        = c_rarg0;  // source array oop
2139     const Register src_pos    = c_rarg1;  // source position
2140     const Register dst        = c_rarg2;  // destination array oop
2141     const Register dst_pos    = c_rarg3;  // destination position
2142     const Register length     = c_rarg4;
2143 
2144 
2145     // Registers used as temps
2146     const Register dst_klass  = c_rarg5;
2147 
2148     __ align(CodeEntryAlignment);
2149 
2150     StubCodeMark mark(this, "StubRoutines", name);
2151 
2152     address start = __ pc();
2153 
2154     __ enter(); // required for proper stackwalking of RuntimeStub frame
2155 
2156     // bump this on entry, not on exit:
2157     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2158 
2159     //-----------------------------------------------------------------------
2160     // Assembler stub will be used for this call to arraycopy
2161     // if the following conditions are met:
2162     //
2163     // (1) src and dst must not be null.
2164     // (2) src_pos must not be negative.
2165     // (3) dst_pos must not be negative.
2166     // (4) length  must not be negative.
2167     // (5) src klass and dst klass should be the same and not null.
2168     // (6) src and dst should be arrays.
2169     // (7) src_pos + length must not exceed length of src.
2170     // (8) dst_pos + length must not exceed length of dst.
2171     //
2172 
2173     //  if (src == nullptr) return -1;
2174     __ cbz(src, L_failed);
2175 
2176     //  if (src_pos < 0) return -1;
2177     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2178 
2179     //  if (dst == nullptr) return -1;
2180     __ cbz(dst, L_failed);
2181 
2182     //  if (dst_pos < 0) return -1;
2183     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2184 
2185     // registers used as temp
2186     const Register scratch_length    = r16; // elements count to copy
2187     const Register scratch_src_klass = r17; // array klass
2188     const Register lh                = r15; // layout helper
2189 
2190     //  if (length < 0) return -1;
2191     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2192     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2193 
2194     __ load_klass(scratch_src_klass, src);
2195 #ifdef ASSERT
2196     //  assert(src->klass() != nullptr);
2197     {
2198       BLOCK_COMMENT("assert klasses not null {");
2199       Label L1, L2;
2200       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2201       __ bind(L1);
2202       __ stop("broken null klass");
2203       __ bind(L2);
2204       __ load_klass(rscratch1, dst);
2205       __ cbz(rscratch1, L1);     // this would be broken also
2206       BLOCK_COMMENT("} assert klasses not null done");
2207     }
2208 #endif
2209 
2210     // Load layout helper (32-bits)
2211     //
2212     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2213     // 32        30    24            16              8     2                 0
2214     //
2215     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2216     //
2217 
2218     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2219 
2220     // Handle objArrays completely differently...
2221     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2222     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2223     __ movw(rscratch1, objArray_lh);
2224     __ eorw(rscratch2, lh, rscratch1);
2225     __ cbzw(rscratch2, L_objArray);
2226 
2227     //  if (src->klass() != dst->klass()) return -1;
2228     __ load_klass(rscratch2, dst);
2229     __ eor(rscratch2, rscratch2, scratch_src_klass);
2230     __ cbnz(rscratch2, L_failed);
2231 
2232     // Check for flat inline type array -> return -1
2233     __ test_flat_array_oop(src, rscratch2, L_failed);
2234 
2235     // Check for null-free (non-flat) inline type array -> handle as object array
2236     __ test_null_free_array_oop(src, rscratch2, L_objArray);
2237 
2238     //  if (!src->is_Array()) return -1;
2239     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2240 
2241     // At this point, it is known to be a typeArray (array_tag 0x3).
2242 #ifdef ASSERT
2243     {
2244       BLOCK_COMMENT("assert primitive array {");
2245       Label L;
2246       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2247       __ cmpw(lh, rscratch2);
2248       __ br(Assembler::GE, L);
2249       __ stop("must be a primitive array");
2250       __ bind(L);
2251       BLOCK_COMMENT("} assert primitive array done");
2252     }
2253 #endif
2254 
2255     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2256                            rscratch2, L_failed);
2257 
2258     // TypeArrayKlass
2259     //
2260     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2261     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2262     //
2263 
2264     const Register rscratch1_offset = rscratch1;    // array offset
2265     const Register r15_elsize = lh; // element size
2266 
2267     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2268            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2269     __ add(src, src, rscratch1_offset);           // src array offset
2270     __ add(dst, dst, rscratch1_offset);           // dst array offset
2271     BLOCK_COMMENT("choose copy loop based on element size");
2272 
2273     // next registers should be set before the jump to corresponding stub
2274     const Register from     = c_rarg0;  // source array address
2275     const Register to       = c_rarg1;  // destination array address
2276     const Register count    = c_rarg2;  // elements count
2277 
    // 'from', 'to' and 'count' must be set in this order, since they alias
    // 'src', 'src_pos' and 'dst' respectively; setting them out of order
    // would clobber inputs that are still needed.
2280 
2281     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2282 
2283     // The possible values of elsize are 0-3, i.e. exact_log2(element
2284     // size in bytes).  We do a simple bitwise binary search.
2285   __ BIND(L_copy_bytes);
2286     __ tbnz(r15_elsize, 1, L_copy_ints);
2287     __ tbnz(r15_elsize, 0, L_copy_shorts);
2288     __ lea(from, Address(src, src_pos));// src_addr
2289     __ lea(to,   Address(dst, dst_pos));// dst_addr
2290     __ movw(count, scratch_length); // length
2291     __ b(RuntimeAddress(byte_copy_entry));
2292 
2293   __ BIND(L_copy_shorts);
2294     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2295     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2296     __ movw(count, scratch_length); // length
2297     __ b(RuntimeAddress(short_copy_entry));
2298 
2299   __ BIND(L_copy_ints);
2300     __ tbnz(r15_elsize, 0, L_copy_longs);
2301     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2302     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2303     __ movw(count, scratch_length); // length
2304     __ b(RuntimeAddress(int_copy_entry));
2305 
2306   __ BIND(L_copy_longs);
2307 #ifdef ASSERT
2308     {
2309       BLOCK_COMMENT("assert long copy {");
2310       Label L;
2311       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2312       __ cmpw(r15_elsize, LogBytesPerLong);
2313       __ br(Assembler::EQ, L);
2314       __ stop("must be long copy, but elsize is wrong");
2315       __ bind(L);
2316       BLOCK_COMMENT("} assert long copy done");
2317     }
2318 #endif
2319     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2320     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2321     __ movw(count, scratch_length); // length
2322     __ b(RuntimeAddress(long_copy_entry));
2323 
2324     // ObjArrayKlass
2325   __ BIND(L_objArray);
2326     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2327 
2328     Label L_plain_copy, L_checkcast_copy;
2329     //  test array classes for subtyping
2330     __ load_klass(r15, dst);
2331     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2332     __ br(Assembler::NE, L_checkcast_copy);
2333 
2334     // Identically typed arrays can be copied without element-wise checks.
2335     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2336                            rscratch2, L_failed);
2337 
2338     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2339     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2340     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2341     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2342     __ movw(count, scratch_length); // length
2343   __ BIND(L_plain_copy);
2344     __ b(RuntimeAddress(oop_copy_entry));
2345 
2346   __ BIND(L_checkcast_copy);
2347     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2348     {
2349       // Before looking at dst.length, make sure dst is also an objArray.
2350       __ ldrw(rscratch1, Address(r15, lh_offset));
2351       __ movw(rscratch2, objArray_lh);
2352       __ eorw(rscratch1, rscratch1, rscratch2);
2353       __ cbnzw(rscratch1, L_failed);
2354 
2355       // It is safe to examine both src.length and dst.length.
2356       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2357                              r15, L_failed);
2358 
2359       __ load_klass(dst_klass, dst); // reload
2360 
2361       // Marshal the base address arguments now, freeing registers.
2362       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2363       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2364       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2365       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2366       __ movw(count, length);           // length (reloaded)
2367       Register sco_temp = c_rarg3;      // this register is free now
2368       assert_different_registers(from, to, count, sco_temp,
2369                                  dst_klass, scratch_src_klass);
2370       // assert_clean_int(count, sco_temp);
2371 
2372       // Generate the type check.
2373       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2374       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2375 
2376       // Smashes rscratch1, rscratch2
2377       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2378 
2379       // Fetch destination element klass from the ObjArrayKlass header.
2380       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2381       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2382       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2383 
2384       // the checkcast_copy loop needs two extra arguments:
2385       assert(c_rarg3 == sco_temp, "#3 already in place");
2386       // Set up arguments for checkcast_copy_entry.
2387       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2388       __ b(RuntimeAddress(checkcast_copy_entry));
2389     }
2390 
2391   __ BIND(L_failed);
2392     __ mov(r0, -1);
2393     __ leave();   // required for proper stackwalking of RuntimeStub frame
2394     __ ret(lr);
2395 
2396     return start;
2397   }
2398 
2399   //
2400   // Generate stub for array fill. If "aligned" is true, the
2401   // "to" address is assumed to be heapword aligned.
2402   //
2403   // Arguments for generated stub:
2404   //   to:    c_rarg0
2405   //   value: c_rarg1
2406   //   count: c_rarg2 treated as signed
2407   //
2408   address generate_fill(BasicType t, bool aligned, const char *name) {
2409     __ align(CodeEntryAlignment);
2410     StubCodeMark mark(this, "StubRoutines", name);
2411     address start = __ pc();
2412 
2413     BLOCK_COMMENT("Entry:");
2414 
2415     const Register to        = c_rarg0;  // source array address
2416     const Register value     = c_rarg1;  // value
2417     const Register count     = c_rarg2;  // elements count
2418 
2419     const Register bz_base = r10;        // base for block_zero routine
2420     const Register cnt_words = r11;      // temp register
2421 
2422     __ enter();
2423 
2424     Label L_fill_elements, L_exit1;
2425 
2426     int shift = -1;
2427     switch (t) {
2428       case T_BYTE:
2429         shift = 0;
2430         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2431         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2432         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2433         __ br(Assembler::LO, L_fill_elements);
2434         break;
2435       case T_SHORT:
2436         shift = 1;
2437         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2438         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2439         __ br(Assembler::LO, L_fill_elements);
2440         break;
2441       case T_INT:
2442         shift = 2;
2443         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2444         __ br(Assembler::LO, L_fill_elements);
2445         break;
2446       default: ShouldNotReachHere();
2447     }
2448 
2449     // Align source address at 8 bytes address boundary.
2450     Label L_skip_align1, L_skip_align2, L_skip_align4;
2451     if (!aligned) {
2452       switch (t) {
2453         case T_BYTE:
2454           // One byte misalignment happens only for byte arrays.
2455           __ tbz(to, 0, L_skip_align1);
2456           __ strb(value, Address(__ post(to, 1)));
2457           __ subw(count, count, 1);
2458           __ bind(L_skip_align1);
2459           // Fallthrough
2460         case T_SHORT:
2461           // Two bytes misalignment happens only for byte and short (char) arrays.
2462           __ tbz(to, 1, L_skip_align2);
2463           __ strh(value, Address(__ post(to, 2)));
2464           __ subw(count, count, 2 >> shift);
2465           __ bind(L_skip_align2);
2466           // Fallthrough
2467         case T_INT:
2468           // Align to 8 bytes, we know we are 4 byte aligned to start.
2469           __ tbz(to, 2, L_skip_align4);
2470           __ strw(value, Address(__ post(to, 4)));
2471           __ subw(count, count, 4 >> shift);
2472           __ bind(L_skip_align4);
2473           break;
2474         default: ShouldNotReachHere();
2475       }
2476     }
2477 
2478     //
2479     //  Fill large chunks
2480     //
2481     __ lsrw(cnt_words, count, 3 - shift); // number of words
2482     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2483     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2484     if (UseBlockZeroing) {
2485       Label non_block_zeroing, rest;
2486       // If the fill value is zero we can use the fast zero_words().
2487       __ cbnz(value, non_block_zeroing);
2488       __ mov(bz_base, to);
2489       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2490       address tpc = __ zero_words(bz_base, cnt_words);
2491       if (tpc == nullptr) {
2492         fatal("CodeCache is full at generate_fill");
2493       }
2494       __ b(rest);
2495       __ bind(non_block_zeroing);
2496       __ fill_words(to, cnt_words, value);
2497       __ bind(rest);
2498     } else {
2499       __ fill_words(to, cnt_words, value);
2500     }
2501 
2502     // Remaining count is less than 8 bytes. Fill it by a single store.
2503     // Note that the total length is no less than 8 bytes.
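    // For example (illustrative): for T_BYTE with 3 bytes left, 'to' is advanced
    // to the end of the region and the 64-bit store at [to, -8] rewrites the last
    // 5 already-filled bytes with the same pattern and fills the remaining 3.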
2504     if (t == T_BYTE || t == T_SHORT) {
2505       Label L_exit1;
2506       __ cbzw(count, L_exit1);
2507       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2508       __ str(value, Address(to, -8));    // overwrite some elements
2509       __ bind(L_exit1);
2510       __ leave();
2511       __ ret(lr);
2512     }
2513 
2514     // Handle copies less than 8 bytes.
2515     Label L_fill_2, L_fill_4, L_exit2;
2516     __ bind(L_fill_elements);
2517     switch (t) {
2518       case T_BYTE:
2519         __ tbz(count, 0, L_fill_2);
2520         __ strb(value, Address(__ post(to, 1)));
2521         __ bind(L_fill_2);
2522         __ tbz(count, 1, L_fill_4);
2523         __ strh(value, Address(__ post(to, 2)));
2524         __ bind(L_fill_4);
2525         __ tbz(count, 2, L_exit2);
2526         __ strw(value, Address(to));
2527         break;
2528       case T_SHORT:
2529         __ tbz(count, 0, L_fill_4);
2530         __ strh(value, Address(__ post(to, 2)));
2531         __ bind(L_fill_4);
2532         __ tbz(count, 1, L_exit2);
2533         __ strw(value, Address(to));
2534         break;
2535       case T_INT:
2536         __ cbzw(count, L_exit2);
2537         __ strw(value, Address(to));
2538         break;
2539       default: ShouldNotReachHere();
2540     }
2541     __ bind(L_exit2);
2542     __ leave();
2543     __ ret(lr);
2544     return start;
2545   }
2546 
2547   address generate_data_cache_writeback() {
2548     const Register line        = c_rarg0;  // address of line to write back
2549 
2550     __ align(CodeEntryAlignment);
2551 
2552     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2553 
2554     address start = __ pc();
2555     __ enter();
2556     __ cache_wb(Address(line, 0));
2557     __ leave();
2558     __ ret(lr);
2559 
2560     return start;
2561   }
2562 
2563   address generate_data_cache_writeback_sync() {
2564     const Register is_pre     = c_rarg0;  // pre or post sync
2565 
2566     __ align(CodeEntryAlignment);
2567 
2568     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2569 
    // pre wbsync is a no-op
    // post wbsync emits a memory barrier (the AArch64 analogue of an x86 sfence)
2572 
2573     Label skip;
2574     address start = __ pc();
2575     __ enter();
2576     __ cbnz(is_pre, skip);
2577     __ cache_wbsync(false);
2578     __ bind(skip);
2579     __ leave();
2580     __ ret(lr);
2581 
2582     return start;
2583   }
2584 
2585   void generate_arraycopy_stubs() {
2586     address entry;
2587     address entry_jbyte_arraycopy;
2588     address entry_jshort_arraycopy;
2589     address entry_jint_arraycopy;
2590     address entry_oop_arraycopy;
2591     address entry_jlong_arraycopy;
2592     address entry_checkcast_arraycopy;
2593 
2594     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards);
2595     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards);
2596 
2597     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards);
2598     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards);
2599 
2600     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards);
2601     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards);
2602 
2603     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2604 
2605     //*** jbyte
2606     // Always need aligned and unaligned versions
2607     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2608                                                                                   "jbyte_disjoint_arraycopy");
2609     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2610                                                                                   &entry_jbyte_arraycopy,
2611                                                                                   "jbyte_arraycopy");
2612     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2613                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2614     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, nullptr,
2615                                                                                   "arrayof_jbyte_arraycopy");
2616 
2617     //*** jshort
2618     // Always need aligned and unaligned versions
2619     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2620                                                                                     "jshort_disjoint_arraycopy");
2621     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2622                                                                                     &entry_jshort_arraycopy,
2623                                                                                     "jshort_arraycopy");
2624     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2625                                                                                     "arrayof_jshort_disjoint_arraycopy");
2626     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2627                                                                                     "arrayof_jshort_arraycopy");
2628 
2629     //*** jint
2630     // Aligned versions
2631     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2632                                                                                 "arrayof_jint_disjoint_arraycopy");
2633     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2634                                                                                 "arrayof_jint_arraycopy");
2635     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2636     // entry_jint_arraycopy always points to the unaligned version
2637     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2638                                                                                 "jint_disjoint_arraycopy");
2639     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2640                                                                                 &entry_jint_arraycopy,
2641                                                                                 "jint_arraycopy");
2642 
2643     //*** jlong
2644     // It is always aligned
2645     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2646                                                                                   "arrayof_jlong_disjoint_arraycopy");
2647     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2648                                                                                   "arrayof_jlong_arraycopy");
2649     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2650     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2651 
2652     //*** oops
2653     {
2654       // With compressed oops we need unaligned versions; notice that
2655       // we overwrite entry_oop_arraycopy.
2656       bool aligned = !UseCompressedOops;
2657 
2658       StubRoutines::_arrayof_oop_disjoint_arraycopy
2659         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2660                                      /*dest_uninitialized*/false);
2661       StubRoutines::_arrayof_oop_arraycopy
2662         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2663                                      /*dest_uninitialized*/false);
2664       // Aligned versions without pre-barriers
2665       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2666         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2667                                      /*dest_uninitialized*/true);
2668       StubRoutines::_arrayof_oop_arraycopy_uninit
2669         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2670                                      /*dest_uninitialized*/true);
2671     }
2672 
2673     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2674     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2675     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2676     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2677 
2678     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2679     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2680                                                                         /*dest_uninitialized*/true);
2681 
2682     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2683                                                               entry_jbyte_arraycopy,
2684                                                               entry_jshort_arraycopy,
2685                                                               entry_jint_arraycopy,
2686                                                               entry_jlong_arraycopy);
2687 
2688     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2689                                                                entry_jbyte_arraycopy,
2690                                                                entry_jshort_arraycopy,
2691                                                                entry_jint_arraycopy,
2692                                                                entry_oop_arraycopy,
2693                                                                entry_jlong_arraycopy,
2694                                                                entry_checkcast_arraycopy);
2695 
2696     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2697     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2698     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2699     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2700     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2701     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2702   }
2703 
2704   void generate_math_stubs() { Unimplemented(); }
2705 
2706   // Arguments:
2707   //
2708   // Inputs:
2709   //   c_rarg0   - source byte array address
2710   //   c_rarg1   - destination byte array address
2711   //   c_rarg2   - K (key) in little endian int array
2712   //
2713   address generate_aescrypt_encryptBlock() {
2714     __ align(CodeEntryAlignment);
2715     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2716 
2717     const Register from        = c_rarg0;  // source array address
2718     const Register to          = c_rarg1;  // destination array address
2719     const Register key         = c_rarg2;  // key array address
2720     const Register keylen      = rscratch1;
2721 
2722     address start = __ pc();
2723     __ enter();
2724 
2725     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2726 
2727     __ aesenc_loadkeys(key, keylen);
2728     __ aesecb_encrypt(from, to, keylen);
2729 
2730     __ mov(r0, 0);
2731 
2732     __ leave();
2733     __ ret(lr);
2734 
2735     return start;
2736   }
2737 
2738   // Arguments:
2739   //
2740   // Inputs:
2741   //   c_rarg0   - source byte array address
2742   //   c_rarg1   - destination byte array address
2743   //   c_rarg2   - K (key) in little endian int array
2744   //
2745   address generate_aescrypt_decryptBlock() {
2746     assert(UseAES, "need AES cryptographic extension support");
2747     __ align(CodeEntryAlignment);
2748     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2749     Label L_doLast;
2750 
2751     const Register from        = c_rarg0;  // source array address
2752     const Register to          = c_rarg1;  // destination array address
2753     const Register key         = c_rarg2;  // key array address
2754     const Register keylen      = rscratch1;
2755 
2756     address start = __ pc();
2757     __ enter(); // required for proper stackwalking of RuntimeStub frame
2758 
2759     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2760 
2761     __ aesecb_decrypt(from, to, key, keylen);
2762 
2763     __ mov(r0, 0);
2764 
2765     __ leave();
2766     __ ret(lr);
2767 
2768     return start;
2769   }
2770 
2771   // Arguments:
2772   //
2773   // Inputs:
2774   //   c_rarg0   - source byte array address
2775   //   c_rarg1   - destination byte array address
2776   //   c_rarg2   - K (key) in little endian int array
2777   //   c_rarg3   - r vector byte array address
2778   //   c_rarg4   - input length
2779   //
2780   // Output:
2781   //   x0        - input length
2782   //
2783   address generate_cipherBlockChaining_encryptAESCrypt() {
2784     assert(UseAES, "need AES cryptographic extension support");
2785     __ align(CodeEntryAlignment);
2786     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2787 
2788     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2789 
2790     const Register from        = c_rarg0;  // source array address
2791     const Register to          = c_rarg1;  // destination array address
2792     const Register key         = c_rarg2;  // key array address
2793     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector,
2794                                            // and left holding the last ciphertext block produced
2795     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2796     const Register keylen      = rscratch1;
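         // CBC encryption: C[i] = AES_K(P[i] ^ C[i-1]), with C[-1] = IV (rvec).
         // Each pass of L_aes_loop below XORs one plaintext block into v0 (the
         // running chaining value), runs the AES rounds and stores the result.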
2797 
2798     address start = __ pc();
2799 
2800       __ enter();
2801 
2802       __ movw(rscratch2, len_reg);
2803 
2804       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2805 
2806       __ ld1(v0, __ T16B, rvec);
2807 
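           // The expanded key has 44, 52 or 60 32-bit words for AES-128/192/256.
           // The flags set by this compare are re-tested at the top of L_aes_loop;
           // nothing executed in between alters them.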
2808       __ cmpw(keylen, 52);
2809       __ br(Assembler::CC, L_loadkeys_44);
2810       __ br(Assembler::EQ, L_loadkeys_52);
2811 
2812       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2813       __ rev32(v17, __ T16B, v17);
2814       __ rev32(v18, __ T16B, v18);
2815     __ BIND(L_loadkeys_52);
2816       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2817       __ rev32(v19, __ T16B, v19);
2818       __ rev32(v20, __ T16B, v20);
2819     __ BIND(L_loadkeys_44);
2820       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2821       __ rev32(v21, __ T16B, v21);
2822       __ rev32(v22, __ T16B, v22);
2823       __ rev32(v23, __ T16B, v23);
2824       __ rev32(v24, __ T16B, v24);
2825       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2826       __ rev32(v25, __ T16B, v25);
2827       __ rev32(v26, __ T16B, v26);
2828       __ rev32(v27, __ T16B, v27);
2829       __ rev32(v28, __ T16B, v28);
2830       __ ld1(v29, v30, v31, __ T16B, key);
2831       __ rev32(v29, __ T16B, v29);
2832       __ rev32(v30, __ T16B, v30);
2833       __ rev32(v31, __ T16B, v31);
2834 
2835     __ BIND(L_aes_loop);
2836       __ ld1(v1, __ T16B, __ post(from, 16));
2837       __ eor(v0, __ T16B, v0, v1);
2838 
2839       __ br(Assembler::CC, L_rounds_44);
2840       __ br(Assembler::EQ, L_rounds_52);
2841 
2842       __ aese(v0, v17); __ aesmc(v0, v0);
2843       __ aese(v0, v18); __ aesmc(v0, v0);
2844     __ BIND(L_rounds_52);
2845       __ aese(v0, v19); __ aesmc(v0, v0);
2846       __ aese(v0, v20); __ aesmc(v0, v0);
2847     __ BIND(L_rounds_44);
2848       __ aese(v0, v21); __ aesmc(v0, v0);
2849       __ aese(v0, v22); __ aesmc(v0, v0);
2850       __ aese(v0, v23); __ aesmc(v0, v0);
2851       __ aese(v0, v24); __ aesmc(v0, v0);
2852       __ aese(v0, v25); __ aesmc(v0, v0);
2853       __ aese(v0, v26); __ aesmc(v0, v0);
2854       __ aese(v0, v27); __ aesmc(v0, v0);
2855       __ aese(v0, v28); __ aesmc(v0, v0);
2856       __ aese(v0, v29); __ aesmc(v0, v0);
2857       __ aese(v0, v30);
2858       __ eor(v0, __ T16B, v0, v31);
2859 
2860       __ st1(v0, __ T16B, __ post(to, 16));
2861 
2862       __ subw(len_reg, len_reg, 16);
2863       __ cbnzw(len_reg, L_aes_loop);
2864 
2865       __ st1(v0, __ T16B, rvec);
2866 
2867       __ mov(r0, rscratch2);
2868 
2869       __ leave();
2870       __ ret(lr);
2871 
2872       return start;
2873   }
2874 
2875   // Arguments:
2876   //
2877   // Inputs:
2878   //   c_rarg0   - source byte array address
2879   //   c_rarg1   - destination byte array address
2880   //   c_rarg2   - K (key) in little endian int array
2881   //   c_rarg3   - r vector byte array address
2882   //   c_rarg4   - input length
2883   //
2884   // Output:
2885   //   r0        - input length
2886   //
2887   address generate_cipherBlockChaining_decryptAESCrypt() {
2888     assert(UseAES, "need AES cryptographic extension support");
2889     __ align(CodeEntryAlignment);
2890     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2891 
2892     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2893 
2894     const Register from        = c_rarg0;  // source array address
2895     const Register to          = c_rarg1;  // destination array address
2896     const Register key         = c_rarg2;  // key array address
2897     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector,
2898                                            // and left holding the last ciphertext block read (the next IV)
2899     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2900     const Register keylen      = rscratch1;
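         // CBC decryption: P[i] = AES_K^-1(C[i]) ^ C[i-1], with C[-1] = IV (rvec).
         // v2 carries the previous ciphertext block between iterations; it is XORed
         // into each decrypted block and finally written back to rvec as the next IV.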
2901 
2902     address start = __ pc();
2903 
2904       __ enter();
2905 
2906       __ movw(rscratch2, len_reg);
2907 
2908       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2909 
2910       __ ld1(v2, __ T16B, rvec);
2911 
2912       __ ld1(v31, __ T16B, __ post(key, 16));
2913       __ rev32(v31, __ T16B, v31);
2914 
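           // 44/52/60-word expanded key, as in the encryption stub; the flags from
           // this compare are still valid when re-tested inside L_aes_loop.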
2915       __ cmpw(keylen, 52);
2916       __ br(Assembler::CC, L_loadkeys_44);
2917       __ br(Assembler::EQ, L_loadkeys_52);
2918 
2919       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2920       __ rev32(v17, __ T16B, v17);
2921       __ rev32(v18, __ T16B, v18);
2922     __ BIND(L_loadkeys_52);
2923       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2924       __ rev32(v19, __ T16B, v19);
2925       __ rev32(v20, __ T16B, v20);
2926     __ BIND(L_loadkeys_44);
2927       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2928       __ rev32(v21, __ T16B, v21);
2929       __ rev32(v22, __ T16B, v22);
2930       __ rev32(v23, __ T16B, v23);
2931       __ rev32(v24, __ T16B, v24);
2932       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2933       __ rev32(v25, __ T16B, v25);
2934       __ rev32(v26, __ T16B, v26);
2935       __ rev32(v27, __ T16B, v27);
2936       __ rev32(v28, __ T16B, v28);
2937       __ ld1(v29, v30, __ T16B, key);
2938       __ rev32(v29, __ T16B, v29);
2939       __ rev32(v30, __ T16B, v30);
2940 
2941     __ BIND(L_aes_loop);
2942       __ ld1(v0, __ T16B, __ post(from, 16));
2943       __ orr(v1, __ T16B, v0, v0);
2944 
2945       __ br(Assembler::CC, L_rounds_44);
2946       __ br(Assembler::EQ, L_rounds_52);
2947 
2948       __ aesd(v0, v17); __ aesimc(v0, v0);
2949       __ aesd(v0, v18); __ aesimc(v0, v0);
2950     __ BIND(L_rounds_52);
2951       __ aesd(v0, v19); __ aesimc(v0, v0);
2952       __ aesd(v0, v20); __ aesimc(v0, v0);
2953     __ BIND(L_rounds_44);
2954       __ aesd(v0, v21); __ aesimc(v0, v0);
2955       __ aesd(v0, v22); __ aesimc(v0, v0);
2956       __ aesd(v0, v23); __ aesimc(v0, v0);
2957       __ aesd(v0, v24); __ aesimc(v0, v0);
2958       __ aesd(v0, v25); __ aesimc(v0, v0);
2959       __ aesd(v0, v26); __ aesimc(v0, v0);
2960       __ aesd(v0, v27); __ aesimc(v0, v0);
2961       __ aesd(v0, v28); __ aesimc(v0, v0);
2962       __ aesd(v0, v29); __ aesimc(v0, v0);
2963       __ aesd(v0, v30);
2964       __ eor(v0, __ T16B, v0, v31);
2965       __ eor(v0, __ T16B, v0, v2);
2966 
2967       __ st1(v0, __ T16B, __ post(to, 16));
2968       __ orr(v2, __ T16B, v1, v1);
2969 
2970       __ subw(len_reg, len_reg, 16);
2971       __ cbnzw(len_reg, L_aes_loop);
2972 
2973       __ st1(v2, __ T16B, rvec);
2974 
2975       __ mov(r0, rscratch2);
2976 
2977       __ leave();
2978       __ ret(lr);
2979 
2980     return start;
2981   }
2982 
2983   // Big-endian 128-bit + 64-bit -> 128-bit addition.
2984   // Inputs: in (the 128-bit value); in is preserved.
2985   // The least-significant 64-bit word is in the upper dword of each vector.
2986   // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
2987   // Output: result
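       // Example: in = { MSD 0x0, LSD 0xFFFFFFFFFFFFFFFF } plus inc = 1 gives
       // { MSD 0x1, LSD 0x0 }: the unsigned-higher compare flags the wrapped low
       // dword and the final subtraction propagates the carry into the MSD.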
2988   void be_add_128_64(FloatRegister result, FloatRegister in,
2989                      FloatRegister inc, FloatRegister tmp) {
2990     assert_different_registers(result, tmp, inc);
2991 
2992     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
2993                                            // input
2994     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
2995     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
2996                                            // MSD == 0 (must be!) to LSD
2997     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
2998   }
2999 
3000   // CTR AES crypt.
3001   // Arguments:
3002   //
3003   // Inputs:
3004   //   c_rarg0   - source byte array address
3005   //   c_rarg1   - destination byte array address
3006   //   c_rarg2   - K (key) in little endian int array
3007   //   c_rarg3   - counter vector byte array address
3008   //   c_rarg4   - input length
3009   //   c_rarg5   - saved encryptedCounter start
3010   //   c_rarg6   - saved used length
3011   //
3012   // Output:
3013   //   r0       - input length
3014   //
3015   address generate_counterMode_AESCrypt() {
3016     const Register in = c_rarg0;
3017     const Register out = c_rarg1;
3018     const Register key = c_rarg2;
3019     const Register counter = c_rarg3;
3020     const Register saved_len = c_rarg4, len = r10;
3021     const Register saved_encrypted_ctr = c_rarg5;
3022     const Register used_ptr = c_rarg6, used = r12;
3023 
3024     const Register offset = r7;
3025     const Register keylen = r11;
3026 
3027     const unsigned char block_size = 16;
3028     const int bulk_width = 4;
3029     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3030     // performance with larger data sizes, but it also means that the
3031     // fast path isn't used until you have at least 8 blocks, and up
3032     // to 127 bytes of data will be processed on the slow path. For
3033     // that reason, and also so as not to blow away too much icache, 4
3034     // blocks seems like a sensible compromise.
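         // With bulk_width == 4 the bulk path therefore starts at 4 * 16 == 64
         // bytes; shorter inputs, and the sub-64-byte tail, are handled one block
         // and then one byte at a time below.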
3035 
3036     // Algorithm:
3037     //
3038     //    if (len == 0) {
3039     //        goto DONE;
3040     //    }
3041     //    int result = len;
3042     //    do {
3043     //        if (used >= blockSize) {
3044     //            if (len >= bulk_width * blockSize) {
3045     //                CTR_large_block();
3046     //                if (len == 0)
3047     //                    goto DONE;
3048     //            }
3049     //            for (;;) {
3050     //                16ByteVector v0 = counter;
3051     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3052     //                used = 0;
3053     //                if (len < blockSize)
3054     //                    break;    /* goto NEXT */
3055     //                16ByteVector v1 = load16Bytes(in, offset);
3056     //                v1 = v1 ^ encryptedCounter;
3057     //                store16Bytes(v1, out, offset);
3058     //                used = blockSize;
3059     //                offset += blockSize;
3060     //                len -= blockSize;
3061     //                if (len == 0)
3062     //                    goto DONE;
3063     //            }
3064     //        }
3065     //      NEXT:
3066     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3067     //        len--;
3068     //    } while (len != 0);
3069     //  DONE:
3070     //    return result;
3071     //
3072     // CTR_large_block()
3073     //    Wide bulk encryption of whole blocks.
3074 
3075     __ align(CodeEntryAlignment);
3076     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3077     const address start = __ pc();
3078     __ enter();
3079 
3080     Label DONE, CTR_large_block, large_block_return;
3081     __ ldrw(used, Address(used_ptr));
3082     __ cbzw(saved_len, DONE);
3083 
3084     __ mov(len, saved_len);
3085     __ mov(offset, 0);
3086 
3087     // Compute #rounds for AES based on the length of the key array
3088     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3089 
3090     __ aesenc_loadkeys(key, keylen);
3091 
3092     {
3093       Label L_CTR_loop, NEXT;
3094 
3095       __ bind(L_CTR_loop);
3096 
3097       __ cmp(used, block_size);
3098       __ br(__ LO, NEXT);
3099 
3100       // Maybe we have a lot of data
3101       __ subsw(rscratch1, len, bulk_width * block_size);
3102       __ br(__ HS, CTR_large_block);
3103       __ BIND(large_block_return);
3104       __ cbzw(len, DONE);
3105 
3106       // Setup the counter
3107       __ movi(v4, __ T4S, 0);
3108       __ movi(v5, __ T4S, 1);
3109       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3110 
3111       // 128-bit big-endian increment
3112       __ ld1(v0, __ T16B, counter);
3113       __ rev64(v16, __ T16B, v0);
3114       be_add_128_64(v16, v16, v4, /*tmp*/v5);
3115       __ rev64(v16, __ T16B, v16);
3116       __ st1(v16, __ T16B, counter);
3117       // Previous counter value is in v0
3118       // v4 contains { 0, 1 }
3119 
3120       {
3121         // We have fewer than bulk_width blocks of data left. Encrypt
3122         // them one by one until there is less than a full block
3123         // remaining, being careful to save both the encrypted counter
3124         // and the counter.
3125 
3126         Label inner_loop;
3127         __ bind(inner_loop);
3128         // Counter to encrypt is in v0
3129         __ aesecb_encrypt(noreg, noreg, keylen);
3130         __ st1(v0, __ T16B, saved_encrypted_ctr);
3131 
3132         // Do we have a remaining full block?
3133 
3134         __ mov(used, 0);
3135         __ cmp(len, block_size);
3136         __ br(__ LO, NEXT);
3137 
3138         // Yes, we have a full block
3139         __ ldrq(v1, Address(in, offset));
3140         __ eor(v1, __ T16B, v1, v0);
3141         __ strq(v1, Address(out, offset));
3142         __ mov(used, block_size);
3143         __ add(offset, offset, block_size);
3144 
3145         __ subw(len, len, block_size);
3146         __ cbzw(len, DONE);
3147 
3148         // Increment the counter, store it back
3149         __ orr(v0, __ T16B, v16, v16);
3150         __ rev64(v16, __ T16B, v16);
3151         be_add_128_64(v16, v16, v4, /*tmp*/v5);
3152         __ rev64(v16, __ T16B, v16);
3153         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3154 
3155         __ b(inner_loop);
3156       }
3157 
3158       __ BIND(NEXT);
3159 
3160       // Encrypt a single byte, and loop.
3161       // We expect this to be a rare event.
3162       __ ldrb(rscratch1, Address(in, offset));
3163       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3164       __ eor(rscratch1, rscratch1, rscratch2);
3165       __ strb(rscratch1, Address(out, offset));
3166       __ add(offset, offset, 1);
3167       __ add(used, used, 1);
3168       __ subw(len, len, 1);
3169       __ cbnzw(len, L_CTR_loop);
3170     }
3171 
3172     __ bind(DONE);
3173     __ strw(used, Address(used_ptr));
3174     __ mov(r0, saved_len);
3175 
3176     __ leave(); // required for proper stackwalking of RuntimeStub frame
3177     __ ret(lr);
3178 
3179     // Bulk encryption
3180 
3181     __ BIND(CTR_large_block);
3182     assert(bulk_width == 4 || bulk_width == 8, "must be");
3183 
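         // The bottom 64 bits of v8..v15 are callee-saved (AAPCS64), so spill the
         // ones the bulk loop below is about to clobber.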
3184     if (bulk_width == 8) {
3185       __ sub(sp, sp, 4 * 16);
3186       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3187     }
3188     __ sub(sp, sp, 4 * 16);
3189     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3190     RegSet saved_regs = (RegSet::of(in, out, offset)
3191                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3192     __ push(saved_regs, sp);
3193     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3194     __ add(in, in, offset);
3195     __ add(out, out, offset);
3196 
3197     // Keys should already be loaded into the correct registers
3198 
3199     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3200     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3201 
3202     // AES/CTR loop
3203     {
3204       Label L_CTR_loop;
3205       __ BIND(L_CTR_loop);
3206 
3207       // Setup the counters
3208       __ movi(v8, __ T4S, 0);
3209       __ movi(v9, __ T4S, 1);
3210       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3211 
3212       for (int i = 0; i < bulk_width; i++) {
3213         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3214         __ rev64(v0_ofs, __ T16B, v16);
3215         be_add_128_64(v16, v16, v8, /*tmp*/v9);
3216       }
3217 
3218       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3219 
3220       // Encrypt the counters
3221       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3222 
3223       if (bulk_width == 8) {
3224         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3225       }
3226 
3227       // XOR the encrypted counters with the inputs
3228       for (int i = 0; i < bulk_width; i++) {
3229         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3230         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3231         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3232       }
3233 
3234       // Write the encrypted data
3235       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3236       if (bulk_width == 8) {
3237         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3238       }
3239 
3240       __ subw(len, len, 16 * bulk_width);
3241       __ cbnzw(len, L_CTR_loop);
3242     }
3243 
3244     // Save the counter back where it goes
3245     __ rev64(v16, __ T16B, v16);
3246     __ st1(v16, __ T16B, counter);
3247 
3248     __ pop(saved_regs, sp);
3249 
3250     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3251     if (bulk_width == 8) {
3252       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3253     }
3254 
3255     __ andr(rscratch1, len, -16 * bulk_width);
3256     __ sub(len, len, rscratch1);
3257     __ add(offset, offset, rscratch1);
3258     __ mov(used, 16);
3259     __ strw(used, Address(used_ptr));
3260     __ b(large_block_return);
3261 
3262     return start;
3263   }
3264 
3265   // Vector AES Galois Counter Mode implementation. Parameters:
3266   //
3267   // in = c_rarg0
3268   // len = c_rarg1
3269   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3270   // out = c_rarg3
3271   // key = c_rarg4
3272   // state = c_rarg5 - GHASH.state
3273   // subkeyHtbl = c_rarg6 - powers of H
3274   // counter = c_rarg7 - 16 bytes of CTR
3275   // return - number of processed bytes
3276   address generate_galoisCounterMode_AESCrypt() {
3277     address ghash_polynomial = __ pc();
3278     __ emit_int64(0x87);  // The low-order bits of the field
3279                           // polynomial (i.e. p = z^7+z^2+z+1)
3280                           // repeated in the low and high parts of a
3281                           // 128-bit vector
3282     __ emit_int64(0x87);
3283 
3284     __ align(CodeEntryAlignment);
3285     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3286     address start = __ pc();
3287     __ enter();
3288 
3289     const Register in = c_rarg0;
3290     const Register len = c_rarg1;
3291     const Register ct = c_rarg2;
3292     const Register out = c_rarg3;
3293     // and updated with the incremented counter in the end
3294 
3295     const Register key = c_rarg4;
3296     const Register state = c_rarg5;
3297 
3298     const Register subkeyHtbl = c_rarg6;
3299 
3300     const Register counter = c_rarg7;
3301 
3302     const Register keylen = r10;
3303     // Save state before entering routine
3304     __ sub(sp, sp, 4 * 16);
3305     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3306     __ sub(sp, sp, 4 * 16);
3307     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3308 
3310     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3311     __ str(len, __ pre(sp, -2 * wordSize));
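         // Only whole 128-byte chunks are processed here; the masked length saved
         // above is returned at DONE so the caller knows how many bytes were
         // consumed and can handle any remaining tail.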
3312 
3313     Label DONE;
3314     __ cbz(len, DONE);
3315 
3316     // Compute #rounds for AES based on the length of the key array
3317     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3318 
3319     __ aesenc_loadkeys(key, keylen);
3320     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3321     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3322 
3323     // AES/CTR loop
3324     {
3325       Label L_CTR_loop;
3326       __ BIND(L_CTR_loop);
3327 
3328       // Setup the counters
3329       __ movi(v8, __ T4S, 0);
3330       __ movi(v9, __ T4S, 1);
3331       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3332 
3333       assert(v0->encoding() < v8->encoding(), "");
3334       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3335         FloatRegister f = as_FloatRegister(i);
3336         __ rev32(f, __ T16B, v16);
3337         __ addv(v16, __ T4S, v16, v8);
3338       }
3339 
3340       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3341 
3342       // Encrypt the counters
3343       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3344 
3345       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3346 
3347       // XOR the encrypted counters with the inputs
3348       for (int i = 0; i < 8; i++) {
3349         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3350         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3351         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3352       }
3353       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3354       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3355 
3356       __ subw(len, len, 16 * 8);
3357       __ cbnzw(len, L_CTR_loop);
3358     }
3359 
3360     __ rev32(v16, __ T16B, v16);
3361     __ st1(v16, __ T16B, counter);
3362 
3363     __ ldr(len, Address(sp));
3364     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3365 
3366     // GHASH/CTR loop
3367     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3368                                 len, /*unrolls*/4);
3369 
3370 #ifdef ASSERT
3371     { Label L;
3372       __ cmp(len, (unsigned char)0);
3373       __ br(Assembler::EQ, L);
3374       __ stop("stubGenerator: abort");
3375       __ bind(L);
3376     }
3377 #endif
3378
3379     __ bind(DONE);
3380     // Return the number of bytes processed
3381     __ ldr(r0, __ post(sp, 2 * wordSize));
3382 
3383     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3384     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3385 
3386     __ leave(); // required for proper stackwalking of RuntimeStub frame
3387     __ ret(lr);
3388     return start;
3389   }
3390 
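       // Caches one 64-byte input block in eight 64-bit registers so the MD5 round
       // helpers can extract 4-byte words without reloading them from memory.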
3391   class Cached64Bytes {
3392   private:
3393     MacroAssembler *_masm;
3394     Register _regs[8];
3395 
3396   public:
3397     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3398       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3399       auto it = rs.begin();
3400       for (auto &r: _regs) {
3401         r = *it;
3402         ++it;
3403       }
3404     }
3405 
3406     void gen_loads(Register base) {
3407       for (int i = 0; i < 8; i += 2) {
3408         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3409       }
3410     }
3411 
3412     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3413     void extract_u32(Register dest, int i) {
3414       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3415     }
3416   };
3417 
3418   // Utility routines for md5.
3419   // Each clobbers rscratch1, rscratch2, r10 and r11.
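       // Each routine computes r1 = r2 + rotl32(r1 + f(r2, r3, r4) + X[k] + t, s)
       // for its round function f (RFC 1321).  md5_FF uses
       // F(b, c, d) = (b & c) | (~b & d), computed here as d ^ (b & (c ^ d)).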
3420   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3421               int k, int s, int t) {
3422     Register rscratch3 = r10;
3423     Register rscratch4 = r11;
3424 
3425     __ eorw(rscratch3, r3, r4);
3426     __ movw(rscratch2, t);
3427     __ andw(rscratch3, rscratch3, r2);
3428     __ addw(rscratch4, r1, rscratch2);
3429     reg_cache.extract_u32(rscratch1, k);
3430     __ eorw(rscratch3, rscratch3, r4);
3431     __ addw(rscratch4, rscratch4, rscratch1);
3432     __ addw(rscratch3, rscratch3, rscratch4);
3433     __ rorw(rscratch2, rscratch3, 32 - s);
3434     __ addw(r1, rscratch2, r2);
3435   }
3436 
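       // md5_GG uses G(b, c, d) = (b & d) | (c & ~d).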
3437   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3438               int k, int s, int t) {
3439     Register rscratch3 = r10;
3440     Register rscratch4 = r11;
3441 
3442     reg_cache.extract_u32(rscratch1, k);
3443     __ movw(rscratch2, t);
3444     __ addw(rscratch4, r1, rscratch2);
3445     __ addw(rscratch4, rscratch4, rscratch1);
3446     __ bicw(rscratch2, r3, r4);
3447     __ andw(rscratch3, r2, r4);
3448     __ addw(rscratch2, rscratch2, rscratch4);
3449     __ addw(rscratch2, rscratch2, rscratch3);
3450     __ rorw(rscratch2, rscratch2, 32 - s);
3451     __ addw(r1, rscratch2, r2);
3452   }
3453 
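       // md5_HH uses H(b, c, d) = b ^ c ^ d.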
3454   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3455               int k, int s, int t) {
3456     Register rscratch3 = r10;
3457     Register rscratch4 = r11;
3458 
3459     __ eorw(rscratch3, r3, r4);
3460     __ movw(rscratch2, t);
3461     __ addw(rscratch4, r1, rscratch2);
3462     reg_cache.extract_u32(rscratch1, k);
3463     __ eorw(rscratch3, rscratch3, r2);
3464     __ addw(rscratch4, rscratch4, rscratch1);
3465     __ addw(rscratch3, rscratch3, rscratch4);
3466     __ rorw(rscratch2, rscratch3, 32 - s);
3467     __ addw(r1, rscratch2, r2);
3468   }
3469 
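       // md5_II uses I(b, c, d) = c ^ (b | ~d).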
3470   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3471               int k, int s, int t) {
3472     Register rscratch3 = r10;
3473     Register rscratch4 = r11;
3474 
3475     __ movw(rscratch3, t);
3476     __ ornw(rscratch2, r2, r4);
3477     __ addw(rscratch4, r1, rscratch3);
3478     reg_cache.extract_u32(rscratch1, k);
3479     __ eorw(rscratch3, rscratch2, r3);
3480     __ addw(rscratch4, rscratch4, rscratch1);
3481     __ addw(rscratch3, rscratch3, rscratch4);
3482     __ rorw(rscratch2, rscratch3, 32 - s);
3483     __ addw(r1, rscratch2, r2);
3484   }
3485 
3486   // Arguments:
3487   //
3488   // Inputs:
3489   //   c_rarg0   - byte[]  source+offset
3490   //   c_rarg1   - int[]   MD5.state
3491   //   c_rarg2   - int     offset
3492   //   c_rarg3   - int     limit
3493   //
3494   address generate_md5_implCompress(bool multi_block, const char *name) {
3495     __ align(CodeEntryAlignment);
3496     StubCodeMark mark(this, "StubRoutines", name);
3497     address start = __ pc();
3498 
3499     Register buf       = c_rarg0;
3500     Register state     = c_rarg1;
3501     Register ofs       = c_rarg2;
3502     Register limit     = c_rarg3;
3503     Register a         = r4;
3504     Register b         = r5;
3505     Register c         = r6;
3506     Register d         = r7;
3507     Register rscratch3 = r10;
3508     Register rscratch4 = r11;
3509 
3510     Register state_regs[2] = { r12, r13 };
3511     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3512     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3513 
3514     __ push(saved_regs, sp);
3515 
3516     __ ldp(state_regs[0], state_regs[1], Address(state));
3517     __ ubfx(a, state_regs[0],  0, 32);
3518     __ ubfx(b, state_regs[0], 32, 32);
3519     __ ubfx(c, state_regs[1],  0, 32);
3520     __ ubfx(d, state_regs[1], 32, 32);
3521 
3522     Label md5_loop;
3523     __ BIND(md5_loop);
3524 
3525     reg_cache.gen_loads(buf);
3526 
3527     // Round 1
3528     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3529     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3530     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3531     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3532     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3533     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3534     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3535     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3536     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3537     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3538     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3539     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3540     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3541     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3542     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3543     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3544 
3545     // Round 2
3546     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3547     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3548     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3549     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3550     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3551     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3552     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3553     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3554     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3555     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3556     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3557     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3558     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3559     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3560     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3561     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3562 
3563     // Round 3
3564     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3565     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3566     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3567     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3568     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3569     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3570     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3571     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3572     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3573     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3574     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3575     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3576     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3577     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3578     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3579     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3580 
3581     // Round 4
3582     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3583     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3584     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3585     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3586     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3587     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3588     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3589     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3590     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3591     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3592     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3593     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3594     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3595     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3596     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3597     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3598 
3599     __ addw(a, state_regs[0], a);
3600     __ ubfx(rscratch2, state_regs[0], 32, 32);
3601     __ addw(b, rscratch2, b);
3602     __ addw(c, state_regs[1], c);
3603     __ ubfx(rscratch4, state_regs[1], 32, 32);
3604     __ addw(d, rscratch4, d);
3605 
3606     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3607     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3608 
3609     if (multi_block) {
3610       __ add(buf, buf, 64);
3611       __ add(ofs, ofs, 64);
3612       __ cmp(ofs, limit);
3613       __ br(Assembler::LE, md5_loop);
3614       __ mov(c_rarg0, ofs); // return ofs
3615     }
3616 
3617     // write hash values back in the correct order
3618     __ stp(state_regs[0], state_regs[1], Address(state));
3619 
3620     __ pop(saved_regs, sp);
3621 
3622     __ ret(lr);
3623 
3624     return start;
3625   }
3626 
3627   // Arguments:
3628   //
3629   // Inputs:
3630   //   c_rarg0   - byte[]  source+offset
3631   //   c_rarg1   - int[]   SHA.state
3632   //   c_rarg2   - int     offset
3633   //   c_rarg3   - int     limit
3634   //
3635   address generate_sha1_implCompress(bool multi_block, const char *name) {
3636     __ align(CodeEntryAlignment);
3637     StubCodeMark mark(this, "StubRoutines", name);
3638     address start = __ pc();
3639 
3640     Register buf   = c_rarg0;
3641     Register state = c_rarg1;
3642     Register ofs   = c_rarg2;
3643     Register limit = c_rarg3;
3644 
3645     Label keys;
3646     Label sha1_loop;
3647 
3648     // load the keys into v0..v3
3649     __ adr(rscratch1, keys);
3650     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3651     // load 5 words state into v6, v7
3652     __ ldrq(v6, Address(state, 0));
3653     __ ldrs(v7, Address(state, 16));
3654 
3655 
3656     __ BIND(sha1_loop);
3657     // load 64 bytes of data into v16..v19
3658     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3659     __ rev32(v16, __ T16B, v16);
3660     __ rev32(v17, __ T16B, v17);
3661     __ rev32(v18, __ T16B, v18);
3662     __ rev32(v19, __ T16B, v19);
3663 
3664     // do the sha1
3665     __ addv(v4, __ T4S, v16, v0);
3666     __ orr(v20, __ T16B, v6, v6);
3667 
3668     FloatRegister d0 = v16;
3669     FloatRegister d1 = v17;
3670     FloatRegister d2 = v18;
3671     FloatRegister d3 = v19;
3672 
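         // 80 SHA-1 rounds, four per loop iteration: iterations 0-4 use Ch (sha1c),
         // 5-9 and 15-19 use Parity (sha1p), 10-14 use Maj (sha1m).  sha1su0/sha1su1
         // extend the message schedule during the first 16 iterations.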
3673     for (int round = 0; round < 20; round++) {
3674       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3675       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3676       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3677       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3678       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3679 
3680       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3681       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3682       __ sha1h(tmp2, __ T4S, v20);
3683       if (round < 5)
3684         __ sha1c(v20, __ T4S, tmp3, tmp4);
3685       else if (round < 10 || round >= 15)
3686         __ sha1p(v20, __ T4S, tmp3, tmp4);
3687       else
3688         __ sha1m(v20, __ T4S, tmp3, tmp4);
3689       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3690 
3691       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3692     }
3693 
3694     __ addv(v7, __ T2S, v7, v21);
3695     __ addv(v6, __ T4S, v6, v20);
3696 
3697     if (multi_block) {
3698       __ add(ofs, ofs, 64);
3699       __ cmp(ofs, limit);
3700       __ br(Assembler::LE, sha1_loop);
3701       __ mov(c_rarg0, ofs); // return ofs
3702     }
3703 
3704     __ strq(v6, Address(state, 0));
3705     __ strs(v7, Address(state, 16));
3706 
3707     __ ret(lr);
3708 
3709     __ bind(keys);
3710     __ emit_int32(0x5a827999);
3711     __ emit_int32(0x6ed9eba1);
3712     __ emit_int32(0x8f1bbcdc);
3713     __ emit_int32(0xca62c1d6);
3714 
3715     return start;
3716   }
3717 
3718 
3719   // Arguments:
3720   //
3721   // Inputs:
3722   //   c_rarg0   - byte[]  source+offset
3723   //   c_rarg1   - int[]   SHA.state
3724   //   c_rarg2   - int     offset
3725   //   c_rarg3   - int     limit
3726   //
3727   address generate_sha256_implCompress(bool multi_block, const char *name) {
3728     static const uint32_t round_consts[64] = {
3729       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3730       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3731       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3732       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3733       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3734       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3735       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3736       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3737       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3738       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3739       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3740       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3741       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3742       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3743       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3744       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3745     };
3746     __ align(CodeEntryAlignment);
3747     StubCodeMark mark(this, "StubRoutines", name);
3748     address start = __ pc();
3749 
3750     Register buf   = c_rarg0;
3751     Register state = c_rarg1;
3752     Register ofs   = c_rarg2;
3753     Register limit = c_rarg3;
3754 
3755     Label sha256_loop;
3756 
3757     __ stpd(v8, v9, __ pre(sp, -32));
3758     __ stpd(v10, v11, Address(sp, 16));
3759 
3760 // dga == v0
3761 // dgb == v1
3762 // dg0 == v2
3763 // dg1 == v3
3764 // dg2 == v4
3765 // t0 == v6
3766 // t1 == v7
3767 
3768     // load 16 keys to v16..v31
3769     __ lea(rscratch1, ExternalAddress((address)round_consts));
3770     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3771     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3772     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3773     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3774 
3775     // load 8 words (256 bits) state
3776     __ ldpq(v0, v1, state);
3777 
3778     __ BIND(sha256_loop);
3779     // load 64 bytes of data into v8..v11
3780     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3781     __ rev32(v8, __ T16B, v8);
3782     __ rev32(v9, __ T16B, v9);
3783     __ rev32(v10, __ T16B, v10);
3784     __ rev32(v11, __ T16B, v11);
3785 
3786     __ addv(v6, __ T4S, v8, v16);
3787     __ orr(v2, __ T16B, v0, v0);
3788     __ orr(v3, __ T16B, v1, v1);
3789 
3790     FloatRegister d0 = v8;
3791     FloatRegister d1 = v9;
3792     FloatRegister d2 = v10;
3793     FloatRegister d3 = v11;
3794 
3795 
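         // 64 SHA-256 rounds, four per loop iteration, with sha256h/sha256h2
         // updating the two digest halves in v2/v3.  sha256su0/sha256su1 extend the
         // message schedule while round < 12, producing W[16..63].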
3796     for (int round = 0; round < 16; round++) {
3797       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3798       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3799       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3800       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3801 
3802       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3803        __ orr(v4, __ T16B, v2, v2);
3804       if (round < 15)
3805         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3806       __ sha256h(v2, __ T4S, v3, tmp2);
3807       __ sha256h2(v3, __ T4S, v4, tmp2);
3808       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3809 
3810       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3811     }
3812 
3813     __ addv(v0, __ T4S, v0, v2);
3814     __ addv(v1, __ T4S, v1, v3);
3815 
3816     if (multi_block) {
3817       __ add(ofs, ofs, 64);
3818       __ cmp(ofs, limit);
3819       __ br(Assembler::LE, sha256_loop);
3820       __ mov(c_rarg0, ofs); // return ofs
3821     }
3822 
3823     __ ldpd(v10, v11, Address(sp, 16));
3824     __ ldpd(v8, v9, __ post(sp, 32));
3825 
3826     __ stpq(v0, v1, state);
3827 
3828     __ ret(lr);
3829 
3830     return start;
3831   }
3832 
3833   // Double rounds for sha512.
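       // Each call performs two of the 80 SHA-512 rounds (sha512h/sha512h2), loads
       // the next pair of round constants into vrc1 while dr < 36, and extends the
       // message schedule (sha512su0/sha512su1) while dr < 32.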
3834   void sha512_dround(int dr,
3835                      FloatRegister vi0, FloatRegister vi1,
3836                      FloatRegister vi2, FloatRegister vi3,
3837                      FloatRegister vi4, FloatRegister vrc0,
3838                      FloatRegister vrc1, FloatRegister vin0,
3839                      FloatRegister vin1, FloatRegister vin2,
3840                      FloatRegister vin3, FloatRegister vin4) {
3841       if (dr < 36) {
3842         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3843       }
3844       __ addv(v5, __ T2D, vrc0, vin0);
3845       __ ext(v6, __ T16B, vi2, vi3, 8);
3846       __ ext(v5, __ T16B, v5, v5, 8);
3847       __ ext(v7, __ T16B, vi1, vi2, 8);
3848       __ addv(vi3, __ T2D, vi3, v5);
3849       if (dr < 32) {
3850         __ ext(v5, __ T16B, vin3, vin4, 8);
3851         __ sha512su0(vin0, __ T2D, vin1);
3852       }
3853       __ sha512h(vi3, __ T2D, v6, v7);
3854       if (dr < 32) {
3855         __ sha512su1(vin0, __ T2D, vin2, v5);
3856       }
3857       __ addv(vi4, __ T2D, vi1, vi3);
3858       __ sha512h2(vi3, __ T2D, vi1, vi0);
3859   }
3860 
3861   // Arguments:
3862   //
3863   // Inputs:
3864   //   c_rarg0   - byte[]  source+offset
3865   //   c_rarg1   - int[]   SHA.state
3866   //   c_rarg2   - int     offset
3867   //   c_rarg3   - int     limit
3868   //
3869   address generate_sha512_implCompress(bool multi_block, const char *name) {
3870     static const uint64_t round_consts[80] = {
3871       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3872       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3873       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3874       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3875       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3876       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3877       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3878       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3879       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3880       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3881       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3882       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3883       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3884       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3885       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3886       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3887       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3888       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3889       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3890       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3891       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3892       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3893       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3894       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3895       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3896       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3897       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3898     };
3899 
3900     __ align(CodeEntryAlignment);
3901     StubCodeMark mark(this, "StubRoutines", name);
3902     address start = __ pc();
3903 
3904     Register buf   = c_rarg0;
3905     Register state = c_rarg1;
3906     Register ofs   = c_rarg2;
3907     Register limit = c_rarg3;
3908 
3909     __ stpd(v8, v9, __ pre(sp, -64));
3910     __ stpd(v10, v11, Address(sp, 16));
3911     __ stpd(v12, v13, Address(sp, 32));
3912     __ stpd(v14, v15, Address(sp, 48));
3913 
3914     Label sha512_loop;
3915 
3916     // load state
3917     __ ld1(v8, v9, v10, v11, __ T2D, state);
3918 
3919     // load first 4 round constants
3920     __ lea(rscratch1, ExternalAddress((address)round_consts));
3921     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3922 
3923     __ BIND(sha512_loop);
3924     // load 128B of data into v12..v19
3925     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3926     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3927     __ rev64(v12, __ T16B, v12);
3928     __ rev64(v13, __ T16B, v13);
3929     __ rev64(v14, __ T16B, v14);
3930     __ rev64(v15, __ T16B, v15);
3931     __ rev64(v16, __ T16B, v16);
3932     __ rev64(v17, __ T16B, v17);
3933     __ rev64(v18, __ T16B, v18);
3934     __ rev64(v19, __ T16B, v19);
3935 
3936     __ mov(rscratch2, rscratch1);
3937 
3938     __ mov(v0, __ T16B, v8);
3939     __ mov(v1, __ T16B, v9);
3940     __ mov(v2, __ T16B, v10);
3941     __ mov(v3, __ T16B, v11);
3942 
3943     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3944     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3945     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3946     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3947     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3948     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3949     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3950     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3951     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3952     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3953     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3954     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3955     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3956     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3957     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3958     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3959     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3960     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3961     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3962     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3963     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3964     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3965     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3966     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3967     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3968     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3969     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3970     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3971     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3972     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3973     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3974     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3975     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3976     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3977     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3978     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3979     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3980     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3981     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3982     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3983 
3984     __ addv(v8, __ T2D, v8, v0);
3985     __ addv(v9, __ T2D, v9, v1);
3986     __ addv(v10, __ T2D, v10, v2);
3987     __ addv(v11, __ T2D, v11, v3);
3988 
3989     if (multi_block) {
3990       __ add(ofs, ofs, 128);
3991       __ cmp(ofs, limit);
3992       __ br(Assembler::LE, sha512_loop);
3993       __ mov(c_rarg0, ofs); // return ofs
3994     }
3995 
3996     __ st1(v8, v9, v10, v11, __ T2D, state);
3997 
3998     __ ldpd(v14, v15, Address(sp, 48));
3999     __ ldpd(v12, v13, Address(sp, 32));
4000     __ ldpd(v10, v11, Address(sp, 16));
4001     __ ldpd(v8, v9, __ post(sp, 64));
4002 
4003     __ ret(lr);
4004 
4005     return start;
4006   }
4007 
4008   // Arguments:
4009   //
4010   // Inputs:
4011   //   c_rarg0   - byte[]  source+offset
4012   //   c_rarg1   - byte[]  SHA.state
4013   //   c_rarg2   - int     block_size
4014   //   c_rarg3   - int     offset
4015   //   c_rarg4   - int     limit
4016   //
4017   address generate_sha3_implCompress(bool multi_block, const char *name) {
4018     static const uint64_t round_consts[24] = {
4019       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4020       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4021       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4022       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4023       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4024       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4025       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4026       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4027     };
4028 
4029     __ align(CodeEntryAlignment);
4030     StubCodeMark mark(this, "StubRoutines", name);
4031     address start = __ pc();
4032 
4033     Register buf           = c_rarg0;
4034     Register state         = c_rarg1;
4035     Register block_size    = c_rarg2;
4036     Register ofs           = c_rarg3;
4037     Register limit         = c_rarg4;
4038 
4039     Label sha3_loop, rounds24_loop;
4040     Label sha3_512_or_sha3_384, shake128;
4041 
4042     __ stpd(v8, v9, __ pre(sp, -64));
4043     __ stpd(v10, v11, Address(sp, 16));
4044     __ stpd(v12, v13, Address(sp, 32));
4045     __ stpd(v14, v15, Address(sp, 48));
4046 
4047     // load state
4048     __ add(rscratch1, state, 32);
4049     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
4050     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
4051     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4052     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4053     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4054     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4055     __ ld1(v24, __ T1D, rscratch1);
4056 
4057     __ BIND(sha3_loop);
4058 
4059     // 24 keccak rounds
4060     __ movw(rscratch2, 24);
4061 
4062     // load round_constants base
4063     __ lea(rscratch1, ExternalAddress((address) round_consts));
4064 
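         // block_size is the Keccak rate in bytes: 72 (SHA3-512), 104 (SHA3-384),
         // 136 (SHA3-256/SHAKE256), 144 (SHA3-224) or 168 (SHAKE128).  rate/8 state
         // lanes are XORed ("absorbed") with the input block; the bit tests below
         // dispatch on the rate to absorb the right number of lanes.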
4065     // load input
4066     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4067     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4068     __ eor(v0, __ T8B, v0, v25);
4069     __ eor(v1, __ T8B, v1, v26);
4070     __ eor(v2, __ T8B, v2, v27);
4071     __ eor(v3, __ T8B, v3, v28);
4072     __ eor(v4, __ T8B, v4, v29);
4073     __ eor(v5, __ T8B, v5, v30);
4074     __ eor(v6, __ T8B, v6, v31);
4075 
4076     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4077     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4078 
4079     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4080     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4081     __ eor(v7, __ T8B, v7, v25);
4082     __ eor(v8, __ T8B, v8, v26);
4083     __ eor(v9, __ T8B, v9, v27);
4084     __ eor(v10, __ T8B, v10, v28);
4085     __ eor(v11, __ T8B, v11, v29);
4086     __ eor(v12, __ T8B, v12, v30);
4087     __ eor(v13, __ T8B, v13, v31);
4088 
4089     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4090     __ eor(v14, __ T8B, v14, v25);
4091     __ eor(v15, __ T8B, v15, v26);
4092     __ eor(v16, __ T8B, v16, v27);
4093 
4094     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4095     __ andw(c_rarg5, block_size, 48);
4096     __ cbzw(c_rarg5, rounds24_loop);
4097 
4098     __ tbnz(block_size, 5, shake128);
4099     // block_size == 144, bit5 == 0, SHA3-224
4100     __ ldrd(v28, __ post(buf, 8));
4101     __ eor(v17, __ T8B, v17, v28);
4102     __ b(rounds24_loop);
4103 
4104     __ BIND(shake128);
4105     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4106     __ eor(v17, __ T8B, v17, v28);
4107     __ eor(v18, __ T8B, v18, v29);
4108     __ eor(v19, __ T8B, v19, v30);
4109     __ eor(v20, __ T8B, v20, v31);
4110     __ b(rounds24_loop); // block_size == 168, SHAKE128
4111 
4112     __ BIND(sha3_512_or_sha3_384);
4113     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4114     __ eor(v7, __ T8B, v7, v25);
4115     __ eor(v8, __ T8B, v8, v26);
4116     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4117 
4118     // SHA3-384
4119     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4120     __ eor(v9,  __ T8B, v9,  v27);
4121     __ eor(v10, __ T8B, v10, v28);
4122     __ eor(v11, __ T8B, v11, v29);
4123     __ eor(v12, __ T8B, v12, v30);
4124 
4125     __ BIND(rounds24_loop);
4126     __ subw(rscratch2, rscratch2, 1);
4127 
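         // One Keccak-f[1600] round per iteration: eor3/rax1 compute theta (column
         // parities and their rotated XOR), the xar sequence applies theta together
         // with the rho rotations and the pi lane permutation, bcax performs chi,
         // and the final eor with the loaded round constant is iota.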
4128     __ eor3(v29, __ T16B, v4, v9, v14);
4129     __ eor3(v26, __ T16B, v1, v6, v11);
4130     __ eor3(v28, __ T16B, v3, v8, v13);
4131     __ eor3(v25, __ T16B, v0, v5, v10);
4132     __ eor3(v27, __ T16B, v2, v7, v12);
4133     __ eor3(v29, __ T16B, v29, v19, v24);
4134     __ eor3(v26, __ T16B, v26, v16, v21);
4135     __ eor3(v28, __ T16B, v28, v18, v23);
4136     __ eor3(v25, __ T16B, v25, v15, v20);
4137     __ eor3(v27, __ T16B, v27, v17, v22);
4138 
4139     __ rax1(v30, __ T2D, v29, v26);
4140     __ rax1(v26, __ T2D, v26, v28);
4141     __ rax1(v28, __ T2D, v28, v25);
4142     __ rax1(v25, __ T2D, v25, v27);
4143     __ rax1(v27, __ T2D, v27, v29);
4144 
4145     __ eor(v0, __ T16B, v0, v30);
4146     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4147     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4148     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4149     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4150     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4151     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4152     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4153     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4154     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4155     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4156     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4157     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4158     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4159     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4160     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4161     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4162     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4163     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4164     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4165     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4166     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4167     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4168     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4169     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4170 
4171     __ bcax(v20, __ T16B, v31, v22, v8);
4172     __ bcax(v21, __ T16B, v8,  v23, v22);
4173     __ bcax(v22, __ T16B, v22, v24, v23);
4174     __ bcax(v23, __ T16B, v23, v31, v24);
4175     __ bcax(v24, __ T16B, v24, v8,  v31);
4176 
4177     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4178 
4179     __ bcax(v17, __ T16B, v25, v19, v3);
4180     __ bcax(v18, __ T16B, v3,  v15, v19);
4181     __ bcax(v19, __ T16B, v19, v16, v15);
4182     __ bcax(v15, __ T16B, v15, v25, v16);
4183     __ bcax(v16, __ T16B, v16, v3,  v25);
4184 
4185     __ bcax(v10, __ T16B, v29, v12, v26);
4186     __ bcax(v11, __ T16B, v26, v13, v12);
4187     __ bcax(v12, __ T16B, v12, v14, v13);
4188     __ bcax(v13, __ T16B, v13, v29, v14);
4189     __ bcax(v14, __ T16B, v14, v26, v29);
4190 
4191     __ bcax(v7, __ T16B, v30, v9,  v4);
4192     __ bcax(v8, __ T16B, v4,  v5,  v9);
4193     __ bcax(v9, __ T16B, v9,  v6,  v5);
4194     __ bcax(v5, __ T16B, v5,  v30, v6);
4195     __ bcax(v6, __ T16B, v6,  v4,  v30);
4196 
4197     __ bcax(v3, __ T16B, v27, v0,  v28);
4198     __ bcax(v4, __ T16B, v28, v1,  v0);
4199     __ bcax(v0, __ T16B, v0,  v2,  v1);
4200     __ bcax(v1, __ T16B, v1,  v27, v2);
4201     __ bcax(v2, __ T16B, v2,  v28, v27);
4202 
4203     __ eor(v0, __ T16B, v0, v31);
4204 
4205     __ cbnzw(rscratch2, rounds24_loop);
4206 
4207     if (multi_block) {
4208       __ add(ofs, ofs, block_size);
4209       __ cmp(ofs, limit);
4210       __ br(Assembler::LE, sha3_loop);
4211       __ mov(c_rarg0, ofs); // return ofs
4212     }
4213 
4214     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4215     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4216     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4217     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4218     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4219     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4220     __ st1(v24, __ T1D, state);
4221 
4222     __ ldpd(v14, v15, Address(sp, 48));
4223     __ ldpd(v12, v13, Address(sp, 32));
4224     __ ldpd(v10, v11, Address(sp, 16));
4225     __ ldpd(v8, v9, __ post(sp, 64));
4226 
4227     __ ret(lr);
4228 
4229     return start;
4230   }
4231 
4232   /**
4233    *  Arguments:
4234    *
4235    * Inputs:
4236    *   c_rarg0   - int crc
4237    *   c_rarg1   - byte* buf
4238    *   c_rarg2   - int length
4239    *
4240    * Output:
4241    *       r0   - int crc result
4242    */
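       // The kernel computes the standard (zlib) CRC-32 with the reflected polynomial
       // 0xEDB88320, as used by java.util.zip.CRC32. For reference, a minimal (slow)
       // bitwise sketch of the update that the table-driven kernel implements:
       //
       //   uint32_t crc32_update(uint32_t crc, const uint8_t* buf, int len) {
       //     crc = ~crc;
       //     while (len-- > 0) {
       //       crc ^= *buf++;
       //       for (int k = 0; k < 8; k++) {
       //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
       //       }
       //     }
       //     return ~crc;
       //   }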
4243   address generate_updateBytesCRC32() {
4244     assert(UseCRC32Intrinsics, "what are we doing here?");
4245 
4246     __ align(CodeEntryAlignment);
4247     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4248 
4249     address start = __ pc();
4250 
4251     const Register crc   = c_rarg0;  // crc
4252     const Register buf   = c_rarg1;  // source java byte array address
4253     const Register len   = c_rarg2;  // length
4254     const Register table0 = c_rarg3; // crc_table address
4255     const Register table1 = c_rarg4;
4256     const Register table2 = c_rarg5;
4257     const Register table3 = c_rarg6;
4258     const Register tmp3 = c_rarg7;
4259 
4260     BLOCK_COMMENT("Entry:");
4261     __ enter(); // required for proper stackwalking of RuntimeStub frame
4262 
4263     __ kernel_crc32(crc, buf, len,
4264               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4265 
4266     __ leave(); // required for proper stackwalking of RuntimeStub frame
4267     __ ret(lr);
4268 
4269     return start;
4270   }
4271 
4272   // ChaCha20 block function.  This version parallelizes by loading
4273   // individual 32-bit state elements into vectors for four blocks
4274   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4275   //
4276   // state (int[16]) = c_rarg0
4277   // keystream (byte[1024]) = c_rarg1
4278   // return - number of bytes of keystream (always 256)
4279   address generate_chacha20Block_blockpar() {
4280     Label L_twoRounds, L_cc20_const;
4281     // The constant data is broken into two 128-bit segments to be loaded
4282     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4283     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4284     // The second 128 bits are a table constant used for 8-bit left rotations.
4285     __ BIND(L_cc20_const);
4286     __ emit_int64(0x0000000100000000UL);
4287     __ emit_int64(0x0000000300000002UL);
4288     __ emit_int64(0x0605040702010003UL);
4289     __ emit_int64(0x0E0D0C0F0A09080BUL);
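         // The rotation table works because, for little-endian 32-bit lanes, a left
         // rotation by 8 bits is just a byte permutation: destination bytes {0,1,2,3}
         // of each lane take source bytes {3,0,1,2}. For one lane:
         //   rotl32(x, 8) == (x << 8) | (x >> 24)
         // which maps bytes {b0,b1,b2,b3} (b0 = least significant) to {b3,b0,b1,b2}.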
4290 
4291     __ align(CodeEntryAlignment);
4292     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4293     address start = __ pc();
4294     __ enter();
4295 
4296     int i, j;
4297     const Register state = c_rarg0;
4298     const Register keystream = c_rarg1;
4299     const Register loopCtr = r10;
4300     const Register tmpAddr = r11;
4301 
4302     const FloatRegister stateFirst = v0;
4303     const FloatRegister stateSecond = v1;
4304     const FloatRegister stateThird = v2;
4305     const FloatRegister stateFourth = v3;
4306     const FloatRegister origCtrState = v28;
4307     const FloatRegister scratch = v29;
4308     const FloatRegister lrot8Tbl = v30;
4309 
4310     // Organize the SIMD registers in an array that makes it easy to
4311     // put repetitive opcodes into loop structures.  It is important
4312     // that each group of 4 registers consists of consecutively
4313     // numbered registers, as required by the multi-register
4314     // instructions (e.g. ld4r, st4, etc.)
4315     const FloatRegister workSt[16] = {
4316          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4317         v20, v21, v22, v23, v24, v25, v26, v27
4318     };
4319 
4320     // Load from memory and interleave across 16 SIMD registers,
4321     // with each word from memory being broadcast to all lanes of
4322     // each successive SIMD register.
4323     //      Addr(0) -> All lanes in workSt[i]
4324     //      Addr(4) -> All lanes workSt[i + 1], etc.
4325     __ mov(tmpAddr, state);
4326     for (i = 0; i < 16; i += 4) {
4327       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4328           __ post(tmpAddr, 16));
4329     }
4330 
4331     // Pull in constant data.  The first 16 bytes are the add overlay
4332     // which is applied to the vector holding the counter (state[12]).
4333     // The second 16 bytes is the index register for the 8-bit left
4334     // rotation tbl instruction.
4335     __ adr(tmpAddr, L_cc20_const);
4336     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4337     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4338 
4339     // Set up the 10 iteration loop and perform all 8 quarter round ops
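         // Each cc20_quarter_round call applies the ChaCha20 quarter round of
         // RFC 8439 to one (a, b, c, d) tuple of vectors, one block per lane;
         // in scalar form:
         //   a += b;  d ^= a;  d = rotl32(d, 16);
         //   c += d;  b ^= c;  b = rotl32(b, 12);
         //   a += b;  d ^= a;  d = rotl32(d, 8);   // the tbl-based rotation
         //   c += d;  b ^= c;  b = rotl32(b, 7);
         // The first four calls below are the "column" rounds, the next four the
         // "diagonal" rounds; 10 iterations of both give ChaCha20's 20 rounds.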
4340     __ mov(loopCtr, 10);
4341     __ BIND(L_twoRounds);
4342 
4343     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4344         scratch, lrot8Tbl);
4345     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4346         scratch, lrot8Tbl);
4347     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4348         scratch, lrot8Tbl);
4349     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4350         scratch, lrot8Tbl);
4351 
4352     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4353         scratch, lrot8Tbl);
4354     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4355         scratch, lrot8Tbl);
4356     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4357         scratch, lrot8Tbl);
4358     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4359         scratch, lrot8Tbl);
4360 
4361     // Decrement and iterate
4362     __ sub(loopCtr, loopCtr, 1);
4363     __ cbnz(loopCtr, L_twoRounds);
4364 
4365     __ mov(tmpAddr, state);
4366 
4367     // Add the starting state back to the post-loop keystream
4368     // state.  We read/interleave the state array from memory into
4369     // 4 registers, similarly to what we did at the beginning.  Then
4370     // add the counter overlay onto workSt[12] at the end.
4371     for (i = 0; i < 16; i += 4) {
4372       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4373           __ post(tmpAddr, 16));
4374       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4375       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4376       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4377       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4378     }
4379     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4380 
4381     // Write to key stream, storing the same element out of workSt[0..15]
4382     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4383     // for the next element position.
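         // In effect, lane i of the work registers is block i of the keystream, so
         // the resulting layout is four consecutive 64-byte ChaCha20 blocks:
         //   keystream[i * 64 + j * 4 .. i * 64 + j * 4 + 3] = lane i of workSt[j]
         // for i in 0..3 and j in 0..15 (little-endian 32-bit words).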
4384     for (i = 0; i < 4; i++) {
4385       for (j = 0; j < 16; j += 4) {
4386         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4387             __ post(keystream, 16));
4388       }
4389     }
4390 
4391     __ mov(r0, 256);             // Return length of output keystream
4392     __ leave();
4393     __ ret(lr);
4394 
4395     return start;
4396   }
4397 
4398   /**
4399    *  Arguments:
4400    *
4401    * Inputs:
4402    *   c_rarg0   - int crc
4403    *   c_rarg1   - byte* buf
4404    *   c_rarg2   - int length
4405    *   c_rarg3   - int* table
4406    *
4407    * Output:
4408    *       r0   - int crc result
4409    */
4410   address generate_updateBytesCRC32C() {
4411     assert(UseCRC32CIntrinsics, "what are we doing here?");
4412 
4413     __ align(CodeEntryAlignment);
4414     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4415 
4416     address start = __ pc();
4417 
4418     const Register crc   = c_rarg0;  // crc
4419     const Register buf   = c_rarg1;  // source java byte array address
4420     const Register len   = c_rarg2;  // length
4421     const Register table0 = c_rarg3; // crc_table address
4422     const Register table1 = c_rarg4;
4423     const Register table2 = c_rarg5;
4424     const Register table3 = c_rarg6;
4425     const Register tmp3 = c_rarg7;
4426 
4427     BLOCK_COMMENT("Entry:");
4428     __ enter(); // required for proper stackwalking of RuntimeStub frame
4429 
4430     __ kernel_crc32c(crc, buf, len,
4431               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4432 
4433     __ leave(); // required for proper stackwalking of RuntimeStub frame
4434     __ ret(lr);
4435 
4436     return start;
4437   }
4438 
4439   /***
4440    *  Arguments:
4441    *
4442    *  Inputs:
4443    *   c_rarg0   - int   adler
4444    *   c_rarg1   - byte* buff
4445    *   c_rarg2   - int   len
4446    *
4447    * Output:
4448    *   c_rarg0   - int adler result
4449    */
4450   address generate_updateBytesAdler32() {
4451     __ align(CodeEntryAlignment);
4452     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4453     address start = __ pc();
4454 
4455     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4456 
4457     // Aliases
4458     Register adler  = c_rarg0;
4459     Register s1     = c_rarg0;
4460     Register s2     = c_rarg3;
4461     Register buff   = c_rarg1;
4462     Register len    = c_rarg2;
4463     Register nmax  = r4;
4464     Register base  = r5;
4465     Register count = r6;
4466     Register temp0 = rscratch1;
4467     Register temp1 = rscratch2;
4468     FloatRegister vbytes = v0;
4469     FloatRegister vs1acc = v1;
4470     FloatRegister vs2acc = v2;
4471     FloatRegister vtable = v3;
4472 
4473     // Max number of bytes we can process before having to take the mod
4474     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4475     uint64_t BASE = 0xfff1;
4476     uint64_t NMAX = 0x15B0;
4477 
4478     __ mov(base, BASE);
4479     __ mov(nmax, NMAX);
4480 
4481     // Load accumulation coefficients for the upper 16 bits
4482     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4483     __ ld1(vtable, __ T16B, Address(temp0));
4484 
4485     // s1 is initialized to the lower 16 bits of adler
4486     // s2 is initialized to the upper 16 bits of adler
4487     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4488     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4489 
4490     // The pipelined loop needs at least 16 elements for 1 iteration.
4491     // It does check this itself, but it is more efficient to skip straight to the cleanup loop.
4492     __ cmp(len, (u1)16);
4493     __ br(Assembler::HS, L_nmax);
4494     __ cbz(len, L_combine);
4495 
4496     __ bind(L_simple_by1_loop);
4497     __ ldrb(temp0, Address(__ post(buff, 1)));
4498     __ add(s1, s1, temp0);
4499     __ add(s2, s2, s1);
4500     __ subs(len, len, 1);
4501     __ br(Assembler::HI, L_simple_by1_loop);
4502 
4503     // s1 = s1 % BASE
4504     __ subs(temp0, s1, base);
4505     __ csel(s1, temp0, s1, Assembler::HS);
4506 
4507     // s2 = s2 % BASE
4508     __ lsr(temp0, s2, 16);
4509     __ lsl(temp1, temp0, 4);
4510     __ sub(temp1, temp1, temp0);
4511     __ add(s2, temp1, s2, ext::uxth);
4512 
4513     __ subs(temp0, s2, base);
4514     __ csel(s2, temp0, s2, Assembler::HS);
4515 
4516     __ b(L_combine);
4517 
4518     __ bind(L_nmax);
4519     __ subs(len, len, nmax);
4520     __ sub(count, nmax, 16);
4521     __ br(Assembler::LO, L_by16);
4522 
4523     __ bind(L_nmax_loop);
4524 
4525     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4526                                       vbytes, vs1acc, vs2acc, vtable);
4527 
4528     __ subs(count, count, 16);
4529     __ br(Assembler::HS, L_nmax_loop);
4530 
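         // Reduction trick used here and in the tail code below: 2^16 mod 65521 == 15,
         // so folding the high half into the low half preserves the value mod BASE
         // while shrinking it:
         //   s = (s >> 16) * 15 + (s & 0xffff);   // the lsl #4 / sub pair computes *15
         // Two foldings bring the value below 2 * BASE, so a single conditional
         // subtract of BASE completes the reduction.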
4531     // s1 = s1 % BASE
4532     __ lsr(temp0, s1, 16);
4533     __ lsl(temp1, temp0, 4);
4534     __ sub(temp1, temp1, temp0);
4535     __ add(temp1, temp1, s1, ext::uxth);
4536 
4537     __ lsr(temp0, temp1, 16);
4538     __ lsl(s1, temp0, 4);
4539     __ sub(s1, s1, temp0);
4540     __ add(s1, s1, temp1, ext::uxth);
4541 
4542     __ subs(temp0, s1, base);
4543     __ csel(s1, temp0, s1, Assembler::HS);
4544 
4545     // s2 = s2 % BASE
4546     __ lsr(temp0, s2, 16);
4547     __ lsl(temp1, temp0, 4);
4548     __ sub(temp1, temp1, temp0);
4549     __ add(temp1, temp1, s2, ext::uxth);
4550 
4551     __ lsr(temp0, temp1, 16);
4552     __ lsl(s2, temp0, 4);
4553     __ sub(s2, s2, temp0);
4554     __ add(s2, s2, temp1, ext::uxth);
4555 
4556     __ subs(temp0, s2, base);
4557     __ csel(s2, temp0, s2, Assembler::HS);
4558 
4559     __ subs(len, len, nmax);
4560     __ sub(count, nmax, 16);
4561     __ br(Assembler::HS, L_nmax_loop);
4562 
4563     __ bind(L_by16);
4564     __ adds(len, len, count);
4565     __ br(Assembler::LO, L_by1);
4566 
4567     __ bind(L_by16_loop);
4568 
4569     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4570                                       vbytes, vs1acc, vs2acc, vtable);
4571 
4572     __ subs(len, len, 16);
4573     __ br(Assembler::HS, L_by16_loop);
4574 
4575     __ bind(L_by1);
4576     __ adds(len, len, 15);
4577     __ br(Assembler::LO, L_do_mod);
4578 
4579     __ bind(L_by1_loop);
4580     __ ldrb(temp0, Address(__ post(buff, 1)));
4581     __ add(s1, temp0, s1);
4582     __ add(s2, s2, s1);
4583     __ subs(len, len, 1);
4584     __ br(Assembler::HS, L_by1_loop);
4585 
4586     __ bind(L_do_mod);
4587     // s1 = s1 % BASE
4588     __ lsr(temp0, s1, 16);
4589     __ lsl(temp1, temp0, 4);
4590     __ sub(temp1, temp1, temp0);
4591     __ add(temp1, temp1, s1, ext::uxth);
4592 
4593     __ lsr(temp0, temp1, 16);
4594     __ lsl(s1, temp0, 4);
4595     __ sub(s1, s1, temp0);
4596     __ add(s1, s1, temp1, ext::uxth);
4597 
4598     __ subs(temp0, s1, base);
4599     __ csel(s1, temp0, s1, Assembler::HS);
4600 
4601     // s2 = s2 % BASE
4602     __ lsr(temp0, s2, 16);
4603     __ lsl(temp1, temp0, 4);
4604     __ sub(temp1, temp1, temp0);
4605     __ add(temp1, temp1, s2, ext::uxth);
4606 
4607     __ lsr(temp0, temp1, 16);
4608     __ lsl(s2, temp0, 4);
4609     __ sub(s2, s2, temp0);
4610     __ add(s2, s2, temp1, ext::uxth);
4611 
4612     __ subs(temp0, s2, base);
4613     __ csel(s2, temp0, s2, Assembler::HS);
4614 
4615     // Combine lower bits and higher bits
4616     __ bind(L_combine);
4617     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4618 
4619     __ ret(lr);
4620 
4621     return start;
4622   }
4623 
4624   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4625           Register temp0, Register temp1, FloatRegister vbytes,
4626           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4627     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4628     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4629     // In non-vectorized code, we update s1 and s2 as:
4630     //   s1 <- s1 + b1
4631     //   s2 <- s2 + s1
4632     //   s1 <- s1 + b2
4633     //   s2 <- s2 + s1
4634     //   ...
4635     //   s1 <- s1 + b16
4636     //   s2 <- s2 + s1
4637     // Putting above assignments together, we have:
4638     //   s1_new = s1 + b1 + b2 + ... + b16
4639     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4640     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4641     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
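         // In scalar form the 16-byte step below is equivalent to (reference sketch):
         //   for (int i = 0; i < 16; i++) {
         //     s1 += buff[i];          // bytes treated as unsigned
         //     s2 += s1;
         //   }
         // where the dot product (16, 15, ..., 1) . (b1, ..., b16) is what the
         // umullv/umlalv pair against vtable computes and uaddlv then sums.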
4642     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4643 
4644     // s2 = s2 + s1 * 16
4645     __ add(s2, s2, s1, Assembler::LSL, 4);
4646 
4647     // vs1acc = b1 + b2 + b3 + ... + b16
4648     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4649     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4650     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4651     __ uaddlv(vs1acc, __ T16B, vbytes);
4652     __ uaddlv(vs2acc, __ T8H, vs2acc);
4653 
4654     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4655     __ fmovd(temp0, vs1acc);
4656     __ fmovd(temp1, vs2acc);
4657     __ add(s1, s1, temp0);
4658     __ add(s2, s2, temp1);
4659   }
4660 
4661   /**
4662    *  Arguments:
4663    *
4664    *  Input:
4665    *    c_rarg0   - x address
4666    *    c_rarg1   - x length
4667    *    c_rarg2   - y address
4668    *    c_rarg3   - y length
4669    *    c_rarg4   - z address
4670    */
4671   address generate_multiplyToLen() {
4672     __ align(CodeEntryAlignment);
4673     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4674 
4675     address start = __ pc();
4676     const Register x     = r0;
4677     const Register xlen  = r1;
4678     const Register y     = r2;
4679     const Register ylen  = r3;
4680     const Register z     = r4;
4681 
4682     const Register tmp0  = r5;
4683     const Register tmp1  = r10;
4684     const Register tmp2  = r11;
4685     const Register tmp3  = r12;
4686     const Register tmp4  = r13;
4687     const Register tmp5  = r14;
4688     const Register tmp6  = r15;
4689     const Register tmp7  = r16;
4690 
4691     BLOCK_COMMENT("Entry:");
4692     __ enter(); // required for proper stackwalking of RuntimeStub frame
4693     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4694     __ leave(); // required for proper stackwalking of RuntimeStub frame
4695     __ ret(lr);
4696 
4697     return start;
4698   }
4699 
4700   address generate_squareToLen() {
4701     // The squareToLen algorithm for sizes 1..127 described in the Java code works
4702     // faster than multiply_to_len on some CPUs and slower on others, but
4703     // multiply_to_len shows slightly better results overall.
4704     __ align(CodeEntryAlignment);
4705     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4706     address start = __ pc();
4707 
4708     const Register x     = r0;
4709     const Register xlen  = r1;
4710     const Register z     = r2;
4711     const Register y     = r4; // == x
4712     const Register ylen  = r5; // == xlen
4713 
4714     const Register tmp0  = r3;
4715     const Register tmp1  = r10;
4716     const Register tmp2  = r11;
4717     const Register tmp3  = r12;
4718     const Register tmp4  = r13;
4719     const Register tmp5  = r14;
4720     const Register tmp6  = r15;
4721     const Register tmp7  = r16;
4722 
4723     RegSet spilled_regs = RegSet::of(y, ylen);
4724     BLOCK_COMMENT("Entry:");
4725     __ enter();
4726     __ push(spilled_regs, sp);
4727     __ mov(y, x);
4728     __ mov(ylen, xlen);
4729     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4730     __ pop(spilled_regs, sp);
4731     __ leave();
4732     __ ret(lr);
4733     return start;
4734   }
4735 
4736   address generate_mulAdd() {
4737     __ align(CodeEntryAlignment);
4738     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4739 
4740     address start = __ pc();
4741 
4742     const Register out     = r0;
4743     const Register in      = r1;
4744     const Register offset  = r2;
4745     const Register len     = r3;
4746     const Register k       = r4;
4747 
4748     BLOCK_COMMENT("Entry:");
4749     __ enter();
4750     __ mul_add(out, in, offset, len, k);
4751     __ leave();
4752     __ ret(lr);
4753 
4754     return start;
4755   }
4756 
4757   // Arguments:
4758   //
4759   // Input:
4760   //   c_rarg0   - newArr address
4761   //   c_rarg1   - oldArr address
4762   //   c_rarg2   - newIdx
4763   //   c_rarg3   - shiftCount
4764   //   c_rarg4   - numIter
4765   //
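       // In scalar form the stub computes, treating the arrays as unsigned 32-bit
       // words (sketch, for 0 < shiftCount < 32):
       //
       //   for (int i = 0; i < numIter; i++) {
       //     newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)
       //                        | (oldArr[i] << (32 - shiftCount));
       //   }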
4766   address generate_bigIntegerRightShift() {
4767     __ align(CodeEntryAlignment);
4768     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4769     address start = __ pc();
4770 
4771     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4772 
4773     Register newArr        = c_rarg0;
4774     Register oldArr        = c_rarg1;
4775     Register newIdx        = c_rarg2;
4776     Register shiftCount    = c_rarg3;
4777     Register numIter       = c_rarg4;
4778     Register idx           = numIter;
4779 
4780     Register newArrCur     = rscratch1;
4781     Register shiftRevCount = rscratch2;
4782     Register oldArrCur     = r13;
4783     Register oldArrNext    = r14;
4784 
4785     FloatRegister oldElem0        = v0;
4786     FloatRegister oldElem1        = v1;
4787     FloatRegister newElem         = v2;
4788     FloatRegister shiftVCount     = v3;
4789     FloatRegister shiftVRevCount  = v4;
4790 
4791     __ cbz(idx, Exit);
4792 
4793     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4794 
4795     // left shift count
4796     __ movw(shiftRevCount, 32);
4797     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4798 
4799     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar path
4800     __ cmp(numIter, (u1)4);
4801     __ br(Assembler::LT, ShiftThree);
4802 
4803     __ dup(shiftVCount,    __ T4S, shiftCount);
4804     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4805     __ negr(shiftVCount,   __ T4S, shiftVCount);
4806 
4807     __ BIND(ShiftSIMDLoop);
4808 
4809     // Calculate the load addresses
4810     __ sub(idx, idx, 4);
4811     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4812     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4813     __ add(oldArrCur,  oldArrNext, 4);
4814 
4815     // Load 4 words and process
4816     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4817     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4818     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4819     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4820     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4821     __ st1(newElem,   __ T4S,  Address(newArrCur));
4822 
4823     __ cmp(idx, (u1)4);
4824     __ br(Assembler::LT, ShiftTwoLoop);
4825     __ b(ShiftSIMDLoop);
4826 
4827     __ BIND(ShiftTwoLoop);
4828     __ cbz(idx, Exit);
4829     __ cmp(idx, (u1)1);
4830     __ br(Assembler::EQ, ShiftOne);
4831 
4832     // Calculate the load addresses
4833     __ sub(idx, idx, 2);
4834     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4835     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4836     __ add(oldArrCur,  oldArrNext, 4);
4837 
4838     // Load 2 words and process
4839     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4840     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4841     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4842     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4843     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4844     __ st1(newElem,   __ T2S, Address(newArrCur));
4845     __ b(ShiftTwoLoop);
4846 
4847     __ BIND(ShiftThree);
4848     __ tbz(idx, 1, ShiftOne);
4849     __ tbz(idx, 0, ShiftTwo);
4850     __ ldrw(r10,  Address(oldArr, 12));
4851     __ ldrw(r11,  Address(oldArr, 8));
4852     __ lsrvw(r10, r10, shiftCount);
4853     __ lslvw(r11, r11, shiftRevCount);
4854     __ orrw(r12,  r10, r11);
4855     __ strw(r12,  Address(newArr, 8));
4856 
4857     __ BIND(ShiftTwo);
4858     __ ldrw(r10,  Address(oldArr, 8));
4859     __ ldrw(r11,  Address(oldArr, 4));
4860     __ lsrvw(r10, r10, shiftCount);
4861     __ lslvw(r11, r11, shiftRevCount);
4862     __ orrw(r12,  r10, r11);
4863     __ strw(r12,  Address(newArr, 4));
4864 
4865     __ BIND(ShiftOne);
4866     __ ldrw(r10,  Address(oldArr, 4));
4867     __ ldrw(r11,  Address(oldArr));
4868     __ lsrvw(r10, r10, shiftCount);
4869     __ lslvw(r11, r11, shiftRevCount);
4870     __ orrw(r12,  r10, r11);
4871     __ strw(r12,  Address(newArr));
4872 
4873     __ BIND(Exit);
4874     __ ret(lr);
4875 
4876     return start;
4877   }
4878 
4879   // Arguments:
4880   //
4881   // Input:
4882   //   c_rarg0   - newArr address
4883   //   c_rarg1   - oldArr address
4884   //   c_rarg2   - newIdx
4885   //   c_rarg3   - shiftCount
4886   //   c_rarg4   - numIter
4887   //
4888   address generate_bigIntegerLeftShift() {
4889     __ align(CodeEntryAlignment);
4890     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4891     address start = __ pc();
4892 
4893     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4894 
4895     Register newArr        = c_rarg0;
4896     Register oldArr        = c_rarg1;
4897     Register newIdx        = c_rarg2;
4898     Register shiftCount    = c_rarg3;
4899     Register numIter       = c_rarg4;
4900 
4901     Register shiftRevCount = rscratch1;
4902     Register oldArrNext    = rscratch2;
4903 
4904     FloatRegister oldElem0        = v0;
4905     FloatRegister oldElem1        = v1;
4906     FloatRegister newElem         = v2;
4907     FloatRegister shiftVCount     = v3;
4908     FloatRegister shiftVRevCount  = v4;
4909 
4910     __ cbz(numIter, Exit);
4911 
4912     __ add(oldArrNext, oldArr, 4);
4913     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4914 
4915     // right shift count
4916     __ movw(shiftRevCount, 32);
4917     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4918 
4919     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar path
4920     __ cmp(numIter, (u1)4);
4921     __ br(Assembler::LT, ShiftThree);
4922 
4923     __ dup(shiftVCount,     __ T4S, shiftCount);
4924     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4925     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4926 
4927     __ BIND(ShiftSIMDLoop);
4928 
4929     // load 4 words and process
4930     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4931     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4932     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4933     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4934     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4935     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4936     __ sub(numIter,   numIter, 4);
4937 
4938     __ cmp(numIter, (u1)4);
4939     __ br(Assembler::LT, ShiftTwoLoop);
4940     __ b(ShiftSIMDLoop);
4941 
4942     __ BIND(ShiftTwoLoop);
4943     __ cbz(numIter, Exit);
4944     __ cmp(numIter, (u1)1);
4945     __ br(Assembler::EQ, ShiftOne);
4946 
4947     // load 2 words and process
4948     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4949     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4950     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4951     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4952     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4953     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4954     __ sub(numIter,   numIter, 2);
4955     __ b(ShiftTwoLoop);
4956 
4957     __ BIND(ShiftThree);
4958     __ ldrw(r10,  __ post(oldArr, 4));
4959     __ ldrw(r11,  __ post(oldArrNext, 4));
4960     __ lslvw(r10, r10, shiftCount);
4961     __ lsrvw(r11, r11, shiftRevCount);
4962     __ orrw(r12,  r10, r11);
4963     __ strw(r12,  __ post(newArr, 4));
4964     __ tbz(numIter, 1, Exit);
4965     __ tbz(numIter, 0, ShiftOne);
4966 
4967     __ BIND(ShiftTwo);
4968     __ ldrw(r10,  __ post(oldArr, 4));
4969     __ ldrw(r11,  __ post(oldArrNext, 4));
4970     __ lslvw(r10, r10, shiftCount);
4971     __ lsrvw(r11, r11, shiftRevCount);
4972     __ orrw(r12,  r10, r11);
4973     __ strw(r12,  __ post(newArr, 4));
4974 
4975     __ BIND(ShiftOne);
4976     __ ldrw(r10,  Address(oldArr));
4977     __ ldrw(r11,  Address(oldArrNext));
4978     __ lslvw(r10, r10, shiftCount);
4979     __ lsrvw(r11, r11, shiftRevCount);
4980     __ orrw(r12,  r10, r11);
4981     __ strw(r12,  Address(newArr));
4982 
4983     __ BIND(Exit);
4984     __ ret(lr);
4985 
4986     return start;
4987   }
4988 
4989   address generate_count_positives(address &count_positives_long) {
4990     const u1 large_loop_size = 64;
4991     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4992     int dcache_line = VM_Version::dcache_line_size();
4993 
4994     Register ary1 = r1, len = r2, result = r0;
4995 
4996     __ align(CodeEntryAlignment);
4997 
4998     StubCodeMark mark(this, "StubRoutines", "count_positives");
4999 
5000     address entry = __ pc();
5001 
5002     __ enter();
5003     // precondition: a copy of len is already in result
5004     // __ mov(result, len);
5005 
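       // A byte is negative iff its sign bit is set, so the checks below OR 8-byte
       // chunks together and test the result against UPPER_BIT_MASK; per chunk this
       // amounts to:
       //   bool any_negative = (chunk & 0x8080808080808080ULL) != 0;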
5006   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
5007         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
5008 
5009   __ cmp(len, (u1)15);
5010   __ br(Assembler::GT, LEN_OVER_15);
5011   // The only case when execution falls into this code is when the pointer is near
5012   // the end of a memory page and we have to avoid reading the next page.
5013   __ add(ary1, ary1, len);
5014   __ subs(len, len, 8);
5015   __ br(Assembler::GT, LEN_OVER_8);
5016   __ ldr(rscratch2, Address(ary1, -8));
5017   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
5018   __ lsrv(rscratch2, rscratch2, rscratch1);
5019   __ tst(rscratch2, UPPER_BIT_MASK);
5020   __ csel(result, zr, result, Assembler::NE);
5021   __ leave();
5022   __ ret(lr);
5023   __ bind(LEN_OVER_8);
5024   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
5025   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
5026   __ tst(rscratch2, UPPER_BIT_MASK);
5027   __ br(Assembler::NE, RET_NO_POP);
5028   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
5029   __ lsrv(rscratch1, rscratch1, rscratch2);
5030   __ tst(rscratch1, UPPER_BIT_MASK);
5031   __ bind(RET_NO_POP);
5032   __ csel(result, zr, result, Assembler::NE);
5033   __ leave();
5034   __ ret(lr);
5035 
5036   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
5037   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
5038 
5039   count_positives_long = __ pc(); // 2nd entry point
5040 
5041   __ enter();
5042 
5043   __ bind(LEN_OVER_15);
5044     __ push(spilled_regs, sp);
5045     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
5046     __ cbz(rscratch2, ALIGNED);
5047     __ ldp(tmp6, tmp1, Address(ary1));
5048     __ mov(tmp5, 16);
5049     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
5050     __ add(ary1, ary1, rscratch1);
5051     __ orr(tmp6, tmp6, tmp1);
5052     __ tst(tmp6, UPPER_BIT_MASK);
5053     __ br(Assembler::NE, RET_ADJUST);
5054     __ sub(len, len, rscratch1);
5055 
5056   __ bind(ALIGNED);
5057     __ cmp(len, large_loop_size);
5058     __ br(Assembler::LT, CHECK_16);
5059     // Perform a 16-byte load as an early-return check in the pre-loop to handle the
5060     // situation where an initially aligned large array has negative values in its
5061     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 (in the
5062     // worst case), which is slower. Cases with negative bytes further ahead are not
5063     // affected much. In fact, they become faster due to the early loads, fewer
5064     // instructions and fewer branches in LARGE_LOOP.
5065     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5066     __ sub(len, len, 16);
5067     __ orr(tmp6, tmp6, tmp1);
5068     __ tst(tmp6, UPPER_BIT_MASK);
5069     __ br(Assembler::NE, RET_ADJUST_16);
5070     __ cmp(len, large_loop_size);
5071     __ br(Assembler::LT, CHECK_16);
5072 
5073     if (SoftwarePrefetchHintDistance >= 0
5074         && SoftwarePrefetchHintDistance >= dcache_line) {
5075       // initial prefetch
5076       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5077     }
5078   __ bind(LARGE_LOOP);
5079     if (SoftwarePrefetchHintDistance >= 0) {
5080       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5081     }
5082     // Issue the load instructions first, since that can save a few CPU/memory cycles.
5083     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp),
5084     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves
5085     // 3 instructions and has fewer branches, but this approach disables the early
5086     // return, so all 64 bytes are loaded and checked every time.
5087     __ ldp(tmp2, tmp3, Address(ary1));
5088     __ ldp(tmp4, tmp5, Address(ary1, 16));
5089     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5090     __ ldp(tmp6, tmp1, Address(ary1, 48));
5091     __ add(ary1, ary1, large_loop_size);
5092     __ sub(len, len, large_loop_size);
5093     __ orr(tmp2, tmp2, tmp3);
5094     __ orr(tmp4, tmp4, tmp5);
5095     __ orr(rscratch1, rscratch1, rscratch2);
5096     __ orr(tmp6, tmp6, tmp1);
5097     __ orr(tmp2, tmp2, tmp4);
5098     __ orr(rscratch1, rscratch1, tmp6);
5099     __ orr(tmp2, tmp2, rscratch1);
5100     __ tst(tmp2, UPPER_BIT_MASK);
5101     __ br(Assembler::NE, RET_ADJUST_LONG);
5102     __ cmp(len, large_loop_size);
5103     __ br(Assembler::GE, LARGE_LOOP);
5104 
5105   __ bind(CHECK_16); // small 16-byte load pre-loop
5106     __ cmp(len, (u1)16);
5107     __ br(Assembler::LT, POST_LOOP16);
5108 
5109   __ bind(LOOP16); // small 16-byte load loop
5110     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5111     __ sub(len, len, 16);
5112     __ orr(tmp2, tmp2, tmp3);
5113     __ tst(tmp2, UPPER_BIT_MASK);
5114     __ br(Assembler::NE, RET_ADJUST_16);
5115     __ cmp(len, (u1)16);
5116     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5117 
5118   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5119     __ cmp(len, (u1)8);
5120     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5121     __ ldr(tmp3, Address(__ post(ary1, 8)));
5122     __ tst(tmp3, UPPER_BIT_MASK);
5123     __ br(Assembler::NE, RET_ADJUST);
5124     __ sub(len, len, 8);
5125 
5126   __ bind(POST_LOOP16_LOAD_TAIL);
5127     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5128     __ ldr(tmp1, Address(ary1));
5129     __ mov(tmp2, 64);
5130     __ sub(tmp4, tmp2, len, __ LSL, 3);
5131     __ lslv(tmp1, tmp1, tmp4);
5132     __ tst(tmp1, UPPER_BIT_MASK);
5133     __ br(Assembler::NE, RET_ADJUST);
5134     // Fallthrough
5135 
5136   __ bind(RET_LEN);
5137     __ pop(spilled_regs, sp);
5138     __ leave();
5139     __ ret(lr);
5140 
5141     // The difference (result - len) is the count of bytes guaranteed
5142     // to be positive.
5143 
5144   __ bind(RET_ADJUST_LONG);
5145     __ add(len, len, (u1)(large_loop_size - 16));
5146   __ bind(RET_ADJUST_16);
5147     __ add(len, len, 16);
5148   __ bind(RET_ADJUST);
5149     __ pop(spilled_regs, sp);
5150     __ leave();
5151     __ sub(result, result, len);
5152     __ ret(lr);
5153 
5154     return entry;
5155   }
5156 
5157   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5158         bool usePrefetch, Label &NOT_EQUAL) {
5159     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5160         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5161         tmp7 = r12, tmp8 = r13;
5162     Label LOOP;
5163 
5164     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5165     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5166     __ bind(LOOP);
5167     if (usePrefetch) {
5168       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5169       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5170     }
5171     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5172     __ eor(tmp1, tmp1, tmp2);
5173     __ eor(tmp3, tmp3, tmp4);
5174     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5175     __ orr(tmp1, tmp1, tmp3);
5176     __ cbnz(tmp1, NOT_EQUAL);
5177     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5178     __ eor(tmp5, tmp5, tmp6);
5179     __ eor(tmp7, tmp7, tmp8);
5180     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5181     __ orr(tmp5, tmp5, tmp7);
5182     __ cbnz(tmp5, NOT_EQUAL);
5183     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5184     __ eor(tmp1, tmp1, tmp2);
5185     __ eor(tmp3, tmp3, tmp4);
5186     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5187     __ orr(tmp1, tmp1, tmp3);
5188     __ cbnz(tmp1, NOT_EQUAL);
5189     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5190     __ eor(tmp5, tmp5, tmp6);
5191     __ sub(cnt1, cnt1, 8 * wordSize);
5192     __ eor(tmp7, tmp7, tmp8);
5193     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5194     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5195     // cmp) because subs allows an unlimited range of immediate operand.
5196     __ subs(tmp6, cnt1, loopThreshold);
5197     __ orr(tmp5, tmp5, tmp7);
5198     __ cbnz(tmp5, NOT_EQUAL);
5199     __ br(__ GE, LOOP);
5200     // post-loop
5201     __ eor(tmp1, tmp1, tmp2);
5202     __ eor(tmp3, tmp3, tmp4);
5203     __ orr(tmp1, tmp1, tmp3);
5204     __ sub(cnt1, cnt1, 2 * wordSize);
5205     __ cbnz(tmp1, NOT_EQUAL);
5206   }
5207 
5208   void generate_large_array_equals_loop_simd(int loopThreshold,
5209         bool usePrefetch, Label &NOT_EQUAL) {
5210     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5211         tmp2 = rscratch2;
5212     Label LOOP;
5213 
5214     __ bind(LOOP);
5215     if (usePrefetch) {
5216       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5217       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5218     }
5219     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5220     __ sub(cnt1, cnt1, 8 * wordSize);
5221     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5222     __ subs(tmp1, cnt1, loopThreshold);
5223     __ eor(v0, __ T16B, v0, v4);
5224     __ eor(v1, __ T16B, v1, v5);
5225     __ eor(v2, __ T16B, v2, v6);
5226     __ eor(v3, __ T16B, v3, v7);
5227     __ orr(v0, __ T16B, v0, v1);
5228     __ orr(v1, __ T16B, v2, v3);
5229     __ orr(v0, __ T16B, v0, v1);
5230     __ umov(tmp1, v0, __ D, 0);
5231     __ umov(tmp2, v0, __ D, 1);
5232     __ orr(tmp1, tmp1, tmp2);
5233     __ cbnz(tmp1, NOT_EQUAL);
5234     __ br(__ GE, LOOP);
5235   }
5236 
5237   // a1 = r1 - array1 address
5238   // a2 = r2 - array2 address
5239   // result = r0 - return value. Already contains "false"
5240   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
5241   // r3-r5 are reserved temporary registers
5242   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
5243   address generate_large_array_equals() {
5244     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5245         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5246         tmp7 = r12, tmp8 = r13;
5247     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5248         SMALL_LOOP, POST_LOOP;
5249     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5250     // prefetch loop threshold: stay in it only while at least 32 prefetched bytes will be used
5251     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5252     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5253     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5254     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5255         tmp5, tmp6, tmp7, tmp8);
5256 
5257     __ align(CodeEntryAlignment);
5258 
5259     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5260 
5261     address entry = __ pc();
5262     __ enter();
5263     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5264     // also advance pointers to use post-increment instead of pre-increment
5265     __ add(a1, a1, wordSize);
5266     __ add(a2, a2, wordSize);
5267     if (AvoidUnalignedAccesses) {
5268       // Both implementations (SIMD/non-SIMD) use relatively large load
5269       // instructions (ld1/ldp), which carry a heavy penalty (up to 2x execution
5270       // time) on some CPUs when the address is not at least 16-byte aligned.
5271       // Arrays are currently 8-byte aligned, so do an additional 8-byte load
5272       // if needed to make at least the 1st address 16-byte aligned.
5273       Label ALIGNED16;
5274       __ tbz(a1, 3, ALIGNED16);
5275       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5276       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5277       __ sub(cnt1, cnt1, wordSize);
5278       __ eor(tmp1, tmp1, tmp2);
5279       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5280       __ bind(ALIGNED16);
5281     }
5282     if (UseSIMDForArrayEquals) {
5283       if (SoftwarePrefetchHintDistance >= 0) {
5284         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5285         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5286         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5287             /* prfm = */ true, NOT_EQUAL);
5288         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5289         __ br(__ LT, TAIL);
5290       }
5291       __ bind(NO_PREFETCH_LARGE_LOOP);
5292       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5293           /* prfm = */ false, NOT_EQUAL);
5294     } else {
5295       __ push(spilled_regs, sp);
5296       if (SoftwarePrefetchHintDistance >= 0) {
5297         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5298         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5299         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5300             /* prfm = */ true, NOT_EQUAL);
5301         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5302         __ br(__ LT, TAIL);
5303       }
5304       __ bind(NO_PREFETCH_LARGE_LOOP);
5305       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5306           /* prfm = */ false, NOT_EQUAL);
5307     }
5308     __ bind(TAIL);
5309       __ cbz(cnt1, EQUAL);
5310       __ subs(cnt1, cnt1, wordSize);
5311       __ br(__ LE, POST_LOOP);
5312     __ bind(SMALL_LOOP);
5313       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5314       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5315       __ subs(cnt1, cnt1, wordSize);
5316       __ eor(tmp1, tmp1, tmp2);
5317       __ cbnz(tmp1, NOT_EQUAL);
5318       __ br(__ GT, SMALL_LOOP);
5319     __ bind(POST_LOOP);
5320       __ ldr(tmp1, Address(a1, cnt1));
5321       __ ldr(tmp2, Address(a2, cnt1));
5322       __ eor(tmp1, tmp1, tmp2);
5323       __ cbnz(tmp1, NOT_EQUAL);
5324     __ bind(EQUAL);
5325       __ mov(result, true);
5326     __ bind(NOT_EQUAL);
5327       if (!UseSIMDForArrayEquals) {
5328         __ pop(spilled_regs, sp);
5329       }
5330     __ bind(NOT_EQUAL_NO_POP);
5331     __ leave();
5332     __ ret(lr);
5333     return entry;
5334   }
5335 
5336   address generate_dsin_dcos(bool isCos) {
5337     __ align(CodeEntryAlignment);
5338     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5339     address start = __ pc();
5340     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5341         (address)StubRoutines::aarch64::_two_over_pi,
5342         (address)StubRoutines::aarch64::_pio2,
5343         (address)StubRoutines::aarch64::_dsin_coef,
5344         (address)StubRoutines::aarch64::_dcos_coef);
5345     return start;
5346   }
5347 
5348   // code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
5349   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5350       Label &DIFF2) {
5351     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5352     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5353 
5354     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5355     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5356     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5357     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
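         // zip1/zip2 against the zero register perform the Latin1 -> UTF-16 widening:
         // interleaving bytes {a0, a1, ...} with zero bytes gives {a0, 0, a1, 0, ...},
         // which read as little-endian 16-bit chars is {a0, a1, ...}.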
5358 
5359     __ fmovd(tmpL, vtmp3);
5360     __ eor(rscratch2, tmp3, tmpL);
5361     __ cbnz(rscratch2, DIFF2);
5362 
5363     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5364     __ umov(tmpL, vtmp3, __ D, 1);
5365     __ eor(rscratch2, tmpU, tmpL);
5366     __ cbnz(rscratch2, DIFF1);
5367 
5368     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5369     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5370     __ fmovd(tmpL, vtmp);
5371     __ eor(rscratch2, tmp3, tmpL);
5372     __ cbnz(rscratch2, DIFF2);
5373 
5374     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5375     __ umov(tmpL, vtmp, __ D, 1);
5376     __ eor(rscratch2, tmpU, tmpL);
5377     __ cbnz(rscratch2, DIFF1);
5378   }
5379 
5380   // r0  = result
5381   // r1  = str1
5382   // r2  = cnt1
5383   // r3  = str2
5384   // r4  = cnt2
5385   // r10 = tmp1
5386   // r11 = tmp2
5387   address generate_compare_long_string_different_encoding(bool isLU) {
5388     __ align(CodeEntryAlignment);
5389     StubCodeMark mark(this, "StubRoutines", isLU
5390         ? "compare_long_string_different_encoding LU"
5391         : "compare_long_string_different_encoding UL");
5392     address entry = __ pc();
5393     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5394         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5395         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5396     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5397         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5398     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5399     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5400 
5401     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5402 
5403     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5404     // cnt2 == amount of characters left to compare
5405     // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
5406     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5407     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5408     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5409     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5410     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5411     __ eor(rscratch2, tmp1, tmp2);
5412     __ mov(rscratch1, tmp2);
5413     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5414     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5415              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5416     __ push(spilled_regs, sp);
5417     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5418     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5419 
5420     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5421 
5422     if (SoftwarePrefetchHintDistance >= 0) {
5423       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5424       __ br(__ LT, NO_PREFETCH);
5425       __ bind(LARGE_LOOP_PREFETCH);
5426         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5427         __ mov(tmp4, 2);
5428         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5429         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5430           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5431           __ subs(tmp4, tmp4, 1);
5432           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5433           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5434           __ mov(tmp4, 2);
5435         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5436           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5437           __ subs(tmp4, tmp4, 1);
5438           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5439           __ sub(cnt2, cnt2, 64);
5440           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5441           __ br(__ GE, LARGE_LOOP_PREFETCH);
5442     }
5443     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5444     __ bind(NO_PREFETCH);
5445     __ subs(cnt2, cnt2, 16);
5446     __ br(__ LT, TAIL);
5447     __ align(OptoLoopAlignment);
5448     __ bind(SMALL_LOOP); // smaller loop
5449       __ subs(cnt2, cnt2, 16);
5450       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5451       __ br(__ GE, SMALL_LOOP);
5452       __ cmn(cnt2, (u1)16);
5453       __ br(__ EQ, LOAD_LAST);
5454     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5455       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5456       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5457       __ ldr(tmp3, Address(cnt1, -8));
5458       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5459       __ b(LOAD_LAST);
5460     __ bind(DIFF2);
5461       __ mov(tmpU, tmp3);
5462     __ bind(DIFF1);
5463       __ pop(spilled_regs, sp);
5464       __ b(CALCULATE_DIFFERENCE);
5465     __ bind(LOAD_LAST);
5466       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5467       // No need to load them again.
5468       __ mov(tmpU, tmp3);
5469       __ pop(spilled_regs, sp);
5470 
5471       // tmp2 points to the address of the last 4 Latin1 characters right now
5472       __ ldrs(vtmp, Address(tmp2));
5473       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5474       __ fmovd(tmpL, vtmp);
5475 
5476       __ eor(rscratch2, tmpU, tmpL);
5477       __ cbz(rscratch2, DONE);
5478 
5479     // Find the first different characters in the longwords and
5480     // compute their difference.
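         // rscratch2 holds the XOR of the two mismatching longwords; rev + clz gives
         // the bit index, counted from the least significant end, of its first
         // non-zero byte, and masking with -16 rounds that down to a 16-bit character
         // boundary. Roughly:
         //   idx    = clz(rev(diff)) & -16;   // bit offset of the first differing char
         //   result = (uint16_t)(a >> idx) - (uint16_t)(b >> idx);
         // where a and b are the original char-packed longwords being compared.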
5481     __ bind(CALCULATE_DIFFERENCE);
5482       __ rev(rscratch2, rscratch2);
5483       __ clz(rscratch2, rscratch2);
5484       __ andr(rscratch2, rscratch2, -16);
5485       __ lsrv(tmp1, tmp1, rscratch2);
5486       __ uxthw(tmp1, tmp1);
5487       __ lsrv(rscratch1, rscratch1, rscratch2);
5488       __ uxthw(rscratch1, rscratch1);
5489       __ subw(result, tmp1, rscratch1);
5490     __ bind(DONE);
5491       __ ret(lr);
5492     return entry;
5493   }
5494 
5495   // r0 = input (float16)
5496   // v0 = result (float)
5497   // v1 = temporary float register
5498   address generate_float16ToFloat() {
5499     __ align(CodeEntryAlignment);
5500     StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
5501     address entry = __ pc();
5502     BLOCK_COMMENT("Entry:");
5503     __ flt16_to_flt(v0, r0, v1);
5504     __ ret(lr);
5505     return entry;
5506   }
5507 
5508   // v0 = input (float)
5509   // r0 = result (float16)
5510   // v1 = temporary float register
5511   address generate_floatToFloat16() {
5512     __ align(CodeEntryAlignment);
5513     StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
5514     address entry = __ pc();
5515     BLOCK_COMMENT("Entry:");
5516     __ flt_to_flt16(r0, v0, v1);
5517     __ ret(lr);
5518     return entry;
5519   }
5520 
5521   address generate_method_entry_barrier() {
5522     __ align(CodeEntryAlignment);
5523     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5524 
5525     Label deoptimize_label;
5526 
5527     address start = __ pc();
5528 
5529     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5530 
5531     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5532       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5533       // We can get here despite the nmethod being good, if we have not
5534       // yet applied our cross modification fence (or data fence).
5535       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5536       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5537       __ ldrw(rscratch2, rscratch2);
5538       __ strw(rscratch2, thread_epoch_addr);
5539       __ isb();
5540       __ membar(__ LoadLoad);
5541     }
5542 
5543     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5544 
5545     __ enter();
5546     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5547 
5548     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5549 
5550     __ push_call_clobbered_registers();
5551 
5552     __ mov(c_rarg0, rscratch2);
5553     __ call_VM_leaf
5554          (CAST_FROM_FN_PTR
5555           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5556 
5557     __ reset_last_Java_frame(true);
5558 
5559     __ mov(rscratch1, r0);
5560 
5561     __ pop_call_clobbered_registers();
5562 
5563     __ cbnz(rscratch1, deoptimize_label);
5564 
5565     __ leave();
5566     __ ret(lr);
5567 
5568     __ BIND(deoptimize_label);
5569 
5570     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5571     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5572 
5573     __ mov(sp, rscratch1);
5574     __ br(rscratch2);
5575 
5576     return start;
5577   }
5578 
5579   // r0  = result
5580   // r1  = str1
5581   // r2  = cnt1
5582   // r3  = str2
5583   // r4  = cnt2
5584   // r10 = tmp1
5585   // r11 = tmp2
5586   address generate_compare_long_string_same_encoding(bool isLL) {
5587     __ align(CodeEntryAlignment);
5588     StubCodeMark mark(this, "StubRoutines", isLL
5589         ? "compare_long_string_same_encoding LL"
5590         : "compare_long_string_same_encoding UU");
5591     address entry = __ pc();
5592     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5593         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5594 
5595     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5596 
5597     // exit the large loop when fewer than 64 bytes are left to read or we are about
5598     // to prefetch memory past the end of the array
5599     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
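    // For example, with a (hypothetical) SoftwarePrefetchHintDistance of 192,
    // the large loop is exited once fewer than 192 bytes (LL) or 96 chars (UU)
    // remain, so the prfm of str + SoftwarePrefetchHintDistance below never
    // reaches past the end of the arrays.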
5600 
    // the caller has already pre-loaded the first 8 bytes of each string, so
    // compare them directly
5602     __ eor(rscratch2, tmp1, tmp2);
5603     __ cbnz(rscratch2, CAL_DIFFERENCE);
5604 
5605     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5606     // update pointers, because of previous read
5607     __ add(str1, str1, wordSize);
5608     __ add(str2, str2, wordSize);
5609     if (SoftwarePrefetchHintDistance >= 0) {
5610       __ align(OptoLoopAlignment);
5611       __ bind(LARGE_LOOP_PREFETCH);
5612         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5613         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5614 
5615         for (int i = 0; i < 4; i++) {
5616           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5617           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5618           __ cmp(tmp1, tmp2);
5619           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5620           __ br(Assembler::NE, DIFF);
5621         }
5622         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5623         __ add(str1, str1, 64);
5624         __ add(str2, str2, 64);
5625         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5626         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5627         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5628     }
5629 
5630     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5631     __ br(Assembler::LE, LESS16);
5632     __ align(OptoLoopAlignment);
5633     __ bind(LOOP_COMPARE16);
5634       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5635       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5636       __ cmp(tmp1, tmp2);
5637       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5638       __ br(Assembler::NE, DIFF);
5639       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5640       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5641       __ br(Assembler::LT, LESS16);
5642 
5643       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5644       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5645       __ cmp(tmp1, tmp2);
5646       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5647       __ br(Assembler::NE, DIFF);
5648       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5649       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5650       __ br(Assembler::GE, LOOP_COMPARE16);
5651       __ cbz(cnt2, LENGTH_DIFF);
5652 
5653     __ bind(LESS16);
      // compare one more 8-byte chunk, if enough characters remain
5655       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5656       __ br(Assembler::LE, LESS8);
5657       __ ldr(tmp1, Address(__ post(str1, 8)));
5658       __ ldr(tmp2, Address(__ post(str2, 8)));
5659       __ eor(rscratch2, tmp1, tmp2);
5660       __ cbnz(rscratch2, CAL_DIFFERENCE);
5661       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5662 
5663     __ bind(LESS8); // directly load last 8 bytes
5664       if (!isLL) {
5665         __ add(cnt2, cnt2, cnt2);
5666       }
5667       __ ldr(tmp1, Address(str1, cnt2));
5668       __ ldr(tmp2, Address(str2, cnt2));
5669       __ eor(rscratch2, tmp1, tmp2);
5670       __ cbz(rscratch2, LENGTH_DIFF);
5671       __ b(CAL_DIFFERENCE);
5672 
5673     __ bind(DIFF);
5674       __ cmp(tmp1, tmp2);
5675       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5676       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5677       // reuse rscratch2 register for the result of eor instruction
5678       __ eor(rscratch2, tmp1, tmp2);
5679 
5680     __ bind(CAL_DIFFERENCE);
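      // rscratch2 holds tmp1 ^ tmp2 and is known to be non-zero here. Since the
      // chunks were loaded little-endian, the first differing character sits in
      // the lowest-order non-zero byte of the xor: rev + clz convert that into a
      // bit offset from the low end, and-ing with -8 (LL) or -16 (UU) rounds the
      // offset down to a character boundary, and the shifted, zero-extended
      // characters are then subtracted to form the result.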
5681       __ rev(rscratch2, rscratch2);
5682       __ clz(rscratch2, rscratch2);
5683       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5684       __ lsrv(tmp1, tmp1, rscratch2);
5685       __ lsrv(tmp2, tmp2, rscratch2);
5686       if (isLL) {
5687         __ uxtbw(tmp1, tmp1);
5688         __ uxtbw(tmp2, tmp2);
5689       } else {
5690         __ uxthw(tmp1, tmp1);
5691         __ uxthw(tmp2, tmp2);
5692       }
5693       __ subw(result, tmp1, tmp2);
5694 
5695     __ bind(LENGTH_DIFF);
5696       __ ret(lr);
5697     return entry;
5698   }
5699 
5700   enum string_compare_mode {
5701     LL,
5702     LU,
5703     UL,
5704     UU,
5705   };
5706 
5707   // The following registers are declared in aarch64.ad
5708   // r0  = result
5709   // r1  = str1
5710   // r2  = cnt1
5711   // r3  = str2
5712   // r4  = cnt2
5713   // r10 = tmp1
5714   // r11 = tmp2
5715   // z0  = ztmp1
5716   // z1  = ztmp2
5717   // p0  = pgtmp1
5718   // p1  = pgtmp2
5719   address generate_compare_long_string_sve(string_compare_mode mode) {
5720     __ align(CodeEntryAlignment);
5721     address entry = __ pc();
5722     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5723              tmp1 = r10, tmp2 = r11;
5724 
5725     Label LOOP, DONE, MISMATCH;
5726     Register vec_len = tmp1;
5727     Register idx = tmp2;
5728     // The minimum of the string lengths has been stored in cnt2.
5729     Register cnt = cnt2;
5730     FloatRegister ztmp1 = z0, ztmp2 = z1;
5731     PRegister pgtmp1 = p0, pgtmp2 = p1;
5732 
5733 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5734     switch (mode) {                                                            \
5735       case LL:                                                                 \
5736         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5737         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5738         break;                                                                 \
5739       case LU:                                                                 \
5740         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5741         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5742         break;                                                                 \
5743       case UL:                                                                 \
5744         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5745         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5746         break;                                                                 \
5747       case UU:                                                                 \
5748         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5749         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5750         break;                                                                 \
5751       default:                                                                 \
5752         ShouldNotReachHere();                                                  \
5753     }
5754 
5755     const char* stubname;
5756     switch (mode) {
5757       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5758       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5759       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5760       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5761       default: ShouldNotReachHere();
5762     }
5763 
5764     StubCodeMark mark(this, "StubRoutines", stubname);
5765 
5766     __ mov(idx, 0);
5767     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5768 
5769     if (mode == LL) {
5770       __ sve_cntb(vec_len);
5771     } else {
5772       __ sve_cnth(vec_len);
5773     }
5774 
5775     __ sub(rscratch1, cnt, vec_len);
5776 
5777     __ bind(LOOP);
5778 
5779       // main loop
5780       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5781       __ add(idx, idx, vec_len);
5782       // Compare strings.
5783       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5784       __ br(__ NE, MISMATCH);
5785       __ cmp(idx, rscratch1);
5786       __ br(__ LT, LOOP);
5787 
5788     // post loop, last iteration
5789     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5790 
5791     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5792     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5793     __ br(__ EQ, DONE);
5794 
5795     __ bind(MISMATCH);
5796 
5797     // Crop the vector to find its location.
5798     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5799     // Extract the first different characters of each string.
5800     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5801     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5802 
5803     // Compute the difference of the first different characters.
5804     __ sub(result, rscratch1, rscratch2);
5805 
5806     __ bind(DONE);
5807     __ ret(lr);
5808 #undef LOAD_PAIR
5809     return entry;
5810   }
5811 
5812   void generate_compare_long_strings() {
5813     if (UseSVE == 0) {
5814       StubRoutines::aarch64::_compare_long_string_LL
5815           = generate_compare_long_string_same_encoding(true);
5816       StubRoutines::aarch64::_compare_long_string_UU
5817           = generate_compare_long_string_same_encoding(false);
5818       StubRoutines::aarch64::_compare_long_string_LU
5819           = generate_compare_long_string_different_encoding(true);
5820       StubRoutines::aarch64::_compare_long_string_UL
5821           = generate_compare_long_string_different_encoding(false);
5822     } else {
5823       StubRoutines::aarch64::_compare_long_string_LL
5824           = generate_compare_long_string_sve(LL);
5825       StubRoutines::aarch64::_compare_long_string_UU
5826           = generate_compare_long_string_sve(UU);
5827       StubRoutines::aarch64::_compare_long_string_LU
5828           = generate_compare_long_string_sve(LU);
5829       StubRoutines::aarch64::_compare_long_string_UL
5830           = generate_compare_long_string_sve(UL);
5831     }
5832   }
5833 
5834   // R0 = result
5835   // R1 = str2
5836   // R2 = cnt1
5837   // R3 = str1
5838   // R4 = cnt2
5839   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
5840   //
  // This generic linear code uses a few additional ideas which make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since
  // length >= 8) in order to skip the initial load (helps on systems with a
  // single load pipeline)
  // 2) we can use a "fast" algorithm to find the first occurrence of a single
  // character with fewer branches (1 branch per loaded register instead of a
  // branch per symbol); this is where constants like 0x0101...01,
  // 0x00010001...0001, 0x7f7f...7f and 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be reused to search for every occurrence of the 1st character, saving
  // a few loads compared with a simpler-but-slower implementation
  // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
  // re-initializes and compresses register values, which makes the code larger
  // and a bit less readable; however, most of the extra operations are issued
  // during loads or branches, so the penalty is minimal
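  //
  // For reference, the "fast" single-character search in (2) is the classic
  // zero-byte detection bit trick. An illustrative C sketch of what the
  // eor/sub/orr/bics sequence computes for the LL case (not code that is
  // generated or executed by the stub):
  //
  //   uint64_t x    = chunk ^ (first_char * 0x0101010101010101ULL);
  //   uint64_t sub  = x - 0x0101010101010101ULL;     // zero bytes of x underflow
  //   uint64_t mask = ~(x | 0x7f7f7f7f7f7f7f7fULL);  // high bit of each byte of ~x
  //   bool hit      = (sub & mask) != 0;  // true iff some byte of x is zero, i.e.
  //                                       // some byte of chunk equals first_char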
5855   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5856     const char* stubName = str1_isL
5857         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5858         : "indexof_linear_uu";
5859     __ align(CodeEntryAlignment);
5860     StubCodeMark mark(this, "StubRoutines", stubName);
5861     address entry = __ pc();
5862 
5863     int str1_chr_size = str1_isL ? 1 : 2;
5864     int str2_chr_size = str2_isL ? 1 : 2;
5865     int str1_chr_shift = str1_isL ? 0 : 1;
5866     int str2_chr_shift = str2_isL ? 0 : 1;
5867     bool isL = str1_isL && str2_isL;
    // parameters
5869     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5870     // temporary registers
5871     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5872     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5873     // redefinitions
5874     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5875 
5876     __ push(spilled_regs, sp);
5877     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5878         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5879         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5880         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5881         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5882         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5883     // Read whole register from str1. It is safe, because length >=8 here
5884     __ ldr(ch1, Address(str1));
5885     // Read whole register from str2. It is safe, because length >=8 here
5886     __ ldr(ch2, Address(str2));
5887     __ sub(cnt2, cnt2, cnt1);
5888     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5889     if (str1_isL != str2_isL) {
5890       __ eor(v0, __ T16B, v0, v0);
5891     }
5892     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5893     __ mul(first, first, tmp1);
    // check if we have less than one register's worth of characters left to check
5895     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5896     if (str1_isL != str2_isL) {
5897       __ fmovd(v1, ch1);
5898     }
5899     __ br(__ LE, L_SMALL);
5900     __ eor(ch2, first, ch2);
5901     if (str1_isL != str2_isL) {
5902       __ zip1(v1, __ T16B, v1, v0);
5903     }
5904     __ sub(tmp2, ch2, tmp1);
5905     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5906     __ bics(tmp2, tmp2, ch2);
5907     if (str1_isL != str2_isL) {
5908       __ fmovd(ch1, v1);
5909     }
5910     __ br(__ NE, L_HAS_ZERO);
5911     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5912     __ add(result, result, wordSize/str2_chr_size);
5913     __ add(str2, str2, wordSize);
5914     __ br(__ LT, L_POST_LOOP);
5915     __ BIND(L_LOOP);
5916       __ ldr(ch2, Address(str2));
5917       __ eor(ch2, first, ch2);
5918       __ sub(tmp2, ch2, tmp1);
5919       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5920       __ bics(tmp2, tmp2, ch2);
5921       __ br(__ NE, L_HAS_ZERO);
5922     __ BIND(L_LOOP_PROCEED);
5923       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5924       __ add(str2, str2, wordSize);
5925       __ add(result, result, wordSize/str2_chr_size);
5926       __ br(__ GE, L_LOOP);
5927     __ BIND(L_POST_LOOP);
5928       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5929       __ br(__ LE, NOMATCH);
5930       __ ldr(ch2, Address(str2));
5931       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5932       __ eor(ch2, first, ch2);
5933       __ sub(tmp2, ch2, tmp1);
5934       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5935       __ mov(tmp4, -1); // all bits set
5936       __ b(L_SMALL_PROCEED);
5937     __ align(OptoLoopAlignment);
5938     __ BIND(L_SMALL);
5939       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5940       __ eor(ch2, first, ch2);
5941       if (str1_isL != str2_isL) {
5942         __ zip1(v1, __ T16B, v1, v0);
5943       }
5944       __ sub(tmp2, ch2, tmp1);
5945       __ mov(tmp4, -1); // all bits set
5946       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5947       if (str1_isL != str2_isL) {
5948         __ fmovd(ch1, v1); // move converted 4 symbols
5949       }
5950     __ BIND(L_SMALL_PROCEED);
5951       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5952       __ bic(tmp2, tmp2, ch2);
5953       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5954       __ rbit(tmp2, tmp2);
5955       __ br(__ EQ, NOMATCH);
5956     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5957       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
5958       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5959       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5960       if (str2_isL) { // LL
5961         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5962         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5963         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5964         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5965         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5966       } else {
5967         __ mov(ch2, 0xE); // all bits in byte set except last one
5968         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5969         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5970         __ lslv(tmp2, tmp2, tmp4);
5971         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5972         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5973         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5974         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5975       }
5976       __ cmp(ch1, ch2);
5977       __ mov(tmp4, wordSize/str2_chr_size);
5978       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5979     __ BIND(L_SMALL_CMP_LOOP);
5980       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5981                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5982       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5983                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5984       __ add(tmp4, tmp4, 1);
5985       __ cmp(tmp4, cnt1);
5986       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5987       __ cmp(first, ch2);
5988       __ br(__ EQ, L_SMALL_CMP_LOOP);
5989     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5990       __ cbz(tmp2, NOMATCH); // no more matches. exit
5991       __ clz(tmp4, tmp2);
5992       __ add(result, result, 1); // advance index
5993       __ add(str2, str2, str2_chr_size); // advance pointer
5994       __ b(L_SMALL_HAS_ZERO_LOOP);
5995     __ align(OptoLoopAlignment);
5996     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5997       __ cmp(first, ch2);
5998       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5999       __ b(DONE);
6000     __ align(OptoLoopAlignment);
6001     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
6002       if (str2_isL) { // LL
6003         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
6004         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
6005         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
6006         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6007         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6008       } else {
6009         __ mov(ch2, 0xE); // all bits in byte set except last one
6010         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6011         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6012         __ lslv(tmp2, tmp2, tmp4);
6013         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6014         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6015         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6016         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6017       }
6018       __ cmp(ch1, ch2);
6019       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6020       __ b(DONE);
6021     __ align(OptoLoopAlignment);
6022     __ BIND(L_HAS_ZERO);
6023       __ rbit(tmp2, tmp2);
6024       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
      // Now, compress the two counters (cnt2 and cnt1) into one register. This
      // is fine because both counters are 32-bit and are not changed in this
      // loop; they are restored on exit, so cnt1 can be re-used in this loop.
6028       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
6029       __ sub(result, result, 1);
6030     __ BIND(L_HAS_ZERO_LOOP);
6031       __ mov(cnt1, wordSize/str2_chr_size);
6032       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6033       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
6034       if (str2_isL) {
6035         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6036         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6037         __ lslv(tmp2, tmp2, tmp4);
6038         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6039         __ add(tmp4, tmp4, 1);
6040         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6041         __ lsl(tmp2, tmp2, 1);
6042         __ mov(tmp4, wordSize/str2_chr_size);
6043       } else {
6044         __ mov(ch2, 0xE);
6045         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6046         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6047         __ lslv(tmp2, tmp2, tmp4);
6048         __ add(tmp4, tmp4, 1);
6049         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6050         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6051         __ lsl(tmp2, tmp2, 1);
6052         __ mov(tmp4, wordSize/str2_chr_size);
6053         __ sub(str2, str2, str2_chr_size);
6054       }
6055       __ cmp(ch1, ch2);
6056       __ mov(tmp4, wordSize/str2_chr_size);
6057       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6058     __ BIND(L_CMP_LOOP);
6059       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6060                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6061       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6062                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6063       __ add(tmp4, tmp4, 1);
6064       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6065       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
6066       __ cmp(cnt1, ch2);
6067       __ br(__ EQ, L_CMP_LOOP);
6068     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
6070       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6071       __ clz(tmp4, tmp2);
6072       __ add(str2, str2, str2_chr_size); // advance pointer
6073       __ b(L_HAS_ZERO_LOOP);
6074     __ align(OptoLoopAlignment);
6075     __ BIND(L_CMP_LOOP_LAST_CMP);
6076       __ cmp(cnt1, ch2);
6077       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6078       __ b(DONE);
6079     __ align(OptoLoopAlignment);
6080     __ BIND(L_CMP_LOOP_LAST_CMP2);
6081       if (str2_isL) {
6082         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6083         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6084         __ lslv(tmp2, tmp2, tmp4);
6085         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6086         __ add(tmp4, tmp4, 1);
6087         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6088         __ lsl(tmp2, tmp2, 1);
6089       } else {
6090         __ mov(ch2, 0xE);
6091         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6092         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6093         __ lslv(tmp2, tmp2, tmp4);
6094         __ add(tmp4, tmp4, 1);
6095         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6096         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6097         __ lsl(tmp2, tmp2, 1);
6098         __ sub(str2, str2, str2_chr_size);
6099       }
6100       __ cmp(ch1, ch2);
6101       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6102       __ b(DONE);
6103     __ align(OptoLoopAlignment);
6104     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was a multiple of
      // wordSize/str2_chr_size until the L_HAS_ZERO block. One byte octet was
      // analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1 and the higher bits were not changed.
      // L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can just reset the lower bits of result here:
      // clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
      // index of the last analyzed substring inside the current octet, so str2
      // is at the respective start address; advance it to the next octet.
6115       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6116       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6117       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6118       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
6119       __ movw(cnt2, cnt2);
6120       __ b(L_LOOP_PROCEED);
6121     __ align(OptoLoopAlignment);
6122     __ BIND(NOMATCH);
6123       __ mov(result, -1);
6124     __ BIND(DONE);
6125       __ pop(spilled_regs, sp);
6126       __ ret(lr);
6127     return entry;
6128   }
6129 
6130   void generate_string_indexof_stubs() {
6131     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6132     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6133     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6134   }
6135 
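  // Inflate 32 Latin-1 bytes held in src1 and src2 to 32 UTF-16 chars and
  // store them at r1. zip1/zip2 interleave each source byte with a zero byte
  // taken from v0 (zero on entry to the stub, see the register comments
  // below), which is exactly the little-endian Latin-1 -> UTF-16 widening;
  // the four results are then written with a single 64-byte st1 store.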
6136   void inflate_and_store_2_fp_registers(bool generatePrfm,
6137       FloatRegister src1, FloatRegister src2) {
6138     Register dst = r1;
6139     __ zip1(v1, __ T16B, src1, v0);
6140     __ zip2(v2, __ T16B, src1, v0);
6141     if (generatePrfm) {
6142       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6143     }
6144     __ zip1(v3, __ T16B, src2, v0);
6145     __ zip2(v4, __ T16B, src2, v0);
6146     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6147   }
6148 
6149   // R0 = src
6150   // R1 = dst
6151   // R2 = len
6152   // R3 = len >> 3
6153   // V0 = 0
6154   // v1 = loaded 8 bytes
6155   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6156   address generate_large_byte_array_inflate() {
6157     __ align(CodeEntryAlignment);
6158     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
6159     address entry = __ pc();
6160     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6161     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6162     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6163 
    // do one more 8-byte read so that the address is 16-byte aligned in most
    // cases, and so that a single store instruction can be used
6166     __ ldrd(v2, __ post(src, 8));
6167     __ sub(octetCounter, octetCounter, 2);
6168     __ zip1(v1, __ T16B, v1, v0);
6169     __ zip1(v2, __ T16B, v2, v0);
6170     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6171     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6172     __ subs(rscratch1, octetCounter, large_loop_threshold);
6173     __ br(__ LE, LOOP_START);
6174     __ b(LOOP_PRFM_START);
6175     __ bind(LOOP_PRFM);
6176       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6177     __ bind(LOOP_PRFM_START);
6178       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6179       __ sub(octetCounter, octetCounter, 8);
6180       __ subs(rscratch1, octetCounter, large_loop_threshold);
6181       inflate_and_store_2_fp_registers(true, v3, v4);
6182       inflate_and_store_2_fp_registers(true, v5, v6);
6183       __ br(__ GT, LOOP_PRFM);
6184       __ cmp(octetCounter, (u1)8);
6185       __ br(__ LT, DONE);
6186     __ bind(LOOP);
6187       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6188       __ bind(LOOP_START);
6189       __ sub(octetCounter, octetCounter, 8);
6190       __ cmp(octetCounter, (u1)8);
6191       inflate_and_store_2_fp_registers(false, v3, v4);
6192       inflate_and_store_2_fp_registers(false, v5, v6);
6193       __ br(__ GE, LOOP);
6194     __ bind(DONE);
6195       __ ret(lr);
6196     return entry;
6197   }
6198 
6199   /**
6200    *  Arguments:
6201    *
6202    *  Input:
6203    *  c_rarg0   - current state address
6204    *  c_rarg1   - H key address
6205    *  c_rarg2   - data address
6206    *  c_rarg3   - number of blocks
6207    *
6208    *  Output:
6209    *  Updated state at c_rarg0
6210    */
6211   address generate_ghash_processBlocks() {
6212     // Bafflingly, GCM uses little-endian for the byte order, but
6213     // big-endian for the bit order.  For example, the polynomial 1 is
6214     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6215     //
6216     // So, we must either reverse the bytes in each word and do
6217     // everything big-endian or reverse the bits in each byte and do
6218     // it little-endian.  On AArch64 it's more idiomatic to reverse
6219     // the bits in each byte (we have an instruction, RBIT, to do
6220     // that) and keep the data in little-endian bit order through the
6221     // calculation, bit-reversing the inputs and outputs.
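    //
    // For example, vector RBIT reverses the bits within each byte, mapping
    // 0x80 <-> 0x01, so the GCM encoding of the polynomial 1 (80 00 .. 00)
    // becomes the ordinary little-endian integer 1 after bit-reversal.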
6222 
6223     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6224     __ align(wordSize * 2);
6225     address p = __ pc();
6226     __ emit_int64(0x87);  // The low-order bits of the field
6227                           // polynomial (i.e. p = z^7+z^2+z+1)
6228                           // repeated in the low and high parts of a
6229                           // 128-bit vector
6230     __ emit_int64(0x87);
6231 
6232     __ align(CodeEntryAlignment);
6233     address start = __ pc();
6234 
6235     Register state   = c_rarg0;
6236     Register subkeyH = c_rarg1;
6237     Register data    = c_rarg2;
6238     Register blocks  = c_rarg3;
6239 
6240     FloatRegister vzr = v30;
6241     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6242 
6243     __ ldrq(v24, p);    // The field polynomial
6244 
6245     __ ldrq(v0, Address(state));
6246     __ ldrq(v1, Address(subkeyH));
6247 
6248     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6249     __ rbit(v0, __ T16B, v0);
6250     __ rev64(v1, __ T16B, v1);
6251     __ rbit(v1, __ T16B, v1);
6252 
6253     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6254     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
6255 
6256     {
6257       Label L_ghash_loop;
6258       __ bind(L_ghash_loop);
6259 
6260       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6261                                                  // reversing each byte
6262       __ rbit(v2, __ T16B, v2);
6263       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6264 
6265       // Multiply state in v2 by subkey in v1
6266       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6267                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6268                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6269       // Reduce v7:v5 by the field polynomial
6270       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6271 
6272       __ sub(blocks, blocks, 1);
6273       __ cbnz(blocks, L_ghash_loop);
6274     }
6275 
6276     // The bit-reversed result is at this point in v0
6277     __ rev64(v0, __ T16B, v0);
6278     __ rbit(v0, __ T16B, v0);
6279 
6280     __ st1(v0, __ T16B, state);
6281     __ ret(lr);
6282 
6283     return start;
6284   }
6285 
6286   address generate_ghash_processBlocks_wide() {
6287     address small = generate_ghash_processBlocks();
6288 
6289     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6290     __ align(wordSize * 2);
6291     address p = __ pc();
6292     __ emit_int64(0x87);  // The low-order bits of the field
6293                           // polynomial (i.e. p = z^7+z^2+z+1)
6294                           // repeated in the low and high parts of a
6295                           // 128-bit vector
6296     __ emit_int64(0x87);
6297 
6298     __ align(CodeEntryAlignment);
6299     address start = __ pc();
6300 
6301     Register state   = c_rarg0;
6302     Register subkeyH = c_rarg1;
6303     Register data    = c_rarg2;
6304     Register blocks  = c_rarg3;
6305 
6306     const int unroll = 4;
6307 
6308     __ cmp(blocks, (unsigned char)(unroll * 2));
6309     __ br(__ LT, small);
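    // Fewer than unroll * 2 == 8 blocks is not worth the wide path, so such
    // inputs branch straight to the one-block-at-a-time stub generated above;
    // any blocks left over after the wide routine are handed to the same stub
    // by the tail check further down.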
6310 
    if (unroll > 1) {
      // Save the SIMD callee-saved registers (v8..v15) before entering the routine
6313       __ sub(sp, sp, 4 * 16);
6314       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6315       __ sub(sp, sp, 4 * 16);
6316       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6317     }
6318 
6319     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6320 
6321     if (unroll > 1) {
6322       // And restore state
6323       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6324       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6325     }
6326 
6327     __ cmp(blocks, (unsigned char)0);
6328     __ br(__ GT, small);
6329 
6330     __ ret(lr);
6331 
6332     return start;
6333   }
6334 
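  // Encode one SIMD round: read 3 * size source bytes as three de-interleaved
  // byte vectors, split every 24-bit group into four 6-bit indices, map the
  // indices through the 64-byte codec table held in four consecutive vector
  // registers starting at 'codec', and store 4 * size output characters.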
6335   void generate_base64_encode_simdround(Register src, Register dst,
6336         FloatRegister codec, u8 size) {
6337 
6338     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6339     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6340     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6341 
6342     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6343 
6344     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6345 
6346     __ ushr(ind0, arrangement, in0,  2);
6347 
6348     __ ushr(ind1, arrangement, in1,  2);
6349     __ shl(in0,   arrangement, in0,  6);
6350     __ orr(ind1,  arrangement, ind1, in0);
6351     __ ushr(ind1, arrangement, ind1, 2);
6352 
6353     __ ushr(ind2, arrangement, in2,  4);
6354     __ shl(in1,   arrangement, in1,  4);
6355     __ orr(ind2,  arrangement, in1,  ind2);
6356     __ ushr(ind2, arrangement, ind2, 2);
6357 
6358     __ shl(ind3,  arrangement, in2,  2);
6359     __ ushr(ind3, arrangement, ind3, 2);
6360 
6361     __ tbl(out0,  arrangement, codec,  4, ind0);
6362     __ tbl(out1,  arrangement, codec,  4, ind1);
6363     __ tbl(out2,  arrangement, codec,  4, ind2);
6364     __ tbl(out3,  arrangement, codec,  4, ind3);
6365 
6366     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6367   }
6368 
6369    /**
6370    *  Arguments:
6371    *
6372    *  Input:
6373    *  c_rarg0   - src_start
6374    *  c_rarg1   - src_offset
6375    *  c_rarg2   - src_length
6376    *  c_rarg3   - dest_start
6377    *  c_rarg4   - dest_offset
6378    *  c_rarg5   - isURL
6379    *
6380    */
6381   address generate_base64_encodeBlock() {
6382 
6383     static const char toBase64[64] = {
6384       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6385       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6386       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6387       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6388       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6389     };
6390 
6391     static const char toBase64URL[64] = {
6392       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6393       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6394       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6395       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6396       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6397     };
6398 
6399     __ align(CodeEntryAlignment);
6400     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6401     address start = __ pc();
6402 
6403     Register src   = c_rarg0;  // source array
6404     Register soff  = c_rarg1;  // source start offset
6405     Register send  = c_rarg2;  // source end offset
6406     Register dst   = c_rarg3;  // dest array
6407     Register doff  = c_rarg4;  // position for writing to dest array
6408     Register isURL = c_rarg5;  // Base64 or URL character set
6409 
6410     // c_rarg6 and c_rarg7 are free to use as temps
6411     Register codec  = c_rarg6;
6412     Register length = c_rarg7;
6413 
6414     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6415 
6416     __ add(src, src, soff);
6417     __ add(dst, dst, doff);
6418     __ sub(length, send, soff);
6419 
6420     // load the codec base address
6421     __ lea(codec, ExternalAddress((address) toBase64));
6422     __ cbz(isURL, ProcessData);
6423     __ lea(codec, ExternalAddress((address) toBase64URL));
6424 
6425     __ BIND(ProcessData);
6426 
    // input too short to form a SIMD loop; fall back to the scalar path
6428     __ cmp(length, (u1)24);
6429     __ br(Assembler::LT, Process3B);
6430 
6431     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6432 
6433     __ BIND(Process48B);
6434     __ cmp(length, (u1)48);
6435     __ br(Assembler::LT, Process24B);
6436     generate_base64_encode_simdround(src, dst, v0, 16);
6437     __ sub(length, length, 48);
6438     __ b(Process48B);
6439 
6440     __ BIND(Process24B);
6441     __ cmp(length, (u1)24);
6442     __ br(Assembler::LT, SIMDExit);
6443     generate_base64_encode_simdround(src, dst, v0, 8);
6444     __ sub(length, length, 24);
6445 
6446     __ BIND(SIMDExit);
6447     __ cbz(length, Exit);
6448 
6449     __ BIND(Process3B);
6450     //  3 src bytes, 24 bits
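    // e.g. bytes b0 b1 b2 are packed as (b0 << 16) | (b1 << 8) | b2 in r12 and
    // then split into the four sextets b0[7:2], b0[1:0]:b1[7:4], b1[3:0]:b2[7:6]
    // and b2[5:0] by the ubfmw/andw extractions below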
6451     __ ldrb(r10, __ post(src, 1));
6452     __ ldrb(r11, __ post(src, 1));
6453     __ ldrb(r12, __ post(src, 1));
6454     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6455     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6456     // codec index
6457     __ ubfmw(r15, r12, 18, 23);
6458     __ ubfmw(r14, r12, 12, 17);
6459     __ ubfmw(r13, r12, 6,  11);
6460     __ andw(r12,  r12, 63);
6461     // get the code based on the codec
6462     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6463     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6464     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6465     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6466     __ strb(r15, __ post(dst, 1));
6467     __ strb(r14, __ post(dst, 1));
6468     __ strb(r13, __ post(dst, 1));
6469     __ strb(r12, __ post(dst, 1));
6470     __ sub(length, length, 3);
6471     __ cbnz(length, Process3B);
6472 
6473     __ BIND(Exit);
6474     __ ret(lr);
6475 
6476     return start;
6477   }
6478 
6479   void generate_base64_decode_simdround(Register src, Register dst,
6480         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6481 
6482     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6483     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6484 
6485     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6486     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6487 
6488     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6489 
6490     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6491 
6492     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6493 
    // we need an unsigned saturating subtract so that every input value in the
    // range [0, 63] yields 0 as its higher-half lookup index
6496     __ uqsubv(decH0, __ T16B, in0, v27);
6497     __ uqsubv(decH1, __ T16B, in1, v27);
6498     __ uqsubv(decH2, __ T16B, in2, v27);
6499     __ uqsubv(decH3, __ T16B, in3, v27);
6500 
6501     // lower half lookup
6502     __ tbl(decL0, arrangement, codecL, 4, in0);
6503     __ tbl(decL1, arrangement, codecL, 4, in1);
6504     __ tbl(decL2, arrangement, codecL, 4, in2);
6505     __ tbl(decL3, arrangement, codecL, 4, in3);
6506 
6507     // higher half lookup
6508     __ tbx(decH0, arrangement, codecH, 4, decH0);
6509     __ tbx(decH1, arrangement, codecH, 4, decH1);
6510     __ tbx(decH2, arrangement, codecH, 4, decH2);
6511     __ tbx(decH3, arrangement, codecH, 4, decH3);
6512 
6513     // combine lower and higher
6514     __ orr(decL0, arrangement, decL0, decH0);
6515     __ orr(decL1, arrangement, decL1, decH1);
6516     __ orr(decL2, arrangement, decL2, decH2);
6517     __ orr(decL3, arrangement, decL3, decH3);
6518 
6519     // check illegal inputs, value larger than 63 (maximum of 6 bits)
6520     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
6521     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
6522     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
6523     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
6524     __ orr(in0, arrangement, decH0, decH1);
6525     __ orr(in1, arrangement, decH2, decH3);
6526     __ orr(in2, arrangement, in0,   in1);
6527     __ umaxv(in3, arrangement, in2);
6528     __ umov(rscratch2, in3, __ B, 0);
6529 
6530     // get the data to output
6531     __ shl(out0,  arrangement, decL0, 2);
6532     __ ushr(out1, arrangement, decL1, 4);
6533     __ orr(out0,  arrangement, out0,  out1);
6534     __ shl(out1,  arrangement, decL1, 4);
6535     __ ushr(out2, arrangement, decL2, 2);
6536     __ orr(out1,  arrangement, out1,  out2);
6537     __ shl(out2,  arrangement, decL2, 6);
6538     __ orr(out2,  arrangement, out2,  decL3);
6539 
6540     __ cbz(rscratch2, NoIllegalData);
6541 
6542     // handle illegal input
6543     __ umov(r10, in2, __ D, 0);
6544     if (size == 16) {
6545       __ cbnz(r10, ErrorInLowerHalf);
6546 
6547       // illegal input is in higher half, store the lower half now.
6548       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6549 
6550       __ umov(r10, in2,  __ D, 1);
6551       __ umov(r11, out0, __ D, 1);
6552       __ umov(r12, out1, __ D, 1);
6553       __ umov(r13, out2, __ D, 1);
6554       __ b(StoreLegalData);
6555 
6556       __ BIND(ErrorInLowerHalf);
6557     }
6558     __ umov(r11, out0, __ D, 0);
6559     __ umov(r12, out1, __ D, 0);
6560     __ umov(r13, out2, __ D, 0);
6561 
6562     __ BIND(StoreLegalData);
6563     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6564     __ strb(r11, __ post(dst, 1));
6565     __ strb(r12, __ post(dst, 1));
6566     __ strb(r13, __ post(dst, 1));
6567     __ lsr(r10, r10, 8);
6568     __ lsr(r11, r11, 8);
6569     __ lsr(r12, r12, 8);
6570     __ lsr(r13, r13, 8);
6571     __ b(StoreLegalData);
6572 
6573     __ BIND(NoIllegalData);
6574     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6575   }
6576 
6577 
6578    /**
6579    *  Arguments:
6580    *
6581    *  Input:
6582    *  c_rarg0   - src_start
6583    *  c_rarg1   - src_offset
6584    *  c_rarg2   - src_length
6585    *  c_rarg3   - dest_start
6586    *  c_rarg4   - dest_offset
6587    *  c_rarg5   - isURL
6588    *  c_rarg6   - isMIME
6589    *
6590    */
6591   address generate_base64_decodeBlock() {
6592 
6593     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6594     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6595     // titled "Base64 decoding".
6596 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] = -2,
    // while fromBase(URL)64ForNoSIMD['='] = 255 here.
6600     static const uint8_t fromBase64ForNoSIMD[256] = {
6601       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6602       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6603       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6604        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6605       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6606        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6607       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6608        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6609       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6610       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6611       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6612       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6613       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6614       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6615       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6616       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6617     };
6618 
6619     static const uint8_t fromBase64URLForNoSIMD[256] = {
6620       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6621       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6622       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6623        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6624       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6625        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6626       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6627        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6628       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6629       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6630       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6631       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6632       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6633       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6634       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6635       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6636     };
6637 
    // A legal base64 code value is in the range [0, 127]. We need two lookups
    // with tbl/tbx and combine them to get the decoded data. The 1st table vector
    // lookup uses tbl: out-of-range indices are set to 0 in the destination. The
    // 2nd table vector lookup uses tbx: out-of-range indices are left unchanged in
    // the destination. Input [64..126] is mapped to index [65, 127] in the second
    // lookup. The value at index 64 is set to 0, so that we know the decoded data
    // was already obtained by the 1st lookup.
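    //
    // For example, the input byte 'A' (65) is out of range for the 1st lookup
    // (giving 0) and maps to index 65 - 63 + 64 == 66 of the table for the 2nd
    // lookup, which holds 0, the decoded value of 'A'. The input '/' (47) is
    // decoded to 63 by the 1st lookup, while its saturated 2nd-lookup index 64
    // holds 0, so or-ing the two halves yields the correct value in both cases.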
6645     static const uint8_t fromBase64ForSIMD[128] = {
6646       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6647       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6648       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6649        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6650         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6651        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6652       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6653        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6654     };
6655 
6656     static const uint8_t fromBase64URLForSIMD[128] = {
6657       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6658       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6659       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6660        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6661         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6662        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6663        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6664        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6665     };
6666 
6667     __ align(CodeEntryAlignment);
6668     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6669     address start = __ pc();
6670 
6671     Register src    = c_rarg0;  // source array
6672     Register soff   = c_rarg1;  // source start offset
6673     Register send   = c_rarg2;  // source end offset
6674     Register dst    = c_rarg3;  // dest array
6675     Register doff   = c_rarg4;  // position for writing to dest array
6676     Register isURL  = c_rarg5;  // Base64 or URL character set
6677     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6678 
6679     Register length = send;    // reuse send as length of source data to process
6680 
6681     Register simd_codec   = c_rarg6;
6682     Register nosimd_codec = c_rarg7;
6683 
6684     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6685 
6686     __ enter();
6687 
6688     __ add(src, src, soff);
6689     __ add(dst, dst, doff);
6690 
6691     __ mov(doff, dst);
6692 
6693     __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1);  // clear the low 2 bits: round length down to a multiple of 4
6695 
6696     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6697     __ cbz(isURL, ProcessData);
6698     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6699 
6700     __ BIND(ProcessData);
6701     __ mov(rscratch1, length);
6702     __ cmp(length, (u1)144); // 144 = 80 + 64
6703     __ br(Assembler::LT, Process4B);
6704 
6705     // In the MIME case, the line length cannot be more than 76
6706     // bytes (see RFC 2045). This is too short a block for SIMD
6707     // to be worthwhile, so we use non-SIMD here.
6708     __ movw(rscratch1, 79);
6709 
6710     __ BIND(Process4B);
6711     __ ldrw(r14, __ post(src, 4));
6712     __ ubfxw(r10, r14, 0,  8);
6713     __ ubfxw(r11, r14, 8,  8);
6714     __ ubfxw(r12, r14, 16, 8);
6715     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
6717     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6718     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6719     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6720     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6721     // error detection, 255u indicates an illegal input
6722     __ orrw(r14, r10, r11);
6723     __ orrw(r15, r12, r13);
6724     __ orrw(r14, r14, r15);
6725     __ tbnz(r14, 7, Exit);
6726     // recover the data
6727     __ lslw(r14, r10, 10);
6728     __ bfiw(r14, r11, 4, 6);
6729     __ bfmw(r14, r12, 2, 5);
6730     __ rev16w(r14, r14);
6731     __ bfiw(r13, r12, 6, 2);
6732     __ strh(r14, __ post(dst, 2));
6733     __ strb(r13, __ post(dst, 1));
6734     // non-simd loop
6735     __ subsw(rscratch1, rscratch1, 4);
6736     __ br(Assembler::GT, Process4B);
6737 
    // if we are exiting the 80-byte pre-processing pass (taken when length >= 144
    // above), rscratch1 == -1; otherwise rscratch1 == 0 and we are done.
6740     __ cbzw(rscratch1, Exit);
6741     __ sub(length, length, 80);
6742 
6743     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6744     __ cbz(isURL, SIMDEnter);
6745     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6746 
6747     __ BIND(SIMDEnter);
6748     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6749     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6750     __ mov(rscratch1, 63);
6751     __ dup(v27, __ T16B, rscratch1);
6752 
6753     __ BIND(Process64B);
6754     __ cmp(length, (u1)64);
6755     __ br(Assembler::LT, Process32B);
6756     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6757     __ sub(length, length, 64);
6758     __ b(Process64B);
6759 
6760     __ BIND(Process32B);
6761     __ cmp(length, (u1)32);
6762     __ br(Assembler::LT, SIMDExit);
6763     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6764     __ sub(length, length, 32);
6765     __ b(Process32B);
6766 
6767     __ BIND(SIMDExit);
6768     __ cbz(length, Exit);
6769     __ movw(rscratch1, length);
6770     __ b(Process4B);
6771 
6772     __ BIND(Exit);
6773     __ sub(c_rarg0, dst, doff);
6774 
6775     __ leave();
6776     __ ret(lr);
6777 
6778     return start;
6779   }
6780 
6781   // Support for spin waits.
6782   address generate_spin_wait() {
6783     __ align(CodeEntryAlignment);
6784     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6785     address start = __ pc();
6786 
6787     __ spin_wait();
6788     __ ret(lr);
6789 
6790     return start;
6791   }
6792 
6793   address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
6794     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");
6795 
6796     address start = __ pc();
6797     const Register
6798       r_super_klass  = r0,
6799       r_array_base   = r1,
6800       r_array_length = r2,
6801       r_array_index  = r3,
6802       r_sub_klass    = r4,
6803       r_bitmap       = rscratch2,
6804       result         = r5;
6805     const FloatRegister
6806       vtemp          = v0;
6807 
6808     Label L_success;
6809     __ enter();
6810     __ lookup_secondary_supers_table(r_sub_klass, r_super_klass,
6811                                      r_array_base, r_array_length, r_array_index,
6812                                      vtemp, result, super_klass_index,
6813                                      /*stub_is_near*/true);
6814     __ leave();
6815     __ ret(lr);
6816 
6817     return start;
6818   }
6819 
6820   // Slow path implementation for UseSecondarySupersTable.
6821   address generate_lookup_secondary_supers_table_slow_path_stub() {
6822     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path");
6823 
6824     address start = __ pc();
6825     const Register
6826       r_super_klass  = r0,        // argument
6827       r_array_base   = r1,        // argument
6828       temp1          = r2,        // temp
6829       r_array_index  = r3,        // argument
6830       r_bitmap       = rscratch2, // argument
6831       result         = r5;        // argument
6832 
6833     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
6834     __ ret(lr);
6835 
6836     return start;
6837   }
6838 
6839 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6840 
6841   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6842   //
6843   // If LSE is in use, generate LSE versions of all the stubs. The
6844   // non-LSE versions are in atomic_aarch64.S.
6845 
6846   // class AtomicStubMark records the entry point of a stub and the
6847   // stub pointer which will point to it. The stub pointer is set to
6848   // the entry point when ~AtomicStubMark() is called, which must be
6849   // after ICache::invalidate_range. This ensures safe publication of
6850   // the generated code.
6851   class AtomicStubMark {
6852     address _entry_point;
6853     aarch64_atomic_stub_t *_stub;
6854     MacroAssembler *_masm;
6855   public:
6856     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6857       _masm = masm;
6858       __ align(32);
6859       _entry_point = __ pc();
6860       _stub = stub;
6861     }
6862     ~AtomicStubMark() {
6863       *_stub = (aarch64_atomic_stub_t)_entry_point;
6864     }
6865   };
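
  // A typical use, sketched with a hypothetical stub pointer name (the real
  // stub pointers are declared in atomic_aarch64.hpp, included above):
  //
  //   AtomicStubMark mark(_masm, &aarch64_atomic_example_impl);
  //   gen_cas_entry(Assembler::word, memory_order_conservative);
  //   // *aarch64_atomic_example_impl is published when `mark` goes out of
  //   // scope, which must happen after ICache::invalidate_range has run.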
6866 
6867   // NB: For memory_order_conservative we need a trailing membar after
6868   // LSE atomic operations but not a leading membar.
6869   //
6870   // We don't need a leading membar because a clause in the Arm ARM
6871   // says:
6872   //
6873   //   Barrier-ordered-before
6874   //
6875   //   Barrier instructions order prior Memory effects before subsequent
6876   //   Memory effects generated by the same Observer. A read or a write
6877   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
6878   //   Observer if and only if RW1 appears in program order before RW2
6879   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6880   //   instruction with both Acquire and Release semantics.
6881   //
6882   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6883   // and Release semantics, therefore we don't need a leading
6884   // barrier. However, there is no corresponding Barrier-ordered-after
6885   // relationship, therefore we need a trailing membar to prevent a
6886   // later store or load from being reordered with the store in an
6887   // atomic instruction.
6888   //
6889   // This was checked by using the herd7 consistency model simulator
6890   // (http://diy.inria.fr/) with this test case:
6891   //
6892   // AArch64 LseCas
6893   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6894   // P0 | P1;
6895   // LDR W4, [X2] | MOV W3, #0;
6896   // DMB LD       | MOV W4, #1;
6897   // LDR W3, [X1] | CASAL W3, W4, [X1];
6898   //              | DMB ISH;
6899   //              | STR W4, [X2];
6900   // exists
6901   // (0:X3=0 /\ 0:X4=1)
6902   //
6903   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6904   // with the store to x in P1. Without the DMB in P1 this may happen.
6905   //
6906   // At the time of writing we don't know of any AArch64 hardware that
6907   // reorders stores in this way, but the Reference Manual permits it.
6908 
6909   void gen_cas_entry(Assembler::operand_size size,
6910                      atomic_memory_order order) {
6911     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6912       exchange_val = c_rarg2;
6913     bool acquire, release;
6914     switch (order) {
6915       case memory_order_relaxed:
6916         acquire = false;
6917         release = false;
6918         break;
6919       case memory_order_release:
6920         acquire = false;
6921         release = true;
6922         break;
6923       default:
6924         acquire = true;
6925         release = true;
6926         break;
6927     }
6928     __ mov(prev, compare_val);
6929     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6930     if (order == memory_order_conservative) {
6931       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6932     }
6933     if (size == Assembler::xword) {
6934       __ mov(r0, prev);
6935     } else {
6936       __ movw(r0, prev);
6937     }
6938     __ ret(lr);
6939   }
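  // Roughly, the conservative CAS stub generated above behaves like this C++
  // sketch (an illustration using GCC __atomic builtins; the authoritative
  // contract is the aarch64_atomic_stub_t signature in atomic_aarch64.hpp):
  //
  //   uint64_t cmpxchg_8_conservative(volatile uint64_t *ptr,
  //                                   uint64_t compare_val,
  //                                   uint64_t exchange_val) {
  //     uint64_t prev = compare_val;
  //     __atomic_compare_exchange_n(ptr, &prev, exchange_val, /*weak*/false,
  //                                 __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
  //     __atomic_thread_fence(__ATOMIC_SEQ_CST);  // the trailing membar
  //     return prev;                              // old value of *ptr
  //   }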
6940 
6941   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6942     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6943     // If not relaxed, then default to conservative.  Relaxed is the only
6944     // case we use enough to be worth specializing.
6945     if (order == memory_order_relaxed) {
6946       __ ldadd(size, incr, prev, addr);
6947     } else {
6948       __ ldaddal(size, incr, prev, addr);
6949       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6950     }
6951     if (size == Assembler::xword) {
6952       __ mov(r0, prev);
6953     } else {
6954       __ movw(r0, prev);
6955     }
6956     __ ret(lr);
6957   }
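  // Similarly, the conservative fetch-and-add stub is approximately (a
  // sketch; the relaxed variant uses a relaxed ldadd and no trailing fence):
  //
  //   uint64_t fetch_add_8_conservative(volatile uint64_t *addr, uint64_t incr) {
  //     uint64_t prev = __atomic_fetch_add(addr, incr, __ATOMIC_ACQ_REL);
  //     __atomic_thread_fence(__ATOMIC_SEQ_CST);  // the trailing membar
  //     return prev;                              // value before the add
  //   }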
6958 
6959   void gen_swpal_entry(Assembler::operand_size size) {
6960     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6961     __ swpal(size, incr, prev, addr);
6962     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6963     if (size == Assembler::xword) {
6964       __ mov(r0, prev);
6965     } else {
6966       __ movw(r0, prev);
6967     }
6968     __ ret(lr);
6969   }
6970 
6971   void generate_atomic_entry_points() {
6972     if (! UseLSE) {
6973       return;
6974     }
6975 
6976     __ align(CodeEntryAlignment);
6977     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6978     address first_entry = __ pc();
6979 
6980     // ADD, memory_order_conservative
6981     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6982     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6983     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6984     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6985 
6986     // ADD, memory_order_relaxed
6987     AtomicStubMark mark_fetch_add_4_relaxed
6988       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6989     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6990     AtomicStubMark mark_fetch_add_8_relaxed
6991       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6992     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6993 
6994     // XCHG, memory_order_conservative
6995     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6996     gen_swpal_entry(Assembler::word);
6997     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6998     gen_swpal_entry(Assembler::xword);
6999 
7000     // CAS, memory_order_conservative
7001     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
7002     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
7003     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
7004     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
7005     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
7006     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
7007 
7008     // CAS, memory_order_relaxed
7009     AtomicStubMark mark_cmpxchg_1_relaxed
7010       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
7011     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
7012     AtomicStubMark mark_cmpxchg_4_relaxed
7013       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
7014     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
7015     AtomicStubMark mark_cmpxchg_8_relaxed
7016       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
7017     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
7018 
7019     AtomicStubMark mark_cmpxchg_4_release
7020       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
7021     gen_cas_entry(MacroAssembler::word, memory_order_release);
7022     AtomicStubMark mark_cmpxchg_8_release
7023       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
7024     gen_cas_entry(MacroAssembler::xword, memory_order_release);
7025 
7026     AtomicStubMark mark_cmpxchg_4_seq_cst
7027       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
7028     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
7029     AtomicStubMark mark_cmpxchg_8_seq_cst
7030       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
7031     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
7032 
7033     ICache::invalidate_range(first_entry, __ pc() - first_entry);
7034   }
7035 #endif // LINUX
7036 
7037   address generate_cont_thaw(Continuation::thaw_kind kind) {
7038     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
7039     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
7040 
7041     address start = __ pc();
7042 
7043     if (return_barrier) {
7044       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7045       __ mov(sp, rscratch1);
7046     }
7047     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7048 
7049     if (return_barrier) {
7050       // preserve possible return value from a method returning to the return barrier
7051       __ fmovd(rscratch1, v0);
7052       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7053     }
7054 
7055     __ movw(c_rarg1, (return_barrier ? 1 : 0));
7056     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
7057     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
7058 
7059     if (return_barrier) {
7060       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7061       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7062       __ fmovd(v0, rscratch1);
7063     }
7064     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7065 
7066 
7067     Label thaw_success;
7068     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
7069     __ cbnz(rscratch2, thaw_success);
7070     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
7071     __ br(rscratch1);
7072     __ bind(thaw_success);
7073 
7074     // make room for the thawed frames
7075     __ sub(rscratch1, sp, rscratch2);
7076     __ andr(rscratch1, rscratch1, -16); // align
7077     __ mov(sp, rscratch1);
7078 
7079     if (return_barrier) {
7080       // save original return value -- again
7081       __ fmovd(rscratch1, v0);
7082       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7083     }
7084 
7085     // If we want, we can templatize thaw by kind, and have three different entries
7086     __ movw(c_rarg1, (uint32_t)kind);
7087 
7088     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
7089     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
7090 
7091     if (return_barrier) {
7092       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7093       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7094       __ fmovd(v0, rscratch1);
7095     } else {
7096       __ mov(r0, zr); // return 0 (success) from doYield
7097     }
7098 
7099     // we're now on the yield frame (which is at an address above us because sp has been pushed down)
7100     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
7101     __ mov(rfp, sp);
7102 
7103     if (return_barrier_exception) {
7104       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
7105       __ authenticate_return_address(c_rarg1);
7106       __ verify_oop(r0);
7107       // save return value containing the exception oop in callee-saved R19
7108       __ mov(r19, r0);
7109 
7110       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
7111 
7112       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
7113       // __ reinitialize_ptrue();
7114 
7115       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7116 
7117       __ mov(r1, r0); // the exception handler
7118       __ mov(r0, r19); // restore return value containing the exception oop
7119       __ verify_oop(r0);
7120 
7121       __ leave();
7122       __ mov(r3, lr);
7123       __ br(r1); // the exception handler
7124     } else {
7125       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7126       __ leave();
7127       __ ret(lr);
7128     }
7129 
7130     return start;
7131   }
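  // In pseudocode, the thaw sequence generated above is approximately (a
  // sketch; the return-barrier save/restore of r0/v0 and the exception
  // path are omitted):
  //
  //   if (return_barrier) sp = thread->cont_entry;
  //   size = Continuation::prepare_thaw(thread, return_barrier);
  //   if (size == 0) goto throw_StackOverflowError;
  //   sp = align_down(sp - size, 16);     // make room for the thawed frames
  //   yield_sp = thaw(thread, kind);      // copies the frames onto the stack
  //   sp = yield_sp - 2 * wordSize;       // now pointing at the rfp spill
  //   rfp = sp;
  //   leave(); return;                    // into the topmost thawed frame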
7132 
7133   address generate_cont_thaw() {
7134     if (!Continuations::enabled()) return nullptr;
7135 
7136     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
7137     address start = __ pc();
7138     generate_cont_thaw(Continuation::thaw_top);
7139     return start;
7140   }
7141 
7142   address generate_cont_returnBarrier() {
7143     if (!Continuations::enabled()) return nullptr;
7144 
7145     // TODO: will probably need multiple return barriers depending on return type
7146     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
7147     address start = __ pc();
7148 
7149     generate_cont_thaw(Continuation::thaw_return_barrier);
7150 
7151     return start;
7152   }
7153 
7154   address generate_cont_returnBarrier_exception() {
7155     if (!Continuations::enabled()) return nullptr;
7156 
7157     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
7158     address start = __ pc();
7159 
7160     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7161 
7162     return start;
7163   }
7164 
7165   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
7166   // are represented as long[5], with BITS_PER_LIMB = 26.
7167   // Pack five 26-bit limbs into three 64-bit registers.
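  // In C, approximately (a sketch; s[] holds the five 26-bit limbs):
  //
  //   dest0 = s[0] | s[1] << 26 | s[2] << 52;        // low 64 bits
  //   dest1 = s[2] >> 12 | s[3] << 14 | s[4] << 40;  // next 64 bits
  //   dest2 = s[4] >> 24;                            // top 2 bits, if wanted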
7168   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
7169     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
7170     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
7171     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
7172     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
7173 
7174     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
7175     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
7176     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
7177     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
7178 
7179     if (dest2->is_valid()) {
7180       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7181     } else {
7182 #ifdef ASSERT
7183       Label OK;
7184       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7185       __ br(__ EQ, OK);
7186       __ stop("high bits of Poly1305 integer should be zero");
7187       __ should_not_reach_here();
7188       __ bind(OK);
7189 #endif
7190     }
7191   }
7192 
7193   // As above, but return only a 128-bit integer, packed into two
7194   // 64-bit registers.
7195   void pack_26(Register dest0, Register dest1, Register src) {
7196     pack_26(dest0, dest1, noreg, src);
7197   }
7198 
7199   // Multiply and multiply-accumulate unsigned 64-bit registers.
7200   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
7201     __ mul(prod_lo, n, m);
7202     __ umulh(prod_hi, n, m);
7203   }
7204   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
7205     wide_mul(rscratch1, rscratch2, n, m);
7206     __ adds(sum_lo, sum_lo, rscratch1);
7207     __ adc(sum_hi, sum_hi, rscratch2);
7208   }
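  // In C++, approximately (a sketch using a 128-bit intermediate):
  //
  //   void wide_mul(julong &prod_lo, julong &prod_hi, julong n, julong m) {
  //     unsigned __int128 p = (unsigned __int128)n * m;
  //     prod_lo = (julong)p;  prod_hi = (julong)(p >> 64);
  //   }
  //   void wide_madd(julong &sum_lo, julong &sum_hi, julong n, julong m) {
  //     unsigned __int128 s = ((unsigned __int128)sum_hi << 64) | sum_lo;
  //     s += (unsigned __int128)n * m;   // carry out of 128 bits is dropped
  //     sum_lo = (julong)s;  sum_hi = (julong)(s >> 64);
  //   }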
7209 
7210   // Poly1305, RFC 7539
7211 
7212   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
7213   // description of the tricks used to simplify and accelerate this
7214   // computation.
7215 
7216   address generate_poly1305_processBlocks() {
7217     __ align(CodeEntryAlignment);
7218     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
7219     address start = __ pc();
7220     Label here;
7221     __ enter();
7222     RegSet callee_saved = RegSet::range(r19, r28);
7223     __ push(callee_saved, sp);
7224 
7225     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
7226 
7227     // Arguments
7228     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
7229 
7230     // R_n is the 128-bit randomly-generated key, packed into two
7231     // registers.  The caller passes this key to us as long[5], with
7232     // BITS_PER_LIMB = 26.
7233     const Register R_0 = *++regs, R_1 = *++regs;
7234     pack_26(R_0, R_1, r_start);
7235 
7236     // RR_n is (R_n >> 2) * 5
7237     const Register RR_0 = *++regs, RR_1 = *++regs;
7238     __ lsr(RR_0, R_0, 2);
7239     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
7240     __ lsr(RR_1, R_1, 2);
7241     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
7242 
7243     // U_n is the current checksum
7244     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
7245     pack_26(U_0, U_1, U_2, acc_start);
7246 
7247     static constexpr int BLOCK_LENGTH = 16;
7248     Label DONE, LOOP;
7249 
7250     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7251     __ br(Assembler::LT, DONE); {
7252       __ bind(LOOP);
7253 
7254       // S_n is to be the sum of U_n and the next block of data
7255       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7256       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
7257       __ adds(S_0, U_0, S_0);
7258       __ adcs(S_1, U_1, S_1);
7259       __ adc(S_2, U_2, zr);
7260       __ add(S_2, S_2, 1);
7261 
7262       const Register U_0HI = *++regs, U_1HI = *++regs;
7263 
7264       // NB: this logic depends on some of the special properties of
7265       // Poly1305 keys. In particular, because we know that the top
7266       // four bits of R_0 and R_1 are zero, we can add together
7267       // partial products without any risk of needing to propagate a
7268       // carry out.
7269       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
7270       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
7271       __ andr(U_2, R_0, 3);
7272       __ mul(U_2, S_2, U_2);
7273 
7274       // Recycle registers S_0, S_1, S_2
7275       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
7276 
7277       // Partial reduction mod 2**130 - 5
7278       __ adds(U_1, U_0HI, U_1);
7279       __ adc(U_2, U_1HI, U_2);
7280       // Sum now in U_2:U_1:U_0.
7281       // Dead: U_0HI, U_1HI.
7282       regs = (regs.remaining() + U_0HI + U_1HI).begin();
7283 
7284       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
7285 
7286       // First, U_2:U_1:U_0 += (U_2 >> 2)
7287       __ lsr(rscratch1, U_2, 2);
7288       __ andr(U_2, U_2, (u8)3);
7289       __ adds(U_0, U_0, rscratch1);
7290       __ adcs(U_1, U_1, zr);
7291       __ adc(U_2, U_2, zr);
7292       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
7293       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
7294       __ adcs(U_1, U_1, zr);
7295       __ adc(U_2, U_2, zr);
7296 
7297       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
7298       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7299       __ br(~ Assembler::LT, LOOP);
7300     }
7301 
7302     // Further reduce modulo 2^130 - 5
7303     __ lsr(rscratch1, U_2, 2);
7304     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
7305     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
7306     __ adcs(U_1, U_1, zr);
7307     __ andr(U_2, U_2, (u1)3);
7308     __ adc(U_2, U_2, zr);
7309 
7310     // Unpack the sum into five 26-bit limbs and write to memory.
7311     __ ubfiz(rscratch1, U_0, 0, 26);
7312     __ ubfx(rscratch2, U_0, 26, 26);
7313     __ stp(rscratch1, rscratch2, Address(acc_start));
7314     __ ubfx(rscratch1, U_0, 52, 12);
7315     __ bfi(rscratch1, U_1, 12, 14);
7316     __ ubfx(rscratch2, U_1, 14, 26);
7317     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
7318     __ ubfx(rscratch1, U_1, 40, 24);
7319     __ bfi(rscratch1, U_2, 24, 3);
7320     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
7321 
7322     __ bind(DONE);
7323     __ pop(callee_saved, sp);
7324     __ leave();
7325     __ ret(lr);
7326 
7327     return start;
7328   }
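  // In pseudocode, the block loop above is approximately (a sketch; U is
  // the 130-bit accumulator held in U_2:U_1:U_0, R is the packed key and
  // RR_n = (R_n >> 2) * 5 as computed above):
  //
  //   while (length >= BLOCK_LENGTH) {
  //     S = U + next_16_byte_block + (1 << 128);   // the "+1" lands in S_2
  //     U = S * R;               // schoolbook multiply; high-order partial
  //                              // products are folded in via RR_0/RR_1,
  //                              // keeping U congruent to S*R mod 2^130 - 5
  //     U = (U mod 2^130) + (U >> 130) * 5;        // partial reduction
  //     length -= BLOCK_LENGTH;
  //   }
  //   U = (U mod 2^130) + (U >> 130) * 5;          // one more reduction step
  //   write U back to acc_start as five 26-bit limbs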
7329 
7330   // exception handler for upcall stubs
7331   address generate_upcall_stub_exception_handler() {
7332     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
7333     address start = __ pc();
7334 
7335     // Native caller has no idea how to handle exceptions,
7336     // so we just crash here. Up to callee to catch exceptions.
7337     __ verify_oop(r0);
7338     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
7339     __ blr(rscratch1);
7340     __ should_not_reach_here();
7341 
7342     return start;
7343   }
7344 
7345   // load Method* target of MethodHandle
7346   // j_rarg0 = jobject receiver
7347   // rmethod = result
7348   address generate_upcall_stub_load_target() {
7349     StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target");
7350     address start = __ pc();
7351 
7352     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
7353     // Load target method from receiver
7354     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
7355     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
7356     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
7357     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
7358                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
7359                       noreg, noreg);
7360     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
7361 
7362     __ ret(lr);
7363 
7364     return start;
7365   }
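  // In pseudocode, the chain of loads above is approximately (a sketch):
  //
  //   mh      = resolve_global_jobject(j_rarg0);   // MethodHandle receiver
  //   rmethod = mh.form.vmentry.method.vmtarget;   // Method*
  //   thread->callee_target = rmethod;             // in case of deoptimization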
7366 
7367 #undef __
7368 #define __ masm->
7369 
7370   class MontgomeryMultiplyGenerator : public MacroAssembler {
7371 
7372     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7373       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7374 
7375     RegSet _toSave;
7376     bool _squaring;
7377 
7378   public:
7379     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7380       : MacroAssembler(as->code()), _squaring(squaring) {
7381 
7382       // Register allocation
7383 
7384       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7385       Pa_base = *regs;       // Argument registers
7386       if (squaring)
7387         Pb_base = Pa_base;
7388       else
7389         Pb_base = *++regs;
7390       Pn_base = *++regs;
7391       Rlen= *++regs;
7392       inv = *++regs;
7393       Pm_base = *++regs;
7394 
7395                           // Working registers:
7396       Ra =  *++regs;        // The current digit of a, b, n, and m.
7397       Rb =  *++regs;
7398       Rm =  *++regs;
7399       Rn =  *++regs;
7400 
7401       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7402       Pb =  *++regs;
7403       Pm =  *++regs;
7404       Pn =  *++regs;
7405 
7406       t0 =  *++regs;        // Three registers which form a
7407       t1 =  *++regs;        // triple-precision accumulator.
7408       t2 =  *++regs;
7409 
7410       Ri =  *++regs;        // Inner and outer loop indexes.
7411       Rj =  *++regs;
7412 
7413       Rhi_ab = *++regs;     // Product registers: low and high parts
7414       Rlo_ab = *++regs;     // of a*b and m*n.
7415       Rhi_mn = *++regs;
7416       Rlo_mn = *++regs;
7417 
7418       // r19 and up are callee-saved.
7419       _toSave = RegSet::range(r19, *regs) + Pm_base;
7420     }
7421 
7422   private:
7423     void save_regs() {
7424       push(_toSave, sp);
7425     }
7426 
7427     void restore_regs() {
7428       pop(_toSave, sp);
7429     }
7430 
7431     template <typename T>
7432     void unroll_2(Register count, T block) {
7433       Label loop, end, odd;
7434       tbnz(count, 0, odd);
7435       cbz(count, end);
7436       align(16);
7437       bind(loop);
7438       (this->*block)();
7439       bind(odd);
7440       (this->*block)();
7441       subs(count, count, 2);
7442       br(Assembler::GT, loop);
7443       bind(end);
7444     }
7445 
7446     template <typename T>
7447     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7448       Label loop, end, odd;
7449       tbnz(count, 0, odd);
7450       cbz(count, end);
7451       align(16);
7452       bind(loop);
7453       (this->*block)(d, s, tmp);
7454       bind(odd);
7455       (this->*block)(d, s, tmp);
7456       subs(count, count, 2);
7457       br(Assembler::GT, loop);
7458       bind(end);
7459     }
7460 
7461     void pre1(RegisterOrConstant i) {
7462       block_comment("pre1");
7463       // Pa = Pa_base;
7464       // Pb = Pb_base + i;
7465       // Pm = Pm_base;
7466       // Pn = Pn_base + i;
7467       // Ra = *Pa;
7468       // Rb = *Pb;
7469       // Rm = *Pm;
7470       // Rn = *Pn;
7471       ldr(Ra, Address(Pa_base));
7472       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7473       ldr(Rm, Address(Pm_base));
7474       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7475       lea(Pa, Address(Pa_base));
7476       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7477       lea(Pm, Address(Pm_base));
7478       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7479 
7480       // Zero the m*n result.
7481       mov(Rhi_mn, zr);
7482       mov(Rlo_mn, zr);
7483     }
7484 
7485     // The core multiply-accumulate step of a Montgomery
7486     // multiplication.  The idea is to schedule operations as a
7487     // pipeline so that instructions with long latencies (loads and
7488     // multiplies) have time to complete before their results are
7489     // used.  This most benefits in-order implementations of the
7490     // architecture but out-of-order ones also benefit.
7491     void step() {
7492       block_comment("step");
7493       // MACC(Ra, Rb, t0, t1, t2);
7494       // Ra = *++Pa;
7495       // Rb = *--Pb;
7496       umulh(Rhi_ab, Ra, Rb);
7497       mul(Rlo_ab, Ra, Rb);
7498       ldr(Ra, pre(Pa, wordSize));
7499       ldr(Rb, pre(Pb, -wordSize));
7500       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7501                                        // previous iteration.
7502       // MACC(Rm, Rn, t0, t1, t2);
7503       // Rm = *++Pm;
7504       // Rn = *--Pn;
7505       umulh(Rhi_mn, Rm, Rn);
7506       mul(Rlo_mn, Rm, Rn);
7507       ldr(Rm, pre(Pm, wordSize));
7508       ldr(Rn, pre(Pn, -wordSize));
7509       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7510     }
7511 
7512     void post1() {
7513       block_comment("post1");
7514 
7515       // MACC(Ra, Rb, t0, t1, t2);
7516       // Ra = *++Pa;
7517       // Rb = *--Pb;
7518       umulh(Rhi_ab, Ra, Rb);
7519       mul(Rlo_ab, Ra, Rb);
7520       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7521       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7522 
7523       // *Pm = Rm = t0 * inv;
7524       mul(Rm, t0, inv);
7525       str(Rm, Address(Pm));
7526 
7527       // MACC(Rm, Rn, t0, t1, t2);
7528       // t0 = t1; t1 = t2; t2 = 0;
7529       umulh(Rhi_mn, Rm, Rn);
7530 
7531 #ifndef PRODUCT
7532       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7533       {
7534         mul(Rlo_mn, Rm, Rn);
7535         add(Rlo_mn, t0, Rlo_mn);
7536         Label ok;
7537         cbz(Rlo_mn, ok); {
7538           stop("broken Montgomery multiply");
7539         } bind(ok);
7540       }
7541 #endif
7542       // We have very carefully set things up so that
7543       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7544       // the lower half of Rm * Rn because we know the result already:
7545       // it must be -t0.  t0 + (-t0) must generate a carry iff
7546       // t0 != 0.  So, rather than do a mul and an adds we just set
7547       // the carry flag iff t0 is nonzero.
7548       //
7549       // mul(Rlo_mn, Rm, Rn);
7550       // adds(zr, t0, Rlo_mn);
7551       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7552       adcs(t0, t1, Rhi_mn);
7553       adc(t1, t2, zr);
7554       mov(t2, zr);
7555     }
7556 
7557     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7558       block_comment("pre2");
7559       // Pa = Pa_base + i-len;
7560       // Pb = Pb_base + len;
7561       // Pm = Pm_base + i-len;
7562       // Pn = Pn_base + len;
7563 
7564       if (i.is_register()) {
7565         sub(Rj, i.as_register(), len);
7566       } else {
7567         mov(Rj, i.as_constant());
7568         sub(Rj, Rj, len);
7569       }
7570       // Rj == i-len
7571 
7572       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7573       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7574       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7575       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7576 
7577       // Ra = *++Pa;
7578       // Rb = *--Pb;
7579       // Rm = *++Pm;
7580       // Rn = *--Pn;
7581       ldr(Ra, pre(Pa, wordSize));
7582       ldr(Rb, pre(Pb, -wordSize));
7583       ldr(Rm, pre(Pm, wordSize));
7584       ldr(Rn, pre(Pn, -wordSize));
7585 
7586       mov(Rhi_mn, zr);
7587       mov(Rlo_mn, zr);
7588     }
7589 
7590     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7591       block_comment("post2");
7592       if (i.is_constant()) {
7593         mov(Rj, i.as_constant()-len.as_constant());
7594       } else {
7595         sub(Rj, i.as_register(), len);
7596       }
7597 
7598       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7599 
7600       // As soon as we know the least significant digit of our result,
7601       // store it.
7602       // Pm_base[i-len] = t0;
7603       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7604 
7605       // t0 = t1; t1 = t2; t2 = 0;
7606       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7607       adc(t1, t2, zr);
7608       mov(t2, zr);
7609     }
7610 
7611     // A carry in t0 after Montgomery multiplication means that we
7612     // should subtract multiples of n from our result in m.  We'll
7613     // keep doing that until there is no carry.
7614     void normalize(RegisterOrConstant len) {
7615       block_comment("normalize");
7616       // while (t0)
7617       //   t0 = sub(Pm_base, Pn_base, t0, len);
7618       Label loop, post, again;
7619       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7620       cbz(t0, post); {
7621         bind(again); {
7622           mov(i, zr);
7623           mov(cnt, len);
7624           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7625           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7626           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7627           align(16);
7628           bind(loop); {
7629             sbcs(Rm, Rm, Rn);
7630             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7631             add(i, i, 1);
7632             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7633             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7634             sub(cnt, cnt, 1);
7635           } cbnz(cnt, loop);
7636           sbc(t0, t0, zr);
7637         } cbnz(t0, again);
7638       } bind(post);
7639     }
7640 
7641     // Move memory at s to d, reversing words.
7642     //    Increments d to end of copied memory
7643     //    Destroys tmp1, tmp2
7644     //    Preserves len
7645     //    Leaves s pointing to the address which was in d at start
7646     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7647       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7648       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7649 
7650       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7651       mov(tmp1, len);
7652       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7653       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7654     }
7655     // where
7656     void reverse1(Register d, Register s, Register tmp) {
7657       ldr(tmp, pre(s, -wordSize));
7658       ror(tmp, tmp, 32);
7659       str(tmp, post(d, wordSize));
7660     }
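    // In C, approximately (a sketch; the 32-bit rotate swaps the two int
    // halves of each word, converting between the caller's int order and
    // the julong order used internally):
    //
    //   for (int i = 0; i < len; i++)
    //     d[i] = rotate_right_32(s[len - 1 - i]);
    //   // afterwards d points just past the copied words and s points to
    //   // the memory d pointed to on entry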
7661 
7662     void step_squaring() {
7663       // An extra ACC
7664       step();
7665       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7666     }
7667 
7668     void last_squaring(RegisterOrConstant i) {
7669       Label dont;
7670       // if ((i & 1) == 0) {
7671       tbnz(i.as_register(), 0, dont); {
7672         // MACC(Ra, Rb, t0, t1, t2);
7673         // Ra = *++Pa;
7674         // Rb = *--Pb;
7675         umulh(Rhi_ab, Ra, Rb);
7676         mul(Rlo_ab, Ra, Rb);
7677         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7678       } bind(dont);
7679     }
7680 
7681     void extra_step_squaring() {
7682       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7683 
7684       // MACC(Rm, Rn, t0, t1, t2);
7685       // Rm = *++Pm;
7686       // Rn = *--Pn;
7687       umulh(Rhi_mn, Rm, Rn);
7688       mul(Rlo_mn, Rm, Rn);
7689       ldr(Rm, pre(Pm, wordSize));
7690       ldr(Rn, pre(Pn, -wordSize));
7691     }
7692 
7693     void post1_squaring() {
7694       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7695 
7696       // *Pm = Rm = t0 * inv;
7697       mul(Rm, t0, inv);
7698       str(Rm, Address(Pm));
7699 
7700       // MACC(Rm, Rn, t0, t1, t2);
7701       // t0 = t1; t1 = t2; t2 = 0;
7702       umulh(Rhi_mn, Rm, Rn);
7703 
7704 #ifndef PRODUCT
7705       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7706       {
7707         mul(Rlo_mn, Rm, Rn);
7708         add(Rlo_mn, t0, Rlo_mn);
7709         Label ok;
7710         cbz(Rlo_mn, ok); {
7711           stop("broken Montgomery multiply");
7712         } bind(ok);
7713       }
7714 #endif
7715       // We have very carefully set things up so that
7716       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7717       // the lower half of Rm * Rn because we know the result already:
7718       // it must be -t0.  t0 + (-t0) must generate a carry iff
7719       // t0 != 0.  So, rather than do a mul and an adds we just set
7720       // the carry flag iff t0 is nonzero.
7721       //
7722       // mul(Rlo_mn, Rm, Rn);
7723       // adds(zr, t0, Rlo_mn);
7724       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7725       adcs(t0, t1, Rhi_mn);
7726       adc(t1, t2, zr);
7727       mov(t2, zr);
7728     }
7729 
7730     void acc(Register Rhi, Register Rlo,
7731              Register t0, Register t1, Register t2) {
7732       adds(t0, t0, Rlo);
7733       adcs(t1, t1, Rhi);
7734       adc(t2, t2, zr);
7735     }
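    // In C++, the MACC used by the commented C code below is approximately
    // (a sketch; MACC2 is the same with the product counted twice, as in
    // step_squaring above):
    //
    //   static void MACC(julong A, julong B,
    //                    julong &t0, julong &t1, julong &t2) {
    //     unsigned __int128 p  = (unsigned __int128)A * B;
    //     unsigned __int128 lo = (unsigned __int128)t0 + (julong)p;
    //     unsigned __int128 hi = (unsigned __int128)t1 + (julong)(p >> 64)
    //                          + (julong)(lo >> 64);
    //     t0 = (julong)lo;  t1 = (julong)hi;  t2 += (julong)(hi >> 64);
    //   }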
7736 
7737   public:
7738     /**
7739      * Fast Montgomery multiplication.  The derivation of the
7740      * algorithm is in A Cryptographic Library for the Motorola
7741      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7742      *
7743      * Arguments:
7744      *
7745      * Inputs for multiplication:
7746      *   c_rarg0   - int array elements a
7747      *   c_rarg1   - int array elements b
7748      *   c_rarg2   - int array elements n (the modulus)
7749      *   c_rarg3   - int length
7750      *   c_rarg4   - int inv
7751      *   c_rarg5   - int array elements m (the result)
7752      *
7753      * Inputs for squaring:
7754      *   c_rarg0   - int array elements a
7755      *   c_rarg1   - int array elements n (the modulus)
7756      *   c_rarg2   - int length
7757      *   c_rarg3   - int inv
7758      *   c_rarg4   - int array elements m (the result)
7759      *
7760      */
7761     address generate_multiply() {
7762       Label argh, nothing;
7763       bind(argh);
7764       stop("MontgomeryMultiply total_allocation must be <= 8192");
7765 
7766       align(CodeEntryAlignment);
7767       address entry = pc();
7768 
7769       cbzw(Rlen, nothing);
7770 
7771       enter();
7772 
7773       // Make room.
7774       cmpw(Rlen, 512);
7775       br(Assembler::HI, argh);
7776       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7777       andr(sp, Ra, -2 * wordSize);
7778 
7779       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7780 
7781       {
7782         // Copy input args, reversing as we go.  We use Ra as a
7783         // temporary variable.
7784         reverse(Ra, Pa_base, Rlen, t0, t1);
7785         if (!_squaring)
7786           reverse(Ra, Pb_base, Rlen, t0, t1);
7787         reverse(Ra, Pn_base, Rlen, t0, t1);
7788       }
7789 
7790       // Push all callee-saved registers and also Pm_base, which we'll need
7791       // at the end.
7792       save_regs();
7793 
7794 #ifndef PRODUCT
7795       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7796       {
7797         ldr(Rn, Address(Pn_base, 0));
7798         mul(Rlo_mn, Rn, inv);
7799         subs(zr, Rlo_mn, -1);
7800         Label ok;
7801         br(EQ, ok); {
7802           stop("broken inverse in Montgomery multiply");
7803         } bind(ok);
7804       }
7805 #endif
7806 
7807       mov(Pm_base, Ra);
7808 
7809       mov(t0, zr);
7810       mov(t1, zr);
7811       mov(t2, zr);
7812 
7813       block_comment("for (int i = 0; i < len; i++) {");
7814       mov(Ri, zr); {
7815         Label loop, end;
7816         cmpw(Ri, Rlen);
7817         br(Assembler::GE, end);
7818 
7819         bind(loop);
7820         pre1(Ri);
7821 
7822         block_comment("  for (j = i; j; j--) {"); {
7823           movw(Rj, Ri);
7824           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7825         } block_comment("  } // j");
7826 
7827         post1();
7828         addw(Ri, Ri, 1);
7829         cmpw(Ri, Rlen);
7830         br(Assembler::LT, loop);
7831         bind(end);
7832         block_comment("} // i");
7833       }
7834 
7835       block_comment("for (int i = len; i < 2*len; i++) {");
7836       mov(Ri, Rlen); {
7837         Label loop, end;
7838         cmpw(Ri, Rlen, Assembler::LSL, 1);
7839         br(Assembler::GE, end);
7840 
7841         bind(loop);
7842         pre2(Ri, Rlen);
7843 
7844         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7845           lslw(Rj, Rlen, 1);
7846           subw(Rj, Rj, Ri);
7847           subw(Rj, Rj, 1);
7848           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7849         } block_comment("  } // j");
7850 
7851         post2(Ri, Rlen);
7852         addw(Ri, Ri, 1);
7853         cmpw(Ri, Rlen, Assembler::LSL, 1);
7854         br(Assembler::LT, loop);
7855         bind(end);
7856       }
7857       block_comment("} // i");
7858 
7859       normalize(Rlen);
7860 
7861       mov(Ra, Pm_base);  // Save Pm_base in Ra
7862       restore_regs();  // Restore caller's Pm_base
7863 
7864       // Copy our result into caller's Pm_base
7865       reverse(Pm_base, Ra, Rlen, t0, t1);
7866 
7867       leave();
7868       bind(nothing);
7869       ret(lr);
7870 
7871       return entry;
7872     }
7873     // In C, approximately:
7874 
7875     // void
7876     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7877     //                     julong Pn_base[], julong Pm_base[],
7878     //                     julong inv, int len) {
7879     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7880     //   julong *Pa, *Pb, *Pn, *Pm;
7881     //   julong Ra, Rb, Rn, Rm;
7882 
7883     //   int i;
7884 
7885     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7886 
7887     //   for (i = 0; i < len; i++) {
7888     //     int j;
7889 
7890     //     Pa = Pa_base;
7891     //     Pb = Pb_base + i;
7892     //     Pm = Pm_base;
7893     //     Pn = Pn_base + i;
7894 
7895     //     Ra = *Pa;
7896     //     Rb = *Pb;
7897     //     Rm = *Pm;
7898     //     Rn = *Pn;
7899 
7900     //     int iters = i;
7901     //     for (j = 0; iters--; j++) {
7902     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7903     //       MACC(Ra, Rb, t0, t1, t2);
7904     //       Ra = *++Pa;
7905     //       Rb = *--Pb;
7906     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7907     //       MACC(Rm, Rn, t0, t1, t2);
7908     //       Rm = *++Pm;
7909     //       Rn = *--Pn;
7910     //     }
7911 
7912     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7913     //     MACC(Ra, Rb, t0, t1, t2);
7914     //     *Pm = Rm = t0 * inv;
7915     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7916     //     MACC(Rm, Rn, t0, t1, t2);
7917 
7918     //     assert(t0 == 0, "broken Montgomery multiply");
7919 
7920     //     t0 = t1; t1 = t2; t2 = 0;
7921     //   }
7922 
7923     //   for (i = len; i < 2*len; i++) {
7924     //     int j;
7925 
7926     //     Pa = Pa_base + i-len;
7927     //     Pb = Pb_base + len;
7928     //     Pm = Pm_base + i-len;
7929     //     Pn = Pn_base + len;
7930 
7931     //     Ra = *++Pa;
7932     //     Rb = *--Pb;
7933     //     Rm = *++Pm;
7934     //     Rn = *--Pn;
7935 
7936     //     int iters = len*2-i-1;
7937     //     for (j = i-len+1; iters--; j++) {
7938     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7939     //       MACC(Ra, Rb, t0, t1, t2);
7940     //       Ra = *++Pa;
7941     //       Rb = *--Pb;
7942     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7943     //       MACC(Rm, Rn, t0, t1, t2);
7944     //       Rm = *++Pm;
7945     //       Rn = *--Pn;
7946     //     }
7947 
7948     //     Pm_base[i-len] = t0;
7949     //     t0 = t1; t1 = t2; t2 = 0;
7950     //   }
7951 
7952     //   while (t0)
7953     //     t0 = sub(Pm_base, Pn_base, t0, len);
7954     // }
7955 
7956     /**
7957      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7958      * multiplies than Montgomery multiplication so it should be up to
7959      * 25% faster.  However, its loop control is more complex and it
7960      * may actually run slower on some machines.
7961      *
7962      * Arguments:
7963      *
7964      * Inputs:
7965      *   c_rarg0   - int array elements a
7966      *   c_rarg1   - int array elements n (the modulus)
7967      *   c_rarg2   - int length
7968      *   c_rarg3   - int inv
7969      *   c_rarg4   - int array elements m (the result)
7970      *
7971      */
7972     address generate_square() {
7973       Label argh;
7974       bind(argh);
7975       stop("MontgomeryMultiply total_allocation must be <= 8192");
7976 
7977       align(CodeEntryAlignment);
7978       address entry = pc();
7979 
7980       enter();
7981 
7982       // Make room.
7983       cmpw(Rlen, 512);
7984       br(Assembler::HI, argh);
7985       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7986       andr(sp, Ra, -2 * wordSize);
7987 
7988       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7989 
7990       {
7991         // Copy input args, reversing as we go.  We use Ra as a
7992         // temporary variable.
7993         reverse(Ra, Pa_base, Rlen, t0, t1);
7994         reverse(Ra, Pn_base, Rlen, t0, t1);
7995       }
7996 
7997       // Push all callee-saved registers and also Pm_base, which we'll need
7998       // at the end.
7999       save_regs();
8000 
8001       mov(Pm_base, Ra);
8002 
8003       mov(t0, zr);
8004       mov(t1, zr);
8005       mov(t2, zr);
8006 
8007       block_comment("for (int i = 0; i < len; i++) {");
8008       mov(Ri, zr); {
8009         Label loop, end;
8010         bind(loop);
8011         cmp(Ri, Rlen);
8012         br(Assembler::GE, end);
8013 
8014         pre1(Ri);
8015 
8016         block_comment("for (j = (i+1)/2; j; j--) {"); {
8017           add(Rj, Ri, 1);
8018           lsr(Rj, Rj, 1);
8019           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8020         } block_comment("  } // j");
8021 
8022         last_squaring(Ri);
8023 
8024         block_comment("  for (j = i/2; j; j--) {"); {
8025           lsr(Rj, Ri, 1);
8026           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8027         } block_comment("  } // j");
8028 
8029         post1_squaring();
8030         add(Ri, Ri, 1);
8031         cmp(Ri, Rlen);
8032         br(Assembler::LT, loop);
8033 
8034         bind(end);
8035         block_comment("} // i");
8036       }
8037 
8038       block_comment("for (int i = len; i < 2*len; i++) {");
8039       mov(Ri, Rlen); {
8040         Label loop, end;
8041         bind(loop);
8042         cmp(Ri, Rlen, Assembler::LSL, 1);
8043         br(Assembler::GE, end);
8044 
8045         pre2(Ri, Rlen);
8046 
8047         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
8048           lsl(Rj, Rlen, 1);
8049           sub(Rj, Rj, Ri);
8050           sub(Rj, Rj, 1);
8051           lsr(Rj, Rj, 1);
8052           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8053         } block_comment("  } // j");
8054 
8055         last_squaring(Ri);
8056 
8057         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
8058           lsl(Rj, Rlen, 1);
8059           sub(Rj, Rj, Ri);
8060           lsr(Rj, Rj, 1);
8061           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8062         } block_comment("  } // j");
8063 
8064         post2(Ri, Rlen);
8065         add(Ri, Ri, 1);
8066         cmp(Ri, Rlen, Assembler::LSL, 1);
8067 
8068         br(Assembler::LT, loop);
8069         bind(end);
8070         block_comment("} // i");
8071       }
8072 
8073       normalize(Rlen);
8074 
8075       mov(Ra, Pm_base);  // Save Pm_base in Ra
8076       restore_regs();  // Restore caller's Pm_base
8077 
8078       // Copy our result into caller's Pm_base
8079       reverse(Pm_base, Ra, Rlen, t0, t1);
8080 
8081       leave();
8082       ret(lr);
8083 
8084       return entry;
8085     }
8086     // In C, approximately:
8087 
8088     // void
8089     // montgomery_square(julong Pa_base[], julong Pn_base[],
8090     //                   julong Pm_base[], julong inv, int len) {
8091     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8092     //   julong *Pa, *Pb, *Pn, *Pm;
8093     //   julong Ra, Rb, Rn, Rm;
8094 
8095     //   int i;
8096 
8097     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8098 
8099     //   for (i = 0; i < len; i++) {
8100     //     int j;
8101 
8102     //     Pa = Pa_base;
8103     //     Pb = Pa_base + i;
8104     //     Pm = Pm_base;
8105     //     Pn = Pn_base + i;
8106 
8107     //     Ra = *Pa;
8108     //     Rb = *Pb;
8109     //     Rm = *Pm;
8110     //     Rn = *Pn;
8111 
8112     //     int iters = (i+1)/2;
8113     //     for (j = 0; iters--; j++) {
8114     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8115     //       MACC2(Ra, Rb, t0, t1, t2);
8116     //       Ra = *++Pa;
8117     //       Rb = *--Pb;
8118     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8119     //       MACC(Rm, Rn, t0, t1, t2);
8120     //       Rm = *++Pm;
8121     //       Rn = *--Pn;
8122     //     }
8123     //     if ((i & 1) == 0) {
8124     //       assert(Ra == Pa_base[j], "must be");
8125     //       MACC(Ra, Ra, t0, t1, t2);
8126     //     }
8127     //     iters = i/2;
8128     //     assert(iters == i-j, "must be");
8129     //     for (; iters--; j++) {
8130     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8131     //       MACC(Rm, Rn, t0, t1, t2);
8132     //       Rm = *++Pm;
8133     //       Rn = *--Pn;
8134     //     }
8135 
8136     //     *Pm = Rm = t0 * inv;
8137     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8138     //     MACC(Rm, Rn, t0, t1, t2);
8139 
8140     //     assert(t0 == 0, "broken Montgomery multiply");
8141 
8142     //     t0 = t1; t1 = t2; t2 = 0;
8143     //   }
8144 
8145     //   for (i = len; i < 2*len; i++) {
8146     //     int start = i-len+1;
8147     //     int end = start + (len - start)/2;
8148     //     int j;
8149 
8150     //     Pa = Pa_base + i-len;
8151     //     Pb = Pa_base + len;
8152     //     Pm = Pm_base + i-len;
8153     //     Pn = Pn_base + len;
8154 
8155     //     Ra = *++Pa;
8156     //     Rb = *--Pb;
8157     //     Rm = *++Pm;
8158     //     Rn = *--Pn;
8159 
8160     //     int iters = (2*len-i-1)/2;
8161     //     assert(iters == end-start, "must be");
8162     //     for (j = start; iters--; j++) {
8163     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8164     //       MACC2(Ra, Rb, t0, t1, t2);
8165     //       Ra = *++Pa;
8166     //       Rb = *--Pb;
8167     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8168     //       MACC(Rm, Rn, t0, t1, t2);
8169     //       Rm = *++Pm;
8170     //       Rn = *--Pn;
8171     //     }
8172     //     if ((i & 1) == 0) {
8173     //       assert(Ra == Pa_base[j], "must be");
8174     //       MACC(Ra, Ra, t0, t1, t2);
8175     //     }
8176     //     iters =  (2*len-i)/2;
8177     //     assert(iters == len-j, "must be");
8178     //     for (; iters--; j++) {
8179     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8180     //       MACC(Rm, Rn, t0, t1, t2);
8181     //       Rm = *++Pm;
8182     //       Rn = *--Pn;
8183     //     }
8184     //     Pm_base[i-len] = t0;
8185     //     t0 = t1; t1 = t2; t2 = 0;
8186     //   }
8187 
8188     //   while (t0)
8189     //     t0 = sub(Pm_base, Pn_base, t0, len);
8190     // }
8191   };
8192 
8193 
8194   // Call here from the interpreter or compiled code to either load
8195   // multiple returned values from the inline type instance being
8196   // returned into registers or to store returned values into a newly
8197   // allocated inline type instance.
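  // In pseudocode, the stub generated below is approximately (a sketch):
  //
  //   save j_rarg0..j_rarg7 and j_farg0..j_farg7;
  //   set_last_Java_frame(...);
  //   destination(current_thread, r0);             // runtime call
  //   reset_last_Java_frame(...);
  //   restore the saved argument registers;
  //   if (thread->pending_exception != nullptr) goto forward_exception;
  //   if (has_res) r0 = thread->vm_result;
  //   return;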
8198   address generate_return_value_stub(address destination, const char* name, bool has_res) {
8199     // We need to save all registers the calling convention may use so
8200     // that the runtime call can read or update those registers. This needs
8201     // to be in sync with SharedRuntime::java_return_convention().
8202     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
8203     enum layout {
8204       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
8205       j_rarg6_off, j_rarg6_2,
8206       j_rarg5_off, j_rarg5_2,
8207       j_rarg4_off, j_rarg4_2,
8208       j_rarg3_off, j_rarg3_2,
8209       j_rarg2_off, j_rarg2_2,
8210       j_rarg1_off, j_rarg1_2,
8211       j_rarg0_off, j_rarg0_2,
8212 
8213       j_farg7_off, j_farg7_2,
8214       j_farg6_off, j_farg6_2,
8215       j_farg5_off, j_farg5_2,
8216       j_farg4_off, j_farg4_2,
8217       j_farg3_off, j_farg3_2,
8218       j_farg2_off, j_farg2_2,
8219       j_farg1_off, j_farg1_2,
8220       j_farg0_off, j_farg0_2,
8221 
8222       rfp_off, rfp_off2,
8223       return_off, return_off2,
8224 
8225       framesize // inclusive of return address
8226     };
8227 
8228     CodeBuffer code(name, 512, 64);
8229     MacroAssembler* masm = new MacroAssembler(&code);
8230 
8231     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
8232     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
8233     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
8234     int frame_size_in_words = frame_size_in_bytes / wordSize;
8235 
8236     OopMapSet* oop_maps = new OopMapSet();
8237     OopMap* map = new OopMap(frame_size_in_slots, 0);
8238 
8239     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
8240     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
8241     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
8242     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
8243     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
8244     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
8245     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
8246     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
8247 
8248     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
8249     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
8250     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
8251     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
8252     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
8253     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
8254     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
8255     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
8256 
8257     address start = __ pc();
8258 
8259     __ enter(); // Save FP and LR before call
8260 
8261     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
8262     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
8263     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
8264     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
8265 
8266     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
8267     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
8268     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
8269     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
8270 
8271     int frame_complete = __ offset();
8272 
8273     // Set up last_Java_sp and last_Java_fp
8274     address the_pc = __ pc();
8275     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
8276 
8277     // Call runtime
8278     __ mov(c_rarg1, r0);
8279     __ mov(c_rarg0, rthread);
8280 
8281     __ mov(rscratch1, destination);
8282     __ blr(rscratch1);
8283 
8284     oop_maps->add_gc_map(the_pc - start, map);
8285 
8286     __ reset_last_Java_frame(false);
8287 
8288     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
8289     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
8290     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
8291     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
8292 
8293     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
8294     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
8295     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
8296     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
8297 
8298     __ leave();
8299 
8300     // check for pending exceptions
8301     Label pending;
8302     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
8303     __ cbnz(rscratch1, pending);
8304 
8305     if (has_res) {
8306       __ get_vm_result(r0, rthread);
8307     }
8308 
8309     __ ret(lr);
8310 
8311     __ bind(pending);
8312     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
8313 
8314     // -------------
8315     // make sure all code is generated
8316     masm->flush();
8317 
8318     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
8319     return stub->entry_point();
8320   }
8321 
8322   // Initialization
8323   void generate_initial_stubs() {
8324     // Generate initial stubs and initialize the entry points
8325 
8326     // Entry points that exist on all platforms. Note: This is code
8327     // that could be shared among different platforms; however, the
8328     // benefit seems to be smaller than the disadvantage of having a
8329     // much more complicated generator structure. See also the comment in
8330     // stubRoutines.hpp.
8331 
8332     StubRoutines::_forward_exception_entry = generate_forward_exception();
8333 
8334     StubRoutines::_call_stub_entry =
8335       generate_call_stub(StubRoutines::_call_stub_return_address);
8336 
8337     // is referenced by megamorphic call
8338     StubRoutines::_catch_exception_entry = generate_catch_exception();
8339 
8340     // Initialize table for copy memory (arraycopy) check.
8341     if (UnsafeMemoryAccess::_table == nullptr) {
8342       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
8343     }
8344 
8345     if (UseCRC32Intrinsics) {
8346       // Set the table address before generating the stub, which uses it
8347       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8348       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8349     }
8350 
8351     if (UseCRC32CIntrinsics) {
8352       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8353     }
8354 
8355     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8356       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8357     }
8358 
8359     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8360       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8361     }
8362 
8363     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
8364         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
8365       StubRoutines::_hf2f = generate_float16ToFloat();
8366       StubRoutines::_f2hf = generate_floatToFloat16();
8367     }
8368 
8369     if (InlineTypeReturnedAsFields) {
8370       StubRoutines::_load_inline_type_fields_in_regs =
8371          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
8372       StubRoutines::_store_inline_type_fields_to_buf =
8373          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
8374     }
8375 
8376   }
8377 
8378   void generate_continuation_stubs() {
8379     // Continuation stubs:
8380     StubRoutines::_cont_thaw          = generate_cont_thaw();
8381     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8382     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8383   }
8384 
8385   void generate_final_stubs() {
8386     // support for verify_oop (must happen after universe_init)
8387     if (VerifyOops) {
8388       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8389     }
8390 
8391     // arraycopy stubs used by compilers
8392     generate_arraycopy_stubs();
8393 
8394     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8395     if (bs_nm != nullptr) {
8396       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
8397     }
8398 
8399     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8400 
8401     if (UsePoly1305Intrinsics) {
8402       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
8403     }
8404 
8405 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
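         // The entry points generated here are expected to replace the default
         // targets of the aarch64_atomic_*_impl function pointers declared at
         // the end of this file.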
8406 
8407     generate_atomic_entry_points();
8408 
8409 #endif // LINUX && !__ARM_FEATURE_ATOMICS
8410 
8411 #ifdef COMPILER2
8412     if (UseSecondarySupersTable) {
8413       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
8414       if (!InlineSecondarySupersTest) {
8415         for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
8416           StubRoutines::_lookup_secondary_supers_table_stubs[slot]
8417             = generate_lookup_secondary_supers_table_stub(slot);
8418         }
8419       }
8420     }
8421 #endif
8422 
8423     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
8424     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
8425 
8426     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
8427   }
8428 
8429   void generate_compiler_stubs() {
8430 #if COMPILER2_OR_JVMCI
8431 
8432     if (UseSVE == 0) {
8433       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
8434     }
8435 
8436     // array equals stub for large arrays.
8437     if (!UseSimpleArrayEquals) {
8438       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8439     }
8440 
8441     // byte_array_inflate stub for large arrays.
8442     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8443 
8444     // countPositives stub for large arrays.
8445     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8446 
8447     generate_compare_long_strings();
8448 
8449     generate_string_indexof_stubs();
8450 
8451 #ifdef COMPILER2
8452     if (UseMultiplyToLenIntrinsic) {
8453       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8454     }
8455 
8456     if (UseSquareToLenIntrinsic) {
8457       StubRoutines::_squareToLen = generate_squareToLen();
8458     }
8459 
8460     if (UseMulAddIntrinsic) {
8461       StubRoutines::_mulAdd = generate_mulAdd();
8462     }
8463 
8464     if (UseSIMDForBigIntegerShiftIntrinsics) {
8465       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8466       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8467     }
8468 
8469     if (UseMontgomeryMultiplyIntrinsic) {
8470       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
8471       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8472       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8473     }
8474 
8475     if (UseMontgomerySquareIntrinsic) {
8476       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
8477       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8478       // We use generate_multiply() rather than generate_square()
8479       // because it's faster for the sizes of modulus we care about.
8480       StubRoutines::_montgomerySquare = g.generate_multiply();
8481     }
8482 #endif // COMPILER2
8483 
8484     if (UseChaCha20Intrinsics) {
8485       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
8486     }
8487 
8488     if (UseBASE64Intrinsics) {
8489       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8490       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8491     }
8492 
8493     // data cache line writeback
8494     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8495     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8496 
8497     if (UseAESIntrinsics) {
8498       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8499       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8500       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8501       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8502       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8503     }
8504     if (UseGHASHIntrinsics) {
8505       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8506       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8507     }
8508     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8509       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8510     }
8511 
8512     if (UseMD5Intrinsics) {
8513       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
8514       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
8515     }
8516     if (UseSHA1Intrinsics) {
8517       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
8518       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
8519     }
8520     if (UseSHA256Intrinsics) {
8521       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
8522       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
8523     }
8524     if (UseSHA512Intrinsics) {
8525       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
8526       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
8527     }
8528     if (UseSHA3Intrinsics) {
8529       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
8530       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
8531     }
8532 
8533     // generate Adler32 intrinsics code
8534     if (UseAdler32Intrinsics) {
8535       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8536     }
8537 #endif // COMPILER2_OR_JVMCI
8538   }
8539 
8540  public:
8541   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
8542     switch (kind) {
8543     case Initial_stubs:
8544       generate_initial_stubs();
8545       break;
8546     case Continuation_stubs:
8547       generate_continuation_stubs();
8548       break;
8549     case Compiler_stubs:
8550       generate_compiler_stubs();
8551       break;
8552     case Final_stubs:
8553       generate_final_stubs();
8554       break;
8555     default:
8556       fatal("unexpected stubs kind: %d", kind);
8557       break;
8558     }
8559   }
8560 }; // end class declaration
8561 
8562 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
8563   StubGenerator g(code, kind);
8564 }
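
     // A rough, illustrative sketch of how StubGenerator_generate() is driven
     // from the shared stub-initialization code (the function name and buffer
     // size below are placeholders, not the actual call site):
     //
     //   void stubRoutines_init_initial() {
     //     BufferBlob* blob = BufferBlob::create("StubRoutines (initial stubs)", 10000);
     //     CodeBuffer buffer(blob);
     //     StubGenerator_generate(&buffer, StubCodeGenerator::Initial_stubs);
     //   }
     //
     // The same pattern repeats for Continuation_stubs, Compiler_stubs and
     // Final_stubs, matching the switch in the StubGenerator constructor above.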
8565 
8566 
8567 #if defined (LINUX)
8568 
8569 // Define pointers to atomic stubs and initialize them to point to the
8570 // code in atomic_aarch64.S.
8571 
8572 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8573   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8574     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8575   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8576     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
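
     // For example, DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) below expands
     // (modulo whitespace) to:
     //
     //   extern "C" uint64_t aarch64_atomic_cmpxchg_4_release_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_release_impl
     //     = aarch64_atomic_cmpxchg_4_release_default_impl;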
8577 
8578 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8579 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8580 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8581 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8582 DEFAULT_ATOMIC_OP(xchg, 4, )
8583 DEFAULT_ATOMIC_OP(xchg, 8, )
8584 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8585 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8586 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8587 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8588 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8589 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8590 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8591 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8592 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8593 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8594 
8595 #undef DEFAULT_ATOMIC_OP
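
     // The *_impl pointers defined above are what platform code calls through
     // (e.g. the aarch64 Atomic implementation); an illustrative indirect
     // call, using the generic parameter names from the declarations:
     //
     //   uint64_t prev = aarch64_atomic_cmpxchg_8_impl(ptr, arg1, arg2);
     //
     // Until generate_atomic_entry_points() has run, such a call lands in the
     // default implementation from atomic_aarch64.S; afterwards it reaches
     // the generated stub code.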
8596 
8597 #endif // LINUX