1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "code/SCCache.hpp"
  32 #include "compiler/oopMap.hpp"
  33 #include "gc/shared/barrierSet.hpp"
  34 #include "gc/shared/barrierSetAssembler.hpp"
  35 #include "gc/shared/gc_globals.hpp"
  36 #include "gc/shared/tlab_globals.hpp"
  37 #include "interpreter/interpreter.hpp"
  38 #include "memory/universe.hpp"
  39 #include "nativeInst_aarch64.hpp"
  40 #include "oops/instanceOop.hpp"
  41 #include "oops/method.hpp"
  42 #include "oops/objArrayKlass.hpp"
  43 #include "oops/oop.inline.hpp"
  44 #include "prims/methodHandles.hpp"
  45 #include "prims/upcallLinker.hpp"
  46 #include "runtime/arguments.hpp"
  47 #include "runtime/atomic.hpp"
  48 #include "runtime/continuation.hpp"
  49 #include "runtime/continuationEntry.inline.hpp"
  50 #include "runtime/frame.inline.hpp"
  51 #include "runtime/handles.inline.hpp"
  52 #include "runtime/javaThread.hpp"
  53 #include "runtime/sharedRuntime.hpp"
  54 #include "runtime/stubCodeGenerator.hpp"
  55 #include "runtime/stubRoutines.hpp"
  56 #include "utilities/align.hpp"
  57 #include "utilities/checkedCast.hpp"
  58 #include "utilities/debug.hpp"
  59 #include "utilities/globalDefinitions.hpp"
  60 #include "utilities/intpow.hpp"
  61 #include "utilities/powerOfTwo.hpp"
  62 #ifdef COMPILER2
  63 #include "opto/runtime.hpp"
  64 #endif
  65 #if INCLUDE_ZGC
  66 #include "gc/z/zThreadLocalData.hpp"
  67 #endif
  68 
  69 // Declaration and definition of StubGenerator (no .hpp file).
  70 // For a more detailed description of the stub routine structure
  71 // see the comment in stubRoutines.hpp
  72 
  73 #undef __
  74 #define __ _masm->
  75 
  76 #ifdef PRODUCT
  77 #define BLOCK_COMMENT(str) /* nothing */
  78 #else
  79 #define BLOCK_COMMENT(str) __ block_comment(str)
  80 #endif
  81 
  82 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  83 
  84 // Stub Code definitions
  85 
  86 class StubGenerator: public StubCodeGenerator {
  87  private:
  88 
  89 #ifdef PRODUCT
  90 #define inc_counter_np(counter) ((void)0)
  91 #else
  92   void inc_counter_np_(uint& counter) {
  93     __ incrementw(ExternalAddress((address)&counter));
  94   }
  95 #define inc_counter_np(counter) \
  96   BLOCK_COMMENT("inc_counter " #counter); \
  97   inc_counter_np_(counter);
  98 #endif
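  // Typical use (illustrative only; the counter named here is assumed to be
  // one of the non-product copy counters declared in SharedRuntime):
  //
  //   inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr);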
  99 
 100   // Call stubs are used to call Java from C
 101   //
 102   // Arguments:
 103   //    c_rarg0:   call wrapper address                   address
 104   //    c_rarg1:   result                                 address
 105   //    c_rarg2:   result type                            BasicType
 106   //    c_rarg3:   method                                 Method*
 107   //    c_rarg4:   (interpreter) entry point              address
 108   //    c_rarg5:   parameters                             intptr_t*
 109   //    c_rarg6:   parameter size (in words)              int
 110   //    c_rarg7:   thread                                 Thread*
 111   //
 112   // There is no return from the stub itself as any Java result
 113   // is written to result
 114   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // save r29 (fp) immediately below it as the frame link, then install
  // the resulting sp (r31) into fp as the frame pointer.
 118   //
 119   // we save r0-r7, which accounts for all the c arguments.
 120   //
 121   // TODO: strictly do we need to save them all? they are treated as
 122   // volatile by C so could we omit saving the ones we are going to
 123   // place in global registers (thread? method?) or those we only use
 124   // during setup of the Java call?
 125   //
 126   // we don't need to save r8 which C uses as an indirect result location
 127   // return register.
 128   //
 129   // we don't need to save r9-r15 which both C and Java treat as
 130   // volatile
 131   //
 132   // we don't need to save r16-18 because Java does not use them
 133   //
 134   // we save r19-r28 which Java uses as scratch registers and C
 135   // expects to be callee-save
 136   //
 137   // we save the bottom 64 bits of each value stored in v8-v15; it is
 138   // the responsibility of the caller to preserve larger values.
 139   //
 140   // so the stub frame looks like this when we enter Java code
 141   //
 142   //     [ return_from_Java     ] <--- sp
 143   //     [ argument word n      ]
 144   //      ...
 145   // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ] <--- sp_after_call
  // -26 [ saved v15            ]
 148   // -25 [ saved v14            ]
 149   // -24 [ saved v13            ]
 150   // -23 [ saved v12            ]
 151   // -22 [ saved v11            ]
 152   // -21 [ saved v10            ]
 153   // -20 [ saved v9             ]
 154   // -19 [ saved v8             ]
 155   // -18 [ saved r28            ]
 156   // -17 [ saved r27            ]
 157   // -16 [ saved r26            ]
 158   // -15 [ saved r25            ]
 159   // -14 [ saved r24            ]
 160   // -13 [ saved r23            ]
 161   // -12 [ saved r22            ]
 162   // -11 [ saved r21            ]
 163   // -10 [ saved r20            ]
 164   //  -9 [ saved r19            ]
 165   //  -8 [ call wrapper    (r0) ]
 166   //  -7 [ result          (r1) ]
 167   //  -6 [ result type     (r2) ]
 168   //  -5 [ method          (r3) ]
 169   //  -4 [ entry point     (r4) ]
 170   //  -3 [ parameters      (r5) ]
 171   //  -2 [ parameter size  (r6) ]
 172   //  -1 [ thread (r7)          ]
 173   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 174   //   1 [ saved lr       (r30) ]
 175 
 176   // Call stub stack layout word offsets from fp
 177   enum call_stub_layout {
 178     sp_after_call_off  = -28,
 179 
 180     fpcr_off           = sp_after_call_off,
 181     d15_off            = -26,
 182     d13_off            = -24,
 183     d11_off            = -22,
 184     d9_off             = -20,
 185 
 186     r28_off            = -18,
 187     r26_off            = -16,
 188     r24_off            = -14,
 189     r22_off            = -12,
 190     r20_off            = -10,
 191     call_wrapper_off   =  -8,
 192     result_off         =  -7,
 193     result_type_off    =  -6,
 194     method_off         =  -5,
 195     entry_point_off    =  -4,
 196     parameter_size_off =  -2,
 197     thread_off         =  -1,
 198     fp_f               =   0,
 199     retaddr_off        =   1,
 200   };
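
  // Note on the offsets above: they are word offsets from rfp, and wordSize
  // is 8 on AArch64, so a slot at offset off lives at byte address
  // rfp + off * 8.  For example call_wrapper_off == -8 denotes rfp - 64,
  // which is exactly how the Address constants in generate_call_stub()
  // below are formed.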
 201 
 202   address generate_call_stub(address& return_address) {
 203     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 204            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 205            "adjust this code");
 206 
 207     StubCodeMark mark(this, "StubRoutines", "call_stub");
 208     address start = __ pc();
 209 
 210     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
 211 
 212     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
 213     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 214     const Address result        (rfp, result_off         * wordSize);
 215     const Address result_type   (rfp, result_type_off    * wordSize);
 216     const Address method        (rfp, method_off         * wordSize);
 217     const Address entry_point   (rfp, entry_point_off    * wordSize);
 218     const Address parameter_size(rfp, parameter_size_off * wordSize);
 219 
 220     const Address thread        (rfp, thread_off         * wordSize);
 221 
 222     const Address d15_save      (rfp, d15_off * wordSize);
 223     const Address d13_save      (rfp, d13_off * wordSize);
 224     const Address d11_save      (rfp, d11_off * wordSize);
 225     const Address d9_save       (rfp, d9_off * wordSize);
 226 
 227     const Address r28_save      (rfp, r28_off * wordSize);
 228     const Address r26_save      (rfp, r26_off * wordSize);
 229     const Address r24_save      (rfp, r24_off * wordSize);
 230     const Address r22_save      (rfp, r22_off * wordSize);
 231     const Address r20_save      (rfp, r20_off * wordSize);
 232 
 233     // stub code
 234 
 235     address aarch64_entry = __ pc();
 236 
 237     // set up frame and move sp to end of save area
 238     __ enter();
 239     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 240 
 241     // save register parameters and Java scratch/global registers
 242     // n.b. we save thread even though it gets installed in
 243     // rthread because we want to sanity check rthread later
 244     __ str(c_rarg7,  thread);
 245     __ strw(c_rarg6, parameter_size);
 246     __ stp(c_rarg4, c_rarg5,  entry_point);
 247     __ stp(c_rarg2, c_rarg3,  result_type);
 248     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 249 
 250     __ stp(r20, r19,   r20_save);
 251     __ stp(r22, r21,   r22_save);
 252     __ stp(r24, r23,   r24_save);
 253     __ stp(r26, r25,   r26_save);
 254     __ stp(r28, r27,   r28_save);
 255 
 256     __ stpd(v9,  v8,   d9_save);
 257     __ stpd(v11, v10,  d11_save);
 258     __ stpd(v13, v12,  d13_save);
 259     __ stpd(v15, v14,  d15_save);
 260 
 261     __ get_fpcr(rscratch1);
 262     __ str(rscratch1, fpcr_save);
 263     // Set FPCR to the state we need. We do want Round to Nearest. We
 264     // don't want non-IEEE rounding modes or floating-point traps.
 265     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
 266     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
 267     __ set_fpcr(rscratch1);
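    // (For reference: in FPCR, RMode occupies bits 23:22, FZ is bit 24 and
    // DN is bit 25, so the first bfi clears bits 22-25; the trap-enable
    // bits IOE, DZE, OFE, UFE and IXE occupy bits 8-12 and are cleared by
    // the second bfi.)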
 268 
 269     // install Java thread in global register now we have saved
 270     // whatever value it held
 271     __ mov(rthread, c_rarg7);
 272     // And method
 273     __ mov(rmethod, c_rarg3);
 274 
 275     // set up the heapbase register
 276     __ reinit_heapbase();
 277 
 278 #ifdef ASSERT
 279     // make sure we have no pending exceptions
 280     {
 281       Label L;
 282       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 283       __ cmp(rscratch1, (u1)NULL_WORD);
 284       __ br(Assembler::EQ, L);
 285       __ stop("StubRoutines::call_stub: entered with pending exception");
 286       __ BIND(L);
 287     }
 288 #endif
 289     // pass parameters if any
 290     __ mov(esp, sp);
 291     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 292     __ andr(sp, rscratch1, -2 * wordSize);
 293 
 294     BLOCK_COMMENT("pass parameters if any");
 295     Label parameters_done;
 296     // parameter count is still in c_rarg6
 297     // and parameter pointer identifying param 1 is in c_rarg5
 298     __ cbzw(c_rarg6, parameters_done);
 299 
 300     address loop = __ pc();
 301     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 302     __ subsw(c_rarg6, c_rarg6, 1);
 303     __ push(rscratch1);
 304     __ br(Assembler::GT, loop);
 305 
 306     __ BIND(parameters_done);
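
    // In effect the loop above does the following (a rough C sketch, not
    // generated code):
    //
    //   while (parameter_count-- > 0) {
    //     push(*parameters++);   // push the next argument word
    //   }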
 307 
    // call Java entry -- passing Method* and current sp
 309     //      rmethod: Method*
 310     //      r19_sender_sp: sender sp
 311     BLOCK_COMMENT("call Java function");
 312     __ mov(r19_sender_sp, sp);
 313     __ blr(c_rarg4);
 314 
 315     // we do this here because the notify will already have been done
 316     // if we get to the next instruction via an exception
 317     //
 318     // n.b. adding this instruction here affects the calculation of
 319     // whether or not a routine returns to the call stub (used when
 320     // doing stack walks) since the normal test is to check the return
 321     // pc against the address saved below. so we may need to allow for
 322     // this extra instruction in the check.
 323 
 324     // save current address for use by exception handling code
 325 
 326     return_address = __ pc();
 327 
 328     // store result depending on type (everything that is not
 329     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 330     // n.b. this assumes Java returns an integral result in r0
 331     // and a floating result in j_farg0
 332     __ ldr(j_rarg2, result);
 333     Label is_long, is_float, is_double, exit;
 334     __ ldr(j_rarg1, result_type);
 335     __ cmp(j_rarg1, (u1)T_OBJECT);
 336     __ br(Assembler::EQ, is_long);
 337     __ cmp(j_rarg1, (u1)T_LONG);
 338     __ br(Assembler::EQ, is_long);
 339     __ cmp(j_rarg1, (u1)T_FLOAT);
 340     __ br(Assembler::EQ, is_float);
 341     __ cmp(j_rarg1, (u1)T_DOUBLE);
 342     __ br(Assembler::EQ, is_double);
 343 
 344     // handle T_INT case
 345     __ strw(r0, Address(j_rarg2));
 346 
 347     __ BIND(exit);
 348 
 349     // pop parameters
 350     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 351 
 352 #ifdef ASSERT
 353     // verify that threads correspond
 354     {
 355       Label L, S;
 356       __ ldr(rscratch1, thread);
 357       __ cmp(rthread, rscratch1);
 358       __ br(Assembler::NE, S);
 359       __ get_thread(rscratch1);
 360       __ cmp(rthread, rscratch1);
 361       __ br(Assembler::EQ, L);
 362       __ BIND(S);
 363       __ stop("StubRoutines::call_stub: threads must correspond");
 364       __ BIND(L);
 365     }
 366 #endif
 367 
 368     __ pop_cont_fastpath(rthread);
 369 
 370     // restore callee-save registers
 371     __ ldpd(v15, v14,  d15_save);
 372     __ ldpd(v13, v12,  d13_save);
 373     __ ldpd(v11, v10,  d11_save);
 374     __ ldpd(v9,  v8,   d9_save);
 375 
 376     __ ldp(r28, r27,   r28_save);
 377     __ ldp(r26, r25,   r26_save);
 378     __ ldp(r24, r23,   r24_save);
 379     __ ldp(r22, r21,   r22_save);
 380     __ ldp(r20, r19,   r20_save);
 381 
 382     // restore fpcr
 383     __ ldr(rscratch1,  fpcr_save);
 384     __ set_fpcr(rscratch1);
 385 
 386     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 387     __ ldrw(c_rarg2, result_type);
 388     __ ldr(c_rarg3,  method);
 389     __ ldp(c_rarg4, c_rarg5,  entry_point);
 390     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 391 
 392     // leave frame and return to caller
 393     __ leave();
 394     __ ret(lr);
 395 
 396     // handle return types different from T_INT
 397 
 398     __ BIND(is_long);
 399     __ str(r0, Address(j_rarg2, 0));
 400     __ br(Assembler::AL, exit);
 401 
 402     __ BIND(is_float);
 403     __ strs(j_farg0, Address(j_rarg2, 0));
 404     __ br(Assembler::AL, exit);
 405 
 406     __ BIND(is_double);
 407     __ strd(j_farg0, Address(j_rarg2, 0));
 408     __ br(Assembler::AL, exit);
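
    // In C terms the result dispatch above amounts to (a rough sketch):
    //
    //   switch (result_type) {
    //     case T_OBJECT:
    //     case T_LONG:   *(jlong*)result   = r0;       break;
    //     case T_FLOAT:  *(jfloat*)result  = j_farg0;  break;
    //     case T_DOUBLE: *(jdouble*)result = j_farg0;  break;
    //     default:       *(jint*)result    = (jint)r0; break;  // treated as T_INT
    //   }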
 409 
 410     return start;
 411   }
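
  // The call stub is reached from C++ through the CallStub function pointer
  // type defined in stubRoutines.hpp.  The caller side looks roughly like
  // the following (see JavaCalls::call_helper; argument names here are
  // illustrative):
  //
  //   StubRoutines::call_stub()(
  //     (address)&link,       // call wrapper
  //     result_val_address,   // where to store the Java result
  //     result_type,          // BasicType of the result
  //     method(),             // Method* to invoke
  //     entry_point,          // interpreter entry point
  //     parameter_address,    // argument words
  //     size_of_parameters,   // argument count in words
  //     thread);              // current JavaThread*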
 412 
 413   // Return point for a Java call if there's an exception thrown in
 414   // Java code.  The exception is caught and transformed into a
 415   // pending exception stored in JavaThread that can be tested from
 416   // within the VM.
 417   //
 418   // Note: Usually the parameters are removed by the callee. In case
 419   // of an exception crossing an activation frame boundary, that is
 420   // not the case if the callee is compiled code => need to setup the
 421   // rsp.
 422   //
 423   // r0: exception oop
 424 
 425   address generate_catch_exception() {
 426     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 427     address start = __ pc();
 428 
 429     // same as in generate_call_stub():
 430     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 431     const Address thread        (rfp, thread_off         * wordSize);
 432 
 433 #ifdef ASSERT
 434     // verify that threads correspond
 435     {
 436       Label L, S;
 437       __ ldr(rscratch1, thread);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::NE, S);
 440       __ get_thread(rscratch1);
 441       __ cmp(rthread, rscratch1);
 442       __ br(Assembler::EQ, L);
 443       __ bind(S);
 444       __ stop("StubRoutines::catch_exception: threads must correspond");
 445       __ bind(L);
 446     }
 447 #endif
 448 
 449     // set pending exception
 450     __ verify_oop(r0);
 451 
 452     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 453     __ mov(rscratch1, (address)__FILE__);
 454     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 455     __ movw(rscratch1, (int)__LINE__);
 456     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 457 
 458     // complete return to VM
 459     assert(StubRoutines::_call_stub_return_address != nullptr,
 460            "_call_stub_return_address must have been generated before");
 461     __ b(StubRoutines::_call_stub_return_address);
 462 
 463     return start;
 464   }
 465 
 466   // Continuation point for runtime calls returning with a pending
 467   // exception.  The pending exception check happened in the runtime
 468   // or native call stub.  The pending exception in Thread is
 469   // converted into a Java-level exception.
 470   //
 471   // Contract with Java-level exception handlers:
 472   // r0: exception
 473   // r3: throwing pc
 474   //
 475   // NOTE: At entry of this stub, exception-pc must be in LR !!
 476 
 477   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
 479 
 480   address generate_forward_exception() {
 481     StubCodeMark mark(this, "StubRoutines", "forward exception");
 482     address start = __ pc();
 483 
 484     // Upon entry, LR points to the return address returning into
 485     // Java (interpreted or compiled) code; i.e., the return address
 486     // becomes the throwing pc.
 487     //
 488     // Arguments pushed before the runtime call are still on the stack
 489     // but the exception handler will reset the stack pointer ->
 490     // ignore them.  A potential result in registers can be ignored as
 491     // well.
 492 
 493 #ifdef ASSERT
 494     // make sure this code is only executed if there is a pending exception
 495     {
 496       Label L;
 497       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 498       __ cbnz(rscratch1, L);
 499       __ stop("StubRoutines::forward exception: no pending exception (1)");
 500       __ bind(L);
 501     }
 502 #endif
 503 
 504     // compute exception handler into r19
 505 
 506     // call the VM to find the handler address associated with the
 507     // caller address. pass thread in r0 and caller pc (ret address)
 508     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 509     // the stack.
 510     __ mov(c_rarg1, lr);
 511     // lr will be trashed by the VM call so we move it to R19
 512     // (callee-saved) because we also need to pass it to the handler
 513     // returned by this call.
 514     __ mov(r19, lr);
 515     BLOCK_COMMENT("call exception_handler_for_return_address");
 516     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 517                          SharedRuntime::exception_handler_for_return_address),
 518                     rthread, c_rarg1);
 519     // Reinitialize the ptrue predicate register, in case the external runtime
 520     // call clobbers ptrue reg, as we may return to SVE compiled code.
 521     __ reinitialize_ptrue();
 522 
 523     // we should not really care that lr is no longer the callee
 524     // address. we saved the value the handler needs in r19 so we can
 525     // just copy it to r3. however, the C2 handler will push its own
 526     // frame and then calls into the VM and the VM code asserts that
 527     // the PC for the frame above the handler belongs to a compiled
 528     // Java method. So, we restore lr here to satisfy that assert.
 529     __ mov(lr, r19);
 530     // setup r0 & r3 & clear pending exception
 531     __ mov(r3, r19);
 532     __ mov(r19, r0);
 533     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 534     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 535 
 536 #ifdef ASSERT
 537     // make sure exception is set
 538     {
 539       Label L;
 540       __ cbnz(r0, L);
 541       __ stop("StubRoutines::forward exception: no pending exception (2)");
 542       __ bind(L);
 543     }
 544 #endif
 545 
 546     // continue at exception handler
 547     // r0: exception
 548     // r3: throwing pc
 549     // r19: exception handler
 550     __ verify_oop(r0);
 551     __ br(r19);
 552 
 553     return start;
 554   }
 555 
 556   // Non-destructive plausibility checks for oops
 557   //
 558   // Arguments:
 559   //    r0: oop to verify
 560   //    rscratch1: error message
 561   //
 562   // Stack after saving c_rarg3:
 563   //    [tos + 0]: saved c_rarg3
 564   //    [tos + 1]: saved c_rarg2
 565   //    [tos + 2]: saved lr
 566   //    [tos + 3]: saved rscratch2
 567   //    [tos + 4]: saved r0
 568   //    [tos + 5]: saved rscratch1
 569   address generate_verify_oop() {
 570 
 571     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 572     address start = __ pc();
 573 
 574     Label exit, error;
 575 
 576     // save c_rarg2 and c_rarg3
 577     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 578 
 579     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 580     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 581     __ ldr(c_rarg3, Address(c_rarg2));
 582     __ add(c_rarg3, c_rarg3, 1);
 583     __ str(c_rarg3, Address(c_rarg2));
 584 
 585     // object is in r0
 586     // make sure object is 'reasonable'
 587     __ cbz(r0, exit); // if obj is null it is OK
 588 
 589     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 590     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 591 
 592     // return if everything seems ok
 593     __ bind(exit);
 594 
 595     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 596     __ ret(lr);
 597 
 598     // handle errors
 599     __ bind(error);
 600     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 601 
 602     __ push(RegSet::range(r0, r29), sp);
 603     // debug(char* msg, int64_t pc, int64_t regs[])
 604     __ mov(c_rarg0, rscratch1);      // pass address of error message
 605     __ mov(c_rarg1, lr);             // pass return address
 606     __ mov(c_rarg2, sp);             // pass address of regs on stack
 607 #ifndef PRODUCT
 608     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 609 #endif
 610     BLOCK_COMMENT("call MacroAssembler::debug");
 611     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 612     __ blr(rscratch1);
 613     __ hlt(0);
 614 
 615     return start;
 616   }
 617 
 618   // Generate indices for iota vector.
 619   address generate_iota_indices(const char *stub_name) {
 620     __ align(CodeEntryAlignment);
 621     StubCodeMark mark(this, "StubRoutines", stub_name);
 622     address start = __ pc();
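    // Each emit_data64 below packs lane values in little-endian order.  For
    // example the first H (16-bit) entry, 0x0003000200010000, stores the
    // halfwords 0x0000, 0x0001, 0x0002, 0x0003 at increasing addresses, so
    // lanes 0..3 hold the indices 0..3.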
 623     // B
 624     __ emit_data64(0x0706050403020100, relocInfo::none);
 625     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 626     // H
 627     __ emit_data64(0x0003000200010000, relocInfo::none);
 628     __ emit_data64(0x0007000600050004, relocInfo::none);
 629     // S
 630     __ emit_data64(0x0000000100000000, relocInfo::none);
 631     __ emit_data64(0x0000000300000002, relocInfo::none);
 632     // D
 633     __ emit_data64(0x0000000000000000, relocInfo::none);
 634     __ emit_data64(0x0000000000000001, relocInfo::none);
 635     // S - FP
 636     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 637     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 638     // D - FP
 639     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 640     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 641     return start;
 642   }
 643 
 644   // The inner part of zero_words().  This is the bulk operation,
 645   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 646   // caller is responsible for zeroing the last few words.
 647   //
 648   // Inputs:
 649   // r10: the HeapWord-aligned base address of an array to zero.
 650   // r11: the count in HeapWords, r11 > 0.
 651   //
 652   // Returns r10 and r11, adjusted for the caller to clear.
 653   // r10: the base address of the tail of words left to clear.
 654   // r11: the number of words in the tail.
 655   //      r11 < MacroAssembler::zero_words_block_size.
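  //
  // A sketch of how this is used: MacroAssembler::zero_words() handles
  // short fills inline and, for larger counts, calls this stub for the
  // bulk work, then clears the remaining r11 < zero_words_block_size
  // words at r10 itself.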
 656 
 657   address generate_zero_blocks() {
 658     Label done;
 659     Label base_aligned;
 660 
 661     Register base = r10, cnt = r11;
 662 
 663     __ align(CodeEntryAlignment);
 664     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 665     address start = __ pc();
 666 
 667     if (UseBlockZeroing) {
 668       int zva_length = VM_Version::zva_length();
 669 
 670       // Ensure ZVA length can be divided by 16. This is required by
 671       // the subsequent operations.
 672       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 673 
 674       __ tbz(base, 3, base_aligned);
 675       __ str(zr, Address(__ post(base, 8)));
 676       __ sub(cnt, cnt, 1);
 677       __ bind(base_aligned);
 678 
 679       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 680       // alignment.
 681       Label small;
 682       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 683       __ subs(rscratch1, cnt, low_limit >> 3);
 684       __ br(Assembler::LT, small);
 685       __ zero_dcache_blocks(base, cnt);
 686       __ bind(small);
 687     }
 688 
 689     {
 690       // Number of stp instructions we'll unroll
 691       const int unroll =
 692         MacroAssembler::zero_words_block_size / 2;
 693       // Clear the remaining blocks.
 694       Label loop;
 695       __ subs(cnt, cnt, unroll * 2);
 696       __ br(Assembler::LT, done);
 697       __ bind(loop);
 698       for (int i = 0; i < unroll; i++)
 699         __ stp(zr, zr, __ post(base, 16));
 700       __ subs(cnt, cnt, unroll * 2);
 701       __ br(Assembler::GE, loop);
 702       __ bind(done);
 703       __ add(cnt, cnt, unroll * 2);
 704     }
 705 
 706     __ ret(lr);
 707 
 708     return start;
 709   }
 710 
 711 
 712   typedef enum {
 713     copy_forwards = 1,
 714     copy_backwards = -1
 715   } copy_direction;
 716 
 717   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 718   // for arraycopy stubs.
 719   class ArrayCopyBarrierSetHelper : StackObj {
 720     BarrierSetAssembler* _bs_asm;
 721     MacroAssembler* _masm;
 722     DecoratorSet _decorators;
 723     BasicType _type;
 724     Register _gct1;
 725     Register _gct2;
 726     Register _gct3;
 727     FloatRegister _gcvt1;
 728     FloatRegister _gcvt2;
 729     FloatRegister _gcvt3;
 730 
 731   public:
 732     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 733                               DecoratorSet decorators,
 734                               BasicType type,
 735                               Register gct1,
 736                               Register gct2,
 737                               Register gct3,
 738                               FloatRegister gcvt1,
 739                               FloatRegister gcvt2,
 740                               FloatRegister gcvt3)
 741       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 742         _masm(masm),
 743         _decorators(decorators),
 744         _type(type),
 745         _gct1(gct1),
 746         _gct2(gct2),
 747         _gct3(gct3),
 748         _gcvt1(gcvt1),
 749         _gcvt2(gcvt2),
 750         _gcvt3(gcvt3) {
 751     }
 752 
 753     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 754       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 755                             dst1, dst2, src,
 756                             _gct1, _gct2, _gcvt1);
 757     }
 758 
 759     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 760       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 761                              dst, src1, src2,
 762                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 763     }
 764 
 765     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 766       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 767                             dst1, dst2, src,
 768                             _gct1);
 769     }
 770 
 771     void copy_store_at_16(Address dst, Register src1, Register src2) {
 772       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 773                              dst, src1, src2,
 774                              _gct1, _gct2, _gct3);
 775     }
 776 
 777     void copy_load_at_8(Register dst, Address src) {
 778       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 779                             dst, noreg, src,
 780                             _gct1);
 781     }
 782 
 783     void copy_store_at_8(Address dst, Register src) {
 784       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 785                              dst, src, noreg,
 786                              _gct1, _gct2, _gct3);
 787     }
 788   };
 789 
 790   // Bulk copy of blocks of 8 words.
 791   //
 792   // count is a count of words.
 793   //
 794   // Precondition: count >= 8
 795   //
 796   // Postconditions:
 797   //
 798   // The least significant bit of count contains the remaining count
 799   // of words to copy.  The rest of count is trash.
 800   //
 801   // s and d are adjusted to point to the remaining words to copy
 802   //
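  // Ignoring the SIMD variant and the GC barrier hooks, the generated
  // code is a software-pipelined copy of 8 words per iteration, roughly:
  //
  //   load 8 words from s;
  //   while ((count -= 8) >= 8) {
  //     prefetch ahead;
  //     store the 8 previously loaded words to d;
  //     load the next 8 words from s;
  //   }
  //   store the last 8 loaded words;   // the "drain" step
  //   copy any remaining 4- and 2-word subblocks;
  //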
 803   void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
 804                            copy_direction direction) {
 805     int unit = wordSize * direction;
 806     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 807 
 808     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 809       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 810     const Register stride = r14;
 811     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 812     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 813     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 814 
 815     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 816     assert_different_registers(s, d, count, rscratch1, rscratch2);
 817 
 818     Label again, drain;
 819     const char *stub_name;
 820     if (direction == copy_forwards)
 821       stub_name = "forward_copy_longs";
 822     else
 823       stub_name = "backward_copy_longs";
 824 
 825     __ align(CodeEntryAlignment);
 826 
 827     StubCodeMark mark(this, "StubRoutines", stub_name);
 828 
 829     __ bind(start);
 830 
 831     Label unaligned_copy_long;
 832     if (AvoidUnalignedAccesses) {
 833       __ tbnz(d, 3, unaligned_copy_long);
 834     }
 835 
 836     if (direction == copy_forwards) {
 837       __ sub(s, s, bias);
 838       __ sub(d, d, bias);
 839     }
 840 
 841 #ifdef ASSERT
 842     // Make sure we are never given < 8 words
 843     {
 844       Label L;
 845       __ cmp(count, (u1)8);
 846       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 848       __ bind(L);
 849     }
 850 #endif
 851 
 852     // Fill 8 registers
 853     if (UseSIMDForMemoryOps) {
 854       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 855       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 856     } else {
 857       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 858       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 859       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 860       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 861     }
 862 
 863     __ subs(count, count, 16);
 864     __ br(Assembler::LO, drain);
 865 
 866     int prefetch = PrefetchCopyIntervalInBytes;
 867     bool use_stride = false;
 868     if (direction == copy_backwards) {
 869        use_stride = prefetch > 256;
 870        prefetch = -prefetch;
 871        if (use_stride) __ mov(stride, prefetch);
 872     }
 873 
 874     __ bind(again);
 875 
 876     if (PrefetchCopyIntervalInBytes > 0)
 877       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 878 
 879     if (UseSIMDForMemoryOps) {
 880       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 881       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 882       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 883       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 884     } else {
 885       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 886       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 887       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 888       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 889       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 890       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 891       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 892       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 893     }
 894 
 895     __ subs(count, count, 8);
 896     __ br(Assembler::HS, again);
 897 
 898     // Drain
 899     __ bind(drain);
 900     if (UseSIMDForMemoryOps) {
 901       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 902       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 903     } else {
 904       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 905       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 906       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 907       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 908     }
 909 
 910     {
 911       Label L1, L2;
 912       __ tbz(count, exact_log2(4), L1);
 913       if (UseSIMDForMemoryOps) {
 914         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 915         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 916       } else {
 917         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 918         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 919         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 920         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 921       }
 922       __ bind(L1);
 923 
 924       if (direction == copy_forwards) {
 925         __ add(s, s, bias);
 926         __ add(d, d, bias);
 927       }
 928 
 929       __ tbz(count, 1, L2);
 930       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 931       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 932       __ bind(L2);
 933     }
 934 
 935     __ ret(lr);
 936 
 937     if (AvoidUnalignedAccesses) {
 938       Label drain, again;
 939       // Register order for storing. Order is different for backward copy.
 940 
 941       __ bind(unaligned_copy_long);
 942 
      // source address is doubleword (2-word) aligned, target is only
      // word aligned (odd word offset)
 944       //
 945       // when forward copying word pairs we read long pairs at offsets
 946       // {0, 2, 4, 6} (in long words). when backwards copying we read
 947       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 948       // address by -2 in the forwards case so we can compute the
 949       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 950       // or -1.
 951       //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 956       //
      // When backwards copying we need to store 1 word, 3 pairs and
 958       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 959       // offsets {1, 3, 5, 7, 8} * unit.
 960 
 961       if (direction == copy_forwards) {
 962         __ sub(s, s, 16);
 963         __ sub(d, d, 8);
 964       }
 965 
 966       // Fill 8 registers
 967       //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
 972       //
 973       // t0 at offset 0,  t1 at offset 8
 974       // t2 at offset 16, t3 at offset 24
 975       // t4 at offset 32, t5 at offset 40
 976       // t6 at offset 48, t7 at offset 56
 977 
 978       // for backwards copy s was not offset so the register contents
 979       // are at these offsets into the preceding 64 byte block
 980       // relative to that original input and so on for each successive
 981       // preceding 64 byte block when s is updated. this explains the
 982       // slightly counter-intuitive looking pattern of register usage
 983       // in the stp instructions for backwards copy.
 984       //
 985       // t0 at offset -16, t1 at offset -8
 986       // t2 at offset -32, t3 at offset -24
 987       // t4 at offset -48, t5 at offset -40
 988       // t6 at offset -64, t7 at offset -56
 989 
 990       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 991       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 992       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 993       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 994 
 995       __ subs(count, count, 16);
 996       __ br(Assembler::LO, drain);
 997 
 998       int prefetch = PrefetchCopyIntervalInBytes;
 999       bool use_stride = false;
1000       if (direction == copy_backwards) {
1001          use_stride = prefetch > 256;
1002          prefetch = -prefetch;
1003          if (use_stride) __ mov(stride, prefetch);
1004       }
1005 
1006       __ bind(again);
1007 
1008       if (PrefetchCopyIntervalInBytes > 0)
1009         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1010 
1011       if (direction == copy_forwards) {
       // allowing for the offset of -8 the store instructions place
       // registers into the target 64 byte block at the following
       // offsets
1015        //
1016        // t0 at offset 0
1017        // t1 at offset 8,  t2 at offset 16
1018        // t3 at offset 24, t4 at offset 32
1019        // t5 at offset 40, t6 at offset 48
1020        // t7 at offset 56
1021 
1022         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1023         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1024         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1025         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1026         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1027         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1028         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1029         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1030         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1031       } else {
       // d was not offset when we started so the registers are
       // written into the 64 byte block preceding d with the following
       // offsets
1035        //
       // t1 at offset -8
       // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
       // t7 at offset -56, t4 at offset -48
       //                   t6 at offset -64
1041        //
1042        // note that this matches the offsets previously noted for the
1043        // loads
1044 
1045         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1046         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1047         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1048         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1049         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1050         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1051         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1052         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1053         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1054       }
1055 
1056       __ subs(count, count, 8);
1057       __ br(Assembler::HS, again);
1058 
1059       // Drain
1060       //
1061       // this uses the same pattern of offsets and register arguments
1062       // as above
1063       __ bind(drain);
1064       if (direction == copy_forwards) {
1065         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1066         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1067         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1068         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1069         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1070       } else {
1071         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1072         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1073         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1074         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1075         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1076       }
1077       // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
1079       // bits 2 and 1 in the count are the tell-tale for whether we
1080       // have each such subblock
1081       {
1082         Label L1, L2;
1083         __ tbz(count, exact_log2(4), L1);
1084        // this is the same as above but copying only 4 longs hence
1085        // with only one intervening stp between the str instructions
1086        // but note that the offsets and registers still follow the
1087        // same pattern
1088         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1089         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1090         if (direction == copy_forwards) {
1091           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1092           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1093           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1094         } else {
1095           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1096           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1097           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1098         }
1099         __ bind(L1);
1100 
1101         __ tbz(count, 1, L2);
1102        // this is the same as above but copying only 2 longs hence
1103        // there is no intervening stp between the str instructions
1104        // but note that the offset and register patterns are still
1105        // the same
1106         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1107         if (direction == copy_forwards) {
1108           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1109           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1110         } else {
1111           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1112           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1113         }
1114         __ bind(L2);
1115 
       // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1118 
1119        if (direction == copy_forwards) {
1120          __ add(s, s, 16);
1121          __ add(d, d, 8);
1122        }
1123 
1124       }
1125 
1126       __ ret(lr);
1127       }
1128   }
1129 
1130   // Small copy: less than 16 bytes.
1131   //
1132   // NB: Ignores all of the bits of count which represent more than 15
1133   // bytes, so a caller doesn't have to mask them.
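  //
  // Conceptually, for the byte case (granularity == 1), this is roughly:
  //
  //   if (count & 8) copy 8 bytes;
  //   if (count & 4) copy 4 bytes;
  //   if (count & 2) copy 2 bytes;
  //   if (count & 1) copy 1 byte;
  //
  // walking forwards or backwards depending on the sign of step.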
1134 
1135   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1136     bool is_backwards = step < 0;
1137     size_t granularity = uabs(step);
1138     int direction = is_backwards ? -1 : 1;
1139 
1140     Label Lword, Lint, Lshort, Lbyte;
1141 
1142     assert(granularity
1143            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1144 
1145     const Register t0 = r3;
1146     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1147     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1148 
1149     // ??? I don't know if this bit-test-and-branch is the right thing
1150     // to do.  It does a lot of jumping, resulting in several
1151     // mispredicted branches.  It might make more sense to do this
1152     // with something like Duff's device with a single computed branch.
1153 
1154     __ tbz(count, 3 - exact_log2(granularity), Lword);
1155     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1156     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1157     __ bind(Lword);
1158 
1159     if (granularity <= sizeof (jint)) {
1160       __ tbz(count, 2 - exact_log2(granularity), Lint);
1161       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1162       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1163       __ bind(Lint);
1164     }
1165 
1166     if (granularity <= sizeof (jshort)) {
1167       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1168       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1169       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1170       __ bind(Lshort);
1171     }
1172 
1173     if (granularity <= sizeof (jbyte)) {
1174       __ tbz(count, 0, Lbyte);
1175       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1176       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1177       __ bind(Lbyte);
1178     }
1179   }
1180 
1181   Label copy_f, copy_b;
1182   Label copy_obj_f, copy_obj_b;
1183   Label copy_obj_uninit_f, copy_obj_uninit_b;
1184 
1185   // All-singing all-dancing memory copy.
1186   //
1187   // Copy count units of memory from s to d.  The size of a unit is
1188   // step, which can be positive or negative depending on the direction
1189   // of copy.  If is_aligned is false, we align the source address.
1190   //
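  // The overall strategy, roughly: copies of at most 80 bytes (96 with
  // SIMD) are expanded as straight-line loads followed by stores, where
  // the chunks may overlap in the middle (safe because everything is
  // loaded before anything is stored); larger copies align s to a 2-word
  // boundary using copy_memory_small, call one of the bulk stubs
  // generated by generate_copy_longs (copy_f/copy_b or the oop variants),
  // and then copy the tail with copy_memory_small again.
  //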
1191 
1192   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1193                    Register s, Register d, Register count, int step) {
1194     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1195     bool is_backwards = step < 0;
1196     unsigned int granularity = uabs(step);
1197     const Register t0 = r3, t1 = r4;
1198 
    // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't matter
    // because we always load all the data before writing anything.
1201     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1202     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1203     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1204     const Register send = r17, dend = r16;
1205     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1206     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1207     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1208 
1209     if (PrefetchCopyIntervalInBytes > 0)
1210       __ prfm(Address(s, 0), PLDL1KEEP);
1211     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1212     __ br(Assembler::HI, copy_big);
1213 
1214     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1215     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1216 
1217     __ cmp(count, u1(16/granularity));
1218     __ br(Assembler::LS, copy16);
1219 
1220     __ cmp(count, u1(64/granularity));
1221     __ br(Assembler::HI, copy80);
1222 
1223     __ cmp(count, u1(32/granularity));
1224     __ br(Assembler::LS, copy32);
1225 
1226     // 33..64 bytes
1227     if (UseSIMDForMemoryOps) {
1228       bs.copy_load_at_32(v0, v1, Address(s, 0));
1229       bs.copy_load_at_32(v2, v3, Address(send, -32));
1230       bs.copy_store_at_32(Address(d, 0), v0, v1);
1231       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1232     } else {
1233       bs.copy_load_at_16(t0, t1, Address(s, 0));
1234       bs.copy_load_at_16(t2, t3, Address(s, 16));
1235       bs.copy_load_at_16(t4, t5, Address(send, -32));
1236       bs.copy_load_at_16(t6, t7, Address(send, -16));
1237 
1238       bs.copy_store_at_16(Address(d, 0), t0, t1);
1239       bs.copy_store_at_16(Address(d, 16), t2, t3);
1240       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1241       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1242     }
1243     __ b(finish);
1244 
1245     // 17..32 bytes
1246     __ bind(copy32);
1247     bs.copy_load_at_16(t0, t1, Address(s, 0));
1248     bs.copy_load_at_16(t6, t7, Address(send, -16));
1249 
1250     bs.copy_store_at_16(Address(d, 0), t0, t1);
1251     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1252     __ b(finish);
1253 
1254     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1256     __ bind(copy80);
1257     if (UseSIMDForMemoryOps) {
1258       bs.copy_load_at_32(v0, v1, Address(s, 0));
1259       bs.copy_load_at_32(v2, v3, Address(s, 32));
1260       // Unaligned pointers can be an issue for copying.
1261       // The issue has more chances to happen when granularity of data is
1262       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
1263       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1264       // The most performance drop has been seen for the range 65-80 bytes.
1265       // For such cases using the pair of ldp/stp instead of the third pair of
1266       // ldpq/stpq fixes the performance issue.
1267       if (granularity < sizeof (jint)) {
1268         Label copy96;
1269         __ cmp(count, u1(80/granularity));
1270         __ br(Assembler::HI, copy96);
1271         bs.copy_load_at_16(t0, t1, Address(send, -16));
1272 
1273         bs.copy_store_at_32(Address(d, 0), v0, v1);
1274         bs.copy_store_at_32(Address(d, 32), v2, v3);
1275 
1276         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1277         __ b(finish);
1278 
1279         __ bind(copy96);
1280       }
1281       bs.copy_load_at_32(v4, v5, Address(send, -32));
1282 
1283       bs.copy_store_at_32(Address(d, 0), v0, v1);
1284       bs.copy_store_at_32(Address(d, 32), v2, v3);
1285 
1286       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1287     } else {
1288       bs.copy_load_at_16(t0, t1, Address(s, 0));
1289       bs.copy_load_at_16(t2, t3, Address(s, 16));
1290       bs.copy_load_at_16(t4, t5, Address(s, 32));
1291       bs.copy_load_at_16(t6, t7, Address(s, 48));
1292       bs.copy_load_at_16(t8, t9, Address(send, -16));
1293 
1294       bs.copy_store_at_16(Address(d, 0), t0, t1);
1295       bs.copy_store_at_16(Address(d, 16), t2, t3);
1296       bs.copy_store_at_16(Address(d, 32), t4, t5);
1297       bs.copy_store_at_16(Address(d, 48), t6, t7);
1298       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1299     }
1300     __ b(finish);
1301 
1302     // 0..16 bytes
1303     __ bind(copy16);
1304     __ cmp(count, u1(8/granularity));
1305     __ br(Assembler::LO, copy8);
1306 
1307     // 8..16 bytes
1308     bs.copy_load_at_8(t0, Address(s, 0));
1309     bs.copy_load_at_8(t1, Address(send, -8));
1310     bs.copy_store_at_8(Address(d, 0), t0);
1311     bs.copy_store_at_8(Address(dend, -8), t1);
1312     __ b(finish);
1313 
1314     if (granularity < 8) {
1315       // 4..7 bytes
1316       __ bind(copy8);
1317       __ tbz(count, 2 - exact_log2(granularity), copy4);
1318       __ ldrw(t0, Address(s, 0));
1319       __ ldrw(t1, Address(send, -4));
1320       __ strw(t0, Address(d, 0));
1321       __ strw(t1, Address(dend, -4));
1322       __ b(finish);
1323       if (granularity < 4) {
1324         // 0..3 bytes
1325         __ bind(copy4);
1326         __ cbz(count, finish); // get rid of 0 case
1327         if (granularity == 2) {
1328           __ ldrh(t0, Address(s, 0));
1329           __ strh(t0, Address(d, 0));
1330         } else { // granularity == 1
1331           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1332           // the first and last byte.
1333           // Handle the 3 byte case by loading and storing base + count/2
1334           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
1336           // byte 3 times.
1337           __ lsr(count, count, 1);
1338           __ ldrb(t0, Address(s, 0));
1339           __ ldrb(t1, Address(send, -1));
1340           __ ldrb(t2, Address(s, count));
1341           __ strb(t0, Address(d, 0));
1342           __ strb(t1, Address(dend, -1));
1343           __ strb(t2, Address(d, count));
1344         }
1345         __ b(finish);
1346       }
1347     }
1348 
1349     __ bind(copy_big);
1350     if (is_backwards) {
1351       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1352       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1353     }
1354 
    // Now that we've got the small case out of the way, we can align
    // the source address on a 2-word boundary.
1357 
    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for the 2-word-aligned
    // bulk copy. Up until here, we have used t9, which aliases r15, but from here on,
    // that register cannot be used as a temp register, as it contains the count.
1362 
1363     Label aligned;
1364 
1365     if (is_aligned) {
1366       // We may have to adjust by 1 word to get s 2-word-aligned.
1367       __ tbz(s, exact_log2(wordSize), aligned);
1368       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1369       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1370       __ sub(count, count, wordSize/granularity);
1371     } else {
1372       if (is_backwards) {
1373         __ andr(r15, s, 2 * wordSize - 1);
1374       } else {
1375         __ neg(r15, s);
1376         __ andr(r15, r15, 2 * wordSize - 1);
1377       }
1378       // r15 is the byte adjustment needed to align s.
1379       __ cbz(r15, aligned);
1380       int shift = exact_log2(granularity);
1381       if (shift > 0) {
1382         __ lsr(r15, r15, shift);
1383       }
1384       __ sub(count, count, r15);
1385 
1386 #if 0
1387       // ?? This code is only correct for a disjoint copy.  It may or
1388       // may not make sense to use it in that case.
1389 
1390       // Copy the first pair; s and d may not be aligned.
1391       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1392       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1393 
1394       // Align s and d, adjust count
1395       if (is_backwards) {
1396         __ sub(s, s, r15);
1397         __ sub(d, d, r15);
1398       } else {
1399         __ add(s, s, r15);
1400         __ add(d, d, r15);
1401       }
1402 #else
1403       copy_memory_small(decorators, type, s, d, r15, step);
1404 #endif
1405     }
1406 
1407     __ bind(aligned);
1408 
1409     // s is now 2-word-aligned.
1410 
1411     // We have a count of units and some trailing bytes. Adjust the
1412     // count and do a bulk copy of words. If the shift is zero,
1413     // perform a move instead to benefit from zero-latency moves.
1414     int shift = exact_log2(wordSize/granularity);
1415     if (shift > 0) {
1416       __ lsr(r15, count, shift);
1417     } else {
1418       __ mov(r15, count);
1419     }
1420     if (direction == copy_forwards) {
1421       if (type != T_OBJECT) {
1422         __ bl(copy_f);
1423       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1424         __ bl(copy_obj_uninit_f);
1425       } else {
1426         __ bl(copy_obj_f);
1427       }
1428     } else {
1429       if (type != T_OBJECT) {
1430         __ bl(copy_b);
1431       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1432         __ bl(copy_obj_uninit_b);
1433       } else {
1434         __ bl(copy_obj_b);
1435       }
1436     }
1437 
1438     // And the tail.
1439     copy_memory_small(decorators, type, s, d, count, step);
1440 
1441     if (granularity >= 8) __ bind(copy8);
1442     if (granularity >= 4) __ bind(copy4);
1443     __ bind(finish);
1444   }
1445 
1446 
1447   void clobber_registers() {
1448 #ifdef ASSERT
1449     RegSet clobbered
1450       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1451     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1452     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1453     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1454       __ mov(*it, rscratch1);
1455     }
1456 #endif
1457 
1458   }
1459 
1460   // Scan over the array at a for count oops, verifying each one.
1461   // Preserves a and count; clobbers rscratch1, rscratch2 and temp.
1462   void verify_oop_array (int size, Register a, Register count, Register temp) {
1463     Label loop, end;
1464     __ mov(rscratch1, a);
1465     __ mov(rscratch2, zr);
1466     __ bind(loop);
1467     __ cmp(rscratch2, count);
1468     __ br(Assembler::HS, end);
1469     if (size == wordSize) {
1470       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1471       __ verify_oop(temp);
1472     } else {
1473       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1474       __ decode_heap_oop(temp); // calls verify_oop
1475     }
1476     __ add(rscratch2, rscratch2, 1);
1477     __ b(loop);
1478     __ bind(end);
1479   }
1480 
1481   // Arguments:
1482   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1483   //             ignored
1484   //   is_oop  - true => oop array, so generate store check code
1485   //   name    - stub name string
1486   //
1487   // Inputs:
1488   //   c_rarg0   - source array address
1489   //   c_rarg1   - destination array address
1490   //   c_rarg2   - element count, treated as ssize_t, can be zero
1491   //
1492   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1493   // the hardware handle it.  The two dwords within qwords that span
1494   // cache line boundaries will still be loaded and stored atomically.
1495   //
1496   // Side Effects:
1497   //   disjoint_int_copy_entry is set to the no-overlap entry point
1498   //   used by generate_conjoint_int_oop_copy().
1499   //
1500   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1501                                   const char *name, bool dest_uninitialized = false) {
1502     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1503     RegSet saved_reg = RegSet::of(s, d, count);
1504     __ align(CodeEntryAlignment);
1505     StubCodeMark mark(this, "StubRoutines", name);
1506     address start = __ pc();
1507     __ enter();
1508 
1509     if (entry != nullptr) {
1510       *entry = __ pc();
1511       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1512       BLOCK_COMMENT("Entry:");
1513     }
1514 
1515     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1516     if (dest_uninitialized) {
1517       decorators |= IS_DEST_UNINITIALIZED;
1518     }
1519     if (aligned) {
1520       decorators |= ARRAYCOPY_ALIGNED;
1521     }
1522 
1523     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1524     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1525 
1526     if (is_oop) {
1527       // save regs before copy_memory
1528       __ push(RegSet::of(d, count), sp);
1529     }
1530     {
1531       // UnsafeMemoryAccess page error: continue after unsafe access
1532       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1533       UnsafeMemoryAccessMark umam(this, add_entry, true);
1534       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1535     }
1536 
1537     if (is_oop) {
1538       __ pop(RegSet::of(d, count), sp);
1539       if (VerifyOops)
1540         verify_oop_array(size, d, count, r16);
1541     }
1542 
1543     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1544 
1545     __ leave();
1546     __ mov(r0, zr); // return 0
1547     __ ret(lr);
1548     return start;
1549   }
1550 
1551   // Arguments:
1552   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1553   //             ignored
1554   //   is_oop  - true => oop array, so generate store check code
1555   //   name    - stub name string
1556   //
1557   // Inputs:
1558   //   c_rarg0   - source array address
1559   //   c_rarg1   - destination array address
1560   //   c_rarg2   - element count, treated as ssize_t, can be zero
1561   //
1562   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1563   // the hardware handle it.  The two dwords within qwords that span
1564   // cache line boundaries will still be loaded and stored atomically.
1565   //
1566   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1567                                  address *entry, const char *name,
1568                                  bool dest_uninitialized = false) {
1569     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1570     RegSet saved_regs = RegSet::of(s, d, count);
1571     StubCodeMark mark(this, "StubRoutines", name);
1572     address start = __ pc();
1573     __ enter();
1574 
1575     if (entry != nullptr) {
1576       *entry = __ pc();
1577       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1578       BLOCK_COMMENT("Entry:");
1579     }
1580 
1581     // use fwd copy when (d-s) above_equal (count*size)
1582     __ sub(rscratch1, d, s);
1583     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1584     __ br(Assembler::HS, nooverlap_target);
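         // The unsigned (HS) compare does double duty: when d < s the subtraction
         // wraps to a huge unsigned value, so the forward copy is chosen as well,
         // which is safe for that direction of overlap. Roughly, in C terms:
         //   if ((size_t)(d - s) >= ((size_t)count << log2(size))) goto nooverlap_target;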
1585 
1586     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1587     if (dest_uninitialized) {
1588       decorators |= IS_DEST_UNINITIALIZED;
1589     }
1590     if (aligned) {
1591       decorators |= ARRAYCOPY_ALIGNED;
1592     }
1593 
1594     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1595     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1596 
1597     if (is_oop) {
1598       // save regs before copy_memory
1599       __ push(RegSet::of(d, count), sp);
1600     }
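         // Note the negative step (-size) passed to copy_memory below: it makes
         // the copy run from high addresses to low, the safe direction when the
         // destination overlaps the source at a higher address.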
1601     {
1602       // UnsafeMemoryAccess page error: continue after unsafe access
1603       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1604       UnsafeMemoryAccessMark umam(this, add_entry, true);
1605       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1606     }
1607     if (is_oop) {
1608       __ pop(RegSet::of(d, count), sp);
1609       if (VerifyOops)
1610         verify_oop_array(size, d, count, r16);
1611     }
1612     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1613     __ leave();
1614     __ mov(r0, zr); // return 0
1615     __ ret(lr);
1616     return start;
1617   }
1618 
1619   // Arguments:
1620   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1621   //             ignored
1622   //   name    - stub name string
1623   //
1624   // Inputs:
1625   //   c_rarg0   - source array address
1626   //   c_rarg1   - destination array address
1627   //   c_rarg2   - element count, treated as ssize_t, can be zero
1628   //
1629   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1630   // we let the hardware handle it.  The one to eight bytes within words,
1631   // dwords or qwords that span cache line boundaries will still be loaded
1632   // and stored atomically.
1633   //
1641   // Side Effects:
1642   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1643   //   used by generate_conjoint_byte_copy().
1644   //
1645   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1646     const bool not_oop = false;
1647     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1648   }
1649 
1650   // Arguments:
1651   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1652   //             ignored
1653   //   name    - stub name string
1654   //
1655   // Inputs:
1656   //   c_rarg0   - source array address
1657   //   c_rarg1   - destination array address
1658   //   c_rarg2   - element count, treated as ssize_t, can be zero
1659   //
1660   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1661   // we let the hardware handle it.  The one to eight bytes within words,
1662   // dwords or qwords that span cache line boundaries will still be loaded
1663   // and stored atomically.
1664   //
1665   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1666                                       address* entry, const char *name) {
1667     const bool not_oop = false;
1668     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1669   }
1670 
1671   // Arguments:
1672   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1673   //             ignored
1674   //   name    - stub name string
1675   //
1676   // Inputs:
1677   //   c_rarg0   - source array address
1678   //   c_rarg1   - destination array address
1679   //   c_rarg2   - element count, treated as ssize_t, can be zero
1680   //
1681   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1682   // let the hardware handle it.  The two or four words within dwords
1683   // or qwords that span cache line boundaries will still be loaded
1684   // and stored atomically.
1685   //
1686   // Side Effects:
1687   //   disjoint_short_copy_entry is set to the no-overlap entry point
1688   //   used by generate_conjoint_short_copy().
1689   //
1690   address generate_disjoint_short_copy(bool aligned,
1691                                        address* entry, const char *name) {
1692     const bool not_oop = false;
1693     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1694   }
1695 
1696   // Arguments:
1697   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1698   //             ignored
1699   //   name    - stub name string
1700   //
1701   // Inputs:
1702   //   c_rarg0   - source array address
1703   //   c_rarg1   - destination array address
1704   //   c_rarg2   - element count, treated as ssize_t, can be zero
1705   //
1706   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1707   // let the hardware handle it.  The two or four words within dwords
1708   // or qwords that span cache line boundaries will still be loaded
1709   // and stored atomically.
1710   //
1711   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1712                                        address *entry, const char *name) {
1713     const bool not_oop = false;
1714     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1715 
1716   }
1717   // Arguments:
1718   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1719   //             ignored
1720   //   name    - stub name string
1721   //
1722   // Inputs:
1723   //   c_rarg0   - source array address
1724   //   c_rarg1   - destination array address
1725   //   c_rarg2   - element count, treated as ssize_t, can be zero
1726   //
1727   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1728   // the hardware handle it.  The two dwords within qwords that span
1729   // cache line boundaries will still be loaded and stored atomically.
1730   //
1731   // Side Effects:
1732   //   disjoint_int_copy_entry is set to the no-overlap entry point
1733   //   used by generate_conjoint_int_oop_copy().
1734   //
1735   address generate_disjoint_int_copy(bool aligned, address *entry,
1736                                          const char *name, bool dest_uninitialized = false) {
1737     const bool not_oop = false;
1738     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1739   }
1740 
1741   // Arguments:
1742   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1743   //             ignored
1744   //   name    - stub name string
1745   //
1746   // Inputs:
1747   //   c_rarg0   - source array address
1748   //   c_rarg1   - destination array address
1749   //   c_rarg2   - element count, treated as ssize_t, can be zero
1750   //
1751   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1752   // the hardware handle it.  The two dwords within qwords that span
1753   // cache line boundaries will still be loaded and stored atomically.
1754   //
1755   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1756                                      address *entry, const char *name,
1757                                      bool dest_uninitialized = false) {
1758     const bool not_oop = false;
1759     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1760   }
1761 
1762 
1763   // Arguments:
1764   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1765   //             ignored
1766   //   name    - stub name string
1767   //
1768   // Inputs:
1769   //   c_rarg0   - source array address
1770   //   c_rarg1   - destination array address
1771   //   c_rarg2   - element count, treated as size_t, can be zero
1772   //
1773   // Side Effects:
1774   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1775   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1776   //
1777   address generate_disjoint_long_copy(bool aligned, address *entry,
1778                                           const char *name, bool dest_uninitialized = false) {
1779     const bool not_oop = false;
1780     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1781   }
1782 
1783   // Arguments:
1784   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1785   //             ignored
1786   //   name    - stub name string
1787   //
1788   // Inputs:
1789   //   c_rarg0   - source array address
1790   //   c_rarg1   - destination array address
1791   //   c_rarg2   - element count, treated as size_t, can be zero
1792   //
1793   address generate_conjoint_long_copy(bool aligned,
1794                                       address nooverlap_target, address *entry,
1795                                       const char *name, bool dest_uninitialized = false) {
1796     const bool not_oop = false;
1797     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1798   }
1799 
1800   // Arguments:
1801   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1802   //             ignored
1803   //   name    - stub name string
1804   //
1805   // Inputs:
1806   //   c_rarg0   - source array address
1807   //   c_rarg1   - destination array address
1808   //   c_rarg2   - element count, treated as size_t, can be zero
1809   //
1810   // Side Effects:
1811   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1812   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1813   //
1814   address generate_disjoint_oop_copy(bool aligned, address *entry,
1815                                      const char *name, bool dest_uninitialized) {
1816     const bool is_oop = true;
1817     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1818     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1819   }
1820 
1821   // Arguments:
1822   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1823   //             ignored
1824   //   name    - stub name string
1825   //
1826   // Inputs:
1827   //   c_rarg0   - source array address
1828   //   c_rarg1   - destination array address
1829   //   c_rarg2   - element count, treated as size_t, can be zero
1830   //
1831   address generate_conjoint_oop_copy(bool aligned,
1832                                      address nooverlap_target, address *entry,
1833                                      const char *name, bool dest_uninitialized) {
1834     const bool is_oop = true;
1835     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1836     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1837                                   name, dest_uninitialized);
1838   }
1839 
1840 
1841   // Helper for generating a dynamic type check.
1842   // Smashes rscratch1, rscratch2.
1843   void generate_type_check(Register sub_klass,
1844                            Register super_check_offset,
1845                            Register super_klass,
1846                            Register temp1,
1847                            Register temp2,
1848                            Register result,
1849                            Label& L_success) {
1850     assert_different_registers(sub_klass, super_check_offset, super_klass);
1851 
1852     BLOCK_COMMENT("type_check:");
1853 
1854     Label L_miss;
1855 
1856     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1857                                      super_check_offset);
1858     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
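         // The fast path checks the klass word at super_check_offset (covering
         // the secondary super cache); the slow path scans the secondary supers
         // array. Either may branch to L_success; otherwise we fall through to
         // L_miss below.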
1859 
1860     // Fall through on failure!
1861     __ BIND(L_miss);
1862   }
1863 
1864   //
1865   //  Generate checkcasting array copy stub
1866   //
1867   //  Input:
1868   //    c_rarg0   - source array address
1869   //    c_rarg1   - destination array address
1870   //    c_rarg2   - element count, treated as ssize_t, can be zero
1871   //    c_rarg3   - size_t ckoff (super_check_offset)
1872   //    c_rarg4   - oop ckval (super_klass)
1873   //
1874   //  Output:
1875   //    r0 ==  0  -  success
1876   //    r0 == -1^K - failure, where K is partial transfer count
1877   //
1878   address generate_checkcast_copy(const char *name, address *entry,
1879                                   bool dest_uninitialized = false) {
1880 
1881     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1882 
1883     // Input registers (after setup_arg_regs)
1884     const Register from        = c_rarg0;   // source array address
1885     const Register to          = c_rarg1;   // destination array address
1886     const Register count       = c_rarg2;   // elements count
1887     const Register ckoff       = c_rarg3;   // super_check_offset
1888     const Register ckval       = c_rarg4;   // super_klass
1889 
1890     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1891     RegSet wb_post_saved_regs = RegSet::of(count);
1892 
1893     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1894     const Register copied_oop  = r22;       // actual oop copied
1895     const Register count_save  = r21;       // orig elements count
1896     const Register start_to    = r20;       // destination array start address
1897     const Register r19_klass   = r19;       // oop._klass
1898 
1899     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1900     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1901 
1902     //---------------------------------------------------------------
1903     // Assembler stub will be used for this call to arraycopy
1904     // if the two arrays are subtypes of Object[] but the
1905     // destination array type is not equal to or a supertype
1906     // of the source type.  Each element must be separately
1907     // checked.
1908 
1909     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1910                                copied_oop, r19_klass, count_save);
1911 
1912     __ align(CodeEntryAlignment);
1913     StubCodeMark mark(this, "StubRoutines", name);
1914     address start = __ pc();
1915 
1916     __ enter(); // required for proper stackwalking of RuntimeStub frame
1917 
1918 #ifdef ASSERT
1919     // caller guarantees that the arrays really are different
1920     // otherwise, we would have to make conjoint checks
1921     { Label L;
1922       __ b(L);                  // conjoint check not yet implemented
1923       __ stop("checkcast_copy within a single array");
1924       __ bind(L);
1925     }
1926 #endif //ASSERT
1927 
1928     // Caller of this entry point must set up the argument registers.
1929     if (entry != nullptr) {
1930       *entry = __ pc();
1931       BLOCK_COMMENT("Entry:");
1932     }
1933 
1934     // Empty array: nothing to do.
1935     __ cbz(count, L_done);
1936     __ push(RegSet::of(r19, r20, r21, r22), sp);
1937 
1938 #ifdef ASSERT
1939     BLOCK_COMMENT("assert consistent ckoff/ckval");
1940     // The ckoff and ckval must be mutually consistent,
1941     // even though caller generates both.
1942     { Label L;
1943       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1944       __ ldrw(start_to, Address(ckval, sco_offset));
1945       __ cmpw(ckoff, start_to);
1946       __ br(Assembler::EQ, L);
1947       __ stop("super_check_offset inconsistent");
1948       __ bind(L);
1949     }
1950 #endif //ASSERT
1951 
1952     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1953     bool is_oop = true;
1954     int element_size = UseCompressedOops ? 4 : 8;
1955     if (dest_uninitialized) {
1956       decorators |= IS_DEST_UNINITIALIZED;
1957     }
1958 
1959     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1960     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1961 
1962     // save the original count
1963     __ mov(count_save, count);
1964 
1965     // Copy from low to high addresses
1966     __ mov(start_to, to);              // Save destination array start address
1967     __ b(L_load_element);
1968 
1969     // ======== begin loop ========
1970     // (Loop is rotated; its entry is L_load_element.)
1971     // Loop control:
1972     //   for (; count != 0; count--) {
1973     //     copied_oop = load_heap_oop(from++);
1974     //     ... generate_type_check ...;
1975     //     store_heap_oop(to++, copied_oop);
1976     //   }
1977     __ align(OptoLoopAlignment);
1978 
1979     __ BIND(L_store_element);
1980     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1981                       __ post(to, element_size), copied_oop, noreg,
1982                       gct1, gct2, gct3);
1983     __ sub(count, count, 1);
1984     __ cbz(count, L_do_card_marks);
1985 
1986     // ======== loop entry is here ========
1987     __ BIND(L_load_element);
1988     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1989                      copied_oop, noreg, __ post(from, element_size),
1990                      gct1);
1991     __ cbz(copied_oop, L_store_element);
1992 
1993     __ load_klass(r19_klass, copied_oop);// query the object klass
1994 
1995     BLOCK_COMMENT("type_check:");
1996     generate_type_check(/*sub_klass*/r19_klass,
1997                         /*super_check_offset*/ckoff,
1998                         /*super_klass*/ckval,
1999                         /*r_array_base*/gct1,
2000                         /*temp2*/gct2,
2001                         /*result*/r10, L_store_element);
2002 
2003     // Fall through on failure!
2004 
2005     // ======== end loop ========
2006 
2007     // It was a real error; we must depend on the caller to finish the job.
2008     // Register count = remaining oops, count_orig = total oops.
2009     // Emit GC store barriers for the oops we have copied and report
2010     // their number to the caller.
2011 
2012     __ subs(count, count_save, count);     // K = partially copied oop count
2013     __ eon(count, count, zr);              // report (-1^K) to caller
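         // eon with zr is a bitwise NOT, so count now holds ~K == -1 ^ K, the
         // failure encoding described in the header comment above.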
2014     __ br(Assembler::EQ, L_done_pop);
2015 
2016     __ BIND(L_do_card_marks);
2017     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
2018 
2019     __ bind(L_done_pop);
2020     __ pop(RegSet::of(r19, r20, r21, r22), sp);
2021     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2022 
2023     __ bind(L_done);
2024     __ mov(r0, count);
2025     __ leave();
2026     __ ret(lr);
2027 
2028     return start;
2029   }
2030 
2031   // Perform range checks on the proposed arraycopy.
2032   // Kills temp, but nothing else.
2033   // Also, clean the sign bits of src_pos and dst_pos.
2034   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2035                               Register src_pos, // source position (c_rarg1)
2036                               Register dst,     // destination array oop (c_rarg2)
2037                               Register dst_pos, // destination position (c_rarg3)
2038                               Register length,
2039                               Register temp,
2040                               Label& L_failed) {
2041     BLOCK_COMMENT("arraycopy_range_checks:");
2042 
2043     assert_different_registers(rscratch1, temp);
2044 
2045     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2046     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2047     __ addw(temp, length, src_pos);
2048     __ cmpw(temp, rscratch1);
2049     __ br(Assembler::HI, L_failed);
2050 
2051     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2052     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2053     __ addw(temp, length, dst_pos);
2054     __ cmpw(temp, rscratch1);
2055     __ br(Assembler::HI, L_failed);
2056 
2057     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2058     __ movw(src_pos, src_pos);
2059     __ movw(dst_pos, dst_pos);
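         // Writing a 32-bit (w) register zero-extends into the full 64-bit
         // register, so these moves clear the high 32 bits in place.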
2060 
2061     BLOCK_COMMENT("arraycopy_range_checks done");
2062   }
2063 
2064   // These stubs get called from some dumb test routine.
2065   // I'll write them properly when they're called from
2066   // something that's actually doing something.
2067   static void fake_arraycopy_stub(address src, address dst, int count) {
2068     assert(count == 0, "huh?");
2069   }
2070 
2071 
2072   //
2073   //  Generate 'unsafe' array copy stub
2074   //  Though just as safe as the other stubs, it takes an unscaled
2075   //  size_t argument instead of an element count.
2076   //
2077   //  Input:
2078   //    c_rarg0   - source array address
2079   //    c_rarg1   - destination array address
2080   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2081   //
2082   // Examines the alignment of the operands and dispatches
2083   // to a long, int, short, or byte copy loop.
2084   //
2085   address generate_unsafe_copy(const char *name,
2086                                address byte_copy_entry,
2087                                address short_copy_entry,
2088                                address int_copy_entry,
2089                                address long_copy_entry) {
2090     Label L_long_aligned, L_int_aligned, L_short_aligned;
2091     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2092 
2093     __ align(CodeEntryAlignment);
2094     StubCodeMark mark(this, "StubRoutines", name);
2095     address start = __ pc();
2096     __ enter(); // required for proper stackwalking of RuntimeStub frame
2097 
2098     // bump this on entry, not on exit:
2099     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2100 
2101     __ orr(rscratch1, s, d);
2102     __ orr(rscratch1, rscratch1, count);
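         // rscratch1 now holds s | d | count; its low bits give the best common
         // alignment: low 3 bits clear => 8-byte aligned, low 2 bits clear =>
         // 4-byte, bit 0 clear => 2-byte, otherwise fall back to the byte copy.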
2103 
2104     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2105     __ cbz(rscratch1, L_long_aligned);
2106     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2107     __ cbz(rscratch1, L_int_aligned);
2108     __ tbz(rscratch1, 0, L_short_aligned);
2109     __ b(RuntimeAddress(byte_copy_entry));
2110 
2111     __ BIND(L_short_aligned);
2112     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2113     __ b(RuntimeAddress(short_copy_entry));
2114     __ BIND(L_int_aligned);
2115     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2116     __ b(RuntimeAddress(int_copy_entry));
2117     __ BIND(L_long_aligned);
2118     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2119     __ b(RuntimeAddress(long_copy_entry));
2120 
2121     return start;
2122   }
2123 
2124   //
2125   //  Generate generic array copy stubs
2126   //
2127   //  Input:
2128   //    c_rarg0    -  src oop
2129   //    c_rarg1    -  src_pos (32-bits)
2130   //    c_rarg2    -  dst oop
2131   //    c_rarg3    -  dst_pos (32-bits)
2132   //    c_rarg4    -  element count (32-bits)
2133   //
2134   //  Output:
2135   //    r0 ==  0  -  success
2136   //    r0 == -1^K - failure, where K is partial transfer count
2137   //
2138   address generate_generic_copy(const char *name,
2139                                 address byte_copy_entry, address short_copy_entry,
2140                                 address int_copy_entry, address oop_copy_entry,
2141                                 address long_copy_entry, address checkcast_copy_entry) {
2142 
2143     Label L_failed, L_objArray;
2144     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2145 
2146     // Input registers
2147     const Register src        = c_rarg0;  // source array oop
2148     const Register src_pos    = c_rarg1;  // source position
2149     const Register dst        = c_rarg2;  // destination array oop
2150     const Register dst_pos    = c_rarg3;  // destination position
2151     const Register length     = c_rarg4;
2152 
2153 
2154     // Registers used as temps
2155     const Register dst_klass  = c_rarg5;
2156 
2157     __ align(CodeEntryAlignment);
2158 
2159     StubCodeMark mark(this, "StubRoutines", name);
2160 
2161     address start = __ pc();
2162 
2163     __ enter(); // required for proper stackwalking of RuntimeStub frame
2164 
2165     // bump this on entry, not on exit:
2166     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2167 
2168     //-----------------------------------------------------------------------
2169     // Assembler stub will be used for this call to arraycopy
2170     // if the following conditions are met:
2171     //
2172     // (1) src and dst must not be null.
2173     // (2) src_pos must not be negative.
2174     // (3) dst_pos must not be negative.
2175     // (4) length  must not be negative.
2176     // (5) src klass and dst klass should be the same and not null.
2177     // (6) src and dst should be arrays.
2178     // (7) src_pos + length must not exceed length of src.
2179     // (8) dst_pos + length must not exceed length of dst.
2180     //
2181 
2182     //  if (src == nullptr) return -1;
2183     __ cbz(src, L_failed);
2184 
2185     //  if (src_pos < 0) return -1;
2186     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2187 
2188     //  if (dst == nullptr) return -1;
2189     __ cbz(dst, L_failed);
2190 
2191     //  if (dst_pos < 0) return -1;
2192     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2193 
2194     // registers used as temp
2195     const Register scratch_length    = r16; // elements count to copy
2196     const Register scratch_src_klass = r17; // array klass
2197     const Register lh                = r15; // layout helper
2198 
2199     //  if (length < 0) return -1;
2200     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2201     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2202 
2203     __ load_klass(scratch_src_klass, src);
2204 #ifdef ASSERT
2205     //  assert(src->klass() != nullptr);
2206     {
2207       BLOCK_COMMENT("assert klasses not null {");
2208       Label L1, L2;
2209       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2210       __ bind(L1);
2211       __ stop("broken null klass");
2212       __ bind(L2);
2213       __ load_klass(rscratch1, dst);
2214       __ cbz(rscratch1, L1);     // this would be broken also
2215       BLOCK_COMMENT("} assert klasses not null done");
2216     }
2217 #endif
2218 
2219     // Load layout helper (32-bits)
2220     //
2221     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2222     // 32        30    24            16              8     2                 0
2223     //
2224     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2225     //
2226 
2227     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2228 
2229     // Handle objArrays completely differently...
2230     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2231     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2232     __ movw(rscratch1, objArray_lh);
2233     __ eorw(rscratch2, lh, rscratch1);
2234     __ cbzw(rscratch2, L_objArray);
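         // A zero XOR result means lh matches the canonical objArray layout
         // helper exactly, so this is an oop array; handle it at L_objArray.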
2235 
2236     //  if (src->klass() != dst->klass()) return -1;
2237     __ load_klass(rscratch2, dst);
2238     __ eor(rscratch2, rscratch2, scratch_src_klass);
2239     __ cbnz(rscratch2, L_failed);
2240 
2241     //  if (!src->is_Array()) return -1;
2242     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
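         // Array klasses have negative layout helpers (the array tag occupies
         // the top bits), so a clear sign bit means src is not an array at all.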
2243 
2244     // At this point, it is known to be a typeArray (array_tag 0x3).
2245 #ifdef ASSERT
2246     {
2247       BLOCK_COMMENT("assert primitive array {");
2248       Label L;
2249       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2250       __ cmpw(lh, rscratch2);
2251       __ br(Assembler::GE, L);
2252       __ stop("must be a primitive array");
2253       __ bind(L);
2254       BLOCK_COMMENT("} assert primitive array done");
2255     }
2256 #endif
2257 
2258     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2259                            rscratch2, L_failed);
2260 
2261     // TypeArrayKlass
2262     //
2263     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2264     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2265     //
2266 
2267     const Register rscratch1_offset = rscratch1;    // array offset
2268     const Register r15_elsize = lh; // element size
2269 
2270     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2271            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2272     __ add(src, src, rscratch1_offset);           // src array offset
2273     __ add(dst, dst, rscratch1_offset);           // dst array offset
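         // src and dst now point at element 0 of their respective arrays; the
         // scaled src_pos/dst_pos offsets are applied in the per-size cases below.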
2274     BLOCK_COMMENT("choose copy loop based on element size");
2275 
2276     // next registers should be set before the jump to corresponding stub
2277     const Register from     = c_rarg0;  // source array address
2278     const Register to       = c_rarg1;  // destination array address
2279     const Register count    = c_rarg2;  // elements count
2280 
2281     // 'from', 'to' and 'count' must be set in this order, since they alias
2282     // 'src', 'src_pos' and 'dst' and would otherwise clobber inputs still in use.
2283 
2284     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2285 
2286     // The possible values of elsize are 0-3, i.e. exact_log2(element
2287     // size in bytes).  We do a simple bitwise binary search.
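         // Bit 1 of the log2 element size separates {byte, short} from {int, long};
         // bit 0 then picks within each pair (see the tbnz tests below).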
2288   __ BIND(L_copy_bytes);
2289     __ tbnz(r15_elsize, 1, L_copy_ints);
2290     __ tbnz(r15_elsize, 0, L_copy_shorts);
2291     __ lea(from, Address(src, src_pos));// src_addr
2292     __ lea(to,   Address(dst, dst_pos));// dst_addr
2293     __ movw(count, scratch_length); // length
2294     __ b(RuntimeAddress(byte_copy_entry));
2295 
2296   __ BIND(L_copy_shorts);
2297     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2298     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2299     __ movw(count, scratch_length); // length
2300     __ b(RuntimeAddress(short_copy_entry));
2301 
2302   __ BIND(L_copy_ints);
2303     __ tbnz(r15_elsize, 0, L_copy_longs);
2304     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2305     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2306     __ movw(count, scratch_length); // length
2307     __ b(RuntimeAddress(int_copy_entry));
2308 
2309   __ BIND(L_copy_longs);
2310 #ifdef ASSERT
2311     {
2312       BLOCK_COMMENT("assert long copy {");
2313       Label L;
2314       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2315       __ cmpw(r15_elsize, LogBytesPerLong);
2316       __ br(Assembler::EQ, L);
2317       __ stop("must be long copy, but elsize is wrong");
2318       __ bind(L);
2319       BLOCK_COMMENT("} assert long copy done");
2320     }
2321 #endif
2322     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2323     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2324     __ movw(count, scratch_length); // length
2325     __ b(RuntimeAddress(long_copy_entry));
2326 
2327     // ObjArrayKlass
2328   __ BIND(L_objArray);
2329     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2330 
2331     Label L_plain_copy, L_checkcast_copy;
2332     //  test array classes for subtyping
2333     __ load_klass(r15, dst);
2334     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2335     __ br(Assembler::NE, L_checkcast_copy);
2336 
2337     // Identically typed arrays can be copied without element-wise checks.
2338     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2339                            rscratch2, L_failed);
2340 
2341     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2342     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2343     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2344     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2345     __ movw(count, scratch_length); // length
2346   __ BIND(L_plain_copy);
2347     __ b(RuntimeAddress(oop_copy_entry));
2348 
2349   __ BIND(L_checkcast_copy);
2350     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2351     {
2352       // Before looking at dst.length, make sure dst is also an objArray.
2353       __ ldrw(rscratch1, Address(r15, lh_offset));
2354       __ movw(rscratch2, objArray_lh);
2355       __ eorw(rscratch1, rscratch1, rscratch2);
2356       __ cbnzw(rscratch1, L_failed);
2357 
2358       // It is safe to examine both src.length and dst.length.
2359       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2360                              r15, L_failed);
2361 
2362       __ load_klass(dst_klass, dst); // reload
2363 
2364       // Marshal the base address arguments now, freeing registers.
2365       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2366       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2367       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2368       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2369       __ movw(count, length);           // length (reloaded)
2370       Register sco_temp = c_rarg3;      // this register is free now
2371       assert_different_registers(from, to, count, sco_temp,
2372                                  dst_klass, scratch_src_klass);
2373       // assert_clean_int(count, sco_temp);
2374 
2375       // Generate the type check.
2376       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2377       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2378 
2379       // Smashes rscratch1, rscratch2
2380       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2381                           L_plain_copy);
2382 
2383       // Fetch destination element klass from the ObjArrayKlass header.
2384       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2385       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2386       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2387 
2388       // the checkcast_copy loop needs two extra arguments:
2389       assert(c_rarg3 == sco_temp, "#3 already in place");
2390       // Set up arguments for checkcast_copy_entry.
2391       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2392       __ b(RuntimeAddress(checkcast_copy_entry));
2393     }
2394 
2395   __ BIND(L_failed);
2396     __ mov(r0, -1);
2397     __ leave();   // required for proper stackwalking of RuntimeStub frame
2398     __ ret(lr);
2399 
2400     return start;
2401   }
2402 
2403   //
2404   // Generate stub for array fill. If "aligned" is true, the
2405   // "to" address is assumed to be heapword aligned.
2406   //
2407   // Arguments for generated stub:
2408   //   to:    c_rarg0
2409   //   value: c_rarg1
2410   //   count: c_rarg2 treated as signed
2411   //
2412   address generate_fill(BasicType t, bool aligned, const char *name) {
2413     __ align(CodeEntryAlignment);
2414     StubCodeMark mark(this, "StubRoutines", name);
2415     address start = __ pc();
2416 
2417     BLOCK_COMMENT("Entry:");
2418 
2419     const Register to        = c_rarg0;  // destination array address
2420     const Register value     = c_rarg1;  // value
2421     const Register count     = c_rarg2;  // elements count
2422 
2423     const Register bz_base = r10;        // base for block_zero routine
2424     const Register cnt_words = r11;      // temp register
2425 
2426     __ enter();
2427 
2428     Label L_fill_elements, L_exit1;
2429 
2430     int shift = -1;
2431     switch (t) {
2432       case T_BYTE:
2433         shift = 0;
2434         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2435         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2436         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2437         __ br(Assembler::LO, L_fill_elements);
2438         break;
2439       case T_SHORT:
2440         shift = 1;
2441         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2442         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2443         __ br(Assembler::LO, L_fill_elements);
2444         break;
2445       case T_INT:
2446         shift = 2;
2447         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2448         __ br(Assembler::LO, L_fill_elements);
2449         break;
2450       default: ShouldNotReachHere();
2451     }
2452 
2453     // Align source address at 8 bytes address boundary.
2454     Label L_skip_align1, L_skip_align2, L_skip_align4;
2455     if (!aligned) {
2456       switch (t) {
2457         case T_BYTE:
2458           // One byte misalignment happens only for byte arrays.
2459           __ tbz(to, 0, L_skip_align1);
2460           __ strb(value, Address(__ post(to, 1)));
2461           __ subw(count, count, 1);
2462           __ bind(L_skip_align1);
2463           // Fallthrough
2464         case T_SHORT:
2465           // Two bytes misalignment happens only for byte and short (char) arrays.
2466           __ tbz(to, 1, L_skip_align2);
2467           __ strh(value, Address(__ post(to, 2)));
2468           __ subw(count, count, 2 >> shift);
2469           __ bind(L_skip_align2);
2470           // Fallthrough
2471         case T_INT:
2472           // Align to 8 bytes, we know we are 4 byte aligned to start.
2473           __ tbz(to, 2, L_skip_align4);
2474           __ strw(value, Address(__ post(to, 4)));
2475           __ subw(count, count, 4 >> shift);
2476           __ bind(L_skip_align4);
2477           break;
2478         default: ShouldNotReachHere();
2479       }
2480     }
2481 
2482     //
2483     //  Fill large chunks
2484     //
2485     __ lsrw(cnt_words, count, 3 - shift); // number of words
2486     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2487     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
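         // count now holds only the elements left over after the whole-word
         // fill, i.e. strictly fewer than one 8-byte word's worth.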
2488     if (UseBlockZeroing) {
2489       Label non_block_zeroing, rest;
2490       // If the fill value is zero we can use the fast zero_words().
2491       __ cbnz(value, non_block_zeroing);
2492       __ mov(bz_base, to);
2493       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2494       address tpc = __ zero_words(bz_base, cnt_words);
2495       if (tpc == nullptr) {
2496         fatal("CodeCache is full at generate_fill");
2497       }
2498       __ b(rest);
2499       __ bind(non_block_zeroing);
2500       __ fill_words(to, cnt_words, value);
2501       __ bind(rest);
2502     } else {
2503       __ fill_words(to, cnt_words, value);
2504     }
2505 
2506     // Remaining count is less than 8 bytes. Fill it by a single store.
2507     // Note that the total length is no less than 8 bytes.
2508     if (t == T_BYTE || t == T_SHORT) {
2509       Label L_exit1;
2510       __ cbzw(count, L_exit1);
2511       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2512       __ str(value, Address(to, -8));    // overwrite some elements
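           // This 64-bit store ends exactly at the last element; the bytes it
           // rewrites were already filled above, and the total length is known
           // to be at least 8 bytes, so it cannot underrun the region.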
2513       __ bind(L_exit1);
2514       __ leave();
2515       __ ret(lr);
2516     }
2517 
2518     // Handle copies less than 8 bytes.
2519     Label L_fill_2, L_fill_4, L_exit2;
2520     __ bind(L_fill_elements);
2521     switch (t) {
2522       case T_BYTE:
2523         __ tbz(count, 0, L_fill_2);
2524         __ strb(value, Address(__ post(to, 1)));
2525         __ bind(L_fill_2);
2526         __ tbz(count, 1, L_fill_4);
2527         __ strh(value, Address(__ post(to, 2)));
2528         __ bind(L_fill_4);
2529         __ tbz(count, 2, L_exit2);
2530         __ strw(value, Address(to));
2531         break;
2532       case T_SHORT:
2533         __ tbz(count, 0, L_fill_4);
2534         __ strh(value, Address(__ post(to, 2)));
2535         __ bind(L_fill_4);
2536         __ tbz(count, 1, L_exit2);
2537         __ strw(value, Address(to));
2538         break;
2539       case T_INT:
2540         __ cbzw(count, L_exit2);
2541         __ strw(value, Address(to));
2542         break;
2543       default: ShouldNotReachHere();
2544     }
2545     __ bind(L_exit2);
2546     __ leave();
2547     __ ret(lr);
2548     return start;
2549   }
2550 
2551   address generate_data_cache_writeback() {
2552     const Register line        = c_rarg0;  // address of line to write back
2553 
2554     __ align(CodeEntryAlignment);
2555 
2556     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2557 
2558     address start = __ pc();
2559     __ enter();
2560     __ cache_wb(Address(line, 0));
2561     __ leave();
2562     __ ret(lr);
2563 
2564     return start;
2565   }
2566 
2567   address generate_data_cache_writeback_sync() {
2568     const Register is_pre     = c_rarg0;  // pre or post sync
2569 
2570     __ align(CodeEntryAlignment);
2571 
2572     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2573 
2574     // pre wbsync is a no-op
2575     // post wbsync translates to a memory barrier (the AArch64 analogue of an x86 sfence)
2576 
2577     Label skip;
2578     address start = __ pc();
2579     __ enter();
2580     __ cbnz(is_pre, skip);
2581     __ cache_wbsync(false);
2582     __ bind(skip);
2583     __ leave();
2584     __ ret(lr);
2585 
2586     return start;
2587   }
2588 
2589   void generate_arraycopy_stubs() {
2590     address entry;
2591     address entry_jbyte_arraycopy;
2592     address entry_jshort_arraycopy;
2593     address entry_jint_arraycopy;
2594     address entry_oop_arraycopy;
2595     address entry_jlong_arraycopy;
2596     address entry_checkcast_arraycopy;
2597 
2598     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards);
2599     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards);
2600 
2601     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards);
2602     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards);
2603 
2604     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards);
2605     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards);
2606 
2607     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2608 
2609     //*** jbyte
2610     // Always need aligned and unaligned versions
2611     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2612                                                                                   "jbyte_disjoint_arraycopy");
2613     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2614                                                                                   &entry_jbyte_arraycopy,
2615                                                                                   "jbyte_arraycopy");
2616     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2617                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2618     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, nullptr,
2619                                                                                   "arrayof_jbyte_arraycopy");
2620 
2621     //*** jshort
2622     // Always need aligned and unaligned versions
2623     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2624                                                                                     "jshort_disjoint_arraycopy");
2625     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2626                                                                                     &entry_jshort_arraycopy,
2627                                                                                     "jshort_arraycopy");
2628     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2629                                                                                     "arrayof_jshort_disjoint_arraycopy");
2630     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2631                                                                                     "arrayof_jshort_arraycopy");
2632 
2633     //*** jint
2634     // Aligned versions
2635     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2636                                                                                 "arrayof_jint_disjoint_arraycopy");
2637     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2638                                                                                 "arrayof_jint_arraycopy");
2639     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2640     // entry_jint_arraycopy always points to the unaligned version
2641     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2642                                                                                 "jint_disjoint_arraycopy");
2643     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2644                                                                                 &entry_jint_arraycopy,
2645                                                                                 "jint_arraycopy");
2646 
2647     //*** jlong
2648     // It is always aligned
2649     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2650                                                                                   "arrayof_jlong_disjoint_arraycopy");
2651     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2652                                                                                   "arrayof_jlong_arraycopy");
2653     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2654     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2655 
2656     //*** oops
2657     {
2658       // With compressed oops we need unaligned versions; notice that
2659       // we overwrite entry_oop_arraycopy.
2660       bool aligned = !UseCompressedOops;
2661 
2662       StubRoutines::_arrayof_oop_disjoint_arraycopy
2663         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2664                                      /*dest_uninitialized*/false);
2665       StubRoutines::_arrayof_oop_arraycopy
2666         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2667                                      /*dest_uninitialized*/false);
2668       // Aligned versions without pre-barriers
2669       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2670         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2671                                      /*dest_uninitialized*/true);
2672       StubRoutines::_arrayof_oop_arraycopy_uninit
2673         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2674                                      /*dest_uninitialized*/true);
2675     }
2676 
2677     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2678     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2679     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2680     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2681 
2682     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2683     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2684                                                                         /*dest_uninitialized*/true);
2685 
2686     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2687                                                               entry_jbyte_arraycopy,
2688                                                               entry_jshort_arraycopy,
2689                                                               entry_jint_arraycopy,
2690                                                               entry_jlong_arraycopy);
2691 
2692     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2693                                                                entry_jbyte_arraycopy,
2694                                                                entry_jshort_arraycopy,
2695                                                                entry_jint_arraycopy,
2696                                                                entry_oop_arraycopy,
2697                                                                entry_jlong_arraycopy,
2698                                                                entry_checkcast_arraycopy);
2699 
2700     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2701     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2702     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2703     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2704     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2705     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2706   }
2707 
2708   void generate_math_stubs() { Unimplemented(); }
2709 
2710   // Arguments:
2711   //
2712   // Inputs:
2713   //   c_rarg0   - source byte array address
2714   //   c_rarg1   - destination byte array address
2715   //   c_rarg2   - K (key) in little endian int array
2716   //
  address generate_aescrypt_encryptBlock() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
2719     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2720 
2721     const Register from        = c_rarg0;  // source array address
2722     const Register to          = c_rarg1;  // destination array address
2723     const Register key         = c_rarg2;  // key array address
2724     const Register keylen      = rscratch1;
2725 
2726     address start = __ pc();
2727     __ enter();
2728 
2729     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2730 
2731     __ aesenc_loadkeys(key, keylen);
2732     __ aesecb_encrypt(from, to, keylen);
2733 
2734     __ mov(r0, 0);
2735 
2736     __ leave();
2737     __ ret(lr);
2738 
2739     return start;
2740   }
2741 
2742   // Arguments:
2743   //
2744   // Inputs:
2745   //   c_rarg0   - source byte array address
2746   //   c_rarg1   - destination byte array address
2747   //   c_rarg2   - K (key) in little endian int array
2748   //
2749   address generate_aescrypt_decryptBlock() {
2750     assert(UseAES, "need AES cryptographic extension support");
2751     __ align(CodeEntryAlignment);
2752     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2753     Label L_doLast;
2754 
2755     const Register from        = c_rarg0;  // source array address
2756     const Register to          = c_rarg1;  // destination array address
2757     const Register key         = c_rarg2;  // key array address
2758     const Register keylen      = rscratch1;
2759 
2760     address start = __ pc();
2761     __ enter(); // required for proper stackwalking of RuntimeStub frame
2762 
2763     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2764 
2765     __ aesecb_decrypt(from, to, key, keylen);
2766 
2767     __ mov(r0, 0);
2768 
2769     __ leave();
2770     __ ret(lr);
2771 
2772     return start;
2773   }
2774 
2775   // Arguments:
2776   //
2777   // Inputs:
2778   //   c_rarg0   - source byte array address
2779   //   c_rarg1   - destination byte array address
2780   //   c_rarg2   - K (key) in little endian int array
2781   //   c_rarg3   - r vector byte array address
2782   //   c_rarg4   - input length
2783   //
2784   // Output:
  //   r0        - input length
2786   //
2787   address generate_cipherBlockChaining_encryptAESCrypt() {
2788     assert(UseAES, "need AES cryptographic extension support");
2789     __ align(CodeEntryAlignment);
2790     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2791 
2792     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2793 
2794     const Register from        = c_rarg0;  // source array address
2795     const Register to          = c_rarg1;  // destination array address
2796     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address
                                           // and left holding the last ciphertext block on exit
    const Register len_reg     = c_rarg4;  // src len (must be a multiple of the 16-byte block size)
2800     const Register keylen      = rscratch1;
2801 
2802     address start = __ pc();
2803 
2804       __ enter();
2805 
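      // Save the original length; it is returned in r0 on exit.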
2806       __ movw(rscratch2, len_reg);
2807 
2808       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2809 
2810       __ ld1(v0, __ T16B, rvec);
2811 
2812       __ cmpw(keylen, 52);
2813       __ br(Assembler::CC, L_loadkeys_44);
2814       __ br(Assembler::EQ, L_loadkeys_52);
2815 
2816       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2817       __ rev32(v17, __ T16B, v17);
2818       __ rev32(v18, __ T16B, v18);
2819     __ BIND(L_loadkeys_52);
2820       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2821       __ rev32(v19, __ T16B, v19);
2822       __ rev32(v20, __ T16B, v20);
2823     __ BIND(L_loadkeys_44);
2824       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2825       __ rev32(v21, __ T16B, v21);
2826       __ rev32(v22, __ T16B, v22);
2827       __ rev32(v23, __ T16B, v23);
2828       __ rev32(v24, __ T16B, v24);
2829       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2830       __ rev32(v25, __ T16B, v25);
2831       __ rev32(v26, __ T16B, v26);
2832       __ rev32(v27, __ T16B, v27);
2833       __ rev32(v28, __ T16B, v28);
2834       __ ld1(v29, v30, v31, __ T16B, key);
2835       __ rev32(v29, __ T16B, v29);
2836       __ rev32(v30, __ T16B, v30);
2837       __ rev32(v31, __ T16B, v31);
2838 
2839     __ BIND(L_aes_loop);
2840       __ ld1(v1, __ T16B, __ post(from, 16));
2841       __ eor(v0, __ T16B, v0, v1);
2842 
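      // The CC/EQ branches below reuse the condition flags set by
      // cmpw(keylen, 52) above; nothing in this loop modifies them, so they
      // still select the 10-, 12- or 14-round path for 128/192/256-bit keys.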
2843       __ br(Assembler::CC, L_rounds_44);
2844       __ br(Assembler::EQ, L_rounds_52);
2845 
2846       __ aese(v0, v17); __ aesmc(v0, v0);
2847       __ aese(v0, v18); __ aesmc(v0, v0);
2848     __ BIND(L_rounds_52);
2849       __ aese(v0, v19); __ aesmc(v0, v0);
2850       __ aese(v0, v20); __ aesmc(v0, v0);
2851     __ BIND(L_rounds_44);
2852       __ aese(v0, v21); __ aesmc(v0, v0);
2853       __ aese(v0, v22); __ aesmc(v0, v0);
2854       __ aese(v0, v23); __ aesmc(v0, v0);
2855       __ aese(v0, v24); __ aesmc(v0, v0);
2856       __ aese(v0, v25); __ aesmc(v0, v0);
2857       __ aese(v0, v26); __ aesmc(v0, v0);
2858       __ aese(v0, v27); __ aesmc(v0, v0);
2859       __ aese(v0, v28); __ aesmc(v0, v0);
2860       __ aese(v0, v29); __ aesmc(v0, v0);
2861       __ aese(v0, v30);
2862       __ eor(v0, __ T16B, v0, v31);
2863 
2864       __ st1(v0, __ T16B, __ post(to, 16));
2865 
2866       __ subw(len_reg, len_reg, 16);
2867       __ cbnzw(len_reg, L_aes_loop);
2868 
2869       __ st1(v0, __ T16B, rvec);
2870 
2871       __ mov(r0, rscratch2);
2872 
2873       __ leave();
2874       __ ret(lr);
2875 
2876       return start;
2877   }
2878 
2879   // Arguments:
2880   //
2881   // Inputs:
2882   //   c_rarg0   - source byte array address
2883   //   c_rarg1   - destination byte array address
2884   //   c_rarg2   - K (key) in little endian int array
2885   //   c_rarg3   - r vector byte array address
2886   //   c_rarg4   - input length
2887   //
2888   // Output:
2889   //   r0        - input length
2890   //
2891   address generate_cipherBlockChaining_decryptAESCrypt() {
2892     assert(UseAES, "need AES cryptographic extension support");
2893     __ align(CodeEntryAlignment);
2894     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2895 
2896     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2897 
2898     const Register from        = c_rarg0;  // source array address
2899     const Register to          = c_rarg1;  // destination array address
2900     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address
                                           // and left holding the last ciphertext (input) block on exit
    const Register len_reg     = c_rarg4;  // src len (must be a multiple of the 16-byte block size)
2904     const Register keylen      = rscratch1;
2905 
2906     address start = __ pc();
2907 
2908       __ enter();
2909 
2910       __ movw(rscratch2, len_reg);
2911 
2912       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2913 
2914       __ ld1(v2, __ T16B, rvec);
2915 
2916       __ ld1(v31, __ T16B, __ post(key, 16));
2917       __ rev32(v31, __ T16B, v31);
2918 
2919       __ cmpw(keylen, 52);
2920       __ br(Assembler::CC, L_loadkeys_44);
2921       __ br(Assembler::EQ, L_loadkeys_52);
2922 
2923       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2924       __ rev32(v17, __ T16B, v17);
2925       __ rev32(v18, __ T16B, v18);
2926     __ BIND(L_loadkeys_52);
2927       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2928       __ rev32(v19, __ T16B, v19);
2929       __ rev32(v20, __ T16B, v20);
2930     __ BIND(L_loadkeys_44);
2931       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2932       __ rev32(v21, __ T16B, v21);
2933       __ rev32(v22, __ T16B, v22);
2934       __ rev32(v23, __ T16B, v23);
2935       __ rev32(v24, __ T16B, v24);
2936       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2937       __ rev32(v25, __ T16B, v25);
2938       __ rev32(v26, __ T16B, v26);
2939       __ rev32(v27, __ T16B, v27);
2940       __ rev32(v28, __ T16B, v28);
2941       __ ld1(v29, v30, __ T16B, key);
2942       __ rev32(v29, __ T16B, v29);
2943       __ rev32(v30, __ T16B, v30);
2944 
2945     __ BIND(L_aes_loop);
2946       __ ld1(v0, __ T16B, __ post(from, 16));
2947       __ orr(v1, __ T16B, v0, v0);
2948 
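      // v1 keeps a copy of the ciphertext block; it becomes the next chaining
      // value (copied into v2 after the store below). The CC/EQ branches reuse
      // the flags from cmpw(keylen, 52); nothing in this loop modifies them.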
2949       __ br(Assembler::CC, L_rounds_44);
2950       __ br(Assembler::EQ, L_rounds_52);
2951 
2952       __ aesd(v0, v17); __ aesimc(v0, v0);
2953       __ aesd(v0, v18); __ aesimc(v0, v0);
2954     __ BIND(L_rounds_52);
2955       __ aesd(v0, v19); __ aesimc(v0, v0);
2956       __ aesd(v0, v20); __ aesimc(v0, v0);
2957     __ BIND(L_rounds_44);
2958       __ aesd(v0, v21); __ aesimc(v0, v0);
2959       __ aesd(v0, v22); __ aesimc(v0, v0);
2960       __ aesd(v0, v23); __ aesimc(v0, v0);
2961       __ aesd(v0, v24); __ aesimc(v0, v0);
2962       __ aesd(v0, v25); __ aesimc(v0, v0);
2963       __ aesd(v0, v26); __ aesimc(v0, v0);
2964       __ aesd(v0, v27); __ aesimc(v0, v0);
2965       __ aesd(v0, v28); __ aesimc(v0, v0);
2966       __ aesd(v0, v29); __ aesimc(v0, v0);
2967       __ aesd(v0, v30);
2968       __ eor(v0, __ T16B, v0, v31);
2969       __ eor(v0, __ T16B, v0, v2);
2970 
2971       __ st1(v0, __ T16B, __ post(to, 16));
2972       __ orr(v2, __ T16B, v1, v1);
2973 
2974       __ subw(len_reg, len_reg, 16);
2975       __ cbnzw(len_reg, L_aes_loop);
2976 
2977       __ st1(v2, __ T16B, rvec);
2978 
2979       __ mov(r0, rscratch2);
2980 
2981       __ leave();
2982       __ ret(lr);
2983 
2984     return start;
2985   }
2986 
  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: in (the 128-bit value) and inc (the 64-bit increment); both are
  // preserved. The lower dword of inc must be zero.
  // The least-significant 64-bit word is in the upper dword of each vector.
  // Output: result
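  // Example: in = { MSD = 0x0, LSD = 0xFFFFFFFFFFFFFFFF }, inc = 1
  //          => result = { MSD = 0x1, LSD = 0x0 }, i.e. the carry out of the
  //          least-significant dword propagates into the most-significant one.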
2992   void be_add_128_64(FloatRegister result, FloatRegister in,
2993                      FloatRegister inc, FloatRegister tmp) {
2994     assert_different_registers(result, tmp, inc);
2995 
2996     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
2997                                            // input
2998     __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
2999     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
3000                                            // MSD == 0 (must be!) to LSD
3001     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
3002   }
3003 
3004   // CTR AES crypt.
3005   // Arguments:
3006   //
3007   // Inputs:
3008   //   c_rarg0   - source byte array address
3009   //   c_rarg1   - destination byte array address
3010   //   c_rarg2   - K (key) in little endian int array
3011   //   c_rarg3   - counter vector byte array address
3012   //   c_rarg4   - input length
3013   //   c_rarg5   - saved encryptedCounter start
3014   //   c_rarg6   - saved used length
3015   //
3016   // Output:
3017   //   r0       - input length
3018   //
3019   address generate_counterMode_AESCrypt() {
3020     const Register in = c_rarg0;
3021     const Register out = c_rarg1;
3022     const Register key = c_rarg2;
3023     const Register counter = c_rarg3;
3024     const Register saved_len = c_rarg4, len = r10;
3025     const Register saved_encrypted_ctr = c_rarg5;
3026     const Register used_ptr = c_rarg6, used = r12;
3027 
3028     const Register offset = r7;
3029     const Register keylen = r11;
3030 
3031     const unsigned char block_size = 16;
3032     const int bulk_width = 4;
3033     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3034     // performance with larger data sizes, but it also means that the
3035     // fast path isn't used until you have at least 8 blocks, and up
3036     // to 127 bytes of data will be executed on the slow path. For
    // to 127 bytes of data will be processed on the slow path. For
3038     // blocks seems like a sensible compromise.
3039 
3040     // Algorithm:
3041     //
3042     //    if (len == 0) {
3043     //        goto DONE;
3044     //    }
3045     //    int result = len;
3046     //    do {
3047     //        if (used >= blockSize) {
3048     //            if (len >= bulk_width * blockSize) {
3049     //                CTR_large_block();
3050     //                if (len == 0)
3051     //                    goto DONE;
3052     //            }
3053     //            for (;;) {
3054     //                16ByteVector v0 = counter;
3055     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3056     //                used = 0;
3057     //                if (len < blockSize)
3058     //                    break;    /* goto NEXT */
3059     //                16ByteVector v1 = load16Bytes(in, offset);
3060     //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(v1, out, offset);
3062     //                used = blockSize;
3063     //                offset += blockSize;
3064     //                len -= blockSize;
3065     //                if (len == 0)
3066     //                    goto DONE;
3067     //            }
3068     //        }
3069     //      NEXT:
3070     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3071     //        len--;
3072     //    } while (len != 0);
3073     //  DONE:
3074     //    return result;
3075     //
3076     // CTR_large_block()
3077     //    Wide bulk encryption of whole blocks.
3078 
3079     __ align(CodeEntryAlignment);
3080     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3081     const address start = __ pc();
3082     __ enter();
3083 
3084     Label DONE, CTR_large_block, large_block_return;
3085     __ ldrw(used, Address(used_ptr));
3086     __ cbzw(saved_len, DONE);
3087 
3088     __ mov(len, saved_len);
3089     __ mov(offset, 0);
3090 
3091     // Compute #rounds for AES based on the length of the key array
3092     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3093 
3094     __ aesenc_loadkeys(key, keylen);
3095 
3096     {
3097       Label L_CTR_loop, NEXT;
3098 
3099       __ bind(L_CTR_loop);
3100 
3101       __ cmp(used, block_size);
3102       __ br(__ LO, NEXT);
3103 
3104       // Maybe we have a lot of data
3105       __ subsw(rscratch1, len, bulk_width * block_size);
3106       __ br(__ HS, CTR_large_block);
3107       __ BIND(large_block_return);
3108       __ cbzw(len, DONE);
3109 
3110       // Setup the counter
3111       __ movi(v4, __ T4S, 0);
3112       __ movi(v5, __ T4S, 1);
3113       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3114 
3115       // 128-bit big-endian increment
3116       __ ld1(v0, __ T16B, counter);
3117       __ rev64(v16, __ T16B, v0);
3118       be_add_128_64(v16, v16, v4, /*tmp*/v5);
3119       __ rev64(v16, __ T16B, v16);
3120       __ st1(v16, __ T16B, counter);
3121       // Previous counter value is in v0
3122       // v4 contains { 0, 1 }
3123 
3124       {
3125         // We have fewer than bulk_width blocks of data left. Encrypt
3126         // them one by one until there is less than a full block
3127         // remaining, being careful to save both the encrypted counter
3128         // and the counter.
3129 
3130         Label inner_loop;
3131         __ bind(inner_loop);
3132         // Counter to encrypt is in v0
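        // (with noreg src/dst, aesecb_encrypt neither loads nor stores; it
        // encrypts v0 in place using the key schedule loaded above)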
3133         __ aesecb_encrypt(noreg, noreg, keylen);
3134         __ st1(v0, __ T16B, saved_encrypted_ctr);
3135 
3136         // Do we have a remaining full block?
3137 
3138         __ mov(used, 0);
3139         __ cmp(len, block_size);
3140         __ br(__ LO, NEXT);
3141 
3142         // Yes, we have a full block
3143         __ ldrq(v1, Address(in, offset));
3144         __ eor(v1, __ T16B, v1, v0);
3145         __ strq(v1, Address(out, offset));
3146         __ mov(used, block_size);
3147         __ add(offset, offset, block_size);
3148 
3149         __ subw(len, len, block_size);
3150         __ cbzw(len, DONE);
3151 
3152         // Increment the counter, store it back
3153         __ orr(v0, __ T16B, v16, v16);
3154         __ rev64(v16, __ T16B, v16);
3155         be_add_128_64(v16, v16, v4, /*tmp*/v5);
3156         __ rev64(v16, __ T16B, v16);
3157         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3158 
3159         __ b(inner_loop);
3160       }
3161 
3162       __ BIND(NEXT);
3163 
3164       // Encrypt a single byte, and loop.
3165       // We expect this to be a rare event.
3166       __ ldrb(rscratch1, Address(in, offset));
3167       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3168       __ eor(rscratch1, rscratch1, rscratch2);
3169       __ strb(rscratch1, Address(out, offset));
3170       __ add(offset, offset, 1);
3171       __ add(used, used, 1);
3172       __ subw(len, len,1);
3173       __ cbnzw(len, L_CTR_loop);
3174     }
3175 
3176     __ bind(DONE);
3177     __ strw(used, Address(used_ptr));
3178     __ mov(r0, saved_len);
3179 
3180     __ leave(); // required for proper stackwalking of RuntimeStub frame
3181     __ ret(lr);
3182 
3183     // Bulk encryption
3184 
3185     __ BIND (CTR_large_block);
3186     assert(bulk_width == 4 || bulk_width == 8, "must be");
3187 
3188     if (bulk_width == 8) {
3189       __ sub(sp, sp, 4 * 16);
3190       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3191     }
3192     __ sub(sp, sp, 4 * 16);
3193     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3194     RegSet saved_regs = (RegSet::of(in, out, offset)
3195                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3196     __ push(saved_regs, sp);
3197     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3198     __ add(in, in, offset);
3199     __ add(out, out, offset);
3200 
3201     // Keys should already be loaded into the correct registers
3202 
3203     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3204     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3205 
3206     // AES/CTR loop
3207     {
3208       Label L_CTR_loop;
3209       __ BIND(L_CTR_loop);
3210 
3211       // Setup the counters
3212       __ movi(v8, __ T4S, 0);
3213       __ movi(v9, __ T4S, 1);
3214       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3215 
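      // Materialize bulk_width successive counter values in v0..: each
      // iteration converts the running byte-reversed counter back to its
      // in-memory byte order and then increments it as a 128-bit big-endian
      // value.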
3216       for (int i = 0; i < bulk_width; i++) {
3217         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3218         __ rev64(v0_ofs, __ T16B, v16);
3219         be_add_128_64(v16, v16, v8, /*tmp*/v9);
3220       }
3221 
3222       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3223 
3224       // Encrypt the counters
3225       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3226 
3227       if (bulk_width == 8) {
3228         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3229       }
3230 
3231       // XOR the encrypted counters with the inputs
3232       for (int i = 0; i < bulk_width; i++) {
3233         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3234         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3235         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3236       }
3237 
3238       // Write the encrypted data
3239       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3240       if (bulk_width == 8) {
3241         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3242       }
3243 
3244       __ subw(len, len, 16 * bulk_width);
3245       __ cbnzw(len, L_CTR_loop);
3246     }
3247 
3248     // Save the counter back where it goes
3249     __ rev64(v16, __ T16B, v16);
3250     __ st1(v16, __ T16B, counter);
3251 
3252     __ pop(saved_regs, sp);
3253 
3254     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3255     if (bulk_width == 8) {
3256       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3257     }
3258 
3259     __ andr(rscratch1, len, -16 * bulk_width);
3260     __ sub(len, len, rscratch1);
3261     __ add(offset, offset, rscratch1);
3262     __ mov(used, 16);
3263     __ strw(used, Address(used_ptr));
3264     __ b(large_block_return);
3265 
3266     return start;
3267   }
3268 
3269   // Vector AES Galois Counter Mode implementation. Parameters:
3270   //
3271   // in = c_rarg0
3272   // len = c_rarg1
3273   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3274   // out = c_rarg3
3275   // key = c_rarg4
3276   // state = c_rarg5 - GHASH.state
3277   // subkeyHtbl = c_rarg6 - powers of H
3278   // counter = c_rarg7 - 16 bytes of CTR
3279   // return - number of processed bytes
3280   address generate_galoisCounterMode_AESCrypt() {
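    // The GHASH field polynomial constant is emitted just ahead of the stub
    // entry; its address is handed to ghash_processBlocks_wide below.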
3281     address ghash_polynomial = __ pc();
3282     __ emit_int64(0x87);  // The low-order bits of the field
3283                           // polynomial (i.e. p = z^7+z^2+z+1)
3284                           // repeated in the low and high parts of a
3285                           // 128-bit vector
3286     __ emit_int64(0x87);
3287 
3288     __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3290     address start = __ pc();
3291     __ enter();
3292 
3293     const Register in = c_rarg0;
3294     const Register len = c_rarg1;
3295     const Register ct = c_rarg2;
    const Register out = c_rarg3;

    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    const Register counter = c_rarg7;  // 16 bytes of CTR, updated with the
                                       // incremented counter on exit
3305 
3306     const Register keylen = r10;
3307     // Save state before entering routine
3308     __ sub(sp, sp, 4 * 16);
3309     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3310     __ sub(sp, sp, 4 * 16);
3311     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3312 
    __ andr(len, len, -16 * 8);  // = -512: 8 encryptions, 16 bytes per encryption
3315     __ str(len, __ pre(sp, -2 * wordSize));
3316 
3317     Label DONE;
3318     __ cbz(len, DONE);
3319 
3320     // Compute #rounds for AES based on the length of the key array
3321     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3322 
3323     __ aesenc_loadkeys(key, keylen);
3324     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3325     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3326 
3327     // AES/CTR loop
3328     {
3329       Label L_CTR_loop;
3330       __ BIND(L_CTR_loop);
3331 
3332       // Setup the counters
3333       __ movi(v8, __ T4S, 0);
3334       __ movi(v9, __ T4S, 1);
3335       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
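      // GCM increments only the low-order 32 bits of the counter (inc32), so
      // adding { 0, 0, 0, 1 } lane-wise to the byte-reversed counter (with no
      // cross-lane carry) is sufficient.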
3336 
3337       assert(v0->encoding() < v8->encoding(), "");
3338       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3339         FloatRegister f = as_FloatRegister(i);
3340         __ rev32(f, __ T16B, v16);
3341         __ addv(v16, __ T4S, v16, v8);
3342       }
3343 
3344       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3345 
3346       // Encrypt the counters
3347       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3348 
3349       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3350 
3351       // XOR the encrypted counters with the inputs
3352       for (int i = 0; i < 8; i++) {
3353         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3354         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3355         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3356       }
3357       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3358       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3359 
3360       __ subw(len, len, 16 * 8);
3361       __ cbnzw(len, L_CTR_loop);
3362     }
3363 
3364     __ rev32(v16, __ T16B, v16);
3365     __ st1(v16, __ T16B, counter);
3366 
3367     __ ldr(len, Address(sp));
3368     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3369 
3370     // GHASH/CTR loop
3371     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3372                                 len, /*unrolls*/4);
3373 
3374 #ifdef ASSERT
3375     { Label L;
3376       __ cmp(len, (unsigned char)0);
3377       __ br(Assembler::EQ, L);
3378       __ stop("stubGenerator: abort");
3379       __ bind(L);
    }
#endif

    __ bind(DONE);
3384     // Return the number of bytes processed
3385     __ ldr(r0, __ post(sp, 2 * wordSize));
3386 
3387     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3388     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3389 
3390     __ leave(); // required for proper stackwalking of RuntimeStub frame
3391     __ ret(lr);
    return start;
3393   }
3394 
3395   class Cached64Bytes {
3396   private:
3397     MacroAssembler *_masm;
3398     Register _regs[8];
3399 
3400   public:
3401     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "expected 8 registers to cache 64 bytes (16 4-byte words), not %u", rs.size());
3403       auto it = rs.begin();
3404       for (auto &r: _regs) {
3405         r = *it;
3406         ++it;
3407       }
3408     }
3409 
3410     void gen_loads(Register base) {
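      // Load one 64-byte block from base into the eight cached registers,
      // 16 bytes (one register pair) per ldp.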
3411       for (int i = 0; i < 8; i += 2) {
3412         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3413       }
3414     }
3415 
3416     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3417     void extract_u32(Register dest, int i) {
3418       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3419     }
3420   };
3421 
3422   // Utility routines for md5.
3423   // Clobbers r10 and r11.
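  //
  // Each step computes   a = b + rotl32(a + f(b,c,d) + x[k] + t, s)
  // with the four auxiliary functions of RFC 1321:
  //   F(x,y,z) = (x & y) | (~x & z)   (computed here as z ^ (x & (y ^ z)))
  //   G(x,y,z) = (x & z) | (y & ~z)
  //   H(x,y,z) = x ^ y ^ z
  //   I(x,y,z) = y ^ (x | ~z)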
3424   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3425               int k, int s, int t) {
3426     Register rscratch3 = r10;
3427     Register rscratch4 = r11;
3428 
3429     __ eorw(rscratch3, r3, r4);
3430     __ movw(rscratch2, t);
3431     __ andw(rscratch3, rscratch3, r2);
3432     __ addw(rscratch4, r1, rscratch2);
3433     reg_cache.extract_u32(rscratch1, k);
3434     __ eorw(rscratch3, rscratch3, r4);
3435     __ addw(rscratch4, rscratch4, rscratch1);
3436     __ addw(rscratch3, rscratch3, rscratch4);
3437     __ rorw(rscratch2, rscratch3, 32 - s);
3438     __ addw(r1, rscratch2, r2);
3439   }
3440 
3441   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3442               int k, int s, int t) {
3443     Register rscratch3 = r10;
3444     Register rscratch4 = r11;
3445 
3446     reg_cache.extract_u32(rscratch1, k);
3447     __ movw(rscratch2, t);
3448     __ addw(rscratch4, r1, rscratch2);
3449     __ addw(rscratch4, rscratch4, rscratch1);
3450     __ bicw(rscratch2, r3, r4);
3451     __ andw(rscratch3, r2, r4);
3452     __ addw(rscratch2, rscratch2, rscratch4);
3453     __ addw(rscratch2, rscratch2, rscratch3);
3454     __ rorw(rscratch2, rscratch2, 32 - s);
3455     __ addw(r1, rscratch2, r2);
3456   }
3457 
3458   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3459               int k, int s, int t) {
3460     Register rscratch3 = r10;
3461     Register rscratch4 = r11;
3462 
3463     __ eorw(rscratch3, r3, r4);
3464     __ movw(rscratch2, t);
3465     __ addw(rscratch4, r1, rscratch2);
3466     reg_cache.extract_u32(rscratch1, k);
3467     __ eorw(rscratch3, rscratch3, r2);
3468     __ addw(rscratch4, rscratch4, rscratch1);
3469     __ addw(rscratch3, rscratch3, rscratch4);
3470     __ rorw(rscratch2, rscratch3, 32 - s);
3471     __ addw(r1, rscratch2, r2);
3472   }
3473 
3474   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3475               int k, int s, int t) {
3476     Register rscratch3 = r10;
3477     Register rscratch4 = r11;
3478 
3479     __ movw(rscratch3, t);
3480     __ ornw(rscratch2, r2, r4);
3481     __ addw(rscratch4, r1, rscratch3);
3482     reg_cache.extract_u32(rscratch1, k);
3483     __ eorw(rscratch3, rscratch2, r3);
3484     __ addw(rscratch4, rscratch4, rscratch1);
3485     __ addw(rscratch3, rscratch3, rscratch4);
3486     __ rorw(rscratch2, rscratch3, 32 - s);
3487     __ addw(r1, rscratch2, r2);
3488   }
3489 
3490   // Arguments:
3491   //
3492   // Inputs:
3493   //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   MD5.state
3495   //   c_rarg2   - int     offset
3496   //   c_rarg3   - int     limit
3497   //
3498   address generate_md5_implCompress(bool multi_block, const char *name) {
3499     __ align(CodeEntryAlignment);
3500     StubCodeMark mark(this, "StubRoutines", name);
3501     address start = __ pc();
3502 
3503     Register buf       = c_rarg0;
3504     Register state     = c_rarg1;
3505     Register ofs       = c_rarg2;
3506     Register limit     = c_rarg3;
3507     Register a         = r4;
3508     Register b         = r5;
3509     Register c         = r6;
3510     Register d         = r7;
3511     Register rscratch3 = r10;
3512     Register rscratch4 = r11;
3513 
3514     Register state_regs[2] = { r12, r13 };
3515     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3516     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3517 
3518     __ push(saved_regs, sp);
3519 
3520     __ ldp(state_regs[0], state_regs[1], Address(state));
3521     __ ubfx(a, state_regs[0],  0, 32);
3522     __ ubfx(b, state_regs[0], 32, 32);
3523     __ ubfx(c, state_regs[1],  0, 32);
3524     __ ubfx(d, state_regs[1], 32, 32);
3525 
3526     Label md5_loop;
3527     __ BIND(md5_loop);
3528 
3529     reg_cache.gen_loads(buf);
3530 
3531     // Round 1
3532     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3533     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3534     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3535     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3536     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3537     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3538     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3539     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3540     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3541     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3542     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3543     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3544     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3545     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3546     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3547     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3548 
3549     // Round 2
3550     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3551     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3552     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3553     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3554     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3555     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3556     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3557     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3558     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3559     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3560     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3561     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3562     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3563     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3564     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3565     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3566 
3567     // Round 3
3568     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3569     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3570     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3571     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3572     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3573     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3574     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3575     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3576     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3577     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3578     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3579     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3580     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3581     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3582     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3583     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3584 
3585     // Round 4
3586     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3587     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3588     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3589     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3590     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3591     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3592     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3593     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3594     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3595     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3596     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3597     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3598     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3599     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3600     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3601     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3602 
3603     __ addw(a, state_regs[0], a);
3604     __ ubfx(rscratch2, state_regs[0], 32, 32);
3605     __ addw(b, rscratch2, b);
3606     __ addw(c, state_regs[1], c);
3607     __ ubfx(rscratch4, state_regs[1], 32, 32);
3608     __ addw(d, rscratch4, d);
3609 
3610     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3611     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3612 
3613     if (multi_block) {
3614       __ add(buf, buf, 64);
3615       __ add(ofs, ofs, 64);
3616       __ cmp(ofs, limit);
3617       __ br(Assembler::LE, md5_loop);
3618       __ mov(c_rarg0, ofs); // return ofs
3619     }
3620 
3621     // write hash values back in the correct order
3622     __ stp(state_regs[0], state_regs[1], Address(state));
3623 
3624     __ pop(saved_regs, sp);
3625 
3626     __ ret(lr);
3627 
3628     return start;
3629   }
3630 
3631   // Arguments:
3632   //
3633   // Inputs:
3634   //   c_rarg0   - byte[]  source+offset
3635   //   c_rarg1   - int[]   SHA.state
3636   //   c_rarg2   - int     offset
3637   //   c_rarg3   - int     limit
3638   //
3639   address generate_sha1_implCompress(bool multi_block, const char *name) {
3640     __ align(CodeEntryAlignment);
3641     StubCodeMark mark(this, "StubRoutines", name);
3642     address start = __ pc();
3643 
3644     Register buf   = c_rarg0;
3645     Register state = c_rarg1;
3646     Register ofs   = c_rarg2;
3647     Register limit = c_rarg3;
3648 
3649     Label keys;
3650     Label sha1_loop;
3651 
3652     // load the keys into v0..v3
3653     __ adr(rscratch1, keys);
3654     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3655     // load 5 words state into v6, v7
3656     __ ldrq(v6, Address(state, 0));
3657     __ ldrs(v7, Address(state, 16));
3658 
3659 
3660     __ BIND(sha1_loop);
3661     // load 64 bytes of data into v16..v19
3662     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3663     __ rev32(v16, __ T16B, v16);
3664     __ rev32(v17, __ T16B, v17);
3665     __ rev32(v18, __ T16B, v18);
3666     __ rev32(v19, __ T16B, v19);
3667 
3668     // do the sha1
3669     __ addv(v4, __ T4S, v16, v0);
3670     __ orr(v20, __ T16B, v6, v6);
3671 
3672     FloatRegister d0 = v16;
3673     FloatRegister d1 = v17;
3674     FloatRegister d2 = v18;
3675     FloatRegister d3 = v19;
3676 
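    // 80 rounds, four per iteration: sha1c/sha1p/sha1m implement the
    // Ch/Parity/Maj round functions, sha1su0/sha1su1 extend the message
    // schedule, and d0..d3 rotate through the schedule registers.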
3677     for (int round = 0; round < 20; round++) {
3678       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3679       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3680       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3681       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3682       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3683 
3684       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3685       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3686       __ sha1h(tmp2, __ T4S, v20);
3687       if (round < 5)
3688         __ sha1c(v20, __ T4S, tmp3, tmp4);
3689       else if (round < 10 || round >= 15)
3690         __ sha1p(v20, __ T4S, tmp3, tmp4);
3691       else
3692         __ sha1m(v20, __ T4S, tmp3, tmp4);
3693       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3694 
3695       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3696     }
3697 
3698     __ addv(v7, __ T2S, v7, v21);
3699     __ addv(v6, __ T4S, v6, v20);
3700 
3701     if (multi_block) {
3702       __ add(ofs, ofs, 64);
3703       __ cmp(ofs, limit);
3704       __ br(Assembler::LE, sha1_loop);
3705       __ mov(c_rarg0, ofs); // return ofs
3706     }
3707 
3708     __ strq(v6, Address(state, 0));
3709     __ strs(v7, Address(state, 16));
3710 
3711     __ ret(lr);
3712 
3713     __ bind(keys);
3714     __ emit_int32(0x5a827999);
3715     __ emit_int32(0x6ed9eba1);
3716     __ emit_int32(0x8f1bbcdc);
3717     __ emit_int32(0xca62c1d6);
3718 
3719     return start;
3720   }
3721 
3722 
3723   // Arguments:
3724   //
3725   // Inputs:
3726   //   c_rarg0   - byte[]  source+offset
3727   //   c_rarg1   - int[]   SHA.state
3728   //   c_rarg2   - int     offset
3729   //   c_rarg3   - int     limit
3730   //
3731   address generate_sha256_implCompress(bool multi_block, const char *name) {
3732     static const uint32_t round_consts[64] = {
3733       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3734       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3735       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3736       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3737       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3738       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3739       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3740       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3741       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3742       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3743       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3744       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3745       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3746       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3747       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3748       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3749     };
3750     __ align(CodeEntryAlignment);
3751     StubCodeMark mark(this, "StubRoutines", name);
3752     address start = __ pc();
3753 
3754     Register buf   = c_rarg0;
3755     Register state = c_rarg1;
3756     Register ofs   = c_rarg2;
3757     Register limit = c_rarg3;
3758 
    Label sha256_loop;
3760 
3761     __ stpd(v8, v9, __ pre(sp, -32));
3762     __ stpd(v10, v11, Address(sp, 16));
3763 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7
3771 
3772     // load 16 keys to v16..v31
3773     __ lea(rscratch1, ExternalAddress((address)round_consts));
3774     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3775     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3776     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3777     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3778 
3779     // load 8 words (256 bits) state
3780     __ ldpq(v0, v1, state);
3781 
    __ BIND(sha256_loop);
3783     // load 64 bytes of data into v8..v11
3784     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3785     __ rev32(v8, __ T16B, v8);
3786     __ rev32(v9, __ T16B, v9);
3787     __ rev32(v10, __ T16B, v10);
3788     __ rev32(v11, __ T16B, v11);
3789 
3790     __ addv(v6, __ T4S, v8, v16);
3791     __ orr(v2, __ T16B, v0, v0);
3792     __ orr(v3, __ T16B, v1, v1);
3793 
3794     FloatRegister d0 = v8;
3795     FloatRegister d1 = v9;
3796     FloatRegister d2 = v10;
3797     FloatRegister d3 = v11;
3798 
3799 
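    // 64 rounds, four per iteration: sha256h/sha256h2 update the two state
    // halves, sha256su0/sha256su1 extend the message schedule, and the round
    // constants were preloaded into v16..v31 above.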
3800     for (int round = 0; round < 16; round++) {
3801       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3802       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3803       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3804       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3805 
3806       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3807        __ orr(v4, __ T16B, v2, v2);
3808       if (round < 15)
3809         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3810       __ sha256h(v2, __ T4S, v3, tmp2);
3811       __ sha256h2(v3, __ T4S, v4, tmp2);
3812       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3813 
3814       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3815     }
3816 
3817     __ addv(v0, __ T4S, v0, v2);
3818     __ addv(v1, __ T4S, v1, v3);
3819 
3820     if (multi_block) {
3821       __ add(ofs, ofs, 64);
3822       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
3824       __ mov(c_rarg0, ofs); // return ofs
3825     }
3826 
3827     __ ldpd(v10, v11, Address(sp, 16));
3828     __ ldpd(v8, v9, __ post(sp, 32));
3829 
3830     __ stpq(v0, v1, state);
3831 
3832     __ ret(lr);
3833 
3834     return start;
3835   }
3836 
3837   // Double rounds for sha512.
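  // Each call performs two of the 80 SHA-512 rounds: sha512h/sha512h2 update
  // the hash state, sha512su0/sha512su1 extend the message schedule while
  // dr < 32, and the next pair of round constants is loaded (into vrc1) while
  // dr < 36.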
3838   void sha512_dround(int dr,
3839                      FloatRegister vi0, FloatRegister vi1,
3840                      FloatRegister vi2, FloatRegister vi3,
3841                      FloatRegister vi4, FloatRegister vrc0,
3842                      FloatRegister vrc1, FloatRegister vin0,
3843                      FloatRegister vin1, FloatRegister vin2,
3844                      FloatRegister vin3, FloatRegister vin4) {
3845       if (dr < 36) {
3846         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3847       }
3848       __ addv(v5, __ T2D, vrc0, vin0);
3849       __ ext(v6, __ T16B, vi2, vi3, 8);
3850       __ ext(v5, __ T16B, v5, v5, 8);
3851       __ ext(v7, __ T16B, vi1, vi2, 8);
3852       __ addv(vi3, __ T2D, vi3, v5);
3853       if (dr < 32) {
3854         __ ext(v5, __ T16B, vin3, vin4, 8);
3855         __ sha512su0(vin0, __ T2D, vin1);
3856       }
3857       __ sha512h(vi3, __ T2D, v6, v7);
3858       if (dr < 32) {
3859         __ sha512su1(vin0, __ T2D, vin2, v5);
3860       }
3861       __ addv(vi4, __ T2D, vi1, vi3);
3862       __ sha512h2(vi3, __ T2D, vi1, vi0);
3863   }
3864 
3865   // Arguments:
3866   //
3867   // Inputs:
3868   //   c_rarg0   - byte[]  source+offset
3869   //   c_rarg1   - int[]   SHA.state
3870   //   c_rarg2   - int     offset
3871   //   c_rarg3   - int     limit
3872   //
3873   address generate_sha512_implCompress(bool multi_block, const char *name) {
3874     static const uint64_t round_consts[80] = {
3875       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3876       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3877       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3878       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3879       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3880       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3881       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3882       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3883       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3884       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3885       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3886       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3887       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3888       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3889       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3890       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3891       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3892       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3893       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3894       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3895       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3896       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3897       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3898       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3899       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3900       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3901       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3902     };
3903 
3904     __ align(CodeEntryAlignment);
3905     StubCodeMark mark(this, "StubRoutines", name);
3906     address start = __ pc();
3907 
3908     Register buf   = c_rarg0;
3909     Register state = c_rarg1;
3910     Register ofs   = c_rarg2;
3911     Register limit = c_rarg3;
3912 
3913     __ stpd(v8, v9, __ pre(sp, -64));
3914     __ stpd(v10, v11, Address(sp, 16));
3915     __ stpd(v12, v13, Address(sp, 32));
3916     __ stpd(v14, v15, Address(sp, 48));
3917 
3918     Label sha512_loop;
3919 
3920     // load state
3921     __ ld1(v8, v9, v10, v11, __ T2D, state);
3922 
3923     // load first 4 round constants
3924     __ lea(rscratch1, ExternalAddress((address)round_consts));
3925     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3926 
3927     __ BIND(sha512_loop);
3928     // load 128B of data into v12..v19
3929     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3930     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3931     __ rev64(v12, __ T16B, v12);
3932     __ rev64(v13, __ T16B, v13);
3933     __ rev64(v14, __ T16B, v14);
3934     __ rev64(v15, __ T16B, v15);
3935     __ rev64(v16, __ T16B, v16);
3936     __ rev64(v17, __ T16B, v17);
3937     __ rev64(v18, __ T16B, v18);
3938     __ rev64(v19, __ T16B, v19);
3939 
3940     __ mov(rscratch2, rscratch1);
3941 
3942     __ mov(v0, __ T16B, v8);
3943     __ mov(v1, __ T16B, v9);
3944     __ mov(v2, __ T16B, v10);
3945     __ mov(v3, __ T16B, v11);
3946 
3947     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3948     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3949     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3950     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3951     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3952     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3953     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3954     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3955     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3956     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3957     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3958     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3959     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3960     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3961     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3962     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3963     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3964     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3965     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3966     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3967     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3968     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3969     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3970     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3971     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3972     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3973     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3974     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3975     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3976     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3977     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3978     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3979     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3980     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3981     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3982     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3983     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3984     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3985     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3986     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3987 
3988     __ addv(v8, __ T2D, v8, v0);
3989     __ addv(v9, __ T2D, v9, v1);
3990     __ addv(v10, __ T2D, v10, v2);
3991     __ addv(v11, __ T2D, v11, v3);
3992 
3993     if (multi_block) {
3994       __ add(ofs, ofs, 128);
3995       __ cmp(ofs, limit);
3996       __ br(Assembler::LE, sha512_loop);
3997       __ mov(c_rarg0, ofs); // return ofs
3998     }
3999 
4000     __ st1(v8, v9, v10, v11, __ T2D, state);
4001 
4002     __ ldpd(v14, v15, Address(sp, 48));
4003     __ ldpd(v12, v13, Address(sp, 32));
4004     __ ldpd(v10, v11, Address(sp, 16));
4005     __ ldpd(v8, v9, __ post(sp, 64));
4006 
4007     __ ret(lr);
4008 
4009     return start;
4010   }
4011 
4012   // Arguments:
4013   //
4014   // Inputs:
4015   //   c_rarg0   - byte[]  source+offset
4016   //   c_rarg1   - byte[]  SHA.state
4017   //   c_rarg2   - int     block_size
4018   //   c_rarg3   - int     offset
4019   //   c_rarg4   - int     limit
4020   //
4021   address generate_sha3_implCompress(bool multi_block, const char *name) {
4022     static const uint64_t round_consts[24] = {
4023       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4024       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4025       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4026       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4027       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4028       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4029       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4030       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4031     };
4032 
4033     __ align(CodeEntryAlignment);
4034     StubCodeMark mark(this, "StubRoutines", name);
4035     address start = __ pc();
4036 
4037     Register buf           = c_rarg0;
4038     Register state         = c_rarg1;
4039     Register block_size    = c_rarg2;
4040     Register ofs           = c_rarg3;
4041     Register limit         = c_rarg4;
4042 
4043     Label sha3_loop, rounds24_loop;
4044     Label sha3_512_or_sha3_384, shake128;
4045 
4046     __ stpd(v8, v9, __ pre(sp, -64));
4047     __ stpd(v10, v11, Address(sp, 16));
4048     __ stpd(v12, v13, Address(sp, 32));
4049     __ stpd(v14, v15, Address(sp, 48));
4050 
4051     // load state
4052     __ add(rscratch1, state, 32);
4053     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
4054     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
4055     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4056     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4057     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4058     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4059     __ ld1(v24, __ T1D, rscratch1);
4060 
4061     __ BIND(sha3_loop);
4062 
4063     // 24 keccak rounds
4064     __ movw(rscratch2, 24);
4065 
4066     // load round_constants base
4067     __ lea(rscratch1, ExternalAddress((address) round_consts));
4068 
4069     // load input
4070     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4071     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4072     __ eor(v0, __ T8B, v0, v25);
4073     __ eor(v1, __ T8B, v1, v26);
4074     __ eor(v2, __ T8B, v2, v27);
4075     __ eor(v3, __ T8B, v3, v28);
4076     __ eor(v4, __ T8B, v4, v29);
4077     __ eor(v5, __ T8B, v5, v30);
4078     __ eor(v6, __ T8B, v6, v31);
4079 
4080     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4081     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4082 
4083     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4084     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4085     __ eor(v7, __ T8B, v7, v25);
4086     __ eor(v8, __ T8B, v8, v26);
4087     __ eor(v9, __ T8B, v9, v27);
4088     __ eor(v10, __ T8B, v10, v28);
4089     __ eor(v11, __ T8B, v11, v29);
4090     __ eor(v12, __ T8B, v12, v30);
4091     __ eor(v13, __ T8B, v13, v31);
4092 
4093     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4094     __ eor(v14, __ T8B, v14, v25);
4095     __ eor(v15, __ T8B, v15, v26);
4096     __ eor(v16, __ T8B, v16, v27);
4097 
4098     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4099     __ andw(c_rarg5, block_size, 48);
4100     __ cbzw(c_rarg5, rounds24_loop);
4101 
4102     __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
4104     __ ldrd(v28, __ post(buf, 8));
4105     __ eor(v17, __ T8B, v17, v28);
4106     __ b(rounds24_loop);
4107 
4108     __ BIND(shake128);
4109     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4110     __ eor(v17, __ T8B, v17, v28);
4111     __ eor(v18, __ T8B, v18, v29);
4112     __ eor(v19, __ T8B, v19, v30);
4113     __ eor(v20, __ T8B, v20, v31);
4114     __ b(rounds24_loop); // block_size == 168, SHAKE128
4115 
4116     __ BIND(sha3_512_or_sha3_384);
4117     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4118     __ eor(v7, __ T8B, v7, v25);
4119     __ eor(v8, __ T8B, v8, v26);
4120     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4121 
4122     // SHA3-384
4123     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4124     __ eor(v9,  __ T8B, v9,  v27);
4125     __ eor(v10, __ T8B, v10, v28);
4126     __ eor(v11, __ T8B, v11, v29);
4127     __ eor(v12, __ T8B, v12, v30);
4128 
4129     __ BIND(rounds24_loop);
4130     __ subw(rscratch2, rscratch2, 1);
4131 
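    // One round of the Keccak-f[1600] permutation:
    //   theta:  eor3 forms the five column parities, rax1 the D values
    //   rho/pi: xar applies the lane rotations and permutation
    //   chi:    bcax computes the non-linear step
    //   iota:   the broadcast-loaded round constant is xor-ed into lane (0, 0)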
4132     __ eor3(v29, __ T16B, v4, v9, v14);
4133     __ eor3(v26, __ T16B, v1, v6, v11);
4134     __ eor3(v28, __ T16B, v3, v8, v13);
4135     __ eor3(v25, __ T16B, v0, v5, v10);
4136     __ eor3(v27, __ T16B, v2, v7, v12);
4137     __ eor3(v29, __ T16B, v29, v19, v24);
4138     __ eor3(v26, __ T16B, v26, v16, v21);
4139     __ eor3(v28, __ T16B, v28, v18, v23);
4140     __ eor3(v25, __ T16B, v25, v15, v20);
4141     __ eor3(v27, __ T16B, v27, v17, v22);
4142 
4143     __ rax1(v30, __ T2D, v29, v26);
4144     __ rax1(v26, __ T2D, v26, v28);
4145     __ rax1(v28, __ T2D, v28, v25);
4146     __ rax1(v25, __ T2D, v25, v27);
4147     __ rax1(v27, __ T2D, v27, v29);
4148 
4149     __ eor(v0, __ T16B, v0, v30);
4150     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4151     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4152     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4153     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4154     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4155     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4156     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4157     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4158     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4159     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4160     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4161     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4162     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4163     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4164     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4165     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4166     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4167     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4168     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4169     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4170     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4171     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4172     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4173     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4174 
4175     __ bcax(v20, __ T16B, v31, v22, v8);
4176     __ bcax(v21, __ T16B, v8,  v23, v22);
4177     __ bcax(v22, __ T16B, v22, v24, v23);
4178     __ bcax(v23, __ T16B, v23, v31, v24);
4179     __ bcax(v24, __ T16B, v24, v8,  v31);
4180 
4181     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4182 
4183     __ bcax(v17, __ T16B, v25, v19, v3);
4184     __ bcax(v18, __ T16B, v3,  v15, v19);
4185     __ bcax(v19, __ T16B, v19, v16, v15);
4186     __ bcax(v15, __ T16B, v15, v25, v16);
4187     __ bcax(v16, __ T16B, v16, v3,  v25);
4188 
4189     __ bcax(v10, __ T16B, v29, v12, v26);
4190     __ bcax(v11, __ T16B, v26, v13, v12);
4191     __ bcax(v12, __ T16B, v12, v14, v13);
4192     __ bcax(v13, __ T16B, v13, v29, v14);
4193     __ bcax(v14, __ T16B, v14, v26, v29);
4194 
4195     __ bcax(v7, __ T16B, v30, v9,  v4);
4196     __ bcax(v8, __ T16B, v4,  v5,  v9);
4197     __ bcax(v9, __ T16B, v9,  v6,  v5);
4198     __ bcax(v5, __ T16B, v5,  v30, v6);
4199     __ bcax(v6, __ T16B, v6,  v4,  v30);
4200 
4201     __ bcax(v3, __ T16B, v27, v0,  v28);
4202     __ bcax(v4, __ T16B, v28, v1,  v0);
4203     __ bcax(v0, __ T16B, v0,  v2,  v1);
4204     __ bcax(v1, __ T16B, v1,  v27, v2);
4205     __ bcax(v2, __ T16B, v2,  v28, v27);
4206 
4207     __ eor(v0, __ T16B, v0, v31);
4208 
4209     __ cbnzw(rscratch2, rounds24_loop);
4210 
4211     if (multi_block) {
4212       __ add(ofs, ofs, block_size);
4213       __ cmp(ofs, limit);
4214       __ br(Assembler::LE, sha3_loop);
4215       __ mov(c_rarg0, ofs); // return ofs
4216     }
4217 
4218     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4219     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4220     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4221     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4222     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4223     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4224     __ st1(v24, __ T1D, state);
4225 
4226     __ ldpd(v14, v15, Address(sp, 48));
4227     __ ldpd(v12, v13, Address(sp, 32));
4228     __ ldpd(v10, v11, Address(sp, 16));
4229     __ ldpd(v8, v9, __ post(sp, 64));
4230 
4231     __ ret(lr);
4232 
4233     return start;
4234   }
4235 
4236   /**
4237    *  Arguments:
4238    *
4239    * Inputs:
4240    *   c_rarg0   - int crc
4241    *   c_rarg1   - byte* buf
4242    *   c_rarg2   - int length
4243    *
4244    * Output:
4245    *       r0   - int crc result
4246    */
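       //
       // A hedged bitwise reference of the CRC-32 computation (zlib polynomial,
       // reflected form 0xEDB88320) that kernel_crc32 implements with tables and/or
       // crc32 instructions; illustrative only, not part of this stub:
       //   uint32_t crc32_ref(uint32_t crc, const uint8_t* buf, int len) {
       //     crc = ~crc;
       //     for (int i = 0; i < len; i++) {
       //       crc ^= buf[i];
       //       for (int k = 0; k < 8; k++)
       //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
       //     }
       //     return ~crc;
       //   }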
4247   address generate_updateBytesCRC32() {
4248     assert(UseCRC32Intrinsics, "what are we doing here?");
4249 
4250     __ align(CodeEntryAlignment);
4251     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4252 
4253     address start = __ pc();
4254 
4255     const Register crc   = c_rarg0;  // crc
4256     const Register buf   = c_rarg1;  // source java byte array address
4257     const Register len   = c_rarg2;  // length
4258     const Register table0 = c_rarg3; // crc_table address
4259     const Register table1 = c_rarg4;
4260     const Register table2 = c_rarg5;
4261     const Register table3 = c_rarg6;
4262     const Register tmp3 = c_rarg7;
4263 
4264     BLOCK_COMMENT("Entry:");
4265     __ enter(); // required for proper stackwalking of RuntimeStub frame
4266 
4267     __ kernel_crc32(crc, buf, len,
4268               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4269 
4270     __ leave(); // required for proper stackwalking of RuntimeStub frame
4271     __ ret(lr);
4272 
4273     return start;
4274   }
4275 
4276   // ChaCha20 block function.  This version parallelizes by loading
4277   // individual 32-bit state elements into vectors for four blocks
4278   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4279   //
4280   // state (int[16]) = c_rarg0
4281   // keystream (byte[1024]) = c_rarg1
4282   // return - number of bytes of keystream (always 256)
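       //
       // For reference, the scalar quarter round that each cc20_quarter_round call
       // below applies to four blocks at once (a minimal sketch with illustrative
       // names, not part of this stub):
       //   static inline uint32_t rotl32(uint32_t v, int n) {
       //     return (v << n) | (v >> (32 - n));
       //   }
       //   static void quarter_round(uint32_t st[16], int a, int b, int c, int d) {
       //     st[a] += st[b]; st[d] = rotl32(st[d] ^ st[a], 16);
       //     st[c] += st[d]; st[b] = rotl32(st[b] ^ st[c], 12);
       //     st[a] += st[b]; st[d] = rotl32(st[d] ^ st[a],  8);
       //     st[c] += st[d]; st[b] = rotl32(st[b] ^ st[c],  7);
       //   }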
4283   address generate_chacha20Block_blockpar() {
4284     Label L_twoRounds, L_cc20_const;
4285     // The constant data is broken into two 128-bit segments to be loaded
4286     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4287     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4288     // The second 128 bits are a table constant used for 8-bit left rotations.
4289     __ BIND(L_cc20_const);
4290     __ emit_int64(0x0000000100000000UL);
4291     __ emit_int64(0x0000000300000002UL);
4292     __ emit_int64(0x0605040702010003UL);
4293     __ emit_int64(0x0E0D0C0F0A09080BUL);
4294 
4295     __ align(CodeEntryAlignment);
4296     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4297     address start = __ pc();
4298     __ enter();
4299 
4300     int i, j;
4301     const Register state = c_rarg0;
4302     const Register keystream = c_rarg1;
4303     const Register loopCtr = r10;
4304     const Register tmpAddr = r11;
4305 
4306     const FloatRegister stateFirst = v0;
4307     const FloatRegister stateSecond = v1;
4308     const FloatRegister stateThird = v2;
4309     const FloatRegister stateFourth = v3;
4310     const FloatRegister origCtrState = v28;
4311     const FloatRegister scratch = v29;
4312     const FloatRegister lrot8Tbl = v30;
4313 
4314     // Organize SIMD registers in an array that facilitates
4315     // putting repetitive opcodes into loop structures.  It is
4316     // important that each grouping of 4 registers is monotonically
4317     // increasing to support the requirements of multi-register
4318     // instructions (e.g. ld4r, st4, etc.)
4319     const FloatRegister workSt[16] = {
4320          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4321         v20, v21, v22, v23, v24, v25, v26, v27
4322     };
4323 
4324     // Load from memory and interlace across 16 SIMD registers,
4325     // with each word from memory being broadcast to all lanes of
4326     // each successive SIMD register.
4327     //      Addr(0) -> All lanes in workSt[i]
4328     //      Addr(4) -> All lanes workSt[i + 1], etc.
4329     __ mov(tmpAddr, state);
4330     for (i = 0; i < 16; i += 4) {
4331       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4332           __ post(tmpAddr, 16));
4333     }
4334 
4335     // Pull in constant data.  The first 16 bytes are the add overlay
4336     // which is applied to the vector holding the counter (state[12]).
4337     // The second 16 bytes form the index table for the 8-bit left
4338     // rotation tbl instruction.
4339     __ adr(tmpAddr, L_cc20_const);
4340     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4341     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4342 
4343     // Set up the 10 iteration loop and perform all 8 quarter round ops
4344     __ mov(loopCtr, 10);
4345     __ BIND(L_twoRounds);
4346 
4347     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4348         scratch, lrot8Tbl);
4349     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4350         scratch, lrot8Tbl);
4351     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4352         scratch, lrot8Tbl);
4353     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4354         scratch, lrot8Tbl);
4355 
4356     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4357         scratch, lrot8Tbl);
4358     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4359         scratch, lrot8Tbl);
4360     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4361         scratch, lrot8Tbl);
4362     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4363         scratch, lrot8Tbl);
4364 
4365     // Decrement and iterate
4366     __ sub(loopCtr, loopCtr, 1);
4367     __ cbnz(loopCtr, L_twoRounds);
4368 
4369     __ mov(tmpAddr, state);
4370 
4371     // Add the starting state back to the post-loop keystream
4372     // state.  We read/interlace the state array from memory into
4373     // 4 registers similar to what we did in the beginning.  Then
4374     // add the counter overlay onto workSt[12] at the end.
4375     for (i = 0; i < 16; i += 4) {
4376       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4377           __ post(tmpAddr, 16));
4378       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4379       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4380       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4381       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4382     }
4383     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4384 
4385     // Write to key stream, storing the same element out of workSt[0..15]
4386     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4387     // for the next element position.
4388     for (i = 0; i < 4; i++) {
4389       for (j = 0; j < 16; j += 4) {
4390         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4391             __ post(keystream, 16));
4392       }
4393     }
4394 
4395     __ mov(r0, 256);             // Return length of output keystream
4396     __ leave();
4397     __ ret(lr);
4398 
4399     return start;
4400   }
4401 
4402   /**
4403    *  Arguments:
4404    *
4405    * Inputs:
4406    *   c_rarg0   - int crc
4407    *   c_rarg1   - byte* buf
4408    *   c_rarg2   - int length
4409    *   c_rarg3   - int* table
4410    *
4411    * Output:
4412    *       r0   - int crc result
4413    */
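       //
       // For reference: CRC-32C uses the Castagnoli polynomial (reflected form
       // 0x82F63B78); otherwise the hedged bitwise sketch above applies with that
       // constant swapped in.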
4414   address generate_updateBytesCRC32C() {
4415     assert(UseCRC32CIntrinsics, "what are we doing here?");
4416 
4417     __ align(CodeEntryAlignment);
4418     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4419 
4420     address start = __ pc();
4421 
4422     const Register crc   = c_rarg0;  // crc
4423     const Register buf   = c_rarg1;  // source java byte array address
4424     const Register len   = c_rarg2;  // length
4425     const Register table0 = c_rarg3; // crc_table address
4426     const Register table1 = c_rarg4;
4427     const Register table2 = c_rarg5;
4428     const Register table3 = c_rarg6;
4429     const Register tmp3 = c_rarg7;
4430 
4431     BLOCK_COMMENT("Entry:");
4432     __ enter(); // required for proper stackwalking of RuntimeStub frame
4433 
4434     __ kernel_crc32c(crc, buf, len,
4435               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4436 
4437     __ leave(); // required for proper stackwalking of RuntimeStub frame
4438     __ ret(lr);
4439 
4440     return start;
4441   }
4442 
4443   /***
4444    *  Arguments:
4445    *
4446    *  Inputs:
4447    *   c_rarg0   - int   adler
4448    *   c_rarg1   - byte* buff
4449    *   c_rarg2   - int   len
4450    *
4451    * Output:
4452    *   c_rarg0   - int adler result
4453    */
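       //
       // A hedged scalar reference of the Adler-32 update this stub vectorizes
       // (illustrative only):
       //   uint32_t adler32(uint32_t adler, const uint8_t* buf, int len) {
       //     uint32_t s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
       //     for (int i = 0; i < len; i++) {
       //       s1 = (s1 + buf[i]) % 65521;
       //       s2 = (s2 + s1) % 65521;
       //     }
       //     return (s2 << 16) | s1;
       //   }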
4454   address generate_updateBytesAdler32() {
4455     __ align(CodeEntryAlignment);
4456     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4457     address start = __ pc();
4458 
4459     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4460 
4461     // Aliases
4462     Register adler  = c_rarg0;
4463     Register s1     = c_rarg0;
4464     Register s2     = c_rarg3;
4465     Register buff   = c_rarg1;
4466     Register len    = c_rarg2;
4467     Register nmax  = r4;
4468     Register base  = r5;
4469     Register count = r6;
4470     Register temp0 = rscratch1;
4471     Register temp1 = rscratch2;
4472     FloatRegister vbytes = v0;
4473     FloatRegister vs1acc = v1;
4474     FloatRegister vs2acc = v2;
4475     FloatRegister vtable = v3;
4476 
4477     // Max number of bytes we can process before having to take the mod
4478     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4479     uint64_t BASE = 0xfff1;
4480     uint64_t NMAX = 0x15B0;
4481 
4482     __ mov(base, BASE);
4483     __ mov(nmax, NMAX);
4484 
4485     // Load accumulation coefficients for the upper 16 bits
4486     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4487     __ ld1(vtable, __ T16B, Address(temp0));
4488 
4489     // s1 is initialized to the lower 16 bits of adler
4490     // s2 is initialized to the upper 16 bits of adler
4491     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4492     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4493 
4494     // The pipelined loop needs at least 16 elements for one iteration.
4495     // It checks this itself, but it is more efficient to skip straight to the cleanup loop for short inputs.
4496     __ cmp(len, (u1)16);
4497     __ br(Assembler::HS, L_nmax);
4498     __ cbz(len, L_combine);
4499 
4500     __ bind(L_simple_by1_loop);
4501     __ ldrb(temp0, Address(__ post(buff, 1)));
4502     __ add(s1, s1, temp0);
4503     __ add(s2, s2, s1);
4504     __ subs(len, len, 1);
4505     __ br(Assembler::HI, L_simple_by1_loop);
4506 
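         // The modular reductions below rely on 2^16 mod BASE == 15 (BASE == 0xfff1),
         // so x mod BASE can be folded as x = (x >> 16) * 15 + (x & 0xffff), repeated
         // if necessary, followed by a conditional subtract of BASE. One folding step,
         // for illustration only:
         //   x = (x >> 16) * 15 + (x & 0xffff);   // lsr/lsl/sub/add below
         //   if (x >= 0xfff1) x -= 0xfff1;        // subs/csel below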
4507     // s1 = s1 % BASE
4508     __ subs(temp0, s1, base);
4509     __ csel(s1, temp0, s1, Assembler::HS);
4510 
4511     // s2 = s2 % BASE
4512     __ lsr(temp0, s2, 16);
4513     __ lsl(temp1, temp0, 4);
4514     __ sub(temp1, temp1, temp0);
4515     __ add(s2, temp1, s2, ext::uxth);
4516 
4517     __ subs(temp0, s2, base);
4518     __ csel(s2, temp0, s2, Assembler::HS);
4519 
4520     __ b(L_combine);
4521 
4522     __ bind(L_nmax);
4523     __ subs(len, len, nmax);
4524     __ sub(count, nmax, 16);
4525     __ br(Assembler::LO, L_by16);
4526 
4527     __ bind(L_nmax_loop);
4528 
4529     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4530                                       vbytes, vs1acc, vs2acc, vtable);
4531 
4532     __ subs(count, count, 16);
4533     __ br(Assembler::HS, L_nmax_loop);
4534 
4535     // s1 = s1 % BASE
4536     __ lsr(temp0, s1, 16);
4537     __ lsl(temp1, temp0, 4);
4538     __ sub(temp1, temp1, temp0);
4539     __ add(temp1, temp1, s1, ext::uxth);
4540 
4541     __ lsr(temp0, temp1, 16);
4542     __ lsl(s1, temp0, 4);
4543     __ sub(s1, s1, temp0);
4544     __ add(s1, s1, temp1, ext::uxth);
4545 
4546     __ subs(temp0, s1, base);
4547     __ csel(s1, temp0, s1, Assembler::HS);
4548 
4549     // s2 = s2 % BASE
4550     __ lsr(temp0, s2, 16);
4551     __ lsl(temp1, temp0, 4);
4552     __ sub(temp1, temp1, temp0);
4553     __ add(temp1, temp1, s2, ext::uxth);
4554 
4555     __ lsr(temp0, temp1, 16);
4556     __ lsl(s2, temp0, 4);
4557     __ sub(s2, s2, temp0);
4558     __ add(s2, s2, temp1, ext::uxth);
4559 
4560     __ subs(temp0, s2, base);
4561     __ csel(s2, temp0, s2, Assembler::HS);
4562 
4563     __ subs(len, len, nmax);
4564     __ sub(count, nmax, 16);
4565     __ br(Assembler::HS, L_nmax_loop);
4566 
4567     __ bind(L_by16);
4568     __ adds(len, len, count);
4569     __ br(Assembler::LO, L_by1);
4570 
4571     __ bind(L_by16_loop);
4572 
4573     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4574                                       vbytes, vs1acc, vs2acc, vtable);
4575 
4576     __ subs(len, len, 16);
4577     __ br(Assembler::HS, L_by16_loop);
4578 
4579     __ bind(L_by1);
4580     __ adds(len, len, 15);
4581     __ br(Assembler::LO, L_do_mod);
4582 
4583     __ bind(L_by1_loop);
4584     __ ldrb(temp0, Address(__ post(buff, 1)));
4585     __ add(s1, temp0, s1);
4586     __ add(s2, s2, s1);
4587     __ subs(len, len, 1);
4588     __ br(Assembler::HS, L_by1_loop);
4589 
4590     __ bind(L_do_mod);
4591     // s1 = s1 % BASE
4592     __ lsr(temp0, s1, 16);
4593     __ lsl(temp1, temp0, 4);
4594     __ sub(temp1, temp1, temp0);
4595     __ add(temp1, temp1, s1, ext::uxth);
4596 
4597     __ lsr(temp0, temp1, 16);
4598     __ lsl(s1, temp0, 4);
4599     __ sub(s1, s1, temp0);
4600     __ add(s1, s1, temp1, ext::uxth);
4601 
4602     __ subs(temp0, s1, base);
4603     __ csel(s1, temp0, s1, Assembler::HS);
4604 
4605     // s2 = s2 % BASE
4606     __ lsr(temp0, s2, 16);
4607     __ lsl(temp1, temp0, 4);
4608     __ sub(temp1, temp1, temp0);
4609     __ add(temp1, temp1, s2, ext::uxth);
4610 
4611     __ lsr(temp0, temp1, 16);
4612     __ lsl(s2, temp0, 4);
4613     __ sub(s2, s2, temp0);
4614     __ add(s2, s2, temp1, ext::uxth);
4615 
4616     __ subs(temp0, s2, base);
4617     __ csel(s2, temp0, s2, Assembler::HS);
4618 
4619     // Combine lower bits and higher bits
4620     __ bind(L_combine);
4621     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4622 
4623     __ ret(lr);
4624 
4625     return start;
4626   }
4627 
4628   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4629           Register temp0, Register temp1, FloatRegister vbytes,
4630           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4631     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4632     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4633     // In non-vectorized code, we update s1 and s2 as:
4634     //   s1 <- s1 + b1
4635     //   s2 <- s2 + s1
4636     //   s1 <- s1 + b2
4637     //   s2 <- s2 + s1
4638     //   ...
4639     //   s1 <- s1 + b16
4640     //   s2 <- s2 + s1
4641     // Putting above assignments together, we have:
4642     //   s1_new = s1 + b1 + b2 + ... + b16
4643     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4644     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4645     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
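         // A hedged scalar equivalent of one 16-byte step, matching the derivation
         // above (b[0..15] are the 16 loaded bytes; illustrative only):
         //   uint32_t sum = 0, dot = 0;
         //   for (int i = 0; i < 16; i++) {
         //     sum += b[i];               // accumulated by uaddlv below
         //     dot += (16 - i) * b[i];    // accumulated by umullv/umlalv below
         //   }
         //   s2 += 16 * s1 + dot;
         //   s1 += sum;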
4646     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4647 
4648     // s2 = s2 + s1 * 16
4649     __ add(s2, s2, s1, Assembler::LSL, 4);
4650 
4651     // vs1acc = b1 + b2 + b3 + ... + b16
4652     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4653     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4654     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4655     __ uaddlv(vs1acc, __ T16B, vbytes);
4656     __ uaddlv(vs2acc, __ T8H, vs2acc);
4657 
4658     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4659     __ fmovd(temp0, vs1acc);
4660     __ fmovd(temp1, vs2acc);
4661     __ add(s1, s1, temp0);
4662     __ add(s2, s2, temp1);
4663   }
4664 
4665   /**
4666    *  Arguments:
4667    *
4668    *  Input:
4669    *    c_rarg0   - x address
4670    *    c_rarg1   - x length
4671    *    c_rarg2   - y address
4672    *    c_rarg3   - y length
4673    *    c_rarg4   - z address
4674    */
4675   address generate_multiplyToLen() {
4676     __ align(CodeEntryAlignment);
4677     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4678 
4679     address start = __ pc();
4680
4681     if (SCCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) {
4682       return start;
4683     }
4684     const Register x     = r0;
4685     const Register xlen  = r1;
4686     const Register y     = r2;
4687     const Register ylen  = r3;
4688     const Register z     = r4;
4689 
4690     const Register tmp0  = r5;
4691     const Register tmp1  = r10;
4692     const Register tmp2  = r11;
4693     const Register tmp3  = r12;
4694     const Register tmp4  = r13;
4695     const Register tmp5  = r14;
4696     const Register tmp6  = r15;
4697     const Register tmp7  = r16;
4698 
4699     BLOCK_COMMENT("Entry:");
4700     __ enter(); // required for proper stackwalking of RuntimeStub frame
4701     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4702     __ leave(); // required for proper stackwalking of RuntimeStub frame
4703     __ ret(lr);
4704 
4705     SCCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start);
4706     return start;
4707   }
4708 
4709   address generate_squareToLen() {
4710     // The squareToLen algorithm for sizes 1..127, as described in the Java code,
4711     // is faster than multiply_to_len on some CPUs and slower on others, but
4712     // multiply_to_len shows slightly better results overall.
4713     __ align(CodeEntryAlignment);
4714     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4715     address start = __ pc();
4716 
4717     if (SCCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) {
4718       return start;
4719     }
4720     const Register x     = r0;
4721     const Register xlen  = r1;
4722     const Register z     = r2;
4723     const Register y     = r4; // == x
4724     const Register ylen  = r5; // == xlen
4725 
4726     const Register tmp0  = r3;
4727     const Register tmp1  = r10;
4728     const Register tmp2  = r11;
4729     const Register tmp3  = r12;
4730     const Register tmp4  = r13;
4731     const Register tmp5  = r14;
4732     const Register tmp6  = r15;
4733     const Register tmp7  = r16;
4734 
4735     RegSet spilled_regs = RegSet::of(y, ylen);
4736     BLOCK_COMMENT("Entry:");
4737     __ enter();
4738     __ push(spilled_regs, sp);
4739     __ mov(y, x);
4740     __ mov(ylen, xlen);
4741     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4742     __ pop(spilled_regs, sp);
4743     __ leave();
4744     __ ret(lr);
4745 
4746     SCCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start);
4747     return start;
4748   }
4749 
4750   address generate_mulAdd() {
4751     __ align(CodeEntryAlignment);
4752     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4753 
4754     address start = __ pc();
4755 
4756     if (SCCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) {
4757       return start;
4758     }
4759     const Register out     = r0;
4760     const Register in      = r1;
4761     const Register offset  = r2;
4762     const Register len     = r3;
4763     const Register k       = r4;
4764 
4765     BLOCK_COMMENT("Entry:");
4766     __ enter();
4767     __ mul_add(out, in, offset, len, k);
4768     __ leave();
4769     __ ret(lr);
4770 
4771     SCCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start);
4772     return start;
4773   }
4774 
4775   // Arguments:
4776   //
4777   // Input:
4778   //   c_rarg0   - newArr address
4779   //   c_rarg1   - oldArr address
4780   //   c_rarg2   - newIdx
4781   //   c_rarg3   - shiftCount
4782   //   c_rarg4   - numIter
4783   //
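       // A hedged scalar sketch of the transformation (illustrative only; the SIMD
       // loop below processes four words per iteration, shifts are unsigned):
       //   for (int i = 0; i < numIter; i++)
       //     newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)
       //                        | (oldArr[i] << (32 - shiftCount));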
4784   address generate_bigIntegerRightShift() {
4785     __ align(CodeEntryAlignment);
4786     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4787     address start = __ pc();
4788 
4789     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4790 
4791     Register newArr        = c_rarg0;
4792     Register oldArr        = c_rarg1;
4793     Register newIdx        = c_rarg2;
4794     Register shiftCount    = c_rarg3;
4795     Register numIter       = c_rarg4;
4796     Register idx           = numIter;
4797 
4798     Register newArrCur     = rscratch1;
4799     Register shiftRevCount = rscratch2;
4800     Register oldArrCur     = r13;
4801     Register oldArrNext    = r14;
4802 
4803     FloatRegister oldElem0        = v0;
4804     FloatRegister oldElem1        = v1;
4805     FloatRegister newElem         = v2;
4806     FloatRegister shiftVCount     = v3;
4807     FloatRegister shiftVRevCount  = v4;
4808 
4809     __ cbz(idx, Exit);
4810 
4811     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4812 
4813     // left shift count
4814     __ movw(shiftRevCount, 32);
4815     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4816 
4817     // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
4818     __ cmp(numIter, (u1)4);
4819     __ br(Assembler::LT, ShiftThree);
4820 
4821     __ dup(shiftVCount,    __ T4S, shiftCount);
4822     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4823     __ negr(shiftVCount,   __ T4S, shiftVCount);
4824 
4825     __ BIND(ShiftSIMDLoop);
4826 
4827     // Calculate the load addresses
4828     __ sub(idx, idx, 4);
4829     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4830     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4831     __ add(oldArrCur,  oldArrNext, 4);
4832 
4833     // Load 4 words and process
4834     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4835     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4836     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4837     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4838     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4839     __ st1(newElem,   __ T4S,  Address(newArrCur));
4840 
4841     __ cmp(idx, (u1)4);
4842     __ br(Assembler::LT, ShiftTwoLoop);
4843     __ b(ShiftSIMDLoop);
4844 
4845     __ BIND(ShiftTwoLoop);
4846     __ cbz(idx, Exit);
4847     __ cmp(idx, (u1)1);
4848     __ br(Assembler::EQ, ShiftOne);
4849 
4850     // Calculate the load addresses
4851     __ sub(idx, idx, 2);
4852     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4853     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4854     __ add(oldArrCur,  oldArrNext, 4);
4855 
4856     // Load 2 words and process
4857     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4858     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4859     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4860     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4861     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4862     __ st1(newElem,   __ T2S, Address(newArrCur));
4863     __ b(ShiftTwoLoop);
4864 
4865     __ BIND(ShiftThree);
4866     __ tbz(idx, 1, ShiftOne);
4867     __ tbz(idx, 0, ShiftTwo);
4868     __ ldrw(r10,  Address(oldArr, 12));
4869     __ ldrw(r11,  Address(oldArr, 8));
4870     __ lsrvw(r10, r10, shiftCount);
4871     __ lslvw(r11, r11, shiftRevCount);
4872     __ orrw(r12,  r10, r11);
4873     __ strw(r12,  Address(newArr, 8));
4874 
4875     __ BIND(ShiftTwo);
4876     __ ldrw(r10,  Address(oldArr, 8));
4877     __ ldrw(r11,  Address(oldArr, 4));
4878     __ lsrvw(r10, r10, shiftCount);
4879     __ lslvw(r11, r11, shiftRevCount);
4880     __ orrw(r12,  r10, r11);
4881     __ strw(r12,  Address(newArr, 4));
4882 
4883     __ BIND(ShiftOne);
4884     __ ldrw(r10,  Address(oldArr, 4));
4885     __ ldrw(r11,  Address(oldArr));
4886     __ lsrvw(r10, r10, shiftCount);
4887     __ lslvw(r11, r11, shiftRevCount);
4888     __ orrw(r12,  r10, r11);
4889     __ strw(r12,  Address(newArr));
4890 
4891     __ BIND(Exit);
4892     __ ret(lr);
4893 
4894     return start;
4895   }
4896 
4897   // Arguments:
4898   //
4899   // Input:
4900   //   c_rarg0   - newArr address
4901   //   c_rarg1   - oldArr address
4902   //   c_rarg2   - newIdx
4903   //   c_rarg3   - shiftCount
4904   //   c_rarg4   - numIter
4905   //
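       // A hedged scalar sketch of the transformation (illustrative only; the SIMD
       // loop below processes four words per iteration, shifts are unsigned):
       //   for (int i = 0; i < numIter; i++)
       //     newArr[newIdx + i] = (oldArr[i] << shiftCount)
       //                        | (oldArr[i + 1] >> (32 - shiftCount));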
4906   address generate_bigIntegerLeftShift() {
4907     __ align(CodeEntryAlignment);
4908     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4909     address start = __ pc();
4910 
4911     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4912 
4913     Register newArr        = c_rarg0;
4914     Register oldArr        = c_rarg1;
4915     Register newIdx        = c_rarg2;
4916     Register shiftCount    = c_rarg3;
4917     Register numIter       = c_rarg4;
4918 
4919     Register shiftRevCount = rscratch1;
4920     Register oldArrNext    = rscratch2;
4921 
4922     FloatRegister oldElem0        = v0;
4923     FloatRegister oldElem1        = v1;
4924     FloatRegister newElem         = v2;
4925     FloatRegister shiftVCount     = v3;
4926     FloatRegister shiftVRevCount  = v4;
4927 
4928     __ cbz(numIter, Exit);
4929 
4930     __ add(oldArrNext, oldArr, 4);
4931     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4932 
4933     // right shift count
4934     __ movw(shiftRevCount, 32);
4935     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4936 
4937     // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
4938     __ cmp(numIter, (u1)4);
4939     __ br(Assembler::LT, ShiftThree);
4940 
4941     __ dup(shiftVCount,     __ T4S, shiftCount);
4942     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4943     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4944 
4945     __ BIND(ShiftSIMDLoop);
4946 
4947     // load 4 words and process
4948     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4949     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4950     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4951     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4952     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4953     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4954     __ sub(numIter,   numIter, 4);
4955 
4956     __ cmp(numIter, (u1)4);
4957     __ br(Assembler::LT, ShiftTwoLoop);
4958     __ b(ShiftSIMDLoop);
4959 
4960     __ BIND(ShiftTwoLoop);
4961     __ cbz(numIter, Exit);
4962     __ cmp(numIter, (u1)1);
4963     __ br(Assembler::EQ, ShiftOne);
4964 
4965     // load 2 words and process
4966     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4967     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4968     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4969     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4970     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4971     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4972     __ sub(numIter,   numIter, 2);
4973     __ b(ShiftTwoLoop);
4974 
4975     __ BIND(ShiftThree);
4976     __ ldrw(r10,  __ post(oldArr, 4));
4977     __ ldrw(r11,  __ post(oldArrNext, 4));
4978     __ lslvw(r10, r10, shiftCount);
4979     __ lsrvw(r11, r11, shiftRevCount);
4980     __ orrw(r12,  r10, r11);
4981     __ strw(r12,  __ post(newArr, 4));
4982     __ tbz(numIter, 1, Exit);
4983     __ tbz(numIter, 0, ShiftOne);
4984 
4985     __ BIND(ShiftTwo);
4986     __ ldrw(r10,  __ post(oldArr, 4));
4987     __ ldrw(r11,  __ post(oldArrNext, 4));
4988     __ lslvw(r10, r10, shiftCount);
4989     __ lsrvw(r11, r11, shiftRevCount);
4990     __ orrw(r12,  r10, r11);
4991     __ strw(r12,  __ post(newArr, 4));
4992 
4993     __ BIND(ShiftOne);
4994     __ ldrw(r10,  Address(oldArr));
4995     __ ldrw(r11,  Address(oldArrNext));
4996     __ lslvw(r10, r10, shiftCount);
4997     __ lsrvw(r11, r11, shiftRevCount);
4998     __ orrw(r12,  r10, r11);
4999     __ strw(r12,  Address(newArr));
5000 
5001     __ BIND(Exit);
5002     __ ret(lr);
5003 
5004     return start;
5005   }
5006 
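       // A hedged sketch of the contract (illustrative only): the stub returns len
       // when no byte of ary1[0, len) has its sign bit set; otherwise it returns a
       // conservative count c, with ary1[0, c) all non-negative and c no larger than
       // the index of the first negative byte.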
5007   address generate_count_positives(address &count_positives_long) {
5008     const u1 large_loop_size = 64;
5009     const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
5010     int dcache_line = VM_Version::dcache_line_size();
5011 
5012     Register ary1 = r1, len = r2, result = r0;
5013 
5014     __ align(CodeEntryAlignment);
5015 
5016     StubCodeMark mark(this, "StubRoutines", "count_positives");
5017 
5018     address entry = __ pc();
5019 
5020     __ enter();
5021     // precondition: a copy of len is already in result
5022     // __ mov(result, len);
5023 
5024   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
5025         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
5026 
5027   __ cmp(len, (u1)15);
5028   __ br(Assembler::GT, LEN_OVER_15);
5029   // The only case in which execution falls into this code is when the pointer is
5030   // near the end of a memory page and we have to avoid reading past it.
5031   __ add(ary1, ary1, len);
5032   __ subs(len, len, 8);
5033   __ br(Assembler::GT, LEN_OVER_8);
5034   __ ldr(rscratch2, Address(ary1, -8));
5035   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
5036   __ lsrv(rscratch2, rscratch2, rscratch1);
5037   __ tst(rscratch2, UPPER_BIT_MASK);
5038   __ csel(result, zr, result, Assembler::NE);
5039   __ leave();
5040   __ ret(lr);
5041   __ bind(LEN_OVER_8);
5042   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
5043   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
5044   __ tst(rscratch2, UPPER_BIT_MASK);
5045   __ br(Assembler::NE, RET_NO_POP);
5046   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
5047   __ lsrv(rscratch1, rscratch1, rscratch2);
5048   __ tst(rscratch1, UPPER_BIT_MASK);
5049   __ bind(RET_NO_POP);
5050   __ csel(result, zr, result, Assembler::NE);
5051   __ leave();
5052   __ ret(lr);
5053 
5054   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
5055   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
5056 
5057   count_positives_long = __ pc(); // 2nd entry point
5058 
5059   __ enter();
5060 
5061   __ bind(LEN_OVER_15);
5062     __ push(spilled_regs, sp);
5063     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
5064     __ cbz(rscratch2, ALIGNED);
5065     __ ldp(tmp6, tmp1, Address(ary1));
5066     __ mov(tmp5, 16);
5067     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
5068     __ add(ary1, ary1, rscratch1);
5069     __ orr(tmp6, tmp6, tmp1);
5070     __ tst(tmp6, UPPER_BIT_MASK);
5071     __ br(Assembler::NE, RET_ADJUST);
5072     __ sub(len, len, rscratch1);
5073 
5074   __ bind(ALIGNED);
5075     __ cmp(len, large_loop_size);
5076     __ br(Assembler::LT, CHECK_16);
5077     // Perform a 16-byte load in the pre-loop as an early return, to handle the
5078     // situation where an initially aligned large array has negative values in its
5079     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
5080     // worst case, which is slower. Cases with negative bytes further ahead are not
5081     // affected that much; in fact they are faster due to the early loads, fewer
5082     // instructions, and fewer branches in LARGE_LOOP.
5083     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5084     __ sub(len, len, 16);
5085     __ orr(tmp6, tmp6, tmp1);
5086     __ tst(tmp6, UPPER_BIT_MASK);
5087     __ br(Assembler::NE, RET_ADJUST_16);
5088     __ cmp(len, large_loop_size);
5089     __ br(Assembler::LT, CHECK_16);
5090 
5091     if (SoftwarePrefetchHintDistance >= 0
5092         && SoftwarePrefetchHintDistance >= dcache_line) {
5093       // initial prefetch
5094       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5095     }
5096   __ bind(LARGE_LOOP);
5097     if (SoftwarePrefetchHintDistance >= 0) {
5098       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5099     }
5100     // Issue the load instructions first, since this can save a few CPU/memory
5101     // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one
5102     // per ldp) it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...),
5103     // which saves 3 instructions and has fewer branches, but this approach disables
5104     // the early return, so all 64 bytes are loaded and checked every time.
5105     __ ldp(tmp2, tmp3, Address(ary1));
5106     __ ldp(tmp4, tmp5, Address(ary1, 16));
5107     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5108     __ ldp(tmp6, tmp1, Address(ary1, 48));
5109     __ add(ary1, ary1, large_loop_size);
5110     __ sub(len, len, large_loop_size);
5111     __ orr(tmp2, tmp2, tmp3);
5112     __ orr(tmp4, tmp4, tmp5);
5113     __ orr(rscratch1, rscratch1, rscratch2);
5114     __ orr(tmp6, tmp6, tmp1);
5115     __ orr(tmp2, tmp2, tmp4);
5116     __ orr(rscratch1, rscratch1, tmp6);
5117     __ orr(tmp2, tmp2, rscratch1);
5118     __ tst(tmp2, UPPER_BIT_MASK);
5119     __ br(Assembler::NE, RET_ADJUST_LONG);
5120     __ cmp(len, large_loop_size);
5121     __ br(Assembler::GE, LARGE_LOOP);
5122 
5123   __ bind(CHECK_16); // small 16-byte load pre-loop
5124     __ cmp(len, (u1)16);
5125     __ br(Assembler::LT, POST_LOOP16);
5126 
5127   __ bind(LOOP16); // small 16-byte load loop
5128     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5129     __ sub(len, len, 16);
5130     __ orr(tmp2, tmp2, tmp3);
5131     __ tst(tmp2, UPPER_BIT_MASK);
5132     __ br(Assembler::NE, RET_ADJUST_16);
5133     __ cmp(len, (u1)16);
5134     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5135 
5136   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5137     __ cmp(len, (u1)8);
5138     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5139     __ ldr(tmp3, Address(__ post(ary1, 8)));
5140     __ tst(tmp3, UPPER_BIT_MASK);
5141     __ br(Assembler::NE, RET_ADJUST);
5142     __ sub(len, len, 8);
5143 
5144   __ bind(POST_LOOP16_LOAD_TAIL);
5145     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5146     __ ldr(tmp1, Address(ary1));
5147     __ mov(tmp2, 64);
5148     __ sub(tmp4, tmp2, len, __ LSL, 3);
5149     __ lslv(tmp1, tmp1, tmp4);
5150     __ tst(tmp1, UPPER_BIT_MASK);
5151     __ br(Assembler::NE, RET_ADJUST);
5152     // Fallthrough
5153 
5154   __ bind(RET_LEN);
5155     __ pop(spilled_regs, sp);
5156     __ leave();
5157     __ ret(lr);
5158 
5159     // The difference (result - len) is the count of bytes that are
5160     // guaranteed to be positive.
5161 
5162   __ bind(RET_ADJUST_LONG);
5163     __ add(len, len, (u1)(large_loop_size - 16));
5164   __ bind(RET_ADJUST_16);
5165     __ add(len, len, 16);
5166   __ bind(RET_ADJUST);
5167     __ pop(spilled_regs, sp);
5168     __ leave();
5169     __ sub(result, result, len);
5170     __ ret(lr);
5171 
5172     return entry;
5173   }
5174 
5175   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5176         bool usePrefetch, Label &NOT_EQUAL) {
5177     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5178         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5179         tmp7 = r12, tmp8 = r13;
5180     Label LOOP;
5181 
5182     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5183     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5184     __ bind(LOOP);
5185     if (usePrefetch) {
5186       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5187       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5188     }
5189     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5190     __ eor(tmp1, tmp1, tmp2);
5191     __ eor(tmp3, tmp3, tmp4);
5192     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5193     __ orr(tmp1, tmp1, tmp3);
5194     __ cbnz(tmp1, NOT_EQUAL);
5195     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5196     __ eor(tmp5, tmp5, tmp6);
5197     __ eor(tmp7, tmp7, tmp8);
5198     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5199     __ orr(tmp5, tmp5, tmp7);
5200     __ cbnz(tmp5, NOT_EQUAL);
5201     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5202     __ eor(tmp1, tmp1, tmp2);
5203     __ eor(tmp3, tmp3, tmp4);
5204     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5205     __ orr(tmp1, tmp1, tmp3);
5206     __ cbnz(tmp1, NOT_EQUAL);
5207     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5208     __ eor(tmp5, tmp5, tmp6);
5209     __ sub(cnt1, cnt1, 8 * wordSize);
5210     __ eor(tmp7, tmp7, tmp8);
5211     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5212     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5213     // cmp) because subs allows an unlimited range of immediate operands.
5214     __ subs(tmp6, cnt1, loopThreshold);
5215     __ orr(tmp5, tmp5, tmp7);
5216     __ cbnz(tmp5, NOT_EQUAL);
5217     __ br(__ GE, LOOP);
5218     // post-loop
5219     __ eor(tmp1, tmp1, tmp2);
5220     __ eor(tmp3, tmp3, tmp4);
5221     __ orr(tmp1, tmp1, tmp3);
5222     __ sub(cnt1, cnt1, 2 * wordSize);
5223     __ cbnz(tmp1, NOT_EQUAL);
5224   }
5225 
5226   void generate_large_array_equals_loop_simd(int loopThreshold,
5227         bool usePrefetch, Label &NOT_EQUAL) {
5228     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5229         tmp2 = rscratch2;
5230     Label LOOP;
5231 
5232     __ bind(LOOP);
5233     if (usePrefetch) {
5234       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5235       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5236     }
5237     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5238     __ sub(cnt1, cnt1, 8 * wordSize);
5239     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5240     __ subs(tmp1, cnt1, loopThreshold);
5241     __ eor(v0, __ T16B, v0, v4);
5242     __ eor(v1, __ T16B, v1, v5);
5243     __ eor(v2, __ T16B, v2, v6);
5244     __ eor(v3, __ T16B, v3, v7);
5245     __ orr(v0, __ T16B, v0, v1);
5246     __ orr(v1, __ T16B, v2, v3);
5247     __ orr(v0, __ T16B, v0, v1);
5248     __ umov(tmp1, v0, __ D, 0);
5249     __ umov(tmp2, v0, __ D, 1);
5250     __ orr(tmp1, tmp1, tmp2);
5251     __ cbnz(tmp1, NOT_EQUAL);
5252     __ br(__ GE, LOOP);
5253   }
5254 
5255   // a1 = r1 - array1 address
5256   // a2 = r2 - array2 address
5257   // result = r0 - return value. Already contains "false"
5258   // cnt1 = r10 - number of elements left to check, reduced by wordSize
5259   // r3-r5 are reserved temporary registers
5260   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
5261   address generate_large_array_equals() {
5262     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5263         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5264         tmp7 = r12, tmp8 = r13;
5265     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5266         SMALL_LOOP, POST_LOOP;
5267     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5268     // calculate if at least 32 prefetched bytes are used
5269     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5270     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5271     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5272     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5273         tmp5, tmp6, tmp7, tmp8);
5274 
5275     __ align(CodeEntryAlignment);
5276 
5277     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5278 
5279     address entry = __ pc();
5280     __ enter();
5281     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5282     // also advance pointers to use post-increment instead of pre-increment
5283     __ add(a1, a1, wordSize);
5284     __ add(a2, a2, wordSize);
5285     if (AvoidUnalignedAccesses) {
5286       // Both implementations (SIMD/non-SIMD) use relatively large load
5287       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
5288       // time) on some CPUs when the address is not at least 16-byte aligned.
5289       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
5290       // load, if needed, for at least the first address to make it 16-byte aligned.
5291       Label ALIGNED16;
5292       __ tbz(a1, 3, ALIGNED16);
5293       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5294       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5295       __ sub(cnt1, cnt1, wordSize);
5296       __ eor(tmp1, tmp1, tmp2);
5297       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5298       __ bind(ALIGNED16);
5299     }
5300     if (UseSIMDForArrayEquals) {
5301       if (SoftwarePrefetchHintDistance >= 0) {
5302         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5303         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5304         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5305             /* prfm = */ true, NOT_EQUAL);
5306         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5307         __ br(__ LT, TAIL);
5308       }
5309       __ bind(NO_PREFETCH_LARGE_LOOP);
5310       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5311           /* prfm = */ false, NOT_EQUAL);
5312     } else {
5313       __ push(spilled_regs, sp);
5314       if (SoftwarePrefetchHintDistance >= 0) {
5315         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5316         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5317         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5318             /* prfm = */ true, NOT_EQUAL);
5319         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5320         __ br(__ LT, TAIL);
5321       }
5322       __ bind(NO_PREFETCH_LARGE_LOOP);
5323       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5324           /* prfm = */ false, NOT_EQUAL);
5325     }
5326     __ bind(TAIL);
5327       __ cbz(cnt1, EQUAL);
5328       __ subs(cnt1, cnt1, wordSize);
5329       __ br(__ LE, POST_LOOP);
5330     __ bind(SMALL_LOOP);
5331       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5332       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5333       __ subs(cnt1, cnt1, wordSize);
5334       __ eor(tmp1, tmp1, tmp2);
5335       __ cbnz(tmp1, NOT_EQUAL);
5336       __ br(__ GT, SMALL_LOOP);
5337     __ bind(POST_LOOP);
5338       __ ldr(tmp1, Address(a1, cnt1));
5339       __ ldr(tmp2, Address(a2, cnt1));
5340       __ eor(tmp1, tmp1, tmp2);
5341       __ cbnz(tmp1, NOT_EQUAL);
5342     __ bind(EQUAL);
5343       __ mov(result, true);
5344     __ bind(NOT_EQUAL);
5345       if (!UseSIMDForArrayEquals) {
5346         __ pop(spilled_regs, sp);
5347       }
5348     __ bind(NOT_EQUAL_NO_POP);
5349     __ leave();
5350     __ ret(lr);
5351     return entry;
5352   }
5353 
5354   // result = r0 - return value. Contains initial hashcode value on entry.
5355   // ary = r1 - array address
5356   // cnt = r2 - elements count
5357   // Clobbers: v0-v13, rscratch1, rscratch2
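       //
       // A hedged scalar reference of the hash being vectorized (illustrative only;
       // elements are widened according to eltype before the multiply-add):
       //   int h = result;
       //   for (int i = 0; i < cnt; i++)
       //     h = 31 * h + ary[i];
       //   return h;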
5358   address generate_large_arrays_hashcode(BasicType eltype) {
5359     const Register result = r0, ary = r1, cnt = r2;
5360     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
5361     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
5362     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
5363     const FloatRegister vpowm = v13;
5364 
5365     ARRAYS_HASHCODE_REGISTERS;
5366 
5367     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
5368 
5369     unsigned int vf; // vectorization factor
5370     bool multiply_by_halves;
5371     Assembler::SIMD_Arrangement load_arrangement;
5372     switch (eltype) {
5373     case T_BOOLEAN:
5374     case T_BYTE:
5375       load_arrangement = Assembler::T8B;
5376       multiply_by_halves = true;
5377       vf = 8;
5378       break;
5379     case T_CHAR:
5380     case T_SHORT:
5381       load_arrangement = Assembler::T8H;
5382       multiply_by_halves = true;
5383       vf = 8;
5384       break;
5385     case T_INT:
5386       load_arrangement = Assembler::T4S;
5387       multiply_by_halves = false;
5388       vf = 4;
5389       break;
5390     default:
5391       ShouldNotReachHere();
5392     }
5393 
5394     // Unroll factor
5395     const unsigned uf = 4;
5396 
5397     // Effective vectorization factor
5398     const unsigned evf = vf * uf;
5399 
5400     __ align(CodeEntryAlignment);
5401 
5402     const char *mark_name = "";
5403     switch (eltype) {
5404     case T_BOOLEAN:
5405       mark_name = "_large_arrays_hashcode_boolean";
5406       break;
5407     case T_BYTE:
5408       mark_name = "_large_arrays_hashcode_byte";
5409       break;
5410     case T_CHAR:
5411       mark_name = "_large_arrays_hashcode_char";
5412       break;
5413     case T_SHORT:
5414       mark_name = "_large_arrays_hashcode_short";
5415       break;
5416     case T_INT:
5417       mark_name = "_large_arrays_hashcode_int";
5418       break;
5419     default:
5420       mark_name = "_large_arrays_hashcode_incorrect_type";
5421       __ should_not_reach_here();
5422     };
5423 
5424     StubCodeMark mark(this, "StubRoutines", mark_name);
5425 
5426     address entry = __ pc();
5427     __ enter();
5428 
5429     // Put the 0th to 3rd powers of 31 together into a single SIMD register. The register is used
5430     // in the SMALL and LARGE loops' epilogues. The initialization is hoisted here and the register's
5431     // value must not change throughout either loop.
5432     __ movw(rscratch1, intpow(31U, 3));
5433     __ mov(vpow, Assembler::S, 0, rscratch1);
5434     __ movw(rscratch1, intpow(31U, 2));
5435     __ mov(vpow, Assembler::S, 1, rscratch1);
5436     __ movw(rscratch1, intpow(31U, 1));
5437     __ mov(vpow, Assembler::S, 2, rscratch1);
5438     __ movw(rscratch1, intpow(31U, 0));
5439     __ mov(vpow, Assembler::S, 3, rscratch1);
5440 
5441     __ mov(vmul0, Assembler::T16B, 0);
5442     __ mov(vmul0, Assembler::S, 3, result);
5443 
5444     __ andr(rscratch2, cnt, (uf - 1) * vf);
5445     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
5446 
5447     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
5448     __ mov(vpowm, Assembler::S, 0, rscratch1);
5449 
5450     // SMALL LOOP
5451     __ bind(SMALL_LOOP);
5452 
5453     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
5454     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5455     __ subsw(rscratch2, rscratch2, vf);
5456 
5457     if (load_arrangement == Assembler::T8B) {
5458       // Extend 8B to 8H to be able to use vector multiply
5459       // instructions
5460       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
5461       if (is_signed_subword_type(eltype)) {
5462         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5463       } else {
5464         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5465       }
5466     }
5467 
5468     switch (load_arrangement) {
5469     case Assembler::T4S:
5470       __ addv(vmul0, load_arrangement, vmul0, vdata0);
5471       break;
5472     case Assembler::T8B:
5473     case Assembler::T8H:
5474       assert(is_subword_type(eltype), "subword type expected");
5475       if (is_signed_subword_type(eltype)) {
5476         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5477       } else {
5478         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5479       }
5480       break;
5481     default:
5482       __ should_not_reach_here();
5483     }
5484 
5485     // Process the upper half of a vector
5486     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
5487       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5488       if (is_signed_subword_type(eltype)) {
5489         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5490       } else {
5491         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5492       }
5493     }
5494 
5495     __ br(Assembler::HI, SMALL_LOOP);
5496 
5497     // SMALL LOOP'S EPILOGUE
5498     __ lsr(rscratch2, cnt, exact_log2(evf));
5499     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
5500 
5501     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
5502     __ addv(vmul0, Assembler::T4S, vmul0);
5503     __ umov(result, vmul0, Assembler::S, 0);
5504 
5505     // TAIL
5506     __ bind(TAIL);
5507 
5508     // The andr computes cnt % vf. The subtract, shifted by 3, offsets the branch target past
5509     // vf - 1 - (cnt % vf) pairs of load + madd insns, i.e. only cnt % vf load + madd pairs execute.
5510     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
5511     __ andr(rscratch2, cnt, vf - 1);
5512     __ bind(TAIL_SHORTCUT);
5513     __ adr(rscratch1, BR_BASE);
5514     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
5515     __ movw(rscratch2, 0x1f);
5516     __ br(rscratch1);
5517 
5518     for (size_t i = 0; i < vf - 1; ++i) {
5519       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
5520                                    eltype);
5521       __ maddw(result, result, rscratch2, rscratch1);
5522     }
5523     __ bind(BR_BASE);
5524 
5525     __ leave();
5526     __ ret(lr);
5527 
5528     // LARGE LOOP
5529     __ bind(LARGE_LOOP_PREHEADER);
5530 
5531     __ lsr(rscratch2, cnt, exact_log2(evf));
5532 
5533     if (multiply_by_halves) {
5534       // 31^4 - multiplier between lower and upper parts of a register
5535       __ movw(rscratch1, intpow(31U, vf / 2));
5536       __ mov(vpowm, Assembler::S, 1, rscratch1);
5537       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
5538       __ movw(rscratch1, intpow(31U, evf - vf / 2));
5539       __ mov(vpowm, Assembler::S, 0, rscratch1);
5540     } else {
5541       // 31^16
5542       __ movw(rscratch1, intpow(31U, evf));
5543       __ mov(vpowm, Assembler::S, 0, rscratch1);
5544     }
5545 
5546     __ mov(vmul3, Assembler::T16B, 0);
5547     __ mov(vmul2, Assembler::T16B, 0);
5548     __ mov(vmul1, Assembler::T16B, 0);
5549 
5550     __ bind(LARGE_LOOP);
5551 
5552     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
5553     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
5554     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
5555     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5556 
5557     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
5558            Address(__ post(ary, evf * type2aelembytes(eltype))));
5559 
5560     if (load_arrangement == Assembler::T8B) {
5561       // Extend 8B to 8H to be able to use vector multiply
5562       // instructions
5563       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
5564       if (is_signed_subword_type(eltype)) {
5565         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
5566         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
5567         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
5568         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5569       } else {
5570         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
5571         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
5572         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
5573         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5574       }
5575     }
5576 
5577     switch (load_arrangement) {
5578     case Assembler::T4S:
5579       __ addv(vmul3, load_arrangement, vmul3, vdata3);
5580       __ addv(vmul2, load_arrangement, vmul2, vdata2);
5581       __ addv(vmul1, load_arrangement, vmul1, vdata1);
5582       __ addv(vmul0, load_arrangement, vmul0, vdata0);
5583       break;
5584     case Assembler::T8B:
5585     case Assembler::T8H:
5586       assert(is_subword_type(eltype), "subword type expected");
5587       if (is_signed_subword_type(eltype)) {
5588         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
5589         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
5590         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
5591         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5592       } else {
5593         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
5594         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
5595         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
5596         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5597       }
5598       break;
5599     default:
5600       __ should_not_reach_here();
5601     }
5602 
5603     // Process the upper half of a vector
5604     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
5605       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
5606       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
5607       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
5608       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
5609       if (is_signed_subword_type(eltype)) {
5610         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
5611         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
5612         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
5613         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5614       } else {
5615         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
5616         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
5617         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
5618         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5619       }
5620     }
5621 
5622     __ subsw(rscratch2, rscratch2, 1);
5623     __ br(Assembler::HI, LARGE_LOOP);
5624 
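         // Final reduction: each accumulator is scaled lane-wise by vpow and summed
         // across lanes into a scalar partial hash (call them h3..h0); the partial
         // hashes are then combined Horner-style with 31^vf (kept in rscratch2),
         // in effect
         //   result = ((h3 * 31^vf + h2) * 31^vf + h1) * 31^vf + h0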
5625     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
5626     __ addv(vmul3, Assembler::T4S, vmul3);
5627     __ umov(result, vmul3, Assembler::S, 0);
5628 
5629     __ mov(rscratch2, intpow(31U, vf));
5630 
5631     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
5632     __ addv(vmul2, Assembler::T4S, vmul2);
5633     __ umov(rscratch1, vmul2, Assembler::S, 0);
5634     __ maddw(result, result, rscratch2, rscratch1);
5635 
5636     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
5637     __ addv(vmul1, Assembler::T4S, vmul1);
5638     __ umov(rscratch1, vmul1, Assembler::S, 0);
5639     __ maddw(result, result, rscratch2, rscratch1);
5640 
5641     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
5642     __ addv(vmul0, Assembler::T4S, vmul0);
5643     __ umov(rscratch1, vmul0, Assembler::S, 0);
5644     __ maddw(result, result, rscratch2, rscratch1);
5645 
5646     __ andr(rscratch2, cnt, vf - 1);
5647     __ cbnz(rscratch2, TAIL_SHORTCUT);
5648 
5649     __ leave();
5650     __ ret(lr);
5651 
5652     return entry;
5653   }
5654 
5655   address generate_dsin_dcos(bool isCos) {
5656     __ align(CodeEntryAlignment);
5657     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5658     address start = __ pc();
5659     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5660         (address)StubRoutines::aarch64::_two_over_pi,
5661         (address)StubRoutines::aarch64::_pio2,
5662         (address)StubRoutines::aarch64::_dsin_coef,
5663         (address)StubRoutines::aarch64::_dcos_coef);
5664     return start;
5665   }
5666 
5667   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
5668   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5669       Label &DIFF2) {
5670     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5671     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5672 
5673     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5674     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5675     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5676     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
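         // For example, the Latin1 bytes 61 62 63 64 ('a'..'d') zipped with zero
         // bytes become 61 00 62 00 63 00 64 00, i.e. the little-endian UTF-16
         // code units 0x0061 0x0062 0x0063 0x0064.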
5677 
5678     __ fmovd(tmpL, vtmp3);
5679     __ eor(rscratch2, tmp3, tmpL);
5680     __ cbnz(rscratch2, DIFF2);
5681 
5682     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5683     __ umov(tmpL, vtmp3, __ D, 1);
5684     __ eor(rscratch2, tmpU, tmpL);
5685     __ cbnz(rscratch2, DIFF1);
5686 
5687     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5688     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5689     __ fmovd(tmpL, vtmp);
5690     __ eor(rscratch2, tmp3, tmpL);
5691     __ cbnz(rscratch2, DIFF2);
5692 
5693     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5694     __ umov(tmpL, vtmp, __ D, 1);
5695     __ eor(rscratch2, tmpU, tmpL);
5696     __ cbnz(rscratch2, DIFF1);
5697   }
5698 
5699   // r0  = result
5700   // r1  = str1
5701   // r2  = cnt1
5702   // r3  = str2
5703   // r4  = cnt2
5704   // r10 = tmp1
5705   // r11 = tmp2
5706   address generate_compare_long_string_different_encoding(bool isLU) {
5707     __ align(CodeEntryAlignment);
5708     StubCodeMark mark(this, "StubRoutines", isLU
5709         ? "compare_long_string_different_encoding LU"
5710         : "compare_long_string_different_encoding UL");
5711     address entry = __ pc();
5712     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5713         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5714         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5715     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5716         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5717     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5718     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5719 
5720     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5721 
5722     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5723     // cnt2 == number of characters left to compare
5724     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
5725     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5726     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5727     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5728     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5729     __ subw(cnt2, cnt2, 8); // 4 symbols already loaded. The last 4 are a special case.
5730     __ eor(rscratch2, tmp1, tmp2);
5731     __ mov(rscratch1, tmp2);
5732     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5733     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5734              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5735     __ push(spilled_regs, sp);
5736     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5737     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5738 
5739     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5740 
5741     if (SoftwarePrefetchHintDistance >= 0) {
5742       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5743       __ br(__ LT, NO_PREFETCH);
5744       __ bind(LARGE_LOOP_PREFETCH);
5745         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5746         __ mov(tmp4, 2);
5747         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5748         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5749           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5750           __ subs(tmp4, tmp4, 1);
5751           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5752           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5753           __ mov(tmp4, 2);
5754         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5755           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5756           __ subs(tmp4, tmp4, 1);
5757           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5758           __ sub(cnt2, cnt2, 64);
5759           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5760           __ br(__ GE, LARGE_LOOP_PREFETCH);
5761     }
5762     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5763     __ bind(NO_PREFETCH);
5764     __ subs(cnt2, cnt2, 16);
5765     __ br(__ LT, TAIL);
5766     __ align(OptoLoopAlignment);
5767     __ bind(SMALL_LOOP); // smaller loop
5768       __ subs(cnt2, cnt2, 16);
5769       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5770       __ br(__ GE, SMALL_LOOP);
5771       __ cmn(cnt2, (u1)16);
5772       __ br(__ EQ, LOAD_LAST);
5773     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5774       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5775       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5776       __ ldr(tmp3, Address(cnt1, -8));
5777       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5778       __ b(LOAD_LAST);
5779     __ bind(DIFF2);
5780       __ mov(tmpU, tmp3);
5781     __ bind(DIFF1);
5782       __ pop(spilled_regs, sp);
5783       __ b(CALCULATE_DIFFERENCE);
5784     __ bind(LOAD_LAST);
5785       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5786       // No need to load them again.
5787       __ mov(tmpU, tmp3);
5788       __ pop(spilled_regs, sp);
5789 
5790       // tmp2 points to the address of the last 4 Latin1 characters right now
5791       __ ldrs(vtmp, Address(tmp2));
5792       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5793       __ fmovd(tmpL, vtmp);
5794 
5795       __ eor(rscratch2, tmpU, tmpL);
5796       __ cbz(rscratch2, DONE);
5797 
5798     // Find the first different characters in the longwords and
5799     // compute their difference.
5800     __ bind(CALCULATE_DIFFERENCE);
5801       __ rev(rscratch2, rscratch2);
5802       __ clz(rscratch2, rscratch2);
5803       __ andr(rscratch2, rscratch2, -16);
5804       __ lsrv(tmp1, tmp1, rscratch2);
5805       __ uxthw(tmp1, tmp1);
5806       __ lsrv(rscratch1, rscratch1, rscratch2);
5807       __ uxthw(rscratch1, rscratch1);
5808       __ subw(result, tmp1, rscratch1);
5809     __ bind(DONE);
5810       __ ret(lr);
5811     return entry;
5812   }
5813 
5814   // r0 = input (float16)
5815   // v0 = result (float)
5816   // v1 = temporary float register
5817   address generate_float16ToFloat() {
5818     __ align(CodeEntryAlignment);
5819     StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
5820     address entry = __ pc();
5821     BLOCK_COMMENT("Entry:");
5822     __ flt16_to_flt(v0, r0, v1);
5823     __ ret(lr);
5824     return entry;
5825   }
5826 
5827   // v0 = input (float)
5828   // r0 = result (float16)
5829   // v1 = temporary float register
5830   address generate_floatToFloat16() {
5831     __ align(CodeEntryAlignment);
5832     StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
5833     address entry = __ pc();
5834     BLOCK_COMMENT("Entry:");
5835     __ flt_to_flt16(r0, v0, v1);
5836     __ ret(lr);
5837     return entry;
5838   }
5839 
5840   address generate_method_entry_barrier() {
5841     __ align(CodeEntryAlignment);
5842     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5843 
5844     Label deoptimize_label;
5845 
5846     address start = __ pc();
5847 
5848     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5849 
5850     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5851       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5852       // We can get here despite the nmethod being good, if we have not
5853       // yet applied our cross modification fence (or data fence).
5854       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5855       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5856       __ ldrw(rscratch2, rscratch2);
5857       __ strw(rscratch2, thread_epoch_addr);
5858       __ isb();
5859       __ membar(__ LoadLoad);
5860     }
5861 
5862     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5863 
5864     __ enter();
5865     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5866 
5867     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5868 
5869     __ push_call_clobbered_registers();
5870 
5871     __ mov(c_rarg0, rscratch2);
5872     __ call_VM_leaf
5873          (CAST_FROM_FN_PTR
5874           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5875 
5876     __ reset_last_Java_frame(true);
5877 
5878     __ mov(rscratch1, r0);
5879 
5880     __ pop_call_clobbered_registers();
5881 
5882     __ cbnz(rscratch1, deoptimize_label);
5883 
5884     __ leave();
5885     __ ret(lr);
5886 
5887     __ BIND(deoptimize_label);
5888 
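         // Deoptimization path: continue at the {sp, fp, lr, pc} held in the four
         // words reserved above for exactly this purpose.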
5889     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5890     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5891 
5892     __ mov(sp, rscratch1);
5893     __ br(rscratch2);
5894 
5895     return start;
5896   }
5897 
5898   // r0  = result
5899   // r1  = str1
5900   // r2  = cnt1
5901   // r3  = str2
5902   // r4  = cnt2
5903   // r10 = tmp1
5904   // r11 = tmp2
5905   address generate_compare_long_string_same_encoding(bool isLL) {
5906     __ align(CodeEntryAlignment);
5907     StubCodeMark mark(this, "StubRoutines", isLL
5908         ? "compare_long_string_same_encoding LL"
5909         : "compare_long_string_same_encoding UU");
5910     address entry = __ pc();
5911     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5912         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5913 
5914     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5915 
5916     // Exit the large loop when fewer than 64 bytes are left to read or when the
5917     // next prefetch would reach beyond the array bounds.
5918     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5919 
5920     // The caller pre-loads 8 bytes before jumping to this stub, so compare them directly.
5921     __ eor(rscratch2, tmp1, tmp2);
5922     __ cbnz(rscratch2, CAL_DIFFERENCE);
5923 
5924     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5925     // update pointers to account for the previous read
5926     __ add(str1, str1, wordSize);
5927     __ add(str2, str2, wordSize);
5928     if (SoftwarePrefetchHintDistance >= 0) {
5929       __ align(OptoLoopAlignment);
5930       __ bind(LARGE_LOOP_PREFETCH);
5931         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5932         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5933 
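             // cmp checks the low 8 bytes of each pair; ccmp compares the high 8 bytes
             // only if the low ones were equal and otherwise forces a not-equal result,
             // so a single NE branch covers all 16 bytes.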
5934         for (int i = 0; i < 4; i++) {
5935           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5936           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5937           __ cmp(tmp1, tmp2);
5938           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5939           __ br(Assembler::NE, DIFF);
5940         }
5941         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5942         __ add(str1, str1, 64);
5943         __ add(str2, str2, 64);
5944         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5945         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5946         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5947     }
5948 
5949     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5950     __ br(Assembler::LE, LESS16);
5951     __ align(OptoLoopAlignment);
5952     __ bind(LOOP_COMPARE16);
5953       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5954       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5955       __ cmp(tmp1, tmp2);
5956       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5957       __ br(Assembler::NE, DIFF);
5958       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5959       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5960       __ br(Assembler::LT, LESS16);
5961 
5962       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5963       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5964       __ cmp(tmp1, tmp2);
5965       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5966       __ br(Assembler::NE, DIFF);
5967       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5968       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5969       __ br(Assembler::GE, LOOP_COMPARE16);
5970       __ cbz(cnt2, LENGTH_DIFF);
5971 
5972     __ bind(LESS16);
5973       // compare 8 bytes at a time
5974       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5975       __ br(Assembler::LE, LESS8);
5976       __ ldr(tmp1, Address(__ post(str1, 8)));
5977       __ ldr(tmp2, Address(__ post(str2, 8)));
5978       __ eor(rscratch2, tmp1, tmp2);
5979       __ cbnz(rscratch2, CAL_DIFFERENCE);
5980       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5981 
5982     __ bind(LESS8); // directly load last 8 bytes
5983       if (!isLL) {
5984         __ add(cnt2, cnt2, cnt2);
5985       }
5986       __ ldr(tmp1, Address(str1, cnt2));
5987       __ ldr(tmp2, Address(str2, cnt2));
5988       __ eor(rscratch2, tmp1, tmp2);
5989       __ cbz(rscratch2, LENGTH_DIFF);
5990       __ b(CAL_DIFFERENCE);
5991 
5992     __ bind(DIFF);
5993       __ cmp(tmp1, tmp2);
5994       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5995       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5996       // reuse rscratch2 register for the result of eor instruction
5997       __ eor(rscratch2, tmp1, tmp2);
5998 
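         // rscratch2 holds the XOR of the mismatching words: rev + clz yield the bit
         // offset of the first (lowest-addressed) differing byte, andr rounds it down
         // to a character boundary, and lsrv shifts the differing characters into the
         // low bits so that their zero-extended difference can be returned.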
5999     __ bind(CAL_DIFFERENCE);
6000       __ rev(rscratch2, rscratch2);
6001       __ clz(rscratch2, rscratch2);
6002       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
6003       __ lsrv(tmp1, tmp1, rscratch2);
6004       __ lsrv(tmp2, tmp2, rscratch2);
6005       if (isLL) {
6006         __ uxtbw(tmp1, tmp1);
6007         __ uxtbw(tmp2, tmp2);
6008       } else {
6009         __ uxthw(tmp1, tmp1);
6010         __ uxthw(tmp2, tmp2);
6011       }
6012       __ subw(result, tmp1, tmp2);
6013 
6014     __ bind(LENGTH_DIFF);
6015       __ ret(lr);
6016     return entry;
6017   }
6018 
6019   enum string_compare_mode {
6020     LL,
6021     LU,
6022     UL,
6023     UU,
6024   };
6025 
6026   // The following registers are declared in aarch64.ad
6027   // r0  = result
6028   // r1  = str1
6029   // r2  = cnt1
6030   // r3  = str2
6031   // r4  = cnt2
6032   // r10 = tmp1
6033   // r11 = tmp2
6034   // z0  = ztmp1
6035   // z1  = ztmp2
6036   // p0  = pgtmp1
6037   // p1  = pgtmp2
6038   address generate_compare_long_string_sve(string_compare_mode mode) {
6039     __ align(CodeEntryAlignment);
6040     address entry = __ pc();
6041     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
6042              tmp1 = r10, tmp2 = r11;
6043 
6044     Label LOOP, DONE, MISMATCH;
6045     Register vec_len = tmp1;
6046     Register idx = tmp2;
6047     // The minimum of the string lengths has been stored in cnt2.
6048     Register cnt = cnt2;
6049     FloatRegister ztmp1 = z0, ztmp2 = z1;
6050     PRegister pgtmp1 = p0, pgtmp2 = p1;
6051 
6052 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
6053     switch (mode) {                                                            \
6054       case LL:                                                                 \
6055         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
6056         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
6057         break;                                                                 \
6058       case LU:                                                                 \
6059         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
6060         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
6061         break;                                                                 \
6062       case UL:                                                                 \
6063         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
6064         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
6065         break;                                                                 \
6066       case UU:                                                                 \
6067         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
6068         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
6069         break;                                                                 \
6070       default:                                                                 \
6071         ShouldNotReachHere();                                                  \
6072     }
6073 
6074     const char* stubname;
6075     switch (mode) {
6076       case LL: stubname = "compare_long_string_same_encoding LL";      break;
6077       case LU: stubname = "compare_long_string_different_encoding LU"; break;
6078       case UL: stubname = "compare_long_string_different_encoding UL"; break;
6079       case UU: stubname = "compare_long_string_same_encoding UU";      break;
6080       default: ShouldNotReachHere();
6081     }
6082 
6083     StubCodeMark mark(this, "StubRoutines", stubname);
6084 
6085     __ mov(idx, 0);
6086     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
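         // whilelt makes lane i of pgtmp1 active while (idx + i) < cnt, so the same
         // predicated loads serve both the full vectors of the main loop and the
         // final partial vector.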
6087 
6088     if (mode == LL) {
6089       __ sve_cntb(vec_len);
6090     } else {
6091       __ sve_cnth(vec_len);
6092     }
6093 
6094     __ sub(rscratch1, cnt, vec_len);
6095 
6096     __ bind(LOOP);
6097 
6098       // main loop
6099       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
6100       __ add(idx, idx, vec_len);
6101       // Compare strings.
6102       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
6103       __ br(__ NE, MISMATCH);
6104       __ cmp(idx, rscratch1);
6105       __ br(__ LT, LOOP);
6106 
6107     // post loop, last iteration
6108     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
6109 
6110     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
6111     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
6112     __ br(__ EQ, DONE);
6113 
6114     __ bind(MISMATCH);
6115 
6116     // Crop the predicate to the lanes before the first mismatch to locate it.
6117     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
6118     // Extract the first different characters of each string.
6119     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
6120     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
6121 
6122     // Compute the difference of the first different characters.
6123     __ sub(result, rscratch1, rscratch2);
6124 
6125     __ bind(DONE);
6126     __ ret(lr);
6127 #undef LOAD_PAIR
6128     return entry;
6129   }
6130 
6131   void generate_compare_long_strings() {
6132     if (UseSVE == 0) {
6133       StubRoutines::aarch64::_compare_long_string_LL
6134           = generate_compare_long_string_same_encoding(true);
6135       StubRoutines::aarch64::_compare_long_string_UU
6136           = generate_compare_long_string_same_encoding(false);
6137       StubRoutines::aarch64::_compare_long_string_LU
6138           = generate_compare_long_string_different_encoding(true);
6139       StubRoutines::aarch64::_compare_long_string_UL
6140           = generate_compare_long_string_different_encoding(false);
6141     } else {
6142       StubRoutines::aarch64::_compare_long_string_LL
6143           = generate_compare_long_string_sve(LL);
6144       StubRoutines::aarch64::_compare_long_string_UU
6145           = generate_compare_long_string_sve(UU);
6146       StubRoutines::aarch64::_compare_long_string_LU
6147           = generate_compare_long_string_sve(LU);
6148       StubRoutines::aarch64::_compare_long_string_UL
6149           = generate_compare_long_string_sve(UL);
6150     }
6151   }
6152 
6153   // R0 = result
6154   // R1 = str2
6155   // R2 = cnt1
6156   // R3 = str1
6157   // R4 = cnt2
6158   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
6159   //
6160   // This generic linear code uses a few additional ideas that make it faster:
6161   // 1) we can safely keep at least the 1st register of the pattern (since
6162   // length >= 8) in order to skip the initial load (helps on systems with a
6163   // single load pipeline)
6164   // 2) we use a "fast" algorithm for finding the first pattern character with
6165   // fewer branches (one branch per loaded register instead of one per symbol);
6166   // this is where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f
6167   // and 0x7fff7fff...7fff come from (see the worked example below)
6168   // 3) after loading and analyzing the 1st register of the source string, it
6169   // can be reused to search for every occurrence of the 1st character, saving
6170   // a few loads compared with a simpler-but-slower implementation
6171   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
6172   // re-initializes and compresses register values, making the code larger and a
6173   // bit less readable; however, most extra operations overlap with loads or branches, so the penalty is minimal
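       //
       // The match detection in (2) is the classic SWAR zero-byte trick: after
       // XOR-ing the loaded characters with the first pattern character repeated in
       // every byte/halfword, a matching position becomes a zero element, and
       //   (v - 0x0101...01) & ~(v | 0x7f7f...7f)
       // sets the top bit of an element iff that element of v is zero (note that
       // ~(v | 0x7f...7f) == ~v & 0x80...80; for a single byte b,
       // (b - 1) & ~b & 0x80 is non-zero only when b == 0). The code below computes
       // this with sub + orr + bics and locates candidate positions with rbit + clz;
       // each candidate is then verified by a full comparison.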
6174   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
6175     const char* stubName = str1_isL
6176         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
6177         : "indexof_linear_uu";
6178     __ align(CodeEntryAlignment);
6179     StubCodeMark mark(this, "StubRoutines", stubName);
6180     address entry = __ pc();
6181 
6182     int str1_chr_size = str1_isL ? 1 : 2;
6183     int str2_chr_size = str2_isL ? 1 : 2;
6184     int str1_chr_shift = str1_isL ? 0 : 1;
6185     int str2_chr_shift = str2_isL ? 0 : 1;
6186     bool isL = str1_isL && str2_isL;
6187     // parameters
6188     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
6189     // temporary registers
6190     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
6191     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
6192     // redefinitions
6193     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
6194 
6195     __ push(spilled_regs, sp);
6196     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
6197         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
6198         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
6199         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
6200         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
6201         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
6202     // Read a whole register from str1. This is safe because length >= 8 here.
6203     __ ldr(ch1, Address(str1));
6204     // Read a whole register from str2. This is safe because length >= 8 here.
6205     __ ldr(ch2, Address(str2));
6206     __ sub(cnt2, cnt2, cnt1);
6207     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
6208     if (str1_isL != str2_isL) {
6209       __ eor(v0, __ T16B, v0, v0);
6210     }
6211     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
6212     __ mul(first, first, tmp1);
6213     // check if we have less than 1 register to check
6214     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
6215     if (str1_isL != str2_isL) {
6216       __ fmovd(v1, ch1);
6217     }
6218     __ br(__ LE, L_SMALL);
6219     __ eor(ch2, first, ch2);
6220     if (str1_isL != str2_isL) {
6221       __ zip1(v1, __ T16B, v1, v0);
6222     }
6223     __ sub(tmp2, ch2, tmp1);
6224     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6225     __ bics(tmp2, tmp2, ch2);
6226     if (str1_isL != str2_isL) {
6227       __ fmovd(ch1, v1);
6228     }
6229     __ br(__ NE, L_HAS_ZERO);
6230     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
6231     __ add(result, result, wordSize/str2_chr_size);
6232     __ add(str2, str2, wordSize);
6233     __ br(__ LT, L_POST_LOOP);
6234     __ BIND(L_LOOP);
6235       __ ldr(ch2, Address(str2));
6236       __ eor(ch2, first, ch2);
6237       __ sub(tmp2, ch2, tmp1);
6238       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6239       __ bics(tmp2, tmp2, ch2);
6240       __ br(__ NE, L_HAS_ZERO);
6241     __ BIND(L_LOOP_PROCEED);
6242       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
6243       __ add(str2, str2, wordSize);
6244       __ add(result, result, wordSize/str2_chr_size);
6245       __ br(__ GE, L_LOOP);
6246     __ BIND(L_POST_LOOP);
6247       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
6248       __ br(__ LE, NOMATCH);
6249       __ ldr(ch2, Address(str2));
6250       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
6251       __ eor(ch2, first, ch2);
6252       __ sub(tmp2, ch2, tmp1);
6253       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6254       __ mov(tmp4, -1); // all bits set
6255       __ b(L_SMALL_PROCEED);
6256     __ align(OptoLoopAlignment);
6257     __ BIND(L_SMALL);
6258       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
6259       __ eor(ch2, first, ch2);
6260       if (str1_isL != str2_isL) {
6261         __ zip1(v1, __ T16B, v1, v0);
6262       }
6263       __ sub(tmp2, ch2, tmp1);
6264       __ mov(tmp4, -1); // all bits set
6265       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6266       if (str1_isL != str2_isL) {
6267         __ fmovd(ch1, v1); // move converted 4 symbols
6268       }
6269     __ BIND(L_SMALL_PROCEED);
6270       __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes at the unused bit positions beyond the string end
6271       __ bic(tmp2, tmp2, ch2);
6272       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
6273       __ rbit(tmp2, tmp2);
6274       __ br(__ EQ, NOMATCH);
6275     __ BIND(L_SMALL_HAS_ZERO_LOOP);
6276       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
6277       __ cmp(cnt1, u1(wordSize/str2_chr_size));
6278       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
6279       if (str2_isL) { // LL
6280         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
6281         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
6282         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
6283         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6284         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6285       } else {
6286         __ mov(ch2, 0xE); // mask to round the byte index down to a 2-byte character boundary
6287         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6288         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6289         __ lslv(tmp2, tmp2, tmp4);
6290         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6291         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6292         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6293         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6294       }
6295       __ cmp(ch1, ch2);
6296       __ mov(tmp4, wordSize/str2_chr_size);
6297       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6298     __ BIND(L_SMALL_CMP_LOOP);
6299       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6300                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6301       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6302                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6303       __ add(tmp4, tmp4, 1);
6304       __ cmp(tmp4, cnt1);
6305       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
6306       __ cmp(first, ch2);
6307       __ br(__ EQ, L_SMALL_CMP_LOOP);
6308     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
6309       __ cbz(tmp2, NOMATCH); // no more matches. exit
6310       __ clz(tmp4, tmp2);
6311       __ add(result, result, 1); // advance index
6312       __ add(str2, str2, str2_chr_size); // advance pointer
6313       __ b(L_SMALL_HAS_ZERO_LOOP);
6314     __ align(OptoLoopAlignment);
6315     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
6316       __ cmp(first, ch2);
6317       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6318       __ b(DONE);
6319     __ align(OptoLoopAlignment);
6320     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
6321       if (str2_isL) { // LL
6322         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
6323         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
6324         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
6325         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6326         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6327       } else {
6328         __ mov(ch2, 0xE); // mask to round the byte index down to a 2-byte character boundary
6329         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6330         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6331         __ lslv(tmp2, tmp2, tmp4);
6332         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6333         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6334         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6335         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6336       }
6337       __ cmp(ch1, ch2);
6338       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6339       __ b(DONE);
6340     __ align(OptoLoopAlignment);
6341     __ BIND(L_HAS_ZERO);
6342       __ rbit(tmp2, tmp2);
6343       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
6344       // Now compress the counters (cnt2 and cnt1) into one register. This is fine
6345       // because both counters are 32-bit and are not changed in this loop; they are
6346       // simply restored on exit, so cnt1 can be re-used in this loop.
6347       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
6348       __ sub(result, result, 1);
6349     __ BIND(L_HAS_ZERO_LOOP);
6350       __ mov(cnt1, wordSize/str2_chr_size);
6351       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6352       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
6353       if (str2_isL) {
6354         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6355         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6356         __ lslv(tmp2, tmp2, tmp4);
6357         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6358         __ add(tmp4, tmp4, 1);
6359         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6360         __ lsl(tmp2, tmp2, 1);
6361         __ mov(tmp4, wordSize/str2_chr_size);
6362       } else {
6363         __ mov(ch2, 0xE);
6364         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6365         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6366         __ lslv(tmp2, tmp2, tmp4);
6367         __ add(tmp4, tmp4, 1);
6368         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6369         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6370         __ lsl(tmp2, tmp2, 1);
6371         __ mov(tmp4, wordSize/str2_chr_size);
6372         __ sub(str2, str2, str2_chr_size);
6373       }
6374       __ cmp(ch1, ch2);
6375       __ mov(tmp4, wordSize/str2_chr_size);
6376       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6377     __ BIND(L_CMP_LOOP);
6378       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6379                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6380       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6381                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6382       __ add(tmp4, tmp4, 1);
6383       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6384       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
6385       __ cmp(cnt1, ch2);
6386       __ br(__ EQ, L_CMP_LOOP);
6387     __ BIND(L_CMP_LOOP_NOMATCH);
6388       // here we're not matched
6389       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6390       __ clz(tmp4, tmp2);
6391       __ add(str2, str2, str2_chr_size); // advance pointer
6392       __ b(L_HAS_ZERO_LOOP);
6393     __ align(OptoLoopAlignment);
6394     __ BIND(L_CMP_LOOP_LAST_CMP);
6395       __ cmp(cnt1, ch2);
6396       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6397       __ b(DONE);
6398     __ align(OptoLoopAlignment);
6399     __ BIND(L_CMP_LOOP_LAST_CMP2);
6400       if (str2_isL) {
6401         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6402         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6403         __ lslv(tmp2, tmp2, tmp4);
6404         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6405         __ add(tmp4, tmp4, 1);
6406         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6407         __ lsl(tmp2, tmp2, 1);
6408       } else {
6409         __ mov(ch2, 0xE);
6410         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6411         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6412         __ lslv(tmp2, tmp2, tmp4);
6413         __ add(tmp4, tmp4, 1);
6414         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6415         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6416         __ lsl(tmp2, tmp2, 1);
6417         __ sub(str2, str2, str2_chr_size);
6418       }
6419       __ cmp(ch1, ch2);
6420       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6421       __ b(DONE);
6422     __ align(OptoLoopAlignment);
6423     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
6424       // 1) Restore the "result" index. Until the L_HAS_ZERO block the index was a
6425       // multiple of wordSize/str2_chr_size. The byte octet was analyzed in
6426       // L_HAS_ZERO_LOOP, so result was increased by at most wordSize/str2_chr_size - 1
6427       // and the respective high bits were not changed. L_LOOP_PROCEED will increase
6428       // result by the number of analyzed characters, so we can just reset the lower
6429       // bits of result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
6430       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
6431       // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the index of
6432       // the last analyzed substring inside the current octet, so str2 is at the
6433       // respective start address and needs to be advanced to the next octet.
6434       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6435       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6436       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6437       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
6438       __ movw(cnt2, cnt2);
6439       __ b(L_LOOP_PROCEED);
6440     __ align(OptoLoopAlignment);
6441     __ BIND(NOMATCH);
6442       __ mov(result, -1);
6443     __ BIND(DONE);
6444       __ pop(spilled_regs, sp);
6445       __ ret(lr);
6446     return entry;
6447   }
6448 
6449   void generate_string_indexof_stubs() {
6450     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6451     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6452     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6453   }
6454 
6455   void inflate_and_store_2_fp_registers(bool generatePrfm,
6456       FloatRegister src1, FloatRegister src2) {
6457     Register dst = r1;
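         // zip1/zip2 interleave the source bytes with the zero bytes in v0 (which the
         // caller provides zeroed, see the register comment below), turning each
         // Latin1 byte into a little-endian UTF-16 code unit; the four resulting
         // vectors are written with a single st1.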
6458     __ zip1(v1, __ T16B, src1, v0);
6459     __ zip2(v2, __ T16B, src1, v0);
6460     if (generatePrfm) {
6461       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6462     }
6463     __ zip1(v3, __ T16B, src2, v0);
6464     __ zip2(v4, __ T16B, src2, v0);
6465     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6466   }
6467 
6468   // R0 = src
6469   // R1 = dst
6470   // R2 = len
6471   // R3 = len >> 3
6472   // V0 = 0
6473   // v1 = loaded 8 bytes
6474   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6475   address generate_large_byte_array_inflate() {
6476     __ align(CodeEntryAlignment);
6477     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
6478     address entry = __ pc();
6479     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6480     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6481     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6482 
6483     // Do one more 8-byte read so that the address is 16-byte aligned in most cases,
6484     // and so that a single store instruction can cover both chunks.
6485     __ ldrd(v2, __ post(src, 8));
6486     __ sub(octetCounter, octetCounter, 2);
6487     __ zip1(v1, __ T16B, v1, v0);
6488     __ zip1(v2, __ T16B, v2, v0);
6489     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6490     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6491     __ subs(rscratch1, octetCounter, large_loop_threshold);
6492     __ br(__ LE, LOOP_START);
6493     __ b(LOOP_PRFM_START);
6494     __ bind(LOOP_PRFM);
6495       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6496     __ bind(LOOP_PRFM_START);
6497       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6498       __ sub(octetCounter, octetCounter, 8);
6499       __ subs(rscratch1, octetCounter, large_loop_threshold);
6500       inflate_and_store_2_fp_registers(true, v3, v4);
6501       inflate_and_store_2_fp_registers(true, v5, v6);
6502       __ br(__ GT, LOOP_PRFM);
6503       __ cmp(octetCounter, (u1)8);
6504       __ br(__ LT, DONE);
6505     __ bind(LOOP);
6506       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6507       __ bind(LOOP_START);
6508       __ sub(octetCounter, octetCounter, 8);
6509       __ cmp(octetCounter, (u1)8);
6510       inflate_and_store_2_fp_registers(false, v3, v4);
6511       inflate_and_store_2_fp_registers(false, v5, v6);
6512       __ br(__ GE, LOOP);
6513     __ bind(DONE);
6514       __ ret(lr);
6515     return entry;
6516   }
6517 
6518   /**
6519    *  Arguments:
6520    *
6521    *  Input:
6522    *  c_rarg0   - current state address
6523    *  c_rarg1   - H key address
6524    *  c_rarg2   - data address
6525    *  c_rarg3   - number of blocks
6526    *
6527    *  Output:
6528    *  Updated state at c_rarg0
6529    */
6530   address generate_ghash_processBlocks() {
6531     // Bafflingly, GCM uses little-endian for the byte order, but
6532     // big-endian for the bit order.  For example, the polynomial 1 is
6533     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6534     //
6535     // So, we must either reverse the bytes in each word and do
6536     // everything big-endian or reverse the bits in each byte and do
6537     // it little-endian.  On AArch64 it's more idiomatic to reverse
6538     // the bits in each byte (we have an instruction, RBIT, to do
6539     // that) and keep the data in little-endian bit order through the
6540     // calculation, bit-reversing the inputs and outputs.
6541 
6542     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6543     __ align(wordSize * 2);
6544     address p = __ pc();
6545     __ emit_int64(0x87);  // The low-order bits of the field
6546                           // polynomial (i.e. p = z^7+z^2+z+1)
6547                           // repeated in the low and high parts of a
6548                           // 128-bit vector
6549     __ emit_int64(0x87);
6550 
6551     __ align(CodeEntryAlignment);
6552     address start = __ pc();
6553 
6554     Register state   = c_rarg0;
6555     Register subkeyH = c_rarg1;
6556     Register data    = c_rarg2;
6557     Register blocks  = c_rarg3;
6558 
6559     FloatRegister vzr = v30;
6560     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6561 
6562     __ ldrq(v24, p);    // The field polynomial
6563 
6564     __ ldrq(v0, Address(state));
6565     __ ldrq(v1, Address(subkeyH));
6566 
6567     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6568     __ rbit(v0, __ T16B, v0);
6569     __ rev64(v1, __ T16B, v1);
6570     __ rbit(v1, __ T16B, v1);
6571 
6572     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6573     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
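         // Karatsuba multiplication in GF(2^128): with A = A1:A0 and B = B1:B0,
         //   A * B = A1*B1 * x^128 + (A1*B0 + A0*B1) * x^64 + A0*B0,
         // where the middle term is computed as
         //   (A1 + A0)*(B1 + B0) + A1*B1 + A0*B0   (addition is XOR here),
         // which is why A1 + A0 is precomputed above.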
6574 
6575     {
6576       Label L_ghash_loop;
6577       __ bind(L_ghash_loop);
6578 
6579       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6580                                                  // reversing each byte
6581       __ rbit(v2, __ T16B, v2);
6582       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6583 
6584       // Multiply state in v2 by subkey in v1
6585       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6586                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6587                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6588       // Reduce v7:v5 by the field polynomial
6589       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6590 
6591       __ sub(blocks, blocks, 1);
6592       __ cbnz(blocks, L_ghash_loop);
6593     }
6594 
6595     // The bit-reversed result is at this point in v0
6596     __ rev64(v0, __ T16B, v0);
6597     __ rbit(v0, __ T16B, v0);
6598 
6599     __ st1(v0, __ T16B, state);
6600     __ ret(lr);
6601 
6602     return start;
6603   }
6604 
6605   address generate_ghash_processBlocks_wide() {
6606     address small = generate_ghash_processBlocks();
6607 
6608     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6609     __ align(wordSize * 2);
6610     address p = __ pc();
6611     __ emit_int64(0x87);  // The low-order bits of the field
6612                           // polynomial (i.e. p = z^7+z^2+z+1)
6613                           // repeated in the low and high parts of a
6614                           // 128-bit vector
6615     __ emit_int64(0x87);
6616 
6617     __ align(CodeEntryAlignment);
6618     address start = __ pc();
6619 
6620     Register state   = c_rarg0;
6621     Register subkeyH = c_rarg1;
6622     Register data    = c_rarg2;
6623     Register blocks  = c_rarg3;
6624 
6625     const int unroll = 4;
6626 
6627     __ cmp(blocks, (unsigned char)(unroll * 2));
6628     __ br(__ LT, small);
6629 
6630     if (unroll > 1) {
6631       // Save the callee-saved SIMD registers (v8..v15) before entering the routine
6632       __ sub(sp, sp, 4 * 16);
6633       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6634       __ sub(sp, sp, 4 * 16);
6635       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6636     }
6637 
6638     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6639 
6640     if (unroll > 1) {
6641       // And restore state
6642       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6643       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6644     }
6645 
6646     __ cmp(blocks, (unsigned char)0);
6647     __ br(__ GT, small);
6648 
6649     __ ret(lr);
6650 
6651     return start;
6652   }
6653 
6654   void generate_base64_encode_simdround(Register src, Register dst,
6655         FloatRegister codec, u8 size) {
6656 
6657     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6658     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6659     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6660 
6661     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6662 
6663     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6664 
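         // Bit layout: the three input bytes [aaaaaaaa][bbbbbbbb][cccccccc] are split
         // by the shift/or sequence below into the four 6-bit codec indices
         //   ind0 = aaaaaa, ind1 = aabbbb, ind2 = bbbbcc, ind3 = cccccc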
6665     __ ushr(ind0, arrangement, in0,  2);
6666 
6667     __ ushr(ind1, arrangement, in1,  2);
6668     __ shl(in0,   arrangement, in0,  6);
6669     __ orr(ind1,  arrangement, ind1, in0);
6670     __ ushr(ind1, arrangement, ind1, 2);
6671 
6672     __ ushr(ind2, arrangement, in2,  4);
6673     __ shl(in1,   arrangement, in1,  4);
6674     __ orr(ind2,  arrangement, in1,  ind2);
6675     __ ushr(ind2, arrangement, ind2, 2);
6676 
6677     __ shl(ind3,  arrangement, in2,  2);
6678     __ ushr(ind3, arrangement, ind3, 2);
6679 
6680     __ tbl(out0,  arrangement, codec,  4, ind0);
6681     __ tbl(out1,  arrangement, codec,  4, ind1);
6682     __ tbl(out2,  arrangement, codec,  4, ind2);
6683     __ tbl(out3,  arrangement, codec,  4, ind3);
6684 
6685     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6686   }
6687 
6688    /**
6689    *  Arguments:
6690    *
6691    *  Input:
6692    *  c_rarg0   - src_start
6693    *  c_rarg1   - src_offset
6694    *  c_rarg2   - src_length
6695    *  c_rarg3   - dest_start
6696    *  c_rarg4   - dest_offset
6697    *  c_rarg5   - isURL
6698    *
6699    */
6700   address generate_base64_encodeBlock() {
6701 
6702     static const char toBase64[64] = {
6703       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6704       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6705       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6706       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6707       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6708     };
6709 
6710     static const char toBase64URL[64] = {
6711       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6712       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6713       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6714       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6715       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6716     };
6717 
6718     __ align(CodeEntryAlignment);
6719     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6720     address start = __ pc();
6721 
6722     Register src   = c_rarg0;  // source array
6723     Register soff  = c_rarg1;  // source start offset
6724     Register send  = c_rarg2;  // source end offset
6725     Register dst   = c_rarg3;  // dest array
6726     Register doff  = c_rarg4;  // position for writing to dest array
6727     Register isURL = c_rarg5;  // Base64 or URL character set
6728 
6729     // c_rarg6 and c_rarg7 are free to use as temps
6730     Register codec  = c_rarg6;
6731     Register length = c_rarg7;
6732 
6733     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6734 
6735     __ add(src, src, soff);
6736     __ add(dst, dst, doff);
6737     __ sub(length, send, soff);
6738 
6739     // load the codec base address
6740     __ lea(codec, ExternalAddress((address) toBase64));
6741     __ cbz(isURL, ProcessData);
6742     __ lea(codec, ExternalAddress((address) toBase64URL));
6743 
6744     __ BIND(ProcessData);
6745 
6746     // too short to form a SIMD loop; process 3 bytes at a time instead
6747     __ cmp(length, (u1)24);
6748     __ br(Assembler::LT, Process3B);
6749 
6750     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6751 
6752     __ BIND(Process48B);
6753     __ cmp(length, (u1)48);
6754     __ br(Assembler::LT, Process24B);
6755     generate_base64_encode_simdround(src, dst, v0, 16);
6756     __ sub(length, length, 48);
6757     __ b(Process48B);
6758 
6759     __ BIND(Process24B);
6760     __ cmp(length, (u1)24);
6761     __ br(Assembler::LT, SIMDExit);
6762     generate_base64_encode_simdround(src, dst, v0, 8);
6763     __ sub(length, length, 24);
6764 
6765     __ BIND(SIMDExit);
6766     __ cbz(length, Exit);
6767 
6768     __ BIND(Process3B);
6769     //  3 src bytes, 24 bits
6770     __ ldrb(r10, __ post(src, 1));
6771     __ ldrb(r11, __ post(src, 1));
6772     __ ldrb(r12, __ post(src, 1));
6773     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6774     __ orrw(r12, r12, r11, Assembler::LSL, 8);
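         // r12 now holds the 24-bit group formed from the three source bytes,
         // (first << 16) | (second << 8) | third; the ubfm/and sequence below extracts
         // its four 6-bit fields (bits 23..18, 17..12, 11..6 and 5..0) as codec indices.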
6775     // codec index
6776     __ ubfmw(r15, r12, 18, 23);
6777     __ ubfmw(r14, r12, 12, 17);
6778     __ ubfmw(r13, r12, 6,  11);
6779     __ andw(r12,  r12, 63);
6780     // get the code based on the codec
6781     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6782     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6783     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6784     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6785     __ strb(r15, __ post(dst, 1));
6786     __ strb(r14, __ post(dst, 1));
6787     __ strb(r13, __ post(dst, 1));
6788     __ strb(r12, __ post(dst, 1));
6789     __ sub(length, length, 3);
6790     __ cbnz(length, Process3B);
6791 
6792     __ BIND(Exit);
6793     __ ret(lr);
6794 
6795     return start;
6796   }
6797 
6798   void generate_base64_decode_simdround(Register src, Register dst,
6799         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6800 
6801     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6802     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6803 
6804     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6805     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6806 
6807     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6808 
6809     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6810 
6811     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6812 
6813     // we need an unsigned saturating subtract to make sure that all input values
6814     // in the range [0, 63] produce a 0U index for the higher-half lookup
6815     __ uqsubv(decH0, __ T16B, in0, v27);
6816     __ uqsubv(decH1, __ T16B, in1, v27);
6817     __ uqsubv(decH2, __ T16B, in2, v27);
6818     __ uqsubv(decH3, __ T16B, in3, v27);
6819 
6820     // lower half lookup
6821     __ tbl(decL0, arrangement, codecL, 4, in0);
6822     __ tbl(decL1, arrangement, codecL, 4, in1);
6823     __ tbl(decL2, arrangement, codecL, 4, in2);
6824     __ tbl(decL3, arrangement, codecL, 4, in3);
6825 
6826     // higher half lookup
6827     __ tbx(decH0, arrangement, codecH, 4, decH0);
6828     __ tbx(decH1, arrangement, codecH, 4, decH1);
6829     __ tbx(decH2, arrangement, codecH, 4, decH2);
6830     __ tbx(decH3, arrangement, codecH, 4, decH3);
6831 
6832     // combine lower and higher
6833     __ orr(decL0, arrangement, decL0, decH0);
6834     __ orr(decL1, arrangement, decL1, decH1);
6835     __ orr(decL2, arrangement, decL2, decH2);
6836     __ orr(decL3, arrangement, decL3, decH3);
6837 
6838     // check for illegal inputs: values larger than 63 (the maximum for 6 bits)
6839     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
6840     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
6841     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
6842     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
6843     __ orr(in0, arrangement, decH0, decH1);
6844     __ orr(in1, arrangement, decH2, decH3);
6845     __ orr(in2, arrangement, in0,   in1);
6846     __ umaxv(in3, arrangement, in2);
6847     __ umov(rscratch2, in3, __ B, 0);
6848 
6849     // get the data to output
6850     __ shl(out0,  arrangement, decL0, 2);
6851     __ ushr(out1, arrangement, decL1, 4);
6852     __ orr(out0,  arrangement, out0,  out1);
6853     __ shl(out1,  arrangement, decL1, 4);
6854     __ ushr(out2, arrangement, decL2, 2);
6855     __ orr(out1,  arrangement, out1,  out2);
6856     __ shl(out2,  arrangement, decL2, 6);
6857     __ orr(out2,  arrangement, out2,  decL3);
6858 
6859     __ cbz(rscratch2, NoIllegalData);
6860 
6861     // handle illegal input
6862     __ umov(r10, in2, __ D, 0);
6863     if (size == 16) {
6864       __ cbnz(r10, ErrorInLowerHalf);
6865 
6866       // illegal input is in higher half, store the lower half now.
6867       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6868 
6869       __ umov(r10, in2,  __ D, 1);
6870       __ umov(r11, out0, __ D, 1);
6871       __ umov(r12, out1, __ D, 1);
6872       __ umov(r13, out2, __ D, 1);
6873       __ b(StoreLegalData);
6874 
6875       __ BIND(ErrorInLowerHalf);
6876     }
6877     __ umov(r11, out0, __ D, 0);
6878     __ umov(r12, out1, __ D, 0);
6879     __ umov(r13, out2, __ D, 0);
6880 
6881     __ BIND(StoreLegalData);
6882     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6883     __ strb(r11, __ post(dst, 1));
6884     __ strb(r12, __ post(dst, 1));
6885     __ strb(r13, __ post(dst, 1));
6886     __ lsr(r10, r10, 8);
6887     __ lsr(r11, r11, 8);
6888     __ lsr(r12, r12, 8);
6889     __ lsr(r13, r13, 8);
6890     __ b(StoreLegalData);
6891 
6892     __ BIND(NoIllegalData);
6893     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6894   }
6895 
6896 
6897    /**
6898    *  Arguments:
6899    *
6900    *  Input:
6901    *  c_rarg0   - src_start
6902    *  c_rarg1   - src_offset
6903    *  c_rarg2   - src_length
6904    *  c_rarg3   - dest_start
6905    *  c_rarg4   - dest_offset
6906    *  c_rarg5   - isURL
6907    *  c_rarg6   - isMIME
6908    *
6909    */
6910   address generate_base64_decodeBlock() {
6911 
6912     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6913     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6914     // titled "Base64 decoding".
6915 
6916     // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
6917     // java.util.Base64, except that the trailing character '=' is also treated as an illegal
6918     // value in this intrinsic: java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
6919     static const uint8_t fromBase64ForNoSIMD[256] = {
6920       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6921       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6922       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6923        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6924       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6925        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6926       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6927        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6928       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6929       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6930       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6931       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6932       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6933       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6934       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6935       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6936     };
6937 
6938     static const uint8_t fromBase64URLForNoSIMD[256] = {
6939       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6940       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6941       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6942        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6943       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6944        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6945       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6946        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6947       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6948       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6949       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6950       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6951       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6952       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6953       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6954       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6955     };
6956 
6957     // A legal Base64 code value is in the range [0, 127].  We need two table
6958     // lookups with tbl/tbx and combine the results to get the decoded data. The
6959     // 1st table vector lookup uses tbl: out-of-range indices are set to 0 in the
6960     // destination. The 2nd table vector lookup uses tbx: out-of-range indices
6961     // leave the destination unchanged. Input [64..126] is mapped to index [65, 127]
6962     // in the second lookup. The value at index 64 is set to 0, so that we know we
6963     // already got the decoded data from the 1st lookup.
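         //
         // For reference, a scalar model of the per-lane tbl/tbx semantics relied
         // on here (illustration only; the index adjustment for the second lookup
         // is done with vector arithmetic in generate_base64_decode_simdround and
         // is not shown):
         //
         //   uint8_t tbl(const uint8_t *table, size_t n, uint8_t idx) {
         //     return idx < n ? table[idx] : 0;     // out-of-range index reads as 0
         //   }
         //   uint8_t tbx(const uint8_t *table, size_t n, uint8_t idx, uint8_t dest) {
         //     return idx < n ? table[idx] : dest;  // out-of-range index keeps dest
         //   }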
6964     static const uint8_t fromBase64ForSIMD[128] = {
6965       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6966       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6967       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6968        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6969         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6970        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6971       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6972        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6973     };
6974 
6975     static const uint8_t fromBase64URLForSIMD[128] = {
6976       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6977       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6978       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6979        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6980         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6981        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6982        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6983        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6984     };
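
         // For reference, the non-SIMD Process4B loop below decodes four Base64
         // symbols into three bytes using the 256-entry tables above. A minimal C
         // sketch of one iteration (names are illustrative only, not part of this
         // stub):
         //
         //   // returns false if any of the four symbols is illegal (table value 255u)
         //   bool decode4(const uint8_t table[256], const uint8_t in[4], uint8_t out[3]) {
         //     uint8_t a = table[in[0]], b = table[in[1]], c = table[in[2]], d = table[in[3]];
         //     if ((a | b | c | d) & 0x80) return false;
         //     out[0] = (uint8_t)((a << 2) | (b >> 4));
         //     out[1] = (uint8_t)((b << 4) | (c >> 2));
         //     out[2] = (uint8_t)((c << 6) | d);
         //     return true;
         //   }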
6985 
6986     __ align(CodeEntryAlignment);
6987     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6988     address start = __ pc();
6989 
6990     Register src    = c_rarg0;  // source array
6991     Register soff   = c_rarg1;  // source start offset
6992     Register send   = c_rarg2;  // source end offset
6993     Register dst    = c_rarg3;  // dest array
6994     Register doff   = c_rarg4;  // position for writing to dest array
6995     Register isURL  = c_rarg5;  // Base64 or URL character set
6996     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6997 
6998     Register length = send;    // reuse send as length of source data to process
6999 
7000     Register simd_codec   = c_rarg6;
7001     Register nosimd_codec = c_rarg7;
7002 
7003     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
7004 
7005     __ enter();
7006 
7007     __ add(src, src, soff);
7008     __ add(dst, dst, doff);
7009 
7010     __ mov(doff, dst);
7011 
7012     __ sub(length, send, soff);
7013     __ bfm(length, zr, 0, 1); // clear the two low bits: length is rounded down to a multiple of 4
7014 
7015     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
7016     __ cbz(isURL, ProcessData);
7017     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
7018 
7019     __ BIND(ProcessData);
7020     __ mov(rscratch1, length);
7021     __ cmp(length, (u1)144); // 144 = 80 + 64
7022     __ br(Assembler::LT, Process4B);
7023 
7024     // In the MIME case, the line length cannot be more than 76
7025     // bytes (see RFC 2045). This is too short a block for SIMD
7026     // to be worthwhile, so we use non-SIMD here.
7027     __ movw(rscratch1, 79); // pre-process 80 bytes without SIMD (79 counts down to -1 in steps of 4)
7028 
7029     __ BIND(Process4B);
7030     __ ldrw(r14, __ post(src, 4));
7031     __ ubfxw(r10, r14, 0,  8);
7032     __ ubfxw(r11, r14, 8,  8);
7033     __ ubfxw(r12, r14, 16, 8);
7034     __ ubfxw(r13, r14, 24, 8);
7035     // look up the decoded values
7036     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
7037     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
7038     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
7039     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
7040     // error detection, 255u indicates an illegal input
7041     __ orrw(r14, r10, r11);
7042     __ orrw(r15, r12, r13);
7043     __ orrw(r14, r14, r15);
7044     __ tbnz(r14, 7, Exit);
7045     // recover the data
7046     __ lslw(r14, r10, 10);
7047     __ bfiw(r14, r11, 4, 6);
7048     __ bfmw(r14, r12, 2, 5);
7049     __ rev16w(r14, r14);
7050     __ bfiw(r13, r12, 6, 2);
7051     __ strh(r14, __ post(dst, 2));
7052     __ strb(r13, __ post(dst, 1));
7053     // non-simd loop
7054     __ subsw(rscratch1, rscratch1, 4);
7055     __ br(Assembler::GT, Process4B);
7056 
7057     // if we got here from the 80-byte pre-processing above (rscratch1 was set
7058     // to 79), rscratch1 == -1; otherwise, rscratch1 == 0.
7059     __ cbzw(rscratch1, Exit);
7060     __ sub(length, length, 80);
7061 
7062     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
7063     __ cbz(isURL, SIMDEnter);
7064     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
7065 
7066     __ BIND(SIMDEnter);
7067     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
7068     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
7069     __ mov(rscratch1, 63);
7070     __ dup(v27, __ T16B, rscratch1);
7071 
7072     __ BIND(Process64B);
7073     __ cmp(length, (u1)64);
7074     __ br(Assembler::LT, Process32B);
7075     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
7076     __ sub(length, length, 64);
7077     __ b(Process64B);
7078 
7079     __ BIND(Process32B);
7080     __ cmp(length, (u1)32);
7081     __ br(Assembler::LT, SIMDExit);
7082     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
7083     __ sub(length, length, 32);
7084     __ b(Process32B);
7085 
7086     __ BIND(SIMDExit);
7087     __ cbz(length, Exit);
7088     __ movw(rscratch1, length);
7089     __ b(Process4B);
7090 
7091     __ BIND(Exit);
7092     __ sub(c_rarg0, dst, doff);
7093 
7094     __ leave();
7095     __ ret(lr);
7096 
7097     return start;
7098   }
7099 
7100   // Support for spin waits.
7101   address generate_spin_wait() {
7102     __ align(CodeEntryAlignment);
7103     StubCodeMark mark(this, "StubRoutines", "spin_wait");
7104     address start = __ pc();
7105 
7106     __ spin_wait();
7107     __ ret(lr);
7108 
7109     return start;
7110   }
7111 
7112   address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
7113     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");
7114 
7115     address start = __ pc();
7116     const Register
7117       r_super_klass  = r0,
7118       r_array_base   = r1,
7119       r_array_length = r2,
7120       r_array_index  = r3,
7121       r_sub_klass    = r4,
7122       r_bitmap       = rscratch2,
7123       result         = r5;
7124     const FloatRegister
7125       vtemp          = v0;
7126 
7127     Label L_success;
7128     __ enter();
7129     __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
7130                                            r_array_base, r_array_length, r_array_index,
7131                                            vtemp, result, super_klass_index,
7132                                            /*stub_is_near*/true);
7133     __ leave();
7134     __ ret(lr);
7135 
7136     return start;
7137   }
7138 
7139   // Slow path implementation for UseSecondarySupersTable.
7140   address generate_lookup_secondary_supers_table_slow_path_stub() {
7141     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path");
7142 
7143     address start = __ pc();
7144     const Register
7145       r_super_klass  = r0,        // argument
7146       r_array_base   = r1,        // argument
7147       temp1          = r2,        // temp
7148       r_array_index  = r3,        // argument
7149       r_bitmap       = rscratch2, // argument
7150       result         = r5;        // argument
7151 
7152     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
7153     __ ret(lr);
7154 
7155     return start;
7156   }
7157 
7158 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
7159 
7160   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
7161   //
7162   // If LSE is in use, generate LSE versions of all the stubs. The
7163   // non-LSE versions are in atomic_aarch64.S.
7164 
7165   // class AtomicStubMark records the entry point of a stub and the
7166   // stub pointer which will point to it. The stub pointer is set to
7167   // the entry point when ~AtomicStubMark() is called, which must be
7168   // after ICache::invalidate_range. This ensures safe publication of
7169   // the generated code.
7170   class AtomicStubMark {
7171     address _entry_point;
7172     aarch64_atomic_stub_t *_stub;
7173     MacroAssembler *_masm;
7174   public:
7175     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
7176       _masm = masm;
7177       __ align(32);
7178       _entry_point = __ pc();
7179       _stub = stub;
7180     }
7181     ~AtomicStubMark() {
7182       *_stub = (aarch64_atomic_stub_t)_entry_point;
7183     }
7184   };
7185 
7186   // NB: For memory_order_conservative we need a trailing membar after
7187   // LSE atomic operations but not a leading membar.
7188   //
7189   // We don't need a leading membar because a clause in the Arm ARM
7190   // says:
7191   //
7192   //   Barrier-ordered-before
7193   //
7194   //   Barrier instructions order prior Memory effects before subsequent
7195   //   Memory effects generated by the same Observer. A read or a write
7196   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
7197   //   Observer if and only if RW1 appears in program order before RW2
7198   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
7199   //   instruction with both Acquire and Release semantics.
7200   //
7201   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
7202   // and Release semantics, therefore we don't need a leading
7203   // barrier. However, there is no corresponding Barrier-ordered-after
7204   // relationship, therefore we need a trailing membar to prevent a
7205   // later store or load from being reordered with the store in an
7206   // atomic instruction.
7207   //
7208   // This was checked by using the herd7 consistency model simulator
7209   // (http://diy.inria.fr/) with this test case:
7210   //
7211   // AArch64 LseCas
7212   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
7213   // P0 | P1;
7214   // LDR W4, [X2] | MOV W3, #0;
7215   // DMB LD       | MOV W4, #1;
7216   // LDR W3, [X1] | CASAL W3, W4, [X1];
7217   //              | DMB ISH;
7218   //              | STR W4, [X2];
7219   // exists
7220   // (0:X3=0 /\ 0:X4=1)
7221   //
7222   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
7223   // with the store to x in P1. Without the DMB in P1 this may happen.
7224   //
7225   // At the time of writing we don't know of any AArch64 hardware that
7226   // reorders stores in this way, but the Reference Manual permits it.
7227 
7228   void gen_cas_entry(Assembler::operand_size size,
7229                      atomic_memory_order order) {
7230     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
7231       exchange_val = c_rarg2;
7232     bool acquire, release;
7233     switch (order) {
7234       case memory_order_relaxed:
7235         acquire = false;
7236         release = false;
7237         break;
7238       case memory_order_release:
7239         acquire = false;
7240         release = true;
7241         break;
7242       default:
7243         acquire = true;
7244         release = true;
7245         break;
7246     }
7247     __ mov(prev, compare_val);
7248     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
7249     if (order == memory_order_conservative) {
7250       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7251     }
7252     if (size == Assembler::xword) {
7253       __ mov(r0, prev);
7254     } else {
7255       __ movw(r0, prev);
7256     }
7257     __ ret(lr);
7258   }
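
       // In C, approximately (conservative ordering; illustration only -- the stub
       // emits a single casal followed by a trailing dmb, and the GCC builtins are
       // used here just to suggest the semantics):
       //
       //   uint64_t cas(uint64_t *ptr, uint64_t compare_val, uint64_t exchange_val) {
       //     uint64_t prev = compare_val;
       //     __atomic_compare_exchange_n(ptr, &prev, exchange_val, /*weak*/false,
       //                                 __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); // casal
       //     __atomic_thread_fence(__ATOMIC_SEQ_CST);  // trailing full barrier
       //     return prev;                              // old value at *ptr
       //   }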
7259 
7260   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
7261     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
7262     // If not relaxed, then default to conservative.  Relaxed is the only
7263     // case we use enough to be worth specializing.
7264     if (order == memory_order_relaxed) {
7265       __ ldadd(size, incr, prev, addr);
7266     } else {
7267       __ ldaddal(size, incr, prev, addr);
7268       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7269     }
7270     if (size == Assembler::xword) {
7271       __ mov(r0, prev);
7272     } else {
7273       __ movw(r0, prev);
7274     }
7275     __ ret(lr);
7276   }
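
       // In C, approximately (illustration only; the stub emits ldadd or ldaddal
       // plus a trailing dmb for the non-relaxed case):
       //
       //   uint64_t fetch_add(uint64_t *ptr, uint64_t incr, bool relaxed) {
       //     uint64_t prev;
       //     if (relaxed) {
       //       prev = __atomic_fetch_add(ptr, incr, __ATOMIC_RELAXED);  // ldadd
       //     } else {
       //       prev = __atomic_fetch_add(ptr, incr, __ATOMIC_SEQ_CST);  // ldaddal
       //       __atomic_thread_fence(__ATOMIC_SEQ_CST);                 // trailing barrier
       //     }
       //     return prev;  // old value, returned in r0
       //   }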
7277 
7278   void gen_swpal_entry(Assembler::operand_size size) {
7279     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
7280     __ swpal(size, incr, prev, addr);
7281     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7282     if (size == Assembler::xword) {
7283       __ mov(r0, prev);
7284     } else {
7285       __ movw(r0, prev);
7286     }
7287     __ ret(lr);
7288   }
7289 
7290   void generate_atomic_entry_points() {
7291     if (! UseLSE) {
7292       return;
7293     }
7294 
7295     __ align(CodeEntryAlignment);
7296     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
7297     address first_entry = __ pc();
7298 
7299     // ADD, memory_order_conservative
7300     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
7301     gen_ldadd_entry(Assembler::word, memory_order_conservative);
7302     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
7303     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
7304 
7305     // ADD, memory_order_relaxed
7306     AtomicStubMark mark_fetch_add_4_relaxed
7307       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
7308     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
7309     AtomicStubMark mark_fetch_add_8_relaxed
7310       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
7311     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
7312 
7313     // XCHG, memory_order_conservative
7314     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
7315     gen_swpal_entry(Assembler::word);
7316     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
7317     gen_swpal_entry(Assembler::xword);
7318 
7319     // CAS, memory_order_conservative
7320     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
7321     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
7322     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
7323     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
7324     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
7325     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
7326 
7327     // CAS, memory_order_relaxed
7328     AtomicStubMark mark_cmpxchg_1_relaxed
7329       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
7330     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
7331     AtomicStubMark mark_cmpxchg_4_relaxed
7332       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
7333     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
7334     AtomicStubMark mark_cmpxchg_8_relaxed
7335       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
7336     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
7337 
7338     AtomicStubMark mark_cmpxchg_4_release
7339       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
7340     gen_cas_entry(MacroAssembler::word, memory_order_release);
7341     AtomicStubMark mark_cmpxchg_8_release
7342       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
7343     gen_cas_entry(MacroAssembler::xword, memory_order_release);
7344 
7345     AtomicStubMark mark_cmpxchg_4_seq_cst
7346       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
7347     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
7348     AtomicStubMark mark_cmpxchg_8_seq_cst
7349       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
7350     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
7351 
7352     ICache::invalidate_range(first_entry, __ pc() - first_entry);
7353   }
7354 #endif // LINUX
7355 
7356   address generate_cont_thaw(Continuation::thaw_kind kind) {
7357     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
7358     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
7359 
7360     address start = __ pc();
7361 
7362     if (return_barrier) {
7363       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7364       __ mov(sp, rscratch1);
7365     }
7366     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7367 
7368     if (return_barrier) {
7369       // preserve possible return value from a method returning to the return barrier
7370       __ fmovd(rscratch1, v0);
7371       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7372     }
7373 
7374     __ movw(c_rarg1, (return_barrier ? 1 : 0));
7375     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
7376     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
7377 
7378     if (return_barrier) {
7379       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7380       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7381       __ fmovd(v0, rscratch1);
7382     }
7383     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7384 
7385 
7386     Label thaw_success;
7387     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
7388     __ cbnz(rscratch2, thaw_success);
7389     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
7390     __ br(rscratch1);
7391     __ bind(thaw_success);
7392 
7393     // make room for the thawed frames
7394     __ sub(rscratch1, sp, rscratch2);
7395     __ andr(rscratch1, rscratch1, -16); // align
7396     __ mov(sp, rscratch1);
7397 
7398     if (return_barrier) {
7399       // save original return value -- again
7400       __ fmovd(rscratch1, v0);
7401       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7402     }
7403 
7404     // If we want, we can templatize thaw by kind, and have three different entries
7405     __ movw(c_rarg1, (uint32_t)kind);
7406 
7407     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
7408     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
7409 
7410     if (return_barrier) {
7411       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7412       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7413       __ fmovd(v0, rscratch1);
7414     } else {
7415       __ mov(r0, zr); // return 0 (success) from doYield
7416     }
7417 
7418     // we're now on the yield frame (which is at an address above us because sp has been pushed down)
7419     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
7420     __ mov(rfp, sp);
7421 
7422     if (return_barrier_exception) {
7423       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
7424       __ authenticate_return_address(c_rarg1);
7425       __ verify_oop(r0);
7426       // save return value containing the exception oop in callee-saved R19
7427       __ mov(r19, r0);
7428 
7429       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
7430 
7431       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
7432       // __ reinitialize_ptrue();
7433 
7434       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7435 
7436       __ mov(r1, r0); // the exception handler
7437       __ mov(r0, r19); // restore return value containing the exception oop
7438       __ verify_oop(r0);
7439 
7440       __ leave();
7441       __ mov(r3, lr);
7442       __ br(r1); // the exception handler
7443     } else {
7444       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7445       __ leave();
7446       __ ret(lr);
7447     }
7448 
7449     return start;
7450   }
7451 
7452   address generate_cont_thaw() {
7453     if (!Continuations::enabled()) return nullptr;
7454 
7455     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
7456     address start = __ pc();
7457     generate_cont_thaw(Continuation::thaw_top);
7458     return start;
7459   }
7460 
7461   address generate_cont_returnBarrier() {
7462     if (!Continuations::enabled()) return nullptr;
7463 
7464     // TODO: will probably need multiple return barriers depending on return type
7465     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
7466     address start = __ pc();
7467 
7468     generate_cont_thaw(Continuation::thaw_return_barrier);
7469 
7470     return start;
7471   }
7472 
7473   address generate_cont_returnBarrier_exception() {
7474     if (!Continuations::enabled()) return nullptr;
7475 
7476     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
7477     address start = __ pc();
7478 
7479     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7480 
7481     return start;
7482   }
7483 
7484   address generate_cont_preempt_stub() {
7485     if (!Continuations::enabled()) return nullptr;
7486     StubCodeMark mark(this, "StubRoutines","Continuation preempt stub");
7487     address start = __ pc();
7488 
7489     __ reset_last_Java_frame(true);
7490 
7491     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
7492     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
7493     __ mov(sp, rscratch2);
7494 
7495     Label preemption_cancelled;
7496     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
7497     __ cbnz(rscratch1, preemption_cancelled);
7498 
7499     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
7500     SharedRuntime::continuation_enter_cleanup(_masm);
7501     __ leave();
7502     __ ret(lr);
7503 
7504     // We acquired the monitor after freezing the frames, so call thaw to continue execution.
7505     __ bind(preemption_cancelled);
7506     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
7507     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
7508     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
7509     __ ldr(rscratch1, Address(rscratch1));
7510     __ br(rscratch1);
7511 
7512     return start;
7513   }
7514 
7515   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
7516   // are represented as long[5], with BITS_PER_LIMB = 26.
7517   // Pack five 26-bit limbs into three 64-bit registers.
7518   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
7519     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
7520     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
7521     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
7522     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
7523 
7524     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
7525     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
7526     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
7527     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
7528 
7529     if (dest2->is_valid()) {
7530       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7531     } else {
7532 #ifdef ASSERT
7533       Label OK;
7534       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7535       __ br(__ EQ, OK);
7536       __ stop("high bits of Poly1305 integer should be zero");
7537       __ should_not_reach_here();
7538       __ bind(OK);
7539 #endif
7540     }
7541   }
7542 
7543   // As above, but return only a 128-bit integer, packed into two
7544   // 64-bit registers.
7545   void pack_26(Register dest0, Register dest1, Register src) {
7546     pack_26(dest0, dest1, noreg, src);
7547   }
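
       // In C, approximately (assuming each limb fits in 26 bits, as the
       // BITS_PER_LIMB representation provides, so the adds above never carry and
       // behave like ors):
       //
       //   void pack_26(uint64_t *dest0, uint64_t *dest1, uint64_t *dest2,
       //                const uint64_t src[5]) {
       //     *dest0 = src[0] | (src[1] << 26) | (src[2] << 52);          // low 64 bits
       //     *dest1 = (src[2] >> 12) | (src[3] << 14) | (src[4] << 40);  // middle 64 bits
       //     if (dest2 != NULL) {
       //       *dest2 = src[4] >> 24;                                    // top 2 bits
       //     }
       //   }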
7548 
7549   // Multiply and multiply-accumulate unsigned 64-bit registers.
7550   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
7551     __ mul(prod_lo, n, m);
7552     __ umulh(prod_hi, n, m);
7553   }
7554   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
7555     wide_mul(rscratch1, rscratch2, n, m);
7556     __ adds(sum_lo, sum_lo, rscratch1);
7557     __ adc(sum_hi, sum_hi, rscratch2);
7558   }
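
       // In C, approximately (unsigned __int128 is a GCC/Clang extension, used here
       // only to describe the 64x64->128-bit arithmetic):
       //
       //   void wide_mul(uint64_t *prod_lo, uint64_t *prod_hi, uint64_t n, uint64_t m) {
       //     unsigned __int128 p = (unsigned __int128)n * m;
       //     *prod_lo = (uint64_t)p;          // mul
       //     *prod_hi = (uint64_t)(p >> 64);  // umulh
       //   }
       //   void wide_madd(uint64_t *sum_lo, uint64_t *sum_hi, uint64_t n, uint64_t m) {
       //     unsigned __int128 s = ((unsigned __int128)*sum_hi << 64) | *sum_lo;
       //     s += (unsigned __int128)n * m;   // adds/adc; a carry out of 128 bits is dropped
       //     *sum_lo = (uint64_t)s;
       //     *sum_hi = (uint64_t)(s >> 64);
       //   }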
7559 
7560   // Poly1305, RFC 7539
7561 
7562   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
7563   // description of the tricks used to simplify and accelerate this
7564   // computation.
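       //
       // At a high level, each 16-byte block updates the accumulator as follows
       // (sketch only; the code below keeps U and R in 64-bit limbs and performs
       // only a partial reduction inside the loop):
       //
       //   S = U + block + 2^128           // the 0x01 byte appended to a full block
       //   U = (S * R) mod (2^130 - 5)     // reduction helped by RR = (R >> 2) * 5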
7565 
7566   address generate_poly1305_processBlocks() {
7567     __ align(CodeEntryAlignment);
7568     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
7569     address start = __ pc();
7570     Label here;
7571     __ enter();
7572     RegSet callee_saved = RegSet::range(r19, r28);
7573     __ push(callee_saved, sp);
7574 
7575     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
7576 
7577     // Arguments
7578     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
7579 
7580     // R_n is the 128-bit randomly-generated key, packed into two
7581     // registers.  The caller passes this key to us as long[5], with
7582     // BITS_PER_LIMB = 26.
7583     const Register R_0 = *++regs, R_1 = *++regs;
7584     pack_26(R_0, R_1, r_start);
7585 
7586     // RR_n is (R_n >> 2) * 5
7587     const Register RR_0 = *++regs, RR_1 = *++regs;
7588     __ lsr(RR_0, R_0, 2);
7589     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
7590     __ lsr(RR_1, R_1, 2);
7591     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
7592 
7593     // U_n is the current checksum
7594     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
7595     pack_26(U_0, U_1, U_2, acc_start);
7596 
7597     static constexpr int BLOCK_LENGTH = 16;
7598     Label DONE, LOOP;
7599 
7600     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7601     __ br(Assembler::LT, DONE); {
7602       __ bind(LOOP);
7603 
7604       // S_n is to be the sum of U_n and the next block of data
7605       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7606       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
7607       __ adds(S_0, U_0, S_0);
7608       __ adcs(S_1, U_1, S_1);
7609       __ adc(S_2, U_2, zr);
7610       __ add(S_2, S_2, 1); // 2^128: the 0x01 byte Poly1305 appends to a full 16-byte block
7611 
7612       const Register U_0HI = *++regs, U_1HI = *++regs;
7613 
7614       // NB: this logic depends on some of the special properties of
7615       // Poly1305 keys. In particular, because we know that the top
7616       // four bits of R_0 and R_1 are zero, we can add together
7617       // partial products without any risk of needing to propagate a
7618       // carry out.
7619       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
7620       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
7621       __ andr(U_2, R_0, 3);
7622       __ mul(U_2, S_2, U_2);
7623 
7624       // Recycle registers S_0, S_1, S_2
7625       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
7626 
7627       // Partial reduction mod 2**130 - 5
7628       __ adds(U_1, U_0HI, U_1);
7629       __ adc(U_2, U_1HI, U_2);
7630       // Sum now in U_2:U_1:U_0.
7631       // Dead: U_0HI, U_1HI.
7632       regs = (regs.remaining() + U_0HI + U_1HI).begin();
7633 
7634       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
7635 
7636       // First, U_2:U_1:U_0 += (U_2 >> 2)
7637       __ lsr(rscratch1, U_2, 2);
7638       __ andr(U_2, U_2, (u8)3);
7639       __ adds(U_0, U_0, rscratch1);
7640       __ adcs(U_1, U_1, zr);
7641       __ adc(U_2, U_2, zr);
7642       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
7643       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
7644       __ adcs(U_1, U_1, zr);
7645       __ adc(U_2, U_2, zr);
7646 
7647       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
7648       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7649       __ br(~ Assembler::LT, LOOP);
7650     }
7651 
7652     // Further reduce modulo 2^130 - 5
7653     __ lsr(rscratch1, U_2, 2);
7654     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
7655     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
7656     __ adcs(U_1, U_1, zr);
7657     __ andr(U_2, U_2, (u1)3);
7658     __ adc(U_2, U_2, zr);
7659 
7660     // Unpack the sum into five 26-bit limbs and write to memory.
7661     __ ubfiz(rscratch1, U_0, 0, 26);
7662     __ ubfx(rscratch2, U_0, 26, 26);
7663     __ stp(rscratch1, rscratch2, Address(acc_start));
7664     __ ubfx(rscratch1, U_0, 52, 12);
7665     __ bfi(rscratch1, U_1, 12, 14);
7666     __ ubfx(rscratch2, U_1, 14, 26);
7667     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
7668     __ ubfx(rscratch1, U_1, 40, 24);
7669     __ bfi(rscratch1, U_2, 24, 3);
7670     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
7671 
7672     __ bind(DONE);
7673     __ pop(callee_saved, sp);
7674     __ leave();
7675     __ ret(lr);
7676 
7677     return start;
7678   }
7679 
7680   // exception handler for upcall stubs
7681   address generate_upcall_stub_exception_handler() {
7682     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
7683     address start = __ pc();
7684 
7685     // Native caller has no idea how to handle exceptions,
7686     // so we just crash here. Up to callee to catch exceptions.
7687     __ verify_oop(r0);
7688     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
7689     __ blr(rscratch1);
7690     __ should_not_reach_here();
7691 
7692     return start;
7693   }
7694 
7695   // load Method* target of MethodHandle
7696   // j_rarg0 = jobject receiver
7697   // rmethod = result
7698   address generate_upcall_stub_load_target() {
7699     StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target");
7700     address start = __ pc();
7701 
7702     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
7703     // Load target method from receiver
7704     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
7705     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
7706     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
7707     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
7708                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
7709                       noreg, noreg);
7710     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
7711 
7712     __ ret(lr);
7713 
7714     return start;
7715   }
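
       // The chain of loads above follows, roughly, the Java-level field path
       // (field names as used by java.lang.invoke; the final vmtarget load is a
       // T_ADDRESS access yielding a Method*, not an oop):
       //
       //   rmethod = receiver.form.vmentry.method.vmtarget;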
7716 
7717 #undef __
7718 #define __ masm->
7719 
7720   class MontgomeryMultiplyGenerator : public MacroAssembler {
7721 
7722     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7723       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7724 
7725     RegSet _toSave;
7726     bool _squaring;
7727 
7728   public:
7729     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7730       : MacroAssembler(as->code()), _squaring(squaring) {
7731 
7732       // Register allocation
7733 
7734       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7735       Pa_base = *regs;       // Argument registers
7736       if (squaring)
7737         Pb_base = Pa_base;
7738       else
7739         Pb_base = *++regs;
7740       Pn_base = *++regs;
7741       Rlen= *++regs;
7742       inv = *++regs;
7743       Pm_base = *++regs;
7744 
7745                           // Working registers:
7746       Ra =  *++regs;        // The current digit of a, b, n, and m.
7747       Rb =  *++regs;
7748       Rm =  *++regs;
7749       Rn =  *++regs;
7750 
7751       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7752       Pb =  *++regs;
7753       Pm =  *++regs;
7754       Pn =  *++regs;
7755 
7756       t0 =  *++regs;        // Three registers which form a
7757       t1 =  *++regs;        // triple-precision accumulator.
7758       t2 =  *++regs;
7759 
7760       Ri =  *++regs;        // Inner and outer loop indexes.
7761       Rj =  *++regs;
7762 
7763       Rhi_ab = *++regs;     // Product registers: low and high parts
7764       Rlo_ab = *++regs;     // of a*b and m*n.
7765       Rhi_mn = *++regs;
7766       Rlo_mn = *++regs;
7767 
7768       // r19 and up are callee-saved.
7769       _toSave = RegSet::range(r19, *regs) + Pm_base;
7770     }
7771 
7772   private:
7773     void save_regs() {
7774       push(_toSave, sp);
7775     }
7776 
7777     void restore_regs() {
7778       pop(_toSave, sp);
7779     }
7780 
7781     template <typename T>
7782     void unroll_2(Register count, T block) {
7783       Label loop, end, odd;
7784       tbnz(count, 0, odd);
7785       cbz(count, end);
7786       align(16);
7787       bind(loop);
7788       (this->*block)();
7789       bind(odd);
7790       (this->*block)();
7791       subs(count, count, 2);
7792       br(Assembler::GT, loop);
7793       bind(end);
7794     }
7795 
7796     template <typename T>
7797     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7798       Label loop, end, odd;
7799       tbnz(count, 0, odd);
7800       cbz(count, end);
7801       align(16);
7802       bind(loop);
7803       (this->*block)(d, s, tmp);
7804       bind(odd);
7805       (this->*block)(d, s, tmp);
7806       subs(count, count, 2);
7807       br(Assembler::GT, loop);
7808       bind(end);
7809     }
7810 
7811     void pre1(RegisterOrConstant i) {
7812       block_comment("pre1");
7813       // Pa = Pa_base;
7814       // Pb = Pb_base + i;
7815       // Pm = Pm_base;
7816       // Pn = Pn_base + i;
7817       // Ra = *Pa;
7818       // Rb = *Pb;
7819       // Rm = *Pm;
7820       // Rn = *Pn;
7821       ldr(Ra, Address(Pa_base));
7822       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7823       ldr(Rm, Address(Pm_base));
7824       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7825       lea(Pa, Address(Pa_base));
7826       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7827       lea(Pm, Address(Pm_base));
7828       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7829 
7830       // Zero the m*n result.
7831       mov(Rhi_mn, zr);
7832       mov(Rlo_mn, zr);
7833     }
7834 
7835     // The core multiply-accumulate step of a Montgomery
7836     // multiplication.  The idea is to schedule operations as a
7837     // pipeline so that instructions with long latencies (loads and
7838     // multiplies) have time to complete before their results are
7839     // used.  This benefits in-order implementations of the architecture
7840     // the most, but out-of-order ones also benefit.
7841     void step() {
7842       block_comment("step");
7843       // MACC(Ra, Rb, t0, t1, t2);
7844       // Ra = *++Pa;
7845       // Rb = *--Pb;
7846       umulh(Rhi_ab, Ra, Rb);
7847       mul(Rlo_ab, Ra, Rb);
7848       ldr(Ra, pre(Pa, wordSize));
7849       ldr(Rb, pre(Pb, -wordSize));
7850       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7851                                        // previous iteration.
7852       // MACC(Rm, Rn, t0, t1, t2);
7853       // Rm = *++Pm;
7854       // Rn = *--Pn;
7855       umulh(Rhi_mn, Rm, Rn);
7856       mul(Rlo_mn, Rm, Rn);
7857       ldr(Rm, pre(Pm, wordSize));
7858       ldr(Rn, pre(Pn, -wordSize));
7859       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7860     }
7861 
7862     void post1() {
7863       block_comment("post1");
7864 
7865       // MACC(Ra, Rb, t0, t1, t2);
7866       // Ra = *++Pa;
7867       // Rb = *--Pb;
7868       umulh(Rhi_ab, Ra, Rb);
7869       mul(Rlo_ab, Ra, Rb);
7870       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7871       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7872 
7873       // *Pm = Rm = t0 * inv;
7874       mul(Rm, t0, inv);
7875       str(Rm, Address(Pm));
7876 
7877       // MACC(Rm, Rn, t0, t1, t2);
7878       // t0 = t1; t1 = t2; t2 = 0;
7879       umulh(Rhi_mn, Rm, Rn);
7880 
7881 #ifndef PRODUCT
7882       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7883       {
7884         mul(Rlo_mn, Rm, Rn);
7885         add(Rlo_mn, t0, Rlo_mn);
7886         Label ok;
7887         cbz(Rlo_mn, ok); {
7888           stop("broken Montgomery multiply");
7889         } bind(ok);
7890       }
7891 #endif
7892       // We have very carefully set things up so that
7893       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7894       // the lower half of Rm * Rn because we know the result already:
7895       // it must be -t0.  t0 + (-t0) must generate a carry iff
7896       // t0 != 0.  So, rather than do a mul and an adds we just set
7897       // the carry flag iff t0 is nonzero.
7898       //
7899       // mul(Rlo_mn, Rm, Rn);
7900       // adds(zr, t0, Rlo_mn);
7901       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7902       adcs(t0, t1, Rhi_mn);
7903       adc(t1, t2, zr);
7904       mov(t2, zr);
7905     }
7906 
7907     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7908       block_comment("pre2");
7909       // Pa = Pa_base + i-len;
7910       // Pb = Pb_base + len;
7911       // Pm = Pm_base + i-len;
7912       // Pn = Pn_base + len;
7913 
7914       if (i.is_register()) {
7915         sub(Rj, i.as_register(), len);
7916       } else {
7917         mov(Rj, i.as_constant());
7918         sub(Rj, Rj, len);
7919       }
7920       // Rj == i-len
7921 
7922       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7923       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7924       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7925       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7926 
7927       // Ra = *++Pa;
7928       // Rb = *--Pb;
7929       // Rm = *++Pm;
7930       // Rn = *--Pn;
7931       ldr(Ra, pre(Pa, wordSize));
7932       ldr(Rb, pre(Pb, -wordSize));
7933       ldr(Rm, pre(Pm, wordSize));
7934       ldr(Rn, pre(Pn, -wordSize));
7935 
7936       mov(Rhi_mn, zr);
7937       mov(Rlo_mn, zr);
7938     }
7939 
7940     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7941       block_comment("post2");
7942       if (i.is_constant()) {
7943         mov(Rj, i.as_constant()-len.as_constant());
7944       } else {
7945         sub(Rj, i.as_register(), len);
7946       }
7947 
7948       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7949 
7950       // As soon as we know the least significant digit of our result,
7951       // store it.
7952       // Pm_base[i-len] = t0;
7953       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7954 
7955       // t0 = t1; t1 = t2; t2 = 0;
7956       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7957       adc(t1, t2, zr);
7958       mov(t2, zr);
7959     }
7960 
7961     // A carry in t0 after Montgomery multiplication means that we
7962     // should subtract multiples of n from our result in m.  We'll
7963     // keep doing that until there is no carry.
7964     void normalize(RegisterOrConstant len) {
7965       block_comment("normalize");
7966       // while (t0)
7967       //   t0 = sub(Pm_base, Pn_base, t0, len);
7968       Label loop, post, again;
7969       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7970       cbz(t0, post); {
7971         bind(again); {
7972           mov(i, zr);
7973           mov(cnt, len);
7974           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7975           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7976           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7977           align(16);
7978           bind(loop); {
7979             sbcs(Rm, Rm, Rn);
7980             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7981             add(i, i, 1);
7982             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7983             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7984             sub(cnt, cnt, 1);
7985           } cbnz(cnt, loop);
7986           sbc(t0, t0, zr);
7987         } cbnz(t0, again);
7988       } bind(post);
7989     }
7990 
7991     // Move memory at s to d, reversing words.
7992     //    Increments d to end of copied memory
7993     //    Destroys tmp1, tmp2
7994     //    Preserves len
7995     //    Leaves s pointing to the address which was in d at start
7996     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7997       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7998       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7999 
8000       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
8001       mov(tmp1, len);
8002       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
8003       sub(s, d, len, ext::uxtw, LogBytesPerWord);
8004     }
8005     // where
8006     void reverse1(Register d, Register s, Register tmp) {
8007       ldr(tmp, pre(s, -wordSize));
8008       ror(tmp, tmp, 32);
8009       str(tmp, post(d, wordSize));
8010     }
8011 
8012     void step_squaring() {
8013       // An extra ACC
8014       step();
8015       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
8016     }
8017 
8018     void last_squaring(RegisterOrConstant i) {
8019       Label dont;
8020       // if ((i & 1) == 0) {
8021       tbnz(i.as_register(), 0, dont); {
8022         // MACC(Ra, Rb, t0, t1, t2);
8023         // Ra = *++Pa;
8024         // Rb = *--Pb;
8025         umulh(Rhi_ab, Ra, Rb);
8026         mul(Rlo_ab, Ra, Rb);
8027         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
8028       } bind(dont);
8029     }
8030 
8031     void extra_step_squaring() {
8032       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
8033 
8034       // MACC(Rm, Rn, t0, t1, t2);
8035       // Rm = *++Pm;
8036       // Rn = *--Pn;
8037       umulh(Rhi_mn, Rm, Rn);
8038       mul(Rlo_mn, Rm, Rn);
8039       ldr(Rm, pre(Pm, wordSize));
8040       ldr(Rn, pre(Pn, -wordSize));
8041     }
8042 
8043     void post1_squaring() {
8044       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
8045 
8046       // *Pm = Rm = t0 * inv;
8047       mul(Rm, t0, inv);
8048       str(Rm, Address(Pm));
8049 
8050       // MACC(Rm, Rn, t0, t1, t2);
8051       // t0 = t1; t1 = t2; t2 = 0;
8052       umulh(Rhi_mn, Rm, Rn);
8053 
8054 #ifndef PRODUCT
8055       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
8056       {
8057         mul(Rlo_mn, Rm, Rn);
8058         add(Rlo_mn, t0, Rlo_mn);
8059         Label ok;
8060         cbz(Rlo_mn, ok); {
8061           stop("broken Montgomery multiply");
8062         } bind(ok);
8063       }
8064 #endif
8065       // We have very carefully set things up so that
8066       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
8067       // the lower half of Rm * Rn because we know the result already:
8068       // it must be -t0.  t0 + (-t0) must generate a carry iff
8069       // t0 != 0.  So, rather than do a mul and an adds we just set
8070       // the carry flag iff t0 is nonzero.
8071       //
8072       // mul(Rlo_mn, Rm, Rn);
8073       // adds(zr, t0, Rlo_mn);
8074       subs(zr, t0, 1); // Set carry iff t0 is nonzero
8075       adcs(t0, t1, Rhi_mn);
8076       adc(t1, t2, zr);
8077       mov(t2, zr);
8078     }
8079 
8080     void acc(Register Rhi, Register Rlo,
8081              Register t0, Register t1, Register t2) {
8082       adds(t0, t0, Rlo);
8083       adcs(t1, t1, Rhi);
8084       adc(t2, t2, zr);
8085     }
8086 
8087   public:
8088     /**
8089      * Fast Montgomery multiplication.  The derivation of the
8090      * algorithm is in A Cryptographic Library for the Motorola
8091      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
8092      *
8093      * Arguments:
8094      *
8095      * Inputs for multiplication:
8096      *   c_rarg0   - int array elements a
8097      *   c_rarg1   - int array elements b
8098      *   c_rarg2   - int array elements n (the modulus)
8099      *   c_rarg3   - int length
8100      *   c_rarg4   - int inv
8101      *   c_rarg5   - int array elements m (the result)
8102      *
8103      * Inputs for squaring:
8104      *   c_rarg0   - int array elements a
8105      *   c_rarg1   - int array elements n (the modulus)
8106      *   c_rarg2   - int length
8107      *   c_rarg3   - int inv
8108      *   c_rarg4   - int array elements m (the result)
8109      *
8110      */
8111     address generate_multiply() {
8112       Label argh, nothing;
8113       bind(argh);
8114       stop("MontgomeryMultiply total_allocation must be <= 8192");
8115 
8116       align(CodeEntryAlignment);
8117       address entry = pc();
8118 
8119       cbzw(Rlen, nothing);
8120 
8121       enter();
8122 
8123       // Make room.
8124       cmpw(Rlen, 512);
8125       br(Assembler::HI, argh);
8126       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8127       andr(sp, Ra, -2 * wordSize);
8128 
8129       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8130 
8131       {
8132         // Copy input args, reversing as we go.  We use Ra as a
8133         // temporary variable.
8134         reverse(Ra, Pa_base, Rlen, t0, t1);
8135         if (!_squaring)
8136           reverse(Ra, Pb_base, Rlen, t0, t1);
8137         reverse(Ra, Pn_base, Rlen, t0, t1);
8138       }
8139 
8140       // Push all call-saved registers and also Pm_base which we'll need
8141       // at the end.
8142       save_regs();
8143 
8144 #ifndef PRODUCT
8145       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
8146       {
8147         ldr(Rn, Address(Pn_base, 0));
8148         mul(Rlo_mn, Rn, inv);
8149         subs(zr, Rlo_mn, -1);
8150         Label ok;
8151         br(EQ, ok); {
8152           stop("broken inverse in Montgomery multiply");
8153         } bind(ok);
8154       }
8155 #endif
8156 
8157       mov(Pm_base, Ra);
8158 
8159       mov(t0, zr);
8160       mov(t1, zr);
8161       mov(t2, zr);
8162 
8163       block_comment("for (int i = 0; i < len; i++) {");
8164       mov(Ri, zr); {
8165         Label loop, end;
8166         cmpw(Ri, Rlen);
8167         br(Assembler::GE, end);
8168 
8169         bind(loop);
8170         pre1(Ri);
8171 
8172         block_comment("  for (j = i; j; j--) {"); {
8173           movw(Rj, Ri);
8174           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
8175         } block_comment("  } // j");
8176 
8177         post1();
8178         addw(Ri, Ri, 1);
8179         cmpw(Ri, Rlen);
8180         br(Assembler::LT, loop);
8181         bind(end);
8182         block_comment("} // i");
8183       }
8184 
8185       block_comment("for (int i = len; i < 2*len; i++) {");
8186       mov(Ri, Rlen); {
8187         Label loop, end;
8188         cmpw(Ri, Rlen, Assembler::LSL, 1);
8189         br(Assembler::GE, end);
8190 
8191         bind(loop);
8192         pre2(Ri, Rlen);
8193 
8194         block_comment("  for (j = len*2-i-1; j; j--) {"); {
8195           lslw(Rj, Rlen, 1);
8196           subw(Rj, Rj, Ri);
8197           subw(Rj, Rj, 1);
8198           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
8199         } block_comment("  } // j");
8200 
8201         post2(Ri, Rlen);
8202         addw(Ri, Ri, 1);
8203         cmpw(Ri, Rlen, Assembler::LSL, 1);
8204         br(Assembler::LT, loop);
8205         bind(end);
8206       }
8207       block_comment("} // i");
8208 
8209       normalize(Rlen);
8210 
8211       mov(Ra, Pm_base);  // Save Pm_base in Ra
8212       restore_regs();  // Restore caller's Pm_base
8213 
8214       // Copy our result into caller's Pm_base
8215       reverse(Pm_base, Ra, Rlen, t0, t1);
8216 
8217       leave();
8218       bind(nothing);
8219       ret(lr);
8220 
8221       return entry;
8222     }
8223     // In C, approximately:
8224 
8225     // void
8226     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
8227     //                     julong Pn_base[], julong Pm_base[],
8228     //                     julong inv, int len) {
8229     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8230     //   julong *Pa, *Pb, *Pn, *Pm;
8231     //   julong Ra, Rb, Rn, Rm;
8232 
8233     //   int i;
8234 
8235     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8236 
8237     //   for (i = 0; i < len; i++) {
8238     //     int j;
8239 
8240     //     Pa = Pa_base;
8241     //     Pb = Pb_base + i;
8242     //     Pm = Pm_base;
8243     //     Pn = Pn_base + i;
8244 
8245     //     Ra = *Pa;
8246     //     Rb = *Pb;
8247     //     Rm = *Pm;
8248     //     Rn = *Pn;
8249 
8250     //     int iters = i;
8251     //     for (j = 0; iters--; j++) {
8252     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8253     //       MACC(Ra, Rb, t0, t1, t2);
8254     //       Ra = *++Pa;
8255     //       Rb = *--Pb;
8256     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8257     //       MACC(Rm, Rn, t0, t1, t2);
8258     //       Rm = *++Pm;
8259     //       Rn = *--Pn;
8260     //     }
8261 
8262     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
8263     //     MACC(Ra, Rb, t0, t1, t2);
8264     //     *Pm = Rm = t0 * inv;
8265     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8266     //     MACC(Rm, Rn, t0, t1, t2);
8267 
8268     //     assert(t0 == 0, "broken Montgomery multiply");
8269 
8270     //     t0 = t1; t1 = t2; t2 = 0;
8271     //   }
8272 
8273     //   for (i = len; i < 2*len; i++) {
8274     //     int j;
8275 
8276     //     Pa = Pa_base + i-len;
8277     //     Pb = Pb_base + len;
8278     //     Pm = Pm_base + i-len;
8279     //     Pn = Pn_base + len;
8280 
8281     //     Ra = *++Pa;
8282     //     Rb = *--Pb;
8283     //     Rm = *++Pm;
8284     //     Rn = *--Pn;
8285 
8286     //     int iters = len*2-i-1;
8287     //     for (j = i-len+1; iters--; j++) {
8288     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8289     //       MACC(Ra, Rb, t0, t1, t2);
8290     //       Ra = *++Pa;
8291     //       Rb = *--Pb;
8292     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8293     //       MACC(Rm, Rn, t0, t1, t2);
8294     //       Rm = *++Pm;
8295     //       Rn = *--Pn;
8296     //     }
8297 
8298     //     Pm_base[i-len] = t0;
8299     //     t0 = t1; t1 = t2; t2 = 0;
8300     //   }
8301 
8302     //   while (t0)
8303     //     t0 = sub(Pm_base, Pn_base, t0, len);
8304     // }
8305 
8306     /**
8307      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
8308      * multiplies than Montgomery multiplication so it should be up to
8309      * 25% faster.  However, its loop control is more complex and it
8310      * may actually run slower on some machines.
8311      *
8312      * Arguments:
8313      *
8314      * Inputs:
8315      *   c_rarg0   - int array elements a
8316      *   c_rarg1   - int array elements n (the modulus)
8317      *   c_rarg2   - int length
8318      *   c_rarg3   - int inv
8319      *   c_rarg4   - int array elements m (the result)
8320      *
8321      */
8322     address generate_square() {
8323       Label argh;
8324       bind(argh);
8325       stop("MontgomeryMultiply total_allocation must be <= 8192");
8326 
8327       align(CodeEntryAlignment);
8328       address entry = pc();
8329 
8330       enter();
8331 
8332       // Make room.
8333       cmpw(Rlen, 512);
8334       br(Assembler::HI, argh);
8335       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8336       andr(sp, Ra, -2 * wordSize);
8337 
8338       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8339 
8340       {
8341         // Copy input args, reversing as we go.  We use Ra as a
8342         // temporary variable.
8343         reverse(Ra, Pa_base, Rlen, t0, t1);
8344         reverse(Ra, Pn_base, Rlen, t0, t1);
8345       }
8346 
8347       // Push all call-saved registers and also Pm_base which we'll need
8348       // at the end.
8349       save_regs();
8350 
8351       mov(Pm_base, Ra);
8352 
8353       mov(t0, zr);
8354       mov(t1, zr);
8355       mov(t2, zr);
8356 
8357       block_comment("for (int i = 0; i < len; i++) {");
8358       mov(Ri, zr); {
8359         Label loop, end;
8360         bind(loop);
8361         cmp(Ri, Rlen);
8362         br(Assembler::GE, end);
8363 
8364         pre1(Ri);
8365 
8366         block_comment("for (j = (i+1)/2; j; j--) {"); {
8367           add(Rj, Ri, 1);
8368           lsr(Rj, Rj, 1);
8369           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8370         } block_comment("  } // j");
8371 
8372         last_squaring(Ri);
8373 
8374         block_comment("  for (j = i/2; j; j--) {"); {
8375           lsr(Rj, Ri, 1);
8376           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8377         } block_comment("  } // j");
8378 
8379         post1_squaring();
8380         add(Ri, Ri, 1);
8381         cmp(Ri, Rlen);
8382         br(Assembler::LT, loop);
8383 
8384         bind(end);
8385         block_comment("} // i");
8386       }
8387 
8388       block_comment("for (int i = len; i < 2*len; i++) {");
8389       mov(Ri, Rlen); {
8390         Label loop, end;
8391         bind(loop);
8392         cmp(Ri, Rlen, Assembler::LSL, 1);
8393         br(Assembler::GE, end);
8394 
8395         pre2(Ri, Rlen);
8396 
8397         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
8398           lsl(Rj, Rlen, 1);
8399           sub(Rj, Rj, Ri);
8400           sub(Rj, Rj, 1);
8401           lsr(Rj, Rj, 1);
8402           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8403         } block_comment("  } // j");
8404 
8405         last_squaring(Ri);
8406 
8407         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
8408           lsl(Rj, Rlen, 1);
8409           sub(Rj, Rj, Ri);
8410           lsr(Rj, Rj, 1);
8411           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8412         } block_comment("  } // j");
8413 
8414         post2(Ri, Rlen);
8415         add(Ri, Ri, 1);
8416         cmp(Ri, Rlen, Assembler::LSL, 1);
8417 
8418         br(Assembler::LT, loop);
8419         bind(end);
8420         block_comment("} // i");
8421       }
8422 
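           // normalize() performs the final conditional subtractions of the
           // modulus, i.e. the "while (t0) t0 = sub(Pm_base, Pn_base, t0, len)"
           // step in the C sketch below.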
8423       normalize(Rlen);
8424 
8425       mov(Ra, Pm_base);  // Save Pm_base in Ra
8426       restore_regs();  // Restore caller's Pm_base
8427 
8428       // Copy our result into caller's Pm_base
8429       reverse(Pm_base, Ra, Rlen, t0, t1);
8430 
8431       leave();
8432       ret(lr);
8433 
8434       return entry;
8435     }
8436     // In C, approximately:
8437 
8438     // void
8439     // montgomery_square(julong Pa_base[], julong Pn_base[],
8440     //                   julong Pm_base[], julong inv, int len) {
8441     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8442     //   julong *Pa, *Pb, *Pn, *Pm;
8443     //   julong Ra, Rb, Rn, Rm;
8444 
8445     //   int i;
8446 
8447     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8448 
8449     //   for (i = 0; i < len; i++) {
8450     //     int j;
8451 
8452     //     Pa = Pa_base;
8453     //     Pb = Pa_base + i;
8454     //     Pm = Pm_base;
8455     //     Pn = Pn_base + i;
8456 
8457     //     Ra = *Pa;
8458     //     Rb = *Pb;
8459     //     Rm = *Pm;
8460     //     Rn = *Pn;
8461 
8462     //     int iters = (i+1)/2;
8463     //     for (j = 0; iters--; j++) {
8464     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8465     //       MACC2(Ra, Rb, t0, t1, t2);
8466     //       Ra = *++Pa;
8467     //       Rb = *--Pb;
8468     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8469     //       MACC(Rm, Rn, t0, t1, t2);
8470     //       Rm = *++Pm;
8471     //       Rn = *--Pn;
8472     //     }
8473     //     if ((i & 1) == 0) {
8474     //       assert(Ra == Pa_base[j], "must be");
8475     //       MACC(Ra, Ra, t0, t1, t2);
8476     //     }
8477     //     iters = i/2;
8478     //     assert(iters == i-j, "must be");
8479     //     for (; iters--; j++) {
8480     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8481     //       MACC(Rm, Rn, t0, t1, t2);
8482     //       Rm = *++Pm;
8483     //       Rn = *--Pn;
8484     //     }
8485 
8486     //     *Pm = Rm = t0 * inv;
8487     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8488     //     MACC(Rm, Rn, t0, t1, t2);
8489 
8490     //     assert(t0 == 0, "broken Montgomery multiply");
8491 
8492     //     t0 = t1; t1 = t2; t2 = 0;
8493     //   }
8494 
8495     //   for (i = len; i < 2*len; i++) {
8496     //     int start = i-len+1;
8497     //     int end = start + (len - start)/2;
8498     //     int j;
8499 
8500     //     Pa = Pa_base + i-len;
8501     //     Pb = Pa_base + len;
8502     //     Pm = Pm_base + i-len;
8503     //     Pn = Pn_base + len;
8504 
8505     //     Ra = *++Pa;
8506     //     Rb = *--Pb;
8507     //     Rm = *++Pm;
8508     //     Rn = *--Pn;
8509 
8510     //     int iters = (2*len-i-1)/2;
8511     //     assert(iters == end-start, "must be");
8512     //     for (j = start; iters--; j++) {
8513     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8514     //       MACC2(Ra, Rb, t0, t1, t2);
8515     //       Ra = *++Pa;
8516     //       Rb = *--Pb;
8517     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8518     //       MACC(Rm, Rn, t0, t1, t2);
8519     //       Rm = *++Pm;
8520     //       Rn = *--Pn;
8521     //     }
8522     //     if ((i & 1) == 0) {
8523     //       assert(Ra == Pa_base[j], "must be");
8524     //       MACC(Ra, Ra, t0, t1, t2);
8525     //     }
8526     //     iters =  (2*len-i)/2;
8527     //     assert(iters == len-j, "must be");
8528     //     for (; iters--; j++) {
8529     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8530     //       MACC(Rm, Rn, t0, t1, t2);
8531     //       Rm = *++Pm;
8532     //       Rn = *--Pn;
8533     //     }
8534     //     Pm_base[i-len] = t0;
8535     //     t0 = t1; t1 = t2; t2 = 0;
8536     //   }
8537 
8538     //   while (t0)
8539     //     t0 = sub(Pm_base, Pn_base, t0, len);
8540     // }
8541   };
8542 
8543   void generate_vector_math_stubs() {
8544     // Get native vector math stub routine addresses
8545     void* libsleef = nullptr;
8546     char ebuf[1024];
8547     char dll_name[JVM_MAXPATHLEN];
8548     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
8549       libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
8550     }
8551     if (libsleef == nullptr) {
8552       log_info(library)("Failed to load native vector math library, %s!", ebuf);
8553       return;
8554     }
8555     // Method naming convention
8556     //   All the methods are named <OP><T><N>_<U><suffix>
8557     //   Where:
8558     //     <OP>     is the operation name, e.g. sin
8559     //     <T>      indicates the element type:
8560     //              "f"/"d" for a vector float/double operation
8561     //     <N>      is the number of elements in the vector:
8562     //              "2"/"4" for NEON, and "x" for SVE
8563     //     <U>      is the precision level:
8564     //              "u10"/"u05" represent 1.0/0.5 ULP error bounds.
8565     //              We use "u10" for all operations by default,
8566     //              but for functions that do not have u10 support we use "u05" instead.
8567     //     <suffix> indicates NEON/SVE:
8568     //              "advsimd"/"sve" for the NEON/SVE implementations
8569     //     e.g. sinfx_u10sve computes vector float sin using SVE instructions, and
8570     //          cosd2_u10advsimd computes a 2-element vector double cos using NEON instructions.
8571     //
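         // For example, with the "sin" op at the default u10 precision, the
         // loops below compose lookup names roughly as follows (an illustrative
         // sketch; the real code substitutes VectorSupport::mathname[op] and
         // the per-op ulf precision selector):
         //
         //   char name[64];
         //   snprintf(name, sizeof(name), "%sfx_%ssve", "sin", "u10");     // "sinfx_u10sve"     (SVE float)
         //   snprintf(name, sizeof(name), "%sd2_%sadvsimd", "sin", "u10"); // "sind2_u10advsimd" (NEON 2 x double)
         //   address fn = (address)os::dll_lookup(libsleef, name);         // nullptr if the symbol is absent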
8572     log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
8573 
8574     // Math vector stubs implemented with SVE for scalable vector size.
8575     if (UseSVE > 0) {
8576       for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
8577         int vop = VectorSupport::VECTOR_OP_MATH_START + op;
8578         // Skip "tanh" because of a performance regression
8579         if (vop == VectorSupport::VECTOR_OP_TANH) {
8580           continue;
8581         }
8582 
8583         // The native library does not support the u10 precision level for "hypot".
8584         const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
8585 
8586         snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
8587         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
8588 
8589         snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
8590         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
8591       }
8592     }
8593 
8594     // Math vector stubs implemented with NEON for 64/128-bit vector sizes.
8595     for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
8596       int vop = VectorSupport::VECTOR_OP_MATH_START + op;
8597       // Skip "tanh" because of a performance regression
8598       if (vop == VectorSupport::VECTOR_OP_TANH) {
8599         continue;
8600       }
8601 
8602       // The native library does not support the u10 precision level for "hypot".
8603       const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
8604 
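           // SLEEF's Advanced SIMD float entry points operate on full 128-bit
           // NEON registers, so the 4-lane "f4" routine is looked up for both
           // the 64-bit and 128-bit float vector sizes below (the library does
           // not appear to provide a separate 2-lane float variant).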
8605       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
8606       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
8607 
8608       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
8609       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
8610 
8611       snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
8612       StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
8613     }
8614   }
8615 
8616   // Initialization
8617   void generate_initial_stubs() {
8618     // Generate the initial stubs and initialize their entry points.
8619 
8620     // Entry points that exist on all platforms.  Note: this is code
8621     // that could be shared among different platforms; however, the
8622     // benefit seems to be smaller than the disadvantage of having a
8623     // much more complicated generator structure.  See also the comment
8624     // in stubRoutines.hpp.
8625 
8626     StubRoutines::_forward_exception_entry = generate_forward_exception();
8627 
8628     StubRoutines::_call_stub_entry =
8629       generate_call_stub(StubRoutines::_call_stub_return_address);
8630 
8631     // This entry is referenced by megamorphic calls.
8632     StubRoutines::_catch_exception_entry = generate_catch_exception();
8633 
8634     // Initialize table for copy memory (arraycopy) check.
8635     if (UnsafeMemoryAccess::_table == nullptr) {
8636       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
8637     }
8638 
8639     if (UseCRC32Intrinsics) {
8640       // Set the table address before generating the stub which uses it.
8641       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8642       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8643     }
8644 
8645     if (UseCRC32CIntrinsics) {
8646       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8647     }
8648 
8649     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8650       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8651     }
8652 
8653     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8654       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8655     }
8656 
8657     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
8658         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
8659       StubRoutines::_hf2f = generate_float16ToFloat();
8660       StubRoutines::_f2hf = generate_floatToFloat16();
8661     }
8662   }
8663 
8664   void generate_continuation_stubs() {
8665     // Continuation stubs:
8666     StubRoutines::_cont_thaw          = generate_cont_thaw();
8667     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8668     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8669     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
8670   }
8671 
8672   void generate_final_stubs() {
8673     // support for verify_oop (must happen after universe_init)
8674     if (VerifyOops) {
8675       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8676     }
8677 
8678     // arraycopy stubs used by compilers
8679     generate_arraycopy_stubs();
8680 
8681     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8682     if (bs_nm != nullptr) {
8683       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
8684     }
8685 
8686     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8687 
8688     if (UsePoly1305Intrinsics) {
8689       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
8690     }
8691 
8692 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8693 
8694     generate_atomic_entry_points();
8695 
8696 #endif // LINUX
8697 
8698 #ifdef COMPILER2
8699     if (UseSecondarySupersTable) {
8700       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
8701       if (! InlineSecondarySupersTest) {
8702         for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
8703           StubRoutines::_lookup_secondary_supers_table_stubs[slot]
8704             = generate_lookup_secondary_supers_table_stub(slot);
8705         }
8706       }
8707     }
8708 #endif
8709 
8710     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
8711     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
8712 
8713     StubRoutines::aarch64::set_completed(); // Indicate that the arraycopy and zero_blocks stubs have been generated
8714   }
8715 
8716   void generate_compiler_stubs() {
8717 #if COMPILER2_OR_JVMCI
8718 
8719     if (UseSVE == 0) {
8720       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
8721     }
8722 
8723     // array equals stub for large arrays.
8724     if (!UseSimpleArrayEquals) {
8725       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8726     }
8727 
8728     // arrays_hashcode stub for large arrays.
8729     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
8730     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
8731     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
8732     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
8733     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
8734 
8735     // byte_array_inflate stub for large arrays.
8736     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8737 
8738     // countPositives stub for large arrays.
8739     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8740 
8741     generate_compare_long_strings();
8742 
8743     generate_string_indexof_stubs();
8744 
8745 #ifdef COMPILER2
8746     if (UseMultiplyToLenIntrinsic) {
8747       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8748     }
8749 
8750     if (UseSquareToLenIntrinsic) {
8751       StubRoutines::_squareToLen = generate_squareToLen();
8752     }
8753 
8754     if (UseMulAddIntrinsic) {
8755       StubRoutines::_mulAdd = generate_mulAdd();
8756     }
8757 
8758     if (UseSIMDForBigIntegerShiftIntrinsics) {
8759       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8760       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8761     }
8762 
8763     if (UseMontgomeryMultiplyIntrinsic) {
8764       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
8765       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8766       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8767     }
8768 
8769     if (UseMontgomerySquareIntrinsic) {
8770       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
8771       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8772       // We use generate_multiply() rather than generate_square()
8773       // because it's faster for the sizes of modulus we care about.
8774       StubRoutines::_montgomerySquare = g.generate_multiply();
8775     }
8776 
8777     generate_vector_math_stubs();
8778 
8779 #endif // COMPILER2
8780 
8781     if (UseChaCha20Intrinsics) {
8782       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
8783     }
8784 
8785     if (UseBASE64Intrinsics) {
8786         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8787         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8788     }
8789 
8790     // data cache line writeback
8791     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8792     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8793 
8794     if (UseAESIntrinsics) {
8795       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8796       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8797       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8798       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8799       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8800     }
8801     if (UseGHASHIntrinsics) {
8802       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8803       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8804     }
8805     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8806       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8807     }
8808 
8809     if (UseMD5Intrinsics) {
8810       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
8811       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
8812     }
8813     if (UseSHA1Intrinsics) {
8814       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
8815       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
8816     }
8817     if (UseSHA256Intrinsics) {
8818       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
8819       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
8820     }
8821     if (UseSHA512Intrinsics) {
8822       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
8823       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
8824     }
8825     if (UseSHA3Intrinsics) {
8826       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
8827       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
8828     }
8829 
8830     // generate Adler32 intrinsics code
8831     if (UseAdler32Intrinsics) {
8832       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8833     }
8834 
8835 #endif // COMPILER2_OR_JVMCI
8836   }
8837 
8838  public:
8839   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
8840     switch(kind) {
8841     case Initial_stubs:
8842       generate_initial_stubs();
8843       break;
8844     case Continuation_stubs:
8845       generate_continuation_stubs();
8846       break;
8847     case Compiler_stubs:
8848       generate_compiler_stubs();
8849       break;
8850     case Final_stubs:
8851       generate_final_stubs();
8852       break;
8853     default:
8854       fatal("unexpected stubs kind: %d", kind);
8855       break;
8856     };
8857   }
8858 }; // end class declaration
8859 
8860 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
8861   StubGenerator g(code, kind);
8862 }
8863 
8864 
8865 #if defined (LINUX)
8866 
8867 // Define pointers to atomic stubs and initialize them to point to the
8868 // code in atomic_aarch64.S.
8869 
8870 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8871   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8872     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8873   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8874     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
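     // For example, DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) expands (roughly) to:
     //
     //   extern "C" uint64_t aarch64_atomic_cmpxchg_4_release_default_impl
     //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
     //   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_release_impl
     //     = aarch64_atomic_cmpxchg_4_release_default_impl;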
8875 
8876 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8877 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8878 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8879 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8880 DEFAULT_ATOMIC_OP(xchg, 4, )
8881 DEFAULT_ATOMIC_OP(xchg, 8, )
8882 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8883 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8884 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8885 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8886 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8887 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8888 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8889 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8890 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8891 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8892 
8893 #undef DEFAULT_ATOMIC_OP
8894 
8895 #endif // LINUX