1 /*
   2  * Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "runtime/atomic.hpp"
  45 #include "runtime/continuation.hpp"
  46 #include "runtime/continuationEntry.inline.hpp"
  47 #include "runtime/frame.inline.hpp"
  48 #include "runtime/handles.inline.hpp"
  49 #include "runtime/javaThread.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubCodeGenerator.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "utilities/align.hpp"
  54 #include "utilities/globalDefinitions.hpp"
  55 #include "utilities/powerOfTwo.hpp"
  56 #ifdef COMPILER2
  57 #include "opto/runtime.hpp"
  58 #endif
  59 #if INCLUDE_ZGC
  60 #include "gc/z/zThreadLocalData.hpp"
  61 #endif
  62 
  63 // Declaration and definition of StubGenerator (no .hpp file).
  64 // For a more detailed description of the stub routine structure
  65 // see the comment in stubRoutines.hpp
  66 
  67 #undef __
  68 #define __ _masm->
  69 
  70 #ifdef PRODUCT
  71 #define BLOCK_COMMENT(str) /* nothing */
  72 #else
  73 #define BLOCK_COMMENT(str) __ block_comment(str)
  74 #endif
  75 
  76 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  77 
  78 // Stub Code definitions
  79 
  80 class StubGenerator: public StubCodeGenerator {
  81  private:
  82 
  83 #ifdef PRODUCT
  84 #define inc_counter_np(counter) ((void)0)
  85 #else
  86   void inc_counter_np_(int& counter) {
  87     __ lea(rscratch2, ExternalAddress((address)&counter));
  88     __ ldrw(rscratch1, Address(rscratch2));
  89     __ addw(rscratch1, rscratch1, 1);
  90     __ strw(rscratch1, Address(rscratch2));
  91   }
  92 #define inc_counter_np(counter) \
  93   BLOCK_COMMENT("inc_counter " #counter); \
  94   inc_counter_np_(counter);
  95 #endif
  96 
  97   // Call stubs are used to call Java from C
  98   //
  99   // Arguments:
 100   //    c_rarg0:   call wrapper address                   address
 101   //    c_rarg1:   result                                 address
 102   //    c_rarg2:   result type                            BasicType
 103   //    c_rarg3:   method                                 Method*
 104   //    c_rarg4:   (interpreter) entry point              address
 105   //    c_rarg5:   parameters                             intptr_t*
 106   //    c_rarg6:   parameter size (in words)              int
 107   //    c_rarg7:   thread                                 Thread*
 108   //
 109   // There is no return from the stub itself as any Java result
 110   // is written to result
 111   //
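  // As a rough sketch only (see the CallStub typedef in stubRoutines.hpp for
  // the authoritative signature), the C-level call this stub implements
  // looks approximately like:
  //
  //   void call_stub(address call_wrapper, intptr_t* result, BasicType result_type,
  //                  Method* method, address entry_point, intptr_t* parameters,
  //                  int parameter_size, Thread* thread);
  //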
 112   // we save r30 (lr) as the return PC at the base of the frame and
 113   // link r29 (fp) below it as the frame pointer, installing the
 114   // resulting sp (r31) value into fp.
 115   //
 116   // we save r0-r7, which accounts for all the c arguments.
 117   //
 118   // TODO: strictly do we need to save them all? they are treated as
 119   // volatile by C so could we omit saving the ones we are going to
 120   // place in global registers (thread? method?) or those we only use
 121   // during setup of the Java call?
 122   //
 123   // we don't need to save r8 which C uses as an indirect result location
 124   // return register.
 125   //
 126   // we don't need to save r9-r15 which both C and Java treat as
 127   // volatile
 128   //
 129   // we don't need to save r16-18 because Java does not use them
 130   //
 131   // we save r19-r28 which Java uses as scratch registers and C
 132   // expects to be callee-save
 133   //
 134   // we save the bottom 64 bits of each value stored in v8-v15; it is
 135   // the responsibility of the caller to preserve larger values.
 136   //
 137   // so the stub frame looks like this when we enter Java code
 138   //
 139   //     [ return_from_Java     ] <--- sp
 140   //     [ argument word n      ]
 141   //      ...
 142   // -27 [ argument word 1      ]
 143   // -26 [ saved v15            ] <--- sp_after_call
 144   // -25 [ saved v14            ]
 145   // -24 [ saved v13            ]
 146   // -23 [ saved v12            ]
 147   // -22 [ saved v11            ]
 148   // -21 [ saved v10            ]
 149   // -20 [ saved v9             ]
 150   // -19 [ saved v8             ]
 151   // -18 [ saved r28            ]
 152   // -17 [ saved r27            ]
 153   // -16 [ saved r26            ]
 154   // -15 [ saved r25            ]
 155   // -14 [ saved r24            ]
 156   // -13 [ saved r23            ]
 157   // -12 [ saved r22            ]
 158   // -11 [ saved r21            ]
 159   // -10 [ saved r20            ]
 160   //  -9 [ saved r19            ]
 161   //  -8 [ call wrapper    (r0) ]
 162   //  -7 [ result          (r1) ]
 163   //  -6 [ result type     (r2) ]
 164   //  -5 [ method          (r3) ]
 165   //  -4 [ entry point     (r4) ]
 166   //  -3 [ parameters      (r5) ]
 167   //  -2 [ parameter size  (r6) ]
 168   //  -1 [ thread (r7)          ]
 169   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 170   //   1 [ saved lr       (r30) ]
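  //
  // As a worked example of how the diagram maps onto code: with wordSize == 8,
  // thread_off == -1 below corresponds to Address(rfp, -8), the word just
  // under the saved fp, while d15_off == -26 corresponds to Address(rfp, -208),
  // which is also the location the sp_after_call address refers to once the
  // frame has been set up.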
 171 
 172   // Call stub stack layout word offsets from fp
 173   enum call_stub_layout {
 174     sp_after_call_off = -26,
 175 
 176     d15_off            = -26,
 177     d13_off            = -24,
 178     d11_off            = -22,
 179     d9_off             = -20,
 180 
 181     r28_off            = -18,
 182     r26_off            = -16,
 183     r24_off            = -14,
 184     r22_off            = -12,
 185     r20_off            = -10,
 186     call_wrapper_off   =  -8,
 187     result_off         =  -7,
 188     result_type_off    =  -6,
 189     method_off         =  -5,
 190     entry_point_off    =  -4,
 191     parameter_size_off =  -2,
 192     thread_off         =  -1,
 193     fp_f               =   0,
 194     retaddr_off        =   1,
 195   };
 196 
 197   address generate_call_stub(address& return_address) {
 198     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 199            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 200            "adjust this code");
 201 
 202     StubCodeMark mark(this, "StubRoutines", "call_stub");
 203     address start = __ pc();
 204 
 205     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 206 
 207     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 208     const Address result        (rfp, result_off         * wordSize);
 209     const Address result_type   (rfp, result_type_off    * wordSize);
 210     const Address method        (rfp, method_off         * wordSize);
 211     const Address entry_point   (rfp, entry_point_off    * wordSize);
 212     const Address parameter_size(rfp, parameter_size_off * wordSize);
 213 
 214     const Address thread        (rfp, thread_off         * wordSize);
 215 
 216     const Address d15_save      (rfp, d15_off * wordSize);
 217     const Address d13_save      (rfp, d13_off * wordSize);
 218     const Address d11_save      (rfp, d11_off * wordSize);
 219     const Address d9_save       (rfp, d9_off * wordSize);
 220 
 221     const Address r28_save      (rfp, r28_off * wordSize);
 222     const Address r26_save      (rfp, r26_off * wordSize);
 223     const Address r24_save      (rfp, r24_off * wordSize);
 224     const Address r22_save      (rfp, r22_off * wordSize);
 225     const Address r20_save      (rfp, r20_off * wordSize);
 226 
 227     // stub code
 228 
 229     address aarch64_entry = __ pc();
 230 
 231     // set up frame and move sp to end of save area
 232     __ enter();
 233     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 234 
 235     // save register parameters and Java scratch/global registers
 236     // n.b. we save thread even though it gets installed in
 237     // rthread because we want to sanity check rthread later
 238     __ str(c_rarg7,  thread);
 239     __ strw(c_rarg6, parameter_size);
 240     __ stp(c_rarg4, c_rarg5,  entry_point);
 241     __ stp(c_rarg2, c_rarg3,  result_type);
 242     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 243 
 244     __ stp(r20, r19,   r20_save);
 245     __ stp(r22, r21,   r22_save);
 246     __ stp(r24, r23,   r24_save);
 247     __ stp(r26, r25,   r26_save);
 248     __ stp(r28, r27,   r28_save);
 249 
 250     __ stpd(v9,  v8,   d9_save);
 251     __ stpd(v11, v10,  d11_save);
 252     __ stpd(v13, v12,  d13_save);
 253     __ stpd(v15, v14,  d15_save);
 254 
 255     // install Java thread in global register now that we have saved
 256     // whatever value it held
 257     __ mov(rthread, c_rarg7);
 258     // And method
 259     __ mov(rmethod, c_rarg3);
 260 
 261     // set up the heapbase register
 262     __ reinit_heapbase();
 263 
 264 #ifdef ASSERT
 265     // make sure we have no pending exceptions
 266     {
 267       Label L;
 268       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 269       __ cmp(rscratch1, (u1)NULL_WORD);
 270       __ br(Assembler::EQ, L);
 271       __ stop("StubRoutines::call_stub: entered with pending exception");
 272       __ BIND(L);
 273     }
 274 #endif
 275     // pass parameters if any
 276     __ mov(esp, sp);
 277     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 278     __ andr(sp, rscratch1, -2 * wordSize);
 279 
 280     BLOCK_COMMENT("pass parameters if any");
 281     Label parameters_done;
 282     // parameter count is still in c_rarg6
 283     // and parameter pointer identifying param 1 is in c_rarg5
 284     __ cbzw(c_rarg6, parameters_done);
 285 
 286     address loop = __ pc();
 287     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 288     __ subsw(c_rarg6, c_rarg6, 1);
 289     __ push(rscratch1);
 290     __ br(Assembler::GT, loop);
 291 
 292     __ BIND(parameters_done);
 293 
 294     // call Java entry -- passing Method*, and current sp
 295     //      rmethod: Method*
 296     //      r19_sender_sp: sender sp
 297     BLOCK_COMMENT("call Java function");
 298     __ mov(r19_sender_sp, sp);
 299     __ blr(c_rarg4);
 300 
 301     // we do this here because the notify will already have been done
 302     // if we get to the next instruction via an exception
 303     //
 304     // n.b. adding this instruction here affects the calculation of
 305     // whether or not a routine returns to the call stub (used when
 306     // doing stack walks) since the normal test is to check the return
 307     // pc against the address saved below. so we may need to allow for
 308     // this extra instruction in the check.
 309 
 310     // save current address for use by exception handling code
 311 
 312     return_address = __ pc();
 313 
 314     // store result depending on type (everything that is not
 315     // T_OBJECT, T_PRIMITIVE_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 316     // n.b. this assumes Java returns an integral result in r0
 317     // and a floating result in j_farg0
 318     // All of j_rargN may be used to return inline type fields so be careful
 319     // not to clobber those.
 320     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
 321     // assignment of Rresult below.
 322     Register Rresult = r14, Rresult_type = r15;
 323     __ ldr(Rresult, result);
 324     Label is_long, is_float, is_double, check_prim, exit;
 325     __ ldr(Rresult_type, result_type);
 326     __ cmp(Rresult_type, (u1)T_OBJECT);
 327     __ br(Assembler::EQ, check_prim);
 328     __ cmp(Rresult_type, (u1)T_PRIMITIVE_OBJECT);
 329     __ br(Assembler::EQ, check_prim);
 330     __ cmp(Rresult_type, (u1)T_LONG);
 331     __ br(Assembler::EQ, is_long);
 332     __ cmp(Rresult_type, (u1)T_FLOAT);
 333     __ br(Assembler::EQ, is_float);
 334     __ cmp(Rresult_type, (u1)T_DOUBLE);
 335     __ br(Assembler::EQ, is_double);
 336 
 337     // handle T_INT case
 338     __ strw(r0, Address(Rresult));
 339 
 340     __ BIND(exit);
 341 
 342     // pop parameters
 343     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 344 
 345 #ifdef ASSERT
 346     // verify that threads correspond
 347     {
 348       Label L, S;
 349       __ ldr(rscratch1, thread);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::NE, S);
 352       __ get_thread(rscratch1);
 353       __ cmp(rthread, rscratch1);
 354       __ br(Assembler::EQ, L);
 355       __ BIND(S);
 356       __ stop("StubRoutines::call_stub: threads must correspond");
 357       __ BIND(L);
 358     }
 359 #endif
 360 
 361     __ pop_cont_fastpath(rthread);
 362 
 363     // restore callee-save registers
 364     __ ldpd(v15, v14,  d15_save);
 365     __ ldpd(v13, v12,  d13_save);
 366     __ ldpd(v11, v10,  d11_save);
 367     __ ldpd(v9,  v8,   d9_save);
 368 
 369     __ ldp(r28, r27,   r28_save);
 370     __ ldp(r26, r25,   r26_save);
 371     __ ldp(r24, r23,   r24_save);
 372     __ ldp(r22, r21,   r22_save);
 373     __ ldp(r20, r19,   r20_save);
 374 
 375     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 376     __ ldrw(c_rarg2, result_type);
 377     __ ldr(c_rarg3,  method);
 378     __ ldp(c_rarg4, c_rarg5,  entry_point);
 379     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 380 
 381     // leave frame and return to caller
 382     __ leave();
 383     __ ret(lr);
 384 
 385     // handle return types different from T_INT
 386     __ BIND(check_prim);
 387     if (InlineTypeReturnedAsFields) {
 388       // Check for scalarized return value
 389       __ tbz(r0, 0, is_long);
 390       // Load pack handler address
 391       __ andr(rscratch1, r0, -2);
 392       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
 393       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
 394       __ blr(rscratch1);
 395       __ b(exit);
 396     }
 397 
 398     __ BIND(is_long);
 399     __ str(r0, Address(Rresult, 0));
 400     __ br(Assembler::AL, exit);
 401 
 402     __ BIND(is_float);
 403     __ strs(j_farg0, Address(Rresult, 0));
 404     __ br(Assembler::AL, exit);
 405 
 406     __ BIND(is_double);
 407     __ strd(j_farg0, Address(Rresult, 0));
 408     __ br(Assembler::AL, exit);
 409 
 410     return start;
 411   }
 412 
 413   // Return point for a Java call if there's an exception thrown in
 414   // Java code.  The exception is caught and transformed into a
 415   // pending exception stored in JavaThread that can be tested from
 416   // within the VM.
 417   //
 418   // Note: Usually the parameters are removed by the callee. In case
 419   // of an exception crossing an activation frame boundary, that is
 420   // not the case if the callee is compiled code => need to set up the
 421   // sp.
 422   //
 423   // r0: exception oop
 424 
 425   address generate_catch_exception() {
 426     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 427     address start = __ pc();
 428 
 429     // same as in generate_call_stub():
 430     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 431     const Address thread        (rfp, thread_off         * wordSize);
 432 
 433 #ifdef ASSERT
 434     // verify that threads correspond
 435     {
 436       Label L, S;
 437       __ ldr(rscratch1, thread);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::NE, S);
 440       __ get_thread(rscratch1);
 441       __ cmp(rthread, rscratch1);
 442       __ br(Assembler::EQ, L);
 443       __ bind(S);
 444       __ stop("StubRoutines::catch_exception: threads must correspond");
 445       __ bind(L);
 446     }
 447 #endif
 448 
 449     // set pending exception
 450     __ verify_oop(r0);
 451 
 452     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 453     __ mov(rscratch1, (address)__FILE__);
 454     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 455     __ movw(rscratch1, (int)__LINE__);
 456     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 457 
 458     // complete return to VM
 459     assert(StubRoutines::_call_stub_return_address != nullptr,
 460            "_call_stub_return_address must have been generated before");
 461     __ b(StubRoutines::_call_stub_return_address);
 462 
 463     return start;
 464   }
 465 
 466   // Continuation point for runtime calls returning with a pending
 467   // exception.  The pending exception check happened in the runtime
 468   // or native call stub.  The pending exception in Thread is
 469   // converted into a Java-level exception.
 470   //
 471   // Contract with Java-level exception handlers:
 472   // r0: exception
 473   // r3: throwing pc
 474   //
 475   // NOTE: At entry of this stub, exception-pc must be in LR !!
 476 
 477   // NOTE: this is always used as a jump target within generated code
 478   // so it just needs to be generated code with no prolog
 479 
 480   address generate_forward_exception() {
 481     StubCodeMark mark(this, "StubRoutines", "forward exception");
 482     address start = __ pc();
 483 
 484     // Upon entry, LR points to the return address returning into
 485     // Java (interpreted or compiled) code; i.e., the return address
 486     // becomes the throwing pc.
 487     //
 488     // Arguments pushed before the runtime call are still on the stack
 489     // but the exception handler will reset the stack pointer ->
 490     // ignore them.  A potential result in registers can be ignored as
 491     // well.
 492 
 493 #ifdef ASSERT
 494     // make sure this code is only executed if there is a pending exception
 495     {
 496       Label L;
 497       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 498       __ cbnz(rscratch1, L);
 499       __ stop("StubRoutines::forward exception: no pending exception (1)");
 500       __ bind(L);
 501     }
 502 #endif
 503 
 504     // compute exception handler into r19
 505 
 506     // call the VM to find the handler address associated with the
 507     // caller address. pass thread in r0 and caller pc (ret address)
 508     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 509     // the stack.
 510     __ mov(c_rarg1, lr);
 511     // lr will be trashed by the VM call so we move it to R19
 512     // (callee-saved) because we also need to pass it to the handler
 513     // returned by this call.
 514     __ mov(r19, lr);
 515     BLOCK_COMMENT("call exception_handler_for_return_address");
 516     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 517                          SharedRuntime::exception_handler_for_return_address),
 518                     rthread, c_rarg1);
 519     // Reinitialize the ptrue predicate register, in case the external runtime
 520     // call clobbers ptrue reg, as we may return to SVE compiled code.
 521     __ reinitialize_ptrue();
 522 
 523     // we should not really care that lr is no longer the callee
 524     // address. we saved the value the handler needs in r19 so we can
 525     // just copy it to r3. however, the C2 handler will push its own
 526     // frame and then calls into the VM and the VM code asserts that
 527     // the PC for the frame above the handler belongs to a compiled
 528     // Java method. So, we restore lr here to satisfy that assert.
 529     __ mov(lr, r19);
 530     // setup r0 & r3 & clear pending exception
 531     __ mov(r3, r19);
 532     __ mov(r19, r0);
 533     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 534     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 535 
 536 #ifdef ASSERT
 537     // make sure exception is set
 538     {
 539       Label L;
 540       __ cbnz(r0, L);
 541       __ stop("StubRoutines::forward exception: no pending exception (2)");
 542       __ bind(L);
 543     }
 544 #endif
 545 
 546     // continue at exception handler
 547     // r0: exception
 548     // r3: throwing pc
 549     // r19: exception handler
 550     __ verify_oop(r0);
 551     __ br(r19);
 552 
 553     return start;
 554   }
 555 
 556   // Non-destructive plausibility checks for oops
 557   //
 558   // Arguments:
 559   //    r0: oop to verify
 560   //    rscratch1: error message
 561   //
 562   // Stack after saving c_rarg3:
 563   //    [tos + 0]: saved c_rarg3
 564   //    [tos + 1]: saved c_rarg2
 565   //    [tos + 2]: saved lr
 566   //    [tos + 3]: saved rscratch2
 567   //    [tos + 4]: saved r0
 568   //    [tos + 5]: saved rscratch1
 569   address generate_verify_oop() {
 570 
 571     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 572     address start = __ pc();
 573 
 574     Label exit, error;
 575 
 576     // save c_rarg2 and c_rarg3
 577     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 578 
 579     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 580     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 581     __ ldr(c_rarg3, Address(c_rarg2));
 582     __ add(c_rarg3, c_rarg3, 1);
 583     __ str(c_rarg3, Address(c_rarg2));
 584 
 585     // object is in r0
 586     // make sure object is 'reasonable'
 587     __ cbz(r0, exit); // if obj is null it is OK
 588 
 589     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 590     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 591 
 592     // return if everything seems ok
 593     __ bind(exit);
 594 
 595     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 596     __ ret(lr);
 597 
 598     // handle errors
 599     __ bind(error);
 600     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 601 
 602     __ push(RegSet::range(r0, r29), sp);
 603     // debug(char* msg, int64_t pc, int64_t regs[])
 604     __ mov(c_rarg0, rscratch1);      // pass address of error message
 605     __ mov(c_rarg1, lr);             // pass return address
 606     __ mov(c_rarg2, sp);             // pass address of regs on stack
 607 #ifndef PRODUCT
 608     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 609 #endif
 610     BLOCK_COMMENT("call MacroAssembler::debug");
 611     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 612     __ blr(rscratch1);
 613     __ hlt(0);
 614 
 615     return start;
 616   }
 617 
 618   // Generate indices for iota vector.
 619   address generate_iota_indices(const char *stub_name) {
 620     __ align(CodeEntryAlignment);
 621     StubCodeMark mark(this, "StubRoutines", stub_name);
 622     address start = __ pc();
 623     // B
 624     __ emit_data64(0x0706050403020100, relocInfo::none);
 625     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 626     // H
 627     __ emit_data64(0x0003000200010000, relocInfo::none);
 628     __ emit_data64(0x0007000600050004, relocInfo::none);
 629     // S
 630     __ emit_data64(0x0000000100000000, relocInfo::none);
 631     __ emit_data64(0x0000000300000002, relocInfo::none);
 632     // D
 633     __ emit_data64(0x0000000000000000, relocInfo::none);
 634     __ emit_data64(0x0000000000000001, relocInfo::none);
 635     // S - FP
 636     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 637     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 638     // D - FP
 639     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 640     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 641     return start;
 642   }
 643 
 644   // The inner part of zero_words().  This is the bulk operation,
 645   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 646   // caller is responsible for zeroing the last few words.
 647   //
 648   // Inputs:
 649   // r10: the HeapWord-aligned base address of an array to zero.
 650   // r11: the count in HeapWords, r11 > 0.
 651   //
 652   // Returns r10 and r11, adjusted for the caller to clear.
 653   // r10: the base address of the tail of words left to clear.
 654   // r11: the number of words in the tail.
 655   //      r11 < MacroAssembler::zero_words_block_size.
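  //
  // A caller (MacroAssembler::zero_words() is the expected one) uses this
  // stub roughly as follows -- a sketch, not the exact generated sequence:
  //
  //   mov r10, base; mov r11, count
  //   // ... call the zero_blocks stub ...
  //   // then clear the r11 < zero_words_block_size trailing words at r10
  //   // with a short inline sequence of str/stp zr stores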
 656 
 657   address generate_zero_blocks() {
 658     Label done;
 659     Label base_aligned;
 660 
 661     Register base = r10, cnt = r11;
 662 
 663     __ align(CodeEntryAlignment);
 664     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 665     address start = __ pc();
 666 
 667     if (UseBlockZeroing) {
 668       int zva_length = VM_Version::zva_length();
 669 
 670       // Ensure ZVA length can be divided by 16. This is required by
 671       // the subsequent operations.
 672       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 673 
 674       __ tbz(base, 3, base_aligned);
 675       __ str(zr, Address(__ post(base, 8)));
 676       __ sub(cnt, cnt, 1);
 677       __ bind(base_aligned);
 678 
 679       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 680       // alignment.
 681       Label small;
 682       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 683       __ subs(rscratch1, cnt, low_limit >> 3);
 684       __ br(Assembler::LT, small);
 685       __ zero_dcache_blocks(base, cnt);
 686       __ bind(small);
 687     }
 688 
 689     {
 690       // Number of stp instructions we'll unroll
 691       const int unroll =
 692         MacroAssembler::zero_words_block_size / 2;
 693       // Clear the remaining blocks.
 694       Label loop;
 695       __ subs(cnt, cnt, unroll * 2);
 696       __ br(Assembler::LT, done);
 697       __ bind(loop);
 698       for (int i = 0; i < unroll; i++)
 699         __ stp(zr, zr, __ post(base, 16));
 700       __ subs(cnt, cnt, unroll * 2);
 701       __ br(Assembler::GE, loop);
 702       __ bind(done);
 703       __ add(cnt, cnt, unroll * 2);
 704     }
 705 
 706     __ ret(lr);
 707 
 708     return start;
 709   }
 710 
 711 
 712   typedef enum {
 713     copy_forwards = 1,
 714     copy_backwards = -1
 715   } copy_direction;
 716 
 717   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 718   // for arraycopy stubs.
 719   class ArrayCopyBarrierSetHelper : StackObj {
 720     BarrierSetAssembler* _bs_asm;
 721     MacroAssembler* _masm;
 722     DecoratorSet _decorators;
 723     BasicType _type;
 724     Register _gct1;
 725     Register _gct2;
 726     Register _gct3;
 727     FloatRegister _gcvt1;
 728     FloatRegister _gcvt2;
 729     FloatRegister _gcvt3;
 730 
 731   public:
 732     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 733                               DecoratorSet decorators,
 734                               BasicType type,
 735                               Register gct1,
 736                               Register gct2,
 737                               Register gct3,
 738                               FloatRegister gcvt1,
 739                               FloatRegister gcvt2,
 740                               FloatRegister gcvt3)
 741       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 742         _masm(masm),
 743         _decorators(decorators),
 744         _type(type),
 745         _gct1(gct1),
 746         _gct2(gct2),
 747         _gct3(gct3),
 748         _gcvt1(gcvt1),
 749         _gcvt2(gcvt2),
 750         _gcvt3(gcvt3) {
 751     }
 752 
 753     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 754       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 755                             dst1, dst2, src,
 756                             _gct1, _gct2, _gcvt1);
 757     }
 758 
 759     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 760       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 761                              dst, src1, src2,
 762                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 763     }
 764 
 765     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 766       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 767                             dst1, dst2, src,
 768                             _gct1);
 769     }
 770 
 771     void copy_store_at_16(Address dst, Register src1, Register src2) {
 772       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 773                              dst, src1, src2,
 774                              _gct1, _gct2, _gct3);
 775     }
 776 
 777     void copy_load_at_8(Register dst, Address src) {
 778       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 779                             dst, noreg, src,
 780                             _gct1);
 781     }
 782 
 783     void copy_store_at_8(Address dst, Register src) {
 784       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 785                              dst, src, noreg,
 786                              _gct1, _gct2, _gct3);
 787     }
 788   };
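
  // Typical use in the copy routines below -- moving 16 bytes through the
  // GC-aware load/store helpers (the register names here are illustrative):
  //
  //   ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3,
  //                                gcvt1, gcvt2, gcvt3);
  //   bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  //   bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);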
 789 
 790   // Bulk copy of blocks of 8 words.
 791   //
 792   // count is a count of words.
 793   //
 794   // Precondition: count >= 8
 795   //
 796   // Postconditions:
 797   //
 798   // The least significant bit of count contains the remaining count
 799   // of words to copy.  The rest of count is trash.
 800   //
 801   // s and d are adjusted to point to the remaining words to copy
 802   //
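  // For example, entering with count == 19 the stub copies 18 words, leaves
  // bit 0 of count set (one word still to copy) and adjusts s and d to point
  // at that remaining word.
  //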
 803   void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
 804                            copy_direction direction) {
 805     int unit = wordSize * direction;
 806     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 807 
 808     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 809       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 810     const Register stride = r14;
 811     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 812     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 813     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 814 
 815     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 816     assert_different_registers(s, d, count, rscratch1, rscratch2);
 817 
 818     Label again, drain;
 819     const char *stub_name;
 820     if (direction == copy_forwards)
 821       stub_name = "forward_copy_longs";
 822     else
 823       stub_name = "backward_copy_longs";
 824 
 825     __ align(CodeEntryAlignment);
 826 
 827     StubCodeMark mark(this, "StubRoutines", stub_name);
 828 
 829     __ bind(start);
 830 
 831     Label unaligned_copy_long;
 832     if (AvoidUnalignedAccesses) {
 833       __ tbnz(d, 3, unaligned_copy_long);
 834     }
 835 
 836     if (direction == copy_forwards) {
 837       __ sub(s, s, bias);
 838       __ sub(d, d, bias);
 839     }
 840 
 841 #ifdef ASSERT
 842     // Make sure we are never given < 8 words
 843     {
 844       Label L;
 845       __ cmp(count, (u1)8);
 846       __ br(Assembler::GE, L);
 847       __ stop("generate_copy_longs called with < 8 words");
 848       __ bind(L);
 849     }
 850 #endif
 851 
 852     // Fill 8 registers
 853     if (UseSIMDForMemoryOps) {
 854       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 855       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 856     } else {
 857       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 858       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 859       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 860       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 861     }
 862 
 863     __ subs(count, count, 16);
 864     __ br(Assembler::LO, drain);
 865 
 866     int prefetch = PrefetchCopyIntervalInBytes;
 867     bool use_stride = false;
 868     if (direction == copy_backwards) {
 869        use_stride = prefetch > 256;
 870        prefetch = -prefetch;
 871        if (use_stride) __ mov(stride, prefetch);
 872     }
 873 
 874     __ bind(again);
 875 
 876     if (PrefetchCopyIntervalInBytes > 0)
 877       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 878 
 879     if (UseSIMDForMemoryOps) {
 880       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 881       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 882       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 883       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 884     } else {
 885       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 886       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 887       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 888       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 889       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 890       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 891       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 892       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 893     }
 894 
 895     __ subs(count, count, 8);
 896     __ br(Assembler::HS, again);
 897 
 898     // Drain
 899     __ bind(drain);
 900     if (UseSIMDForMemoryOps) {
 901       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 902       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 903     } else {
 904       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 905       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 906       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 907       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 908     }
 909 
 910     {
 911       Label L1, L2;
 912       __ tbz(count, exact_log2(4), L1);
 913       if (UseSIMDForMemoryOps) {
 914         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 915         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 916       } else {
 917         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 918         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 919         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 920         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 921       }
 922       __ bind(L1);
 923 
 924       if (direction == copy_forwards) {
 925         __ add(s, s, bias);
 926         __ add(d, d, bias);
 927       }
 928 
 929       __ tbz(count, 1, L2);
 930       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 931       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 932       __ bind(L2);
 933     }
 934 
 935     __ ret(lr);
 936 
 937     if (AvoidUnalignedAccesses) {
 938       Label drain, again;
 939       // Register order for storing. Order is different for backward copy.
 940 
 941       __ bind(unaligned_copy_long);
 942 
 943       // source address is even aligned, target odd aligned
 944       //
 945       // when forward copying word pairs we read long pairs at offsets
 946       // {0, 2, 4, 6} (in long words). when backwards copying we read
 947       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 948       // address by -2 in the forwards case so we can compute the
 949       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 950       // or -1.
 951       //
 952       // when forward copying we need to store 1 word, 3 pairs and
 953       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 954       // zero offset we adjust the destination by -1, which means we
 955       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 956       //
 957       // When backwards copying we need to store 1 word, 3 pairs and
 958       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 959       // offsets {1, 3, 5, 7, 8} * unit.
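      //
      // As a concrete instance of the above for forward copying (unit == 8):
      // after the initial s -= 16 and d -= 8 adjustments, the first 16 byte
      // load at Address(s, 2 * unit) reads the original words 0 and 1, and
      // the first single-word store at Address(d, 1 * unit) puts word 0 back
      // at its original, odd-aligned destination.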
 960 
 961       if (direction == copy_forwards) {
 962         __ sub(s, s, 16);
 963         __ sub(d, d, 8);
 964       }
 965 
 966       // Fill 8 registers
 967       //
 968       // for forwards copy s was offset by -16 from the original input
 969       // value of s so the register contents are at these offsets
 970       // relative to the 64 bit block addressed by that original input
 971       // and so on for each successive 64 byte block when s is updated
 972       //
 973       // t0 at offset 0,  t1 at offset 8
 974       // t2 at offset 16, t3 at offset 24
 975       // t4 at offset 32, t5 at offset 40
 976       // t6 at offset 48, t7 at offset 56
 977 
 978       // for backwards copy s was not offset so the register contents
 979       // are at these offsets into the preceding 64 byte block
 980       // relative to that original input and so on for each successive
 981       // preceding 64 byte block when s is updated. this explains the
 982       // slightly counter-intuitive looking pattern of register usage
 983       // in the stp instructions for backwards copy.
 984       //
 985       // t0 at offset -16, t1 at offset -8
 986       // t2 at offset -32, t3 at offset -24
 987       // t4 at offset -48, t5 at offset -40
 988       // t6 at offset -64, t7 at offset -56
 989 
 990       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 991       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 992       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 993       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 994 
 995       __ subs(count, count, 16);
 996       __ br(Assembler::LO, drain);
 997 
 998       int prefetch = PrefetchCopyIntervalInBytes;
 999       bool use_stride = false;
1000       if (direction == copy_backwards) {
1001          use_stride = prefetch > 256;
1002          prefetch = -prefetch;
1003          if (use_stride) __ mov(stride, prefetch);
1004       }
1005 
1006       __ bind(again);
1007 
1008       if (PrefetchCopyIntervalInBytes > 0)
1009         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1010 
1011       if (direction == copy_forwards) {
1012        // allowing for the offset of -8 the store instructions place
1013        // registers into the target 64 bit block at the following
1014        // offsets
1015        //
1016        // t0 at offset 0
1017        // t1 at offset 8,  t2 at offset 16
1018        // t3 at offset 24, t4 at offset 32
1019        // t5 at offset 40, t6 at offset 48
1020        // t7 at offset 56
1021 
1022         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1023         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1024         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1025         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1026         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1027         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1028         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1029         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1030         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1031       } else {
1032        // d was not offset when we started so the registers are
1033        // written into the 64 bit block preceding d with the following
1034        // offsets
1035        //
1036        // t1 at offset -8
1037        // t3 at offset -24, t0 at offset -16
1038        // t5 at offset -40, t2 at offset -32
1039        // t7 at offset -56, t4 at offset -48
1040        //                   t6 at offset -64
1041        //
1042        // note that this matches the offsets previously noted for the
1043        // loads
1044 
1045         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1046         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1047         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1048         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1049         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1050         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1051         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1052         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1053         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1054       }
1055 
1056       __ subs(count, count, 8);
1057       __ br(Assembler::HS, again);
1058 
1059       // Drain
1060       //
1061       // this uses the same pattern of offsets and register arguments
1062       // as above
1063       __ bind(drain);
1064       if (direction == copy_forwards) {
1065         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1066         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1067         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1068         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1069         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1070       } else {
1071         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1072         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1073         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1074         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1075         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1076       }
1077       // now we need to copy any remaining part block which may
1078       // include a 4 word subblock and/or a 2 word subblock.
1079       // bits 2 and 1 in the count are the tell-tale for whether we
1080       // have each such subblock
1081       {
1082         Label L1, L2;
1083         __ tbz(count, exact_log2(4), L1);
1084        // this is the same as above but copying only 4 longs hence
1085        // with only one intervening stp between the str instructions
1086        // but note that the offsets and registers still follow the
1087        // same pattern
1088         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1089         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1090         if (direction == copy_forwards) {
1091           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1092           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1093           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1094         } else {
1095           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1096           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1097           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1098         }
1099         __ bind(L1);
1100 
1101         __ tbz(count, 1, L2);
1102        // this is the same as above but copying only 2 longs hence
1103        // there is no intervening stp between the str instructions
1104        // but note that the offset and register patterns are still
1105        // the same
1106         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1107         if (direction == copy_forwards) {
1108           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1109           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1110         } else {
1111           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1112           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1113         }
1114         __ bind(L2);
1115 
1116        // for forwards copy we need to re-adjust the offsets we
1117        // applied so that s and d follow the last words written
1118 
1119        if (direction == copy_forwards) {
1120          __ add(s, s, 16);
1121          __ add(d, d, 8);
1122        }
1123 
1124       }
1125 
1126       __ ret(lr);
1127       }
1128   }
1129 
1130   // Small copy: less than 16 bytes.
1131   //
1132   // NB: Ignores all of the bits of count which represent more than 15
1133   // bytes, so a caller doesn't have to mask them.
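  //
  // For example, a byte copy (granularity 1) entered with count == 13
  // (0b1101) copies 8 + 4 + 1 bytes: bit 3 selects the word copy, bit 2 the
  // int copy, bit 1 (clear) skips the short copy, and bit 0 the final byte.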
1134 
1135   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1136     bool is_backwards = step < 0;
1137     size_t granularity = uabs(step);
1138     int direction = is_backwards ? -1 : 1;
1139 
1140     Label Lword, Lint, Lshort, Lbyte;
1141 
1142     assert(granularity
1143            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1144 
1145     const Register t0 = r3;
1146     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1147     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1148 
1149     // ??? I don't know if this bit-test-and-branch is the right thing
1150     // to do.  It does a lot of jumping, resulting in several
1151     // mispredicted branches.  It might make more sense to do this
1152     // with something like Duff's device with a single computed branch.
1153 
1154     __ tbz(count, 3 - exact_log2(granularity), Lword);
1155     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1156     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1157     __ bind(Lword);
1158 
1159     if (granularity <= sizeof (jint)) {
1160       __ tbz(count, 2 - exact_log2(granularity), Lint);
1161       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1162       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1163       __ bind(Lint);
1164     }
1165 
1166     if (granularity <= sizeof (jshort)) {
1167       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1168       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1169       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1170       __ bind(Lshort);
1171     }
1172 
1173     if (granularity <= sizeof (jbyte)) {
1174       __ tbz(count, 0, Lbyte);
1175       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1176       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1177       __ bind(Lbyte);
1178     }
1179   }
1180 
1181   Label copy_f, copy_b;
1182   Label copy_obj_f, copy_obj_b;
1183   Label copy_obj_uninit_f, copy_obj_uninit_b;
1184 
1185   // All-singing all-dancing memory copy.
1186   //
1187   // Copy count units of memory from s to d.  The size of a unit is
1188   // step, which can be positive or negative depending on the direction
1189   // of copy.  If is_aligned is false, we align the source address.
1190   //
1191 
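  // A hypothetical byte-array disjoint copy stub, for instance, would invoke
  // this roughly as follows (the arguments shown are illustrative, not the
  // exact call sites elsewhere in this file):
  //
  //   copy_memory(decorators, T_BYTE, /*is_aligned*/ false,
  //               c_rarg0 /*s*/, c_rarg1 /*d*/, c_rarg2 /*count*/, sizeof (jbyte));
  //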
1192   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1193                    Register s, Register d, Register count, int step) {
1194     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1195     bool is_backwards = step < 0;
1196     unsigned int granularity = uabs(step);
1197     const Register t0 = r3, t1 = r4;
1198 
1199     // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't
1200     // matter because we always load all the data before writing anything.
1201     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1202     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1203     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1204     const Register send = r17, dend = r16;
1205     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1206     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1207     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1208 
1209     if (PrefetchCopyIntervalInBytes > 0)
1210       __ prfm(Address(s, 0), PLDL1KEEP);
1211     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1212     __ br(Assembler::HI, copy_big);
1213 
1214     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1215     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1216 
1217     __ cmp(count, u1(16/granularity));
1218     __ br(Assembler::LS, copy16);
1219 
1220     __ cmp(count, u1(64/granularity));
1221     __ br(Assembler::HI, copy80);
1222 
1223     __ cmp(count, u1(32/granularity));
1224     __ br(Assembler::LS, copy32);
1225 
1226     // 33..64 bytes
1227     if (UseSIMDForMemoryOps) {
1228       bs.copy_load_at_32(v0, v1, Address(s, 0));
1229       bs.copy_load_at_32(v2, v3, Address(send, -32));
1230       bs.copy_store_at_32(Address(d, 0), v0, v1);
1231       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1232     } else {
1233       bs.copy_load_at_16(t0, t1, Address(s, 0));
1234       bs.copy_load_at_16(t2, t3, Address(s, 16));
1235       bs.copy_load_at_16(t4, t5, Address(send, -32));
1236       bs.copy_load_at_16(t6, t7, Address(send, -16));
1237 
1238       bs.copy_store_at_16(Address(d, 0), t0, t1);
1239       bs.copy_store_at_16(Address(d, 16), t2, t3);
1240       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1241       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1242     }
1243     __ b(finish);
1244 
1245     // 17..32 bytes
1246     __ bind(copy32);
1247     bs.copy_load_at_16(t0, t1, Address(s, 0));
1248     bs.copy_load_at_16(t6, t7, Address(send, -16));
1249 
1250     bs.copy_store_at_16(Address(d, 0), t0, t1);
1251     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1252     __ b(finish);
1253 
1254     // 65..80/96 bytes
1255     // (96 bytes if SIMD because we do 32 bytes per instruction)
1256     __ bind(copy80);
1257     if (UseSIMDForMemoryOps) {
1258       bs.copy_load_at_32(v0, v1, Address(s, 0));
1259       bs.copy_load_at_32(v2, v3, Address(s, 32));
1260       // Unaligned pointers can be an issue for copying.
1261       // The issue has more chances to happen when granularity of data is
1262       // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
1263       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1264       // The most performance drop has been seen for the range 65-80 bytes.
1265       // For such cases using the pair of ldp/stp instead of the third pair of
1266       // ldpq/stpq fixes the performance issue.
1267       if (granularity < sizeof (jint)) {
1268         Label copy96;
1269         __ cmp(count, u1(80/granularity));
1270         __ br(Assembler::HI, copy96);
1271         bs.copy_load_at_16(t0, t1, Address(send, -16));
1272 
1273         bs.copy_store_at_32(Address(d, 0), v0, v1);
1274         bs.copy_store_at_32(Address(d, 32), v2, v3);
1275 
1276         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1277         __ b(finish);
1278 
1279         __ bind(copy96);
1280       }
1281       bs.copy_load_at_32(v4, v5, Address(send, -32));
1282 
1283       bs.copy_store_at_32(Address(d, 0), v0, v1);
1284       bs.copy_store_at_32(Address(d, 32), v2, v3);
1285 
1286       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1287     } else {
1288       bs.copy_load_at_16(t0, t1, Address(s, 0));
1289       bs.copy_load_at_16(t2, t3, Address(s, 16));
1290       bs.copy_load_at_16(t4, t5, Address(s, 32));
1291       bs.copy_load_at_16(t6, t7, Address(s, 48));
1292       bs.copy_load_at_16(t8, t9, Address(send, -16));
1293 
1294       bs.copy_store_at_16(Address(d, 0), t0, t1);
1295       bs.copy_store_at_16(Address(d, 16), t2, t3);
1296       bs.copy_store_at_16(Address(d, 32), t4, t5);
1297       bs.copy_store_at_16(Address(d, 48), t6, t7);
1298       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1299     }
1300     __ b(finish);
1301 
1302     // 0..16 bytes
1303     __ bind(copy16);
1304     __ cmp(count, u1(8/granularity));
1305     __ br(Assembler::LO, copy8);
1306 
1307     // 8..16 bytes
1308     bs.copy_load_at_8(t0, Address(s, 0));
1309     bs.copy_load_at_8(t1, Address(send, -8));
1310     bs.copy_store_at_8(Address(d, 0), t0);
1311     bs.copy_store_at_8(Address(dend, -8), t1);
1312     __ b(finish);
1313 
1314     if (granularity < 8) {
1315       // 4..7 bytes
1316       __ bind(copy8);
1317       __ tbz(count, 2 - exact_log2(granularity), copy4);
1318       __ ldrw(t0, Address(s, 0));
1319       __ ldrw(t1, Address(send, -4));
1320       __ strw(t0, Address(d, 0));
1321       __ strw(t1, Address(dend, -4));
1322       __ b(finish);
1323       if (granularity < 4) {
1324         // 0..3 bytes
1325         __ bind(copy4);
1326         __ cbz(count, finish); // get rid of 0 case
1327         if (granularity == 2) {
1328           __ ldrh(t0, Address(s, 0));
1329           __ strh(t0, Address(d, 0));
1330         } else { // granularity == 1
1331           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1332           // the first and last byte.
1333           // Handle the 3 byte case by loading and storing base + count/2
1334           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1335           // This means that in the 1 byte case we load/store the same
1336           // byte 3 times.
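          // (E.g. with count == 3: count becomes 1, so we copy s[0] -> d[0],
          // s[2] -> d[2] via the end pointers, and s[1] -> d[1] via base + count.)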
1337           __ lsr(count, count, 1);
1338           __ ldrb(t0, Address(s, 0));
1339           __ ldrb(t1, Address(send, -1));
1340           __ ldrb(t2, Address(s, count));
1341           __ strb(t0, Address(d, 0));
1342           __ strb(t1, Address(dend, -1));
1343           __ strb(t2, Address(d, count));
1344         }
1345         __ b(finish);
1346       }
1347     }
1348 
1349     __ bind(copy_big);
1350     if (is_backwards) {
1351       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1352       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1353     }
1354 
1355     // Now we've got the small case out of the way we can align the
1356     // source address on a 2-word boundary.
1357 
1358     // Here we will materialize a count in r15, which is used by copy_memory_small
1359     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1360     // Up until here, we have used t9, which aliases r15, but from here on, that register
1361     // cannot be used as a temp register, as it contains the count.
1362 
1363     Label aligned;
1364 
1365     if (is_aligned) {
1366       // We may have to adjust by 1 word to get s 2-word-aligned.
1367       __ tbz(s, exact_log2(wordSize), aligned);
1368       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1369       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1370       __ sub(count, count, wordSize/granularity);
1371     } else {
1372       if (is_backwards) {
1373         __ andr(r15, s, 2 * wordSize - 1);
1374       } else {
1375         __ neg(r15, s);
1376         __ andr(r15, r15, 2 * wordSize - 1);
1377       }
1378       // r15 is the byte adjustment needed to align s.
1379       __ cbz(r15, aligned);
1380       int shift = exact_log2(granularity);
1381       if (shift)  __ lsr(r15, r15, shift);
1382       __ sub(count, count, r15);
1383 
1384 #if 0
1385       // ?? This code is only correct for a disjoint copy.  It may or
1386       // may not make sense to use it in that case.
1387 
1388       // Copy the first pair; s and d may not be aligned.
1389       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1390       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1391 
1392       // Align s and d, adjust count
1393       if (is_backwards) {
1394         __ sub(s, s, r15);
1395         __ sub(d, d, r15);
1396       } else {
1397         __ add(s, s, r15);
1398         __ add(d, d, r15);
1399       }
1400 #else
1401       copy_memory_small(decorators, type, s, d, r15, step);
1402 #endif
1403     }
1404 
1405     __ bind(aligned);
1406 
1407     // s is now 2-word-aligned.
1408 
1409     // We have a count of units and some trailing bytes.  Adjust the
1410     // count and do a bulk copy of words.
1411     __ lsr(r15, count, exact_log2(wordSize/granularity));
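         // r15 = count expressed in 8-byte words, i.e. the number of whole words
         // handed to the bulk copy stub called below.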
1412     if (direction == copy_forwards) {
1413       if (type != T_OBJECT) {
1414         __ bl(copy_f);
1415       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1416         __ bl(copy_obj_uninit_f);
1417       } else {
1418         __ bl(copy_obj_f);
1419       }
1420     } else {
1421       if (type != T_OBJECT) {
1422         __ bl(copy_b);
1423       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1424         __ bl(copy_obj_uninit_b);
1425       } else {
1426         __ bl(copy_obj_b);
1427       }
1428     }
1429 
1430     // And the tail.
1431     copy_memory_small(decorators, type, s, d, count, step);
1432 
1433     if (granularity >= 8) __ bind(copy8);
1434     if (granularity >= 4) __ bind(copy4);
1435     __ bind(finish);
1436   }
1437 
1438 
1439   void clobber_registers() {
1440 #ifdef ASSERT
1441     RegSet clobbered
1442       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1443     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1444     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
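         // rscratch1 now holds 0xdeadbeefdeadbeef (the 32-bit pattern replicated
         // into the high word).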
1445     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1446       __ mov(*it, rscratch1);
1447     }
1448 #endif
1449 
1450   }
1451 
1452   // Scan over array at a for count oops, verifying each one.
1453   // Preserves a and count, clobbers rscratch1 and rscratch2.
1454   void verify_oop_array (int size, Register a, Register count, Register temp) {
1455     Label loop, end;
1456     __ mov(rscratch1, a);
1457     __ mov(rscratch2, zr);
1458     __ bind(loop);
1459     __ cmp(rscratch2, count);
1460     __ br(Assembler::HS, end);
1461     if (size == wordSize) {
1462       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1463       __ verify_oop(temp);
1464     } else {
1465       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1466       __ decode_heap_oop(temp); // calls verify_oop
1467     }
1468     __ add(rscratch2, rscratch2, 1);
1469     __ b(loop);
1470     __ bind(end);
1471   }
1472 
1473   // Arguments:
1474   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1475   //             ignored
1476   //   is_oop  - true => oop array, so generate store check code
1477   //   name    - stub name string
1478   //
1479   // Inputs:
1480   //   c_rarg0   - source array address
1481   //   c_rarg1   - destination array address
1482   //   c_rarg2   - element count, treated as ssize_t, can be zero
1483   //
1484   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1485   // the hardware handle it.  The two dwords within qwords that span
1486   // cache line boundaries will still be loaded and stored atomically.
1487   //
1488   // Side Effects:
1489   //   If entry is not null, *entry is set to the no-overlap entry point,
1490   //   which is used by the corresponding generate_conjoint_copy() stub.
1491   //
1492   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1493                                   const char *name, bool dest_uninitialized = false) {
1494     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1495     RegSet saved_reg = RegSet::of(s, d, count);
1496     __ align(CodeEntryAlignment);
1497     StubCodeMark mark(this, "StubRoutines", name);
1498     address start = __ pc();
1499     __ enter();
1500 
1501     if (entry != nullptr) {
1502       *entry = __ pc();
1503       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1504       BLOCK_COMMENT("Entry:");
1505     }
1506 
1507     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1508     if (dest_uninitialized) {
1509       decorators |= IS_DEST_UNINITIALIZED;
1510     }
1511     if (aligned) {
1512       decorators |= ARRAYCOPY_ALIGNED;
1513     }
1514 
1515     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1516     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1517 
1518     if (is_oop) {
1519       // save regs before copy_memory
1520       __ push(RegSet::of(d, count), sp);
1521     }
1522     {
1523       // UnsafeCopyMemory page error: continue after ucm
1524       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
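           // The aligned 8-byte stubs also get an entry because they double as
           // the unaligned jlong stubs (see generate_arraycopy_stubs() below).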
1525       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1526       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1527     }
1528 
1529     if (is_oop) {
1530       __ pop(RegSet::of(d, count), sp);
1531       if (VerifyOops)
1532         verify_oop_array(size, d, count, r16);
1533     }
1534 
1535     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1536 
1537     __ leave();
1538     __ mov(r0, zr); // return 0
1539     __ ret(lr);
1540     return start;
1541   }
1542 
1543   // Arguments:
1544   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1545   //             ignored
1546   //   is_oop  - true => oop array, so generate store check code
1547   //   name    - stub name string
1548   //
1549   // Inputs:
1550   //   c_rarg0   - source array address
1551   //   c_rarg1   - destination array address
1552   //   c_rarg2   - element count, treated as ssize_t, can be zero
1553   //
1554   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1555   // the hardware handle it.  The two dwords within qwords that span
1556   // cache line boundaries will still be loaded and stored atomically.
1557   //
1558   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1559                                  address *entry, const char *name,
1560                                  bool dest_uninitialized = false) {
1561     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1562     RegSet saved_regs = RegSet::of(s, d, count);
1563     StubCodeMark mark(this, "StubRoutines", name);
1564     address start = __ pc();
1565     __ enter();
1566 
1567     if (entry != nullptr) {
1568       *entry = __ pc();
1569       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1570       BLOCK_COMMENT("Entry:");
1571     }
1572 
1573     // use fwd copy when (d-s) above_equal (count*size)
1574     __ sub(rscratch1, d, s);
1575     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1576     __ br(Assembler::HS, nooverlap_target);
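         // The comparison is unsigned, so a negative d - s (destination below
         // source) also takes the branch; copying forwards is always safe then.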
1577 
1578     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1579     if (dest_uninitialized) {
1580       decorators |= IS_DEST_UNINITIALIZED;
1581     }
1582     if (aligned) {
1583       decorators |= ARRAYCOPY_ALIGNED;
1584     }
1585 
1586     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1587     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1588 
1589     if (is_oop) {
1590       // save regs before copy_memory
1591       __ push(RegSet::of(d, count), sp);
1592     }
1593     {
1594       // UnsafeCopyMemory page error: continue after ucm
1595       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1596       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1597       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1598     }
1599     if (is_oop) {
1600       __ pop(RegSet::of(d, count), sp);
1601       if (VerifyOops)
1602         verify_oop_array(size, d, count, r16);
1603     }
1604     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1605     __ leave();
1606     __ mov(r0, zr); // return 0
1607     __ ret(lr);
1608     return start;
1609   }
1610 
1611   // Arguments:
1612   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1613   //             ignored
1614   //   name    - stub name string
1615   //
1616   // Inputs:
1617   //   c_rarg0   - source array address
1618   //   c_rarg1   - destination array address
1619   //   c_rarg2   - element count, treated as ssize_t, can be zero
1620   //
1621   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1622   // we let the hardware handle it.  The one to eight bytes within words,
1623   // dwords or qwords that span cache line boundaries will still be loaded
1624   // and stored atomically.
1625   //
1633   // Side Effects:
1634   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1635   //   used by generate_conjoint_byte_copy().
1636   //
1637   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1638     const bool not_oop = false;
1639     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1640   }
1641 
1642   // Arguments:
1643   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1644   //             ignored
1645   //   name    - stub name string
1646   //
1647   // Inputs:
1648   //   c_rarg0   - source array address
1649   //   c_rarg1   - destination array address
1650   //   c_rarg2   - element count, treated as ssize_t, can be zero
1651   //
1652   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1653   // we let the hardware handle it.  The one to eight bytes within words,
1654   // dwords or qwords that span cache line boundaries will still be loaded
1655   // and stored atomically.
1656   //
1657   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1658                                       address* entry, const char *name) {
1659     const bool not_oop = false;
1660     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1661   }
1662 
1663   // Arguments:
1664   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1665   //             ignored
1666   //   name    - stub name string
1667   //
1668   // Inputs:
1669   //   c_rarg0   - source array address
1670   //   c_rarg1   - destination array address
1671   //   c_rarg2   - element count, treated as ssize_t, can be zero
1672   //
1673   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1674   // let the hardware handle it.  The two or four words within dwords
1675   // or qwords that span cache line boundaries will still be loaded
1676   // and stored atomically.
1677   //
1678   // Side Effects:
1679   //   disjoint_short_copy_entry is set to the no-overlap entry point
1680   //   used by generate_conjoint_short_copy().
1681   //
1682   address generate_disjoint_short_copy(bool aligned,
1683                                        address* entry, const char *name) {
1684     const bool not_oop = false;
1685     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1686   }
1687 
1688   // Arguments:
1689   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1690   //             ignored
1691   //   name    - stub name string
1692   //
1693   // Inputs:
1694   //   c_rarg0   - source array address
1695   //   c_rarg1   - destination array address
1696   //   c_rarg2   - element count, treated as ssize_t, can be zero
1697   //
1698   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1699   // let the hardware handle it.  The two or four words within dwords
1700   // or qwords that span cache line boundaries will still be loaded
1701   // and stored atomically.
1702   //
1703   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1704                                        address *entry, const char *name) {
1705     const bool not_oop = false;
1706     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1708   }

1709   // Arguments:
1710   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1711   //             ignored
1712   //   name    - stub name string
1713   //
1714   // Inputs:
1715   //   c_rarg0   - source array address
1716   //   c_rarg1   - destination array address
1717   //   c_rarg2   - element count, treated as ssize_t, can be zero
1718   //
1719   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1720   // the hardware handle it.  The two dwords within qwords that span
1721   // cache line boundaries will still be loaded and stored atomically.
1722   //
1723   // Side Effects:
1724   //   disjoint_int_copy_entry is set to the no-overlap entry point
1725   //   used by generate_conjoint_int_oop_copy().
1726   //
1727   address generate_disjoint_int_copy(bool aligned, address *entry,
1728                                          const char *name, bool dest_uninitialized = false) {
1729     const bool not_oop = false;
1730     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1731   }
1732 
1733   // Arguments:
1734   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1735   //             ignored
1736   //   name    - stub name string
1737   //
1738   // Inputs:
1739   //   c_rarg0   - source array address
1740   //   c_rarg1   - destination array address
1741   //   c_rarg2   - element count, treated as ssize_t, can be zero
1742   //
1743   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1744   // the hardware handle it.  The two dwords within qwords that span
1745   // cache line boundaries will still be loaded and stored atomically.
1746   //
1747   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1748                                      address *entry, const char *name,
1749                                      bool dest_uninitialized = false) {
1750     const bool not_oop = false;
1751     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1752   }
1753 
1754 
1755   // Arguments:
1756   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1757   //             ignored
1758   //   name    - stub name string
1759   //
1760   // Inputs:
1761   //   c_rarg0   - source array address
1762   //   c_rarg1   - destination array address
1763   //   c_rarg2   - element count, treated as size_t, can be zero
1764   //
1765   // Side Effects:
1766   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1767   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1768   //
1769   address generate_disjoint_long_copy(bool aligned, address *entry,
1770                                           const char *name, bool dest_uninitialized = false) {
1771     const bool not_oop = false;
1772     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1773   }
1774 
1775   // Arguments:
1776   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1777   //             ignored
1778   //   name    - stub name string
1779   //
1780   // Inputs:
1781   //   c_rarg0   - source array address
1782   //   c_rarg1   - destination array address
1783   //   c_rarg2   - element count, treated as size_t, can be zero
1784   //
1785   address generate_conjoint_long_copy(bool aligned,
1786                                       address nooverlap_target, address *entry,
1787                                       const char *name, bool dest_uninitialized = false) {
1788     const bool not_oop = false;
1789     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1790   }
1791 
1792   // Arguments:
1793   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1794   //             ignored
1795   //   name    - stub name string
1796   //
1797   // Inputs:
1798   //   c_rarg0   - source array address
1799   //   c_rarg1   - destination array address
1800   //   c_rarg2   - element count, treated as size_t, can be zero
1801   //
1802   // Side Effects:
1803   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1804   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1805   //
1806   address generate_disjoint_oop_copy(bool aligned, address *entry,
1807                                      const char *name, bool dest_uninitialized) {
1808     const bool is_oop = true;
1809     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1810     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1811   }
1812 
1813   // Arguments:
1814   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1815   //             ignored
1816   //   name    - stub name string
1817   //
1818   // Inputs:
1819   //   c_rarg0   - source array address
1820   //   c_rarg1   - destination array address
1821   //   c_rarg2   - element count, treated as size_t, can be zero
1822   //
1823   address generate_conjoint_oop_copy(bool aligned,
1824                                      address nooverlap_target, address *entry,
1825                                      const char *name, bool dest_uninitialized) {
1826     const bool is_oop = true;
1827     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1828     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1829                                   name, dest_uninitialized);
1830   }
1831 
1832 
1833   // Helper for generating a dynamic type check.
1834   // Smashes rscratch1, rscratch2.
1835   void generate_type_check(Register sub_klass,
1836                            Register super_check_offset,
1837                            Register super_klass,
1838                            Label& L_success) {
1839     assert_different_registers(sub_klass, super_check_offset, super_klass);
1840 
1841     BLOCK_COMMENT("type_check:");
1842 
1843     Label L_miss;
1844 
1845     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1846                                      super_check_offset);
1847     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, nullptr);
1848 
1849     // Fall through on failure!
1850     __ BIND(L_miss);
1851   }
1852 
1853   //
1854   //  Generate checkcasting array copy stub
1855   //
1856   //  Input:
1857   //    c_rarg0   - source array address
1858   //    c_rarg1   - destination array address
1859   //    c_rarg2   - element count, treated as ssize_t, can be zero
1860   //    c_rarg3   - size_t ckoff (super_check_offset)
1861   //    c_rarg4   - oop ckval (super_klass)
1862   //
1863   //  Output:
1864   //    r0 ==  0  -  success
1865   //    r0 == -1^K - failure, where K is partial transfer count
1866   //
1867   address generate_checkcast_copy(const char *name, address *entry,
1868                                   bool dest_uninitialized = false) {
1869 
1870     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1871 
1872     // Input registers (after setup_arg_regs)
1873     const Register from        = c_rarg0;   // source array address
1874     const Register to          = c_rarg1;   // destination array address
1875     const Register count       = c_rarg2;   // elements count
1876     const Register ckoff       = c_rarg3;   // super_check_offset
1877     const Register ckval       = c_rarg4;   // super_klass
1878 
1879     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1880     RegSet wb_post_saved_regs = RegSet::of(count);
1881 
1882     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1883     const Register copied_oop  = r22;       // actual oop copied
1884     const Register count_save  = r21;       // original elements count
1885     const Register start_to    = r20;       // destination array start address
1886     const Register r19_klass   = r19;       // oop._klass
1887 
1888     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1889     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1890 
1891     //---------------------------------------------------------------
1892     // Assembler stub will be used for this call to arraycopy
1893     // if the two arrays are subtypes of Object[] but the
1894     // destination array type is not equal to or a supertype
1895     // of the source type.  Each element must be separately
1896     // checked.
1897 
1898     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1899                                copied_oop, r19_klass, count_save);
1900 
1901     __ align(CodeEntryAlignment);
1902     StubCodeMark mark(this, "StubRoutines", name);
1903     address start = __ pc();
1904 
1905     __ enter(); // required for proper stackwalking of RuntimeStub frame
1906 
1907 #ifdef ASSERT
1908     // caller guarantees that the arrays really are different
1909     // otherwise, we would have to make conjoint checks
1910     { Label L;
1911       __ b(L);                  // conjoint check not yet implemented
1912       __ stop("checkcast_copy within a single array");
1913       __ bind(L);
1914     }
1915 #endif //ASSERT
1916 
1917     // Caller of this entry point must set up the argument registers.
1918     if (entry != nullptr) {
1919       *entry = __ pc();
1920       BLOCK_COMMENT("Entry:");
1921     }
1922 
1923      // Empty array:  Nothing to do.
1924     __ cbz(count, L_done);
1925     __ push(RegSet::of(r19, r20, r21, r22), sp);
1926 
1927 #ifdef ASSERT
1928     BLOCK_COMMENT("assert consistent ckoff/ckval");
1929     // The ckoff and ckval must be mutually consistent,
1930     // even though caller generates both.
1931     { Label L;
1932       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1933       __ ldrw(start_to, Address(ckval, sco_offset));
1934       __ cmpw(ckoff, start_to);
1935       __ br(Assembler::EQ, L);
1936       __ stop("super_check_offset inconsistent");
1937       __ bind(L);
1938     }
1939 #endif //ASSERT
1940 
1941     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1942     bool is_oop = true;
1943     int element_size = UseCompressedOops ? 4 : 8;
1944     if (dest_uninitialized) {
1945       decorators |= IS_DEST_UNINITIALIZED;
1946     }
1947 
1948     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1949     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1950 
1951     // save the original count
1952     __ mov(count_save, count);
1953 
1954     // Copy from low to high addresses
1955     __ mov(start_to, to);              // Save destination array start address
1956     __ b(L_load_element);
1957 
1958     // ======== begin loop ========
1959     // (Loop is rotated; its entry is L_load_element.)
1960     // Loop control:
1961     //   for (; count != 0; count--) {
1962     //     copied_oop = load_heap_oop(from++);
1963     //     ... generate_type_check ...;
1964     //     store_heap_oop(to++, copied_oop);
1965     //   }
1966     __ align(OptoLoopAlignment);
1967 
1968     __ BIND(L_store_element);
1969     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1970                       __ post(to, element_size), copied_oop, noreg,
1971                       gct1, gct2, gct3);
1972     __ sub(count, count, 1);
1973     __ cbz(count, L_do_card_marks);
1974 
1975     // ======== loop entry is here ========
1976     __ BIND(L_load_element);
1977     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1978                      copied_oop, noreg, __ post(from, element_size),
1979                      gct1);
1980     __ cbz(copied_oop, L_store_element);
1981 
1982     __ load_klass(r19_klass, copied_oop); // query the object klass
1983     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1984     // ======== end loop ========
1985 
1986     // It was a real error; we must depend on the caller to finish the job.
1987     // Register count = remaining oops, count_orig = total oops.
1988     // Emit GC store barriers for the oops we have copied and report
1989     // their number to the caller.
1990 
1991     __ subs(count, count_save, count);     // K = partially copied oop count
1992     __ eon(count, count, zr);                   // report (-1^K) to caller
1993     __ br(Assembler::EQ, L_done_pop);
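         // The EQ branch tests the flags set by the subs above: K == 0 means no
         // oops were copied, so the card-marking epilogue can be skipped.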
1994 
1995     __ BIND(L_do_card_marks);
1996     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1997 
1998     __ bind(L_done_pop);
1999     __ pop(RegSet::of(r19, r20, r21, r22), sp);
2000     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2001 
2002     __ bind(L_done);
2003     __ mov(r0, count);
2004     __ leave();
2005     __ ret(lr);
2006 
2007     return start;
2008   }
2009 
2010   // Perform range checks on the proposed arraycopy.
2011   // Kills temp, but nothing else.
2012   // Also, clean the sign bits of src_pos and dst_pos.
2013   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2014                               Register src_pos, // source position (c_rarg1)
2015                               Register dst,     // destination array oop (c_rarg2)
2016                               Register dst_pos, // destination position (c_rarg3)
2017                               Register length,
2018                               Register temp,
2019                               Label& L_failed) {
2020     BLOCK_COMMENT("arraycopy_range_checks:");
2021 
2022     assert_different_registers(rscratch1, temp);
2023 
2024     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2025     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2026     __ addw(temp, length, src_pos);
2027     __ cmpw(temp, rscratch1);
2028     __ br(Assembler::HI, L_failed);
2029 
2030     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2031     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2032     __ addw(temp, length, dst_pos);
2033     __ cmpw(temp, rscratch1);
2034     __ br(Assembler::HI, L_failed);
2035 
2036     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2037     __ movw(src_pos, src_pos);
2038     __ movw(dst_pos, dst_pos);
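         // (A 32-bit register-to-register mov zero-extends, clearing bits 63:32.)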
2039 
2040     BLOCK_COMMENT("arraycopy_range_checks done");
2041   }
2042 
2043   // These stubs get called from some dumb test routine.
2044   // I'll write them properly when they're called from
2045   // something that's actually doing something.
2046   static void fake_arraycopy_stub(address src, address dst, int count) {
2047     assert(count == 0, "huh?");
2048   }
2049 
2050 
2051   //
2052   //  Generate 'unsafe' array copy stub
2053   //  Though just as safe as the other stubs, it takes an unscaled
2054   //  size_t argument instead of an element count.
2055   //
2056   //  Input:
2057   //    c_rarg0   - source array address
2058   //    c_rarg1   - destination array address
2059   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2060   //
2061   // Examines the alignment of the operands and dispatches
2062   // to a long, int, short, or byte copy loop.
2063   //
2064   address generate_unsafe_copy(const char *name,
2065                                address byte_copy_entry,
2066                                address short_copy_entry,
2067                                address int_copy_entry,
2068                                address long_copy_entry) {
2069     Label L_long_aligned, L_int_aligned, L_short_aligned;
2070     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2071 
2072     __ align(CodeEntryAlignment);
2073     StubCodeMark mark(this, "StubRoutines", name);
2074     address start = __ pc();
2075     __ enter(); // required for proper stackwalking of RuntimeStub frame
2076 
2077     // bump this on entry, not on exit:
2078     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2079 
2080     __ orr(rscratch1, s, d);
2081     __ orr(rscratch1, rscratch1, count);
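         // rscratch1 = s | d | count; its low-order zero bits give the coarsest
         // alignment shared by the two addresses and the byte count.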
2082 
2083     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2084     __ cbz(rscratch1, L_long_aligned);
2085     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2086     __ cbz(rscratch1, L_int_aligned);
2087     __ tbz(rscratch1, 0, L_short_aligned);
2088     __ b(RuntimeAddress(byte_copy_entry));
2089 
2090     __ BIND(L_short_aligned);
2091     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2092     __ b(RuntimeAddress(short_copy_entry));
2093     __ BIND(L_int_aligned);
2094     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2095     __ b(RuntimeAddress(int_copy_entry));
2096     __ BIND(L_long_aligned);
2097     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2098     __ b(RuntimeAddress(long_copy_entry));
2099 
2100     return start;
2101   }
2102 
2103   //
2104   //  Generate generic array copy stubs
2105   //
2106   //  Input:
2107   //    c_rarg0    -  src oop
2108   //    c_rarg1    -  src_pos (32-bits)
2109   //    c_rarg2    -  dst oop
2110   //    c_rarg3    -  dst_pos (32-bits)
2111   //    c_rarg4    -  element count (32-bits)
2112   //
2113   //  Output:
2114   //    r0 ==  0  -  success
2115   //    r0 == -1^K - failure, where K is partial transfer count
2116   //
2117   address generate_generic_copy(const char *name,
2118                                 address byte_copy_entry, address short_copy_entry,
2119                                 address int_copy_entry, address oop_copy_entry,
2120                                 address long_copy_entry, address checkcast_copy_entry) {
2121 
2122     Label L_failed, L_objArray;
2123     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2124 
2125     // Input registers
2126     const Register src        = c_rarg0;  // source array oop
2127     const Register src_pos    = c_rarg1;  // source position
2128     const Register dst        = c_rarg2;  // destination array oop
2129     const Register dst_pos    = c_rarg3;  // destination position
2130     const Register length     = c_rarg4;
2131 
2132 
2133     // Registers used as temps
2134     const Register dst_klass  = c_rarg5;
2135 
2136     __ align(CodeEntryAlignment);
2137 
2138     StubCodeMark mark(this, "StubRoutines", name);
2139 
2140     address start = __ pc();
2141 
2142     __ enter(); // required for proper stackwalking of RuntimeStub frame
2143 
2144     // bump this on entry, not on exit:
2145     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2146 
2147     //-----------------------------------------------------------------------
2148     // Assembler stub will be used for this call to arraycopy
2149     // if the following conditions are met:
2150     //
2151     // (1) src and dst must not be null.
2152     // (2) src_pos must not be negative.
2153     // (3) dst_pos must not be negative.
2154     // (4) length  must not be negative.
2155     // (5) src klass and dst klass should be the same and not null.
2156     // (6) src and dst should be arrays.
2157     // (7) src_pos + length must not exceed length of src.
2158     // (8) dst_pos + length must not exceed length of dst.
2159     //
2160 
2161     //  if (src == nullptr) return -1;
2162     __ cbz(src, L_failed);
2163 
2164     //  if (src_pos < 0) return -1;
2165     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2166 
2167     //  if (dst == nullptr) return -1;
2168     __ cbz(dst, L_failed);
2169 
2170     //  if (dst_pos < 0) return -1;
2171     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2172 
2173     // registers used as temp
2174     const Register scratch_length    = r16; // elements count to copy
2175     const Register scratch_src_klass = r17; // array klass
2176     const Register lh                = r15; // layout helper
2177 
2178     //  if (length < 0) return -1;
2179     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2180     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2181 
2182     __ load_klass(scratch_src_klass, src);
2183 #ifdef ASSERT
2184     //  assert(src->klass() != nullptr);
2185     {
2186       BLOCK_COMMENT("assert klasses not null {");
2187       Label L1, L2;
2188       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2189       __ bind(L1);
2190       __ stop("broken null klass");
2191       __ bind(L2);
2192       __ load_klass(rscratch1, dst);
2193       __ cbz(rscratch1, L1);     // this would be broken also
2194       BLOCK_COMMENT("} assert klasses not null done");
2195     }
2196 #endif
2197 
2198     // Load layout helper (32-bits)
2199     //
2200     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2201     // 32        30    24            16              8     2                 0
2202     //
2203     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2204     //
2205 
2206     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2207 
2208     // Handle objArrays completely differently...
2209     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2210     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2211     __ movw(rscratch1, objArray_lh);
2212     __ eorw(rscratch2, lh, rscratch1);
2213     __ cbzw(rscratch2, L_objArray);
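         // (rscratch2 is zero iff lh matches the objArray layout helper exactly.)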
2214 
2215     //  if (src->klass() != dst->klass()) return -1;
2216     __ load_klass(rscratch2, dst);
2217     __ eor(rscratch2, rscratch2, scratch_src_klass);
2218     __ cbnz(rscratch2, L_failed);
2219 
2220     // Check for flat inline type array -> return -1
2221     __ tst(lh, Klass::_lh_array_tag_flat_value_bit_inplace);
2222     __ br(Assembler::NE, L_failed);
2223 
2224     // Check for null-free (non-flat) inline type array -> handle as object array
2225     __ tst(lh, Klass::_lh_null_free_array_bit_inplace);
2226     __ br(Assembler::NE, L_failed);
2227 
2228     //  if (!src->is_Array()) return -1;
2229     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2230 
2231     // At this point, it is known to be a typeArray (array_tag 0x3).
2232 #ifdef ASSERT
2233     {
2234       BLOCK_COMMENT("assert primitive array {");
2235       Label L;
2236       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2237       __ cmpw(lh, rscratch2);
2238       __ br(Assembler::GE, L);
2239       __ stop("must be a primitive array");
2240       __ bind(L);
2241       BLOCK_COMMENT("} assert primitive array done");
2242     }
2243 #endif
2244 
2245     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2246                            rscratch2, L_failed);
2247 
2248     // TypeArrayKlass
2249     //
2250     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2251     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2252     //
2253 
2254     const Register rscratch1_offset = rscratch1;    // array offset
2255     const Register r15_elsize = lh; // element size
2256 
2257     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2258            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2259     __ add(src, src, rscratch1_offset);           // src array offset
2260     __ add(dst, dst, rscratch1_offset);           // dst array offset
2261     BLOCK_COMMENT("choose copy loop based on element size");
2262 
2263     // next registers should be set before the jump to corresponding stub
2264     const Register from     = c_rarg0;  // source array address
2265     const Register to       = c_rarg1;  // destination array address
2266     const Register count    = c_rarg2;  // elements count
2267 
2268     // Note: 'from', 'to' and 'count' (c_rarg0..c_rarg2) alias 'src',
2269     // 'src_pos' and 'dst', so they must be written in this order.
2270 
2271     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2272 
2273     // The possible values of elsize are 0-3, i.e. exact_log2(element
2274     // size in bytes).  We do a simple bitwise binary search.
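         // (Bit 1 of elsize separates byte/short from int/long; bit 0 then picks
         //  within each pair.)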
2275   __ BIND(L_copy_bytes);
2276     __ tbnz(r15_elsize, 1, L_copy_ints);
2277     __ tbnz(r15_elsize, 0, L_copy_shorts);
2278     __ lea(from, Address(src, src_pos));// src_addr
2279     __ lea(to,   Address(dst, dst_pos));// dst_addr
2280     __ movw(count, scratch_length); // length
2281     __ b(RuntimeAddress(byte_copy_entry));
2282 
2283   __ BIND(L_copy_shorts);
2284     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2285     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2286     __ movw(count, scratch_length); // length
2287     __ b(RuntimeAddress(short_copy_entry));
2288 
2289   __ BIND(L_copy_ints);
2290     __ tbnz(r15_elsize, 0, L_copy_longs);
2291     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2292     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2293     __ movw(count, scratch_length); // length
2294     __ b(RuntimeAddress(int_copy_entry));
2295 
2296   __ BIND(L_copy_longs);
2297 #ifdef ASSERT
2298     {
2299       BLOCK_COMMENT("assert long copy {");
2300       Label L;
2301       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2302       __ cmpw(r15_elsize, LogBytesPerLong);
2303       __ br(Assembler::EQ, L);
2304       __ stop("must be long copy, but elsize is wrong");
2305       __ bind(L);
2306       BLOCK_COMMENT("} assert long copy done");
2307     }
2308 #endif
2309     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2310     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2311     __ movw(count, scratch_length); // length
2312     __ b(RuntimeAddress(long_copy_entry));
2313 
2314     // ObjArrayKlass
2315   __ BIND(L_objArray);
2316     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2317 
2318     Label L_plain_copy, L_checkcast_copy;
2319     //  test array classes for subtyping
2320     __ load_klass(r15, dst);
2321     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2322     __ br(Assembler::NE, L_checkcast_copy);
2323 
2324     // Identically typed arrays can be copied without element-wise checks.
2325     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2326                            rscratch2, L_failed);
2327 
2328     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2329     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2330     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2331     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2332     __ movw(count, scratch_length); // length
2333   __ BIND(L_plain_copy);
2334     __ b(RuntimeAddress(oop_copy_entry));
2335 
2336   __ BIND(L_checkcast_copy);
2337     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2338     {
2339       // Before looking at dst.length, make sure dst is also an objArray.
2340       __ ldrw(rscratch1, Address(r15, lh_offset));
2341       __ movw(rscratch2, objArray_lh);
2342       __ eorw(rscratch1, rscratch1, rscratch2);
2343       __ cbnzw(rscratch1, L_failed);
2344 
2345       // It is safe to examine both src.length and dst.length.
2346       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2347                              r15, L_failed);
2348 
2349       __ load_klass(dst_klass, dst); // reload
2350 
2351       // Marshal the base address arguments now, freeing registers.
2352       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2353       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2354       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2355       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2356       __ movw(count, length);           // length (reloaded)
2357       Register sco_temp = c_rarg3;      // this register is free now
2358       assert_different_registers(from, to, count, sco_temp,
2359                                  dst_klass, scratch_src_klass);
2360       // assert_clean_int(count, sco_temp);
2361 
2362       // Generate the type check.
2363       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2364       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2365 
2366       // Smashes rscratch1, rscratch2
2367       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2368 
2369       // Fetch destination element klass from the ObjArrayKlass header.
2370       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2371       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2372       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2373 
2374       // the checkcast_copy loop needs two extra arguments:
2375       assert(c_rarg3 == sco_temp, "#3 already in place");
2376       // Set up arguments for checkcast_copy_entry.
2377       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2378       __ b(RuntimeAddress(checkcast_copy_entry));
2379     }
2380 
2381   __ BIND(L_failed);
2382     __ mov(r0, -1);
2383     __ leave();   // required for proper stackwalking of RuntimeStub frame
2384     __ ret(lr);
2385 
2386     return start;
2387   }
2388 
2389   //
2390   // Generate stub for array fill. If "aligned" is true, the
2391   // "to" address is assumed to be heapword aligned.
2392   //
2393   // Arguments for generated stub:
2394   //   to:    c_rarg0
2395   //   value: c_rarg1
2396   //   count: c_rarg2 treated as signed
2397   //
2398   address generate_fill(BasicType t, bool aligned, const char *name) {
2399     __ align(CodeEntryAlignment);
2400     StubCodeMark mark(this, "StubRoutines", name);
2401     address start = __ pc();
2402 
2403     BLOCK_COMMENT("Entry:");
2404 
2405     const Register to        = c_rarg0;  // source array address
2406     const Register value     = c_rarg1;  // value
2407     const Register count     = c_rarg2;  // elements count
2408 
2409     const Register bz_base = r10;        // base for block_zero routine
2410     const Register cnt_words = r11;      // temp register
2411 
2412     __ enter();
2413 
2414     Label L_fill_elements, L_exit1;
2415 
2416     int shift = -1;
2417     switch (t) {
2418       case T_BYTE:
2419         shift = 0;
2420         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2421         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2422         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2423         __ br(Assembler::LO, L_fill_elements);
2424         break;
2425       case T_SHORT:
2426         shift = 1;
2427         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2428         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2429         __ br(Assembler::LO, L_fill_elements);
2430         break;
2431       case T_INT:
2432         shift = 2;
2433         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2434         __ br(Assembler::LO, L_fill_elements);
2435         break;
2436       default: ShouldNotReachHere();
2437     }
2438 
2439     // Align source address at 8 bytes address boundary.
2440     Label L_skip_align1, L_skip_align2, L_skip_align4;
2441     if (!aligned) {
2442       switch (t) {
2443         case T_BYTE:
2444           // One byte misalignment happens only for byte arrays.
2445           __ tbz(to, 0, L_skip_align1);
2446           __ strb(value, Address(__ post(to, 1)));
2447           __ subw(count, count, 1);
2448           __ bind(L_skip_align1);
2449           // Fallthrough
2450         case T_SHORT:
2451           // Two bytes misalignment happens only for byte and short (char) arrays.
2452           __ tbz(to, 1, L_skip_align2);
2453           __ strh(value, Address(__ post(to, 2)));
2454           __ subw(count, count, 2 >> shift);
2455           __ bind(L_skip_align2);
2456           // Fallthrough
2457         case T_INT:
2458           // Align to 8 bytes, we know we are 4 byte aligned to start.
2459           __ tbz(to, 2, L_skip_align4);
2460           __ strw(value, Address(__ post(to, 4)));
2461           __ subw(count, count, 4 >> shift);
2462           __ bind(L_skip_align4);
2463           break;
2464         default: ShouldNotReachHere();
2465       }
2466     }
2467 
2468     //
2469     //  Fill large chunks
2470     //
2471     __ lsrw(cnt_words, count, 3 - shift); // number of words
2472     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2473     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
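         // count now holds only the elements left over after the word-sized fill,
         // i.e. fewer than one 8-byte word's worth.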
2474     if (UseBlockZeroing) {
2475       Label non_block_zeroing, rest;
2476       // If the fill value is zero we can use the fast zero_words().
2477       __ cbnz(value, non_block_zeroing);
2478       __ mov(bz_base, to);
2479       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2480       address tpc = __ zero_words(bz_base, cnt_words);
2481       if (tpc == nullptr) {
2482         fatal("CodeCache is full at generate_fill");
2483       }
2484       __ b(rest);
2485       __ bind(non_block_zeroing);
2486       __ fill_words(to, cnt_words, value);
2487       __ bind(rest);
2488     } else {
2489       __ fill_words(to, cnt_words, value);
2490     }
2491 
2492     // Remaining count is less than 8 bytes. Fill it by a single store.
2493     // Note that the total length is no less than 8 bytes.
2494     if (t == T_BYTE || t == T_SHORT) {
2495       Label L_exit1;
2496       __ cbzw(count, L_exit1);
2497       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2498       __ str(value, Address(to, -8));    // overwrite some elements
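           // This 8-byte store ends exactly at the array tail; any bytes it
           // rewrites below the tail were already filled above (total length >= 8).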
2499       __ bind(L_exit1);
2500       __ leave();
2501       __ ret(lr);
2502     }
2503 
2504     // Handle copies less than 8 bytes.
2505     Label L_fill_2, L_fill_4, L_exit2;
2506     __ bind(L_fill_elements);
2507     switch (t) {
2508       case T_BYTE:
2509         __ tbz(count, 0, L_fill_2);
2510         __ strb(value, Address(__ post(to, 1)));
2511         __ bind(L_fill_2);
2512         __ tbz(count, 1, L_fill_4);
2513         __ strh(value, Address(__ post(to, 2)));
2514         __ bind(L_fill_4);
2515         __ tbz(count, 2, L_exit2);
2516         __ strw(value, Address(to));
2517         break;
2518       case T_SHORT:
2519         __ tbz(count, 0, L_fill_4);
2520         __ strh(value, Address(__ post(to, 2)));
2521         __ bind(L_fill_4);
2522         __ tbz(count, 1, L_exit2);
2523         __ strw(value, Address(to));
2524         break;
2525       case T_INT:
2526         __ cbzw(count, L_exit2);
2527         __ strw(value, Address(to));
2528         break;
2529       default: ShouldNotReachHere();
2530     }
2531     __ bind(L_exit2);
2532     __ leave();
2533     __ ret(lr);
2534     return start;
2535   }
2536 
2537   address generate_data_cache_writeback() {
2538     const Register line        = c_rarg0;  // address of line to write back
2539 
2540     __ align(CodeEntryAlignment);
2541 
2542     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2543 
2544     address start = __ pc();
2545     __ enter();
2546     __ cache_wb(Address(line, 0));
2547     __ leave();
2548     __ ret(lr);
2549 
2550     return start;
2551   }
2552 
2553   address generate_data_cache_writeback_sync() {
2554     const Register is_pre     = c_rarg0;  // pre or post sync
2555 
2556     __ align(CodeEntryAlignment);
2557 
2558     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2559 
2560     // A pre wbsync is a no-op; a post wbsync is implemented as a
2561     // system-wide store barrier (see MacroAssembler::cache_wbsync).
2562 
2563     Label skip;
2564     address start = __ pc();
2565     __ enter();
2566     __ cbnz(is_pre, skip);
2567     __ cache_wbsync(false);
2568     __ bind(skip);
2569     __ leave();
2570     __ ret(lr);
2571 
2572     return start;
2573   }
2574 
2575   void generate_arraycopy_stubs() {
2576     address entry;
2577     address entry_jbyte_arraycopy;
2578     address entry_jshort_arraycopy;
2579     address entry_jint_arraycopy;
2580     address entry_oop_arraycopy;
2581     address entry_jlong_arraycopy;
2582     address entry_checkcast_arraycopy;
2583 
2584     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards);
2585     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards);
2586 
2587     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards);
2588     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards);
2589 
2590     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards);
2591     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards);
2592 
2593     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2594 
2595     //*** jbyte
2596     // Always need aligned and unaligned versions
2597     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2598                                                                                   "jbyte_disjoint_arraycopy");
2599     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2600                                                                                   &entry_jbyte_arraycopy,
2601                                                                                   "jbyte_arraycopy");
2602     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2603                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2604     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, nullptr,
2605                                                                                   "arrayof_jbyte_arraycopy");
2606 
2607     //*** jshort
2608     // Always need aligned and unaligned versions
2609     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2610                                                                                     "jshort_disjoint_arraycopy");
2611     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2612                                                                                     &entry_jshort_arraycopy,
2613                                                                                     "jshort_arraycopy");
2614     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2615                                                                                     "arrayof_jshort_disjoint_arraycopy");
2616     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2617                                                                                     "arrayof_jshort_arraycopy");
2618 
2619     //*** jint
2620     // Aligned versions
2621     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2622                                                                                 "arrayof_jint_disjoint_arraycopy");
2623     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2624                                                                                 "arrayof_jint_arraycopy");
2625     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2626     // entry_jint_arraycopy always points to the unaligned version
2627     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2628                                                                                 "jint_disjoint_arraycopy");
2629     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2630                                                                                 &entry_jint_arraycopy,
2631                                                                                 "jint_arraycopy");
2632 
2633     //*** jlong
2634     // It is always aligned
2635     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2636                                                                                   "arrayof_jlong_disjoint_arraycopy");
2637     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2638                                                                                   "arrayof_jlong_arraycopy");
2639     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2640     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2641 
2642     //*** oops
2643     {
2644       // With compressed oops we need unaligned versions; notice that
2645       // we overwrite entry_oop_arraycopy.
2646       bool aligned = !UseCompressedOops;
2647 
2648       StubRoutines::_arrayof_oop_disjoint_arraycopy
2649         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2650                                      /*dest_uninitialized*/false);
2651       StubRoutines::_arrayof_oop_arraycopy
2652         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2653                                      /*dest_uninitialized*/false);
2654       // Aligned versions without pre-barriers
2655       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2656         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2657                                      /*dest_uninitialized*/true);
2658       StubRoutines::_arrayof_oop_arraycopy_uninit
2659         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2660                                      /*dest_uninitialized*/true);
2661     }
2662 
2663     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2664     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2665     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2666     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2667 
2668     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2669     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2670                                                                         /*dest_uninitialized*/true);
2671 
2672     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2673                                                               entry_jbyte_arraycopy,
2674                                                               entry_jshort_arraycopy,
2675                                                               entry_jint_arraycopy,
2676                                                               entry_jlong_arraycopy);
2677 
2678     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2679                                                                entry_jbyte_arraycopy,
2680                                                                entry_jshort_arraycopy,
2681                                                                entry_jint_arraycopy,
2682                                                                entry_oop_arraycopy,
2683                                                                entry_jlong_arraycopy,
2684                                                                entry_checkcast_arraycopy);
2685 
2686     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2687     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2688     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2689     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2690     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2691     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2692   }
2693 
2694   void generate_math_stubs() { Unimplemented(); }
2695 
2696   // Arguments:
2697   //
2698   // Inputs:
2699   //   c_rarg0   - source byte array address
2700   //   c_rarg1   - destination byte array address
2701   //   c_rarg2   - K (key) in little endian int array
2702   //
2703   address generate_aescrypt_encryptBlock() {
2704     __ align(CodeEntryAlignment);
2705     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2706 
2707     const Register from        = c_rarg0;  // source array address
2708     const Register to          = c_rarg1;  // destination array address
2709     const Register key         = c_rarg2;  // key array address
2710     const Register keylen      = rscratch1;
2711 
2712     address start = __ pc();
2713     __ enter(); // required for proper stackwalking of RuntimeStub frame
2714 
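         // The key array passed from the Java side holds the expanded key schedule;
         // its length in ints (44, 52 or 60) implies the number of rounds
         // (10, 12 or 14), so loading the array length is enough to select the
         // code path for each key size.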
2715     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2716 
2717     __ aesenc_loadkeys(key, keylen);
2718     __ aesecb_encrypt(from, to, keylen);
2719 
2720     __ mov(r0, 0);
2721 
2722     __ leave();
2723     __ ret(lr);
2724 
2725     return start;
2726   }
2727 
2728   // Arguments:
2729   //
2730   // Inputs:
2731   //   c_rarg0   - source byte array address
2732   //   c_rarg1   - destination byte array address
2733   //   c_rarg2   - K (key) in little endian int array
2734   //
2735   address generate_aescrypt_decryptBlock() {
2736     assert(UseAES, "need AES cryptographic extension support");
2737     __ align(CodeEntryAlignment);
2738     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2739     Label L_doLast;
2740 
2741     const Register from        = c_rarg0;  // source array address
2742     const Register to          = c_rarg1;  // destination array address
2743     const Register key         = c_rarg2;  // key array address
2744     const Register keylen      = rscratch1;
2745 
2746     address start = __ pc();
2747     __ enter(); // required for proper stackwalking of RuntimeStub frame
2748 
2749     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2750 
2751     __ aesecb_decrypt(from, to, key, keylen);
2752 
2753     __ mov(r0, 0);
2754 
2755     __ leave();
2756     __ ret(lr);
2757 
2758     return start;
2759   }
2760 
2761   // Arguments:
2762   //
2763   // Inputs:
2764   //   c_rarg0   - source byte array address
2765   //   c_rarg1   - destination byte array address
2766   //   c_rarg2   - K (key) in little endian int array
2767   //   c_rarg3   - r vector byte array address
2768   //   c_rarg4   - input length
2769   //
2770   // Output:
2771   //   x0        - input length
2772   //
2773   address generate_cipherBlockChaining_encryptAESCrypt() {
2774     assert(UseAES, "need AES cryptographic extension support");
2775     __ align(CodeEntryAlignment);
2776     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2777 
2778     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2779 
2780     const Register from        = c_rarg0;  // source array address
2781     const Register to          = c_rarg1;  // destination array address
2782     const Register key         = c_rarg2;  // key array address
2783     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector,
2784                                            // and left holding the last encrypted block (the next IV)
2785     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2786     const Register keylen      = rscratch1;
2787 
2788     address start = __ pc();
2789 
2790       __ enter();
2791 
2792       __ movw(rscratch2, len_reg);  // keep a copy of the input length for the return value
2793 
2794       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2795 
2796       __ ld1(v0, __ T16B, rvec);
2797 
2798       __ cmpw(keylen, 52);
2799       __ br(Assembler::CC, L_loadkeys_44);
2800       __ br(Assembler::EQ, L_loadkeys_52);
2801 
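           // Fall-through key loading: an AES-256 schedule (keylen == 60) loads all
           // of v17..v31, AES-192 (keylen == 52) enters at L_loadkeys_52 and skips
           // v17/v18, and AES-128 (keylen == 44) enters at L_loadkeys_44 and skips
           // v17..v20.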
2802       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2803       __ rev32(v17, __ T16B, v17);
2804       __ rev32(v18, __ T16B, v18);
2805     __ BIND(L_loadkeys_52);
2806       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2807       __ rev32(v19, __ T16B, v19);
2808       __ rev32(v20, __ T16B, v20);
2809     __ BIND(L_loadkeys_44);
2810       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2811       __ rev32(v21, __ T16B, v21);
2812       __ rev32(v22, __ T16B, v22);
2813       __ rev32(v23, __ T16B, v23);
2814       __ rev32(v24, __ T16B, v24);
2815       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2816       __ rev32(v25, __ T16B, v25);
2817       __ rev32(v26, __ T16B, v26);
2818       __ rev32(v27, __ T16B, v27);
2819       __ rev32(v28, __ T16B, v28);
2820       __ ld1(v29, v30, v31, __ T16B, key);
2821       __ rev32(v29, __ T16B, v29);
2822       __ rev32(v30, __ T16B, v30);
2823       __ rev32(v31, __ T16B, v31);
2824 
2825     __ BIND(L_aes_loop);
2826       __ ld1(v1, __ T16B, __ post(from, 16));
2827       __ eor(v0, __ T16B, v0, v1);
2828 
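           // The condition flags set by the cmpw(keylen, 52) above are still valid
           // here: nothing in the loop body modifies the flags, so each iteration
           // can re-dispatch on the key size without another compare.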
2829       __ br(Assembler::CC, L_rounds_44);
2830       __ br(Assembler::EQ, L_rounds_52);
2831 
2832       __ aese(v0, v17); __ aesmc(v0, v0);
2833       __ aese(v0, v18); __ aesmc(v0, v0);
2834     __ BIND(L_rounds_52);
2835       __ aese(v0, v19); __ aesmc(v0, v0);
2836       __ aese(v0, v20); __ aesmc(v0, v0);
2837     __ BIND(L_rounds_44);
2838       __ aese(v0, v21); __ aesmc(v0, v0);
2839       __ aese(v0, v22); __ aesmc(v0, v0);
2840       __ aese(v0, v23); __ aesmc(v0, v0);
2841       __ aese(v0, v24); __ aesmc(v0, v0);
2842       __ aese(v0, v25); __ aesmc(v0, v0);
2843       __ aese(v0, v26); __ aesmc(v0, v0);
2844       __ aese(v0, v27); __ aesmc(v0, v0);
2845       __ aese(v0, v28); __ aesmc(v0, v0);
2846       __ aese(v0, v29); __ aesmc(v0, v0);
2847       __ aese(v0, v30);
2848       __ eor(v0, __ T16B, v0, v31);
2849 
2850       __ st1(v0, __ T16B, __ post(to, 16));
2851 
2852       __ subw(len_reg, len_reg, 16);
2853       __ cbnzw(len_reg, L_aes_loop);
2854 
2855       __ st1(v0, __ T16B, rvec);
2856 
2857       __ mov(r0, rscratch2);
2858 
2859       __ leave();
2860       __ ret(lr);
2861 
2862       return start;
2863   }
2864 
2865   // Arguments:
2866   //
2867   // Inputs:
2868   //   c_rarg0   - source byte array address
2869   //   c_rarg1   - destination byte array address
2870   //   c_rarg2   - K (key) in little endian int array
2871   //   c_rarg3   - r vector byte array address
2872   //   c_rarg4   - input length
2873   //
2874   // Output:
2875   //   r0        - input length
2876   //
2877   address generate_cipherBlockChaining_decryptAESCrypt() {
2878     assert(UseAES, "need AES cryptographic extension support");
2879     __ align(CodeEntryAlignment);
2880     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2881 
2882     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2883 
2884     const Register from        = c_rarg0;  // source array address
2885     const Register to          = c_rarg1;  // destination array address
2886     const Register key         = c_rarg2;  // key array address
2887     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector,
2888                                            // and left holding the last input (cipher text) block (the next IV)
2889     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2890     const Register keylen      = rscratch1;
2891 
2892     address start = __ pc();
2893 
2894       __ enter();
2895 
2896       __ movw(rscratch2, len_reg);  // keep a copy of the input length for the return value
2897 
2898       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2899 
2900       __ ld1(v2, __ T16B, rvec);
2901 
2902       __ ld1(v31, __ T16B, __ post(key, 16));
2903       __ rev32(v31, __ T16B, v31);
2904 
2905       __ cmpw(keylen, 52);
2906       __ br(Assembler::CC, L_loadkeys_44);
2907       __ br(Assembler::EQ, L_loadkeys_52);
2908 
2909       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2910       __ rev32(v17, __ T16B, v17);
2911       __ rev32(v18, __ T16B, v18);
2912     __ BIND(L_loadkeys_52);
2913       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2914       __ rev32(v19, __ T16B, v19);
2915       __ rev32(v20, __ T16B, v20);
2916     __ BIND(L_loadkeys_44);
2917       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2918       __ rev32(v21, __ T16B, v21);
2919       __ rev32(v22, __ T16B, v22);
2920       __ rev32(v23, __ T16B, v23);
2921       __ rev32(v24, __ T16B, v24);
2922       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2923       __ rev32(v25, __ T16B, v25);
2924       __ rev32(v26, __ T16B, v26);
2925       __ rev32(v27, __ T16B, v27);
2926       __ rev32(v28, __ T16B, v28);
2927       __ ld1(v29, v30, __ T16B, key);
2928       __ rev32(v29, __ T16B, v29);
2929       __ rev32(v30, __ T16B, v30);
2930 
2931     __ BIND(L_aes_loop);
2932       __ ld1(v0, __ T16B, __ post(from, 16));
2933       __ orr(v1, __ T16B, v0, v0);  // save the input cipher text block; it becomes the next chaining value
2934 
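           // As in the encrypt stub, the flags from the earlier cmpw(keylen, 52)
           // survive the loop body, so the key-size dispatch needs no new compare.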
2935       __ br(Assembler::CC, L_rounds_44);
2936       __ br(Assembler::EQ, L_rounds_52);
2937 
2938       __ aesd(v0, v17); __ aesimc(v0, v0);
2939       __ aesd(v0, v18); __ aesimc(v0, v0);
2940     __ BIND(L_rounds_52);
2941       __ aesd(v0, v19); __ aesimc(v0, v0);
2942       __ aesd(v0, v20); __ aesimc(v0, v0);
2943     __ BIND(L_rounds_44);
2944       __ aesd(v0, v21); __ aesimc(v0, v0);
2945       __ aesd(v0, v22); __ aesimc(v0, v0);
2946       __ aesd(v0, v23); __ aesimc(v0, v0);
2947       __ aesd(v0, v24); __ aesimc(v0, v0);
2948       __ aesd(v0, v25); __ aesimc(v0, v0);
2949       __ aesd(v0, v26); __ aesimc(v0, v0);
2950       __ aesd(v0, v27); __ aesimc(v0, v0);
2951       __ aesd(v0, v28); __ aesimc(v0, v0);
2952       __ aesd(v0, v29); __ aesimc(v0, v0);
2953       __ aesd(v0, v30);
2954       __ eor(v0, __ T16B, v0, v31);
2955       __ eor(v0, __ T16B, v0, v2);
2956 
2957       __ st1(v0, __ T16B, __ post(to, 16));
2958       __ orr(v2, __ T16B, v1, v1);  // the saved cipher text block becomes the chaining value for the next iteration
2959 
2960       __ subw(len_reg, len_reg, 16);
2961       __ cbnzw(len_reg, L_aes_loop);
2962 
2963       __ st1(v2, __ T16B, rvec);
2964 
2965       __ mov(r0, rscratch2);
2966 
2967       __ leave();
2968       __ ret(lr);
2969 
2970     return start;
2971   }
2972 
2973   // CTR AES crypt.
2974   // Arguments:
2975   //
2976   // Inputs:
2977   //   c_rarg0   - source byte array address
2978   //   c_rarg1   - destination byte array address
2979   //   c_rarg2   - K (key) in little endian int array
2980   //   c_rarg3   - counter vector byte array address
2981   //   c_rarg4   - input length
2982   //   c_rarg5   - saved encryptedCounter start
2983   //   c_rarg6   - saved used length
2984   //
2985   // Output:
2986   //   r0       - input length
2987   //
2988   address generate_counterMode_AESCrypt() {
2989     const Register in = c_rarg0;
2990     const Register out = c_rarg1;
2991     const Register key = c_rarg2;
2992     const Register counter = c_rarg3;
2993     const Register saved_len = c_rarg4, len = r10;
2994     const Register saved_encrypted_ctr = c_rarg5;
2995     const Register used_ptr = c_rarg6, used = r12;
2996 
2997     const Register offset = r7;
2998     const Register keylen = r11;
2999 
3000     const unsigned char block_size = 16;
3001     const int bulk_width = 4;
3002     // NB: bulk_width can be 4 or 8. 8 gives slightly better
3003     // performance on larger inputs, but it also means that the
3004     // fast path isn't taken until there are at least 8 blocks, so up
3005     // to 127 bytes of data will be processed on the slow path. For
3006     // that reason, and also so as not to blow away too much icache, 4
3007     // blocks seems like a sensible compromise.
3008 
3009     // Algorithm:
3010     //
3011     //    if (len == 0) {
3012     //        goto DONE;
3013     //    }
3014     //    int result = len;
3015     //    do {
3016     //        if (used >= blockSize) {
3017     //            if (len >= bulk_width * blockSize) {
3018     //                CTR_large_block();
3019     //                if (len == 0)
3020     //                    goto DONE;
3021     //            }
3022     //            for (;;) {
3023     //                16ByteVector v0 = counter;
3024     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3025     //                used = 0;
3026     //                if (len < blockSize)
3027     //                    break;    /* goto NEXT */
3028     //                16ByteVector v1 = load16Bytes(in, offset);
3029     //                v1 = v1 ^ encryptedCounter;
3030     //                store16Bytes(out, offset);
3031     //                used = blockSize;
3032     //                offset += blockSize;
3033     //                len -= blockSize;
3034     //                if (len == 0)
3035     //                    goto DONE;
3036     //            }
3037     //        }
3038     //      NEXT:
3039     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3040     //        len--;
3041     //    } while (len != 0);
3042     //  DONE:
3043     //    return result;
3044     //
3045     // CTR_large_block()
3046     //    Wide bulk encryption of whole blocks.
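         // For example (assuming the saved encrypted counter is already fully
         // used), a 100-byte request first runs CTR_large_block over the leading
         // 64 bytes (one bulk iteration with bulk_width == 4), then handles the
         // remaining 36 bytes as two whole blocks plus 4 single bytes on the
         // scalar path.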
3047 
3048     __ align(CodeEntryAlignment);
3049     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3050     const address start = __ pc();
3051     __ enter();
3052 
3053     Label DONE, CTR_large_block, large_block_return;
3054     __ ldrw(used, Address(used_ptr));
3055     __ cbzw(saved_len, DONE);
3056 
3057     __ mov(len, saved_len);
3058     __ mov(offset, 0);
3059 
3060     // Compute #rounds for AES based on the length of the key array
3061     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3062 
3063     __ aesenc_loadkeys(key, keylen);
3064 
3065     {
3066       Label L_CTR_loop, NEXT;
3067 
3068       __ bind(L_CTR_loop);
3069 
3070       __ cmp(used, block_size);
3071       __ br(__ LO, NEXT);
3072 
3073       // Maybe we have a lot of data
3074       __ subsw(rscratch1, len, bulk_width * block_size);
3075       __ br(__ HS, CTR_large_block);
3076       __ BIND(large_block_return);
3077       __ cbzw(len, DONE);
3078 
3079       // Setup the counter
3080       __ movi(v4, __ T4S, 0);
3081       __ movi(v5, __ T4S, 1);
3082       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
3083 
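           // The 16-byte counter is kept in memory in big-endian byte order; each
           // increment byte-swaps the 32-bit lanes with rev32, adds { 0, 0, 0, 1 }
           // as a 4x32-bit vector, and swaps back before storing.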
3084       __ ld1(v0, __ T16B, counter); // Load the counter into v0
3085       __ rev32(v16, __ T16B, v0);
3086       __ addv(v16, __ T4S, v16, v4);
3087       __ rev32(v16, __ T16B, v16);
3088       __ st1(v16, __ T16B, counter); // Save the incremented counter back
3089 
3090       {
3091         // We have fewer than bulk_width blocks of data left. Encrypt
3092         // them one by one until there is less than a full block
3093         // remaining, being careful to save both the encrypted counter
3094         // and the counter.
3095 
3096         Label inner_loop;
3097         __ bind(inner_loop);
3098         // Counter to encrypt is in v0
3099         __ aesecb_encrypt(noreg, noreg, keylen);
3100         __ st1(v0, __ T16B, saved_encrypted_ctr);
3101 
3102         // Do we have a remaining full block?
3103 
3104         __ mov(used, 0);
3105         __ cmp(len, block_size);
3106         __ br(__ LO, NEXT);
3107 
3108         // Yes, we have a full block
3109         __ ldrq(v1, Address(in, offset));
3110         __ eor(v1, __ T16B, v1, v0);
3111         __ strq(v1, Address(out, offset));
3112         __ mov(used, block_size);
3113         __ add(offset, offset, block_size);
3114 
3115         __ subw(len, len, block_size);
3116         __ cbzw(len, DONE);
3117 
3118         // Increment the counter, store it back
3119         __ orr(v0, __ T16B, v16, v16);  // use the incremented counter as the next block to encrypt
3120         __ rev32(v16, __ T16B, v16);
3121         __ addv(v16, __ T4S, v16, v4);
3122         __ rev32(v16, __ T16B, v16);
3123         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3124 
3125         __ b(inner_loop);
3126       }
3127 
3128       __ BIND(NEXT);
3129 
3130       // Encrypt a single byte, and loop.
3131       // We expect this to be a rare event.
3132       __ ldrb(rscratch1, Address(in, offset));
3133       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3134       __ eor(rscratch1, rscratch1, rscratch2);
3135       __ strb(rscratch1, Address(out, offset));
3136       __ add(offset, offset, 1);
3137       __ add(used, used, 1);
3138       __ subw(len, len, 1);
3139       __ cbnzw(len, L_CTR_loop);
3140     }
3141 
3142     __ bind(DONE);
3143     __ strw(used, Address(used_ptr));
3144     __ mov(r0, saved_len);
3145 
3146     __ leave(); // required for proper stackwalking of RuntimeStub frame
3147     __ ret(lr);
3148 
3149     // Bulk encryption
3150 
3151     __ BIND (CTR_large_block);
3152     assert(bulk_width == 4 || bulk_width == 8, "must be");
3153 
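         // Under the AAPCS64 the low 64 bits of v8..v15 are callee-saved, so the
         // SIMD registers clobbered by the bulk loop are spilled here and restored
         // on the way back to large_block_return.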
3154     if (bulk_width == 8) {
3155       __ sub(sp, sp, 4 * 16);
3156       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3157     }
3158     __ sub(sp, sp, 4 * 16);
3159     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3160     RegSet saved_regs = (RegSet::of(in, out, offset)
3161                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3162     __ push(saved_regs, sp);
3163     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3164     __ add(in, in, offset);
3165     __ add(out, out, offset);
3166 
3167     // Keys should already be loaded into the correct registers
3168 
3169     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3170     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3171 
3172     // AES/CTR loop
3173     {
3174       Label L_CTR_loop;
3175       __ BIND(L_CTR_loop);
3176 
3177       // Setup the counters
3178       __ movi(v8, __ T4S, 0);
3179       __ movi(v9, __ T4S, 1);
3180       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3181 
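           // Materialize bulk_width consecutive counter values into v0..v3 (and
           // v4..v7 when bulk_width == 8): each iteration byte-swaps the running
           // counter in v16 into the next output register and then increments v16.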
3182       for (int i = 0; i < bulk_width; i++) {
3183         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3184         __ rev32(v0_ofs, __ T16B, v16);
3185         __ addv(v16, __ T4S, v16, v8);
3186       }
3187 
3188       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3189 
3190       // Encrypt the counters
3191       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3192 
3193       if (bulk_width == 8) {
3194         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3195       }
3196 
3197       // XOR the encrypted counters with the inputs
3198       for (int i = 0; i < bulk_width; i++) {
3199         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3200         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3201         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3202       }
3203 
3204       // Write the encrypted data
3205       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3206       if (bulk_width == 8) {
3207         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3208       }
3209 
3210       __ subw(len, len, 16 * bulk_width);
3211       __ cbnzw(len, L_CTR_loop);
3212     }
3213 
3214     // Save the counter back where it goes
3215     __ rev32(v16, __ T16B, v16);
3216     __ st1(v16, __ T16B, counter);
3217 
3218     __ pop(saved_regs, sp);
3219 
3220     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3221     if (bulk_width == 8) {
3222       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3223     }
3224 
3225     __ andr(rscratch1, len, -16 * bulk_width);
3226     __ sub(len, len, rscratch1);
3227     __ add(offset, offset, rscratch1);
3228     __ mov(used, 16);  // mark the encrypted counter as fully used so the scalar path regenerates it
3229     __ strw(used, Address(used_ptr));
3230     __ b(large_block_return);
3231 
3232     return start;
3233   }
3234 
3235   // Vector AES Galois Counter Mode implementation. Parameters:
3236   //
3237   // in = c_rarg0
3238   // len = c_rarg1
3239   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3240   // out = c_rarg3
3241   // key = c_rarg4
3242   // state = c_rarg5 - GHASH.state
3243   // subkeyHtbl = c_rarg6 - powers of H
3244   // counter = c_rarg7 - 16 bytes of CTR
3245   // return - number of processed bytes
3246   address generate_galoisCounterMode_AESCrypt() {
3247     address ghash_polynomial = __ pc();
3248     __ emit_int64(0x87);  // The low-order bits of the field
3249                           // polynomial (i.e. p = z^7+z^2+z+1)
3250                           // repeated in the low and high parts of a
3251                           // 128-bit vector
3252     __ emit_int64(0x87);
3253 
3254     __ align(CodeEntryAlignment);
3255     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3256     address start = __ pc();
3257     __ enter();
3258 
3259     const Register in = c_rarg0;
3260     const Register len = c_rarg1;
3261     const Register ct = c_rarg2;
3262     const Register out = c_rarg3;
3264 
3265     const Register key = c_rarg4;
3266     const Register state = c_rarg5;
3267 
3268     const Register subkeyHtbl = c_rarg6;
3269 
3270     const Register counter = c_rarg7;  // 16-byte counter block, updated with the incremented counter on return
3271 
3272     const Register keylen = r10;
3273     // Save the callee-saved SIMD registers (v8..v15) clobbered by this routine
3274     __ sub(sp, sp, 4 * 16);
3275     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3276     __ sub(sp, sp, 4 * 16);
3277     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3278 
3280     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3281     __ str(len, __ pre(sp, -2 * wordSize));
3282 
3283     Label DONE;
3284     __ cbz(len, DONE);
3285 
3286     // Compute #rounds for AES based on the length of the key array
3287     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3288 
3289     __ aesenc_loadkeys(key, keylen);
3290     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3291     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3292 
3293     // AES/CTR loop
3294     {
3295       Label L_CTR_loop;
3296       __ BIND(L_CTR_loop);
3297 
3298       // Setup the counters
3299       __ movi(v8, __ T4S, 0);
3300       __ movi(v9, __ T4S, 1);
3301       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3302 
3303       assert(v0->encoding() < v8->encoding(), "");
3304       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3305         FloatRegister f = as_FloatRegister(i);
3306         __ rev32(f, __ T16B, v16);
3307         __ addv(v16, __ T4S, v16, v8);
3308       }
3309 
3310       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3311 
3312       // Encrypt the counters
3313       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3314 
3315       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3316 
3317       // XOR the encrypted counters with the inputs
3318       for (int i = 0; i < 8; i++) {
3319         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3320         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3321         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3322       }
3323       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3324       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3325 
3326       __ subw(len, len, 16 * 8);
3327       __ cbnzw(len, L_CTR_loop);
3328     }
3329 
3330     __ rev32(v16, __ T16B, v16);
3331     __ st1(v16, __ T16B, counter);
3332 
3333     __ ldr(len, Address(sp));
3334     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3335 
3336     // GHASH pass: fold the len 16-byte ciphertext blocks into the GHASH state
3337     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3338                                 len, /*unrolls*/4);
3339 
3340 #ifdef ASSERT
3341     { Label L;
3342       __ cmp(len, (unsigned char)0);
3343       __ br(Assembler::EQ, L);
3344       __ stop("stubGenerator: abort");
3345       __ bind(L);
3346     }
3347 #endif
3348 
3349     __ bind(DONE);
3350     // Return the number of bytes processed
3351     __ ldr(r0, __ post(sp, 2 * wordSize));
3352 
3353     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3354     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3355 
3356     __ leave(); // required for proper stackwalking of RuntimeStub frame
3357     __ ret(lr);
3358     return start;
3359   }
3360 
3361   class Cached64Bytes {
3362   private:
3363     MacroAssembler *_masm;
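         // Eight 64-bit general-purpose registers cache one 64-byte MD5 input
         // block; each register holds two consecutive 32-bit message words.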
3364     Register _regs[8];
3365 
3366   public:
3367     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3368       assert(rs.size() == 8, "%u registers are used to cache a 64-byte (16 x 4-byte) data block", rs.size());
3369       auto it = rs.begin();
3370       for (auto &r: _regs) {
3371         r = *it;
3372         ++it;
3373       }
3374     }
3375 
3376     void gen_loads(Register base) {
3377       for (int i = 0; i < 8; i += 2) {
3378         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3379       }
3380     }
3381 
3382     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3383     void extract_u32(Register dest, int i) {
3384       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3385     }
3386   };
3387 
3388   // Utility routines for MD5.
3389   // Each one clobbers rscratch1, rscratch2, r10 and r11.
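       // Each helper implements one step of the corresponding MD5 round
       // (RFC 1321): a = b + rotl32(a + f(b, c, d) + x[k] + t, s), where
       //   FF: f = (b & c) | (~b & d)
       //   GG: f = (b & d) | (c & ~d)
       //   HH: f = b ^ c ^ d
       //   II: f = c ^ (b | ~d)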
3390   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3391               int k, int s, int t) {
3392     Register rscratch3 = r10;
3393     Register rscratch4 = r11;
3394 
3395     __ eorw(rscratch3, r3, r4);
3396     __ movw(rscratch2, t);
3397     __ andw(rscratch3, rscratch3, r2);
3398     __ addw(rscratch4, r1, rscratch2);
3399     reg_cache.extract_u32(rscratch1, k);
3400     __ eorw(rscratch3, rscratch3, r4);
3401     __ addw(rscratch4, rscratch4, rscratch1);
3402     __ addw(rscratch3, rscratch3, rscratch4);
3403     __ rorw(rscratch2, rscratch3, 32 - s);
3404     __ addw(r1, rscratch2, r2);
3405   }
3406 
3407   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3408               int k, int s, int t) {
3409     Register rscratch3 = r10;
3410     Register rscratch4 = r11;
3411 
3412     __ andw(rscratch3, r2, r4);
3413     __ bicw(rscratch4, r3, r4);
3414     reg_cache.extract_u32(rscratch1, k);
3415     __ movw(rscratch2, t);
3416     __ orrw(rscratch3, rscratch3, rscratch4);
3417     __ addw(rscratch4, r1, rscratch2);
3418     __ addw(rscratch4, rscratch4, rscratch1);
3419     __ addw(rscratch3, rscratch3, rscratch4);
3420     __ rorw(rscratch2, rscratch3, 32 - s);
3421     __ addw(r1, rscratch2, r2);
3422   }
3423 
3424   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3425               int k, int s, int t) {
3426     Register rscratch3 = r10;
3427     Register rscratch4 = r11;
3428 
3429     __ eorw(rscratch3, r3, r4);
3430     __ movw(rscratch2, t);
3431     __ addw(rscratch4, r1, rscratch2);
3432     reg_cache.extract_u32(rscratch1, k);
3433     __ eorw(rscratch3, rscratch3, r2);
3434     __ addw(rscratch4, rscratch4, rscratch1);
3435     __ addw(rscratch3, rscratch3, rscratch4);
3436     __ rorw(rscratch2, rscratch3, 32 - s);
3437     __ addw(r1, rscratch2, r2);
3438   }
3439 
3440   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3441               int k, int s, int t) {
3442     Register rscratch3 = r10;
3443     Register rscratch4 = r11;
3444 
3445     __ movw(rscratch3, t);
3446     __ ornw(rscratch2, r2, r4);
3447     __ addw(rscratch4, r1, rscratch3);
3448     reg_cache.extract_u32(rscratch1, k);
3449     __ eorw(rscratch3, rscratch2, r3);
3450     __ addw(rscratch4, rscratch4, rscratch1);
3451     __ addw(rscratch3, rscratch3, rscratch4);
3452     __ rorw(rscratch2, rscratch3, 32 - s);
3453     __ addw(r1, rscratch2, r2);
3454   }
3455 
3456   // Arguments:
3457   //
3458   // Inputs:
3459   //   c_rarg0   - byte[]  source+offset
3460   //   c_rarg1   - int[]   MD5.state
3461   //   c_rarg2   - int     offset
3462   //   c_rarg3   - int     limit
3463   //
3464   address generate_md5_implCompress(bool multi_block, const char *name) {
3465     __ align(CodeEntryAlignment);
3466     StubCodeMark mark(this, "StubRoutines", name);
3467     address start = __ pc();
3468 
3469     Register buf       = c_rarg0;
3470     Register state     = c_rarg1;
3471     Register ofs       = c_rarg2;
3472     Register limit     = c_rarg3;
3473     Register a         = r4;
3474     Register b         = r5;
3475     Register c         = r6;
3476     Register d         = r7;
3477     Register rscratch3 = r10;
3478     Register rscratch4 = r11;
3479 
3480     Register state_regs[2] = { r12, r13 };
3481     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3482     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3483 
3484     __ push(saved_regs, sp);
3485 
3486     __ ldp(state_regs[0], state_regs[1], Address(state));
3487     __ ubfx(a, state_regs[0],  0, 32);
3488     __ ubfx(b, state_regs[0], 32, 32);
3489     __ ubfx(c, state_regs[1],  0, 32);
3490     __ ubfx(d, state_regs[1], 32, 32);
3491 
3492     Label md5_loop;
3493     __ BIND(md5_loop);
3494 
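         // Pull the entire 64-byte block into the register cache up front; each
         // round helper then extracts its 32-bit message word with a ubfx
         // instead of a memory load.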
3495     reg_cache.gen_loads(buf);
3496 
3497     // Round 1
3498     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3499     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3500     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3501     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3502     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3503     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3504     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3505     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3506     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3507     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3508     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3509     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3510     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3511     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3512     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3513     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3514 
3515     // Round 2
3516     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3517     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3518     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3519     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3520     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3521     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3522     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3523     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3524     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3525     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3526     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3527     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3528     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3529     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3530     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3531     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3532 
3533     // Round 3
3534     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3535     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3536     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3537     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3538     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3539     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3540     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3541     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3542     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3543     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3544     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3545     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3546     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3547     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3548     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3549     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3550 
3551     // Round 4
3552     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3553     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3554     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3555     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3556     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3557     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3558     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3559     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3560     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3561     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3562     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3563     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3564     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3565     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3566     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3567     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3568 
3569     __ addw(a, state_regs[0], a);
3570     __ ubfx(rscratch2, state_regs[0], 32, 32);
3571     __ addw(b, rscratch2, b);
3572     __ addw(c, state_regs[1], c);
3573     __ ubfx(rscratch4, state_regs[1], 32, 32);
3574     __ addw(d, rscratch4, d);
3575 
3576     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3577     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3578 
3579     if (multi_block) {
3580       __ add(buf, buf, 64);
3581       __ add(ofs, ofs, 64);
3582       __ cmp(ofs, limit);
3583       __ br(Assembler::LE, md5_loop);
3584       __ mov(c_rarg0, ofs); // return ofs
3585     }
3586 
3587     // write hash values back in the correct order
3588     __ stp(state_regs[0], state_regs[1], Address(state));
3589 
3590     __ pop(saved_regs, sp);
3591 
3592     __ ret(lr);
3593 
3594     return start;
3595   }
3596 
3597   // Arguments:
3598   //
3599   // Inputs:
3600   //   c_rarg0   - byte[]  source+offset
3601   //   c_rarg1   - int[]   SHA.state
3602   //   c_rarg2   - int     offset
3603   //   c_rarg3   - int     limit
3604   //
3605   address generate_sha1_implCompress(bool multi_block, const char *name) {
3606     __ align(CodeEntryAlignment);
3607     StubCodeMark mark(this, "StubRoutines", name);
3608     address start = __ pc();
3609 
3610     Register buf   = c_rarg0;
3611     Register state = c_rarg1;
3612     Register ofs   = c_rarg2;
3613     Register limit = c_rarg3;
3614 
3615     Label keys;
3616     Label sha1_loop;
3617 
3618     // load the four round constants (at label 'keys'), each replicated across all lanes of v0..v3
3619     __ adr(rscratch1, keys);
3620     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3621     // load 5 words state into v6, v7
3622     __ ldrq(v6, Address(state, 0));
3623     __ ldrs(v7, Address(state, 16));
3624 
3625 
3626     __ BIND(sha1_loop);
3627     // load 64 bytes of data into v16..v19
3628     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3629     __ rev32(v16, __ T16B, v16);
3630     __ rev32(v17, __ T16B, v17);
3631     __ rev32(v18, __ T16B, v18);
3632     __ rev32(v19, __ T16B, v19);
3633 
3634     // do the sha1
3635     __ addv(v4, __ T4S, v16, v0);
3636     __ orr(v20, __ T16B, v6, v6);
3637 
3638     FloatRegister d0 = v16;
3639     FloatRegister d1 = v17;
3640     FloatRegister d2 = v18;
3641     FloatRegister d3 = v19;
3642 
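         // Each iteration of this loop retires four of the 80 SHA-1 rounds (20
         // iterations in total). The W+K value for the *next* iteration is
         // prepared in this one, which is why the round-constant boundaries are
         // tested at 4, 9 and 14 rather than 5, 10 and 15.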
3643     for (int round = 0; round < 20; round++) {
3644       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3645       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3646       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3647       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3648       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3649 
3650       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3651       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3652       __ sha1h(tmp2, __ T4S, v20);
3653       if (round < 5)
3654         __ sha1c(v20, __ T4S, tmp3, tmp4);
3655       else if (round < 10 || round >= 15)
3656         __ sha1p(v20, __ T4S, tmp3, tmp4);
3657       else
3658         __ sha1m(v20, __ T4S, tmp3, tmp4);
3659       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3660 
3661       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3662     }
3663 
3664     __ addv(v7, __ T2S, v7, v21);
3665     __ addv(v6, __ T4S, v6, v20);
3666 
3667     if (multi_block) {
3668       __ add(ofs, ofs, 64);
3669       __ cmp(ofs, limit);
3670       __ br(Assembler::LE, sha1_loop);
3671       __ mov(c_rarg0, ofs); // return ofs
3672     }
3673 
3674     __ strq(v6, Address(state, 0));
3675     __ strs(v7, Address(state, 16));
3676 
3677     __ ret(lr);
3678 
3679     __ bind(keys);
3680     __ emit_int32(0x5a827999);
3681     __ emit_int32(0x6ed9eba1);
3682     __ emit_int32(0x8f1bbcdc);
3683     __ emit_int32(0xca62c1d6);
3684 
3685     return start;
3686   }
3687 
3688 
3689   // Arguments:
3690   //
3691   // Inputs:
3692   //   c_rarg0   - byte[]  source+offset
3693   //   c_rarg1   - int[]   SHA.state
3694   //   c_rarg2   - int     offset
3695   //   c_rarg3   - int     limit
3696   //
3697   address generate_sha256_implCompress(bool multi_block, const char *name) {
3698     static const uint32_t round_consts[64] = {
3699       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3700       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3701       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3702       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3703       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3704       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3705       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3706       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3707       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3708       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3709       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3710       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3711       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3712       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3713       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3714       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3715     };
3716     __ align(CodeEntryAlignment);
3717     StubCodeMark mark(this, "StubRoutines", name);
3718     address start = __ pc();
3719 
3720     Register buf   = c_rarg0;
3721     Register state = c_rarg1;
3722     Register ofs   = c_rarg2;
3723     Register limit = c_rarg3;
3724 
3725     Label sha256_loop;
3726 
3727     __ stpd(v8, v9, __ pre(sp, -32));
3728     __ stpd(v10, v11, Address(sp, 16));
3729 
3730     // dga == v0
3731     // dgb == v1
3732     // dg0 == v2
3733     // dg1 == v3
3734     // dg2 == v4
3735     // t0 == v6
3736     // t1 == v7
3737 
3738     // load 16 keys to v16..v31
3739     __ lea(rscratch1, ExternalAddress((address)round_consts));
3740     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3741     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3742     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3743     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3744 
3745     // load 8 words (256 bits) state
3746     __ ldpq(v0, v1, state);
3747 
3748     __ BIND(sha256_loop);
3749     // load 64 bytes of data into v8..v11
3750     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3751     __ rev32(v8, __ T16B, v8);
3752     __ rev32(v9, __ T16B, v9);
3753     __ rev32(v10, __ T16B, v10);
3754     __ rev32(v11, __ T16B, v11);
3755 
3756     __ addv(v6, __ T4S, v8, v16);
3757     __ orr(v2, __ T16B, v0, v0);
3758     __ orr(v3, __ T16B, v1, v1);
3759 
3760     FloatRegister d0 = v8;
3761     FloatRegister d1 = v9;
3762     FloatRegister d2 = v10;
3763     FloatRegister d3 = v11;
3764 
3765 
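         // Each iteration retires four of the 64 SHA-256 rounds (16 iterations).
         // The round constants sit in v16..v31; message-schedule expansion
         // (sha256su0/su1) is only needed while new W values are still consumed,
         // i.e. for the first 12 iterations.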
3766     for (int round = 0; round < 16; round++) {
3767       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3768       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3769       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3770       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3771 
3772       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3773        __ orr(v4, __ T16B, v2, v2);
3774       if (round < 15)
3775         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3776       __ sha256h(v2, __ T4S, v3, tmp2);
3777       __ sha256h2(v3, __ T4S, v4, tmp2);
3778       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3779 
3780       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3781     }
3782 
3783     __ addv(v0, __ T4S, v0, v2);
3784     __ addv(v1, __ T4S, v1, v3);
3785 
3786     if (multi_block) {
3787       __ add(ofs, ofs, 64);
3788       __ cmp(ofs, limit);
3789       __ br(Assembler::LE, sha256_loop);
3790       __ mov(c_rarg0, ofs); // return ofs
3791     }
3792 
3793     __ ldpd(v10, v11, Address(sp, 16));
3794     __ ldpd(v8, v9, __ post(sp, 32));
3795 
3796     __ stpq(v0, v1, state);
3797 
3798     __ ret(lr);
3799 
3800     return start;
3801   }
3802 
3803   // Double rounds for sha512.
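       // Each call retires two of the 80 SHA-512 rounds. vi0..vi4 are the rotating
       // working-state vectors, vrc0 holds the current pair of round constants and
       // vrc1 receives the next pair (loaded while dr < 36), and vin0..vin4 feed
       // the message-schedule update, which is only needed while dr < 32.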
3804   void sha512_dround(int dr,
3805                      FloatRegister vi0, FloatRegister vi1,
3806                      FloatRegister vi2, FloatRegister vi3,
3807                      FloatRegister vi4, FloatRegister vrc0,
3808                      FloatRegister vrc1, FloatRegister vin0,
3809                      FloatRegister vin1, FloatRegister vin2,
3810                      FloatRegister vin3, FloatRegister vin4) {
3811       if (dr < 36) {
3812         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3813       }
3814       __ addv(v5, __ T2D, vrc0, vin0);
3815       __ ext(v6, __ T16B, vi2, vi3, 8);
3816       __ ext(v5, __ T16B, v5, v5, 8);
3817       __ ext(v7, __ T16B, vi1, vi2, 8);
3818       __ addv(vi3, __ T2D, vi3, v5);
3819       if (dr < 32) {
3820         __ ext(v5, __ T16B, vin3, vin4, 8);
3821         __ sha512su0(vin0, __ T2D, vin1);
3822       }
3823       __ sha512h(vi3, __ T2D, v6, v7);
3824       if (dr < 32) {
3825         __ sha512su1(vin0, __ T2D, vin2, v5);
3826       }
3827       __ addv(vi4, __ T2D, vi1, vi3);
3828       __ sha512h2(vi3, __ T2D, vi1, vi0);
3829   }
3830 
3831   // Arguments:
3832   //
3833   // Inputs:
3834   //   c_rarg0   - byte[]  source+offset
3835   //   c_rarg1   - int[]   SHA.state
3836   //   c_rarg2   - int     offset
3837   //   c_rarg3   - int     limit
3838   //
3839   address generate_sha512_implCompress(bool multi_block, const char *name) {
3840     static const uint64_t round_consts[80] = {
3841       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3842       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3843       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3844       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3845       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3846       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3847       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3848       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3849       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3850       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3851       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3852       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3853       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3854       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3855       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3856       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3857       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3858       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3859       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3860       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3861       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3862       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3863       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3864       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3865       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3866       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3867       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3868     };
3869 
3870     __ align(CodeEntryAlignment);
3871     StubCodeMark mark(this, "StubRoutines", name);
3872     address start = __ pc();
3873 
3874     Register buf   = c_rarg0;
3875     Register state = c_rarg1;
3876     Register ofs   = c_rarg2;
3877     Register limit = c_rarg3;
3878 
3879     __ stpd(v8, v9, __ pre(sp, -64));
3880     __ stpd(v10, v11, Address(sp, 16));
3881     __ stpd(v12, v13, Address(sp, 32));
3882     __ stpd(v14, v15, Address(sp, 48));
3883 
3884     Label sha512_loop;
3885 
3886     // load state
3887     __ ld1(v8, v9, v10, v11, __ T2D, state);
3888 
3889     // load first 4 round constants
3890     __ lea(rscratch1, ExternalAddress((address)round_consts));
3891     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3892 
3893     __ BIND(sha512_loop);
3894     // load 128B of data into v12..v19
3895     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3896     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3897     __ rev64(v12, __ T16B, v12);
3898     __ rev64(v13, __ T16B, v13);
3899     __ rev64(v14, __ T16B, v14);
3900     __ rev64(v15, __ T16B, v15);
3901     __ rev64(v16, __ T16B, v16);
3902     __ rev64(v17, __ T16B, v17);
3903     __ rev64(v18, __ T16B, v18);
3904     __ rev64(v19, __ T16B, v19);
3905 
3906     __ mov(rscratch2, rscratch1);
3907 
3908     __ mov(v0, __ T16B, v8);
3909     __ mov(v1, __ T16B, v9);
3910     __ mov(v2, __ T16B, v10);
3911     __ mov(v3, __ T16B, v11);
3912 
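         // 40 double-rounds cover all 80 SHA-512 rounds; the five state/temp
         // vectors v0..v4 rotate through the vi0..vi4 argument positions from
         // call to call.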
3913     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3914     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3915     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3916     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3917     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3918     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3919     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3920     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3921     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3922     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3923     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3924     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3925     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3926     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3927     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3928     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3929     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3930     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3931     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3932     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3933     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3934     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3935     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3936     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3937     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3938     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3939     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3940     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3941     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3942     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3943     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3944     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3945     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3946     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3947     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3948     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3949     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3950     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3951     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3952     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3953 
3954     __ addv(v8, __ T2D, v8, v0);
3955     __ addv(v9, __ T2D, v9, v1);
3956     __ addv(v10, __ T2D, v10, v2);
3957     __ addv(v11, __ T2D, v11, v3);
3958 
3959     if (multi_block) {
3960       __ add(ofs, ofs, 128);
3961       __ cmp(ofs, limit);
3962       __ br(Assembler::LE, sha512_loop);
3963       __ mov(c_rarg0, ofs); // return ofs
3964     }
3965 
3966     __ st1(v8, v9, v10, v11, __ T2D, state);
3967 
3968     __ ldpd(v14, v15, Address(sp, 48));
3969     __ ldpd(v12, v13, Address(sp, 32));
3970     __ ldpd(v10, v11, Address(sp, 16));
3971     __ ldpd(v8, v9, __ post(sp, 64));
3972 
3973     __ ret(lr);
3974 
3975     return start;
3976   }
3977 
3978   // Arguments:
3979   //
3980   // Inputs:
3981   //   c_rarg0   - byte[]  source+offset
3982   //   c_rarg1   - byte[]  SHA.state
3983   //   c_rarg2   - int     block_size
3984   //   c_rarg3   - int     offset
3985   //   c_rarg4   - int     limit
3986   //
3987   address generate_sha3_implCompress(bool multi_block, const char *name) {
3988     static const uint64_t round_consts[24] = {
3989       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3990       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3991       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3992       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3993       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3994       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3995       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3996       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3997     };
3998 
3999     __ align(CodeEntryAlignment);
4000     StubCodeMark mark(this, "StubRoutines", name);
4001     address start = __ pc();
4002 
4003     Register buf           = c_rarg0;
4004     Register state         = c_rarg1;
4005     Register block_size    = c_rarg2;
4006     Register ofs           = c_rarg3;
4007     Register limit         = c_rarg4;
4008 
4009     Label sha3_loop, rounds24_loop;
4010     Label sha3_512_or_sha3_384, shake128;
4011 
4012     __ stpd(v8, v9, __ pre(sp, -64));
4013     __ stpd(v10, v11, Address(sp, 16));
4014     __ stpd(v12, v13, Address(sp, 32));
4015     __ stpd(v14, v15, Address(sp, 48));
4016 
4017     // load state
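         // The Keccak-1600 state is 25 64-bit lanes, kept one lane per SIMD
         // register in v0..v24 (loaded as T1D), leaving v25..v31 free as scratch
         // for absorbing the input and for the round computation.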
4018     __ add(rscratch1, state, 32);
4019     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
4020     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
4021     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4022     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4023     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4024     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4025     __ ld1(v24, __ T1D, rscratch1);
4026 
4027     __ BIND(sha3_loop);
4028 
4029     // 24 keccak rounds
4030     __ movw(rscratch2, 24);
4031 
4032     // load round_constants base
4033     __ lea(rscratch1, ExternalAddress((address) round_consts));
4034 
4035     // load input
4036     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4037     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4038     __ eor(v0, __ T8B, v0, v25);
4039     __ eor(v1, __ T8B, v1, v26);
4040     __ eor(v2, __ T8B, v2, v27);
4041     __ eor(v3, __ T8B, v3, v28);
4042     __ eor(v4, __ T8B, v4, v29);
4043     __ eor(v5, __ T8B, v5, v30);
4044     __ eor(v6, __ T8B, v6, v31);
4045 
4046     // block_size (the rate in bytes) is 72 for SHA3-512 or 104 for SHA3-384; these are the only rates with bit 7 clear
4047     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4048 
4049     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4050     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4051     __ eor(v7, __ T8B, v7, v25);
4052     __ eor(v8, __ T8B, v8, v26);
4053     __ eor(v9, __ T8B, v9, v27);
4054     __ eor(v10, __ T8B, v10, v28);
4055     __ eor(v11, __ T8B, v11, v29);
4056     __ eor(v12, __ T8B, v12, v30);
4057     __ eor(v13, __ T8B, v13, v31);
4058 
4059     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4060     __ eor(v14, __ T8B, v14, v25);
4061     __ eor(v15, __ T8B, v15, v26);
4062     __ eor(v16, __ T8B, v16, v27);
4063 
4064     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4065     __ andw(c_rarg5, block_size, 48);
4066     __ cbzw(c_rarg5, rounds24_loop);
4067 
4068     __ tbnz(block_size, 5, shake128);
4069     // block_size == 144, bit5 == 0, SHA3-224
4070     __ ldrd(v28, __ post(buf, 8));
4071     __ eor(v17, __ T8B, v17, v28);
4072     __ b(rounds24_loop);
4073 
4074     __ BIND(shake128);
4075     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4076     __ eor(v17, __ T8B, v17, v28);
4077     __ eor(v18, __ T8B, v18, v29);
4078     __ eor(v19, __ T8B, v19, v30);
4079     __ eor(v20, __ T8B, v20, v31);
4080     __ b(rounds24_loop); // block_size == 168, SHAKE128
4081 
4082     __ BIND(sha3_512_or_sha3_384);
4083     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4084     __ eor(v7, __ T8B, v7, v25);
4085     __ eor(v8, __ T8B, v8, v26);
4086     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4087 
4088     // SHA3-384
4089     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4090     __ eor(v9,  __ T8B, v9,  v27);
4091     __ eor(v10, __ T8B, v10, v28);
4092     __ eor(v11, __ T8B, v11, v29);
4093     __ eor(v12, __ T8B, v12, v30);
4094 
4095     __ BIND(rounds24_loop);
4096     __ subw(rscratch2, rscratch2, 1);
4097 
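         // One round of Keccak-f[1600] using the SHA-3 extension instructions:
         // theta is the eor3/rax1 group below, rho and pi are folded into the
         // xar rotations, chi is the bcax group, and iota is the final eor with
         // the round constant loaded by ld1r.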
4098     __ eor3(v29, __ T16B, v4, v9, v14);
4099     __ eor3(v26, __ T16B, v1, v6, v11);
4100     __ eor3(v28, __ T16B, v3, v8, v13);
4101     __ eor3(v25, __ T16B, v0, v5, v10);
4102     __ eor3(v27, __ T16B, v2, v7, v12);
4103     __ eor3(v29, __ T16B, v29, v19, v24);
4104     __ eor3(v26, __ T16B, v26, v16, v21);
4105     __ eor3(v28, __ T16B, v28, v18, v23);
4106     __ eor3(v25, __ T16B, v25, v15, v20);
4107     __ eor3(v27, __ T16B, v27, v17, v22);
4108 
4109     __ rax1(v30, __ T2D, v29, v26);
4110     __ rax1(v26, __ T2D, v26, v28);
4111     __ rax1(v28, __ T2D, v28, v25);
4112     __ rax1(v25, __ T2D, v25, v27);
4113     __ rax1(v27, __ T2D, v27, v29);
4114 
4115     __ eor(v0, __ T16B, v0, v30);
4116     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4117     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4118     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4119     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4120     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4121     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4122     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4123     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4124     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4125     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4126     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4127     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4128     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4129     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4130     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4131     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4132     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4133     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4134     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4135     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4136     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4137     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4138     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4139     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4140 
4141     __ bcax(v20, __ T16B, v31, v22, v8);
4142     __ bcax(v21, __ T16B, v8,  v23, v22);
4143     __ bcax(v22, __ T16B, v22, v24, v23);
4144     __ bcax(v23, __ T16B, v23, v31, v24);
4145     __ bcax(v24, __ T16B, v24, v8,  v31);
4146 
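         // Load the next round constant for the iota step; rscratch1 is assumed to
         // point into the 24-entry Keccak round-constant table set up before the loop.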
4147     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4148 
4149     __ bcax(v17, __ T16B, v25, v19, v3);
4150     __ bcax(v18, __ T16B, v3,  v15, v19);
4151     __ bcax(v19, __ T16B, v19, v16, v15);
4152     __ bcax(v15, __ T16B, v15, v25, v16);
4153     __ bcax(v16, __ T16B, v16, v3,  v25);
4154 
4155     __ bcax(v10, __ T16B, v29, v12, v26);
4156     __ bcax(v11, __ T16B, v26, v13, v12);
4157     __ bcax(v12, __ T16B, v12, v14, v13);
4158     __ bcax(v13, __ T16B, v13, v29, v14);
4159     __ bcax(v14, __ T16B, v14, v26, v29);
4160 
4161     __ bcax(v7, __ T16B, v30, v9,  v4);
4162     __ bcax(v8, __ T16B, v4,  v5,  v9);
4163     __ bcax(v9, __ T16B, v9,  v6,  v5);
4164     __ bcax(v5, __ T16B, v5,  v30, v6);
4165     __ bcax(v6, __ T16B, v6,  v4,  v30);
4166 
4167     __ bcax(v3, __ T16B, v27, v0,  v28);
4168     __ bcax(v4, __ T16B, v28, v1,  v0);
4169     __ bcax(v0, __ T16B, v0,  v2,  v1);
4170     __ bcax(v1, __ T16B, v1,  v27, v2);
4171     __ bcax(v2, __ T16B, v2,  v28, v27);
4172 
4173     __ eor(v0, __ T16B, v0, v31);
4174 
4175     __ cbnzw(rscratch2, rounds24_loop);
4176 
4177     if (multi_block) {
4178       __ add(ofs, ofs, block_size);
4179       __ cmp(ofs, limit);
4180       __ br(Assembler::LE, sha3_loop);
4181       __ mov(c_rarg0, ofs); // return ofs
4182     }
4183 
4184     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4185     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4186     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4187     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4188     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4189     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4190     __ st1(v24, __ T1D, state);
4191 
4192     __ ldpd(v14, v15, Address(sp, 48));
4193     __ ldpd(v12, v13, Address(sp, 32));
4194     __ ldpd(v10, v11, Address(sp, 16));
4195     __ ldpd(v8, v9, __ post(sp, 64));
4196 
4197     __ ret(lr);
4198 
4199     return start;
4200   }
4201 
4202   /**
4203    *  Arguments:
4204    *
4205    * Inputs:
4206    *   c_rarg0   - int crc
4207    *   c_rarg1   - byte* buf
4208    *   c_rarg2   - int length
4209    *
4210    * Output:
4211    *       r0   - int crc result
4212    */
4213   address generate_updateBytesCRC32() {
4214     assert(UseCRC32Intrinsics, "what are we doing here?");
4215 
4216     __ align(CodeEntryAlignment);
4217     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4218 
4219     address start = __ pc();
4220 
4221     const Register crc   = c_rarg0;  // crc
4222     const Register buf   = c_rarg1;  // source java byte array address
4223     const Register len   = c_rarg2;  // length
4224     const Register table0 = c_rarg3; // crc_table address
4225     const Register table1 = c_rarg4;
4226     const Register table2 = c_rarg5;
4227     const Register table3 = c_rarg6;
4228     const Register tmp3 = c_rarg7;
4229 
4230     BLOCK_COMMENT("Entry:");
4231     __ enter(); // required for proper stackwalking of RuntimeStub frame
4232 
4233     __ kernel_crc32(crc, buf, len,
4234               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4235 
4236     __ leave(); // required for proper stackwalking of RuntimeStub frame
4237     __ ret(lr);
4238 
4239     return start;
4240   }
4241 
4242   // ChaCha20 block function.  This version parallelizes by loading
4243   // individual 32-bit state elements into vectors for four blocks
4244   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4245   //
4246   // state (int[16]) = c_rarg0
4247   // keystream (byte[1024]) = c_rarg1
4248   // return - number of bytes of keystream (always 256)
4249   address generate_chacha20Block_blockpar() {
4250     Label L_twoRounds, L_cc20_const;
4251     // The constant data is broken into two 128-bit segments to be loaded
4252     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4253     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4254     // The second 128 bits are a table constant used for 8-bit left rotations.
4255     __ BIND(L_cc20_const);
4256     __ emit_int64(0x0000000100000000UL);
4257     __ emit_int64(0x0000000300000002UL);
4258     __ emit_int64(0x0605040702010003UL);
4259     __ emit_int64(0x0E0D0C0F0A09080BUL);
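         // Read as little-endian bytes, the second pair of words is the per-lane index
         // pattern {3, 0, 1, 2, 7, 4, 5, 6, ...} that tbl uses to rotate each 32-bit
         // lane left by 8 bits.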
4260 
4261     __ align(CodeEntryAlignment);
4262     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4263     address start = __ pc();
4264     __ enter();
4265 
4266     int i, j;
4267     const Register state = c_rarg0;
4268     const Register keystream = c_rarg1;
4269     const Register loopCtr = r10;
4270     const Register tmpAddr = r11;
4271 
4272     const FloatRegister stateFirst = v0;
4273     const FloatRegister stateSecond = v1;
4274     const FloatRegister stateThird = v2;
4275     const FloatRegister stateFourth = v3;
4276     const FloatRegister origCtrState = v28;
4277     const FloatRegister scratch = v29;
4278     const FloatRegister lrot8Tbl = v30;
4279 
4280     // Organize SIMD registers in an array that facilitates
4281     // putting repetitive opcodes into loop structures.  It is
4282     // important that each grouping of 4 registers is monotonically
4283     // increasing to support the requirements of multi-register
4284     // instructions (e.g. ld4r, st4, etc.)
4285     const FloatRegister workSt[16] = {
4286          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4287         v20, v21, v22, v23, v24, v25, v26, v27
4288     };
4289 
4290     // Load from memory and interlace across 16 SIMD registers,
4291     // with each word from memory being broadcast to all lanes of
4292     // each successive SIMD register.
4293     //      Addr(0) -> All lanes in workSt[i]
4294     //      Addr(4) -> All lanes workSt[i + 1], etc.
4295     __ mov(tmpAddr, state);
4296     for (i = 0; i < 16; i += 4) {
4297       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4298           __ post(tmpAddr, 16));
4299     }
4300 
4301     // Pull in constant data.  The first 16 bytes are the add overlay
4302     // which is applied to the vector holding the counter (state[12]).
4303     // The second 16 bytes are the index vector used by the tbl
4304     // instruction for the 8-bit left rotation.
4305     __ adr(tmpAddr, L_cc20_const);
4306     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4307     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4308 
4309     // Set up the 10 iteration loop and perform all 8 quarter round ops
4310     __ mov(loopCtr, 10);
4311     __ BIND(L_twoRounds);
4312 
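         // The first four quarter rounds mix the columns of the 4x4 state
         // ({0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15}); the next four mix the
         // diagonals ({0,5,10,15}, {1,6,11,12}, {2,7,8,13}, {3,4,9,14}).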
4313     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4314         scratch, lrot8Tbl);
4315     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4316         scratch, lrot8Tbl);
4317     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4318         scratch, lrot8Tbl);
4319     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4320         scratch, lrot8Tbl);
4321 
4322     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4323         scratch, lrot8Tbl);
4324     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4325         scratch, lrot8Tbl);
4326     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4327         scratch, lrot8Tbl);
4328     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4329         scratch, lrot8Tbl);
4330 
4331     // Decrement and iterate
4332     __ sub(loopCtr, loopCtr, 1);
4333     __ cbnz(loopCtr, L_twoRounds);
4334 
4335     __ mov(tmpAddr, state);
4336 
4337     // Add the starting state back to the post-loop keystream
4338     // state.  We read/interlace the state array from memory into
4339     // 4 registers similar to what we did in the beginning.  Then
4340     // add the counter overlay onto workSt[12] at the end.
4341     for (i = 0; i < 16; i += 4) {
4342       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4343           __ post(tmpAddr, 16));
4344       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4345       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4346       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4347       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4348     }
4349     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4350 
4351     // Write to key stream, storing the same element out of workSt[0..15]
4352     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4353     // for the next element position.
4354     for (i = 0; i < 4; i++) {
4355       for (j = 0; j < 16; j += 4) {
4356         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4357             __ post(keystream, 16));
4358       }
4359     }
4360 
4361     __ mov(r0, 256);             // Return length of output keystream
4362     __ leave();
4363     __ ret(lr);
4364 
4365     return start;
4366   }
4367 
4368   /**
4369    *  Arguments:
4370    *
4371    * Inputs:
4372    *   c_rarg0   - int crc
4373    *   c_rarg1   - byte* buf
4374    *   c_rarg2   - int length
4375    *   c_rarg3   - int* table
4376    *
4377    * Output:
4378    *       r0   - int crc result
4379    */
4380   address generate_updateBytesCRC32C() {
4381     assert(UseCRC32CIntrinsics, "what are we doing here?");
4382 
4383     __ align(CodeEntryAlignment);
4384     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4385 
4386     address start = __ pc();
4387 
4388     const Register crc   = c_rarg0;  // crc
4389     const Register buf   = c_rarg1;  // source java byte array address
4390     const Register len   = c_rarg2;  // length
4391     const Register table0 = c_rarg3; // crc_table address
4392     const Register table1 = c_rarg4;
4393     const Register table2 = c_rarg5;
4394     const Register table3 = c_rarg6;
4395     const Register tmp3 = c_rarg7;
4396 
4397     BLOCK_COMMENT("Entry:");
4398     __ enter(); // required for proper stackwalking of RuntimeStub frame
4399 
4400     __ kernel_crc32c(crc, buf, len,
4401               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4402 
4403     __ leave(); // required for proper stackwalking of RuntimeStub frame
4404     __ ret(lr);
4405 
4406     return start;
4407   }
4408 
4409   /***
4410    *  Arguments:
4411    *
4412    *  Inputs:
4413    *   c_rarg0   - int   adler
4414    *   c_rarg1   - byte* buff
4415    *   c_rarg2   - int   len
4416    *
4417    * Output:
4418    *   c_rarg0   - int adler result
4419    */
4420   address generate_updateBytesAdler32() {
4421     __ align(CodeEntryAlignment);
4422     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4423     address start = __ pc();
4424 
4425     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4426 
4427     // Aliases
4428     Register adler  = c_rarg0;
4429     Register s1     = c_rarg0;
4430     Register s2     = c_rarg3;
4431     Register buff   = c_rarg1;
4432     Register len    = c_rarg2;
4433     Register nmax  = r4;
4434     Register base  = r5;
4435     Register count = r6;
4436     Register temp0 = rscratch1;
4437     Register temp1 = rscratch2;
4438     FloatRegister vbytes = v0;
4439     FloatRegister vs1acc = v1;
4440     FloatRegister vs2acc = v2;
4441     FloatRegister vtable = v3;
4442 
4443     // Max number of bytes we can process before having to take the mod
4444     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4445     uint64_t BASE = 0xfff1;
4446     uint64_t NMAX = 0x15B0;
4447 
4448     __ mov(base, BASE);
4449     __ mov(nmax, NMAX);
4450 
4451     // Load accumulation coefficients for the upper 16 bits
4452     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4453     __ ld1(vtable, __ T16B, Address(temp0));
4454 
4455     // s1 is initialized to the lower 16 bits of adler
4456     // s2 is initialized to the upper 16 bits of adler
4457     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4458     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4459 
4460     // The pipelined loop needs at least 16 elements for one iteration.
4461     // It does check this, but it is more efficient to skip straight to the cleanup loop here
4462     __ cmp(len, (u1)16);
4463     __ br(Assembler::HS, L_nmax);
4464     __ cbz(len, L_combine);
4465 
4466     __ bind(L_simple_by1_loop);
4467     __ ldrb(temp0, Address(__ post(buff, 1)));
4468     __ add(s1, s1, temp0);
4469     __ add(s2, s2, s1);
4470     __ subs(len, len, 1);
4471     __ br(Assembler::HI, L_simple_by1_loop);
4472 
4473     // s1 = s1 % BASE
4474     __ subs(temp0, s1, base);
4475     __ csel(s1, temp0, s1, Assembler::HS);
4476 
4477     // s2 = s2 % BASE
4478     __ lsr(temp0, s2, 16);
4479     __ lsl(temp1, temp0, 4);
4480     __ sub(temp1, temp1, temp0);
4481     __ add(s2, temp1, s2, ext::uxth);
4482 
4483     __ subs(temp0, s2, base);
4484     __ csel(s2, temp0, s2, Assembler::HS);
4485 
4486     __ b(L_combine);
4487 
4488     __ bind(L_nmax);
4489     __ subs(len, len, nmax);
4490     __ sub(count, nmax, 16);
4491     __ br(Assembler::LO, L_by16);
4492 
4493     __ bind(L_nmax_loop);
4494 
4495     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4496                                       vbytes, vs1acc, vs2acc, vtable);
4497 
4498     __ subs(count, count, 16);
4499     __ br(Assembler::HS, L_nmax_loop);
4500 
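         // Reduce modulo BASE (65521): since 2^16 == 15 (mod 65521), a value x can be
         // folded as (x >> 16) * 15 + (x & 0xffff), with (h << 4) - h computing h * 15.
         // Two folds followed by a conditional subtract bring the result into [0, BASE).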
4501     // s1 = s1 % BASE
4502     __ lsr(temp0, s1, 16);
4503     __ lsl(temp1, temp0, 4);
4504     __ sub(temp1, temp1, temp0);
4505     __ add(temp1, temp1, s1, ext::uxth);
4506 
4507     __ lsr(temp0, temp1, 16);
4508     __ lsl(s1, temp0, 4);
4509     __ sub(s1, s1, temp0);
4510     __ add(s1, s1, temp1, ext::uxth);
4511 
4512     __ subs(temp0, s1, base);
4513     __ csel(s1, temp0, s1, Assembler::HS);
4514 
4515     // s2 = s2 % BASE
4516     __ lsr(temp0, s2, 16);
4517     __ lsl(temp1, temp0, 4);
4518     __ sub(temp1, temp1, temp0);
4519     __ add(temp1, temp1, s2, ext::uxth);
4520 
4521     __ lsr(temp0, temp1, 16);
4522     __ lsl(s2, temp0, 4);
4523     __ sub(s2, s2, temp0);
4524     __ add(s2, s2, temp1, ext::uxth);
4525 
4526     __ subs(temp0, s2, base);
4527     __ csel(s2, temp0, s2, Assembler::HS);
4528 
4529     __ subs(len, len, nmax);
4530     __ sub(count, nmax, 16);
4531     __ br(Assembler::HS, L_nmax_loop);
4532 
4533     __ bind(L_by16);
4534     __ adds(len, len, count);
4535     __ br(Assembler::LO, L_by1);
4536 
4537     __ bind(L_by16_loop);
4538 
4539     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4540                                       vbytes, vs1acc, vs2acc, vtable);
4541 
4542     __ subs(len, len, 16);
4543     __ br(Assembler::HS, L_by16_loop);
4544 
4545     __ bind(L_by1);
4546     __ adds(len, len, 15);
4547     __ br(Assembler::LO, L_do_mod);
4548 
4549     __ bind(L_by1_loop);
4550     __ ldrb(temp0, Address(__ post(buff, 1)));
4551     __ add(s1, temp0, s1);
4552     __ add(s2, s2, s1);
4553     __ subs(len, len, 1);
4554     __ br(Assembler::HS, L_by1_loop);
4555 
4556     __ bind(L_do_mod);
4557     // s1 = s1 % BASE
4558     __ lsr(temp0, s1, 16);
4559     __ lsl(temp1, temp0, 4);
4560     __ sub(temp1, temp1, temp0);
4561     __ add(temp1, temp1, s1, ext::uxth);
4562 
4563     __ lsr(temp0, temp1, 16);
4564     __ lsl(s1, temp0, 4);
4565     __ sub(s1, s1, temp0);
4566     __ add(s1, s1, temp1, ext::uxth);
4567 
4568     __ subs(temp0, s1, base);
4569     __ csel(s1, temp0, s1, Assembler::HS);
4570 
4571     // s2 = s2 % BASE
4572     __ lsr(temp0, s2, 16);
4573     __ lsl(temp1, temp0, 4);
4574     __ sub(temp1, temp1, temp0);
4575     __ add(temp1, temp1, s2, ext::uxth);
4576 
4577     __ lsr(temp0, temp1, 16);
4578     __ lsl(s2, temp0, 4);
4579     __ sub(s2, s2, temp0);
4580     __ add(s2, s2, temp1, ext::uxth);
4581 
4582     __ subs(temp0, s2, base);
4583     __ csel(s2, temp0, s2, Assembler::HS);
4584 
4585     // Combine lower bits and higher bits
4586     __ bind(L_combine);
4587     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4588 
4589     __ ret(lr);
4590 
4591     return start;
4592   }
4593 
4594   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4595           Register temp0, Register temp1, FloatRegister vbytes,
4596           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4597     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4598     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4599     // In non-vectorized code, we update s1 and s2 as:
4600     //   s1 <- s1 + b1
4601     //   s2 <- s2 + s1
4602     //   s1 <- s1 + b2
4603     //   s2 <- s2 + s1
4604     //   ...
4605     //   s1 <- s1 + b16
4606     //   s2 <- s2 + s1
4607     // Putting above assignments together, we have:
4608     //   s1_new = s1 + b1 + b2 + ... + b16
4609     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4610     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4611     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
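         // vtable is assumed to hold the byte weights {16, 15, ..., 1} loaded from
         // StubRoutines::aarch64::_adler_table above; the umull/umlal pair below forms
         // the weighted products and the uaddlv instructions reduce them to scalars.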
4612     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4613 
4614     // s2 = s2 + s1 * 16
4615     __ add(s2, s2, s1, Assembler::LSL, 4);
4616 
4617     // vs1acc = b1 + b2 + b3 + ... + b16
4618     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4619     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4620     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4621     __ uaddlv(vs1acc, __ T16B, vbytes);
4622     __ uaddlv(vs2acc, __ T8H, vs2acc);
4623 
4624     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4625     __ fmovd(temp0, vs1acc);
4626     __ fmovd(temp1, vs2acc);
4627     __ add(s1, s1, temp0);
4628     __ add(s2, s2, temp1);
4629   }
4630 
4631   /**
4632    *  Arguments:
4633    *
4634    *  Input:
4635    *    c_rarg0   - x address
4636    *    c_rarg1   - x length
4637    *    c_rarg2   - y address
4638    *    c_rarg3   - y length
4639    *    c_rarg4   - z address
4640    *    c_rarg5   - z length
4641    */
4642   address generate_multiplyToLen() {
4643     __ align(CodeEntryAlignment);
4644     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4645 
4646     address start = __ pc();
4647     const Register x     = r0;
4648     const Register xlen  = r1;
4649     const Register y     = r2;
4650     const Register ylen  = r3;
4651     const Register z     = r4;
4652     const Register zlen  = r5;
4653 
4654     const Register tmp1  = r10;
4655     const Register tmp2  = r11;
4656     const Register tmp3  = r12;
4657     const Register tmp4  = r13;
4658     const Register tmp5  = r14;
4659     const Register tmp6  = r15;
4660     const Register tmp7  = r16;
4661 
4662     BLOCK_COMMENT("Entry:");
4663     __ enter(); // required for proper stackwalking of RuntimeStub frame
4664     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4665     __ leave(); // required for proper stackwalking of RuntimeStub frame
4666     __ ret(lr);
4667 
4668     return start;
4669   }
4670 
4671   address generate_squareToLen() {
4672     // The squareToLen algorithm for sizes 1..127 described in the Java code is
4673     // faster than multiply_to_len on some CPUs and slower on others, but
4674     // multiply_to_len shows slightly better results overall
4675     __ align(CodeEntryAlignment);
4676     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4677     address start = __ pc();
4678 
4679     const Register x     = r0;
4680     const Register xlen  = r1;
4681     const Register z     = r2;
4682     const Register zlen  = r3;
4683     const Register y     = r4; // == x
4684     const Register ylen  = r5; // == xlen
4685 
4686     const Register tmp1  = r10;
4687     const Register tmp2  = r11;
4688     const Register tmp3  = r12;
4689     const Register tmp4  = r13;
4690     const Register tmp5  = r14;
4691     const Register tmp6  = r15;
4692     const Register tmp7  = r16;
4693 
4694     RegSet spilled_regs = RegSet::of(y, ylen);
4695     BLOCK_COMMENT("Entry:");
4696     __ enter();
4697     __ push(spilled_regs, sp);
4698     __ mov(y, x);
4699     __ mov(ylen, xlen);
4700     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4701     __ pop(spilled_regs, sp);
4702     __ leave();
4703     __ ret(lr);
4704     return start;
4705   }
4706 
4707   address generate_mulAdd() {
4708     __ align(CodeEntryAlignment);
4709     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4710 
4711     address start = __ pc();
4712 
4713     const Register out     = r0;
4714     const Register in      = r1;
4715     const Register offset  = r2;
4716     const Register len     = r3;
4717     const Register k       = r4;
4718 
4719     BLOCK_COMMENT("Entry:");
4720     __ enter();
4721     __ mul_add(out, in, offset, len, k);
4722     __ leave();
4723     __ ret(lr);
4724 
4725     return start;
4726   }
4727 
4728   // Arguments:
4729   //
4730   // Input:
4731   //   c_rarg0   - newArr address
4732   //   c_rarg1   - oldArr address
4733   //   c_rarg2   - newIdx
4734   //   c_rarg3   - shiftCount
4735   //   c_rarg4   - numIter
4736   //
4737   address generate_bigIntegerRightShift() {
4738     __ align(CodeEntryAlignment);
4739     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4740     address start = __ pc();
4741 
4742     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4743 
4744     Register newArr        = c_rarg0;
4745     Register oldArr        = c_rarg1;
4746     Register newIdx        = c_rarg2;
4747     Register shiftCount    = c_rarg3;
4748     Register numIter       = c_rarg4;
4749     Register idx           = numIter;
4750 
4751     Register newArrCur     = rscratch1;
4752     Register shiftRevCount = rscratch2;
4753     Register oldArrCur     = r13;
4754     Register oldArrNext    = r14;
4755 
4756     FloatRegister oldElem0        = v0;
4757     FloatRegister oldElem1        = v1;
4758     FloatRegister newElem         = v2;
4759     FloatRegister shiftVCount     = v3;
4760     FloatRegister shiftVRevCount  = v4;
4761 
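         // Each output word combines two adjacent input words, working downwards from
         // the highest index; conceptually (with >>> an unsigned shift):
         //   newArr[i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))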
4762     __ cbz(idx, Exit);
4763 
4764     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4765 
4766     // left shift count
4767     __ movw(shiftRevCount, 32);
4768     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4769 
4770     // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
4771     __ cmp(numIter, (u1)4);
4772     __ br(Assembler::LT, ShiftThree);
4773 
4774     __ dup(shiftVCount,    __ T4S, shiftCount);
4775     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4776     __ negr(shiftVCount,   __ T4S, shiftVCount);
4777 
4778     __ BIND(ShiftSIMDLoop);
4779 
4780     // Calculate the load addresses
4781     __ sub(idx, idx, 4);
4782     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4783     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4784     __ add(oldArrCur,  oldArrNext, 4);
4785 
4786     // Load 4 words and process
4787     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4788     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4789     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4790     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4791     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4792     __ st1(newElem,   __ T4S,  Address(newArrCur));
4793 
4794     __ cmp(idx, (u1)4);
4795     __ br(Assembler::LT, ShiftTwoLoop);
4796     __ b(ShiftSIMDLoop);
4797 
4798     __ BIND(ShiftTwoLoop);
4799     __ cbz(idx, Exit);
4800     __ cmp(idx, (u1)1);
4801     __ br(Assembler::EQ, ShiftOne);
4802 
4803     // Calculate the load addresses
4804     __ sub(idx, idx, 2);
4805     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4806     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4807     __ add(oldArrCur,  oldArrNext, 4);
4808 
4809     // Load 2 words and process
4810     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4811     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4812     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4813     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4814     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4815     __ st1(newElem,   __ T2S, Address(newArrCur));
4816     __ b(ShiftTwoLoop);
4817 
4818     __ BIND(ShiftThree);
4819     __ tbz(idx, 1, ShiftOne);
4820     __ tbz(idx, 0, ShiftTwo);
4821     __ ldrw(r10,  Address(oldArr, 12));
4822     __ ldrw(r11,  Address(oldArr, 8));
4823     __ lsrvw(r10, r10, shiftCount);
4824     __ lslvw(r11, r11, shiftRevCount);
4825     __ orrw(r12,  r10, r11);
4826     __ strw(r12,  Address(newArr, 8));
4827 
4828     __ BIND(ShiftTwo);
4829     __ ldrw(r10,  Address(oldArr, 8));
4830     __ ldrw(r11,  Address(oldArr, 4));
4831     __ lsrvw(r10, r10, shiftCount);
4832     __ lslvw(r11, r11, shiftRevCount);
4833     __ orrw(r12,  r10, r11);
4834     __ strw(r12,  Address(newArr, 4));
4835 
4836     __ BIND(ShiftOne);
4837     __ ldrw(r10,  Address(oldArr, 4));
4838     __ ldrw(r11,  Address(oldArr));
4839     __ lsrvw(r10, r10, shiftCount);
4840     __ lslvw(r11, r11, shiftRevCount);
4841     __ orrw(r12,  r10, r11);
4842     __ strw(r12,  Address(newArr));
4843 
4844     __ BIND(Exit);
4845     __ ret(lr);
4846 
4847     return start;
4848   }
4849 
4850   // Arguments:
4851   //
4852   // Input:
4853   //   c_rarg0   - newArr address
4854   //   c_rarg1   - oldArr address
4855   //   c_rarg2   - newIdx
4856   //   c_rarg3   - shiftCount
4857   //   c_rarg4   - numIter
4858   //
4859   address generate_bigIntegerLeftShift() {
4860     __ align(CodeEntryAlignment);
4861     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4862     address start = __ pc();
4863 
4864     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4865 
4866     Register newArr        = c_rarg0;
4867     Register oldArr        = c_rarg1;
4868     Register newIdx        = c_rarg2;
4869     Register shiftCount    = c_rarg3;
4870     Register numIter       = c_rarg4;
4871 
4872     Register shiftRevCount = rscratch1;
4873     Register oldArrNext    = rscratch2;
4874 
4875     FloatRegister oldElem0        = v0;
4876     FloatRegister oldElem1        = v1;
4877     FloatRegister newElem         = v2;
4878     FloatRegister shiftVCount     = v3;
4879     FloatRegister shiftVRevCount  = v4;
4880 
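         // Mirror image of the right-shift stub: working upwards from the lowest index,
         //   newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))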
4881     __ cbz(numIter, Exit);
4882 
4883     __ add(oldArrNext, oldArr, 4);
4884     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4885 
4886     // right shift count
4887     __ movw(shiftRevCount, 32);
4888     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4889 
4890     // numIter is too small for a 4-word SIMD loop; fall back to the scalar path
4891     __ cmp(numIter, (u1)4);
4892     __ br(Assembler::LT, ShiftThree);
4893 
4894     __ dup(shiftVCount,     __ T4S, shiftCount);
4895     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4896     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4897 
4898     __ BIND(ShiftSIMDLoop);
4899 
4900     // load 4 words and process
4901     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4902     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4903     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4904     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4905     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4906     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4907     __ sub(numIter,   numIter, 4);
4908 
4909     __ cmp(numIter, (u1)4);
4910     __ br(Assembler::LT, ShiftTwoLoop);
4911     __ b(ShiftSIMDLoop);
4912 
4913     __ BIND(ShiftTwoLoop);
4914     __ cbz(numIter, Exit);
4915     __ cmp(numIter, (u1)1);
4916     __ br(Assembler::EQ, ShiftOne);
4917 
4918     // load 2 words and process
4919     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4920     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4921     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4922     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4923     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4924     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4925     __ sub(numIter,   numIter, 2);
4926     __ b(ShiftTwoLoop);
4927 
4928     __ BIND(ShiftThree);
4929     __ ldrw(r10,  __ post(oldArr, 4));
4930     __ ldrw(r11,  __ post(oldArrNext, 4));
4931     __ lslvw(r10, r10, shiftCount);
4932     __ lsrvw(r11, r11, shiftRevCount);
4933     __ orrw(r12,  r10, r11);
4934     __ strw(r12,  __ post(newArr, 4));
4935     __ tbz(numIter, 1, Exit);
4936     __ tbz(numIter, 0, ShiftOne);
4937 
4938     __ BIND(ShiftTwo);
4939     __ ldrw(r10,  __ post(oldArr, 4));
4940     __ ldrw(r11,  __ post(oldArrNext, 4));
4941     __ lslvw(r10, r10, shiftCount);
4942     __ lsrvw(r11, r11, shiftRevCount);
4943     __ orrw(r12,  r10, r11);
4944     __ strw(r12,  __ post(newArr, 4));
4945 
4946     __ BIND(ShiftOne);
4947     __ ldrw(r10,  Address(oldArr));
4948     __ ldrw(r11,  Address(oldArrNext));
4949     __ lslvw(r10, r10, shiftCount);
4950     __ lsrvw(r11, r11, shiftRevCount);
4951     __ orrw(r12,  r10, r11);
4952     __ strw(r12,  Address(newArr));
4953 
4954     __ BIND(Exit);
4955     __ ret(lr);
4956 
4957     return start;
4958   }
4959 
4960   address generate_count_positives(address &count_positives_long) {
4961     const u1 large_loop_size = 64;
4962     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
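         // Every byte of UPPER_BIT_MASK has only its sign bit set, so testing a 64-bit
         // word against it is non-zero iff at least one of its 8 bytes is negative.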
4963     int dcache_line = VM_Version::dcache_line_size();
4964 
4965     Register ary1 = r1, len = r2, result = r0;
4966 
4967     __ align(CodeEntryAlignment);
4968 
4969     StubCodeMark mark(this, "StubRoutines", "count_positives");
4970 
4971     address entry = __ pc();
4972 
4973     __ enter();
4974     // precondition: a copy of len is already in result
4975     // __ mov(result, len);
4976 
4977   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
4978         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4979 
4980   __ cmp(len, (u1)15);
4981   __ br(Assembler::GT, LEN_OVER_15);
4982   // The only case in which execution falls into this code is when the pointer is
4983   // near the end of a memory page and we have to avoid reading from the next page
4984   __ add(ary1, ary1, len);
4985   __ subs(len, len, 8);
4986   __ br(Assembler::GT, LEN_OVER_8);
4987   __ ldr(rscratch2, Address(ary1, -8));
4988   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4989   __ lsrv(rscratch2, rscratch2, rscratch1);
4990   __ tst(rscratch2, UPPER_BIT_MASK);
4991   __ csel(result, zr, result, Assembler::NE);
4992   __ leave();
4993   __ ret(lr);
4994   __ bind(LEN_OVER_8);
4995   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4996   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
4997   __ tst(rscratch2, UPPER_BIT_MASK);
4998   __ br(Assembler::NE, RET_NO_POP);
4999   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
5000   __ lsrv(rscratch1, rscratch1, rscratch2);
5001   __ tst(rscratch1, UPPER_BIT_MASK);
5002   __ bind(RET_NO_POP);
5003   __ csel(result, zr, result, Assembler::NE);
5004   __ leave();
5005   __ ret(lr);
5006 
5007   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
5008   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
5009 
5010   count_positives_long = __ pc(); // 2nd entry point
5011 
5012   __ enter();
5013 
5014   __ bind(LEN_OVER_15);
5015     __ push(spilled_regs, sp);
5016     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
5017     __ cbz(rscratch2, ALIGNED);
5018     __ ldp(tmp6, tmp1, Address(ary1));
5019     __ mov(tmp5, 16);
5020     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
5021     __ add(ary1, ary1, rscratch1);
5022     __ orr(tmp6, tmp6, tmp1);
5023     __ tst(tmp6, UPPER_BIT_MASK);
5024     __ br(Assembler::NE, RET_ADJUST);
5025     __ sub(len, len, rscratch1);
5026 
5027   __ bind(ALIGNED);
5028     __ cmp(len, large_loop_size);
5029     __ br(Assembler::LT, CHECK_16);
5030     // Perform a 16-byte load as an early exit in the pre-loop to handle the case
5031     // where an initially aligned large array has negative values in its starting
5032     // bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 (in the worst
5033     // case), which is slower. Cases with negative bytes further ahead are not
5034     // affected much; in fact they get faster due to the early loads and the fewer
5035     // instructions and branches in LARGE_LOOP.
5036     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5037     __ sub(len, len, 16);
5038     __ orr(tmp6, tmp6, tmp1);
5039     __ tst(tmp6, UPPER_BIT_MASK);
5040     __ br(Assembler::NE, RET_ADJUST_16);
5041     __ cmp(len, large_loop_size);
5042     __ br(Assembler::LT, CHECK_16);
5043 
5044     if (SoftwarePrefetchHintDistance >= 0
5045         && SoftwarePrefetchHintDistance >= dcache_line) {
5046       // initial prefetch
5047       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5048     }
5049   __ bind(LARGE_LOOP);
5050     if (SoftwarePrefetchHintDistance >= 0) {
5051       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5052     }
5053     // Issue the load instructions first, since this can save a few CPU/MEM cycles.
5054     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp), it
5055     // is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
5056     // instructions per iteration and has fewer branches; however, this approach
5057     // disables the early return, so all 64 bytes are loaded and checked every time.
5058     __ ldp(tmp2, tmp3, Address(ary1));
5059     __ ldp(tmp4, tmp5, Address(ary1, 16));
5060     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5061     __ ldp(tmp6, tmp1, Address(ary1, 48));
5062     __ add(ary1, ary1, large_loop_size);
5063     __ sub(len, len, large_loop_size);
5064     __ orr(tmp2, tmp2, tmp3);
5065     __ orr(tmp4, tmp4, tmp5);
5066     __ orr(rscratch1, rscratch1, rscratch2);
5067     __ orr(tmp6, tmp6, tmp1);
5068     __ orr(tmp2, tmp2, tmp4);
5069     __ orr(rscratch1, rscratch1, tmp6);
5070     __ orr(tmp2, tmp2, rscratch1);
5071     __ tst(tmp2, UPPER_BIT_MASK);
5072     __ br(Assembler::NE, RET_ADJUST_LONG);
5073     __ cmp(len, large_loop_size);
5074     __ br(Assembler::GE, LARGE_LOOP);
5075 
5076   __ bind(CHECK_16); // small 16-byte load pre-loop
5077     __ cmp(len, (u1)16);
5078     __ br(Assembler::LT, POST_LOOP16);
5079 
5080   __ bind(LOOP16); // small 16-byte load loop
5081     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5082     __ sub(len, len, 16);
5083     __ orr(tmp2, tmp2, tmp3);
5084     __ tst(tmp2, UPPER_BIT_MASK);
5085     __ br(Assembler::NE, RET_ADJUST_16);
5086     __ cmp(len, (u1)16);
5087     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5088 
5089   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5090     __ cmp(len, (u1)8);
5091     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5092     __ ldr(tmp3, Address(__ post(ary1, 8)));
5093     __ tst(tmp3, UPPER_BIT_MASK);
5094     __ br(Assembler::NE, RET_ADJUST);
5095     __ sub(len, len, 8);
5096 
5097   __ bind(POST_LOOP16_LOAD_TAIL);
5098     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5099     __ ldr(tmp1, Address(ary1));
5100     __ mov(tmp2, 64);
5101     __ sub(tmp4, tmp2, len, __ LSL, 3);
5102     __ lslv(tmp1, tmp1, tmp4);
5103     __ tst(tmp1, UPPER_BIT_MASK);
5104     __ br(Assembler::NE, RET_ADJUST);
5105     // Fallthrough
5106 
5107   __ bind(RET_LEN);
5108     __ pop(spilled_regs, sp);
5109     __ leave();
5110     __ ret(lr);
5111 
5112     // The difference (result - len) is the count of bytes guaranteed
5113     // to be positive
5114 
5115   __ bind(RET_ADJUST_LONG);
5116     __ add(len, len, (u1)(large_loop_size - 16));
5117   __ bind(RET_ADJUST_16);
5118     __ add(len, len, 16);
5119   __ bind(RET_ADJUST);
5120     __ pop(spilled_regs, sp);
5121     __ leave();
5122     __ sub(result, result, len);
5123     __ ret(lr);
5124 
5125     return entry;
5126   }
5127 
5128   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5129         bool usePrefetch, Label &NOT_EQUAL) {
5130     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5131         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5132         tmp7 = r12, tmp8 = r13;
5133     Label LOOP;
5134 
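         // The loop is software-pipelined: the loads for the next chunk are issued
         // before the XOR/compare of the previously loaded chunk, hiding load latency.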
5135     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5136     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5137     __ bind(LOOP);
5138     if (usePrefetch) {
5139       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5140       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5141     }
5142     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5143     __ eor(tmp1, tmp1, tmp2);
5144     __ eor(tmp3, tmp3, tmp4);
5145     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5146     __ orr(tmp1, tmp1, tmp3);
5147     __ cbnz(tmp1, NOT_EQUAL);
5148     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5149     __ eor(tmp5, tmp5, tmp6);
5150     __ eor(tmp7, tmp7, tmp8);
5151     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5152     __ orr(tmp5, tmp5, tmp7);
5153     __ cbnz(tmp5, NOT_EQUAL);
5154     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5155     __ eor(tmp1, tmp1, tmp2);
5156     __ eor(tmp3, tmp3, tmp4);
5157     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5158     __ orr(tmp1, tmp1, tmp3);
5159     __ cbnz(tmp1, NOT_EQUAL);
5160     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5161     __ eor(tmp5, tmp5, tmp6);
5162     __ sub(cnt1, cnt1, 8 * wordSize);
5163     __ eor(tmp7, tmp7, tmp8);
5164     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5165     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5166     // cmp) because subs allows an unlimited range of immediate operand.
5167     __ subs(tmp6, cnt1, loopThreshold);
5168     __ orr(tmp5, tmp5, tmp7);
5169     __ cbnz(tmp5, NOT_EQUAL);
5170     __ br(__ GE, LOOP);
5171     // post-loop
5172     __ eor(tmp1, tmp1, tmp2);
5173     __ eor(tmp3, tmp3, tmp4);
5174     __ orr(tmp1, tmp1, tmp3);
5175     __ sub(cnt1, cnt1, 2 * wordSize);
5176     __ cbnz(tmp1, NOT_EQUAL);
5177   }
5178 
5179   void generate_large_array_equals_loop_simd(int loopThreshold,
5180         bool usePrefetch, Label &NOT_EQUAL) {
5181     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5182         tmp2 = rscratch2;
5183     Label LOOP;
5184 
5185     __ bind(LOOP);
5186     if (usePrefetch) {
5187       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5188       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5189     }
5190     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5191     __ sub(cnt1, cnt1, 8 * wordSize);
5192     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5193     __ subs(tmp1, cnt1, loopThreshold);
5194     __ eor(v0, __ T16B, v0, v4);
5195     __ eor(v1, __ T16B, v1, v5);
5196     __ eor(v2, __ T16B, v2, v6);
5197     __ eor(v3, __ T16B, v3, v7);
5198     __ orr(v0, __ T16B, v0, v1);
5199     __ orr(v1, __ T16B, v2, v3);
5200     __ orr(v0, __ T16B, v0, v1);
5201     __ umov(tmp1, v0, __ D, 0);
5202     __ umov(tmp2, v0, __ D, 1);
5203     __ orr(tmp1, tmp1, tmp2);
5204     __ cbnz(tmp1, NOT_EQUAL);
5205     __ br(__ GE, LOOP);
5206   }
5207 
5208   // a1 = r1 - array1 address
5209   // a2 = r2 - array2 address
5210   // result = r0 - return value. Already contains "false"
5211   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
5212   // r3-r5 are reserved temporary registers
5213   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
5214   address generate_large_array_equals() {
5215     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5216         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5217         tmp7 = r12, tmp8 = r13;
5218     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5219         SMALL_LOOP, POST_LOOP;
5220     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5221     // calculate if at least 32 prefetched bytes are used
5222     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5223     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5224     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5225     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5226         tmp5, tmp6, tmp7, tmp8);
5227 
5228     __ align(CodeEntryAlignment);
5229 
5230     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5231 
5232     address entry = __ pc();
5233     __ enter();
5234     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5235     // also advance pointers to use post-increment instead of pre-increment
5236     __ add(a1, a1, wordSize);
5237     __ add(a2, a2, wordSize);
5238     if (AvoidUnalignedAccesses) {
5239       // Both implementations (SIMD/non-SIMD) use relatively large load
5240       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
5241       // time) on some CPUs when the address is not at least 16-byte aligned.
5242       // Arrays are currently 8-byte aligned, so if needed we do one extra 8-byte
5243       // load for the first address to make it 16-byte aligned.
5244       Label ALIGNED16;
5245       __ tbz(a1, 3, ALIGNED16);
5246       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5247       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5248       __ sub(cnt1, cnt1, wordSize);
5249       __ eor(tmp1, tmp1, tmp2);
5250       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5251       __ bind(ALIGNED16);
5252     }
5253     if (UseSIMDForArrayEquals) {
5254       if (SoftwarePrefetchHintDistance >= 0) {
5255         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5256         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5257         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5258             /* prfm = */ true, NOT_EQUAL);
5259         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5260         __ br(__ LT, TAIL);
5261       }
5262       __ bind(NO_PREFETCH_LARGE_LOOP);
5263       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5264           /* prfm = */ false, NOT_EQUAL);
5265     } else {
5266       __ push(spilled_regs, sp);
5267       if (SoftwarePrefetchHintDistance >= 0) {
5268         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5269         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5270         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5271             /* prfm = */ true, NOT_EQUAL);
5272         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5273         __ br(__ LT, TAIL);
5274       }
5275       __ bind(NO_PREFETCH_LARGE_LOOP);
5276       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5277           /* prfm = */ false, NOT_EQUAL);
5278     }
5279     __ bind(TAIL);
5280       __ cbz(cnt1, EQUAL);
5281       __ subs(cnt1, cnt1, wordSize);
5282       __ br(__ LE, POST_LOOP);
5283     __ bind(SMALL_LOOP);
5284       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5285       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5286       __ subs(cnt1, cnt1, wordSize);
5287       __ eor(tmp1, tmp1, tmp2);
5288       __ cbnz(tmp1, NOT_EQUAL);
5289       __ br(__ GT, SMALL_LOOP);
5290     __ bind(POST_LOOP);
5291       __ ldr(tmp1, Address(a1, cnt1));
5292       __ ldr(tmp2, Address(a2, cnt1));
5293       __ eor(tmp1, tmp1, tmp2);
5294       __ cbnz(tmp1, NOT_EQUAL);
5295     __ bind(EQUAL);
5296       __ mov(result, true);
5297     __ bind(NOT_EQUAL);
5298       if (!UseSIMDForArrayEquals) {
5299         __ pop(spilled_regs, sp);
5300       }
5301     __ bind(NOT_EQUAL_NO_POP);
5302     __ leave();
5303     __ ret(lr);
5304     return entry;
5305   }
5306 
5307   address generate_dsin_dcos(bool isCos) {
5308     __ align(CodeEntryAlignment);
5309     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5310     address start = __ pc();
5311     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5312         (address)StubRoutines::aarch64::_two_over_pi,
5313         (address)StubRoutines::aarch64::_pio2,
5314         (address)StubRoutines::aarch64::_dsin_coef,
5315         (address)StubRoutines::aarch64::_dcos_coef);
5316     return start;
5317   }
5318 
5319   address generate_dlog() {
5320     __ align(CodeEntryAlignment);
5321     StubCodeMark mark(this, "StubRoutines", "dlog");
5322     address entry = __ pc();
5323     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5324         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5325     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5326     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5327         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5328     return entry;
5329   }
5330 
5331 
5332   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
5333   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5334       Label &DIFF2) {
5335     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5336     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5337 
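         // zip1/zip2 interleave the Latin1 bytes with the zero vector vtmpZ, widening
         // them to UTF-16 code units so that 8 bytes at a time can be compared against
         // the UTF-16 string.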
5338     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5339     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5340     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5341     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5342 
5343     __ fmovd(tmpL, vtmp3);
5344     __ eor(rscratch2, tmp3, tmpL);
5345     __ cbnz(rscratch2, DIFF2);
5346 
5347     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5348     __ umov(tmpL, vtmp3, __ D, 1);
5349     __ eor(rscratch2, tmpU, tmpL);
5350     __ cbnz(rscratch2, DIFF1);
5351 
5352     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5353     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5354     __ fmovd(tmpL, vtmp);
5355     __ eor(rscratch2, tmp3, tmpL);
5356     __ cbnz(rscratch2, DIFF2);
5357 
5358     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5359     __ umov(tmpL, vtmp, __ D, 1);
5360     __ eor(rscratch2, tmpU, tmpL);
5361     __ cbnz(rscratch2, DIFF1);
5362   }
5363 
5364   // r0  = result
5365   // r1  = str1
5366   // r2  = cnt1
5367   // r3  = str2
5368   // r4  = cnt2
5369   // r10 = tmp1
5370   // r11 = tmp2
5371   address generate_compare_long_string_different_encoding(bool isLU) {
5372     __ align(CodeEntryAlignment);
5373     StubCodeMark mark(this, "StubRoutines", isLU
5374         ? "compare_long_string_different_encoding LU"
5375         : "compare_long_string_different_encoding UL");
5376     address entry = __ pc();
5377     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5378         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5379         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5380     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5381         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5382     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5383     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5384 
5385     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5386 
5387     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5388     // cnt2 == number of characters left to compare
5389     // Check the first 4 symbols, which were already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
5390     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5391     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5392     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5393     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5394     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
5395     __ eor(rscratch2, tmp1, tmp2);
5396     __ mov(rscratch1, tmp2);
5397     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5398     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5399              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5400     __ push(spilled_regs, sp);
5401     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5402     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5403 
5404     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5405 
5406     if (SoftwarePrefetchHintDistance >= 0) {
5407       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5408       __ br(__ LT, NO_PREFETCH);
5409       __ bind(LARGE_LOOP_PREFETCH);
5410         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5411         __ mov(tmp4, 2);
5412         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5413         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5414           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5415           __ subs(tmp4, tmp4, 1);
5416           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5417           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5418           __ mov(tmp4, 2);
5419         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5420           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5421           __ subs(tmp4, tmp4, 1);
5422           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5423           __ sub(cnt2, cnt2, 64);
5424           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5425           __ br(__ GE, LARGE_LOOP_PREFETCH);
5426     }
5427     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5428     __ bind(NO_PREFETCH);
5429     __ subs(cnt2, cnt2, 16);
5430     __ br(__ LT, TAIL);
5431     __ align(OptoLoopAlignment);
5432     __ bind(SMALL_LOOP); // smaller loop
5433       __ subs(cnt2, cnt2, 16);
5434       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5435       __ br(__ GE, SMALL_LOOP);
5436       __ cmn(cnt2, (u1)16);
5437       __ br(__ EQ, LOAD_LAST);
5438     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5439       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5440       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5441       __ ldr(tmp3, Address(cnt1, -8));
5442       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5443       __ b(LOAD_LAST);
5444     __ bind(DIFF2);
5445       __ mov(tmpU, tmp3);
5446     __ bind(DIFF1);
5447       __ pop(spilled_regs, sp);
5448       __ b(CALCULATE_DIFFERENCE);
5449     __ bind(LOAD_LAST);
5450       // The last 4 UTF-16 characters were already pre-loaded into tmp3 by compare_string_16_x_LU,
5451       // so there is no need to load them again
5452       __ mov(tmpU, tmp3);
5453       __ pop(spilled_regs, sp);
5454 
5455       // tmp2 points to the address of the last 4 Latin1 characters right now
5456       __ ldrs(vtmp, Address(tmp2));
5457       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5458       __ fmovd(tmpL, vtmp);
5459 
5460       __ eor(rscratch2, tmpU, tmpL);
5461       __ cbz(rscratch2, DONE);
5462 
5463     // Find the first different characters in the longwords and
5464     // compute their difference.
5465     __ bind(CALCULATE_DIFFERENCE);
5466       __ rev(rscratch2, rscratch2);
5467       __ clz(rscratch2, rscratch2);
5468       __ andr(rscratch2, rscratch2, -16);
5469       __ lsrv(tmp1, tmp1, rscratch2);
5470       __ uxthw(tmp1, tmp1);
5471       __ lsrv(rscratch1, rscratch1, rscratch2);
5472       __ uxthw(rscratch1, rscratch1);
5473       __ subw(result, tmp1, rscratch1);
5474     __ bind(DONE);
5475       __ ret(lr);
5476     return entry;
5477   }
5478 
5479   address generate_method_entry_barrier() {
5480     __ align(CodeEntryAlignment);
5481     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5482 
5483     Label deoptimize_label;
5484 
5485     address start = __ pc();
5486 
5487     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5488 
5489     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5490       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5491       // We can get here despite the nmethod being good, if we have not
5492       // yet applied our cross modification fence (or data fence).
5493       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5494       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5495       __ ldrw(rscratch2, rscratch2);
5496       __ strw(rscratch2, thread_epoch_addr);
5497       __ isb();
5498       __ membar(__ LoadLoad);
5499     }
5500 
5501     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5502 
5503     __ enter();
5504     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5505 
5506     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5507 
5508     __ push_call_clobbered_registers();
5509 
5510     __ mov(c_rarg0, rscratch2);
5511     __ call_VM_leaf
5512          (CAST_FROM_FN_PTR
5513           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5514 
5515     __ reset_last_Java_frame(true);
5516 
5517     __ mov(rscratch1, r0);
5518 
5519     __ pop_call_clobbered_registers();
5520 
5521     __ cbnz(rscratch1, deoptimize_label);
5522 
5523     __ leave();
5524     __ ret(lr);
5525 
5526     __ BIND(deoptimize_label);
5527 
5528     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5529     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5530 
5531     __ mov(sp, rscratch1);
5532     __ br(rscratch2);
5533 
5534     return start;
5535   }
5536 
5537   // r0  = result
5538   // r1  = str1
5539   // r2  = cnt1
5540   // r3  = str2
5541   // r4  = cnt2
5542   // r10 = tmp1
5543   // r11 = tmp2
5544   address generate_compare_long_string_same_encoding(bool isLL) {
5545     __ align(CodeEntryAlignment);
5546     StubCodeMark mark(this, "StubRoutines", isLL
5547         ? "compare_long_string_same_encoding LL"
5548         : "compare_long_string_same_encoding UU");
5549     address entry = __ pc();
5550     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5551         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5552 
5553     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5554 
5555     // Exit the large loop when fewer than 64 bytes are left to read or we are about
5556     // to prefetch memory past the end of the array
5557     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5558 
5559     // before jumping to stub, pre-load 8 bytes already, so do comparison directly
5560     __ eor(rscratch2, tmp1, tmp2);
5561     __ cbnz(rscratch2, CAL_DIFFERENCE);
5562 
5563     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5564     // update pointers, because of previous read
5565     __ add(str1, str1, wordSize);
5566     __ add(str2, str2, wordSize);
5567     if (SoftwarePrefetchHintDistance >= 0) {
5568       __ align(OptoLoopAlignment);
5569       __ bind(LARGE_LOOP_PREFETCH);
5570         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5571         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5572 
5573         for (int i = 0; i < 4; i++) {
5574           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5575           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5576           __ cmp(tmp1, tmp2);
5577           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5578           __ br(Assembler::NE, DIFF);
5579         }
5580         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5581         __ add(str1, str1, 64);
5582         __ add(str2, str2, 64);
5583         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5584         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5585         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5586     }
5587 
5588     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5589     __ br(Assembler::LE, LESS16);
5590     __ align(OptoLoopAlignment);
5591     __ bind(LOOP_COMPARE16);
5592       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5593       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5594       __ cmp(tmp1, tmp2);
5595       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5596       __ br(Assembler::NE, DIFF);
5597       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5598       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5599       __ br(Assembler::LT, LESS16);
5600 
5601       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5602       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5603       __ cmp(tmp1, tmp2);
5604       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5605       __ br(Assembler::NE, DIFF);
5606       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5607       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5608       __ br(Assembler::GE, LOOP_COMPARE16);
5609       __ cbz(cnt2, LENGTH_DIFF);
5610 
5611     __ bind(LESS16);
      // compare 8 bytes (4 chars for UU) at a time
5613       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5614       __ br(Assembler::LE, LESS8);
5615       __ ldr(tmp1, Address(__ post(str1, 8)));
5616       __ ldr(tmp2, Address(__ post(str2, 8)));
5617       __ eor(rscratch2, tmp1, tmp2);
5618       __ cbnz(rscratch2, CAL_DIFFERENCE);
5619       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5620 
5621     __ bind(LESS8); // directly load last 8 bytes
5622       if (!isLL) {
5623         __ add(cnt2, cnt2, cnt2);
5624       }
5625       __ ldr(tmp1, Address(str1, cnt2));
5626       __ ldr(tmp2, Address(str2, cnt2));
5627       __ eor(rscratch2, tmp1, tmp2);
5628       __ cbz(rscratch2, LENGTH_DIFF);
5629       __ b(CAL_DIFFERENCE);
5630 
5631     __ bind(DIFF);
5632       __ cmp(tmp1, tmp2);
5633       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5634       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5635       // reuse rscratch2 register for the result of eor instruction
5636       __ eor(rscratch2, tmp1, tmp2);
5637 
5638     __ bind(CAL_DIFFERENCE);
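      // rscratch2 = tmp1 ^ tmp2 is known to be non-zero here. The sequence
      // below locates the first differing character (the lowest-addressed,
      // i.e. least significant, byte or halfword on little-endian) and
      // returns the difference of the two characters. Roughly, for the LL
      // case (illustrative C sketch only, not part of the generated stub):
      //
      //   uint64_t diff  = a ^ b;                                     // non-zero
      //   int      shift = __builtin_clzll(__builtin_bswap64(diff)) & -8;
      //   return (int)((a >> shift) & 0xff) - (int)((b >> shift) & 0xff);
      //
      // For UU the mask is -16 and a halfword is extracted instead.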
5639       __ rev(rscratch2, rscratch2);
5640       __ clz(rscratch2, rscratch2);
5641       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5642       __ lsrv(tmp1, tmp1, rscratch2);
5643       __ lsrv(tmp2, tmp2, rscratch2);
5644       if (isLL) {
5645         __ uxtbw(tmp1, tmp1);
5646         __ uxtbw(tmp2, tmp2);
5647       } else {
5648         __ uxthw(tmp1, tmp1);
5649         __ uxthw(tmp2, tmp2);
5650       }
5651       __ subw(result, tmp1, tmp2);
5652 
5653     __ bind(LENGTH_DIFF);
5654       __ ret(lr);
5655     return entry;
5656   }
5657 
5658   enum string_compare_mode {
5659     LL,
5660     LU,
5661     UL,
5662     UU,
5663   };
5664 
5665   // The following registers are declared in aarch64.ad
5666   // r0  = result
5667   // r1  = str1
5668   // r2  = cnt1
5669   // r3  = str2
5670   // r4  = cnt2
5671   // r10 = tmp1
5672   // r11 = tmp2
5673   // z0  = ztmp1
5674   // z1  = ztmp2
5675   // p0  = pgtmp1
5676   // p1  = pgtmp2
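  //
  // Rough structure of the predicated SVE loop generated below (illustrative
  // sketch only; vec_len is the number of byte or halfword lanes per vector):
  //
  //   idx  = 0;
  //   pred = whilelt(idx, cnt);     // active lanes: [idx, min(idx + vec_len, cnt))
  //   do {
  //     load str1[idx..] and str2[idx..] under pred (widening bytes for LU/UL);
  //     if (any active lane differs) goto MISMATCH;
  //     idx += vec_len;
  //   } while (idx < cnt - vec_len);
  //   pred = whilelt(idx, cnt);     // last, possibly partial, vector
  //   compare once more; if all active lanes are equal, fall through to DONE
  //   and return with result unchanged (it is pre-set by the caller); on a
  //   mismatch, extract the first differing lane of each string and return
  //   their difference.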
5677   address generate_compare_long_string_sve(string_compare_mode mode) {
5678     __ align(CodeEntryAlignment);
5679     address entry = __ pc();
5680     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5681              tmp1 = r10, tmp2 = r11;
5682 
5683     Label LOOP, DONE, MISMATCH;
5684     Register vec_len = tmp1;
5685     Register idx = tmp2;
5686     // The minimum of the string lengths has been stored in cnt2.
5687     Register cnt = cnt2;
5688     FloatRegister ztmp1 = z0, ztmp2 = z1;
5689     PRegister pgtmp1 = p0, pgtmp2 = p1;
5690 
5691 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5692     switch (mode) {                                                            \
5693       case LL:                                                                 \
5694         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5695         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5696         break;                                                                 \
5697       case LU:                                                                 \
5698         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5699         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5700         break;                                                                 \
5701       case UL:                                                                 \
5702         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5703         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5704         break;                                                                 \
5705       case UU:                                                                 \
5706         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5707         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5708         break;                                                                 \
5709       default:                                                                 \
5710         ShouldNotReachHere();                                                  \
5711     }
5712 
5713     const char* stubname;
5714     switch (mode) {
5715       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5716       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5717       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5718       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5719       default: ShouldNotReachHere();
5720     }
5721 
5722     StubCodeMark mark(this, "StubRoutines", stubname);
5723 
5724     __ mov(idx, 0);
5725     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5726 
5727     if (mode == LL) {
5728       __ sve_cntb(vec_len);
5729     } else {
5730       __ sve_cnth(vec_len);
5731     }
5732 
5733     __ sub(rscratch1, cnt, vec_len);
5734 
5735     __ bind(LOOP);
5736 
5737       // main loop
5738       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5739       __ add(idx, idx, vec_len);
5740       // Compare strings.
5741       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5742       __ br(__ NE, MISMATCH);
5743       __ cmp(idx, rscratch1);
5744       __ br(__ LT, LOOP);
5745 
5746     // post loop, last iteration
5747     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5748 
5749     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5750     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5751     __ br(__ EQ, DONE);
5752 
5753     __ bind(MISMATCH);
5754 
    // Narrow the predicate to the lanes preceding the first mismatch.
5756     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5757     // Extract the first different characters of each string.
5758     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5759     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5760 
5761     // Compute the difference of the first different characters.
5762     __ sub(result, rscratch1, rscratch2);
5763 
5764     __ bind(DONE);
5765     __ ret(lr);
5766 #undef LOAD_PAIR
5767     return entry;
5768   }
5769 
5770   void generate_compare_long_strings() {
5771     if (UseSVE == 0) {
5772       StubRoutines::aarch64::_compare_long_string_LL
5773           = generate_compare_long_string_same_encoding(true);
5774       StubRoutines::aarch64::_compare_long_string_UU
5775           = generate_compare_long_string_same_encoding(false);
5776       StubRoutines::aarch64::_compare_long_string_LU
5777           = generate_compare_long_string_different_encoding(true);
5778       StubRoutines::aarch64::_compare_long_string_UL
5779           = generate_compare_long_string_different_encoding(false);
5780     } else {
5781       StubRoutines::aarch64::_compare_long_string_LL
5782           = generate_compare_long_string_sve(LL);
5783       StubRoutines::aarch64::_compare_long_string_UU
5784           = generate_compare_long_string_sve(UU);
5785       StubRoutines::aarch64::_compare_long_string_LU
5786           = generate_compare_long_string_sve(LU);
5787       StubRoutines::aarch64::_compare_long_string_UL
5788           = generate_compare_long_string_sve(UL);
5789     }
5790   }
5791 
5792   // R0 = result
5793   // R1 = str2
5794   // R2 = cnt1
5795   // R3 = str1
5796   // R4 = cnt2
5797   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
5798   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) since the pattern length is >= 8, we can safely keep at least the first
  // register of the pattern resident and skip its reload (this helps on systems
  // with a single load pipeline)
  // 2) we use a "fast" SWAR algorithm to find the first pattern symbol, taking
  // one branch per loaded register instead of one branch per symbol; this is
  // where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f and
  // 0x7fff7fff...7fff come from (see the sketch below)
  // 3) after the first register of the source string has been loaded and
  // analyzed, it can be reused to locate every occurrence of the first pattern
  // character, saving a few loads compared with a simpler-but-slower
  // implementation
  // 4) to avoid lots of push/pop operations, the code below heavily reuses,
  // re-initializes and compresses register values, which makes it larger and a
  // bit less readable; however, most of the extra operations are issued while
  // loads or branches are in flight, so the penalty is minimal
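  //
  // The SWAR "zero detection" used in 2) is, roughly (illustrative sketch
  // only, LL case; for UU the constants are 0x0001000100010001 and
  // 0x7fff7fff7fff7fff):
  //
  //   uint64_t first = first_pattern_byte * 0x0101010101010101ULL;  // broadcast
  //   uint64_t x     = chunk ^ first;            // zero byte where a byte matches
  //   uint64_t hits  = (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
  //
  // hits is non-zero iff some byte of chunk equals the first pattern byte; the
  // top bit of each matching byte is set (spurious hits are possible just above
  // a real match and are rejected by the verification loops below). rbit/clz
  // then turn the lowest hit into the index of the first candidate match.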
5813   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5814     const char* stubName = str1_isL
5815         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5816         : "indexof_linear_uu";
5817     __ align(CodeEntryAlignment);
5818     StubCodeMark mark(this, "StubRoutines", stubName);
5819     address entry = __ pc();
5820 
5821     int str1_chr_size = str1_isL ? 1 : 2;
5822     int str2_chr_size = str2_isL ? 1 : 2;
5823     int str1_chr_shift = str1_isL ? 0 : 1;
5824     int str2_chr_shift = str2_isL ? 0 : 1;
5825     bool isL = str1_isL && str2_isL;
    // parameters
5827     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5828     // temporary registers
5829     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5830     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5831     // redefinitions
5832     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5833 
5834     __ push(spilled_regs, sp);
5835     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5836         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5837         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5838         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5839         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5840         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5841     // Read whole register from str1. It is safe, because length >=8 here
5842     __ ldr(ch1, Address(str1));
5843     // Read whole register from str2. It is safe, because length >=8 here
5844     __ ldr(ch2, Address(str2));
5845     __ sub(cnt2, cnt2, cnt1);
5846     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5847     if (str1_isL != str2_isL) {
5848       __ eor(v0, __ T16B, v0, v0);
5849     }
5850     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5851     __ mul(first, first, tmp1);
    // check whether fewer than one register's worth of characters remains
5853     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5854     if (str1_isL != str2_isL) {
5855       __ fmovd(v1, ch1);
5856     }
5857     __ br(__ LE, L_SMALL);
5858     __ eor(ch2, first, ch2);
5859     if (str1_isL != str2_isL) {
5860       __ zip1(v1, __ T16B, v1, v0);
5861     }
5862     __ sub(tmp2, ch2, tmp1);
5863     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5864     __ bics(tmp2, tmp2, ch2);
5865     if (str1_isL != str2_isL) {
5866       __ fmovd(ch1, v1);
5867     }
5868     __ br(__ NE, L_HAS_ZERO);
5869     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5870     __ add(result, result, wordSize/str2_chr_size);
5871     __ add(str2, str2, wordSize);
5872     __ br(__ LT, L_POST_LOOP);
5873     __ BIND(L_LOOP);
5874       __ ldr(ch2, Address(str2));
5875       __ eor(ch2, first, ch2);
5876       __ sub(tmp2, ch2, tmp1);
5877       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5878       __ bics(tmp2, tmp2, ch2);
5879       __ br(__ NE, L_HAS_ZERO);
5880     __ BIND(L_LOOP_PROCEED);
5881       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5882       __ add(str2, str2, wordSize);
5883       __ add(result, result, wordSize/str2_chr_size);
5884       __ br(__ GE, L_LOOP);
5885     __ BIND(L_POST_LOOP);
5886       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5887       __ br(__ LE, NOMATCH);
5888       __ ldr(ch2, Address(str2));
5889       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5890       __ eor(ch2, first, ch2);
5891       __ sub(tmp2, ch2, tmp1);
5892       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5893       __ mov(tmp4, -1); // all bits set
5894       __ b(L_SMALL_PROCEED);
5895     __ align(OptoLoopAlignment);
5896     __ BIND(L_SMALL);
5897       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5898       __ eor(ch2, first, ch2);
5899       if (str1_isL != str2_isL) {
5900         __ zip1(v1, __ T16B, v1, v0);
5901       }
5902       __ sub(tmp2, ch2, tmp1);
5903       __ mov(tmp4, -1); // all bits set
5904       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5905       if (str1_isL != str2_isL) {
5906         __ fmovd(ch1, v1); // move converted 4 symbols
5907       }
5908     __ BIND(L_SMALL_PROCEED);
5909       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5910       __ bic(tmp2, tmp2, ch2);
5911       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5912       __ rbit(tmp2, tmp2);
5913       __ br(__ EQ, NOMATCH);
5914     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5916       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5917       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5918       if (str2_isL) { // LL
5919         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5920         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5921         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5922         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5923         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5924       } else {
5925         __ mov(ch2, 0xE); // all bits in byte set except last one
5926         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5927         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5928         __ lslv(tmp2, tmp2, tmp4);
5929         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5930         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5931         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5932         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5933       }
5934       __ cmp(ch1, ch2);
5935       __ mov(tmp4, wordSize/str2_chr_size);
5936       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5937     __ BIND(L_SMALL_CMP_LOOP);
5938       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5939                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5940       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5941                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5942       __ add(tmp4, tmp4, 1);
5943       __ cmp(tmp4, cnt1);
5944       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5945       __ cmp(first, ch2);
5946       __ br(__ EQ, L_SMALL_CMP_LOOP);
5947     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5948       __ cbz(tmp2, NOMATCH); // no more matches. exit
5949       __ clz(tmp4, tmp2);
5950       __ add(result, result, 1); // advance index
5951       __ add(str2, str2, str2_chr_size); // advance pointer
5952       __ b(L_SMALL_HAS_ZERO_LOOP);
5953     __ align(OptoLoopAlignment);
5954     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5955       __ cmp(first, ch2);
5956       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5957       __ b(DONE);
5958     __ align(OptoLoopAlignment);
5959     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5960       if (str2_isL) { // LL
5961         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5962         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5963         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5964         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5965         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5966       } else {
5967         __ mov(ch2, 0xE); // all bits in byte set except last one
5968         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5969         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5970         __ lslv(tmp2, tmp2, tmp4);
5971         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5972         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5973         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5974         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5975       }
5976       __ cmp(ch1, ch2);
5977       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5978       __ b(DONE);
5979     __ align(OptoLoopAlignment);
5980     __ BIND(L_HAS_ZERO);
5981       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now compress the two counters (cnt1 and cnt2) into one register. This is
      // fine because both counters are 32-bit and are not modified in this loop;
      // they are restored on exit, so cnt1 can be reused inside the loop.
5986       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
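      // The orr above packs the counters as cnt2 = ((uint64_t)cnt1 << 32) | cnt2;
      // L_HAS_ZERO_LOOP_NOMATCH unpacks them again with cnt1 = cnt2 >> 32 and
      // cnt2 = (uint32_t)cnt2.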
5987       __ sub(result, result, 1);
5988     __ BIND(L_HAS_ZERO_LOOP);
5989       __ mov(cnt1, wordSize/str2_chr_size);
5990       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5991       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5992       if (str2_isL) {
5993         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5994         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5995         __ lslv(tmp2, tmp2, tmp4);
5996         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5997         __ add(tmp4, tmp4, 1);
5998         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5999         __ lsl(tmp2, tmp2, 1);
6000         __ mov(tmp4, wordSize/str2_chr_size);
6001       } else {
6002         __ mov(ch2, 0xE);
6003         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6004         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6005         __ lslv(tmp2, tmp2, tmp4);
6006         __ add(tmp4, tmp4, 1);
6007         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6008         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6009         __ lsl(tmp2, tmp2, 1);
6010         __ mov(tmp4, wordSize/str2_chr_size);
6011         __ sub(str2, str2, str2_chr_size);
6012       }
6013       __ cmp(ch1, ch2);
6014       __ mov(tmp4, wordSize/str2_chr_size);
6015       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6016     __ BIND(L_CMP_LOOP);
6017       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6018                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6019       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6020                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6021       __ add(tmp4, tmp4, 1);
6022       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6023       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
6024       __ cmp(cnt1, ch2);
6025       __ br(__ EQ, L_CMP_LOOP);
6026     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this position
6028       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6029       __ clz(tmp4, tmp2);
6030       __ add(str2, str2, str2_chr_size); // advance pointer
6031       __ b(L_HAS_ZERO_LOOP);
6032     __ align(OptoLoopAlignment);
6033     __ BIND(L_CMP_LOOP_LAST_CMP);
6034       __ cmp(cnt1, ch2);
6035       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6036       __ b(DONE);
6037     __ align(OptoLoopAlignment);
6038     __ BIND(L_CMP_LOOP_LAST_CMP2);
6039       if (str2_isL) {
6040         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6041         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6042         __ lslv(tmp2, tmp2, tmp4);
6043         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6044         __ add(tmp4, tmp4, 1);
6045         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6046         __ lsl(tmp2, tmp2, 1);
6047       } else {
6048         __ mov(ch2, 0xE);
6049         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6050         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6051         __ lslv(tmp2, tmp2, tmp4);
6052         __ add(tmp4, tmp4, 1);
6053         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6054         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6055         __ lsl(tmp2, tmp2, 1);
6056         __ sub(str2, str2, str2_chr_size);
6057       }
6058       __ cmp(ch1, ch2);
6059       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6060       __ b(DONE);
6061     __ align(OptoLoopAlignment);
6062     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. It was a multiple of wordSize/str2_chr_size
      // until the L_HAS_ZERO block, and L_HAS_ZERO_LOOP analyzed one byte octet,
      // so result was increased by at most wordSize/str2_chr_size - 1 and the
      // corresponding higher bits are unchanged. L_LOOP_PROCEED will increase
      // result by the number of analyzed characters, so we can simply clear the
      // lower bits of result here: 2 bits for UU/UL, 3 bits for LL.
      // 2) Restore cnt1 and cnt2 from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
      // (UU/UL) is the index of the last analyzed position inside the current
      // octet, so move str2 back to the octet's start address; L_LOOP_PROCEED
      // then advances it to the next octet.
6073       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6074       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6075       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6076       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
6077       __ movw(cnt2, cnt2);
6078       __ b(L_LOOP_PROCEED);
6079     __ align(OptoLoopAlignment);
6080     __ BIND(NOMATCH);
6081       __ mov(result, -1);
6082     __ BIND(DONE);
6083       __ pop(spilled_regs, sp);
6084       __ ret(lr);
6085     return entry;
6086   }
6087 
6088   void generate_string_indexof_stubs() {
6089     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6090     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6091     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6092   }
6093 
6094   void inflate_and_store_2_fp_registers(bool generatePrfm,
6095       FloatRegister src1, FloatRegister src2) {
6096     Register dst = r1;
6097     __ zip1(v1, __ T16B, src1, v0);
6098     __ zip2(v2, __ T16B, src1, v0);
6099     if (generatePrfm) {
6100       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6101     }
6102     __ zip1(v3, __ T16B, src2, v0);
6103     __ zip2(v4, __ T16B, src2, v0);
6104     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6105   }
6106 
6107   // R0 = src
6108   // R1 = dst
6109   // R2 = len
6110   // R3 = len >> 3
6111   // V0 = 0
6112   // v1 = loaded 8 bytes
6113   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6114   address generate_large_byte_array_inflate() {
6115     __ align(CodeEntryAlignment);
6116     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
6117     address entry = __ pc();
6118     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6119     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6120     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6121 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
6124     __ ldrd(v2, __ post(src, 8));
6125     __ sub(octetCounter, octetCounter, 2);
6126     __ zip1(v1, __ T16B, v1, v0);
6127     __ zip1(v2, __ T16B, v2, v0);
6128     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6129     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6130     __ subs(rscratch1, octetCounter, large_loop_threshold);
6131     __ br(__ LE, LOOP_START);
6132     __ b(LOOP_PRFM_START);
6133     __ bind(LOOP_PRFM);
6134       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6135     __ bind(LOOP_PRFM_START);
6136       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6137       __ sub(octetCounter, octetCounter, 8);
6138       __ subs(rscratch1, octetCounter, large_loop_threshold);
6139       inflate_and_store_2_fp_registers(true, v3, v4);
6140       inflate_and_store_2_fp_registers(true, v5, v6);
6141       __ br(__ GT, LOOP_PRFM);
6142       __ cmp(octetCounter, (u1)8);
6143       __ br(__ LT, DONE);
6144     __ bind(LOOP);
6145       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6146       __ bind(LOOP_START);
6147       __ sub(octetCounter, octetCounter, 8);
6148       __ cmp(octetCounter, (u1)8);
6149       inflate_and_store_2_fp_registers(false, v3, v4);
6150       inflate_and_store_2_fp_registers(false, v5, v6);
6151       __ br(__ GE, LOOP);
6152     __ bind(DONE);
6153       __ ret(lr);
6154     return entry;
6155   }
6156 
6157   /**
6158    *  Arguments:
6159    *
6160    *  Input:
6161    *  c_rarg0   - current state address
6162    *  c_rarg1   - H key address
6163    *  c_rarg2   - data address
6164    *  c_rarg3   - number of blocks
6165    *
6166    *  Output:
6167    *  Updated state at c_rarg0
6168    */
6169   address generate_ghash_processBlocks() {
6170     // Bafflingly, GCM uses little-endian for the byte order, but
6171     // big-endian for the bit order.  For example, the polynomial 1 is
6172     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6173     //
6174     // So, we must either reverse the bytes in each word and do
6175     // everything big-endian or reverse the bits in each byte and do
6176     // it little-endian.  On AArch64 it's more idiomatic to reverse
6177     // the bits in each byte (we have an instruction, RBIT, to do
6178     // that) and keep the data in little-endian bit order through the
6179     // calculation, bit-reversing the inputs and outputs.
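    //
    // The per-block multiply itself is a carry-less (polynomial) product over
    // GF(2), performed by the ghash_multiply/ghash_reduce helpers. As a rough
    // scalar illustration of a 64x64->128 bit carry-less multiply (sketch
    // only, not what the stub executes):
    //
    //   void clmul64(uint64_t a, uint64_t b, uint64_t* lo, uint64_t* hi) {
    //     uint64_t l = 0, h = 0;
    //     for (int i = 0; i < 64; i++) {
    //       if ((b >> i) & 1) {
    //         l ^= a << i;
    //         h ^= (i == 0) ? 0 : (a >> (64 - i));
    //       }
    //     }
    //     *lo = l; *hi = h;
    //   }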
6180 
6181     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6182     __ align(wordSize * 2);
6183     address p = __ pc();
6184     __ emit_int64(0x87);  // The low-order bits of the field
6185                           // polynomial (i.e. p = z^7+z^2+z+1)
6186                           // repeated in the low and high parts of a
6187                           // 128-bit vector
6188     __ emit_int64(0x87);
6189 
6190     __ align(CodeEntryAlignment);
6191     address start = __ pc();
6192 
6193     Register state   = c_rarg0;
6194     Register subkeyH = c_rarg1;
6195     Register data    = c_rarg2;
6196     Register blocks  = c_rarg3;
6197 
6198     FloatRegister vzr = v30;
6199     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6200 
6201     __ ldrq(v24, p);    // The field polynomial
6202 
6203     __ ldrq(v0, Address(state));
6204     __ ldrq(v1, Address(subkeyH));
6205 
6206     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6207     __ rbit(v0, __ T16B, v0);
6208     __ rev64(v1, __ T16B, v1);
6209     __ rbit(v1, __ T16B, v1);
6210 
6211     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6212     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
6213 
6214     {
6215       Label L_ghash_loop;
6216       __ bind(L_ghash_loop);
6217 
6218       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6219                                                  // reversing each byte
6220       __ rbit(v2, __ T16B, v2);
6221       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6222 
6223       // Multiply state in v2 by subkey in v1
6224       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6225                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6226                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6227       // Reduce v7:v5 by the field polynomial
6228       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6229 
6230       __ sub(blocks, blocks, 1);
6231       __ cbnz(blocks, L_ghash_loop);
6232     }
6233 
6234     // The bit-reversed result is at this point in v0
6235     __ rev64(v0, __ T16B, v0);
6236     __ rbit(v0, __ T16B, v0);
6237 
6238     __ st1(v0, __ T16B, state);
6239     __ ret(lr);
6240 
6241     return start;
6242   }
6243 
6244   address generate_ghash_processBlocks_wide() {
6245     address small = generate_ghash_processBlocks();
6246 
6247     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6248     __ align(wordSize * 2);
6249     address p = __ pc();
6250     __ emit_int64(0x87);  // The low-order bits of the field
6251                           // polynomial (i.e. p = z^7+z^2+z+1)
6252                           // repeated in the low and high parts of a
6253                           // 128-bit vector
6254     __ emit_int64(0x87);
6255 
6256     __ align(CodeEntryAlignment);
6257     address start = __ pc();
6258 
6259     Register state   = c_rarg0;
6260     Register subkeyH = c_rarg1;
6261     Register data    = c_rarg2;
6262     Register blocks  = c_rarg3;
6263 
6264     const int unroll = 4;
6265 
6266     __ cmp(blocks, (unsigned char)(unroll * 2));
6267     __ br(__ LT, small);
6268 
6269     if (unroll > 1) {
      // Save the callee-saved SIMD registers (v8..v15) before entering the routine
6271       __ sub(sp, sp, 4 * 16);
6272       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6273       __ sub(sp, sp, 4 * 16);
6274       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6275     }
6276 
6277     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6278 
6279     if (unroll > 1) {
6280       // And restore state
6281       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6282       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6283     }
6284 
6285     __ cmp(blocks, (unsigned char)0);
6286     __ br(__ GT, small);
6287 
6288     __ ret(lr);
6289 
6290     return start;
6291   }
6292 
6293   void generate_base64_encode_simdround(Register src, Register dst,
6294         FloatRegister codec, u8 size) {
6295 
6296     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6297     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6298     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6299 
6300     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6301 
6302     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6303 
6304     __ ushr(ind0, arrangement, in0,  2);
6305 
6306     __ ushr(ind1, arrangement, in1,  2);
6307     __ shl(in0,   arrangement, in0,  6);
6308     __ orr(ind1,  arrangement, ind1, in0);
6309     __ ushr(ind1, arrangement, ind1, 2);
6310 
6311     __ ushr(ind2, arrangement, in2,  4);
6312     __ shl(in1,   arrangement, in1,  4);
6313     __ orr(ind2,  arrangement, in1,  ind2);
6314     __ ushr(ind2, arrangement, ind2, 2);
6315 
6316     __ shl(ind3,  arrangement, in2,  2);
6317     __ ushr(ind3, arrangement, ind3, 2);
6318 
6319     __ tbl(out0,  arrangement, codec,  4, ind0);
6320     __ tbl(out1,  arrangement, codec,  4, ind1);
6321     __ tbl(out2,  arrangement, codec,  4, ind2);
6322     __ tbl(out3,  arrangement, codec,  4, ind3);
6323 
6324     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6325   }
6326 
6327    /**
6328    *  Arguments:
6329    *
6330    *  Input:
6331    *  c_rarg0   - src_start
6332    *  c_rarg1   - src_offset
6333    *  c_rarg2   - src_length
6334    *  c_rarg3   - dest_start
6335    *  c_rarg4   - dest_offset
6336    *  c_rarg5   - isURL
6337    *
6338    */
6339   address generate_base64_encodeBlock() {
6340 
6341     static const char toBase64[64] = {
6342       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6343       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6344       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6345       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6346       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6347     };
6348 
6349     static const char toBase64URL[64] = {
6350       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6351       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6352       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6353       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6354       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6355     };
6356 
6357     __ align(CodeEntryAlignment);
6358     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6359     address start = __ pc();
6360 
6361     Register src   = c_rarg0;  // source array
6362     Register soff  = c_rarg1;  // source start offset
6363     Register send  = c_rarg2;  // source end offset
6364     Register dst   = c_rarg3;  // dest array
6365     Register doff  = c_rarg4;  // position for writing to dest array
6366     Register isURL = c_rarg5;  // Base64 or URL character set
6367 
6368     // c_rarg6 and c_rarg7 are free to use as temps
6369     Register codec  = c_rarg6;
6370     Register length = c_rarg7;
6371 
6372     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6373 
6374     __ add(src, src, soff);
6375     __ add(dst, dst, doff);
6376     __ sub(length, send, soff);
6377 
6378     // load the codec base address
6379     __ lea(codec, ExternalAddress((address) toBase64));
6380     __ cbz(isURL, ProcessData);
6381     __ lea(codec, ExternalAddress((address) toBase64URL));
6382 
6383     __ BIND(ProcessData);
6384 
    // input too short to set up a SIMD loop; fall back to the scalar path
6386     __ cmp(length, (u1)24);
6387     __ br(Assembler::LT, Process3B);
6388 
6389     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6390 
6391     __ BIND(Process48B);
6392     __ cmp(length, (u1)48);
6393     __ br(Assembler::LT, Process24B);
6394     generate_base64_encode_simdround(src, dst, v0, 16);
6395     __ sub(length, length, 48);
6396     __ b(Process48B);
6397 
6398     __ BIND(Process24B);
6399     __ cmp(length, (u1)24);
6400     __ br(Assembler::LT, SIMDExit);
6401     generate_base64_encode_simdround(src, dst, v0, 8);
6402     __ sub(length, length, 24);
6403 
6404     __ BIND(SIMDExit);
6405     __ cbz(length, Exit);
6406 
6407     __ BIND(Process3B);
6408     //  3 src bytes, 24 bits
6409     __ ldrb(r10, __ post(src, 1));
6410     __ ldrb(r11, __ post(src, 1));
6411     __ ldrb(r12, __ post(src, 1));
6412     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6413     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6414     // codec index
6415     __ ubfmw(r15, r12, 18, 23);
6416     __ ubfmw(r14, r12, 12, 17);
6417     __ ubfmw(r13, r12, 6,  11);
6418     __ andw(r12,  r12, 63);
6419     // get the code based on the codec
6420     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6421     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6422     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6423     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6424     __ strb(r15, __ post(dst, 1));
6425     __ strb(r14, __ post(dst, 1));
6426     __ strb(r13, __ post(dst, 1));
6427     __ strb(r12, __ post(dst, 1));
6428     __ sub(length, length, 3);
6429     __ cbnz(length, Process3B);
6430 
6431     __ BIND(Exit);
6432     __ ret(lr);
6433 
6434     return start;
6435   }
6436 
6437   void generate_base64_decode_simdround(Register src, Register dst,
6438         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6439 
6440     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6441     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6442 
6443     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6444     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6445 
6446     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6447 
6448     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6449 
6450     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6451 
    // We need an unsigned saturating subtract so that every input value in the
    // range [0, 63] yields index 0 for the higher-half lookup.
6454     __ uqsubv(decH0, __ T16B, in0, v27);
6455     __ uqsubv(decH1, __ T16B, in1, v27);
6456     __ uqsubv(decH2, __ T16B, in2, v27);
6457     __ uqsubv(decH3, __ T16B, in3, v27);
6458 
6459     // lower half lookup
6460     __ tbl(decL0, arrangement, codecL, 4, in0);
6461     __ tbl(decL1, arrangement, codecL, 4, in1);
6462     __ tbl(decL2, arrangement, codecL, 4, in2);
6463     __ tbl(decL3, arrangement, codecL, 4, in3);
6464 
6465     // higher half lookup
6466     __ tbx(decH0, arrangement, codecH, 4, decH0);
6467     __ tbx(decH1, arrangement, codecH, 4, decH1);
6468     __ tbx(decH2, arrangement, codecH, 4, decH2);
6469     __ tbx(decH3, arrangement, codecH, 4, decH3);
6470 
6471     // combine lower and higher
6472     __ orr(decL0, arrangement, decL0, decH0);
6473     __ orr(decL1, arrangement, decL1, decH1);
6474     __ orr(decL2, arrangement, decL2, decH2);
6475     __ orr(decL3, arrangement, decL3, decH3);
6476 
    // check for illegal inputs: any value larger than 63 (the 6-bit maximum)
6478     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
6479     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
6480     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
6481     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
6482     __ orr(in0, arrangement, decH0, decH1);
6483     __ orr(in1, arrangement, decH2, decH3);
6484     __ orr(in2, arrangement, in0,   in1);
6485     __ umaxv(in3, arrangement, in2);
6486     __ umov(rscratch2, in3, __ B, 0);
6487 
6488     // get the data to output
6489     __ shl(out0,  arrangement, decL0, 2);
6490     __ ushr(out1, arrangement, decL1, 4);
6491     __ orr(out0,  arrangement, out0,  out1);
6492     __ shl(out1,  arrangement, decL1, 4);
6493     __ ushr(out2, arrangement, decL2, 2);
6494     __ orr(out1,  arrangement, out1,  out2);
6495     __ shl(out2,  arrangement, decL2, 6);
6496     __ orr(out2,  arrangement, out2,  decL3);
6497 
6498     __ cbz(rscratch2, NoIllegalData);
6499 
6500     // handle illegal input
6501     __ umov(r10, in2, __ D, 0);
6502     if (size == 16) {
6503       __ cbnz(r10, ErrorInLowerHalf);
6504 
6505       // illegal input is in higher half, store the lower half now.
6506       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6507 
6508       __ umov(r10, in2,  __ D, 1);
6509       __ umov(r11, out0, __ D, 1);
6510       __ umov(r12, out1, __ D, 1);
6511       __ umov(r13, out2, __ D, 1);
6512       __ b(StoreLegalData);
6513 
6514       __ BIND(ErrorInLowerHalf);
6515     }
6516     __ umov(r11, out0, __ D, 0);
6517     __ umov(r12, out1, __ D, 0);
6518     __ umov(r13, out2, __ D, 0);
6519 
6520     __ BIND(StoreLegalData);
6521     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6522     __ strb(r11, __ post(dst, 1));
6523     __ strb(r12, __ post(dst, 1));
6524     __ strb(r13, __ post(dst, 1));
6525     __ lsr(r10, r10, 8);
6526     __ lsr(r11, r11, 8);
6527     __ lsr(r12, r12, 8);
6528     __ lsr(r13, r13, 8);
6529     __ b(StoreLegalData);
6530 
6531     __ BIND(NoIllegalData);
6532     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6533   }
6534 
6535 
6536    /**
6537    *  Arguments:
6538    *
6539    *  Input:
6540    *  c_rarg0   - src_start
6541    *  c_rarg1   - src_offset
6542    *  c_rarg2   - src_length
6543    *  c_rarg3   - dest_start
6544    *  c_rarg4   - dest_offset
6545    *  c_rarg5   - isURL
6546    *  c_rarg6   - isMIME
6547    *
6548    */
6549   address generate_base64_decodeBlock() {
6550 
6551     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6552     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6553     // titled "Base64 decoding".
6554 
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used
    // in java.util.Base64, except that the trailing character '=' is also treated
    // as an illegal value in this intrinsic: java.util.Base64.fromBase64['='] == -2,
    // while fromBase(URL)64ForNoSIMD['='] == 255 here.
6558     static const uint8_t fromBase64ForNoSIMD[256] = {
6559       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6560       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6561       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6562        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6563       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6564        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6565       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6566        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6567       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6568       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6569       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6570       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6571       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6572       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6573       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6574       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6575     };
6576 
6577     static const uint8_t fromBase64URLForNoSIMD[256] = {
6578       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6579       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6580       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6581        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6582       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6583        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6584       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6585        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6586       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6587       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6588       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6589       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6590       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6591       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6592       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6593       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6594     };
6595 
    // A legal base64 code value is in the range [0, 127]. We need two table
    // lookups, with tbl and tbx, and combine their results to get the decoded
    // data. The first vector lookup uses tbl, which sets out-of-range indices
    // to 0 in the destination. The second uses tbx, which leaves out-of-range
    // indices unchanged in the destination. Inputs in [64, 126] are mapped to
    // indices [65, 127] for the second lookup. The table value at index 64 is
    // set to 0, so for those inputs we know the decoded data already came from
    // the first lookup. (A scalar model of this scheme is sketched after the
    // tables below.)
6603     static const uint8_t fromBase64ForSIMD[128] = {
6604       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6605       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6606       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6607        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6608         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6609        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6610       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6611        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6612     };
6613 
6614     static const uint8_t fromBase64URLForSIMD[128] = {
6615       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6616       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6617       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6618        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6619         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6620        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6621        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6622        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6623     };
6624 
6625     __ align(CodeEntryAlignment);
6626     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6627     address start = __ pc();
6628 
6629     Register src    = c_rarg0;  // source array
6630     Register soff   = c_rarg1;  // source start offset
6631     Register send   = c_rarg2;  // source end offset
6632     Register dst    = c_rarg3;  // dest array
6633     Register doff   = c_rarg4;  // position for writing to dest array
6634     Register isURL  = c_rarg5;  // Base64 or URL character set
6635     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6636 
6637     Register length = send;    // reuse send as length of source data to process
6638 
6639     Register simd_codec   = c_rarg6;
6640     Register nosimd_codec = c_rarg7;
6641 
6642     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6643 
6644     __ enter();
6645 
6646     __ add(src, src, soff);
6647     __ add(dst, dst, doff);
6648 
6649     __ mov(doff, dst);
6650 
6651     __ sub(length, send, soff);
6652     __ bfm(length, zr, 0, 1);
6653 
6654     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6655     __ cbz(isURL, ProcessData);
6656     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6657 
6658     __ BIND(ProcessData);
6659     __ mov(rscratch1, length);
6660     __ cmp(length, (u1)144); // 144 = 80 + 64
6661     __ br(Assembler::LT, Process4B);
6662 
6663     // In the MIME case, the line length cannot be more than 76
6664     // bytes (see RFC 2045). This is too short a block for SIMD
6665     // to be worthwhile, so we use non-SIMD here.
6666     __ movw(rscratch1, 79);
6667 
6668     __ BIND(Process4B);
6669     __ ldrw(r14, __ post(src, 4));
6670     __ ubfxw(r10, r14, 0,  8);
6671     __ ubfxw(r11, r14, 8,  8);
6672     __ ubfxw(r12, r14, 16, 8);
6673     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
6675     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6676     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6677     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6678     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6679     // error detection, 255u indicates an illegal input
6680     __ orrw(r14, r10, r11);
6681     __ orrw(r15, r12, r13);
6682     __ orrw(r14, r14, r15);
6683     __ tbnz(r14, 7, Exit);
6684     // recover the data
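    // The lsl/bfi/bfm/rev16 sequence below packs the four 6-bit values in
    // r10..r13 into three output bytes, roughly (illustrative sketch only):
    //   b0 = (v0 << 2) | (v1 >> 4);
    //   b1 = (v1 << 4) | (v2 >> 2);
    //   b2 = (v2 << 6) |  v3;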
6685     __ lslw(r14, r10, 10);
6686     __ bfiw(r14, r11, 4, 6);
6687     __ bfmw(r14, r12, 2, 5);
6688     __ rev16w(r14, r14);
6689     __ bfiw(r13, r12, 6, 2);
6690     __ strh(r14, __ post(dst, 2));
6691     __ strb(r13, __ post(dst, 1));
6692     // non-simd loop
6693     __ subsw(rscratch1, rscratch1, 4);
6694     __ br(Assembler::GT, Process4B);
6695 
    // If we are exiting from the 80-byte pre-processing above, rscratch1 == -1;
    // otherwise, rscratch1 == 0.
6698     __ cbzw(rscratch1, Exit);
6699     __ sub(length, length, 80);
6700 
6701     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6702     __ cbz(isURL, SIMDEnter);
6703     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6704 
6705     __ BIND(SIMDEnter);
6706     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6707     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6708     __ mov(rscratch1, 63);
6709     __ dup(v27, __ T16B, rscratch1);
6710 
6711     __ BIND(Process64B);
6712     __ cmp(length, (u1)64);
6713     __ br(Assembler::LT, Process32B);
6714     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6715     __ sub(length, length, 64);
6716     __ b(Process64B);
6717 
6718     __ BIND(Process32B);
6719     __ cmp(length, (u1)32);
6720     __ br(Assembler::LT, SIMDExit);
6721     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6722     __ sub(length, length, 32);
6723     __ b(Process32B);
6724 
6725     __ BIND(SIMDExit);
6726     __ cbz(length, Exit);
6727     __ movw(rscratch1, length);
6728     __ b(Process4B);
6729 
6730     __ BIND(Exit);
6731     __ sub(c_rarg0, dst, doff);
6732 
6733     __ leave();
6734     __ ret(lr);
6735 
6736     return start;
6737   }
6738 
6739   // Support for spin waits.
6740   address generate_spin_wait() {
6741     __ align(CodeEntryAlignment);
6742     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6743     address start = __ pc();
6744 
6745     __ spin_wait();
6746     __ ret(lr);
6747 
6748     return start;
6749   }
6750 
6751 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6752 
6753   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6754   //
6755   // If LSE is in use, generate LSE versions of all the stubs. The
6756   // non-LSE versions are in atomic_aarch64.S.
6757 
6758   // class AtomicStubMark records the entry point of a stub and the
6759   // stub pointer which will point to it. The stub pointer is set to
6760   // the entry point when ~AtomicStubMark() is called, which must be
6761   // after ICache::invalidate_range. This ensures safe publication of
6762   // the generated code.
6763   class AtomicStubMark {
6764     address _entry_point;
6765     aarch64_atomic_stub_t *_stub;
6766     MacroAssembler *_masm;
6767   public:
6768     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6769       _masm = masm;
6770       __ align(32);
6771       _entry_point = __ pc();
6772       _stub = stub;
6773     }
6774     ~AtomicStubMark() {
6775       *_stub = (aarch64_atomic_stub_t)_entry_point;
6776     }
6777   };
6778 
6779   // NB: For memory_order_conservative we need a trailing membar after
6780   // LSE atomic operations but not a leading membar.
6781   //
6782   // We don't need a leading membar because a clause in the Arm ARM
6783   // says:
6784   //
6785   //   Barrier-ordered-before
6786   //
6787   //   Barrier instructions order prior Memory effects before subsequent
6788   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6792   //   instruction with both Acquire and Release semantics.
6793   //
6794   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6795   // and Release semantics, therefore we don't need a leading
6796   // barrier. However, there is no corresponding Barrier-ordered-after
6797   // relationship, therefore we need a trailing membar to prevent a
6798   // later store or load from being reordered with the store in an
6799   // atomic instruction.
6800   //
6801   // This was checked by using the herd7 consistency model simulator
6802   // (http://diy.inria.fr/) with this test case:
6803   //
6804   // AArch64 LseCas
6805   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6806   // P0 | P1;
6807   // LDR W4, [X2] | MOV W3, #0;
6808   // DMB LD       | MOV W4, #1;
6809   // LDR W3, [X1] | CASAL W3, W4, [X1];
6810   //              | DMB ISH;
6811   //              | STR W4, [X2];
6812   // exists
6813   // (0:X3=0 /\ 0:X4=1)
6814   //
6815   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6816   // with the store to x in P1. Without the DMB in P1 this may happen.
6817   //
6818   // At the time of writing we don't know of any AArch64 hardware that
6819   // reorders stores in this way, but the Reference Manual permits it.
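  //
  // The CAS stubs below return the previous memory value whether or not the
  // exchange succeeded, roughly matching (illustrative sketch only, 64-bit
  // conservative case; the generated stub also issues the trailing barrier
  // discussed above):
  //
  //   uint64_t cmpxchg(volatile uint64_t* ptr, uint64_t compare_val,
  //                    uint64_t exchange_val) {
  //     uint64_t prev = compare_val;
  //     __atomic_compare_exchange_n(ptr, &prev, exchange_val, /*weak*/false,
  //                                 __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  //     return prev;   // old value at *ptr; equals compare_val on success
  //   }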
6820 
6821   void gen_cas_entry(Assembler::operand_size size,
6822                      atomic_memory_order order) {
6823     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6824       exchange_val = c_rarg2;
6825     bool acquire, release;
6826     switch (order) {
6827       case memory_order_relaxed:
6828         acquire = false;
6829         release = false;
6830         break;
6831       case memory_order_release:
6832         acquire = false;
6833         release = true;
6834         break;
6835       default:
6836         acquire = true;
6837         release = true;
6838         break;
6839     }
6840     __ mov(prev, compare_val);
6841     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6842     if (order == memory_order_conservative) {
6843       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6844     }
6845     if (size == Assembler::xword) {
6846       __ mov(r0, prev);
6847     } else {
6848       __ movw(r0, prev);
6849     }
6850     __ ret(lr);
6851   }
6852 
6853   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6854     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6855     // If not relaxed, then default to conservative.  Relaxed is the only
6856     // case we use enough to be worth specializing.
6857     if (order == memory_order_relaxed) {
6858       __ ldadd(size, incr, prev, addr);
6859     } else {
6860       __ ldaddal(size, incr, prev, addr);
6861       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6862     }
6863     if (size == Assembler::xword) {
6864       __ mov(r0, prev);
6865     } else {
6866       __ movw(r0, prev);
6867     }
6868     __ ret(lr);
6869   }
6870 
6871   void gen_swpal_entry(Assembler::operand_size size) {
6872     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6873     __ swpal(size, incr, prev, addr);
6874     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6875     if (size == Assembler::xword) {
6876       __ mov(r0, prev);
6877     } else {
6878       __ movw(r0, prev);
6879     }
6880     __ ret(lr);
6881   }
6882 
6883   void generate_atomic_entry_points() {
6884     if (! UseLSE) {
6885       return;
6886     }
6887 
6888     __ align(CodeEntryAlignment);
6889     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6890     address first_entry = __ pc();
6891 
6892     // ADD, memory_order_conservative
6893     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6894     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6895     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6896     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6897 
6898     // ADD, memory_order_relaxed
6899     AtomicStubMark mark_fetch_add_4_relaxed
6900       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6901     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6902     AtomicStubMark mark_fetch_add_8_relaxed
6903       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6904     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6905 
6906     // XCHG, memory_order_conservative
6907     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6908     gen_swpal_entry(Assembler::word);
6909     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6910     gen_swpal_entry(Assembler::xword);
6911 
6912     // CAS, memory_order_conservative
6913     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6914     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6915     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6916     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6917     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6918     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6919 
6920     // CAS, memory_order_relaxed
6921     AtomicStubMark mark_cmpxchg_1_relaxed
6922       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6923     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6924     AtomicStubMark mark_cmpxchg_4_relaxed
6925       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6926     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6927     AtomicStubMark mark_cmpxchg_8_relaxed
6928       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6929     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6930 
6931     AtomicStubMark mark_cmpxchg_4_release
6932       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6933     gen_cas_entry(MacroAssembler::word, memory_order_release);
6934     AtomicStubMark mark_cmpxchg_8_release
6935       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6936     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6937 
6938     AtomicStubMark mark_cmpxchg_4_seq_cst
6939       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6940     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6941     AtomicStubMark mark_cmpxchg_8_seq_cst
6942       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6943     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6944 
6945     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6946   }
6947 #endif // LINUX
6948 
6949   address generate_cont_thaw(Continuation::thaw_kind kind) {
6950     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
6951     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
6952 
6953     address start = __ pc();
6954 
6955     if (return_barrier) {
6956       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6957       __ mov(sp, rscratch1);
6958     }
6959     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6960 
6961     if (return_barrier) {
6962       // preserve possible return value from a method returning to the return barrier
6963       __ fmovd(rscratch1, v0);
6964       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6965     }
6966 
6967     __ movw(c_rarg1, (return_barrier ? 1 : 0));
6968     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
6969     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
6970 
6971     if (return_barrier) {
6972       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6973       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6974       __ fmovd(v0, rscratch1);
6975     }
6976     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6977 
6978 
6979     Label thaw_success;
6980     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
6981     __ cbnz(rscratch2, thaw_success);
6982     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
6983     __ br(rscratch1);
6984     __ bind(thaw_success);
6985 
6986     // make room for the thawed frames
6987     __ sub(rscratch1, sp, rscratch2);
6988     __ andr(rscratch1, rscratch1, -16); // align
6989     __ mov(sp, rscratch1);
6990 
6991     if (return_barrier) {
6992       // save original return value -- again
6993       __ fmovd(rscratch1, v0);
6994       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6995     }
6996 
6997     // If we want, we can templatize thaw by kind, and have three different entries
6998     __ movw(c_rarg1, (uint32_t)kind);
6999 
7000     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
7001     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
7002 
7003     if (return_barrier) {
7004       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7005       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7006       __ fmovd(v0, rscratch1);
7007     } else {
7008       __ mov(r0, zr); // return 0 (success) from doYield
7009     }
7010 
7011     // we're now on the yield frame (which is at a higher address than us because the sp has been pushed down)
7012     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
7013     __ mov(rfp, sp);
7014 
7015     if (return_barrier_exception) {
7016       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
7017       __ verify_oop(r0);
7018       __ mov(r19, r0); // save return value containing the exception oop in callee-saved R19
7019 
7020       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
7021 
7022       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
7023       // __ reinitialize_ptrue();
7024 
7025       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7026 
7027       __ mov(r1, r0); // the exception handler
7028       __ mov(r0, r19); // restore return value containing the exception oop
7029       __ verify_oop(r0);
7030 
7031       __ leave();
7032       __ mov(r3, lr);
7033       __ br(r1); // the exception handler
7034     } else {
7035       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7036       __ leave();
7037       __ ret(lr);
7038     }
7039 
7040     return start;
7041   }
7042 
7043   address generate_cont_thaw() {
7044     if (!Continuations::enabled()) return nullptr;
7045 
7046     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
7047     address start = __ pc();
7048     generate_cont_thaw(Continuation::thaw_top);
7049     return start;
7050   }
7051 
7052   address generate_cont_returnBarrier() {
7053     if (!Continuations::enabled()) return nullptr;
7054 
7055     // TODO: will probably need multiple return barriers depending on return type
7056     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
7057     address start = __ pc();
7058 
7059     generate_cont_thaw(Continuation::thaw_return_barrier);
7060 
7061     return start;
7062   }
7063 
7064   address generate_cont_returnBarrier_exception() {
7065     if (!Continuations::enabled()) return nullptr;
7066 
7067     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
7068     address start = __ pc();
7069 
7070     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7071 
7072     return start;
7073   }
7074 
7075 #if INCLUDE_JFR
7076 
7077   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
7078     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7079     __ mov(c_rarg0, thread);
7080   }
7081 
7082   // The handle is dereferenced through a load barrier.
7083   static void jfr_epilogue(MacroAssembler* _masm) {
7084     __ reset_last_Java_frame(true);
7085     __ resolve_global_jobject(r0, rscratch1, rscratch2);
7086   }
7087 
7088   // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
7089   // It returns a jobject handle to the event writer.
7090   // The handle is dereferenced and the return value is the event writer oop.
7091   static RuntimeStub* generate_jfr_write_checkpoint() {
7092     enum layout {
7093       rbp_off,
7094       rbpH_off,
7095       return_off,
7096       return_off2,
7097       framesize // inclusive of return address
7098     };
7099 
7100     int insts_size = 1024;
7101     int locs_size = 64;
7102     CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
7103     OopMapSet* oop_maps = new OopMapSet();
7104     MacroAssembler* masm = new MacroAssembler(&code);
7105     MacroAssembler* _masm = masm;
7106 
7107     address start = __ pc();
7108     __ enter();
7109     int frame_complete = __ pc() - start;
7110     address the_pc = __ pc();
7111     jfr_prologue(the_pc, _masm, rthread);
7112     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
7113     jfr_epilogue(_masm);
7114     __ leave();
7115     __ ret(lr);
7116 
7117     OopMap* map = new OopMap(framesize, 1); // rfp
7118     oop_maps->add_gc_map(the_pc - start, map);
7119 
7120     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
7121       RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
7122                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7123                                     oop_maps, false);
7124     return stub;
7125   }
7126 
7127 #endif // INCLUDE_JFR
7128 
7129   // Continuation point for throwing of implicit exceptions that are
7130   // not handled in the current activation. Fabricates an exception
7131   // oop and initiates normal exception dispatching in this
7132   // frame. Since we need to preserve callee-saved values (currently
7133   // only for C2, but done for C1 as well) we need a callee-saved oop
7134   // map and therefore have to make these stubs into RuntimeStubs
7135   // rather than BufferBlobs.  If the compiler needs all registers to
7136   // be preserved between the fault point and the exception handler
7137   // then it must assume responsibility for that in
7138   // AbstractCompiler::continuation_for_implicit_null_exception or
7139   // continuation_for_implicit_division_by_zero_exception. All other
7140   // implicit exceptions (e.g., NullPointerException or
7141   // AbstractMethodError on entry) are either at call sites or
7142   // otherwise assume that stack unwinding will be initiated, so
7143   // caller saved registers were assumed volatile in the compiler.
7144 
7145 #undef __
7146 #define __ masm->
7147 
7148   address generate_throw_exception(const char* name,
7149                                    address runtime_entry,
7150                                    Register arg1 = noreg,
7151                                    Register arg2 = noreg) {
7152     // Information about frame layout at time of blocking runtime call.
7153     // Note that we only have to preserve callee-saved registers since
7154     // the compilers are responsible for supplying a continuation point
7155     // if they expect all registers to be preserved.
7156     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
7157     enum layout {
7158       rfp_off = 0,
7159       rfp_off2,
7160       return_off,
7161       return_off2,
7162       framesize // inclusive of return address
7163     };
7164 
7165     int insts_size = 512;
7166     int locs_size  = 64;
7167 
7168     CodeBuffer code(name, insts_size, locs_size);
7169     OopMapSet* oop_maps  = new OopMapSet();
7170     MacroAssembler* masm = new MacroAssembler(&code);
7171 
7172     address start = __ pc();
7173 
7174     // This is an inlined and slightly modified version of call_VM
7175     // which has the ability to fetch the return PC out of
7176     // thread-local storage and also sets up last_Java_sp slightly
7177     // differently than the real call_VM
7178 
7179     __ enter(); // Save FP and LR before call
7180 
7181     assert(is_even(framesize/2), "sp not 16-byte aligned");
7182 
7183     // lr and fp are already in place
7184     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
7185 
7186     int frame_complete = __ pc() - start;
7187 
7188     // Set up last_Java_sp and last_Java_fp
7189     address the_pc = __ pc();
7190     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
7191 
7192     // Call runtime
7193     if (arg1 != noreg) {
7194       assert(arg2 != c_rarg1, "clobbered");
7195       __ mov(c_rarg1, arg1);
7196     }
7197     if (arg2 != noreg) {
7198       __ mov(c_rarg2, arg2);
7199     }
7200     __ mov(c_rarg0, rthread);
7201     BLOCK_COMMENT("call runtime_entry");
7202     __ mov(rscratch1, runtime_entry);
7203     __ blr(rscratch1);
7204 
7205     // Generate oop map
7206     OopMap* map = new OopMap(framesize, 0);
7207 
7208     oop_maps->add_gc_map(the_pc - start, map);
7209 
7210     __ reset_last_Java_frame(true);
7211 
7212     // Reinitialize the ptrue predicate register, in case the external runtime
7213     // call clobbers ptrue reg, as we may return to SVE compiled code.
7214     __ reinitialize_ptrue();
7215 
7216     __ leave();
7217 
7218     // check for pending exceptions
7219 #ifdef ASSERT
7220     Label L;
7221     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
7222     __ cbnz(rscratch1, L);
7223     __ should_not_reach_here();
7224     __ bind(L);
7225 #endif // ASSERT
7226     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7227 
7228     // codeBlob framesize is in words (not VMRegImpl::slot_size)
7229     RuntimeStub* stub =
7230       RuntimeStub::new_runtime_stub(name,
7231                                     &code,
7232                                     frame_complete,
7233                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7234                                     oop_maps, false);
7235     return stub->entry_point();
7236   }
7237 
7238   class MontgomeryMultiplyGenerator : public MacroAssembler {
7239 
7240     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7241       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7242 
7243     RegSet _toSave;
7244     bool _squaring;
7245 
7246   public:
7247     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7248       : MacroAssembler(as->code()), _squaring(squaring) {
7249 
7250       // Register allocation
7251 
7252       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7253       Pa_base = *regs;       // Argument registers
7254       if (squaring)
7255         Pb_base = Pa_base;
7256       else
7257         Pb_base = *++regs;
7258       Pn_base = *++regs;
7259       Rlen = *++regs;
7260       inv = *++regs;
7261       Pm_base = *++regs;
7262 
7263                           // Working registers:
7264       Ra =  *++regs;        // The current digit of a, b, n, and m.
7265       Rb =  *++regs;
7266       Rm =  *++regs;
7267       Rn =  *++regs;
7268 
7269       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7270       Pb =  *++regs;
7271       Pm =  *++regs;
7272       Pn =  *++regs;
7273 
7274       t0 =  *++regs;        // Three registers which form a
7275       t1 =  *++regs;        // triple-precision accumulator.
7276       t2 =  *++regs;
7277 
7278       Ri =  *++regs;        // Inner and outer loop indexes.
7279       Rj =  *++regs;
7280 
7281       Rhi_ab = *++regs;     // Product registers: low and high parts
7282       Rlo_ab = *++regs;     // of a*b and m*n.
7283       Rhi_mn = *++regs;
7284       Rlo_mn = *++regs;
7285 
7286       // r19 and up are callee-saved.
7287       _toSave = RegSet::range(r19, *regs) + Pm_base;
7288     }
7289 
7290   private:
7291     void save_regs() {
7292       push(_toSave, sp);
7293     }
7294 
7295     void restore_regs() {
7296       pop(_toSave, sp);
7297     }
7298 
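         // unroll_2 emits two copies of 'block' per loop iteration, branching
         // into the second copy when count is odd, so that at run time the
         // block executes exactly 'count' times.  In C, approximately (a
         // sketch only; the real code takes a member-function pointer and a
         // register count):
         //
         //   void unroll_2(int count, void (*block)()) {
         //     if (count & 1) { block(); count--; }            // odd tail first
         //     while (count > 0) { block(); block(); count -= 2; }
         //   }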
7299     template <typename T>
7300     void unroll_2(Register count, T block) {
7301       Label loop, end, odd;
7302       tbnz(count, 0, odd);
7303       cbz(count, end);
7304       align(16);
7305       bind(loop);
7306       (this->*block)();
7307       bind(odd);
7308       (this->*block)();
7309       subs(count, count, 2);
7310       br(Assembler::GT, loop);
7311       bind(end);
7312     }
7313 
7314     template <typename T>
7315     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7316       Label loop, end, odd;
7317       tbnz(count, 0, odd);
7318       cbz(count, end);
7319       align(16);
7320       bind(loop);
7321       (this->*block)(d, s, tmp);
7322       bind(odd);
7323       (this->*block)(d, s, tmp);
7324       subs(count, count, 2);
7325       br(Assembler::GT, loop);
7326       bind(end);
7327     }
7328 
7329     void pre1(RegisterOrConstant i) {
7330       block_comment("pre1");
7331       // Pa = Pa_base;
7332       // Pb = Pb_base + i;
7333       // Pm = Pm_base;
7334       // Pn = Pn_base + i;
7335       // Ra = *Pa;
7336       // Rb = *Pb;
7337       // Rm = *Pm;
7338       // Rn = *Pn;
7339       ldr(Ra, Address(Pa_base));
7340       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7341       ldr(Rm, Address(Pm_base));
7342       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7343       lea(Pa, Address(Pa_base));
7344       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7345       lea(Pm, Address(Pm_base));
7346       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7347 
7348       // Zero the m*n result.
7349       mov(Rhi_mn, zr);
7350       mov(Rlo_mn, zr);
7351     }
7352 
7353     // The core multiply-accumulate step of a Montgomery
7354     // multiplication.  The idea is to schedule operations as a
7355     // pipeline so that instructions with long latencies (loads and
7356     // multiplies) have time to complete before their results are
7357     // used.  This most benefits in-order implementations of the
7358     // architecture but out-of-order ones also benefit.
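         //
         // In the pseudocode comments below, MACC(A, B, t0, t1, t2) denotes a
         // 64x64->128-bit multiply accumulated into the triple-precision
         // accumulator (t2:t1:t0); MACC2 accumulates the product twice.
         // Roughly (a sketch only):
         //
         //   // (t2:t1:t0) += A * B
         //   void MACC(julong A, julong B, julong& t0, julong& t1, julong& t2) {
         //     unsigned __int128 p  = (unsigned __int128)A * B;
         //     unsigned __int128 lo = (unsigned __int128)t0 + (julong)p;
         //     unsigned __int128 hi = (unsigned __int128)t1 + (julong)(p >> 64)
         //                            + (julong)(lo >> 64);
         //     t0 = (julong)lo;
         //     t1 = (julong)hi;
         //     t2 += (julong)(hi >> 64);
         //   }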
7359     void step() {
7360       block_comment("step");
7361       // MACC(Ra, Rb, t0, t1, t2);
7362       // Ra = *++Pa;
7363       // Rb = *--Pb;
7364       umulh(Rhi_ab, Ra, Rb);
7365       mul(Rlo_ab, Ra, Rb);
7366       ldr(Ra, pre(Pa, wordSize));
7367       ldr(Rb, pre(Pb, -wordSize));
7368       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7369                                        // previous iteration.
7370       // MACC(Rm, Rn, t0, t1, t2);
7371       // Rm = *++Pm;
7372       // Rn = *--Pn;
7373       umulh(Rhi_mn, Rm, Rn);
7374       mul(Rlo_mn, Rm, Rn);
7375       ldr(Rm, pre(Pm, wordSize));
7376       ldr(Rn, pre(Pn, -wordSize));
7377       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7378     }
7379 
7380     void post1() {
7381       block_comment("post1");
7382 
7383       // MACC(Ra, Rb, t0, t1, t2);
7384       // Ra = *++Pa;
7385       // Rb = *--Pb;
7386       umulh(Rhi_ab, Ra, Rb);
7387       mul(Rlo_ab, Ra, Rb);
7388       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7389       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7390 
7391       // *Pm = Rm = t0 * inv;
7392       mul(Rm, t0, inv);
7393       str(Rm, Address(Pm));
7394 
7395       // MACC(Rm, Rn, t0, t1, t2);
7396       // t0 = t1; t1 = t2; t2 = 0;
7397       umulh(Rhi_mn, Rm, Rn);
7398 
7399 #ifndef PRODUCT
7400       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7401       {
7402         mul(Rlo_mn, Rm, Rn);
7403         add(Rlo_mn, t0, Rlo_mn);
7404         Label ok;
7405         cbz(Rlo_mn, ok); {
7406           stop("broken Montgomery multiply");
7407         } bind(ok);
7408       }
7409 #endif
7410       // We have very carefully set things up so that
7411       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7412       // the lower half of Rm * Rn because we know the result already:
7413       // it must be -t0.  t0 + (-t0) must generate a carry iff
7414       // t0 != 0.  So, rather than do a mul and an adds we just set
7415       // the carry flag iff t0 is nonzero.
7416       //
7417       // mul(Rlo_mn, Rm, Rn);
7418       // adds(zr, t0, Rlo_mn);
7419       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7420       adcs(t0, t1, Rhi_mn);
7421       adc(t1, t2, zr);
7422       mov(t2, zr);
7423     }
7424 
7425     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7426       block_comment("pre2");
7427       // Pa = Pa_base + i-len;
7428       // Pb = Pb_base + len;
7429       // Pm = Pm_base + i-len;
7430       // Pn = Pn_base + len;
7431 
7432       if (i.is_register()) {
7433         sub(Rj, i.as_register(), len);
7434       } else {
7435         mov(Rj, i.as_constant());
7436         sub(Rj, Rj, len);
7437       }
7438       // Rj == i-len
7439 
7440       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7441       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7442       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7443       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7444 
7445       // Ra = *++Pa;
7446       // Rb = *--Pb;
7447       // Rm = *++Pm;
7448       // Rn = *--Pn;
7449       ldr(Ra, pre(Pa, wordSize));
7450       ldr(Rb, pre(Pb, -wordSize));
7451       ldr(Rm, pre(Pm, wordSize));
7452       ldr(Rn, pre(Pn, -wordSize));
7453 
7454       mov(Rhi_mn, zr);
7455       mov(Rlo_mn, zr);
7456     }
7457 
7458     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7459       block_comment("post2");
7460       if (i.is_constant()) {
7461         mov(Rj, i.as_constant()-len.as_constant());
7462       } else {
7463         sub(Rj, i.as_register(), len);
7464       }
7465 
7466       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7467 
7468       // As soon as we know the least significant digit of our result,
7469       // store it.
7470       // Pm_base[i-len] = t0;
7471       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7472 
7473       // t0 = t1; t1 = t2; t2 = 0;
7474       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7475       adc(t1, t2, zr);
7476       mov(t2, zr);
7477     }
7478 
7479     // A carry in t0 after Montgomery multiplication means that we
7480     // should subtract multiples of n from our result in m.  We'll
7481     // keep doing that until there is no carry.
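         //
         // The sub() in the pseudocode below is a multi-precision subtract
         // with borrow: m -= n, returning t0 minus the final borrow.  Roughly
         // (a sketch only; there is no such helper in this file):
         //
         //   julong sub(julong Pm[], julong Pn[], julong t0, int len) {
         //     julong borrow = 0;
         //     for (int i = 0; i < len; i++) {
         //       unsigned __int128 d = (unsigned __int128)Pm[i] - Pn[i] - borrow;
         //       Pm[i] = (julong)d;
         //       borrow = (julong)(d >> 64) & 1;  // 1 if the subtraction borrowed
         //     }
         //     return t0 - borrow;
         //   }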
7482     void normalize(RegisterOrConstant len) {
7483       block_comment("normalize");
7484       // while (t0)
7485       //   t0 = sub(Pm_base, Pn_base, t0, len);
7486       Label loop, post, again;
7487       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7488       cbz(t0, post); {
7489         bind(again); {
7490           mov(i, zr);
7491           mov(cnt, len);
7492           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7493           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7494           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7495           align(16);
7496           bind(loop); {
7497             sbcs(Rm, Rm, Rn);
7498             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7499             add(i, i, 1);
7500             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7501             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7502             sub(cnt, cnt, 1);
7503           } cbnz(cnt, loop);
7504           sbc(t0, t0, zr);
7505         } cbnz(t0, again);
7506       } bind(post);
7507     }
7508 
7509     // Move memory at s to d, reversing words.
7510     //    Increments d to end of copied memory
7511     //    Destroys tmp1, tmp2
7512     //    Preserves len
7513     //    Leaves s pointing to the address which was in d at start
7514     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7515       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7516       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7517 
7518       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7519       mov(tmp1, len);
7520       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7521       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7522     }
7523     // where the loop body reverse1 is:
7524     void reverse1(Register d, Register s, Register tmp) {
7525       ldr(tmp, pre(s, -wordSize));
7526       ror(tmp, tmp, 32);
7527       str(tmp, post(d, wordSize));
7528     }
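         // In C, approximately (a sketch of the word-reversing copy above; the
         // register side effects on d and s are described in the comment
         // before reverse):
         //
         //   void reverse(julong* d, julong* s, int len) {
         //     julong* end = s + len;            // one past the last 64-bit word
         //     for (int i = 0; i < len; i++) {
         //       julong w = *--end;
         //       d[i] = (w << 32) | (w >> 32);   // ror #32: swap the 32-bit halves
         //     }
         //   }
         //
         // Viewed as an array of 2*len ints, the destination ends up holding
         // the source ints in reverse order.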
7529 
7530     void step_squaring() {
7531       // An extra ACC
7532       step();
7533       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7534     }
7535 
7536     void last_squaring(RegisterOrConstant i) {
7537       Label dont;
7538       // if ((i & 1) == 0) {
7539       tbnz(i.as_register(), 0, dont); {
7540         // MACC(Ra, Rb, t0, t1, t2);
7541         // Ra = *++Pa;
7542         // Rb = *--Pb;
7543         umulh(Rhi_ab, Ra, Rb);
7544         mul(Rlo_ab, Ra, Rb);
7545         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7546       } bind(dont);
7547     }
7548 
7549     void extra_step_squaring() {
7550       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7551 
7552       // MACC(Rm, Rn, t0, t1, t2);
7553       // Rm = *++Pm;
7554       // Rn = *--Pn;
7555       umulh(Rhi_mn, Rm, Rn);
7556       mul(Rlo_mn, Rm, Rn);
7557       ldr(Rm, pre(Pm, wordSize));
7558       ldr(Rn, pre(Pn, -wordSize));
7559     }
7560 
7561     void post1_squaring() {
7562       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7563 
7564       // *Pm = Rm = t0 * inv;
7565       mul(Rm, t0, inv);
7566       str(Rm, Address(Pm));
7567 
7568       // MACC(Rm, Rn, t0, t1, t2);
7569       // t0 = t1; t1 = t2; t2 = 0;
7570       umulh(Rhi_mn, Rm, Rn);
7571 
7572 #ifndef PRODUCT
7573       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7574       {
7575         mul(Rlo_mn, Rm, Rn);
7576         add(Rlo_mn, t0, Rlo_mn);
7577         Label ok;
7578         cbz(Rlo_mn, ok); {
7579           stop("broken Montgomery multiply");
7580         } bind(ok);
7581       }
7582 #endif
7583       // We have very carefully set things up so that
7584       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7585       // the lower half of Rm * Rn because we know the result already:
7586       // it must be -t0.  t0 + (-t0) must generate a carry iff
7587       // t0 != 0.  So, rather than do a mul and an adds we just set
7588       // the carry flag iff t0 is nonzero.
7589       //
7590       // mul(Rlo_mn, Rm, Rn);
7591       // adds(zr, t0, Rlo_mn);
7592       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7593       adcs(t0, t1, Rhi_mn);
7594       adc(t1, t2, zr);
7595       mov(t2, zr);
7596     }
7597 
7598     void acc(Register Rhi, Register Rlo,
7599              Register t0, Register t1, Register t2) {
7600       adds(t0, t0, Rlo);
7601       adcs(t1, t1, Rhi);
7602       adc(t2, t2, zr);
7603     }
7604 
7605   public:
7606     /**
7607      * Fast Montgomery multiplication.  The derivation of the
7608      * algorithm is in A Cryptographic Library for the Motorola
7609      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7610      *
7611      * Arguments:
7612      *
7613      * Inputs for multiplication:
7614      *   c_rarg0   - int array elements a
7615      *   c_rarg1   - int array elements b
7616      *   c_rarg2   - int array elements n (the modulus)
7617      *   c_rarg3   - int length
7618      *   c_rarg4   - int inv
7619      *   c_rarg5   - int array elements m (the result)
7620      *
7621      * Inputs for squaring:
7622      *   c_rarg0   - int array elements a
7623      *   c_rarg1   - int array elements n (the modulus)
7624      *   c_rarg2   - int length
7625      *   c_rarg3   - int inv
7626      *   c_rarg4   - int array elements m (the result)
7627      *
7628      */
7629     address generate_multiply() {
7630       Label argh, nothing;
7631       bind(argh);
7632       stop("MontgomeryMultiply total_allocation must be <= 8192");
7633 
7634       align(CodeEntryAlignment);
7635       address entry = pc();
7636 
7637       cbzw(Rlen, nothing);
7638 
7639       enter();
7640 
7641       // Make room.
7642       cmpw(Rlen, 512);
7643       br(Assembler::HI, argh);
7644       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7645       andr(sp, Ra, -2 * wordSize);
7646 
7647       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7648 
7649       {
7650         // Copy input args, reversing as we go.  We use Ra as a
7651         // temporary variable.
7652         reverse(Ra, Pa_base, Rlen, t0, t1);
7653         if (!_squaring)
7654           reverse(Ra, Pb_base, Rlen, t0, t1);
7655         reverse(Ra, Pn_base, Rlen, t0, t1);
7656       }
7657 
7658       // Push all call-saved registers and also Pm_base which we'll need
7659       // at the end.
7660       save_regs();
7661 
7662 #ifndef PRODUCT
7663       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7664       {
7665         ldr(Rn, Address(Pn_base, 0));
7666         mul(Rlo_mn, Rn, inv);
7667         subs(zr, Rlo_mn, -1);
7668         Label ok;
7669         br(EQ, ok); {
7670           stop("broken inverse in Montgomery multiply");
7671         } bind(ok);
7672       }
7673 #endif
7674 
7675       mov(Pm_base, Ra);
7676 
7677       mov(t0, zr);
7678       mov(t1, zr);
7679       mov(t2, zr);
7680 
7681       block_comment("for (int i = 0; i < len; i++) {");
7682       mov(Ri, zr); {
7683         Label loop, end;
7684         cmpw(Ri, Rlen);
7685         br(Assembler::GE, end);
7686 
7687         bind(loop);
7688         pre1(Ri);
7689 
7690         block_comment("  for (j = i; j; j--) {"); {
7691           movw(Rj, Ri);
7692           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7693         } block_comment("  } // j");
7694 
7695         post1();
7696         addw(Ri, Ri, 1);
7697         cmpw(Ri, Rlen);
7698         br(Assembler::LT, loop);
7699         bind(end);
7700         block_comment("} // i");
7701       }
7702 
7703       block_comment("for (int i = len; i < 2*len; i++) {");
7704       mov(Ri, Rlen); {
7705         Label loop, end;
7706         cmpw(Ri, Rlen, Assembler::LSL, 1);
7707         br(Assembler::GE, end);
7708 
7709         bind(loop);
7710         pre2(Ri, Rlen);
7711 
7712         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7713           lslw(Rj, Rlen, 1);
7714           subw(Rj, Rj, Ri);
7715           subw(Rj, Rj, 1);
7716           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7717         } block_comment("  } // j");
7718 
7719         post2(Ri, Rlen);
7720         addw(Ri, Ri, 1);
7721         cmpw(Ri, Rlen, Assembler::LSL, 1);
7722         br(Assembler::LT, loop);
7723         bind(end);
7724       }
7725       block_comment("} // i");
7726 
7727       normalize(Rlen);
7728 
7729       mov(Ra, Pm_base);  // Save Pm_base in Ra
7730       restore_regs();  // Restore caller's Pm_base
7731 
7732       // Copy our result into caller's Pm_base
7733       reverse(Pm_base, Ra, Rlen, t0, t1);
7734 
7735       leave();
7736       bind(nothing);
7737       ret(lr);
7738 
7739       return entry;
7740     }
7741     // In C, approximately:
7742 
7743     // void
7744     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7745     //                     julong Pn_base[], julong Pm_base[],
7746     //                     julong inv, int len) {
7747     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7748     //   julong *Pa, *Pb, *Pn, *Pm;
7749     //   julong Ra, Rb, Rn, Rm;
7750 
7751     //   int i;
7752 
7753     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7754 
7755     //   for (i = 0; i < len; i++) {
7756     //     int j;
7757 
7758     //     Pa = Pa_base;
7759     //     Pb = Pb_base + i;
7760     //     Pm = Pm_base;
7761     //     Pn = Pn_base + i;
7762 
7763     //     Ra = *Pa;
7764     //     Rb = *Pb;
7765     //     Rm = *Pm;
7766     //     Rn = *Pn;
7767 
7768     //     int iters = i;
7769     //     for (j = 0; iters--; j++) {
7770     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7771     //       MACC(Ra, Rb, t0, t1, t2);
7772     //       Ra = *++Pa;
7773     //       Rb = *--Pb;
7774     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7775     //       MACC(Rm, Rn, t0, t1, t2);
7776     //       Rm = *++Pm;
7777     //       Rn = *--Pn;
7778     //     }
7779 
7780     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7781     //     MACC(Ra, Rb, t0, t1, t2);
7782     //     *Pm = Rm = t0 * inv;
7783     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7784     //     MACC(Rm, Rn, t0, t1, t2);
7785 
7786     //     assert(t0 == 0, "broken Montgomery multiply");
7787 
7788     //     t0 = t1; t1 = t2; t2 = 0;
7789     //   }
7790 
7791     //   for (i = len; i < 2*len; i++) {
7792     //     int j;
7793 
7794     //     Pa = Pa_base + i-len;
7795     //     Pb = Pb_base + len;
7796     //     Pm = Pm_base + i-len;
7797     //     Pn = Pn_base + len;
7798 
7799     //     Ra = *++Pa;
7800     //     Rb = *--Pb;
7801     //     Rm = *++Pm;
7802     //     Rn = *--Pn;
7803 
7804     //     int iters = len*2-i-1;
7805     //     for (j = i-len+1; iters--; j++) {
7806     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7807     //       MACC(Ra, Rb, t0, t1, t2);
7808     //       Ra = *++Pa;
7809     //       Rb = *--Pb;
7810     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7811     //       MACC(Rm, Rn, t0, t1, t2);
7812     //       Rm = *++Pm;
7813     //       Rn = *--Pn;
7814     //     }
7815 
7816     //     Pm_base[i-len] = t0;
7817     //     t0 = t1; t1 = t2; t2 = 0;
7818     //   }
7819 
7820     //   while (t0)
7821     //     t0 = sub(Pm_base, Pn_base, t0, len);
7822     // }
7823 
7824     /**
7825      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7826      * multiplies than Montgomery multiplication so it should be up to
7827      * 25% faster.  However, its loop control is more complex and it
7828      * may actually run slower on some machines.
7829      *
7830      * Arguments:
7831      *
7832      * Inputs:
7833      *   c_rarg0   - int array elements a
7834      *   c_rarg1   - int array elements n (the modulus)
7835      *   c_rarg2   - int length
7836      *   c_rarg3   - int inv
7837      *   c_rarg4   - int array elements m (the result)
7838      *
7839      */
7840     address generate_square() {
7841       Label argh;
7842       bind(argh);
7843       stop("MontgomeryMultiply total_allocation must be <= 8192");
7844 
7845       align(CodeEntryAlignment);
7846       address entry = pc();
7847 
7848       enter();
7849 
7850       // Make room.
7851       cmpw(Rlen, 512);
7852       br(Assembler::HI, argh);
7853       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7854       andr(sp, Ra, -2 * wordSize);
7855 
7856       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7857 
7858       {
7859         // Copy input args, reversing as we go.  We use Ra as a
7860         // temporary variable.
7861         reverse(Ra, Pa_base, Rlen, t0, t1);
7862         reverse(Ra, Pn_base, Rlen, t0, t1);
7863       }
7864 
7865       // Push all call-saved registers and also Pm_base which we'll need
7866       // at the end.
7867       save_regs();
7868 
7869       mov(Pm_base, Ra);
7870 
7871       mov(t0, zr);
7872       mov(t1, zr);
7873       mov(t2, zr);
7874 
7875       block_comment("for (int i = 0; i < len; i++) {");
7876       mov(Ri, zr); {
7877         Label loop, end;
7878         bind(loop);
7879         cmp(Ri, Rlen);
7880         br(Assembler::GE, end);
7881 
7882         pre1(Ri);
7883 
7884         block_comment("for (j = (i+1)/2; j; j--) {"); {
7885           add(Rj, Ri, 1);
7886           lsr(Rj, Rj, 1);
7887           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7888         } block_comment("  } // j");
7889 
7890         last_squaring(Ri);
7891 
7892         block_comment("  for (j = i/2; j; j--) {"); {
7893           lsr(Rj, Ri, 1);
7894           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7895         } block_comment("  } // j");
7896 
7897         post1_squaring();
7898         add(Ri, Ri, 1);
7899         cmp(Ri, Rlen);
7900         br(Assembler::LT, loop);
7901 
7902         bind(end);
7903         block_comment("} // i");
7904       }
7905 
7906       block_comment("for (int i = len; i < 2*len; i++) {");
7907       mov(Ri, Rlen); {
7908         Label loop, end;
7909         bind(loop);
7910         cmp(Ri, Rlen, Assembler::LSL, 1);
7911         br(Assembler::GE, end);
7912 
7913         pre2(Ri, Rlen);
7914 
7915         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7916           lsl(Rj, Rlen, 1);
7917           sub(Rj, Rj, Ri);
7918           sub(Rj, Rj, 1);
7919           lsr(Rj, Rj, 1);
7920           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7921         } block_comment("  } // j");
7922 
7923         last_squaring(Ri);
7924 
7925         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7926           lsl(Rj, Rlen, 1);
7927           sub(Rj, Rj, Ri);
7928           lsr(Rj, Rj, 1);
7929           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7930         } block_comment("  } // j");
7931 
7932         post2(Ri, Rlen);
7933         add(Ri, Ri, 1);
7934         cmp(Ri, Rlen, Assembler::LSL, 1);
7935 
7936         br(Assembler::LT, loop);
7937         bind(end);
7938         block_comment("} // i");
7939       }
7940 
7941       normalize(Rlen);
7942 
7943       mov(Ra, Pm_base);  // Save Pm_base in Ra
7944       restore_regs();  // Restore caller's Pm_base
7945 
7946       // Copy our result into caller's Pm_base
7947       reverse(Pm_base, Ra, Rlen, t0, t1);
7948 
7949       leave();
7950       ret(lr);
7951 
7952       return entry;
7953     }
7954     // In C, approximately:
7955 
7956     // void
7957     // montgomery_square(julong Pa_base[], julong Pn_base[],
7958     //                   julong Pm_base[], julong inv, int len) {
7959     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7960     //   julong *Pa, *Pb, *Pn, *Pm;
7961     //   julong Ra, Rb, Rn, Rm;
7962 
7963     //   int i;
7964 
7965     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7966 
7967     //   for (i = 0; i < len; i++) {
7968     //     int j;
7969 
7970     //     Pa = Pa_base;
7971     //     Pb = Pa_base + i;
7972     //     Pm = Pm_base;
7973     //     Pn = Pn_base + i;
7974 
7975     //     Ra = *Pa;
7976     //     Rb = *Pb;
7977     //     Rm = *Pm;
7978     //     Rn = *Pn;
7979 
7980     //     int iters = (i+1)/2;
7981     //     for (j = 0; iters--; j++) {
7982     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7983     //       MACC2(Ra, Rb, t0, t1, t2);
7984     //       Ra = *++Pa;
7985     //       Rb = *--Pb;
7986     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7987     //       MACC(Rm, Rn, t0, t1, t2);
7988     //       Rm = *++Pm;
7989     //       Rn = *--Pn;
7990     //     }
7991     //     if ((i & 1) == 0) {
7992     //       assert(Ra == Pa_base[j], "must be");
7993     //       MACC(Ra, Ra, t0, t1, t2);
7994     //     }
7995     //     iters = i/2;
7996     //     assert(iters == i-j, "must be");
7997     //     for (; iters--; j++) {
7998     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7999     //       MACC(Rm, Rn, t0, t1, t2);
8000     //       Rm = *++Pm;
8001     //       Rn = *--Pn;
8002     //     }
8003 
8004     //     *Pm = Rm = t0 * inv;
8005     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8006     //     MACC(Rm, Rn, t0, t1, t2);
8007 
8008     //     assert(t0 == 0, "broken Montgomery multiply");
8009 
8010     //     t0 = t1; t1 = t2; t2 = 0;
8011     //   }
8012 
8013     //   for (i = len; i < 2*len; i++) {
8014     //     int start = i-len+1;
8015     //     int end = start + (len - start)/2;
8016     //     int j;
8017 
8018     //     Pa = Pa_base + i-len;
8019     //     Pb = Pa_base + len;
8020     //     Pm = Pm_base + i-len;
8021     //     Pn = Pn_base + len;
8022 
8023     //     Ra = *++Pa;
8024     //     Rb = *--Pb;
8025     //     Rm = *++Pm;
8026     //     Rn = *--Pn;
8027 
8028     //     int iters = (2*len-i-1)/2;
8029     //     assert(iters == end-start, "must be");
8030     //     for (j = start; iters--; j++) {
8031     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8032     //       MACC2(Ra, Rb, t0, t1, t2);
8033     //       Ra = *++Pa;
8034     //       Rb = *--Pb;
8035     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8036     //       MACC(Rm, Rn, t0, t1, t2);
8037     //       Rm = *++Pm;
8038     //       Rn = *--Pn;
8039     //     }
8040     //     if ((i & 1) == 0) {
8041     //       assert(Ra == Pa_base[j], "must be");
8042     //       MACC(Ra, Ra, t0, t1, t2);
8043     //     }
8044     //     iters =  (2*len-i)/2;
8045     //     assert(iters == len-j, "must be");
8046     //     for (; iters--; j++) {
8047     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8048     //       MACC(Rm, Rn, t0, t1, t2);
8049     //       Rm = *++Pm;
8050     //       Rn = *--Pn;
8051     //     }
8052     //     Pm_base[i-len] = t0;
8053     //     t0 = t1; t1 = t2; t2 = 0;
8054     //   }
8055 
8056     //   while (t0)
8057     //     t0 = sub(Pm_base, Pn_base, t0, len);
8058     // }
8059   };
8060 
8061 
8062   // Call here from the interpreter or compiled code to either load
8063   // multiple returned values from the inline type instance being
8064   // returned to registers or to store returned values to a newly
8065   // allocated inline type instance.
8066   address generate_return_value_stub(address destination, const char* name, bool has_res) {
8067     // We need to save all registers the calling convention may use so
8068     // the runtime calls read or update those registers. This needs to
8069     // be in sync with SharedRuntime::java_return_convention().
8070     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
8071     enum layout {
8072       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
8073       j_rarg6_off, j_rarg6_2,
8074       j_rarg5_off, j_rarg5_2,
8075       j_rarg4_off, j_rarg4_2,
8076       j_rarg3_off, j_rarg3_2,
8077       j_rarg2_off, j_rarg2_2,
8078       j_rarg1_off, j_rarg1_2,
8079       j_rarg0_off, j_rarg0_2,
8080 
8081       j_farg7_off, j_farg7_2,
8082       j_farg6_off, j_farg6_2,
8083       j_farg5_off, j_farg5_2,
8084       j_farg4_off, j_farg4_2,
8085       j_farg3_off, j_farg3_2,
8086       j_farg2_off, j_farg2_2,
8087       j_farg1_off, j_farg1_2,
8088       j_farg0_off, j_farg0_2,
8089 
8090       rfp_off, rfp_off2,
8091       return_off, return_off2,
8092 
8093       framesize // inclusive of return address
8094     };
8095 
8096     CodeBuffer code(name, 512, 64);
8097     MacroAssembler* masm = new MacroAssembler(&code);
8098 
8099     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
8100     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
8101     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
8102     int frame_size_in_words = frame_size_in_bytes / wordSize;
8103 
8104     OopMapSet* oop_maps = new OopMapSet();
8105     OopMap* map = new OopMap(frame_size_in_slots, 0);
8106 
8107     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
8108     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
8109     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
8110     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
8111     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
8112     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
8113     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
8114     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
8115 
8116     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
8117     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
8118     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
8119     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
8120     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
8121     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
8122     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
8123     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
8124 
8125     address start = __ pc();
8126 
8127     __ enter(); // Save FP and LR before call
8128 
8129     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
8130     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
8131     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
8132     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
8133 
8134     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
8135     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
8136     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
8137     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
8138 
8139     int frame_complete = __ offset();
8140 
8141     // Set up last_Java_sp and last_Java_fp
8142     address the_pc = __ pc();
8143     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
8144 
8145     // Call runtime
8146     __ mov(c_rarg1, r0);
8147     __ mov(c_rarg0, rthread);
8148 
8149     __ mov(rscratch1, destination);
8150     __ blr(rscratch1);
8151 
8152     oop_maps->add_gc_map(the_pc - start, map);
8153 
8154     __ reset_last_Java_frame(false);
8155 
8156     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
8157     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
8158     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
8159     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
8160 
8161     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
8162     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
8163     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
8164     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
8165 
8166     __ leave();
8167 
8168     // check for pending exceptions
8169     Label pending;
8170     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
8171     __ cbnz(rscratch1, pending);
8172 
8173     if (has_res) {
8174       __ get_vm_result(r0, rthread);
8175     }
8176 
8177     __ ret(lr);
8178 
8179     __ bind(pending);
8180     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
8181 
8182     // -------------
8183     // make sure all code is generated
8184     masm->flush();
8185 
8186     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
8187     return stub->entry_point();
8188   }
8189 
8190   // Initialization
8191   void generate_initial_stubs() {
8192     // Generate initial stubs and initialize the entry points
8193 
8194     // Entry points that exist on all platforms. Note: This is code
8195     // that could be shared among different platforms - however the
8196     // benefit seems to be smaller than the disadvantage of having a
8197     // much more complicated generator structure. See also comment in
8198     // stubRoutines.hpp.
8199 
8200     StubRoutines::_forward_exception_entry = generate_forward_exception();
8201 
8202     StubRoutines::_call_stub_entry =
8203       generate_call_stub(StubRoutines::_call_stub_return_address);
8204 
8205     // This entry point is referenced by megamorphic calls
8206     StubRoutines::_catch_exception_entry = generate_catch_exception();
8207 
8208     // Build this early so it's available for the interpreter.
8209     StubRoutines::_throw_StackOverflowError_entry =
8210       generate_throw_exception("StackOverflowError throw_exception",
8211                                CAST_FROM_FN_PTR(address,
8212                                                 SharedRuntime::throw_StackOverflowError));
8213     StubRoutines::_throw_delayed_StackOverflowError_entry =
8214       generate_throw_exception("delayed StackOverflowError throw_exception",
8215                                CAST_FROM_FN_PTR(address,
8216                                                 SharedRuntime::throw_delayed_StackOverflowError));
8217 
8218     // Initialize table for copy memory (arraycopy) check.
8219     if (UnsafeCopyMemory::_table == nullptr) {
8220       UnsafeCopyMemory::create_table(8);
8221     }
8222 
8223     if (UseCRC32Intrinsics) {
8224       // set table address before generating the stubs that use it
8225       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8226       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8227     }
8228 
8229     if (UseCRC32CIntrinsics) {
8230       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8231     }
8232 
8233     // Disabled until JDK-8210858 is fixed
8234     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
8235     //   StubRoutines::_dlog = generate_dlog();
8236     // }
8237 
8238     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8239       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8240     }
8241 
8242     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8243       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8244     }
8245 
8246     if (InlineTypeReturnedAsFields) {
8247       StubRoutines::_load_inline_type_fields_in_regs =
8248          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
8249       StubRoutines::_store_inline_type_fields_to_buf =
8250          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
8251     }
8252   }
8253 
8254   void generate_continuation_stubs() {
8255     // Continuation stubs:
8256     StubRoutines::_cont_thaw          = generate_cont_thaw();
8257     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8258     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8259 
8260     JFR_ONLY(StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();)
8261     JFR_ONLY(StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();)
8262   }
8263 
8264   void generate_final_stubs() {
8265     // support for verify_oop (must happen after universe_init)
8266     if (VerifyOops) {
8267       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8268     }
8269     StubRoutines::_throw_AbstractMethodError_entry =
8270       generate_throw_exception("AbstractMethodError throw_exception",
8271                                CAST_FROM_FN_PTR(address,
8272                                                 SharedRuntime::
8273                                                 throw_AbstractMethodError));
8274 
8275     StubRoutines::_throw_IncompatibleClassChangeError_entry =
8276       generate_throw_exception("IncompatibleClassChangeError throw_exception",
8277                                CAST_FROM_FN_PTR(address,
8278                                                 SharedRuntime::
8279                                                 throw_IncompatibleClassChangeError));
8280 
8281     StubRoutines::_throw_NullPointerException_at_call_entry =
8282       generate_throw_exception("NullPointerException at call throw_exception",
8283                                CAST_FROM_FN_PTR(address,
8284                                                 SharedRuntime::
8285                                                 throw_NullPointerException_at_call));
8286 
8287     // arraycopy stubs used by compilers
8288     generate_arraycopy_stubs();
8289 
8290     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8291     if (bs_nm != nullptr) {
8292       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
8293     }
8294 
8295     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8296 
8297 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8298 
8299     generate_atomic_entry_points();
8300 
8301 #endif // LINUX
8302 
8303     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
8304   }
8305 
  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
      StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
    }
    if (UseSHA3Intrinsics) {
      StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
      StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }
#endif // COMPILER2_OR_JVMCI
  }

 public:
  StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
    switch (kind) {
    case Initial_stubs:
      generate_initial_stubs();
      break;
    case Continuation_stubs:
      generate_continuation_stubs();
      break;
    case Compiler_stubs:
      generate_compiler_stubs();
      break;
    case Final_stubs:
      generate_final_stubs();
      break;
    default:
      fatal("unexpected stubs kind: %d", kind);
      break;
    }
  }
}; // end class declaration

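// Entry point: constructing a StubGenerator for the requested kind emits
// that phase's stubs into the supplied CodeBuffer.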
void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
  StubGenerator g(code, kind);
}


#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

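// For illustration only: DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to roughly
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// i.e. a declaration of the out-of-line default implementation plus a
// function pointer initialized to it. The RELAXED argument (empty, _relaxed,
// _release or _seq_cst) names the memory-ordering variant.
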
DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX