1 /*
   2  * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "runtime/atomic.hpp"
  45 #include "runtime/continuation.hpp"
  46 #include "runtime/continuationEntry.inline.hpp"
  47 #include "runtime/frame.inline.hpp"
  48 #include "runtime/handles.inline.hpp"
  49 #include "runtime/javaThread.hpp"
  50 #include "runtime/sharedRuntime.hpp"
  51 #include "runtime/stubCodeGenerator.hpp"
  52 #include "runtime/stubRoutines.hpp"
  53 #include "utilities/align.hpp"
  54 #include "utilities/globalDefinitions.hpp"
  55 #include "utilities/powerOfTwo.hpp"
  56 #ifdef COMPILER2
  57 #include "opto/runtime.hpp"
  58 #endif
  59 #if INCLUDE_ZGC
  60 #include "gc/z/zThreadLocalData.hpp"
  61 #endif
  62 
  63 // Declaration and definition of StubGenerator (no .hpp file).
  64 // For a more detailed description of the stub routine structure
  65 // see the comment in stubRoutines.hpp
  66 
  67 #undef __
  68 #define __ _masm->
  69 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  70 
  71 #ifdef PRODUCT
  72 #define BLOCK_COMMENT(str) /* nothing */
  73 #else
  74 #define BLOCK_COMMENT(str) __ block_comment(str)
  75 #endif
  76 
  77 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  78 
  79 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots);
  80 void fill_continuation_entry(MacroAssembler* masm);
  81 void continuation_enter_cleanup(MacroAssembler* masm);
  82 
  83 // Stub Code definitions
  84 
  85 class StubGenerator: public StubCodeGenerator {
  86  private:
  87 
  88 #ifdef PRODUCT
  89 #define inc_counter_np(counter) ((void)0)
  90 #else
  91   void inc_counter_np_(int& counter) {
  92     __ lea(rscratch2, ExternalAddress((address)&counter));
  93     __ ldrw(rscratch1, Address(rscratch2));
  94     __ addw(rscratch1, rscratch1, 1);
  95     __ strw(rscratch1, Address(rscratch2));
  96   }
  97 #define inc_counter_np(counter) \
  98   BLOCK_COMMENT("inc_counter " #counter); \
  99   inc_counter_np_(counter);
 100 #endif
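       // In non-product builds inc_counter_np(counter) expands to a block comment
       // plus an unsynchronized load/add/store of the counter word via
       // rscratch1/rscratch2 (see inc_counter_np_ above); for example
       // (illustrative), inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr)
       // bumps the shared byte arraycopy counter. In PRODUCT builds it expands
       // to nothing.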
 101 
 102   // Call stubs are used to call Java from C
 103   //
 104   // Arguments:
 105   //    c_rarg0:   call wrapper address                   address
 106   //    c_rarg1:   result                                 address
 107   //    c_rarg2:   result type                            BasicType
 108   //    c_rarg3:   method                                 Method*
 109   //    c_rarg4:   (interpreter) entry point              address
 110   //    c_rarg5:   parameters                             intptr_t*
 111   //    c_rarg6:   parameter size (in words)              int
 112   //    c_rarg7:   thread                                 Thread*
 113   //
 114   // There is no return from the stub itself as any Java result
 115   // is written to result
 116   //
 117   // we save r30 (lr) as the return PC at the base of the frame and
 118   // link r29 (fp) below it as the frame pointer, then install sp (r31)
 119   // into fp.
 120   //
 121   // we save r0-r7, which accounts for all the C arguments.
 122   //
 123   // TODO: strictly do we need to save them all? they are treated as
 124   // volatile by C so could we omit saving the ones we are going to
 125   // place in global registers (thread? method?) or those we only use
 126   // during setup of the Java call?
 127   //
 128   // we don't need to save r8 which C uses as an indirect result location
 129   // return register.
 130   //
 131   // we don't need to save r9-r15 which both C and Java treat as
 132   // volatile
 133   //
 134   // we don't need to save r16-r18 because Java does not use them
 135   //
 136   // we save r19-r28 which Java uses as scratch registers and C
 137   // expects to be callee-save
 138   //
 139   // we save the bottom 64 bits of each value stored in v8-v15; it is
 140   // the responsibility of the caller to preserve larger values.
 141   //
 142   // so the stub frame looks like this when we enter Java code
 143   //
 144   //     [ return_from_Java     ] <--- sp
 145   //     [ argument word n      ]
 146   //      ...
 147   // -27 [ argument word 1      ]
 148   // -26 [ saved v15            ] <--- sp_after_call
 149   // -25 [ saved v14            ]
 150   // -24 [ saved v13            ]
 151   // -23 [ saved v12            ]
 152   // -22 [ saved v11            ]
 153   // -21 [ saved v10            ]
 154   // -20 [ saved v9             ]
 155   // -19 [ saved v8             ]
 156   // -18 [ saved r28            ]
 157   // -17 [ saved r27            ]
 158   // -16 [ saved r26            ]
 159   // -15 [ saved r25            ]
 160   // -14 [ saved r24            ]
 161   // -13 [ saved r23            ]
 162   // -12 [ saved r22            ]
 163   // -11 [ saved r21            ]
 164   // -10 [ saved r20            ]
 165   //  -9 [ saved r19            ]
 166   //  -8 [ call wrapper    (r0) ]
 167   //  -7 [ result          (r1) ]
 168   //  -6 [ result type     (r2) ]
 169   //  -5 [ method          (r3) ]
 170   //  -4 [ entry point     (r4) ]
 171   //  -3 [ parameters      (r5) ]
 172   //  -2 [ parameter size  (r6) ]
 173   //  -1 [ thread (r7)          ]
 174   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 175   //   1 [ saved lr       (r30) ]
 176 
 177   // Call stub stack layout word offsets from fp
 178   enum call_stub_layout {
 179     sp_after_call_off = -26,
 180 
 181     d15_off            = -26,
 182     d13_off            = -24,
 183     d11_off            = -22,
 184     d9_off             = -20,
 185 
 186     r28_off            = -18,
 187     r26_off            = -16,
 188     r24_off            = -14,
 189     r22_off            = -12,
 190     r20_off            = -10,
 191     call_wrapper_off   =  -8,
 192     result_off         =  -7,
 193     result_type_off    =  -6,
 194     method_off         =  -5,
 195     entry_point_off    =  -4,
 196     parameter_size_off =  -2,
 197     thread_off         =  -1,
 198     fp_f               =   0,
 199     retaddr_off        =   1,
 200   };
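       // Worked example of the layout above: with wordSize == 8 the register
       // save area spans [rfp - 26 * 8, rfp), so sp_after_call sits at
       // rfp - 208, the call wrapper slot at rfp - 64 and the saved lr at
       // rfp + 8; the Address constants in generate_call_stub() below are
       // built from exactly this arithmetic.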
 201 
 202   address generate_call_stub(address& return_address) {
 203     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 204            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 205            "adjust this code");
 206 
 207     StubCodeMark mark(this, "StubRoutines", "call_stub");
 208     address start = __ pc();
 209 
 210     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 211 
 212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 213     const Address result        (rfp, result_off         * wordSize);
 214     const Address result_type   (rfp, result_type_off    * wordSize);
 215     const Address method        (rfp, method_off         * wordSize);
 216     const Address entry_point   (rfp, entry_point_off    * wordSize);
 217     const Address parameter_size(rfp, parameter_size_off * wordSize);
 218 
 219     const Address thread        (rfp, thread_off         * wordSize);
 220 
 221     const Address d15_save      (rfp, d15_off * wordSize);
 222     const Address d13_save      (rfp, d13_off * wordSize);
 223     const Address d11_save      (rfp, d11_off * wordSize);
 224     const Address d9_save       (rfp, d9_off * wordSize);
 225 
 226     const Address r28_save      (rfp, r28_off * wordSize);
 227     const Address r26_save      (rfp, r26_off * wordSize);
 228     const Address r24_save      (rfp, r24_off * wordSize);
 229     const Address r22_save      (rfp, r22_off * wordSize);
 230     const Address r20_save      (rfp, r20_off * wordSize);
 231 
 232     // stub code
 233 
 234     address aarch64_entry = __ pc();
 235 
 236     // set up frame and move sp to end of save area
 237     __ enter();
 238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 239 
 240     // save register parameters and Java scratch/global registers
 241     // n.b. we save thread even though it gets installed in
 242     // rthread because we want to sanity check rthread later
 243     __ str(c_rarg7,  thread);
 244     __ strw(c_rarg6, parameter_size);
 245     __ stp(c_rarg4, c_rarg5,  entry_point);
 246     __ stp(c_rarg2, c_rarg3,  result_type);
 247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 248 
 249     __ stp(r20, r19,   r20_save);
 250     __ stp(r22, r21,   r22_save);
 251     __ stp(r24, r23,   r24_save);
 252     __ stp(r26, r25,   r26_save);
 253     __ stp(r28, r27,   r28_save);
 254 
 255     __ stpd(v9,  v8,   d9_save);
 256     __ stpd(v11, v10,  d11_save);
 257     __ stpd(v13, v12,  d13_save);
 258     __ stpd(v15, v14,  d15_save);
 259 
 260     // install Java thread in global register now we have saved
 261     // whatever value it held
 262     __ mov(rthread, c_rarg7);
 263     // And method
 264     __ mov(rmethod, c_rarg3);
 265 
 266     // set up the heapbase register
 267     __ reinit_heapbase();
 268 
 269 #ifdef ASSERT
 270     // make sure we have no pending exceptions
 271     {
 272       Label L;
 273       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 274       __ cmp(rscratch1, (u1)NULL_WORD);
 275       __ br(Assembler::EQ, L);
 276       __ stop("StubRoutines::call_stub: entered with pending exception");
 277       __ BIND(L);
 278     }
 279 #endif
 280     // pass parameters if any
 281     __ mov(esp, sp);
 282     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 283     __ andr(sp, rscratch1, -2 * wordSize);
 284 
 285     BLOCK_COMMENT("pass parameters if any");
 286     Label parameters_done;
 287     // parameter count is still in c_rarg6
 288     // and parameter pointer identifying param 1 is in c_rarg5
 289     __ cbzw(c_rarg6, parameters_done);
 290 
 291     address loop = __ pc();
 292     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 293     __ subsw(c_rarg6, c_rarg6, 1);
 294     __ push(rscratch1);
 295     __ br(Assembler::GT, loop);
 296 
 297     __ BIND(parameters_done);
 298 
 299     // call Java entry -- passing Method*, and current sp
 300     //      rmethod: Method*
 301     //      r19_sender_sp: sender sp
 302     BLOCK_COMMENT("call Java function");
 303     __ mov(r19_sender_sp, sp);
 304     __ blr(c_rarg4);
 305 
 306     // we do this here because the notify will already have been done
 307     // if we get to the next instruction via an exception
 308     //
 309     // n.b. adding this instruction here affects the calculation of
 310     // whether or not a routine returns to the call stub (used when
 311     // doing stack walks) since the normal test is to check the return
 312     // pc against the address saved below. so we may need to allow for
 313     // this extra instruction in the check.
 314 
 315     // save current address for use by exception handling code
 316 
 317     return_address = __ pc();
 318 
 319     // store result depending on type (everything that is not
 320     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 321     // n.b. this assumes Java returns an integral result in r0
 322     // and a floating result in j_farg0
 323     __ ldr(j_rarg2, result);
 324     Label is_long, is_float, is_double, exit;
 325     __ ldr(j_rarg1, result_type);
 326     __ cmp(j_rarg1, (u1)T_OBJECT);
 327     __ br(Assembler::EQ, is_long);
 328     __ cmp(j_rarg1, (u1)T_LONG);
 329     __ br(Assembler::EQ, is_long);
 330     __ cmp(j_rarg1, (u1)T_FLOAT);
 331     __ br(Assembler::EQ, is_float);
 332     __ cmp(j_rarg1, (u1)T_DOUBLE);
 333     __ br(Assembler::EQ, is_double);
 334 
 335     // handle T_INT case
 336     __ strw(r0, Address(j_rarg2));
 337 
 338     __ BIND(exit);
 339 
 340     // pop parameters
 341     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 342 
 343 #ifdef ASSERT
 344     // verify that threads correspond
 345     {
 346       Label L, S;
 347       __ ldr(rscratch1, thread);
 348       __ cmp(rthread, rscratch1);
 349       __ br(Assembler::NE, S);
 350       __ get_thread(rscratch1);
 351       __ cmp(rthread, rscratch1);
 352       __ br(Assembler::EQ, L);
 353       __ BIND(S);
 354       __ stop("StubRoutines::call_stub: threads must correspond");
 355       __ BIND(L);
 356     }
 357 #endif
 358 
 359     __ pop_cont_fastpath(rthread);
 360 
 361     // restore callee-save registers
 362     __ ldpd(v15, v14,  d15_save);
 363     __ ldpd(v13, v12,  d13_save);
 364     __ ldpd(v11, v10,  d11_save);
 365     __ ldpd(v9,  v8,   d9_save);
 366 
 367     __ ldp(r28, r27,   r28_save);
 368     __ ldp(r26, r25,   r26_save);
 369     __ ldp(r24, r23,   r24_save);
 370     __ ldp(r22, r21,   r22_save);
 371     __ ldp(r20, r19,   r20_save);
 372 
 373     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 374     __ ldrw(c_rarg2, result_type);
 375     __ ldr(c_rarg3,  method);
 376     __ ldp(c_rarg4, c_rarg5,  entry_point);
 377     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 378 
 379     // leave frame and return to caller
 380     __ leave();
 381     __ ret(lr);
 382 
 383     // handle return types different from T_INT
 384 
 385     __ BIND(is_long);
 386     __ str(r0, Address(j_rarg2, 0));
 387     __ br(Assembler::AL, exit);
 388 
 389     __ BIND(is_float);
 390     __ strs(j_farg0, Address(j_rarg2, 0));
 391     __ br(Assembler::AL, exit);
 392 
 393     __ BIND(is_double);
 394     __ strd(j_farg0, Address(j_rarg2, 0));
 395     __ br(Assembler::AL, exit);
 396 
 397     return start;
 398   }
 399 
 400   // Return point for a Java call if there's an exception thrown in
 401   // Java code.  The exception is caught and transformed into a
 402   // pending exception stored in JavaThread that can be tested from
 403   // within the VM.
 404   //
 405   // Note: Usually the parameters are removed by the callee. In case
 406   // of an exception crossing an activation frame boundary, that is
 407   // not the case if the callee is compiled code => we need to set up
 408   // the sp.
 409   //
 410   // r0: exception oop
 411 
 412   address generate_catch_exception() {
 413     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 414     address start = __ pc();
 415 
 416     // same as in generate_call_stub():
 417     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 418     const Address thread        (rfp, thread_off         * wordSize);
 419 
 420 #ifdef ASSERT
 421     // verify that threads correspond
 422     {
 423       Label L, S;
 424       __ ldr(rscratch1, thread);
 425       __ cmp(rthread, rscratch1);
 426       __ br(Assembler::NE, S);
 427       __ get_thread(rscratch1);
 428       __ cmp(rthread, rscratch1);
 429       __ br(Assembler::EQ, L);
 430       __ bind(S);
 431       __ stop("StubRoutines::catch_exception: threads must correspond");
 432       __ bind(L);
 433     }
 434 #endif
 435 
 436     // set pending exception
 437     __ verify_oop(r0);
 438 
 439     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 440     __ mov(rscratch1, (address)__FILE__);
 441     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 442     __ movw(rscratch1, (int)__LINE__);
 443     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 444 
 445     // complete return to VM
 446     assert(StubRoutines::_call_stub_return_address != NULL,
 447            "_call_stub_return_address must have been generated before");
 448     __ b(StubRoutines::_call_stub_return_address);
 449 
 450     return start;
 451   }
 452 
 453   // Continuation point for runtime calls returning with a pending
 454   // exception.  The pending exception check happened in the runtime
 455   // or native call stub.  The pending exception in Thread is
 456   // converted into a Java-level exception.
 457   //
 458   // Contract with Java-level exception handlers:
 459   // r0: exception
 460   // r3: throwing pc
 461   //
 462   // NOTE: At entry of this stub, exception-pc must be in LR !!
 463 
 464   // NOTE: this is always used as a jump target within generated code
 465   // so it just needs to be generated code with no prolog
 466 
 467   address generate_forward_exception() {
 468     StubCodeMark mark(this, "StubRoutines", "forward exception");
 469     address start = __ pc();
 470 
 471     // Upon entry, LR points to the return address returning into
 472     // Java (interpreted or compiled) code; i.e., the return address
 473     // becomes the throwing pc.
 474     //
 475     // Arguments pushed before the runtime call are still on the stack
 476     // but the exception handler will reset the stack pointer ->
 477     // ignore them.  A potential result in registers can be ignored as
 478     // well.
 479 
 480 #ifdef ASSERT
 481     // make sure this code is only executed if there is a pending exception
 482     {
 483       Label L;
 484       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 485       __ cbnz(rscratch1, L);
 486       __ stop("StubRoutines::forward exception: no pending exception (1)");
 487       __ bind(L);
 488     }
 489 #endif
 490 
 491     // compute exception handler into r19
 492 
 493     // call the VM to find the handler address associated with the
 494     // caller address. pass thread in r0 and caller pc (ret address)
 495     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 496     // the stack.
 497     __ mov(c_rarg1, lr);
 498     // lr will be trashed by the VM call so we move it to R19
 499     // (callee-saved) because we also need to pass it to the handler
 500     // returned by this call.
 501     __ mov(r19, lr);
 502     BLOCK_COMMENT("call exception_handler_for_return_address");
 503     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 504                          SharedRuntime::exception_handler_for_return_address),
 505                     rthread, c_rarg1);
 506     // Reinitialize the ptrue predicate register, in case the external runtime
 507     // call clobbers ptrue reg, as we may return to SVE compiled code.
 508     __ reinitialize_ptrue();
 509 
 510     // we should not really care that lr is no longer the callee
 511     // address. we saved the value the handler needs in r19 so we can
 512     // just copy it to r3. however, the C2 handler will push its own
 513     // frame and then call into the VM, and the VM code asserts that
 514     // the PC for the frame above the handler belongs to a compiled
 515     // Java method. So, we restore lr here to satisfy that assert.
 516     __ mov(lr, r19);
 517     // setup r0 & r3 & clear pending exception
 518     __ mov(r3, r19);
 519     __ mov(r19, r0);
 520     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 521     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 522 
 523 #ifdef ASSERT
 524     // make sure exception is set
 525     {
 526       Label L;
 527       __ cbnz(r0, L);
 528       __ stop("StubRoutines::forward exception: no pending exception (2)");
 529       __ bind(L);
 530     }
 531 #endif
 532 
 533     // continue at exception handler
 534     // r0: exception
 535     // r3: throwing pc
 536     // r19: exception handler
 537     __ verify_oop(r0);
 538     __ br(r19);
 539 
 540     return start;
 541   }
 542 
 543   // Non-destructive plausibility checks for oops
 544   //
 545   // Arguments:
 546   //    r0: oop to verify
 547   //    rscratch1: error message
 548   //
 549   // Stack after saving c_rarg3:
 550   //    [tos + 0]: saved c_rarg3
 551   //    [tos + 1]: saved c_rarg2
 552   //    [tos + 2]: saved lr
 553   //    [tos + 3]: saved rscratch2
 554   //    [tos + 4]: saved r0
 555   //    [tos + 5]: saved rscratch1
 556   address generate_verify_oop() {
 557 
 558     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 559     address start = __ pc();
 560 
 561     Label exit, error;
 562 
 563     // save c_rarg2 and c_rarg3
 564     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 565 
 566     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 567     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 568     __ ldr(c_rarg3, Address(c_rarg2));
 569     __ add(c_rarg3, c_rarg3, 1);
 570     __ str(c_rarg3, Address(c_rarg2));
 571 
 572     // object is in r0
 573     // make sure object is 'reasonable'
 574     __ cbz(r0, exit); // if obj is NULL it is OK
 575 
 576 #if INCLUDE_ZGC
 577     if (UseZGC) {
 578       // Check if mask is good.
 579       // verifies that ZAddressBadMask & r0 == 0
 580       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 581       __ andr(c_rarg2, r0, c_rarg3);
 582       __ cbnz(c_rarg2, error);
 583     }
 584 #endif
 585 
 586     // Check if the oop is in the right area of memory
 587     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 588     __ andr(c_rarg2, r0, c_rarg3);
 589     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 590 
 591     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 592     // instruction here because the flags register is live.
 593     __ eor(c_rarg2, c_rarg2, c_rarg3);
 594     __ cbnz(c_rarg2, error);
 595 
 596     // make sure klass is 'reasonable', i.e. not zero.
 597     __ load_klass(r0, r0);  // get klass
 598     __ cbz(r0, error);      // if klass is NULL it is broken
 599 
 600     // return if everything seems ok
 601     __ bind(exit);
 602 
 603     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 604     __ ret(lr);
 605 
 606     // handle errors
 607     __ bind(error);
 608     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 609 
 610     __ push(RegSet::range(r0, r29), sp);
 611     // debug(char* msg, int64_t pc, int64_t regs[])
 612     __ mov(c_rarg0, rscratch1);      // pass address of error message
 613     __ mov(c_rarg1, lr);             // pass return address
 614     __ mov(c_rarg2, sp);             // pass address of regs on stack
 615 #ifndef PRODUCT
 616     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 617 #endif
 618     BLOCK_COMMENT("call MacroAssembler::debug");
 619     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 620     __ blr(rscratch1);
 621     __ hlt(0);
 622 
 623     return start;
 624   }
 625 
 626   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 627 
 628   // Generate indices for iota vector.
 629   address generate_iota_indices(const char *stub_name) {
 630     __ align(CodeEntryAlignment);
 631     StubCodeMark mark(this, "StubRoutines", stub_name);
 632     address start = __ pc();
 633     __ emit_data64(0x0706050403020100, relocInfo::none);
 634     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 635     return start;
 636   }
 637 
 638   // The inner part of zero_words().  This is the bulk operation,
 639   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 640   // caller is responsible for zeroing the last few words.
 641   //
 642   // Inputs:
 643   // r10: the HeapWord-aligned base address of an array to zero.
 644   // r11: the count in HeapWords, r11 > 0.
 645   //
 646   // Returns r10 and r11, adjusted for the caller to clear.
 647   // r10: the base address of the tail of words left to clear.
 648   // r11: the number of words in the tail.
 649   //      r11 < MacroAssembler::zero_words_block_size.
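       // For example, a caller wanting to zero 67 words points r10 at the
       // buffer, sets r11 to 67 and calls this stub; on return r10 points just
       // past the words already cleared and r11 holds the short tail (fewer
       // than zero_words_block_size words) left for the caller to clear.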
 650 
 651   address generate_zero_blocks() {
 652     Label done;
 653     Label base_aligned;
 654 
 655     Register base = r10, cnt = r11;
 656 
 657     __ align(CodeEntryAlignment);
 658     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 659     address start = __ pc();
 660 
 661     if (UseBlockZeroing) {
 662       int zva_length = VM_Version::zva_length();
 663 
 664       // Ensure the ZVA length is a multiple of 16. This is required by
 665       // the subsequent operations.
 666       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 667 
 668       __ tbz(base, 3, base_aligned);
 669       __ str(zr, Address(__ post(base, 8)));
 670       __ sub(cnt, cnt, 1);
 671       __ bind(base_aligned);
 672 
 673       // Ensure count >= zva_length * 2 so that using DC ZVA is still
 674       // worthwhile after alignment.
 675       Label small;
 676       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 677       __ subs(rscratch1, cnt, low_limit >> 3);
 678       __ br(Assembler::LT, small);
 679       __ zero_dcache_blocks(base, cnt);
 680       __ bind(small);
 681     }
 682 
 683     {
 684       // Number of stp instructions we'll unroll
 685       const int unroll =
 686         MacroAssembler::zero_words_block_size / 2;
 687       // Clear the remaining blocks.
 688       Label loop;
 689       __ subs(cnt, cnt, unroll * 2);
 690       __ br(Assembler::LT, done);
 691       __ bind(loop);
 692       for (int i = 0; i < unroll; i++)
 693         __ stp(zr, zr, __ post(base, 16));
 694       __ subs(cnt, cnt, unroll * 2);
 695       __ br(Assembler::GE, loop);
 696       __ bind(done);
 697       __ add(cnt, cnt, unroll * 2);
 698     }
 699 
 700     __ ret(lr);
 701 
 702     return start;
 703   }
 704 
 705 
 706   typedef enum {
 707     copy_forwards = 1,
 708     copy_backwards = -1
 709   } copy_direction;
 710 
 711   // Bulk copy of blocks of 8 words.
 712   //
 713   // count is a count of words.
 714   //
 715   // Precondition: count >= 8
 716   //
 717   // Postconditions:
 718   //
 719   // The least significant bit of count contains the remaining count
 720   // of words to copy.  The rest of count is trash.
 721   //
 722   // s and d are adjusted to point to the remaining words to copy
 723   //
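       // For example, with count == 19 the bulk loop copies 16 words, the
       // 2-word tail test copies 2 more, and bit 0 of count is left set so the
       // caller knows a single word remains to be copied.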
 724   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 725                            copy_direction direction) {
 726     int unit = wordSize * direction;
 727     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 728 
 729     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 730       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 731     const Register stride = r13;
 732 
 733     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 734     assert_different_registers(s, d, count, rscratch1);
 735 
 736     Label again, drain;
 737     const char *stub_name;
 738     if (direction == copy_forwards)
 739       stub_name = "forward_copy_longs";
 740     else
 741       stub_name = "backward_copy_longs";
 742 
 743     __ align(CodeEntryAlignment);
 744 
 745     StubCodeMark mark(this, "StubRoutines", stub_name);
 746 
 747     __ bind(start);
 748 
 749     Label unaligned_copy_long;
 750     if (AvoidUnalignedAccesses) {
 751       __ tbnz(d, 3, unaligned_copy_long);
 752     }
 753 
 754     if (direction == copy_forwards) {
 755       __ sub(s, s, bias);
 756       __ sub(d, d, bias);
 757     }
 758 
 759 #ifdef ASSERT
 760     // Make sure we are never given < 8 words
 761     {
 762       Label L;
 763       __ cmp(count, (u1)8);
 764       __ br(Assembler::GE, L);
 765       __ stop("generate_copy_longs called with < 8 words");
 766       __ bind(L);
 767     }
 768 #endif
 769 
 770     // Fill 8 registers
 771     if (UseSIMDForMemoryOps) {
 772       __ ldpq(v0, v1, Address(s, 4 * unit));
 773       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 774     } else {
 775       __ ldp(t0, t1, Address(s, 2 * unit));
 776       __ ldp(t2, t3, Address(s, 4 * unit));
 777       __ ldp(t4, t5, Address(s, 6 * unit));
 778       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 779     }
 780 
 781     __ subs(count, count, 16);
 782     __ br(Assembler::LO, drain);
 783 
 784     int prefetch = PrefetchCopyIntervalInBytes;
 785     bool use_stride = false;
 786     if (direction == copy_backwards) {
 787        use_stride = prefetch > 256;
 788        prefetch = -prefetch;
 789        if (use_stride) __ mov(stride, prefetch);
 790     }
 791 
 792     __ bind(again);
 793 
 794     if (PrefetchCopyIntervalInBytes > 0)
 795       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 796 
 797     if (UseSIMDForMemoryOps) {
 798       __ stpq(v0, v1, Address(d, 4 * unit));
 799       __ ldpq(v0, v1, Address(s, 4 * unit));
 800       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 801       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 802     } else {
 803       __ stp(t0, t1, Address(d, 2 * unit));
 804       __ ldp(t0, t1, Address(s, 2 * unit));
 805       __ stp(t2, t3, Address(d, 4 * unit));
 806       __ ldp(t2, t3, Address(s, 4 * unit));
 807       __ stp(t4, t5, Address(d, 6 * unit));
 808       __ ldp(t4, t5, Address(s, 6 * unit));
 809       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 810       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 811     }
 812 
 813     __ subs(count, count, 8);
 814     __ br(Assembler::HS, again);
 815 
 816     // Drain
 817     __ bind(drain);
 818     if (UseSIMDForMemoryOps) {
 819       __ stpq(v0, v1, Address(d, 4 * unit));
 820       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 821     } else {
 822       __ stp(t0, t1, Address(d, 2 * unit));
 823       __ stp(t2, t3, Address(d, 4 * unit));
 824       __ stp(t4, t5, Address(d, 6 * unit));
 825       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 826     }
 827 
 828     {
 829       Label L1, L2;
 830       __ tbz(count, exact_log2(4), L1);
 831       if (UseSIMDForMemoryOps) {
 832         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 833         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 834       } else {
 835         __ ldp(t0, t1, Address(s, 2 * unit));
 836         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 837         __ stp(t0, t1, Address(d, 2 * unit));
 838         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 839       }
 840       __ bind(L1);
 841 
 842       if (direction == copy_forwards) {
 843         __ add(s, s, bias);
 844         __ add(d, d, bias);
 845       }
 846 
 847       __ tbz(count, 1, L2);
 848       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 849       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 850       __ bind(L2);
 851     }
 852 
 853     __ ret(lr);
 854 
 855     if (AvoidUnalignedAccesses) {
 856       Label drain, again;
 857       // Register order for storing. Order is different for backward copy.
 858 
 859       __ bind(unaligned_copy_long);
 860 
 861       // source address is even-word aligned, target is odd-word aligned
 862       //
 863       // when forward copying word pairs we read long pairs at offsets
 864       // {0, 2, 4, 6} (in long words). when backwards copying we read
 865       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 866       // address by -2 in the forwards case so we can compute the
 867       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 868       // or -1.
 869       //
 870       // when forward copying we need to store 1 word, 3 pairs and
 871       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 872       // zero offset we adjust the destination by -1 which means we
 873       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 874       //
 875       // When backwards copying we need to store 1 word, 3 pairs and
 876       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 877       // offsets {1, 3, 5, 7, 8} * unit.
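           //
           // concretely, for a forwards copy the code below first rewinds s by
           // 16 and d by 8, so the first ldp at byte offset 2 * unit (unit ==
           // wordSize here) reads the first two words of the original source
           // block and the first str at byte offset 1 * unit writes the first
           // word of the original destination block.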
 878 
 879       if (direction == copy_forwards) {
 880         __ sub(s, s, 16);
 881         __ sub(d, d, 8);
 882       }
 883 
 884       // Fill 8 registers
 885       //
 886       // for forwards copy s was offset by -16 from the original input
 887       // value of s so the register contents are at these offsets
 888       // relative to the 64 byte block addressed by that original input
 889       // and so on for each successive 64 byte block when s is updated
 890       //
 891       // t0 at offset 0,  t1 at offset 8
 892       // t2 at offset 16, t3 at offset 24
 893       // t4 at offset 32, t5 at offset 40
 894       // t6 at offset 48, t7 at offset 56
 895 
 896       // for backwards copy s was not offset so the register contents
 897       // are at these offsets into the preceding 64 byte block
 898       // relative to that original input and so on for each successive
 899       // preceding 64 byte block when s is updated. this explains the
 900       // slightly counter-intuitive looking pattern of register usage
 901       // in the stp instructions for backwards copy.
 902       //
 903       // t0 at offset -16, t1 at offset -8
 904       // t2 at offset -32, t3 at offset -24
 905       // t4 at offset -48, t5 at offset -40
 906       // t6 at offset -64, t7 at offset -56
 907 
 908       __ ldp(t0, t1, Address(s, 2 * unit));
 909       __ ldp(t2, t3, Address(s, 4 * unit));
 910       __ ldp(t4, t5, Address(s, 6 * unit));
 911       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 912 
 913       __ subs(count, count, 16);
 914       __ br(Assembler::LO, drain);
 915 
 916       int prefetch = PrefetchCopyIntervalInBytes;
 917       bool use_stride = false;
 918       if (direction == copy_backwards) {
 919          use_stride = prefetch > 256;
 920          prefetch = -prefetch;
 921          if (use_stride) __ mov(stride, prefetch);
 922       }
 923 
 924       __ bind(again);
 925 
 926       if (PrefetchCopyIntervalInBytes > 0)
 927         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 928 
 929       if (direction == copy_forwards) {
 930        // allowing for the offset of -8 the store instructions place
 931        // registers into the target 64 byte block at the following
 932        // offsets
 933        //
 934        // t0 at offset 0
 935        // t1 at offset 8,  t2 at offset 16
 936        // t3 at offset 24, t4 at offset 32
 937        // t5 at offset 40, t6 at offset 48
 938        // t7 at offset 56
 939 
 940         __ str(t0, Address(d, 1 * unit));
 941         __ stp(t1, t2, Address(d, 2 * unit));
 942         __ ldp(t0, t1, Address(s, 2 * unit));
 943         __ stp(t3, t4, Address(d, 4 * unit));
 944         __ ldp(t2, t3, Address(s, 4 * unit));
 945         __ stp(t5, t6, Address(d, 6 * unit));
 946         __ ldp(t4, t5, Address(s, 6 * unit));
 947         __ str(t7, Address(__ pre(d, 8 * unit)));
 948         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 949       } else {
 950        // d was not offset when we started so the registers are
 951        // written into the 64 byte block preceding d with the following
 952        // offsets
 953        //
 954        // t1 at offset -8
 955        // t3 at offset -24, t0 at offset -16
 956        // t5 at offset -40, t2 at offset -32
 957        // t7 at offset -56, t4 at offset -48
 958        //                   t6 at offset -64
 959        //
 960        // note that this matches the offsets previously noted for the
 961        // loads
 962 
 963         __ str(t1, Address(d, 1 * unit));
 964         __ stp(t3, t0, Address(d, 3 * unit));
 965         __ ldp(t0, t1, Address(s, 2 * unit));
 966         __ stp(t5, t2, Address(d, 5 * unit));
 967         __ ldp(t2, t3, Address(s, 4 * unit));
 968         __ stp(t7, t4, Address(d, 7 * unit));
 969         __ ldp(t4, t5, Address(s, 6 * unit));
 970         __ str(t6, Address(__ pre(d, 8 * unit)));
 971         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 972       }
 973 
 974       __ subs(count, count, 8);
 975       __ br(Assembler::HS, again);
 976 
 977       // Drain
 978       //
 979       // this uses the same pattern of offsets and register arguments
 980       // as above
 981       __ bind(drain);
 982       if (direction == copy_forwards) {
 983         __ str(t0, Address(d, 1 * unit));
 984         __ stp(t1, t2, Address(d, 2 * unit));
 985         __ stp(t3, t4, Address(d, 4 * unit));
 986         __ stp(t5, t6, Address(d, 6 * unit));
 987         __ str(t7, Address(__ pre(d, 8 * unit)));
 988       } else {
 989         __ str(t1, Address(d, 1 * unit));
 990         __ stp(t3, t0, Address(d, 3 * unit));
 991         __ stp(t5, t2, Address(d, 5 * unit));
 992         __ stp(t7, t4, Address(d, 7 * unit));
 993         __ str(t6, Address(__ pre(d, 8 * unit)));
 994       }
 995       // now we need to copy any remaining part block which may
 996       // include a 4 word subblock and/or a 2 word subblock.
 997       // bits 2 and 1 in the count are the tell-tale for whether we
 998       // have each such subblock
 999       {
1000         Label L1, L2;
1001         __ tbz(count, exact_log2(4), L1);
1002        // this is the same as above but copying only 4 longs hence
1003        // with only one intervening stp between the str instructions
1004        // but note that the offsets and registers still follow the
1005        // same pattern
1006         __ ldp(t0, t1, Address(s, 2 * unit));
1007         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1008         if (direction == copy_forwards) {
1009           __ str(t0, Address(d, 1 * unit));
1010           __ stp(t1, t2, Address(d, 2 * unit));
1011           __ str(t3, Address(__ pre(d, 4 * unit)));
1012         } else {
1013           __ str(t1, Address(d, 1 * unit));
1014           __ stp(t3, t0, Address(d, 3 * unit));
1015           __ str(t2, Address(__ pre(d, 4 * unit)));
1016         }
1017         __ bind(L1);
1018 
1019         __ tbz(count, 1, L2);
1020        // this is the same as above but copying only 2 longs hence
1021        // there is no intervening stp between the str instructions
1022        // but note that the offset and register patterns are still
1023        // the same
1024         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1025         if (direction == copy_forwards) {
1026           __ str(t0, Address(d, 1 * unit));
1027           __ str(t1, Address(__ pre(d, 2 * unit)));
1028         } else {
1029           __ str(t1, Address(d, 1 * unit));
1030           __ str(t0, Address(__ pre(d, 2 * unit)));
1031         }
1032         __ bind(L2);
1033 
1034        // for forwards copy we need to re-adjust the offsets we
1035        // applied so that s and d follow the last words written
1036 
1037        if (direction == copy_forwards) {
1038          __ add(s, s, 16);
1039          __ add(d, d, 8);
1040        }
1041 
1042       }
1043 
1044       __ ret(lr);
1045       }
1046   }
1047 
1048   // Small copy: less than 16 bytes.
1049   //
1050   // NB: Ignores all of the bits of count which represent more than 15
1051   // bytes, so a caller doesn't have to mask them.
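       // For example, with step == 1 a residual count of 0x23 behaves exactly
       // like a count of 3: bit 3 is clear so the 8-byte move is skipped, bit 2
       // is clear so the 4-byte move is skipped, and bits 1 and 0 copy a 2-byte
       // and then a 1-byte chunk; the higher bits are simply ignored.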
1052 
1053   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1054     bool is_backwards = step < 0;
1055     size_t granularity = uabs(step);
1056     int direction = is_backwards ? -1 : 1;
1057     int unit = wordSize * direction;
1058 
1059     Label Lword, Lint, Lshort, Lbyte;
1060 
1061     assert(granularity
1062            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1063 
1064     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1065 
1066     // ??? I don't know if this bit-test-and-branch is the right thing
1067     // to do.  It does a lot of jumping, resulting in several
1068     // mispredicted branches.  It might make more sense to do this
1069     // with something like Duff's device with a single computed branch.
1070 
1071     __ tbz(count, 3 - exact_log2(granularity), Lword);
1072     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1073     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1074     __ bind(Lword);
1075 
1076     if (granularity <= sizeof (jint)) {
1077       __ tbz(count, 2 - exact_log2(granularity), Lint);
1078       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1079       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1080       __ bind(Lint);
1081     }
1082 
1083     if (granularity <= sizeof (jshort)) {
1084       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1085       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1086       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1087       __ bind(Lshort);
1088     }
1089 
1090     if (granularity <= sizeof (jbyte)) {
1091       __ tbz(count, 0, Lbyte);
1092       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1093       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1094       __ bind(Lbyte);
1095     }
1096   }
1097 
1098   Label copy_f, copy_b;
1099 
1100   // All-singing all-dancing memory copy.
1101   //
1102   // Copy count units of memory from s to d.  The size of a unit is
1103   // step, which can be positive or negative depending on the direction
1104   // of copy.  If is_aligned is false, we align the source address.
1105   //
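       // The callers below pass step == size for a forward (disjoint) copy and
       // step == -size for a backward (conjoint) copy, where size is the
       // element size in bytes, so count is always an element count.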
1106 
1107   void copy_memory(bool is_aligned, Register s, Register d,
1108                    Register count, Register tmp, int step) {
1109     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1110     bool is_backwards = step < 0;
1111     unsigned int granularity = uabs(step);
1112     const Register t0 = r3, t1 = r4;
1113 
1114     // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't matter
1115     // because we always load all the data before writing anything.
1116     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1117     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1118     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1119     const Register send = r17, dend = r16;
1120 
1121     if (PrefetchCopyIntervalInBytes > 0)
1122       __ prfm(Address(s, 0), PLDL1KEEP);
1123     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1124     __ br(Assembler::HI, copy_big);
1125 
1126     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1127     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1128 
1129     __ cmp(count, u1(16/granularity));
1130     __ br(Assembler::LS, copy16);
1131 
1132     __ cmp(count, u1(64/granularity));
1133     __ br(Assembler::HI, copy80);
1134 
1135     __ cmp(count, u1(32/granularity));
1136     __ br(Assembler::LS, copy32);
1137 
1138     // 33..64 bytes
1139     if (UseSIMDForMemoryOps) {
1140       __ ldpq(v0, v1, Address(s, 0));
1141       __ ldpq(v2, v3, Address(send, -32));
1142       __ stpq(v0, v1, Address(d, 0));
1143       __ stpq(v2, v3, Address(dend, -32));
1144     } else {
1145       __ ldp(t0, t1, Address(s, 0));
1146       __ ldp(t2, t3, Address(s, 16));
1147       __ ldp(t4, t5, Address(send, -32));
1148       __ ldp(t6, t7, Address(send, -16));
1149 
1150       __ stp(t0, t1, Address(d, 0));
1151       __ stp(t2, t3, Address(d, 16));
1152       __ stp(t4, t5, Address(dend, -32));
1153       __ stp(t6, t7, Address(dend, -16));
1154     }
1155     __ b(finish);
1156 
1157     // 17..32 bytes
1158     __ bind(copy32);
1159     __ ldp(t0, t1, Address(s, 0));
1160     __ ldp(t2, t3, Address(send, -16));
1161     __ stp(t0, t1, Address(d, 0));
1162     __ stp(t2, t3, Address(dend, -16));
1163     __ b(finish);
1164 
1165     // 65..80/96 bytes
1166     // (96 bytes if SIMD because we do 32 bytes per instruction)
1167     __ bind(copy80);
1168     if (UseSIMDForMemoryOps) {
1169       __ ldpq(v0, v1, Address(s, 0));
1170       __ ldpq(v2, v3, Address(s, 32));
1171       // Unaligned pointers can be an issue for copying.
1172       // The issue is more likely when the granularity of the data is
1173       // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1174       // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1175       // The largest performance drop has been seen for the range 65-80 bytes.
1176       // For such cases using a plain ldp/stp pair instead of the third pair of
1177       // ldpq/stpq fixes the performance issue.
1178       if (granularity < sizeof (jint)) {
1179         Label copy96;
1180         __ cmp(count, u1(80/granularity));
1181         __ br(Assembler::HI, copy96);
1182         __ ldp(t0, t1, Address(send, -16));
1183 
1184         __ stpq(v0, v1, Address(d, 0));
1185         __ stpq(v2, v3, Address(d, 32));
1186         __ stp(t0, t1, Address(dend, -16));
1187         __ b(finish);
1188 
1189         __ bind(copy96);
1190       }
1191       __ ldpq(v4, v5, Address(send, -32));
1192 
1193       __ stpq(v0, v1, Address(d, 0));
1194       __ stpq(v2, v3, Address(d, 32));
1195       __ stpq(v4, v5, Address(dend, -32));
1196     } else {
1197       __ ldp(t0, t1, Address(s, 0));
1198       __ ldp(t2, t3, Address(s, 16));
1199       __ ldp(t4, t5, Address(s, 32));
1200       __ ldp(t6, t7, Address(s, 48));
1201       __ ldp(t8, t9, Address(send, -16));
1202 
1203       __ stp(t0, t1, Address(d, 0));
1204       __ stp(t2, t3, Address(d, 16));
1205       __ stp(t4, t5, Address(d, 32));
1206       __ stp(t6, t7, Address(d, 48));
1207       __ stp(t8, t9, Address(dend, -16));
1208     }
1209     __ b(finish);
1210 
1211     // 0..16 bytes
1212     __ bind(copy16);
1213     __ cmp(count, u1(8/granularity));
1214     __ br(Assembler::LO, copy8);
1215 
1216     // 8..16 bytes
1217     __ ldr(t0, Address(s, 0));
1218     __ ldr(t1, Address(send, -8));
1219     __ str(t0, Address(d, 0));
1220     __ str(t1, Address(dend, -8));
1221     __ b(finish);
1222 
1223     if (granularity < 8) {
1224       // 4..7 bytes
1225       __ bind(copy8);
1226       __ tbz(count, 2 - exact_log2(granularity), copy4);
1227       __ ldrw(t0, Address(s, 0));
1228       __ ldrw(t1, Address(send, -4));
1229       __ strw(t0, Address(d, 0));
1230       __ strw(t1, Address(dend, -4));
1231       __ b(finish);
1232       if (granularity < 4) {
1233         // 0..3 bytes
1234         __ bind(copy4);
1235         __ cbz(count, finish); // get rid of 0 case
1236         if (granularity == 2) {
1237           __ ldrh(t0, Address(s, 0));
1238           __ strh(t0, Address(d, 0));
1239         } else { // granularity == 1
1240           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1241           // the first and last byte.
1242           // Handle the 3 byte case by loading and storing base + count/2
1243           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1244           // This does mean that in the 1 byte case we load/store the same
1245           // byte 3 times.
1246           __ lsr(count, count, 1);
1247           __ ldrb(t0, Address(s, 0));
1248           __ ldrb(t1, Address(send, -1));
1249           __ ldrb(t2, Address(s, count));
1250           __ strb(t0, Address(d, 0));
1251           __ strb(t1, Address(dend, -1));
1252           __ strb(t2, Address(d, count));
1253         }
1254         __ b(finish);
1255       }
1256     }
1257 
1258     __ bind(copy_big);
1259     if (is_backwards) {
1260       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1261       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1262     }
1263 
1264     // Now that we've got the small case out of the way we can align the
1265     // source address on a 2-word boundary.
1266 
1267     Label aligned;
1268 
1269     if (is_aligned) {
1270       // We may have to adjust by 1 word to get s 2-word-aligned.
1271       __ tbz(s, exact_log2(wordSize), aligned);
1272       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1273       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1274       __ sub(count, count, wordSize/granularity);
1275     } else {
1276       if (is_backwards) {
1277         __ andr(rscratch2, s, 2 * wordSize - 1);
1278       } else {
1279         __ neg(rscratch2, s);
1280         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1281       }
1282       // rscratch2 is the byte adjustment needed to align s.
1283       __ cbz(rscratch2, aligned);
1284       int shift = exact_log2(granularity);
1285       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1286       __ sub(count, count, rscratch2);
1287 
1288 #if 0
1289       // ?? This code is only correct for a disjoint copy.  It may or
1290       // may not make sense to use it in that case.
1291 
1292       // Copy the first pair; s and d may not be aligned.
1293       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1294       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1295 
1296       // Align s and d, adjust count
1297       if (is_backwards) {
1298         __ sub(s, s, rscratch2);
1299         __ sub(d, d, rscratch2);
1300       } else {
1301         __ add(s, s, rscratch2);
1302         __ add(d, d, rscratch2);
1303       }
1304 #else
1305       copy_memory_small(s, d, rscratch2, rscratch1, step);
1306 #endif
1307     }
1308 
1309     __ bind(aligned);
1310 
1311     // s is now 2-word-aligned.
1312 
1313     // We have a count of units and some trailing bytes.  Adjust the
1314     // count and do a bulk copy of words.
1315     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1316     if (direction == copy_forwards)
1317       __ bl(copy_f);
1318     else
1319       __ bl(copy_b);
1320 
1321     // And the tail.
1322     copy_memory_small(s, d, count, tmp, step);
1323 
1324     if (granularity >= 8) __ bind(copy8);
1325     if (granularity >= 4) __ bind(copy4);
1326     __ bind(finish);
1327   }
1328 
1329 
1330   void clobber_registers() {
1331 #ifdef ASSERT
1332     RegSet clobbered
1333       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1334     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1335     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1336     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1337       __ mov(*it, rscratch1);
1338     }
1339 #endif
1340 
1341   }
1342 
1343   // Scan over array at a for count oops, verifying each one.
1344   // Preserves a and count, clobbers rscratch1 and rscratch2.
1345   void verify_oop_array (int size, Register a, Register count, Register temp) {
1346     Label loop, end;
1347     __ mov(rscratch1, a);
1348     __ mov(rscratch2, zr);
1349     __ bind(loop);
1350     __ cmp(rscratch2, count);
1351     __ br(Assembler::HS, end);
1352     if (size == wordSize) {
1353       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1354       __ verify_oop(temp);
1355     } else {
1356       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1357       __ decode_heap_oop(temp); // calls verify_oop
1358     }
1359     __ add(rscratch2, rscratch2, 1);
1360     __ b(loop);
1361     __ bind(end);
1362   }
1363 
1364   // Arguments:
1365   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1366   //             ignored
1367   //   is_oop  - true => oop array, so generate store check code
1368   //   name    - stub name string
1369   //
1370   // Inputs:
1371   //   c_rarg0   - source array address
1372   //   c_rarg1   - destination array address
1373   //   c_rarg2   - element count, treated as ssize_t, can be zero
1374   //
1375   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1376   // the hardware handle it.  The two dwords within qwords that span
1377   // cache line boundaries will still be loaded and stored atomically.
1378   //
1379   // Side Effects:
1380   //   disjoint_int_copy_entry is set to the no-overlap entry point
1381   //   used by generate_conjoint_int_oop_copy().
1382   //
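       // For instance, generate_disjoint_byte_copy() below simply wraps this
       // with size == sizeof(jbyte) and is_oop == false, while callers copying
       // oop arrays pass is_oop == true so that the GC barrier prologue and
       // epilogue (and the optional VerifyOops scan) are emitted around the raw
       // copy_memory() call.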
1383   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1384                                   const char *name, bool dest_uninitialized = false) {
1385     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1386     RegSet saved_reg = RegSet::of(s, d, count);
1387     __ align(CodeEntryAlignment);
1388     StubCodeMark mark(this, "StubRoutines", name);
1389     address start = __ pc();
1390     __ enter();
1391 
1392     if (entry != NULL) {
1393       *entry = __ pc();
1394       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1395       BLOCK_COMMENT("Entry:");
1396     }
1397 
1398     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1399     if (dest_uninitialized) {
1400       decorators |= IS_DEST_UNINITIALIZED;
1401     }
1402     if (aligned) {
1403       decorators |= ARRAYCOPY_ALIGNED;
1404     }
1405 
1406     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1407     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1408 
1409     if (is_oop) {
1410       // save regs before copy_memory
1411       __ push(RegSet::of(d, count), sp);
1412     }
1413     {
1414       // UnsafeCopyMemory page error: continue after ucm
1415       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1416       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1417       copy_memory(aligned, s, d, count, rscratch1, size);
1418     }
1419 
1420     if (is_oop) {
1421       __ pop(RegSet::of(d, count), sp);
1422       if (VerifyOops)
1423         verify_oop_array(size, d, count, r16);
1424     }
1425 
1426     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1427 
1428     __ leave();
1429     __ mov(r0, zr); // return 0
1430     __ ret(lr);
1431     return start;
1432   }
1433 
1434   // Arguments:
1435   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1436   //             ignored
1437   //   is_oop  - true => oop array, so generate store check code
1438   //   name    - stub name string
1439   //
1440   // Inputs:
1441   //   c_rarg0   - source array address
1442   //   c_rarg1   - destination array address
1443   //   c_rarg2   - element count, treated as ssize_t, can be zero
1444   //
1445   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1446   // the hardware handle it.  The two dwords within qwords that span
1447   // cache line boundaries will still be loaded and stored atomically.
1448   //
1449   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1450                                  address *entry, const char *name,
1451                                  bool dest_uninitialized = false) {
1452     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1453     RegSet saved_regs = RegSet::of(s, d, count);
1454     StubCodeMark mark(this, "StubRoutines", name);
1455     address start = __ pc();
1456     __ enter();
1457 
1458     if (entry != NULL) {
1459       *entry = __ pc();
1460       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1461       BLOCK_COMMENT("Entry:");
1462     }
1463 
1464     // use fwd copy when (d-s) above_equal (count*size)
1465     __ sub(rscratch1, d, s);
1466     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1467     __ br(Assembler::HS, nooverlap_target);
1468 
1469     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1470     if (dest_uninitialized) {
1471       decorators |= IS_DEST_UNINITIALIZED;
1472     }
1473     if (aligned) {
1474       decorators |= ARRAYCOPY_ALIGNED;
1475     }
1476 
1477     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1478     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1479 
1480     if (is_oop) {
1481       // save regs before copy_memory
1482       __ push(RegSet::of(d, count), sp);
1483     }
1484     {
1485       // UnsafeCopyMemory page error: continue after ucm
1486       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1487       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1488       copy_memory(aligned, s, d, count, rscratch1, -size);
1489     }
1490     if (is_oop) {
1491       __ pop(RegSet::of(d, count), sp);
1492       if (VerifyOops)
1493         verify_oop_array(size, d, count, r16);
1494     }
1495     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1496     __ leave();
1497     __ mov(r0, zr); // return 0
1498     __ ret(lr);
1499     return start;
1500   }
1501 
1502   // Arguments:
1503   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1504   //             ignored
1505   //   name    - stub name string
1506   //
1507   // Inputs:
1508   //   c_rarg0   - source array address
1509   //   c_rarg1   - destination array address
1510   //   c_rarg2   - element count, treated as ssize_t, can be zero
1511   //
1512   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1513   // we let the hardware handle it.  The one to eight bytes within words,
1514   // dwords or qwords that span cache line boundaries will still be loaded
1515   // and stored atomically.
1516   //
1524   // Side Effects:
1525   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1526   //   used by generate_conjoint_byte_copy().
1527   //
1528   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1529     const bool not_oop = false;
1530     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1531   }
1532 
1533   // Arguments:
1534   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1535   //             ignored
1536   //   name    - stub name string
1537   //
1538   // Inputs:
1539   //   c_rarg0   - source array address
1540   //   c_rarg1   - destination array address
1541   //   c_rarg2   - element count, treated as ssize_t, can be zero
1542   //
1543   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1544   // we let the hardware handle it.  The one to eight bytes within words,
1545   // dwords or qwords that span cache line boundaries will still be loaded
1546   // and stored atomically.
1547   //
1548   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1549                                       address* entry, const char *name) {
1550     const bool not_oop = false;
1551     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1552   }
1553 
1554   // Arguments:
1555   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1556   //             ignored
1557   //   name    - stub name string
1558   //
1559   // Inputs:
1560   //   c_rarg0   - source array address
1561   //   c_rarg1   - destination array address
1562   //   c_rarg2   - element count, treated as ssize_t, can be zero
1563   //
1564   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1565   // let the hardware handle it.  The two or four words within dwords
1566   // or qwords that span cache line boundaries will still be loaded
1567   // and stored atomically.
1568   //
1569   // Side Effects:
1570   //   disjoint_short_copy_entry is set to the no-overlap entry point
1571   //   used by generate_conjoint_short_copy().
1572   //
1573   address generate_disjoint_short_copy(bool aligned,
1574                                        address* entry, const char *name) {
1575     const bool not_oop = false;
1576     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1577   }
1578 
1579   // Arguments:
1580   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1581   //             ignored
1582   //   name    - stub name string
1583   //
1584   // Inputs:
1585   //   c_rarg0   - source array address
1586   //   c_rarg1   - destination array address
1587   //   c_rarg2   - element count, treated as ssize_t, can be zero
1588   //
1589   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1590   // let the hardware handle it.  The two or four words within dwords
1591   // or qwords that span cache line boundaries will still be loaded
1592   // and stored atomically.
1593   //
1594   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1595                                        address *entry, const char *name) {
1596     const bool not_oop = false;
1597     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1598   }
1599 
1600   // Arguments:
1601   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1602   //             ignored
1603   //   name    - stub name string
1604   //
1605   // Inputs:
1606   //   c_rarg0   - source array address
1607   //   c_rarg1   - destination array address
1608   //   c_rarg2   - element count, treated as ssize_t, can be zero
1609   //
1610   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1611   // the hardware handle it.  The two dwords within qwords that span
1612   // cache line boundaries will still be loaded and stored atomically.
1613   //
1614   // Side Effects:
1615   //   disjoint_int_copy_entry is set to the no-overlap entry point
1616   //   used by generate_conjoint_int_oop_copy().
1617   //
1618   address generate_disjoint_int_copy(bool aligned, address *entry,
1619                                          const char *name, bool dest_uninitialized = false) {
1620     const bool not_oop = false;
1621     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1622   }
1623 
1624   // Arguments:
1625   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1626   //             ignored
1627   //   name    - stub name string
1628   //
1629   // Inputs:
1630   //   c_rarg0   - source array address
1631   //   c_rarg1   - destination array address
1632   //   c_rarg2   - element count, treated as ssize_t, can be zero
1633   //
1634   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1635   // the hardware handle it.  The two dwords within qwords that span
1636   // cache line boundaries will still be loaded and stored atomically.
1637   //
1638   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1639                                      address *entry, const char *name,
1640                                      bool dest_uninitialized = false) {
1641     const bool not_oop = false;
1642     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1643   }
1644 
1645 
1646   // Arguments:
1647   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1648   //             ignored
1649   //   name    - stub name string
1650   //
1651   // Inputs:
1652   //   c_rarg0   - source array address
1653   //   c_rarg1   - destination array address
1654   //   c_rarg2   - element count, treated as size_t, can be zero
1655   //
1656   // Side Effects:
1657   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1658   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1659   //
1660   address generate_disjoint_long_copy(bool aligned, address *entry,
1661                                           const char *name, bool dest_uninitialized = false) {
1662     const bool not_oop = false;
1663     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1664   }
1665 
1666   // Arguments:
1667   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1668   //             ignored
1669   //   name    - stub name string
1670   //
1671   // Inputs:
1672   //   c_rarg0   - source array address
1673   //   c_rarg1   - destination array address
1674   //   c_rarg2   - element count, treated as size_t, can be zero
1675   //
1676   address generate_conjoint_long_copy(bool aligned,
1677                                       address nooverlap_target, address *entry,
1678                                       const char *name, bool dest_uninitialized = false) {
1679     const bool not_oop = false;
1680     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1681   }
1682 
1683   // Arguments:
1684   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1685   //             ignored
1686   //   name    - stub name string
1687   //
1688   // Inputs:
1689   //   c_rarg0   - source array address
1690   //   c_rarg1   - destination array address
1691   //   c_rarg2   - element count, treated as size_t, can be zero
1692   //
1693   // Side Effects:
1694   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1695   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1696   //
1697   address generate_disjoint_oop_copy(bool aligned, address *entry,
1698                                      const char *name, bool dest_uninitialized) {
1699     const bool is_oop = true;
1700     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1701     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1702   }
1703 
1704   // Arguments:
1705   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1706   //             ignored
1707   //   name    - stub name string
1708   //
1709   // Inputs:
1710   //   c_rarg0   - source array address
1711   //   c_rarg1   - destination array address
1712   //   c_rarg2   - element count, treated as size_t, can be zero
1713   //
1714   address generate_conjoint_oop_copy(bool aligned,
1715                                      address nooverlap_target, address *entry,
1716                                      const char *name, bool dest_uninitialized) {
1717     const bool is_oop = true;
1718     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1719     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1720                                   name, dest_uninitialized);
1721   }
1722 
1723 
1724   // Helper for generating a dynamic type check.
1725   // Smashes rscratch1, rscratch2.
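  // Roughly (a sketch): if sub_klass is a subtype of super_klass, branch to
  // L_success; otherwise fall through to the local L_miss label below.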
1726   void generate_type_check(Register sub_klass,
1727                            Register super_check_offset,
1728                            Register super_klass,
1729                            Label& L_success) {
1730     assert_different_registers(sub_klass, super_check_offset, super_klass);
1731 
1732     BLOCK_COMMENT("type_check:");
1733 
1734     Label L_miss;
1735 
1736     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1737                                      super_check_offset);
1738     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1739 
1740     // Fall through on failure!
1741     __ BIND(L_miss);
1742   }
1743 
1744   //
1745   //  Generate checkcasting array copy stub
1746   //
1747   //  Input:
1748   //    c_rarg0   - source array address
1749   //    c_rarg1   - destination array address
1750   //    c_rarg2   - element count, treated as ssize_t, can be zero
1751   //    c_rarg3   - size_t ckoff (super_check_offset)
1752   //    c_rarg4   - oop ckval (super_klass)
1753   //
1754   //  Output:
1755   //    r0 ==  0  -  success
1756   //    r0 == -1^K - failure, where K is partial transfer count
1757   //
1758   address generate_checkcast_copy(const char *name, address *entry,
1759                                   bool dest_uninitialized = false) {
1760 
1761     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1762 
1763     // Input registers (after setup_arg_regs)
1764     const Register from        = c_rarg0;   // source array address
1765     const Register to          = c_rarg1;   // destination array address
1766     const Register count       = c_rarg2;   // elements count
1767     const Register ckoff       = c_rarg3;   // super_check_offset
1768     const Register ckval       = c_rarg4;   // super_klass
1769 
1770     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1771     RegSet wb_post_saved_regs = RegSet::of(count);
1772 
1773     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1774     const Register copied_oop  = r22;       // actual oop copied
1775     const Register count_save  = r21;       // orig elements count
1776     const Register start_to    = r20;       // destination array start address
1777     const Register r19_klass   = r19;       // oop._klass
1778 
1779     //---------------------------------------------------------------
1780     // Assembler stub will be used for this call to arraycopy
1781     // if the two arrays are subtypes of Object[] but the
1782     // destination array type is not equal to or a supertype
1783     // of the source type.  Each element must be separately
1784     // checked.
1785 
1786     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1787                                copied_oop, r19_klass, count_save);
1788 
1789     __ align(CodeEntryAlignment);
1790     StubCodeMark mark(this, "StubRoutines", name);
1791     address start = __ pc();
1792 
1793     __ enter(); // required for proper stackwalking of RuntimeStub frame
1794 
1795 #ifdef ASSERT
1796     // caller guarantees that the arrays really are different
1797     // otherwise, we would have to make conjoint checks
1798     { Label L;
1799       array_overlap_test(L, TIMES_OOP);
1800       __ stop("checkcast_copy within a single array");
1801       __ bind(L);
1802     }
1803 #endif //ASSERT
1804 
1805     // Caller of this entry point must set up the argument registers.
1806     if (entry != NULL) {
1807       *entry = __ pc();
1808       BLOCK_COMMENT("Entry:");
1809     }
1810 
1811     // Empty array:  Nothing to do.
1812     __ cbz(count, L_done);
1813     __ push(RegSet::of(r19, r20, r21, r22), sp);
1814 
1815 #ifdef ASSERT
1816     BLOCK_COMMENT("assert consistent ckoff/ckval");
1817     // The ckoff and ckval must be mutually consistent,
1818     // even though caller generates both.
1819     { Label L;
1820       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1821       __ ldrw(start_to, Address(ckval, sco_offset));
1822       __ cmpw(ckoff, start_to);
1823       __ br(Assembler::EQ, L);
1824       __ stop("super_check_offset inconsistent");
1825       __ bind(L);
1826     }
1827 #endif //ASSERT
1828 
1829     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1830     bool is_oop = true;
1831     if (dest_uninitialized) {
1832       decorators |= IS_DEST_UNINITIALIZED;
1833     }
1834 
1835     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1836     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1837 
1838     // save the original count
1839     __ mov(count_save, count);
1840 
1841     // Copy from low to high addresses
1842     __ mov(start_to, to);              // Save destination array start address
1843     __ b(L_load_element);
1844 
1845     // ======== begin loop ========
1846     // (Loop is rotated; its entry is L_load_element.)
1847     // Loop control:
1848     //   for (; count != 0; count--) {
1849     //     copied_oop = load_heap_oop(from++);
1850     //     ... generate_type_check ...;
1851     //     store_heap_oop(to++, copied_oop);
1852     //   }
1853     __ align(OptoLoopAlignment);
1854 
1855     __ BIND(L_store_element);
1856     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
1857     __ sub(count, count, 1);
1858     __ cbz(count, L_do_card_marks);
1859 
1860     // ======== loop entry is here ========
1861     __ BIND(L_load_element);
1862     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1863     __ cbz(copied_oop, L_store_element);
1864 
1865     __ load_klass(r19_klass, copied_oop); // query the object klass
1866     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1867     // ======== end loop ========
1868 
1869     // It was a real error; we must depend on the caller to finish the job.
1870     // Register count = remaining oops, count_orig = total oops.
1871     // Emit GC store barriers for the oops we have copied and report
1872     // their number to the caller.
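    // Worked example of the encoding (a note): if 2 oops were stored before
    // the failing element, r0 = -1 ^ 2 = -3; the caller recovers the partial
    // transfer count K as ~r0.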
1873 
1874     __ subs(count, count_save, count);     // K = partially copied oop count
1875     __ eon(count, count, zr);                   // report (-1^K) to caller
1876     __ br(Assembler::EQ, L_done_pop);
1877 
1878     __ BIND(L_do_card_marks);
1879     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1880 
1881     __ bind(L_done_pop);
1882     __ pop(RegSet::of(r19, r20, r21, r22), sp);
1883     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1884 
1885     __ bind(L_done);
1886     __ mov(r0, count);
1887     __ leave();
1888     __ ret(lr);
1889 
1890     return start;
1891   }
1892 
1893   // Perform range checks on the proposed arraycopy.
1894   // Kills temp, but nothing else.
1895   // Also, clean the sign bits of src_pos and dst_pos.
1896   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1897                               Register src_pos, // source position (c_rarg1)
1898                               Register dst,     // destination array oop (c_rarg2)
1899                               Register dst_pos, // destination position (c_rarg3)
1900                               Register length,
1901                               Register temp,
1902                               Label& L_failed) {
1903     BLOCK_COMMENT("arraycopy_range_checks:");
1904 
1905     assert_different_registers(rscratch1, temp);
1906 
1907     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1908     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1909     __ addw(temp, length, src_pos);
1910     __ cmpw(temp, rscratch1);
1911     __ br(Assembler::HI, L_failed);
1912 
1913     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1914     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1915     __ addw(temp, length, dst_pos);
1916     __ cmpw(temp, rscratch1);
1917     __ br(Assembler::HI, L_failed);
1918 
1919     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1920     __ movw(src_pos, src_pos);
1921     __ movw(dst_pos, dst_pos);
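    // (a note: writing a W register zeroes bits 63:32 of the X register, so
    //  each movw above is effectively pos &= 0xffffffff)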
1922 
1923     BLOCK_COMMENT("arraycopy_range_checks done");
1924   }
1925 
1926   // These stubs get called from some dumb test routine.
1927   // I'll write them properly when they're called from
1928   // something that's actually doing something.
1929   static void fake_arraycopy_stub(address src, address dst, int count) {
1930     assert(count == 0, "huh?");
1931   }
1932 
1933 
1934   //
1935   //  Generate 'unsafe' array copy stub
1936   //  Though just as safe as the other stubs, it takes an unscaled
1937   //  size_t argument instead of an element count.
1938   //
1939   //  Input:
1940   //    c_rarg0   - source array address
1941   //    c_rarg1   - destination array address
1942   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1943   //
1944   // Examines the alignment of the operands and dispatches
1945   // to a long, int, short, or byte copy loop.
1946   //
1947   address generate_unsafe_copy(const char *name,
1948                                address byte_copy_entry,
1949                                address short_copy_entry,
1950                                address int_copy_entry,
1951                                address long_copy_entry) {
1952     Label L_long_aligned, L_int_aligned, L_short_aligned;
1953     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1954 
1955     __ align(CodeEntryAlignment);
1956     StubCodeMark mark(this, "StubRoutines", name);
1957     address start = __ pc();
1958     __ enter(); // required for proper stackwalking of RuntimeStub frame
1959 
1960     // bump this on entry, not on exit:
1961     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1962 
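    // Dispatch on the common alignment of s, d and count; in pseudocode
    // (a sketch, not emitted code):
    //   a = s | d | count;
    //   if ((a & 7) == 0)      use the long copy;
    //   else if ((a & 3) == 0) use the int copy;
    //   else if ((a & 1) == 0) use the short copy;
    //   else                   use the byte copy;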
1963     __ orr(rscratch1, s, d);
1964     __ orr(rscratch1, rscratch1, count);
1965 
1966     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1967     __ cbz(rscratch1, L_long_aligned);
1968     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1969     __ cbz(rscratch1, L_int_aligned);
1970     __ tbz(rscratch1, 0, L_short_aligned);
1971     __ b(RuntimeAddress(byte_copy_entry));
1972 
1973     __ BIND(L_short_aligned);
1974     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1975     __ b(RuntimeAddress(short_copy_entry));
1976     __ BIND(L_int_aligned);
1977     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1978     __ b(RuntimeAddress(int_copy_entry));
1979     __ BIND(L_long_aligned);
1980     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1981     __ b(RuntimeAddress(long_copy_entry));
1982 
1983     return start;
1984   }
1985 
1986   //
1987   //  Generate generic array copy stubs
1988   //
1989   //  Input:
1990   //    c_rarg0    -  src oop
1991   //    c_rarg1    -  src_pos (32-bits)
1992   //    c_rarg2    -  dst oop
1993   //    c_rarg3    -  dst_pos (32-bits)
1994   //    c_rarg4    -  element count (32-bits)
1995   //
1996   //  Output:
1997   //    r0 ==  0  -  success
1998   //    r0 == -1^K - failure, where K is partial transfer count
1999   //
2000   address generate_generic_copy(const char *name,
2001                                 address byte_copy_entry, address short_copy_entry,
2002                                 address int_copy_entry, address oop_copy_entry,
2003                                 address long_copy_entry, address checkcast_copy_entry) {
2004 
2005     Label L_failed, L_objArray;
2006     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2007 
2008     // Input registers
2009     const Register src        = c_rarg0;  // source array oop
2010     const Register src_pos    = c_rarg1;  // source position
2011     const Register dst        = c_rarg2;  // destination array oop
2012     const Register dst_pos    = c_rarg3;  // destination position
2013     const Register length     = c_rarg4;
2014 
2015 
2016     // Registers used as temps
2017     const Register dst_klass  = c_rarg5;
2018 
2019     __ align(CodeEntryAlignment);
2020 
2021     StubCodeMark mark(this, "StubRoutines", name);
2022 
2023     address start = __ pc();
2024 
2025     __ enter(); // required for proper stackwalking of RuntimeStub frame
2026 
2027     // bump this on entry, not on exit:
2028     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2029 
2030     //-----------------------------------------------------------------------
2031     // Assembler stub will be used for this call to arraycopy
2032     // if the following conditions are met:
2033     //
2034     // (1) src and dst must not be null.
2035     // (2) src_pos must not be negative.
2036     // (3) dst_pos must not be negative.
2037     // (4) length  must not be negative.
2038     // (5) src klass and dst klass should be the same and not NULL.
2039     // (6) src and dst should be arrays.
2040     // (7) src_pos + length must not exceed length of src.
2041     // (8) dst_pos + length must not exceed length of dst.
2042     //
2043 
2044     //  if (src == NULL) return -1;
2045     __ cbz(src, L_failed);
2046 
2047     //  if (src_pos < 0) return -1;
2048     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2049 
2050     //  if (dst == NULL) return -1;
2051     __ cbz(dst, L_failed);
2052 
2053     //  if (dst_pos < 0) return -1;
2054     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2055 
2056     // registers used as temp
2057     const Register scratch_length    = r16; // elements count to copy
2058     const Register scratch_src_klass = r17; // array klass
2059     const Register lh                = r15; // layout helper
2060 
2061     //  if (length < 0) return -1;
2062     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2063     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2064 
2065     __ load_klass(scratch_src_klass, src);
2066 #ifdef ASSERT
2067     //  assert(src->klass() != NULL);
2068     {
2069       BLOCK_COMMENT("assert klasses not null {");
2070       Label L1, L2;
2071       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2072       __ bind(L1);
2073       __ stop("broken null klass");
2074       __ bind(L2);
2075       __ load_klass(rscratch1, dst);
2076       __ cbz(rscratch1, L1);     // this would be broken also
2077       BLOCK_COMMENT("} assert klasses not null done");
2078     }
2079 #endif
2080 
2081     // Load layout helper (32-bits)
2082     //
2083     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2084     // 32        30    24            16              8     2                 0
2085     //
2086     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2087     //
2088 
2089     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2090 
2091     // Handle objArrays completely differently...
2092     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2093     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2094     __ movw(rscratch1, objArray_lh);
2095     __ eorw(rscratch2, lh, rscratch1);
2096     __ cbzw(rscratch2, L_objArray);
2097 
2098     //  if (src->klass() != dst->klass()) return -1;
2099     __ load_klass(rscratch2, dst);
2100     __ eor(rscratch2, rscratch2, scratch_src_klass);
2101     __ cbnz(rscratch2, L_failed);
2102 
2103     //  if (!src->is_Array()) return -1;
2104     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2105 
2106     // At this point, it is known to be a typeArray (array_tag 0x3).
2107 #ifdef ASSERT
2108     {
2109       BLOCK_COMMENT("assert primitive array {");
2110       Label L;
2111       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2112       __ cmpw(lh, rscratch2);
2113       __ br(Assembler::GE, L);
2114       __ stop("must be a primitive array");
2115       __ bind(L);
2116       BLOCK_COMMENT("} assert primitive array done");
2117     }
2118 #endif
2119 
2120     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2121                            rscratch2, L_failed);
2122 
2123     // TypeArrayKlass
2124     //
2125     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2126     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2127     //
2128 
2129     const Register rscratch1_offset = rscratch1;    // array offset
2130     const Register r15_elsize = lh; // element size
2131 
2132     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2133            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2134     __ add(src, src, rscratch1_offset);           // src array offset
2135     __ add(dst, dst, rscratch1_offset);           // dst array offset
2136     BLOCK_COMMENT("choose copy loop based on element size");
2137 
2138     // next registers should be set before the jump to corresponding stub
2139     const Register from     = c_rarg0;  // source array address
2140     const Register to       = c_rarg1;  // destination array address
2141     const Register count    = c_rarg2;  // elements count
2142 
2143     // 'from', 'to' and 'count' must be set in this order, because they
2144     // occupy the same registers as 'src', 'src_pos' and 'dst'.
2145 
2146     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2147 
2148     // The possible values of elsize are 0-3, i.e. exact_log2(element
2149     // size in bytes).  We do a simple bitwise binary search.
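    // Decision table for the two low bits of elsize (a sketch):
    //   bit1 bit0   element size   target
    //    0    0     1 byte         byte_copy_entry
    //    0    1     2 bytes        short_copy_entry
    //    1    0     4 bytes        int_copy_entry
    //    1    1     8 bytes        long_copy_entry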
2150   __ BIND(L_copy_bytes);
2151     __ tbnz(r15_elsize, 1, L_copy_ints);
2152     __ tbnz(r15_elsize, 0, L_copy_shorts);
2153     __ lea(from, Address(src, src_pos));// src_addr
2154     __ lea(to,   Address(dst, dst_pos));// dst_addr
2155     __ movw(count, scratch_length); // length
2156     __ b(RuntimeAddress(byte_copy_entry));
2157 
2158   __ BIND(L_copy_shorts);
2159     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2160     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2161     __ movw(count, scratch_length); // length
2162     __ b(RuntimeAddress(short_copy_entry));
2163 
2164   __ BIND(L_copy_ints);
2165     __ tbnz(r15_elsize, 0, L_copy_longs);
2166     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2167     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2168     __ movw(count, scratch_length); // length
2169     __ b(RuntimeAddress(int_copy_entry));
2170 
2171   __ BIND(L_copy_longs);
2172 #ifdef ASSERT
2173     {
2174       BLOCK_COMMENT("assert long copy {");
2175       Label L;
2176       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2177       __ cmpw(r15_elsize, LogBytesPerLong);
2178       __ br(Assembler::EQ, L);
2179       __ stop("must be long copy, but elsize is wrong");
2180       __ bind(L);
2181       BLOCK_COMMENT("} assert long copy done");
2182     }
2183 #endif
2184     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2185     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2186     __ movw(count, scratch_length); // length
2187     __ b(RuntimeAddress(long_copy_entry));
2188 
2189     // ObjArrayKlass
2190   __ BIND(L_objArray);
2191     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2192 
2193     Label L_plain_copy, L_checkcast_copy;
2194     //  test array classes for subtyping
2195     __ load_klass(r15, dst);
2196     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2197     __ br(Assembler::NE, L_checkcast_copy);
2198 
2199     // Identically typed arrays can be copied without element-wise checks.
2200     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2201                            rscratch2, L_failed);
2202 
2203     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2204     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2205     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2206     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2207     __ movw(count, scratch_length); // length
2208   __ BIND(L_plain_copy);
2209     __ b(RuntimeAddress(oop_copy_entry));
2210 
2211   __ BIND(L_checkcast_copy);
2212     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2213     {
2214       // Before looking at dst.length, make sure dst is also an objArray.
2215       __ ldrw(rscratch1, Address(r15, lh_offset));
2216       __ movw(rscratch2, objArray_lh);
2217       __ eorw(rscratch1, rscratch1, rscratch2);
2218       __ cbnzw(rscratch1, L_failed);
2219 
2220       // It is safe to examine both src.length and dst.length.
2221       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2222                              r15, L_failed);
2223 
2224       __ load_klass(dst_klass, dst); // reload
2225 
2226       // Marshal the base address arguments now, freeing registers.
2227       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2228       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2229       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2230       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2231       __ movw(count, length);           // length (reloaded)
2232       Register sco_temp = c_rarg3;      // this register is free now
2233       assert_different_registers(from, to, count, sco_temp,
2234                                  dst_klass, scratch_src_klass);
2235       // assert_clean_int(count, sco_temp);
2236 
2237       // Generate the type check.
2238       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2239       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2240 
2241       // Smashes rscratch1, rscratch2
2242       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2243 
2244       // Fetch destination element klass from the ObjArrayKlass header.
2245       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2246       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2247       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2248 
2249       // the checkcast_copy loop needs two extra arguments:
2250       assert(c_rarg3 == sco_temp, "#3 already in place");
2251       // Set up arguments for checkcast_copy_entry.
2252       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2253       __ b(RuntimeAddress(checkcast_copy_entry));
2254     }
2255 
2256   __ BIND(L_failed);
2257     __ mov(r0, -1);
2258     __ leave();   // required for proper stackwalking of RuntimeStub frame
2259     __ ret(lr);
2260 
2261     return start;
2262   }
2263 
2264   //
2265   // Generate stub for array fill. If "aligned" is true, the
2266   // "to" address is assumed to be heapword aligned.
2267   //
2268   // Arguments for generated stub:
2269   //   to:    c_rarg0
2270   //   value: c_rarg1
2271   //   count: c_rarg2 treated as signed
2272   //
2273   address generate_fill(BasicType t, bool aligned, const char *name) {
2274     __ align(CodeEntryAlignment);
2275     StubCodeMark mark(this, "StubRoutines", name);
2276     address start = __ pc();
2277 
2278     BLOCK_COMMENT("Entry:");
2279 
2280     const Register to        = c_rarg0;  // source array address
2281     const Register value     = c_rarg1;  // value
2282     const Register count     = c_rarg2;  // elements count
2283 
2284     const Register bz_base = r10;        // base for block_zero routine
2285     const Register cnt_words = r11;      // temp register
2286 
2287     __ enter();
2288 
2289     Label L_fill_elements;
2290 
2291     int shift = -1;
2292     switch (t) {
2293       case T_BYTE:
2294         shift = 0;
2295         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2296         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2297         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2298         __ br(Assembler::LO, L_fill_elements);
2299         break;
2300       case T_SHORT:
2301         shift = 1;
2302         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2303         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2304         __ br(Assembler::LO, L_fill_elements);
2305         break;
2306       case T_INT:
2307         shift = 2;
2308         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2309         __ br(Assembler::LO, L_fill_elements);
2310         break;
2311       default: ShouldNotReachHere();
2312     }
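    // At this point 'value' holds the fill pattern replicated to 32 bits; for
    // example (a sketch) a byte fill of 0xAB is now 0xABABABAB.  It is widened
    // to 64 bits below by bfi(value, value, 32, 32) before the word fill.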
2313 
2314     // Align source address at 8 bytes address boundary.
2315     Label L_skip_align1, L_skip_align2, L_skip_align4;
2316     if (!aligned) {
2317       switch (t) {
2318         case T_BYTE:
2319           // One byte misalignment happens only for byte arrays.
2320           __ tbz(to, 0, L_skip_align1);
2321           __ strb(value, Address(__ post(to, 1)));
2322           __ subw(count, count, 1);
2323           __ bind(L_skip_align1);
2324           // Fallthrough
2325         case T_SHORT:
2326           // Two bytes misalignment happens only for byte and short (char) arrays.
2327           __ tbz(to, 1, L_skip_align2);
2328           __ strh(value, Address(__ post(to, 2)));
2329           __ subw(count, count, 2 >> shift);
2330           __ bind(L_skip_align2);
2331           // Fallthrough
2332         case T_INT:
2333           // Align to 8 bytes, we know we are 4 byte aligned to start.
2334           __ tbz(to, 2, L_skip_align4);
2335           __ strw(value, Address(__ post(to, 4)));
2336           __ subw(count, count, 4 >> shift);
2337           __ bind(L_skip_align4);
2338           break;
2339         default: ShouldNotReachHere();
2340       }
2341     }
2342 
2343     //
2344     //  Fill large chunks
2345     //
2346     __ lsrw(cnt_words, count, 3 - shift); // number of words
2347     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2348     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2349     if (UseBlockZeroing) {
2350       Label non_block_zeroing, rest;
2351       // If the fill value is zero we can use the fast zero_words().
2352       __ cbnz(value, non_block_zeroing);
2353       __ mov(bz_base, to);
2354       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2355       __ zero_words(bz_base, cnt_words);
2356       __ b(rest);
2357       __ bind(non_block_zeroing);
2358       __ fill_words(to, cnt_words, value);
2359       __ bind(rest);
2360     } else {
2361       __ fill_words(to, cnt_words, value);
2362     }
2363 
2364     // Remaining count is less than 8 bytes. Fill it by a single store.
2365     // Note that the total length is no less than 8 bytes.
2366     if (t == T_BYTE || t == T_SHORT) {
2367       Label L_exit1;
2368       __ cbzw(count, L_exit1);
2369       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2370       __ str(value, Address(to, -8));    // overwrite some elements
2371       __ bind(L_exit1);
2372       __ leave();
2373       __ ret(lr);
2374     }
2375 
2376     // Handle copies less than 8 bytes.
2377     Label L_fill_2, L_fill_4, L_exit2;
2378     __ bind(L_fill_elements);
2379     switch (t) {
2380       case T_BYTE:
2381         __ tbz(count, 0, L_fill_2);
2382         __ strb(value, Address(__ post(to, 1)));
2383         __ bind(L_fill_2);
2384         __ tbz(count, 1, L_fill_4);
2385         __ strh(value, Address(__ post(to, 2)));
2386         __ bind(L_fill_4);
2387         __ tbz(count, 2, L_exit2);
2388         __ strw(value, Address(to));
2389         break;
2390       case T_SHORT:
2391         __ tbz(count, 0, L_fill_4);
2392         __ strh(value, Address(__ post(to, 2)));
2393         __ bind(L_fill_4);
2394         __ tbz(count, 1, L_exit2);
2395         __ strw(value, Address(to));
2396         break;
2397       case T_INT:
2398         __ cbzw(count, L_exit2);
2399         __ strw(value, Address(to));
2400         break;
2401       default: ShouldNotReachHere();
2402     }
2403     __ bind(L_exit2);
2404     __ leave();
2405     __ ret(lr);
2406     return start;
2407   }
2408 
2409   address generate_data_cache_writeback() {
2410     const Register line        = c_rarg0;  // address of line to write back
2411 
2412     __ align(CodeEntryAlignment);
2413 
2414     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2415 
2416     address start = __ pc();
2417     __ enter();
2418     __ cache_wb(Address(line, 0));
2419     __ leave();
2420     __ ret(lr);
2421 
2422     return start;
2423   }
2424 
2425   address generate_data_cache_writeback_sync() {
2426     const Register is_pre     = c_rarg0;  // pre or post sync
2427 
2428     __ align(CodeEntryAlignment);
2429 
2430     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2431 
2432     // pre wbsync is a no-op
2433     // post wbsync translates to a memory barrier on AArch64 (not an x86 sfence)
2434 
2435     Label skip;
2436     address start = __ pc();
2437     __ enter();
2438     __ cbnz(is_pre, skip);
2439     __ cache_wbsync(false);
2440     __ bind(skip);
2441     __ leave();
2442     __ ret(lr);
2443 
2444     return start;
2445   }
2446 
2447   void generate_arraycopy_stubs() {
2448     address entry;
2449     address entry_jbyte_arraycopy;
2450     address entry_jshort_arraycopy;
2451     address entry_jint_arraycopy;
2452     address entry_oop_arraycopy;
2453     address entry_jlong_arraycopy;
2454     address entry_checkcast_arraycopy;
2455 
2456     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2457     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2458 
2459     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2460 
2461     //*** jbyte
2462     // Always need aligned and unaligned versions
2463     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2464                                                                                   "jbyte_disjoint_arraycopy");
2465     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2466                                                                                   &entry_jbyte_arraycopy,
2467                                                                                   "jbyte_arraycopy");
2468     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2469                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2470     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2471                                                                                   "arrayof_jbyte_arraycopy");
2472 
2473     //*** jshort
2474     // Always need aligned and unaligned versions
2475     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2476                                                                                     "jshort_disjoint_arraycopy");
2477     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2478                                                                                     &entry_jshort_arraycopy,
2479                                                                                     "jshort_arraycopy");
2480     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2481                                                                                     "arrayof_jshort_disjoint_arraycopy");
2482     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2483                                                                                     "arrayof_jshort_arraycopy");
2484 
2485     //*** jint
2486     // Aligned versions
2487     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2488                                                                                 "arrayof_jint_disjoint_arraycopy");
2489     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2490                                                                                 "arrayof_jint_arraycopy");
2491     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2492     // entry_jint_arraycopy always points to the unaligned version
2493     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2494                                                                                 "jint_disjoint_arraycopy");
2495     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2496                                                                                 &entry_jint_arraycopy,
2497                                                                                 "jint_arraycopy");
2498 
2499     //*** jlong
2500     // It is always aligned
2501     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2502                                                                                   "arrayof_jlong_disjoint_arraycopy");
2503     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2504                                                                                   "arrayof_jlong_arraycopy");
2505     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2506     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2507 
2508     //*** oops
2509     {
2510       // With compressed oops we need unaligned versions; notice that
2511       // we overwrite entry_oop_arraycopy.
2512       bool aligned = !UseCompressedOops;
2513 
2514       StubRoutines::_arrayof_oop_disjoint_arraycopy
2515         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2516                                      /*dest_uninitialized*/false);
2517       StubRoutines::_arrayof_oop_arraycopy
2518         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2519                                      /*dest_uninitialized*/false);
2520       // Aligned versions without pre-barriers
2521       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2522         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2523                                      /*dest_uninitialized*/true);
2524       StubRoutines::_arrayof_oop_arraycopy_uninit
2525         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2526                                      /*dest_uninitialized*/true);
2527     }
2528 
2529     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2530     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2531     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2532     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2533 
2534     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2535     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2536                                                                         /*dest_uninitialized*/true);
2537 
2538     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2539                                                               entry_jbyte_arraycopy,
2540                                                               entry_jshort_arraycopy,
2541                                                               entry_jint_arraycopy,
2542                                                               entry_jlong_arraycopy);
2543 
2544     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2545                                                                entry_jbyte_arraycopy,
2546                                                                entry_jshort_arraycopy,
2547                                                                entry_jint_arraycopy,
2548                                                                entry_oop_arraycopy,
2549                                                                entry_jlong_arraycopy,
2550                                                                entry_checkcast_arraycopy);
2551 
2552     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2553     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2554     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2555     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2556     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2557     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2558   }
2559 
2560   void generate_math_stubs() { Unimplemented(); }
2561 
2562   // Arguments:
2563   //
2564   // Inputs:
2565   //   c_rarg0   - source byte array address
2566   //   c_rarg1   - destination byte array address
2567   //   c_rarg2   - K (key) in little endian int array
2568   //
2569   address generate_aescrypt_encryptBlock() {
2570     __ align(CodeEntryAlignment);
2571     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2572 
2573     const Register from        = c_rarg0;  // source array address
2574     const Register to          = c_rarg1;  // destination array address
2575     const Register key         = c_rarg2;  // key array address
2576     const Register keylen      = rscratch1;
2577 
2578     address start = __ pc();
2579     __ enter();
2580 
2581     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
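    // (a note: 'key' is the address of the first int of the expanded key, so
    //  the array length field sits at a negative offset from it; the length is
    //  44, 52 or 60 ints for AES-128, -192 and -256 respectively)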
2582 
2583     __ aesenc_loadkeys(key, keylen);
2584     __ aesecb_encrypt(from, to, keylen);
2585 
2586     __ mov(r0, 0);
2587 
2588     __ leave();
2589     __ ret(lr);
2590 
2591     return start;
2592   }
2593 
2594   // Arguments:
2595   //
2596   // Inputs:
2597   //   c_rarg0   - source byte array address
2598   //   c_rarg1   - destination byte array address
2599   //   c_rarg2   - K (key) in little endian int array
2600   //
2601   address generate_aescrypt_decryptBlock() {
2602     assert(UseAES, "need AES cryptographic extension support");
2603     __ align(CodeEntryAlignment);
2604     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2605     Label L_doLast;
2606 
2607     const Register from        = c_rarg0;  // source array address
2608     const Register to          = c_rarg1;  // destination array address
2609     const Register key         = c_rarg2;  // key array address
2610     const Register keylen      = rscratch1;
2611 
2612     address start = __ pc();
2613     __ enter(); // required for proper stackwalking of RuntimeStub frame
2614 
2615     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2616 
2617     __ aesecb_decrypt(from, to, key, keylen);
2618 
2619     __ mov(r0, 0);
2620 
2621     __ leave();
2622     __ ret(lr);
2623 
2624     return start;
2625   }
2626 
2627   // Arguments:
2628   //
2629   // Inputs:
2630   //   c_rarg0   - source byte array address
2631   //   c_rarg1   - destination byte array address
2632   //   c_rarg2   - K (key) in little endian int array
2633   //   c_rarg3   - r vector byte array address
2634   //   c_rarg4   - input length
2635   //
2636   // Output:
2637   //   x0        - input length
2638   //
2639   address generate_cipherBlockChaining_encryptAESCrypt() {
2640     assert(UseAES, "need AES cryptographic extension support");
2641     __ align(CodeEntryAlignment);
2642     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2643 
2644     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2645 
2646     const Register from        = c_rarg0;  // source array address
2647     const Register to          = c_rarg1;  // destination array address
2648     const Register key         = c_rarg2;  // key array address
2649     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
2650                                            // and left holding the last ciphertext block produced
2651     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2652     const Register keylen      = rscratch1;
2653 
2654     address start = __ pc();
2655 
2656       __ enter();
2657 
2658       __ movw(rscratch2, len_reg);
2659 
2660       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2661 
2662       __ ld1(v0, __ T16B, rvec);
2663 
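      // Key length in ints selects the number of rounds (a note):
      //   44 ints -> AES-128, 52 ints -> AES-192, 60 ints -> AES-256.
      // Only the round keys that are actually needed are loaded below.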
2664       __ cmpw(keylen, 52);
2665       __ br(Assembler::CC, L_loadkeys_44);
2666       __ br(Assembler::EQ, L_loadkeys_52);
2667 
2668       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2669       __ rev32(v17, __ T16B, v17);
2670       __ rev32(v18, __ T16B, v18);
2671     __ BIND(L_loadkeys_52);
2672       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2673       __ rev32(v19, __ T16B, v19);
2674       __ rev32(v20, __ T16B, v20);
2675     __ BIND(L_loadkeys_44);
2676       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2677       __ rev32(v21, __ T16B, v21);
2678       __ rev32(v22, __ T16B, v22);
2679       __ rev32(v23, __ T16B, v23);
2680       __ rev32(v24, __ T16B, v24);
2681       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2682       __ rev32(v25, __ T16B, v25);
2683       __ rev32(v26, __ T16B, v26);
2684       __ rev32(v27, __ T16B, v27);
2685       __ rev32(v28, __ T16B, v28);
2686       __ ld1(v29, v30, v31, __ T16B, key);
2687       __ rev32(v29, __ T16B, v29);
2688       __ rev32(v30, __ T16B, v30);
2689       __ rev32(v31, __ T16B, v31);
2690 
2691     __ BIND(L_aes_loop);
2692       __ ld1(v1, __ T16B, __ post(from, 16));
2693       __ eor(v0, __ T16B, v0, v1);
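      // CBC chaining: the plaintext block (v1) is XORed with the previous
      // ciphertext block, or with the IV on the first iteration (v0), before
      // the AES rounds below.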
2694 
2695       __ br(Assembler::CC, L_rounds_44);
2696       __ br(Assembler::EQ, L_rounds_52);
2697 
2698       __ aese(v0, v17); __ aesmc(v0, v0);
2699       __ aese(v0, v18); __ aesmc(v0, v0);
2700     __ BIND(L_rounds_52);
2701       __ aese(v0, v19); __ aesmc(v0, v0);
2702       __ aese(v0, v20); __ aesmc(v0, v0);
2703     __ BIND(L_rounds_44);
2704       __ aese(v0, v21); __ aesmc(v0, v0);
2705       __ aese(v0, v22); __ aesmc(v0, v0);
2706       __ aese(v0, v23); __ aesmc(v0, v0);
2707       __ aese(v0, v24); __ aesmc(v0, v0);
2708       __ aese(v0, v25); __ aesmc(v0, v0);
2709       __ aese(v0, v26); __ aesmc(v0, v0);
2710       __ aese(v0, v27); __ aesmc(v0, v0);
2711       __ aese(v0, v28); __ aesmc(v0, v0);
2712       __ aese(v0, v29); __ aesmc(v0, v0);
2713       __ aese(v0, v30);
2714       __ eor(v0, __ T16B, v0, v31);
2715 
2716       __ st1(v0, __ T16B, __ post(to, 16));
2717 
2718       __ subw(len_reg, len_reg, 16);
2719       __ cbnzw(len_reg, L_aes_loop);
2720 
2721       __ st1(v0, __ T16B, rvec);
2722 
2723       __ mov(r0, rscratch2);
2724 
2725       __ leave();
2726       __ ret(lr);
2727 
2728       return start;
2729   }
2730 
2731   // Arguments:
2732   //
2733   // Inputs:
2734   //   c_rarg0   - source byte array address
2735   //   c_rarg1   - destination byte array address
2736   //   c_rarg2   - K (key) in little endian int array
2737   //   c_rarg3   - r vector byte array address
2738   //   c_rarg4   - input length
2739   //
2740   // Output:
2741   //   r0        - input length
2742   //
2743   address generate_cipherBlockChaining_decryptAESCrypt() {
2744     assert(UseAES, "need AES cryptographic extension support");
2745     __ align(CodeEntryAlignment);
2746     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2747 
2748     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2749 
2750     const Register from        = c_rarg0;  // source array address
2751     const Register to          = c_rarg1;  // destination array address
2752     const Register key         = c_rarg2;  // key array address
2753     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
2754                                            // and left holding the last ciphertext block processed (the chaining value)
2755     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2756     const Register keylen      = rscratch1;
2757 
2758     address start = __ pc();
2759 
2760       __ enter();
2761 
2762       __ movw(rscratch2, len_reg);
2763 
2764       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2765 
2766       __ ld1(v2, __ T16B, rvec);
2767 
2768       __ ld1(v31, __ T16B, __ post(key, 16));
2769       __ rev32(v31, __ T16B, v31);
2770 
2771       __ cmpw(keylen, 52);
2772       __ br(Assembler::CC, L_loadkeys_44);
2773       __ br(Assembler::EQ, L_loadkeys_52);
2774 
2775       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2776       __ rev32(v17, __ T16B, v17);
2777       __ rev32(v18, __ T16B, v18);
2778     __ BIND(L_loadkeys_52);
2779       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2780       __ rev32(v19, __ T16B, v19);
2781       __ rev32(v20, __ T16B, v20);
2782     __ BIND(L_loadkeys_44);
2783       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2784       __ rev32(v21, __ T16B, v21);
2785       __ rev32(v22, __ T16B, v22);
2786       __ rev32(v23, __ T16B, v23);
2787       __ rev32(v24, __ T16B, v24);
2788       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2789       __ rev32(v25, __ T16B, v25);
2790       __ rev32(v26, __ T16B, v26);
2791       __ rev32(v27, __ T16B, v27);
2792       __ rev32(v28, __ T16B, v28);
2793       __ ld1(v29, v30, __ T16B, key);
2794       __ rev32(v29, __ T16B, v29);
2795       __ rev32(v30, __ T16B, v30);
2796 
2797     __ BIND(L_aes_loop);
2798       __ ld1(v0, __ T16B, __ post(from, 16));
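      // Keep an unmodified copy of the ciphertext block in v1: it is copied
      // to v2 after the store and becomes the chaining value for the next
      // block (and, at the end, the new r vector).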
2799       __ orr(v1, __ T16B, v0, v0);
2800 
2801       __ br(Assembler::CC, L_rounds_44);
2802       __ br(Assembler::EQ, L_rounds_52);
2803 
2804       __ aesd(v0, v17); __ aesimc(v0, v0);
2805       __ aesd(v0, v18); __ aesimc(v0, v0);
2806     __ BIND(L_rounds_52);
2807       __ aesd(v0, v19); __ aesimc(v0, v0);
2808       __ aesd(v0, v20); __ aesimc(v0, v0);
2809     __ BIND(L_rounds_44);
2810       __ aesd(v0, v21); __ aesimc(v0, v0);
2811       __ aesd(v0, v22); __ aesimc(v0, v0);
2812       __ aesd(v0, v23); __ aesimc(v0, v0);
2813       __ aesd(v0, v24); __ aesimc(v0, v0);
2814       __ aesd(v0, v25); __ aesimc(v0, v0);
2815       __ aesd(v0, v26); __ aesimc(v0, v0);
2816       __ aesd(v0, v27); __ aesimc(v0, v0);
2817       __ aesd(v0, v28); __ aesimc(v0, v0);
2818       __ aesd(v0, v29); __ aesimc(v0, v0);
2819       __ aesd(v0, v30);
2820       __ eor(v0, __ T16B, v0, v31);
2821       __ eor(v0, __ T16B, v0, v2);
2822 
2823       __ st1(v0, __ T16B, __ post(to, 16));
2824       __ orr(v2, __ T16B, v1, v1);
2825 
2826       __ subw(len_reg, len_reg, 16);
2827       __ cbnzw(len_reg, L_aes_loop);
2828 
2829       __ st1(v2, __ T16B, rvec);
2830 
2831       __ mov(r0, rscratch2);
2832 
2833       __ leave();
2834       __ ret(lr);
2835 
2836     return start;
2837   }
2838 
2839   // CTR AES crypt.
2840   // Arguments:
2841   //
2842   // Inputs:
2843   //   c_rarg0   - source byte array address
2844   //   c_rarg1   - destination byte array address
2845   //   c_rarg2   - K (key) in little endian int array
2846   //   c_rarg3   - counter vector byte array address
2847   //   c_rarg4   - input length
2848   //   c_rarg5   - saved encryptedCounter start
2849   //   c_rarg6   - saved used length
2850   //
2851   // Output:
2852   //   r0       - input length
2853   //
2854   address generate_counterMode_AESCrypt() {
2855     const Register in = c_rarg0;
2856     const Register out = c_rarg1;
2857     const Register key = c_rarg2;
2858     const Register counter = c_rarg3;
2859     const Register saved_len = c_rarg4, len = r10;
2860     const Register saved_encrypted_ctr = c_rarg5;
2861     const Register used_ptr = c_rarg6, used = r12;
2862 
2863     const Register offset = r7;
2864     const Register keylen = r11;
2865 
2866     const unsigned char block_size = 16;
2867     const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly faster
    // performance with larger data sizes, but it also means that the
    // fast path isn't used until there are at least 8 blocks, so up
    // to 127 bytes of data will be processed by the slow path. For
    // that reason, and also so as not to blow away too much icache, 4
    // blocks seems like a sensible compromise.
2874 
2875     // Algorithm:
2876     //
2877     //    if (len == 0) {
2878     //        goto DONE;
2879     //    }
2880     //    int result = len;
2881     //    do {
2882     //        if (used >= blockSize) {
2883     //            if (len >= bulk_width * blockSize) {
2884     //                CTR_large_block();
2885     //                if (len == 0)
2886     //                    goto DONE;
2887     //            }
2888     //            for (;;) {
2889     //                16ByteVector v0 = counter;
2890     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
2891     //                used = 0;
2892     //                if (len < blockSize)
2893     //                    break;    /* goto NEXT */
2894     //                16ByteVector v1 = load16Bytes(in, offset);
2895     //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(v1, out, offset);
2897     //                used = blockSize;
2898     //                offset += blockSize;
2899     //                len -= blockSize;
2900     //                if (len == 0)
2901     //                    goto DONE;
2902     //            }
2903     //        }
2904     //      NEXT:
2905     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
2906     //        len--;
2907     //    } while (len != 0);
2908     //  DONE:
2909     //    return result;
2910     //
2911     // CTR_large_block()
2912     //    Wide bulk encryption of whole blocks.
2913 
2914     __ align(CodeEntryAlignment);
2915     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
2916     const address start = __ pc();
2917     __ enter();
2918 
2919     Label DONE, CTR_large_block, large_block_return;
2920     __ ldrw(used, Address(used_ptr));
2921     __ cbzw(saved_len, DONE);
2922 
2923     __ mov(len, saved_len);
2924     __ mov(offset, 0);
2925 
2926     // Compute #rounds for AES based on the length of the key array
2927     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2928 
2929     __ aesenc_loadkeys(key, keylen);
2930 
2931     {
2932       Label L_CTR_loop, NEXT;
2933 
2934       __ bind(L_CTR_loop);
2935 
2936       __ cmp(used, block_size);
2937       __ br(__ LO, NEXT);
2938 
2939       // Maybe we have a lot of data
2940       __ subsw(rscratch1, len, bulk_width * block_size);
2941       __ br(__ HS, CTR_large_block);
2942       __ BIND(large_block_return);
2943       __ cbzw(len, DONE);
2944 
2945       // Setup the counter
2946       __ movi(v4, __ T4S, 0);
2947       __ movi(v5, __ T4S, 1);
2948       __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
2949 
2950       __ ld1(v0, __ T16B, counter); // Load the counter into v0
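      // The counter is kept big-endian in memory: byte-swap each 32-bit
      // word, add one to the low-order word, then swap back before storing
      // the incremented counter.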
2951       __ rev32(v16, __ T16B, v0);
2952       __ addv(v16, __ T4S, v16, v4);
2953       __ rev32(v16, __ T16B, v16);
2954       __ st1(v16, __ T16B, counter); // Save the incremented counter back
2955 
2956       {
2957         // We have fewer than bulk_width blocks of data left. Encrypt
2958         // them one by one until there is less than a full block
2959         // remaining, being careful to save both the encrypted counter
2960         // and the counter.
2961 
2962         Label inner_loop;
2963         __ bind(inner_loop);
2964         // Counter to encrypt is in v0
2965         __ aesecb_encrypt(noreg, noreg, keylen);
2966         __ st1(v0, __ T16B, saved_encrypted_ctr);
2967 
2968         // Do we have a remaining full block?
2969 
2970         __ mov(used, 0);
2971         __ cmp(len, block_size);
2972         __ br(__ LO, NEXT);
2973 
2974         // Yes, we have a full block
2975         __ ldrq(v1, Address(in, offset));
2976         __ eor(v1, __ T16B, v1, v0);
2977         __ strq(v1, Address(out, offset));
2978         __ mov(used, block_size);
2979         __ add(offset, offset, block_size);
2980 
2981         __ subw(len, len, block_size);
2982         __ cbzw(len, DONE);
2983 
2984         // Increment the counter, store it back
2985         __ orr(v0, __ T16B, v16, v16);
2986         __ rev32(v16, __ T16B, v16);
2987         __ addv(v16, __ T4S, v16, v4);
2988         __ rev32(v16, __ T16B, v16);
2989         __ st1(v16, __ T16B, counter); // Save the incremented counter back
2990 
2991         __ b(inner_loop);
2992       }
2993 
2994       __ BIND(NEXT);
2995 
2996       // Encrypt a single byte, and loop.
2997       // We expect this to be a rare event.
2998       __ ldrb(rscratch1, Address(in, offset));
2999       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3000       __ eor(rscratch1, rscratch1, rscratch2);
3001       __ strb(rscratch1, Address(out, offset));
3002       __ add(offset, offset, 1);
3003       __ add(used, used, 1);
      __ subw(len, len, 1);
3005       __ cbnzw(len, L_CTR_loop);
3006     }
3007 
3008     __ bind(DONE);
3009     __ strw(used, Address(used_ptr));
3010     __ mov(r0, saved_len);
3011 
3012     __ leave(); // required for proper stackwalking of RuntimeStub frame
3013     __ ret(lr);
3014 
3015     // Bulk encryption
3016 
    __ BIND(CTR_large_block);
3018     assert(bulk_width == 4 || bulk_width == 8, "must be");
3019 
3020     if (bulk_width == 8) {
3021       __ sub(sp, sp, 4 * 16);
3022       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3023     }
3024     __ sub(sp, sp, 4 * 16);
3025     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3026     RegSet saved_regs = (RegSet::of(in, out, offset)
3027                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3028     __ push(saved_regs, sp);
3029     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3030     __ add(in, in, offset);
3031     __ add(out, out, offset);
3032 
3033     // Keys should already be loaded into the correct registers
3034 
3035     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3036     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3037 
3038     // AES/CTR loop
3039     {
3040       Label L_CTR_loop;
3041       __ BIND(L_CTR_loop);
3042 
3043       // Setup the counters
3044       __ movi(v8, __ T4S, 0);
3045       __ movi(v9, __ T4S, 1);
3046       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
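      // Materialize bulk_width consecutive counter blocks in v0 and the
      // following registers: each is v16 byte-swapped back to big-endian
      // (memory) order, after which v16 is bumped by one.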
3047 
3048       for (int i = 0; i < bulk_width; i++) {
3049         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3050         __ rev32(v0_ofs, __ T16B, v16);
3051         __ addv(v16, __ T4S, v16, v8);
3052       }
3053 
3054       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3055 
3056       // Encrypt the counters
3057       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3058 
3059       if (bulk_width == 8) {
3060         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3061       }
3062 
3063       // XOR the encrypted counters with the inputs
3064       for (int i = 0; i < bulk_width; i++) {
3065         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3066         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3067         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3068       }
3069 
3070       // Write the encrypted data
3071       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3072       if (bulk_width == 8) {
3073         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3074       }
3075 
3076       __ subw(len, len, 16 * bulk_width);
3077       __ cbnzw(len, L_CTR_loop);
3078     }
3079 
3080     // Save the counter back where it goes
3081     __ rev32(v16, __ T16B, v16);
3082     __ st1(v16, __ T16B, counter);
3083 
3084     __ pop(saved_regs, sp);
3085 
3086     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3087     if (bulk_width == 8) {
3088       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3089     }
3090 
3091     __ andr(rscratch1, len, -16 * bulk_width);
3092     __ sub(len, len, rscratch1);
3093     __ add(offset, offset, rscratch1);
3094     __ mov(used, 16);
3095     __ strw(used, Address(used_ptr));
3096     __ b(large_block_return);
3097 
3098     return start;
3099   }
3100 
3101   // Vector AES Galois Counter Mode implementation. Parameters:
3102   //
3103   // in = c_rarg0
3104   // len = c_rarg1
3105   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3106   // out = c_rarg3
3107   // key = c_rarg4
3108   // state = c_rarg5 - GHASH.state
3109   // subkeyHtbl = c_rarg6 - powers of H
3110   // counter = c_rarg7 - 16 bytes of CTR
3111   // return - number of processed bytes
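  //
  // The stub CTR-encrypts the input eight blocks (128 bytes) at a time and
  // then runs GHASH over the resulting ciphertext with 4-way unrolling; any
  // tail shorter than 128 bytes is left for the caller to process.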
3112   address generate_galoisCounterMode_AESCrypt() {
3113     address ghash_polynomial = __ pc();
3114     __ emit_int64(0x87);  // The low-order bits of the field
3115                           // polynomial (i.e. p = z^7+z^2+z+1)
3116                           // repeated in the low and high parts of a
3117                           // 128-bit vector
3118     __ emit_int64(0x87);
3119 
3120     __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3122     address start = __ pc();
3123     __ enter();
3124 
3125     const Register in = c_rarg0;
3126     const Register len = c_rarg1;
3127     const Register ct = c_rarg2;
    const Register out = c_rarg3;

    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    const Register counter = c_rarg7;  // updated with the incremented counter in the end
3137 
3138     const Register keylen = r10;
3139     // Save state before entering routine
3140     __ sub(sp, sp, 4 * 16);
3141     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3142     __ sub(sp, sp, 4 * 16);
3143     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3144 
3146     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3147     __ str(len, __ pre(sp, -2 * wordSize));
3148 
3149     Label DONE;
3150     __ cbz(len, DONE);
3151 
3152     // Compute #rounds for AES based on the length of the key array
3153     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3154 
3155     __ aesenc_loadkeys(key, keylen);
3156     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3157     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3158 
3159     // AES/CTR loop
3160     {
3161       Label L_CTR_loop;
3162       __ BIND(L_CTR_loop);
3163 
3164       // Setup the counters
3165       __ movi(v8, __ T4S, 0);
3166       __ movi(v9, __ T4S, 1);
3167       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3168 
3169       assert(v0->encoding() < v8->encoding(), "");
3170       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3171         FloatRegister f = as_FloatRegister(i);
3172         __ rev32(f, __ T16B, v16);
3173         __ addv(v16, __ T4S, v16, v8);
3174       }
3175 
3176       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3177 
3178       // Encrypt the counters
3179       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3180 
3181       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3182 
3183       // XOR the encrypted counters with the inputs
3184       for (int i = 0; i < 8; i++) {
3185         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3186         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3187         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3188       }
3189       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3190       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3191 
3192       __ subw(len, len, 16 * 8);
3193       __ cbnzw(len, L_CTR_loop);
3194     }
3195 
3196     __ rev32(v16, __ T16B, v16);
3197     __ st1(v16, __ T16B, counter);
3198 
3199     __ ldr(len, Address(sp));
3200     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3201 
3202     // GHASH/CTR loop
3203     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3204                                 len, /*unrolls*/4);
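    // ghash_processBlocks_wide folds the ciphertext into the GHASH state
    // using the precomputed powers of H in subkeyHtbl, four blocks per
    // iteration, counting len (in blocks) down to zero.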
3205 
3206 #ifdef ASSERT
3207     { Label L;
3208       __ cmp(len, (unsigned char)0);
3209       __ br(Assembler::EQ, L);
3210       __ stop("stubGenerator: abort");
3211       __ bind(L);
3212   }
3213 #endif
3214 
    __ bind(DONE);
3216     // Return the number of bytes processed
3217     __ ldr(r0, __ post(sp, 2 * wordSize));
3218 
3219     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3220     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3221 
3222     __ leave(); // required for proper stackwalking of RuntimeStub frame
3223     __ ret(lr);
    return start;
3225   }
3226 
3227   // Utility routines for md5.
3228   // Clobbers r10 and r11.
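  //
  // Each helper performs one MD5 step
  //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + buf[k] + t, s)
  // where f is, respectively,
  //   F(x,y,z) = (x & y) | (~x & z)   computed as ((y ^ z) & x) ^ z
  //   G(x,y,z) = (x & z) | (y & ~z)   computed as ((x ^ y) & z) ^ y
  //   H(x,y,z) = x ^ y ^ z
  //   I(x,y,z) = y ^ (x | ~z)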
3229   void md5_FF(Register buf, Register r1, Register r2, Register r3, Register r4,
3230               int k, int s, int t) {
3231     Register rscratch3 = r10;
3232     Register rscratch4 = r11;
3233 
3234     __ eorw(rscratch3, r3, r4);
3235     __ movw(rscratch2, t);
3236     __ andw(rscratch3, rscratch3, r2);
3237     __ addw(rscratch4, r1, rscratch2);
3238     __ ldrw(rscratch1, Address(buf, k*4));
3239     __ eorw(rscratch3, rscratch3, r4);
3240     __ addw(rscratch3, rscratch3, rscratch1);
3241     __ addw(rscratch3, rscratch3, rscratch4);
3242     __ rorw(rscratch2, rscratch3, 32 - s);
3243     __ addw(r1, rscratch2, r2);
3244   }
3245 
3246   void md5_GG(Register buf, Register r1, Register r2, Register r3, Register r4,
3247               int k, int s, int t) {
3248     Register rscratch3 = r10;
3249     Register rscratch4 = r11;
3250 
3251     __ eorw(rscratch2, r2, r3);
3252     __ ldrw(rscratch1, Address(buf, k*4));
3253     __ andw(rscratch3, rscratch2, r4);
3254     __ movw(rscratch2, t);
3255     __ eorw(rscratch3, rscratch3, r3);
3256     __ addw(rscratch4, r1, rscratch2);
3257     __ addw(rscratch3, rscratch3, rscratch1);
3258     __ addw(rscratch3, rscratch3, rscratch4);
3259     __ rorw(rscratch2, rscratch3, 32 - s);
3260     __ addw(r1, rscratch2, r2);
3261   }
3262 
3263   void md5_HH(Register buf, Register r1, Register r2, Register r3, Register r4,
3264               int k, int s, int t) {
3265     Register rscratch3 = r10;
3266     Register rscratch4 = r11;
3267 
3268     __ eorw(rscratch3, r3, r4);
3269     __ movw(rscratch2, t);
3270     __ addw(rscratch4, r1, rscratch2);
3271     __ ldrw(rscratch1, Address(buf, k*4));
3272     __ eorw(rscratch3, rscratch3, r2);
3273     __ addw(rscratch3, rscratch3, rscratch1);
3274     __ addw(rscratch3, rscratch3, rscratch4);
3275     __ rorw(rscratch2, rscratch3, 32 - s);
3276     __ addw(r1, rscratch2, r2);
3277   }
3278 
3279   void md5_II(Register buf, Register r1, Register r2, Register r3, Register r4,
3280               int k, int s, int t) {
3281     Register rscratch3 = r10;
3282     Register rscratch4 = r11;
3283 
3284     __ movw(rscratch3, t);
3285     __ ornw(rscratch2, r2, r4);
3286     __ addw(rscratch4, r1, rscratch3);
3287     __ ldrw(rscratch1, Address(buf, k*4));
3288     __ eorw(rscratch3, rscratch2, r3);
3289     __ addw(rscratch3, rscratch3, rscratch1);
3290     __ addw(rscratch3, rscratch3, rscratch4);
3291     __ rorw(rscratch2, rscratch3, 32 - s);
3292     __ addw(r1, rscratch2, r2);
3293   }
3294 
3295   // Arguments:
3296   //
3297   // Inputs:
3298   //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   MD5.state
3300   //   c_rarg2   - int     offset
3301   //   c_rarg3   - int     limit
3302   //
3303   address generate_md5_implCompress(bool multi_block, const char *name) {
3304     __ align(CodeEntryAlignment);
3305     StubCodeMark mark(this, "StubRoutines", name);
3306     address start = __ pc();
3307 
3308     Register buf       = c_rarg0;
3309     Register state     = c_rarg1;
3310     Register ofs       = c_rarg2;
3311     Register limit     = c_rarg3;
3312     Register a         = r4;
3313     Register b         = r5;
3314     Register c         = r6;
3315     Register d         = r7;
3316     Register rscratch3 = r10;
3317     Register rscratch4 = r11;
3318 
3319     Label md5_loop;
3320     __ BIND(md5_loop);
3321 
3322     // Save hash values for addition after rounds
3323     __ ldrw(a, Address(state,  0));
3324     __ ldrw(b, Address(state,  4));
3325     __ ldrw(c, Address(state,  8));
3326     __ ldrw(d, Address(state, 12));
3327 
3328     // Round 1
3329     md5_FF(buf, a, b, c, d,  0,  7, 0xd76aa478);
3330     md5_FF(buf, d, a, b, c,  1, 12, 0xe8c7b756);
3331     md5_FF(buf, c, d, a, b,  2, 17, 0x242070db);
3332     md5_FF(buf, b, c, d, a,  3, 22, 0xc1bdceee);
3333     md5_FF(buf, a, b, c, d,  4,  7, 0xf57c0faf);
3334     md5_FF(buf, d, a, b, c,  5, 12, 0x4787c62a);
3335     md5_FF(buf, c, d, a, b,  6, 17, 0xa8304613);
3336     md5_FF(buf, b, c, d, a,  7, 22, 0xfd469501);
3337     md5_FF(buf, a, b, c, d,  8,  7, 0x698098d8);
3338     md5_FF(buf, d, a, b, c,  9, 12, 0x8b44f7af);
3339     md5_FF(buf, c, d, a, b, 10, 17, 0xffff5bb1);
3340     md5_FF(buf, b, c, d, a, 11, 22, 0x895cd7be);
3341     md5_FF(buf, a, b, c, d, 12,  7, 0x6b901122);
3342     md5_FF(buf, d, a, b, c, 13, 12, 0xfd987193);
3343     md5_FF(buf, c, d, a, b, 14, 17, 0xa679438e);
3344     md5_FF(buf, b, c, d, a, 15, 22, 0x49b40821);
3345 
3346     // Round 2
3347     md5_GG(buf, a, b, c, d,  1,  5, 0xf61e2562);
3348     md5_GG(buf, d, a, b, c,  6,  9, 0xc040b340);
3349     md5_GG(buf, c, d, a, b, 11, 14, 0x265e5a51);
3350     md5_GG(buf, b, c, d, a,  0, 20, 0xe9b6c7aa);
3351     md5_GG(buf, a, b, c, d,  5,  5, 0xd62f105d);
3352     md5_GG(buf, d, a, b, c, 10,  9, 0x02441453);
3353     md5_GG(buf, c, d, a, b, 15, 14, 0xd8a1e681);
3354     md5_GG(buf, b, c, d, a,  4, 20, 0xe7d3fbc8);
3355     md5_GG(buf, a, b, c, d,  9,  5, 0x21e1cde6);
3356     md5_GG(buf, d, a, b, c, 14,  9, 0xc33707d6);
3357     md5_GG(buf, c, d, a, b,  3, 14, 0xf4d50d87);
3358     md5_GG(buf, b, c, d, a,  8, 20, 0x455a14ed);
3359     md5_GG(buf, a, b, c, d, 13,  5, 0xa9e3e905);
3360     md5_GG(buf, d, a, b, c,  2,  9, 0xfcefa3f8);
3361     md5_GG(buf, c, d, a, b,  7, 14, 0x676f02d9);
3362     md5_GG(buf, b, c, d, a, 12, 20, 0x8d2a4c8a);
3363 
3364     // Round 3
3365     md5_HH(buf, a, b, c, d,  5,  4, 0xfffa3942);
3366     md5_HH(buf, d, a, b, c,  8, 11, 0x8771f681);
3367     md5_HH(buf, c, d, a, b, 11, 16, 0x6d9d6122);
3368     md5_HH(buf, b, c, d, a, 14, 23, 0xfde5380c);
3369     md5_HH(buf, a, b, c, d,  1,  4, 0xa4beea44);
3370     md5_HH(buf, d, a, b, c,  4, 11, 0x4bdecfa9);
3371     md5_HH(buf, c, d, a, b,  7, 16, 0xf6bb4b60);
3372     md5_HH(buf, b, c, d, a, 10, 23, 0xbebfbc70);
3373     md5_HH(buf, a, b, c, d, 13,  4, 0x289b7ec6);
3374     md5_HH(buf, d, a, b, c,  0, 11, 0xeaa127fa);
3375     md5_HH(buf, c, d, a, b,  3, 16, 0xd4ef3085);
3376     md5_HH(buf, b, c, d, a,  6, 23, 0x04881d05);
3377     md5_HH(buf, a, b, c, d,  9,  4, 0xd9d4d039);
3378     md5_HH(buf, d, a, b, c, 12, 11, 0xe6db99e5);
3379     md5_HH(buf, c, d, a, b, 15, 16, 0x1fa27cf8);
3380     md5_HH(buf, b, c, d, a,  2, 23, 0xc4ac5665);
3381 
3382     // Round 4
3383     md5_II(buf, a, b, c, d,  0,  6, 0xf4292244);
3384     md5_II(buf, d, a, b, c,  7, 10, 0x432aff97);
3385     md5_II(buf, c, d, a, b, 14, 15, 0xab9423a7);
3386     md5_II(buf, b, c, d, a,  5, 21, 0xfc93a039);
3387     md5_II(buf, a, b, c, d, 12,  6, 0x655b59c3);
3388     md5_II(buf, d, a, b, c,  3, 10, 0x8f0ccc92);
3389     md5_II(buf, c, d, a, b, 10, 15, 0xffeff47d);
3390     md5_II(buf, b, c, d, a,  1, 21, 0x85845dd1);
3391     md5_II(buf, a, b, c, d,  8,  6, 0x6fa87e4f);
3392     md5_II(buf, d, a, b, c, 15, 10, 0xfe2ce6e0);
3393     md5_II(buf, c, d, a, b,  6, 15, 0xa3014314);
3394     md5_II(buf, b, c, d, a, 13, 21, 0x4e0811a1);
3395     md5_II(buf, a, b, c, d,  4,  6, 0xf7537e82);
3396     md5_II(buf, d, a, b, c, 11, 10, 0xbd3af235);
3397     md5_II(buf, c, d, a, b,  2, 15, 0x2ad7d2bb);
3398     md5_II(buf, b, c, d, a,  9, 21, 0xeb86d391);
3399 
3400     // write hash values back in the correct order
3401     __ ldrw(rscratch1, Address(state,  0));
3402     __ addw(rscratch1, rscratch1, a);
3403     __ strw(rscratch1, Address(state,  0));
3404 
3405     __ ldrw(rscratch2, Address(state,  4));
3406     __ addw(rscratch2, rscratch2, b);
3407     __ strw(rscratch2, Address(state,  4));
3408 
3409     __ ldrw(rscratch3, Address(state,  8));
3410     __ addw(rscratch3, rscratch3, c);
3411     __ strw(rscratch3, Address(state,  8));
3412 
3413     __ ldrw(rscratch4, Address(state, 12));
3414     __ addw(rscratch4, rscratch4, d);
3415     __ strw(rscratch4, Address(state, 12));
3416 
3417     if (multi_block) {
3418       __ add(buf, buf, 64);
3419       __ add(ofs, ofs, 64);
3420       __ cmp(ofs, limit);
3421       __ br(Assembler::LE, md5_loop);
3422       __ mov(c_rarg0, ofs); // return ofs
3423     }
3424 
3425     __ ret(lr);
3426 
3427     return start;
3428   }
3429 
3430   // Arguments:
3431   //
3432   // Inputs:
3433   //   c_rarg0   - byte[]  source+offset
3434   //   c_rarg1   - int[]   SHA.state
3435   //   c_rarg2   - int     offset
3436   //   c_rarg3   - int     limit
3437   //
3438   address generate_sha1_implCompress(bool multi_block, const char *name) {
3439     __ align(CodeEntryAlignment);
3440     StubCodeMark mark(this, "StubRoutines", name);
3441     address start = __ pc();
3442 
3443     Register buf   = c_rarg0;
3444     Register state = c_rarg1;
3445     Register ofs   = c_rarg2;
3446     Register limit = c_rarg3;
3447 
3448     Label keys;
3449     Label sha1_loop;
3450 
3451     // load the keys into v0..v3
3452     __ adr(rscratch1, keys);
3453     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3454     // load 5 words state into v6, v7
3455     __ ldrq(v6, Address(state, 0));
3456     __ ldrs(v7, Address(state, 16));
3457 
3458 
3459     __ BIND(sha1_loop);
3460     // load 64 bytes of data into v16..v19
3461     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3462     __ rev32(v16, __ T16B, v16);
3463     __ rev32(v17, __ T16B, v17);
3464     __ rev32(v18, __ T16B, v18);
3465     __ rev32(v19, __ T16B, v19);
3466 
3467     // do the sha1
3468     __ addv(v4, __ T4S, v16, v0);
3469     __ orr(v20, __ T16B, v6, v6);
3470 
3471     FloatRegister d0 = v16;
3472     FloatRegister d1 = v17;
3473     FloatRegister d2 = v18;
3474     FloatRegister d3 = v19;
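    // 20 iterations, each consuming four of the 80 SHA-1 rounds: sha1c
    // covers the Ch rounds (0..19), sha1p the parity rounds (20..39 and
    // 60..79), sha1m the Maj rounds (40..59); sha1su0/sha1su1 extend the
    // message schedule.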
3475 
3476     for (int round = 0; round < 20; round++) {
3477       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3478       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3479       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3480       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3481       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3482 
3483       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3484       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3485       __ sha1h(tmp2, __ T4S, v20);
3486       if (round < 5)
3487         __ sha1c(v20, __ T4S, tmp3, tmp4);
3488       else if (round < 10 || round >= 15)
3489         __ sha1p(v20, __ T4S, tmp3, tmp4);
3490       else
3491         __ sha1m(v20, __ T4S, tmp3, tmp4);
3492       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3493 
3494       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3495     }
3496 
3497     __ addv(v7, __ T2S, v7, v21);
3498     __ addv(v6, __ T4S, v6, v20);
3499 
3500     if (multi_block) {
3501       __ add(ofs, ofs, 64);
3502       __ cmp(ofs, limit);
3503       __ br(Assembler::LE, sha1_loop);
3504       __ mov(c_rarg0, ofs); // return ofs
3505     }
3506 
3507     __ strq(v6, Address(state, 0));
3508     __ strs(v7, Address(state, 16));
3509 
3510     __ ret(lr);
3511 
3512     __ bind(keys);
3513     __ emit_int32(0x5a827999);
3514     __ emit_int32(0x6ed9eba1);
3515     __ emit_int32(0x8f1bbcdc);
3516     __ emit_int32(0xca62c1d6);
3517 
3518     return start;
3519   }
3520 
3521 
3522   // Arguments:
3523   //
3524   // Inputs:
3525   //   c_rarg0   - byte[]  source+offset
3526   //   c_rarg1   - int[]   SHA.state
3527   //   c_rarg2   - int     offset
3528   //   c_rarg3   - int     limit
3529   //
3530   address generate_sha256_implCompress(bool multi_block, const char *name) {
3531     static const uint32_t round_consts[64] = {
3532       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3533       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3534       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3535       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3536       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3537       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3538       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3539       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3540       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3541       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3542       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3543       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3544       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3545       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3546       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3547       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3548     };
3549     __ align(CodeEntryAlignment);
3550     StubCodeMark mark(this, "StubRoutines", name);
3551     address start = __ pc();
3552 
3553     Register buf   = c_rarg0;
3554     Register state = c_rarg1;
3555     Register ofs   = c_rarg2;
3556     Register limit = c_rarg3;
3557 
    Label sha256_loop;
3559 
3560     __ stpd(v8, v9, __ pre(sp, -32));
3561     __ stpd(v10, v11, Address(sp, 16));
3562 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7
3570 
3571     // load 16 keys to v16..v31
3572     __ lea(rscratch1, ExternalAddress((address)round_consts));
3573     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3574     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3575     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3576     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3577 
3578     // load 8 words (256 bits) state
3579     __ ldpq(v0, v1, state);
3580 
    __ BIND(sha256_loop);
3582     // load 64 bytes of data into v8..v11
3583     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3584     __ rev32(v8, __ T16B, v8);
3585     __ rev32(v9, __ T16B, v9);
3586     __ rev32(v10, __ T16B, v10);
3587     __ rev32(v11, __ T16B, v11);
3588 
3589     __ addv(v6, __ T4S, v8, v16);
3590     __ orr(v2, __ T16B, v0, v0);
3591     __ orr(v3, __ T16B, v1, v1);
3592 
3593     FloatRegister d0 = v8;
3594     FloatRegister d1 = v9;
3595     FloatRegister d2 = v10;
3596     FloatRegister d3 = v11;
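    // 16 iterations, each performing four of the 64 SHA-256 rounds:
    // sha256h/sha256h2 update the two state halves, sha256su0/sha256su1
    // extend the message schedule (needed only while round < 12), and the
    // round constants were pre-loaded into v16..v31 above.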
3597 
3598 
3599     for (int round = 0; round < 16; round++) {
3600       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3601       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3602       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3603       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3604 
3605       if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
3607       if (round < 15)
3608         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3609       __ sha256h(v2, __ T4S, v3, tmp2);
3610       __ sha256h2(v3, __ T4S, v4, tmp2);
3611       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3612 
3613       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3614     }
3615 
3616     __ addv(v0, __ T4S, v0, v2);
3617     __ addv(v1, __ T4S, v1, v3);
3618 
3619     if (multi_block) {
3620       __ add(ofs, ofs, 64);
3621       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
3623       __ mov(c_rarg0, ofs); // return ofs
3624     }
3625 
3626     __ ldpd(v10, v11, Address(sp, 16));
3627     __ ldpd(v8, v9, __ post(sp, 32));
3628 
3629     __ stpq(v0, v1, state);
3630 
3631     __ ret(lr);
3632 
3633     return start;
3634   }
3635 
3636   // Double rounds for sha512.
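  // Each call performs two of the 80 rounds: sha512h/sha512h2 update the
  // state, sha512su0/sha512su1 extend the message schedule (only needed
  // while dr < 32, i.e. for the first 64 rounds), and calls with dr < 36
  // prefetch the next pair of round constants into vrc1.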
3637   void sha512_dround(int dr,
3638                      FloatRegister vi0, FloatRegister vi1,
3639                      FloatRegister vi2, FloatRegister vi3,
3640                      FloatRegister vi4, FloatRegister vrc0,
3641                      FloatRegister vrc1, FloatRegister vin0,
3642                      FloatRegister vin1, FloatRegister vin2,
3643                      FloatRegister vin3, FloatRegister vin4) {
3644       if (dr < 36) {
3645         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3646       }
3647       __ addv(v5, __ T2D, vrc0, vin0);
3648       __ ext(v6, __ T16B, vi2, vi3, 8);
3649       __ ext(v5, __ T16B, v5, v5, 8);
3650       __ ext(v7, __ T16B, vi1, vi2, 8);
3651       __ addv(vi3, __ T2D, vi3, v5);
3652       if (dr < 32) {
3653         __ ext(v5, __ T16B, vin3, vin4, 8);
3654         __ sha512su0(vin0, __ T2D, vin1);
3655       }
3656       __ sha512h(vi3, __ T2D, v6, v7);
3657       if (dr < 32) {
3658         __ sha512su1(vin0, __ T2D, vin2, v5);
3659       }
3660       __ addv(vi4, __ T2D, vi1, vi3);
3661       __ sha512h2(vi3, __ T2D, vi1, vi0);
3662   }
3663 
3664   // Arguments:
3665   //
3666   // Inputs:
3667   //   c_rarg0   - byte[]  source+offset
3668   //   c_rarg1   - int[]   SHA.state
3669   //   c_rarg2   - int     offset
3670   //   c_rarg3   - int     limit
3671   //
3672   address generate_sha512_implCompress(bool multi_block, const char *name) {
3673     static const uint64_t round_consts[80] = {
3674       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3675       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3676       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3677       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3678       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3679       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3680       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3681       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3682       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3683       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3684       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3685       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3686       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3687       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3688       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3689       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3690       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3691       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3692       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3693       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3694       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3695       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3696       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3697       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3698       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3699       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3700       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3701     };
3702 
3703     __ align(CodeEntryAlignment);
3704     StubCodeMark mark(this, "StubRoutines", name);
3705     address start = __ pc();
3706 
3707     Register buf   = c_rarg0;
3708     Register state = c_rarg1;
3709     Register ofs   = c_rarg2;
3710     Register limit = c_rarg3;
3711 
3712     __ stpd(v8, v9, __ pre(sp, -64));
3713     __ stpd(v10, v11, Address(sp, 16));
3714     __ stpd(v12, v13, Address(sp, 32));
3715     __ stpd(v14, v15, Address(sp, 48));
3716 
3717     Label sha512_loop;
3718 
3719     // load state
3720     __ ld1(v8, v9, v10, v11, __ T2D, state);
3721 
3722     // load first 4 round constants
3723     __ lea(rscratch1, ExternalAddress((address)round_consts));
3724     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3725 
3726     __ BIND(sha512_loop);
3727     // load 128B of data into v12..v19
3728     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3729     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3730     __ rev64(v12, __ T16B, v12);
3731     __ rev64(v13, __ T16B, v13);
3732     __ rev64(v14, __ T16B, v14);
3733     __ rev64(v15, __ T16B, v15);
3734     __ rev64(v16, __ T16B, v16);
3735     __ rev64(v17, __ T16B, v17);
3736     __ rev64(v18, __ T16B, v18);
3737     __ rev64(v19, __ T16B, v19);
3738 
3739     __ mov(rscratch2, rscratch1);
3740 
3741     __ mov(v0, __ T16B, v8);
3742     __ mov(v1, __ T16B, v9);
3743     __ mov(v2, __ T16B, v10);
3744     __ mov(v3, __ T16B, v11);
3745 
3746     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3747     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3748     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3749     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3750     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3751     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3752     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3753     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3754     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3755     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3756     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3757     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3758     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3759     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3760     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3761     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3762     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3763     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3764     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3765     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3766     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3767     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3768     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3769     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3770     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3771     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3772     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3773     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3774     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3775     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3776     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3777     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
3778     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
3779     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
3780     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
3781     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
3782     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
3783     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
3784     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
3785     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
3786 
3787     __ addv(v8, __ T2D, v8, v0);
3788     __ addv(v9, __ T2D, v9, v1);
3789     __ addv(v10, __ T2D, v10, v2);
3790     __ addv(v11, __ T2D, v11, v3);
3791 
3792     if (multi_block) {
3793       __ add(ofs, ofs, 128);
3794       __ cmp(ofs, limit);
3795       __ br(Assembler::LE, sha512_loop);
3796       __ mov(c_rarg0, ofs); // return ofs
3797     }
3798 
3799     __ st1(v8, v9, v10, v11, __ T2D, state);
3800 
3801     __ ldpd(v14, v15, Address(sp, 48));
3802     __ ldpd(v12, v13, Address(sp, 32));
3803     __ ldpd(v10, v11, Address(sp, 16));
3804     __ ldpd(v8, v9, __ post(sp, 64));
3805 
3806     __ ret(lr);
3807 
3808     return start;
3809   }
3810 
3811   // Arguments:
3812   //
3813   // Inputs:
3814   //   c_rarg0   - byte[]  source+offset
3815   //   c_rarg1   - byte[]   SHA.state
3816   //   c_rarg2   - int     digest_length
3817   //   c_rarg3   - int     offset
3818   //   c_rarg4   - int     limit
3819   //
3820   address generate_sha3_implCompress(bool multi_block, const char *name) {
3821     static const uint64_t round_consts[24] = {
3822       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3823       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3824       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3825       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3826       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3827       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3828       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3829       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3830     };
3831 
3832     __ align(CodeEntryAlignment);
3833     StubCodeMark mark(this, "StubRoutines", name);
3834     address start = __ pc();
3835 
3836     Register buf           = c_rarg0;
3837     Register state         = c_rarg1;
3838     Register digest_length = c_rarg2;
3839     Register ofs           = c_rarg3;
3840     Register limit         = c_rarg4;
3841 
3842     Label sha3_loop, rounds24_loop;
    Label sha3_512, sha3_384_or_224;
3844 
3845     __ stpd(v8, v9, __ pre(sp, -64));
3846     __ stpd(v10, v11, Address(sp, 16));
3847     __ stpd(v12, v13, Address(sp, 32));
3848     __ stpd(v14, v15, Address(sp, 48));
3849 
3850     // load state
3851     __ add(rscratch1, state, 32);
3852     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3853     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3854     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3855     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3856     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3857     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3858     __ ld1(v24, __ T1D, rscratch1);
3859 
3860     __ BIND(sha3_loop);
3861 
3862     // 24 keccak rounds
3863     __ movw(rscratch2, 24);
3864 
3865     // load round_constants base
3866     __ lea(rscratch1, ExternalAddress((address) round_consts));
3867 
3868     // load input
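    // Absorb one block into the state. The rate is 200 - 2 * digest_length
    // bytes (72 for SHA3-512, 104 for SHA3-384, 136 for SHA3-256, 144 for
    // SHA3-224), so the number of 8-byte lanes XORed in below depends on
    // digest_length.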
3869     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3870     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3871     __ eor(v0, __ T8B, v0, v25);
3872     __ eor(v1, __ T8B, v1, v26);
3873     __ eor(v2, __ T8B, v2, v27);
3874     __ eor(v3, __ T8B, v3, v28);
3875     __ eor(v4, __ T8B, v4, v29);
3876     __ eor(v5, __ T8B, v5, v30);
3877     __ eor(v6, __ T8B, v6, v31);
3878 
3879     // digest_length == 64, SHA3-512
3880     __ tbnz(digest_length, 6, sha3_512);
3881 
3882     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3883     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3884     __ eor(v7, __ T8B, v7, v25);
3885     __ eor(v8, __ T8B, v8, v26);
3886     __ eor(v9, __ T8B, v9, v27);
3887     __ eor(v10, __ T8B, v10, v28);
3888     __ eor(v11, __ T8B, v11, v29);
3889     __ eor(v12, __ T8B, v12, v30);
3890 
3891     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3892     __ tbnz(digest_length, 4, sha3_384_or_224);
3893 
3894     // SHA3-256
3895     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3896     __ eor(v13, __ T8B, v13, v25);
3897     __ eor(v14, __ T8B, v14, v26);
3898     __ eor(v15, __ T8B, v15, v27);
3899     __ eor(v16, __ T8B, v16, v28);
3900     __ b(rounds24_loop);
3901 
3902     __ BIND(sha3_384_or_224);
    __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA3-384
3904 
3905     // SHA3-224
3906     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3907     __ ld1(v29, __ T8B, __ post(buf, 8));
3908     __ eor(v13, __ T8B, v13, v25);
3909     __ eor(v14, __ T8B, v14, v26);
3910     __ eor(v15, __ T8B, v15, v27);
3911     __ eor(v16, __ T8B, v16, v28);
3912     __ eor(v17, __ T8B, v17, v29);
3913     __ b(rounds24_loop);
3914 
3915     __ BIND(sha3_512);
3916     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3917     __ eor(v7, __ T8B, v7, v25);
3918     __ eor(v8, __ T8B, v8, v26);
3919 
3920     __ BIND(rounds24_loop);
3921     __ subw(rscratch2, rscratch2, 1);
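    // One round of Keccak-f[1600] using the SHA3 extensions: eor3/rax1
    // compute the theta column parities and D values, xar applies theta
    // combined with the rho/pi rotations, bcax implements chi, and the
    // final eor of v0 with the broadcast round constant (v31) is iota.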
3922 
3923     __ eor3(v29, __ T16B, v4, v9, v14);
3924     __ eor3(v26, __ T16B, v1, v6, v11);
3925     __ eor3(v28, __ T16B, v3, v8, v13);
3926     __ eor3(v25, __ T16B, v0, v5, v10);
3927     __ eor3(v27, __ T16B, v2, v7, v12);
3928     __ eor3(v29, __ T16B, v29, v19, v24);
3929     __ eor3(v26, __ T16B, v26, v16, v21);
3930     __ eor3(v28, __ T16B, v28, v18, v23);
3931     __ eor3(v25, __ T16B, v25, v15, v20);
3932     __ eor3(v27, __ T16B, v27, v17, v22);
3933 
3934     __ rax1(v30, __ T2D, v29, v26);
3935     __ rax1(v26, __ T2D, v26, v28);
3936     __ rax1(v28, __ T2D, v28, v25);
3937     __ rax1(v25, __ T2D, v25, v27);
3938     __ rax1(v27, __ T2D, v27, v29);
3939 
3940     __ eor(v0, __ T16B, v0, v30);
3941     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3942     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3943     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3944     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3945     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3946     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3947     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3948     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3949     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3950     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3951     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3952     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3953     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3954     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3955     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3956     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3957     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3958     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3959     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3960     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3961     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3962     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3963     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3964     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3965 
3966     __ bcax(v20, __ T16B, v31, v22, v8);
3967     __ bcax(v21, __ T16B, v8,  v23, v22);
3968     __ bcax(v22, __ T16B, v22, v24, v23);
3969     __ bcax(v23, __ T16B, v23, v31, v24);
3970     __ bcax(v24, __ T16B, v24, v8,  v31);
3971 
3972     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3973 
3974     __ bcax(v17, __ T16B, v25, v19, v3);
3975     __ bcax(v18, __ T16B, v3,  v15, v19);
3976     __ bcax(v19, __ T16B, v19, v16, v15);
3977     __ bcax(v15, __ T16B, v15, v25, v16);
3978     __ bcax(v16, __ T16B, v16, v3,  v25);
3979 
3980     __ bcax(v10, __ T16B, v29, v12, v26);
3981     __ bcax(v11, __ T16B, v26, v13, v12);
3982     __ bcax(v12, __ T16B, v12, v14, v13);
3983     __ bcax(v13, __ T16B, v13, v29, v14);
3984     __ bcax(v14, __ T16B, v14, v26, v29);
3985 
3986     __ bcax(v7, __ T16B, v30, v9,  v4);
3987     __ bcax(v8, __ T16B, v4,  v5,  v9);
3988     __ bcax(v9, __ T16B, v9,  v6,  v5);
3989     __ bcax(v5, __ T16B, v5,  v30, v6);
3990     __ bcax(v6, __ T16B, v6,  v4,  v30);
3991 
3992     __ bcax(v3, __ T16B, v27, v0,  v28);
3993     __ bcax(v4, __ T16B, v28, v1,  v0);
3994     __ bcax(v0, __ T16B, v0,  v2,  v1);
3995     __ bcax(v1, __ T16B, v1,  v27, v2);
3996     __ bcax(v2, __ T16B, v2,  v28, v27);
3997 
3998     __ eor(v0, __ T16B, v0, v31);
3999 
4000     __ cbnzw(rscratch2, rounds24_loop);
4001 
4002     if (multi_block) {
4003       // block_size =  200 - 2 * digest_length, ofs += block_size
4004       __ add(ofs, ofs, 200);
4005       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
4006 
4007       __ cmp(ofs, limit);
4008       __ br(Assembler::LE, sha3_loop);
4009       __ mov(c_rarg0, ofs); // return ofs
4010     }
4011 
4012     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4013     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4014     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4015     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4016     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4017     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4018     __ st1(v24, __ T1D, state);
4019 
4020     __ ldpd(v14, v15, Address(sp, 48));
4021     __ ldpd(v12, v13, Address(sp, 32));
4022     __ ldpd(v10, v11, Address(sp, 16));
4023     __ ldpd(v8, v9, __ post(sp, 64));
4024 
4025     __ ret(lr);
4026 
4027     return start;
4028   }
4029 
4030   /**
4031    *  Arguments:
4032    *
4033    * Inputs:
4034    *   c_rarg0   - int crc
4035    *   c_rarg1   - byte* buf
4036    *   c_rarg2   - int length
4037    *
4038    * Output:
   *       r0   - int crc result
4040    */
4041   address generate_updateBytesCRC32() {
4042     assert(UseCRC32Intrinsics, "what are we doing here?");
4043 
4044     __ align(CodeEntryAlignment);
4045     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4046 
4047     address start = __ pc();
4048 
4049     const Register crc   = c_rarg0;  // crc
4050     const Register buf   = c_rarg1;  // source java byte array address
4051     const Register len   = c_rarg2;  // length
4052     const Register table0 = c_rarg3; // crc_table address
4053     const Register table1 = c_rarg4;
4054     const Register table2 = c_rarg5;
4055     const Register table3 = c_rarg6;
4056     const Register tmp3 = c_rarg7;
4057 
4058     BLOCK_COMMENT("Entry:");
4059     __ enter(); // required for proper stackwalking of RuntimeStub frame
4060 
4061     __ kernel_crc32(crc, buf, len,
4062               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4063 
4064     __ leave(); // required for proper stackwalking of RuntimeStub frame
4065     __ ret(lr);
4066 
4067     return start;
4068   }
4069 
4070   /**
4071    *  Arguments:
4072    *
4073    * Inputs:
4074    *   c_rarg0   - int crc
4075    *   c_rarg1   - byte* buf
4076    *   c_rarg2   - int length
4077    *   c_rarg3   - int* table
4078    *
4079    * Output:
4080    *       r0   - int crc result
4081    */
4082   address generate_updateBytesCRC32C() {
4083     assert(UseCRC32CIntrinsics, "what are we doing here?");
4084 
4085     __ align(CodeEntryAlignment);
4086     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4087 
4088     address start = __ pc();
4089 
4090     const Register crc   = c_rarg0;  // crc
4091     const Register buf   = c_rarg1;  // source java byte array address
4092     const Register len   = c_rarg2;  // length
4093     const Register table0 = c_rarg3; // crc_table address
4094     const Register table1 = c_rarg4;
4095     const Register table2 = c_rarg5;
4096     const Register table3 = c_rarg6;
4097     const Register tmp3 = c_rarg7;
4098 
4099     BLOCK_COMMENT("Entry:");
4100     __ enter(); // required for proper stackwalking of RuntimeStub frame
4101 
4102     __ kernel_crc32c(crc, buf, len,
4103               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4104 
4105     __ leave(); // required for proper stackwalking of RuntimeStub frame
4106     __ ret(lr);
4107 
4108     return start;
4109   }
4110 
4111   /***
4112    *  Arguments:
4113    *
4114    *  Inputs:
4115    *   c_rarg0   - int   adler
4116    *   c_rarg1   - byte* buff
4117    *   c_rarg2   - int   len
4118    *
4119    * Output:
4120    *   c_rarg0   - int adler result
4121    */
4122   address generate_updateBytesAdler32() {
4123     __ align(CodeEntryAlignment);
4124     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4125     address start = __ pc();
4126 
4127     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4128 
4129     // Aliases
4130     Register adler  = c_rarg0;
4131     Register s1     = c_rarg0;
4132     Register s2     = c_rarg3;
4133     Register buff   = c_rarg1;
4134     Register len    = c_rarg2;
4135     Register nmax  = r4;
4136     Register base  = r5;
4137     Register count = r6;
4138     Register temp0 = rscratch1;
4139     Register temp1 = rscratch2;
4140     FloatRegister vbytes = v0;
4141     FloatRegister vs1acc = v1;
4142     FloatRegister vs2acc = v2;
4143     FloatRegister vtable = v3;
4144 
4145     // Max number of bytes we can process before having to take the mod
4146     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4147     uint64_t BASE = 0xfff1;
4148     uint64_t NMAX = 0x15B0;
4149 
4150     __ mov(base, BASE);
4151     __ mov(nmax, NMAX);
4152 
4153     // Load accumulation coefficients for the upper 16 bits
4154     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4155     __ ld1(vtable, __ T16B, Address(temp0));
4156 
4157     // s1 is initialized to the lower 16 bits of adler
4158     // s2 is initialized to the upper 16 bits of adler
4159     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4160     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4161 
4162     // The pipelined loop needs at least 16 elements for 1 iteration
4163     // It does check this, but it is more effective to skip to the cleanup loop
4164     __ cmp(len, (u1)16);
4165     __ br(Assembler::HS, L_nmax);
4166     __ cbz(len, L_combine);
4167 
4168     __ bind(L_simple_by1_loop);
4169     __ ldrb(temp0, Address(__ post(buff, 1)));
4170     __ add(s1, s1, temp0);
4171     __ add(s2, s2, s1);
4172     __ subs(len, len, 1);
4173     __ br(Assembler::HI, L_simple_by1_loop);
4174 
4175     // s1 = s1 % BASE
4176     __ subs(temp0, s1, base);
4177     __ csel(s1, temp0, s1, Assembler::HS);
4178 
4179     // s2 = s2 % BASE
4180     __ lsr(temp0, s2, 16);
4181     __ lsl(temp1, temp0, 4);
4182     __ sub(temp1, temp1, temp0);
4183     __ add(s2, temp1, s2, ext::uxth);
4184 
4185     __ subs(temp0, s2, base);
4186     __ csel(s2, temp0, s2, Assembler::HS);
4187 
4188     __ b(L_combine);
4189 
4190     __ bind(L_nmax);
4191     __ subs(len, len, nmax);
4192     __ sub(count, nmax, 16);
4193     __ br(Assembler::LO, L_by16);
4194 
4195     __ bind(L_nmax_loop);
4196 
4197     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4198                                       vbytes, vs1acc, vs2acc, vtable);
4199 
4200     __ subs(count, count, 16);
4201     __ br(Assembler::HS, L_nmax_loop);
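    // Reduce s1 and s2 modulo BASE without a division: 2^16 == 15 (mod
    // 65521), so fold the upper halfword back in as
    //   s = (s & 0xffff) + (s >> 16) * 15
    // twice, then finish with a conditional subtract of BASE.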
4202 
4203     // s1 = s1 % BASE
4204     __ lsr(temp0, s1, 16);
4205     __ lsl(temp1, temp0, 4);
4206     __ sub(temp1, temp1, temp0);
4207     __ add(temp1, temp1, s1, ext::uxth);
4208 
4209     __ lsr(temp0, temp1, 16);
4210     __ lsl(s1, temp0, 4);
4211     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
4213 
4214     __ subs(temp0, s1, base);
4215     __ csel(s1, temp0, s1, Assembler::HS);
4216 
4217     // s2 = s2 % BASE
4218     __ lsr(temp0, s2, 16);
4219     __ lsl(temp1, temp0, 4);
4220     __ sub(temp1, temp1, temp0);
4221     __ add(temp1, temp1, s2, ext::uxth);
4222 
4223     __ lsr(temp0, temp1, 16);
4224     __ lsl(s2, temp0, 4);
4225     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
4227 
4228     __ subs(temp0, s2, base);
4229     __ csel(s2, temp0, s2, Assembler::HS);
4230 
4231     __ subs(len, len, nmax);
4232     __ sub(count, nmax, 16);
4233     __ br(Assembler::HS, L_nmax_loop);
4234 
4235     __ bind(L_by16);
4236     __ adds(len, len, count);
4237     __ br(Assembler::LO, L_by1);
4238 
4239     __ bind(L_by16_loop);
4240 
4241     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4242                                       vbytes, vs1acc, vs2acc, vtable);
4243 
4244     __ subs(len, len, 16);
4245     __ br(Assembler::HS, L_by16_loop);
4246 
4247     __ bind(L_by1);
4248     __ adds(len, len, 15);
4249     __ br(Assembler::LO, L_do_mod);
4250 
4251     __ bind(L_by1_loop);
4252     __ ldrb(temp0, Address(__ post(buff, 1)));
4253     __ add(s1, temp0, s1);
4254     __ add(s2, s2, s1);
4255     __ subs(len, len, 1);
4256     __ br(Assembler::HS, L_by1_loop);
4257 
4258     __ bind(L_do_mod);
4259     // s1 = s1 % BASE
4260     __ lsr(temp0, s1, 16);
4261     __ lsl(temp1, temp0, 4);
4262     __ sub(temp1, temp1, temp0);
4263     __ add(temp1, temp1, s1, ext::uxth);
4264 
4265     __ lsr(temp0, temp1, 16);
4266     __ lsl(s1, temp0, 4);
4267     __ sub(s1, s1, temp0);
4268     __ add(s1, s1, temp1, ext::uxth);
4269 
4270     __ subs(temp0, s1, base);
4271     __ csel(s1, temp0, s1, Assembler::HS);
4272 
4273     // s2 = s2 % BASE
4274     __ lsr(temp0, s2, 16);
4275     __ lsl(temp1, temp0, 4);
4276     __ sub(temp1, temp1, temp0);
4277     __ add(temp1, temp1, s2, ext::uxth);
4278 
4279     __ lsr(temp0, temp1, 16);
4280     __ lsl(s2, temp0, 4);
4281     __ sub(s2, s2, temp0);
4282     __ add(s2, s2, temp1, ext::uxth);
4283 
4284     __ subs(temp0, s2, base);
4285     __ csel(s2, temp0, s2, Assembler::HS);
4286 
4287     // Combine lower bits and higher bits
4288     __ bind(L_combine);
4289     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4290 
4291     __ ret(lr);
4292 
4293     return start;
4294   }
4295 
4296   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4297           Register temp0, Register temp1, FloatRegister vbytes,
4298           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4299     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4300     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4301     // In non-vectorized code, we update s1 and s2 as:
4302     //   s1 <- s1 + b1
4303     //   s2 <- s2 + s1
4304     //   s1 <- s1 + b2
4305     //   s2 <- s2 + s1
4306     //   ...
4307     //   s1 <- s1 + b16
4308     //   s2 <- s2 + s1
4309     // Putting above assignments together, we have:
4310     //   s1_new = s1 + b1 + b2 + ... + b16
4311     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4312     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4313     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
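    // Rough scalar reference for one 16-byte block (a sketch for clarity, not the
    // code emitted below; 'b' stands for the 16 loaded bytes):
    //   uint32_t s1_new = s1, s2_new = s2 + 16 * s1;
    //   for (int i = 0; i < 16; i++) {
    //     s1_new += b[i];             // running byte sum
    //     s2_new += (16 - i) * b[i];  // dot product with (16, 15, ..., 1)
    //   }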
4314     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4315 
4316     // s2 = s2 + s1 * 16
4317     __ add(s2, s2, s1, Assembler::LSL, 4);
4318 
4319     // vs1acc = b1 + b2 + b3 + ... + b16
4320     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4321     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4322     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4323     __ uaddlv(vs1acc, __ T16B, vbytes);
4324     __ uaddlv(vs2acc, __ T8H, vs2acc);
4325 
4326     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4327     __ fmovd(temp0, vs1acc);
4328     __ fmovd(temp1, vs2acc);
4329     __ add(s1, s1, temp0);
4330     __ add(s2, s2, temp1);
4331   }
4332 
4333   /**
4334    *  Arguments:
4335    *
4336    *  Input:
4337    *    c_rarg0   - x address
4338    *    c_rarg1   - x length
4339    *    c_rarg2   - y address
4340    *    c_rarg3   - y length
4341    *    c_rarg4   - z address
4342    *    c_rarg5   - z length
4343    */
4344   address generate_multiplyToLen() {
4345     __ align(CodeEntryAlignment);
4346     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4347 
4348     address start = __ pc();
4349     const Register x     = r0;
4350     const Register xlen  = r1;
4351     const Register y     = r2;
4352     const Register ylen  = r3;
4353     const Register z     = r4;
4354     const Register zlen  = r5;
4355 
4356     const Register tmp1  = r10;
4357     const Register tmp2  = r11;
4358     const Register tmp3  = r12;
4359     const Register tmp4  = r13;
4360     const Register tmp5  = r14;
4361     const Register tmp6  = r15;
4362     const Register tmp7  = r16;
4363 
4364     BLOCK_COMMENT("Entry:");
4365     __ enter(); // required for proper stackwalking of RuntimeStub frame
4366     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4367     __ leave(); // required for proper stackwalking of RuntimeStub frame
4368     __ ret(lr);
4369 
4370     return start;
4371   }
4372 
4373   address generate_squareToLen() {
4374     // The squareToLen algorithm for sizes 1..127, as described in the Java code, runs
4375     // faster than multiply_to_len on some CPUs and slower on others, but
4376     // multiply_to_len shows slightly better results overall, so it is reused here
4377     __ align(CodeEntryAlignment);
4378     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4379     address start = __ pc();
4380 
4381     const Register x     = r0;
4382     const Register xlen  = r1;
4383     const Register z     = r2;
4384     const Register zlen  = r3;
4385     const Register y     = r4; // == x
4386     const Register ylen  = r5; // == xlen
4387 
4388     const Register tmp1  = r10;
4389     const Register tmp2  = r11;
4390     const Register tmp3  = r12;
4391     const Register tmp4  = r13;
4392     const Register tmp5  = r14;
4393     const Register tmp6  = r15;
4394     const Register tmp7  = r16;
4395 
4396     RegSet spilled_regs = RegSet::of(y, ylen);
4397     BLOCK_COMMENT("Entry:");
4398     __ enter();
4399     __ push(spilled_regs, sp);
4400     __ mov(y, x);
4401     __ mov(ylen, xlen);
4402     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4403     __ pop(spilled_regs, sp);
4404     __ leave();
4405     __ ret(lr);
4406     return start;
4407   }
4408 
4409   address generate_mulAdd() {
4410     __ align(CodeEntryAlignment);
4411     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4412 
4413     address start = __ pc();
4414 
4415     const Register out     = r0;
4416     const Register in      = r1;
4417     const Register offset  = r2;
4418     const Register len     = r3;
4419     const Register k       = r4;
4420 
4421     BLOCK_COMMENT("Entry:");
4422     __ enter();
4423     __ mul_add(out, in, offset, len, k);
4424     __ leave();
4425     __ ret(lr);
4426 
4427     return start;
4428   }
4429 
4430   // Arguments:
4431   //
4432   // Input:
4433   //   c_rarg0   - newArr address
4434   //   c_rarg1   - oldArr address
4435   //   c_rarg2   - newIdx
4436   //   c_rarg3   - shiftCount
4437   //   c_rarg4   - numIter
4438   //
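  // Conceptually (an illustrative scalar sketch, not the Java worker itself),
  // each produced word combines two adjacent source words:
  //   newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)            // unsigned >>
  //                      | (oldArr[i]     << (32 - shiftCount));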
4439   address generate_bigIntegerRightShift() {
4440     __ align(CodeEntryAlignment);
4441     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4442     address start = __ pc();
4443 
4444     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4445 
4446     Register newArr        = c_rarg0;
4447     Register oldArr        = c_rarg1;
4448     Register newIdx        = c_rarg2;
4449     Register shiftCount    = c_rarg3;
4450     Register numIter       = c_rarg4;
4451     Register idx           = numIter;
4452 
4453     Register newArrCur     = rscratch1;
4454     Register shiftRevCount = rscratch2;
4455     Register oldArrCur     = r13;
4456     Register oldArrNext    = r14;
4457 
4458     FloatRegister oldElem0        = v0;
4459     FloatRegister oldElem1        = v1;
4460     FloatRegister newElem         = v2;
4461     FloatRegister shiftVCount     = v3;
4462     FloatRegister shiftVRevCount  = v4;
4463 
4464     __ cbz(idx, Exit);
4465 
4466     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4467 
4468     // left shift count
4469     __ movw(shiftRevCount, 32);
4470     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4471 
4472     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4473     __ cmp(numIter, (u1)4);
4474     __ br(Assembler::LT, ShiftThree);
4475 
4476     __ dup(shiftVCount,    __ T4S, shiftCount);
4477     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
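    // Note: NEON USHL shifts right when the per-lane shift amount is negative, so
    // negating shiftVCount below turns it into the logical right shift by shiftCount.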
4478     __ negr(shiftVCount,   __ T4S, shiftVCount);
4479 
4480     __ BIND(ShiftSIMDLoop);
4481 
4482     // Calculate the load addresses
4483     __ sub(idx, idx, 4);
4484     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4485     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4486     __ add(oldArrCur,  oldArrNext, 4);
4487 
4488     // Load 4 words and process
4489     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4490     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4491     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4492     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4493     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4494     __ st1(newElem,   __ T4S,  Address(newArrCur));
4495 
4496     __ cmp(idx, (u1)4);
4497     __ br(Assembler::LT, ShiftTwoLoop);
4498     __ b(ShiftSIMDLoop);
4499 
4500     __ BIND(ShiftTwoLoop);
4501     __ cbz(idx, Exit);
4502     __ cmp(idx, (u1)1);
4503     __ br(Assembler::EQ, ShiftOne);
4504 
4505     // Calculate the load addresses
4506     __ sub(idx, idx, 2);
4507     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4508     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4509     __ add(oldArrCur,  oldArrNext, 4);
4510 
4511     // Load 2 words and process
4512     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4513     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4514     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4515     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4516     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4517     __ st1(newElem,   __ T2S, Address(newArrCur));
4518     __ b(ShiftTwoLoop);
4519 
4520     __ BIND(ShiftThree);
4521     __ tbz(idx, 1, ShiftOne);
4522     __ tbz(idx, 0, ShiftTwo);
4523     __ ldrw(r10,  Address(oldArr, 12));
4524     __ ldrw(r11,  Address(oldArr, 8));
4525     __ lsrvw(r10, r10, shiftCount);
4526     __ lslvw(r11, r11, shiftRevCount);
4527     __ orrw(r12,  r10, r11);
4528     __ strw(r12,  Address(newArr, 8));
4529 
4530     __ BIND(ShiftTwo);
4531     __ ldrw(r10,  Address(oldArr, 8));
4532     __ ldrw(r11,  Address(oldArr, 4));
4533     __ lsrvw(r10, r10, shiftCount);
4534     __ lslvw(r11, r11, shiftRevCount);
4535     __ orrw(r12,  r10, r11);
4536     __ strw(r12,  Address(newArr, 4));
4537 
4538     __ BIND(ShiftOne);
4539     __ ldrw(r10,  Address(oldArr, 4));
4540     __ ldrw(r11,  Address(oldArr));
4541     __ lsrvw(r10, r10, shiftCount);
4542     __ lslvw(r11, r11, shiftRevCount);
4543     __ orrw(r12,  r10, r11);
4544     __ strw(r12,  Address(newArr));
4545 
4546     __ BIND(Exit);
4547     __ ret(lr);
4548 
4549     return start;
4550   }
4551 
4552   // Arguments:
4553   //
4554   // Input:
4555   //   c_rarg0   - newArr address
4556   //   c_rarg1   - oldArr address
4557   //   c_rarg2   - newIdx
4558   //   c_rarg3   - shiftCount
4559   //   c_rarg4   - numIter
4560   //
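  // Conceptually (an illustrative scalar sketch, not the Java worker itself),
  // the left shift pairs each word with its higher-index neighbour:
  //   newArr[newIdx + i] = (oldArr[i]     << shiftCount)
  //                      | (oldArr[i + 1] >> (32 - shiftCount));    // unsigned >>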
4561   address generate_bigIntegerLeftShift() {
4562     __ align(CodeEntryAlignment);
4563     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4564     address start = __ pc();
4565 
4566     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4567 
4568     Register newArr        = c_rarg0;
4569     Register oldArr        = c_rarg1;
4570     Register newIdx        = c_rarg2;
4571     Register shiftCount    = c_rarg3;
4572     Register numIter       = c_rarg4;
4573 
4574     Register shiftRevCount = rscratch1;
4575     Register oldArrNext    = rscratch2;
4576 
4577     FloatRegister oldElem0        = v0;
4578     FloatRegister oldElem1        = v1;
4579     FloatRegister newElem         = v2;
4580     FloatRegister shiftVCount     = v3;
4581     FloatRegister shiftVRevCount  = v4;
4582 
4583     __ cbz(numIter, Exit);
4584 
4585     __ add(oldArrNext, oldArr, 4);
4586     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4587 
4588     // right shift count
4589     __ movw(shiftRevCount, 32);
4590     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4591 
4592     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
4593     __ cmp(numIter, (u1)4);
4594     __ br(Assembler::LT, ShiftThree);
4595 
4596     __ dup(shiftVCount,     __ T4S, shiftCount);
4597     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4598     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4599 
4600     __ BIND(ShiftSIMDLoop);
4601 
4602     // load 4 words and process
4603     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4604     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4605     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4606     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4607     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4608     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4609     __ sub(numIter,   numIter, 4);
4610 
4611     __ cmp(numIter, (u1)4);
4612     __ br(Assembler::LT, ShiftTwoLoop);
4613     __ b(ShiftSIMDLoop);
4614 
4615     __ BIND(ShiftTwoLoop);
4616     __ cbz(numIter, Exit);
4617     __ cmp(numIter, (u1)1);
4618     __ br(Assembler::EQ, ShiftOne);
4619 
4620     // load 2 words and process
4621     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4622     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4623     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4624     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4625     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4626     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4627     __ sub(numIter,   numIter, 2);
4628     __ b(ShiftTwoLoop);
4629 
4630     __ BIND(ShiftThree);
4631     __ ldrw(r10,  __ post(oldArr, 4));
4632     __ ldrw(r11,  __ post(oldArrNext, 4));
4633     __ lslvw(r10, r10, shiftCount);
4634     __ lsrvw(r11, r11, shiftRevCount);
4635     __ orrw(r12,  r10, r11);
4636     __ strw(r12,  __ post(newArr, 4));
4637     __ tbz(numIter, 1, Exit);
4638     __ tbz(numIter, 0, ShiftOne);
4639 
4640     __ BIND(ShiftTwo);
4641     __ ldrw(r10,  __ post(oldArr, 4));
4642     __ ldrw(r11,  __ post(oldArrNext, 4));
4643     __ lslvw(r10, r10, shiftCount);
4644     __ lsrvw(r11, r11, shiftRevCount);
4645     __ orrw(r12,  r10, r11);
4646     __ strw(r12,  __ post(newArr, 4));
4647 
4648     __ BIND(ShiftOne);
4649     __ ldrw(r10,  Address(oldArr));
4650     __ ldrw(r11,  Address(oldArrNext));
4651     __ lslvw(r10, r10, shiftCount);
4652     __ lsrvw(r11, r11, shiftRevCount);
4653     __ orrw(r12,  r10, r11);
4654     __ strw(r12,  Address(newArr));
4655 
4656     __ BIND(Exit);
4657     __ ret(lr);
4658 
4659     return start;
4660   }
4661 
4662   address generate_count_positives(address &count_positives_long) {
4663     const u1 large_loop_size = 64;
4664     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
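    // A byte is negative exactly when its top bit is set, so OR-ing several words
    // together and testing the result against UPPER_BIT_MASK detects whether any
    // byte in the block has its sign bit set without examining bytes individually.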
4665     int dcache_line = VM_Version::dcache_line_size();
4666 
4667     Register ary1 = r1, len = r2, result = r0;
4668 
4669     __ align(CodeEntryAlignment);
4670 
4671     StubCodeMark mark(this, "StubRoutines", "count_positives");
4672 
4673     address entry = __ pc();
4674 
4675     __ enter();
4676     // precondition: a copy of len is already in result
4677     // __ mov(result, len);
4678 
4679   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
4680         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
4681 
4682   __ cmp(len, (u1)15);
4683   __ br(Assembler::GT, LEN_OVER_15);
4684   // The only case where execution falls into this code is when the pointer is near
4685   // the end of a memory page and we have to avoid reading past it into the next page
4686   __ add(ary1, ary1, len);
4687   __ subs(len, len, 8);
4688   __ br(Assembler::GT, LEN_OVER_8);
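  // The 8-byte load below ends exactly at the last byte of the range, so when fewer
  // than 8 bytes are in range it also picks up a few preceding bytes; those land in
  // the low-order lanes (little-endian load) and the variable right shift discards
  // them before the sign-bit test.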
4689   __ ldr(rscratch2, Address(ary1, -8));
4690   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
4691   __ lsrv(rscratch2, rscratch2, rscratch1);
4692   __ tst(rscratch2, UPPER_BIT_MASK);
4693   __ csel(result, zr, result, Assembler::NE);
4694   __ leave();
4695   __ ret(lr);
4696   __ bind(LEN_OVER_8);
4697   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
4698   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load completes
4699   __ tst(rscratch2, UPPER_BIT_MASK);
4700   __ br(Assembler::NE, RET_NO_POP);
4701   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
4702   __ lsrv(rscratch1, rscratch1, rscratch2);
4703   __ tst(rscratch1, UPPER_BIT_MASK);
4704   __ bind(RET_NO_POP);
4705   __ csel(result, zr, result, Assembler::NE);
4706   __ leave();
4707   __ ret(lr);
4708 
4709   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
4710   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
4711 
4712   count_positives_long = __ pc(); // 2nd entry point
4713 
4714   __ enter();
4715 
4716   __ bind(LEN_OVER_15);
4717     __ push(spilled_regs, sp);
4718     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
4719     __ cbz(rscratch2, ALIGNED);
4720     __ ldp(tmp6, tmp1, Address(ary1));
4721     __ mov(tmp5, 16);
4722     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
4723     __ add(ary1, ary1, rscratch1);
4724     __ orr(tmp6, tmp6, tmp1);
4725     __ tst(tmp6, UPPER_BIT_MASK);
4726     __ br(Assembler::NE, RET_ADJUST);
4727     __ sub(len, len, rscratch1);
4728 
4729   __ bind(ALIGNED);
4730     __ cmp(len, large_loop_size);
4731     __ br(Assembler::LT, CHECK_16);
4732     // Perform a 16-byte load in the pre-loop as an early return, to handle the case
4733     // where an initially aligned large array has negative values in its starting bytes;
4734     // otherwise LARGE_LOOP would do 4 reads instead of 1 in the worst case, which is
4735     // slower. Cases with negative bytes further ahead are not affected much; in
4736     // fact they become faster thanks to the early loads and the fewer instructions
4737     // and branches in LARGE_LOOP.
4738     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
4739     __ sub(len, len, 16);
4740     __ orr(tmp6, tmp6, tmp1);
4741     __ tst(tmp6, UPPER_BIT_MASK);
4742     __ br(Assembler::NE, RET_ADJUST_16);
4743     __ cmp(len, large_loop_size);
4744     __ br(Assembler::LT, CHECK_16);
4745 
4746     if (SoftwarePrefetchHintDistance >= 0
4747         && SoftwarePrefetchHintDistance >= dcache_line) {
4748       // initial prefetch
4749       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
4750     }
4751   __ bind(LARGE_LOOP);
4752     if (SoftwarePrefetchHintDistance >= 0) {
4753       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
4754     }
4755     // Issue the load instructions first, since that can save a few CPU/memory cycles.
4756     // Also, instead of 4 triples of "orr(...), addr(...); cbnz(...)" (one per ldp), it is
4757     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
4758     // instructions per iteration and has fewer branches; the downside is that this
4759     // disables the early return, so all 64 bytes are loaded and checked every time.
4760     __ ldp(tmp2, tmp3, Address(ary1));
4761     __ ldp(tmp4, tmp5, Address(ary1, 16));
4762     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
4763     __ ldp(tmp6, tmp1, Address(ary1, 48));
4764     __ add(ary1, ary1, large_loop_size);
4765     __ sub(len, len, large_loop_size);
4766     __ orr(tmp2, tmp2, tmp3);
4767     __ orr(tmp4, tmp4, tmp5);
4768     __ orr(rscratch1, rscratch1, rscratch2);
4769     __ orr(tmp6, tmp6, tmp1);
4770     __ orr(tmp2, tmp2, tmp4);
4771     __ orr(rscratch1, rscratch1, tmp6);
4772     __ orr(tmp2, tmp2, rscratch1);
4773     __ tst(tmp2, UPPER_BIT_MASK);
4774     __ br(Assembler::NE, RET_ADJUST_LONG);
4775     __ cmp(len, large_loop_size);
4776     __ br(Assembler::GE, LARGE_LOOP);
4777 
4778   __ bind(CHECK_16); // small 16-byte load pre-loop
4779     __ cmp(len, (u1)16);
4780     __ br(Assembler::LT, POST_LOOP16);
4781 
4782   __ bind(LOOP16); // small 16-byte load loop
4783     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
4784     __ sub(len, len, 16);
4785     __ orr(tmp2, tmp2, tmp3);
4786     __ tst(tmp2, UPPER_BIT_MASK);
4787     __ br(Assembler::NE, RET_ADJUST_16);
4788     __ cmp(len, (u1)16);
4789     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
4790 
4791   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
4792     __ cmp(len, (u1)8);
4793     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
4794     __ ldr(tmp3, Address(__ post(ary1, 8)));
4795     __ tst(tmp3, UPPER_BIT_MASK);
4796     __ br(Assembler::NE, RET_ADJUST);
4797     __ sub(len, len, 8);
4798 
4799   __ bind(POST_LOOP16_LOAD_TAIL);
4800     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
4801     __ ldr(tmp1, Address(ary1));
4802     __ mov(tmp2, 64);
4803     __ sub(tmp4, tmp2, len, __ LSL, 3);
4804     __ lslv(tmp1, tmp1, tmp4);
4805     __ tst(tmp1, UPPER_BIT_MASK);
4806     __ br(Assembler::NE, RET_ADJUST);
4807     // Fallthrough
4808 
4809   __ bind(RET_LEN);
4810     __ pop(spilled_regs, sp);
4811     __ leave();
4812     __ ret(lr);
4813 
4814     // The difference (result - len) is the count of bytes that are guaranteed to
4815     // be positive
4816 
4817   __ bind(RET_ADJUST_LONG);
4818     __ add(len, len, (u1)(large_loop_size - 16));
4819   __ bind(RET_ADJUST_16);
4820     __ add(len, len, 16);
4821   __ bind(RET_ADJUST);
4822     __ pop(spilled_regs, sp);
4823     __ leave();
4824     __ sub(result, result, len);
4825     __ ret(lr);
4826 
4827     return entry;
4828   }
4829 
4830   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
4831         bool usePrefetch, Label &NOT_EQUAL) {
4832     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4833         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4834         tmp7 = r12, tmp8 = r13;
4835     Label LOOP;
4836 
4837     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4838     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4839     __ bind(LOOP);
4840     if (usePrefetch) {
4841       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4842       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4843     }
4844     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4845     __ eor(tmp1, tmp1, tmp2);
4846     __ eor(tmp3, tmp3, tmp4);
4847     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4848     __ orr(tmp1, tmp1, tmp3);
4849     __ cbnz(tmp1, NOT_EQUAL);
4850     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4851     __ eor(tmp5, tmp5, tmp6);
4852     __ eor(tmp7, tmp7, tmp8);
4853     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4854     __ orr(tmp5, tmp5, tmp7);
4855     __ cbnz(tmp5, NOT_EQUAL);
4856     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
4857     __ eor(tmp1, tmp1, tmp2);
4858     __ eor(tmp3, tmp3, tmp4);
4859     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
4860     __ orr(tmp1, tmp1, tmp3);
4861     __ cbnz(tmp1, NOT_EQUAL);
4862     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
4863     __ eor(tmp5, tmp5, tmp6);
4864     __ sub(cnt1, cnt1, 8 * wordSize);
4865     __ eor(tmp7, tmp7, tmp8);
4866     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
4867     // tmp6 is not used. MacroAssembler::subs is used here (rather than
4868     // cmp) because subs allows an unrestricted range of immediate operands.
4869     __ subs(tmp6, cnt1, loopThreshold);
4870     __ orr(tmp5, tmp5, tmp7);
4871     __ cbnz(tmp5, NOT_EQUAL);
4872     __ br(__ GE, LOOP);
4873     // post-loop
4874     __ eor(tmp1, tmp1, tmp2);
4875     __ eor(tmp3, tmp3, tmp4);
4876     __ orr(tmp1, tmp1, tmp3);
4877     __ sub(cnt1, cnt1, 2 * wordSize);
4878     __ cbnz(tmp1, NOT_EQUAL);
4879   }
4880 
4881   void generate_large_array_equals_loop_simd(int loopThreshold,
4882         bool usePrefetch, Label &NOT_EQUAL) {
4883     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4884         tmp2 = rscratch2;
4885     Label LOOP;
4886 
4887     __ bind(LOOP);
4888     if (usePrefetch) {
4889       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
4890       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
4891     }
4892     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
4893     __ sub(cnt1, cnt1, 8 * wordSize);
4894     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
4895     __ subs(tmp1, cnt1, loopThreshold);
4896     __ eor(v0, __ T16B, v0, v4);
4897     __ eor(v1, __ T16B, v1, v5);
4898     __ eor(v2, __ T16B, v2, v6);
4899     __ eor(v3, __ T16B, v3, v7);
4900     __ orr(v0, __ T16B, v0, v1);
4901     __ orr(v1, __ T16B, v2, v3);
4902     __ orr(v0, __ T16B, v0, v1);
4903     __ umov(tmp1, v0, __ D, 0);
4904     __ umov(tmp2, v0, __ D, 1);
4905     __ orr(tmp1, tmp1, tmp2);
4906     __ cbnz(tmp1, NOT_EQUAL);
4907     __ br(__ GE, LOOP);
4908   }
4909 
4910   // a1 = r1 - array1 address
4911   // a2 = r2 - array2 address
4912   // result = r0 - return value. Already contains "false"
4913   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
4914   // r3-r5 are reserved temporary registers
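  // The loops below test equality via XOR: two words are equal iff their XOR is
  // zero, so several XOR results can be OR-ed together and checked with a single
  // cbnz, trading per-word branches for a little extra ALU work.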
4915   address generate_large_array_equals() {
4916     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
4917         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
4918         tmp7 = r12, tmp8 = r13;
4919     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
4920         SMALL_LOOP, POST_LOOP;
4921     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
4922     // loop threshold chosen so that at least 32 of the prefetched bytes are used
4923     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
4924     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
4925     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
4926     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
4927         tmp5, tmp6, tmp7, tmp8);
4928 
4929     __ align(CodeEntryAlignment);
4930 
4931     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
4932 
4933     address entry = __ pc();
4934     __ enter();
4935     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
4936     // also advance pointers to use post-increment instead of pre-increment
4937     __ add(a1, a1, wordSize);
4938     __ add(a2, a2, wordSize);
4939     if (AvoidUnalignedAccesses) {
4940       // Both implementations (SIMD and non-SIMD) use relatively large load
4941       // instructions (ld1/ldp), which incur a large penalty (up to 2x execution time)
4942       // on some CPUs when the address is not at least 16-byte aligned.
4943       // Arrays are currently 8-byte aligned, so we can perform an additional 8-byte
4944       // load if needed to make at least the first address 16-byte aligned.
4945       Label ALIGNED16;
4946       __ tbz(a1, 3, ALIGNED16);
4947       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4948       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4949       __ sub(cnt1, cnt1, wordSize);
4950       __ eor(tmp1, tmp1, tmp2);
4951       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
4952       __ bind(ALIGNED16);
4953     }
4954     if (UseSIMDForArrayEquals) {
4955       if (SoftwarePrefetchHintDistance >= 0) {
4956         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4957         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4958         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4959             /* prfm = */ true, NOT_EQUAL);
4960         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4961         __ br(__ LT, TAIL);
4962       }
4963       __ bind(NO_PREFETCH_LARGE_LOOP);
4964       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4965           /* prfm = */ false, NOT_EQUAL);
4966     } else {
4967       __ push(spilled_regs, sp);
4968       if (SoftwarePrefetchHintDistance >= 0) {
4969         __ subs(tmp1, cnt1, prefetchLoopThreshold);
4970         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4971         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4972             /* prfm = */ true, NOT_EQUAL);
4973         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
4974         __ br(__ LT, TAIL);
4975       }
4976       __ bind(NO_PREFETCH_LARGE_LOOP);
4977       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4978           /* prfm = */ false, NOT_EQUAL);
4979     }
4980     __ bind(TAIL);
4981       __ cbz(cnt1, EQUAL);
4982       __ subs(cnt1, cnt1, wordSize);
4983       __ br(__ LE, POST_LOOP);
4984     __ bind(SMALL_LOOP);
4985       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4986       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4987       __ subs(cnt1, cnt1, wordSize);
4988       __ eor(tmp1, tmp1, tmp2);
4989       __ cbnz(tmp1, NOT_EQUAL);
4990       __ br(__ GT, SMALL_LOOP);
4991     __ bind(POST_LOOP);
4992       __ ldr(tmp1, Address(a1, cnt1));
4993       __ ldr(tmp2, Address(a2, cnt1));
4994       __ eor(tmp1, tmp1, tmp2);
4995       __ cbnz(tmp1, NOT_EQUAL);
4996     __ bind(EQUAL);
4997       __ mov(result, true);
4998     __ bind(NOT_EQUAL);
4999       if (!UseSIMDForArrayEquals) {
5000         __ pop(spilled_regs, sp);
5001       }
5002     __ bind(NOT_EQUAL_NO_POP);
5003     __ leave();
5004     __ ret(lr);
5005     return entry;
5006   }
5007 
5008   address generate_dsin_dcos(bool isCos) {
5009     __ align(CodeEntryAlignment);
5010     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5011     address start = __ pc();
5012     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5013         (address)StubRoutines::aarch64::_two_over_pi,
5014         (address)StubRoutines::aarch64::_pio2,
5015         (address)StubRoutines::aarch64::_dsin_coef,
5016         (address)StubRoutines::aarch64::_dcos_coef);
5017     return start;
5018   }
5019 
5020   address generate_dlog() {
5021     __ align(CodeEntryAlignment);
5022     StubCodeMark mark(this, "StubRoutines", "dlog");
5023     address entry = __ pc();
5024     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
5025         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
5026     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
5027     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
5028         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
5029     return entry;
5030   }
5031 
5032 
5033   // code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
5034   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5035       Label &DIFF2) {
5036     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5037     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5038 
5039     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5040     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5041     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5042     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
5043 
5044     __ fmovd(tmpL, vtmp3);
5045     __ eor(rscratch2, tmp3, tmpL);
5046     __ cbnz(rscratch2, DIFF2);
5047 
5048     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5049     __ umov(tmpL, vtmp3, __ D, 1);
5050     __ eor(rscratch2, tmpU, tmpL);
5051     __ cbnz(rscratch2, DIFF1);
5052 
5053     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5054     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5055     __ fmovd(tmpL, vtmp);
5056     __ eor(rscratch2, tmp3, tmpL);
5057     __ cbnz(rscratch2, DIFF2);
5058 
5059     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5060     __ umov(tmpL, vtmp, __ D, 1);
5061     __ eor(rscratch2, tmpU, tmpL);
5062     __ cbnz(rscratch2, DIFF1);
5063   }
5064 
5065   // r0  = result
5066   // r1  = str1
5067   // r2  = cnt1
5068   // r3  = str2
5069   // r4  = cnt2
5070   // r10 = tmp1
5071   // r11 = tmp2
5072   address generate_compare_long_string_different_encoding(bool isLU) {
5073     __ align(CodeEntryAlignment);
5074     StubCodeMark mark(this, "StubRoutines", isLU
5075         ? "compare_long_string_different_encoding LU"
5076         : "compare_long_string_different_encoding UL");
5077     address entry = __ pc();
5078     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5079         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5080         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5081     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5082         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5083     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5084     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5085 
5086     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5087 
5088     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5089     // cnt2 == number of characters left to compare
5090     // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
5091     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5092     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5093     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5094     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5095     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. The last 4 are a special case.
5096     __ eor(rscratch2, tmp1, tmp2);
5097     __ mov(rscratch1, tmp2);
5098     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5099     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5100              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5101     __ push(spilled_regs, sp);
5102     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5103     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5104 
5105     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5106 
5107     if (SoftwarePrefetchHintDistance >= 0) {
5108       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5109       __ br(__ LT, NO_PREFETCH);
5110       __ bind(LARGE_LOOP_PREFETCH);
5111         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5112         __ mov(tmp4, 2);
5113         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5114         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5115           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5116           __ subs(tmp4, tmp4, 1);
5117           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5118           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5119           __ mov(tmp4, 2);
5120         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5121           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5122           __ subs(tmp4, tmp4, 1);
5123           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5124           __ sub(cnt2, cnt2, 64);
5125           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5126           __ br(__ GE, LARGE_LOOP_PREFETCH);
5127     }
5128     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5129     __ bind(NO_PREFETCH);
5130     __ subs(cnt2, cnt2, 16);
5131     __ br(__ LT, TAIL);
5132     __ align(OptoLoopAlignment);
5133     __ bind(SMALL_LOOP); // smaller loop
5134       __ subs(cnt2, cnt2, 16);
5135       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5136       __ br(__ GE, SMALL_LOOP);
5137       __ cmn(cnt2, (u1)16);
5138       __ br(__ EQ, LOAD_LAST);
5139     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5140       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5141       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5142       __ ldr(tmp3, Address(cnt1, -8));
5143       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5144       __ b(LOAD_LAST);
5145     __ bind(DIFF2);
5146       __ mov(tmpU, tmp3);
5147     __ bind(DIFF1);
5148       __ pop(spilled_regs, sp);
5149       __ b(CALCULATE_DIFFERENCE);
5150     __ bind(LOAD_LAST);
5151       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5152       // No need to load it again
5153       __ mov(tmpU, tmp3);
5154       __ pop(spilled_regs, sp);
5155 
5156       // tmp2 points to the address of the last 4 Latin1 characters right now
5157       __ ldrs(vtmp, Address(tmp2));
5158       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5159       __ fmovd(tmpL, vtmp);
5160 
5161       __ eor(rscratch2, tmpU, tmpL);
5162       __ cbz(rscratch2, DONE);
5163 
5164     // Find the first different characters in the longwords and
5165     // compute their difference.
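    // (Descriptive note) rscratch2 holds the XOR of the two loaded chunks. The data
    // was loaded little-endian, so the earliest differing character occupies the
    // lowest-order non-zero bits: rev + clz locate that byte from the low end,
    // andr(..., -16) rounds the bit index down to a 16-bit character boundary, and
    // the lsrv/uxthw pairs extract the two differing characters to subtract.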
5166     __ bind(CALCULATE_DIFFERENCE);
5167       __ rev(rscratch2, rscratch2);
5168       __ clz(rscratch2, rscratch2);
5169       __ andr(rscratch2, rscratch2, -16);
5170       __ lsrv(tmp1, tmp1, rscratch2);
5171       __ uxthw(tmp1, tmp1);
5172       __ lsrv(rscratch1, rscratch1, rscratch2);
5173       __ uxthw(rscratch1, rscratch1);
5174       __ subw(result, tmp1, rscratch1);
5175     __ bind(DONE);
5176       __ ret(lr);
5177     return entry;
5178   }
5179 
5180   address generate_method_entry_barrier() {
5181     __ align(CodeEntryAlignment);
5182     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5183 
5184     Label deoptimize_label;
5185 
5186     address start = __ pc();
5187 
5188     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5189 
5190     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5191       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5192       // We can get here despite the nmethod being good, if we have not
5193       // yet applied our cross modification fence (or data fence).
5194       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_offset()) + 4);
5195       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5196       __ ldrw(rscratch2, rscratch2);
5197       __ strw(rscratch2, thread_epoch_addr);
5198       __ isb();
5199       __ membar(__ LoadLoad);
5200     }
5201 
5202     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5203 
5204     __ enter();
5205     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5206 
5207     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5208 
5209     __ push_call_clobbered_registers();
5210 
5211     __ mov(c_rarg0, rscratch2);
5212     __ call_VM_leaf
5213          (CAST_FROM_FN_PTR
5214           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5215 
5216     __ reset_last_Java_frame(true);
5217 
5218     __ mov(rscratch1, r0);
5219 
5220     __ pop_call_clobbered_registers();
5221 
5222     __ cbnz(rscratch1, deoptimize_label);
5223 
5224     __ leave();
5225     __ ret(lr);
5226 
5227     __ BIND(deoptimize_label);
5228 
5229     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5230     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5231 
5232     __ mov(sp, rscratch1);
5233     __ br(rscratch2);
5234 
5235     return start;
5236   }
5237 
5238   // r0  = result
5239   // r1  = str1
5240   // r2  = cnt1
5241   // r3  = str2
5242   // r4  = cnt2
5243   // r10 = tmp1
5244   // r11 = tmp2
5245   address generate_compare_long_string_same_encoding(bool isLL) {
5246     __ align(CodeEntryAlignment);
5247     StubCodeMark mark(this, "StubRoutines", isLL
5248         ? "compare_long_string_same_encoding LL"
5249         : "compare_long_string_same_encoding UU");
5250     address entry = __ pc();
5251     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5252         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5253 
5254     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5255 
5256     // exit from the large loop when fewer than 64 bytes are left to read, or when we
5257     // are about to prefetch memory beyond the array border
5258     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5259 
5260     // the caller has already pre-loaded 8 bytes before jumping to the stub, so compare them directly
5261     __ eor(rscratch2, tmp1, tmp2);
5262     __ cbnz(rscratch2, CAL_DIFFERENCE);
5263 
5264     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5265     // update pointers to account for the 8 bytes already read
5266     __ add(str1, str1, wordSize);
5267     __ add(str2, str2, wordSize);
5268     if (SoftwarePrefetchHintDistance >= 0) {
5269       __ align(OptoLoopAlignment);
5270       __ bind(LARGE_LOOP_PREFETCH);
5271         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5272         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5273 
5274         for (int i = 0; i < 4; i++) {
5275           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5276           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5277           __ cmp(tmp1, tmp2);
5278           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5279           __ br(Assembler::NE, DIFF);
5280         }
5281         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5282         __ add(str1, str1, 64);
5283         __ add(str2, str2, 64);
5284         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5285         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5286         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5287     }
5288 
5289     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5290     __ br(Assembler::LE, LESS16);
5291     __ align(OptoLoopAlignment);
5292     __ bind(LOOP_COMPARE16);
5293       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5294       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5295       __ cmp(tmp1, tmp2);
5296       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5297       __ br(Assembler::NE, DIFF);
5298       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5299       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5300       __ br(Assembler::LT, LESS16);
5301 
5302       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5303       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5304       __ cmp(tmp1, tmp2);
5305       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5306       __ br(Assembler::NE, DIFF);
5307       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5308       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5309       __ br(Assembler::GE, LOOP_COMPARE16);
5310       __ cbz(cnt2, LENGTH_DIFF);
5311 
5312     __ bind(LESS16);
5313       // compare in 8-byte chunks
5314       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5315       __ br(Assembler::LE, LESS8);
5316       __ ldr(tmp1, Address(__ post(str1, 8)));
5317       __ ldr(tmp2, Address(__ post(str2, 8)));
5318       __ eor(rscratch2, tmp1, tmp2);
5319       __ cbnz(rscratch2, CAL_DIFFERENCE);
5320       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5321 
5322     __ bind(LESS8); // directly load last 8 bytes
5323       if (!isLL) {
5324         __ add(cnt2, cnt2, cnt2);
5325       }
5326       __ ldr(tmp1, Address(str1, cnt2));
5327       __ ldr(tmp2, Address(str2, cnt2));
5328       __ eor(rscratch2, tmp1, tmp2);
5329       __ cbz(rscratch2, LENGTH_DIFF);
5330       __ b(CAL_DIFFERENCE);
5331 
5332     __ bind(DIFF);
5333       __ cmp(tmp1, tmp2);
5334       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
5335       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
5336       // reuse rscratch2 register for the result of eor instruction
5337       __ eor(rscratch2, tmp1, tmp2);
5338 
5339     __ bind(CAL_DIFFERENCE);
5340       __ rev(rscratch2, rscratch2);
5341       __ clz(rscratch2, rscratch2);
5342       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
5343       __ lsrv(tmp1, tmp1, rscratch2);
5344       __ lsrv(tmp2, tmp2, rscratch2);
5345       if (isLL) {
5346         __ uxtbw(tmp1, tmp1);
5347         __ uxtbw(tmp2, tmp2);
5348       } else {
5349         __ uxthw(tmp1, tmp1);
5350         __ uxthw(tmp2, tmp2);
5351       }
5352       __ subw(result, tmp1, tmp2);
5353 
5354     __ bind(LENGTH_DIFF);
5355       __ ret(lr);
5356     return entry;
5357   }
5358 
5359   enum string_compare_mode {
5360     LL,
5361     LU,
5362     UL,
5363     UU,
5364   };
5365 
5366   // The following registers are declared in aarch64.ad
5367   // r0  = result
5368   // r1  = str1
5369   // r2  = cnt1
5370   // r3  = str2
5371   // r4  = cnt2
5372   // r10 = tmp1
5373   // r11 = tmp2
5374   // z0  = ztmp1
5375   // z1  = ztmp2
5376   // p0  = pgtmp1
5377   // p1  = pgtmp2
5378   address generate_compare_long_string_sve(string_compare_mode mode) {
5379     __ align(CodeEntryAlignment);
5380     address entry = __ pc();
5381     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5382              tmp1 = r10, tmp2 = r11;
5383 
5384     Label LOOP, DONE, MISMATCH;
5385     Register vec_len = tmp1;
5386     Register idx = tmp2;
5387     // The minimum of the string lengths has been stored in cnt2.
5388     Register cnt = cnt2;
5389     FloatRegister ztmp1 = z0, ztmp2 = z1;
5390     PRegister pgtmp1 = p0, pgtmp2 = p1;
5391 
5392 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
5393     switch (mode) {                                                            \
5394       case LL:                                                                 \
5395         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
5396         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
5397         break;                                                                 \
5398       case LU:                                                                 \
5399         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
5400         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5401         break;                                                                 \
5402       case UL:                                                                 \
5403         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5404         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
5405         break;                                                                 \
5406       case UU:                                                                 \
5407         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
5408         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
5409         break;                                                                 \
5410       default:                                                                 \
5411         ShouldNotReachHere();                                                  \
5412     }
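    // Note: the src1/src2 macro parameters are placeholders that the body never
    // expands; the loads always reference str1/str2 directly, which is why the
    // call sites below can pass the bare tokens src1 and src2.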
5413 
5414     const char* stubname;
5415     switch (mode) {
5416       case LL: stubname = "compare_long_string_same_encoding LL";      break;
5417       case LU: stubname = "compare_long_string_different_encoding LU"; break;
5418       case UL: stubname = "compare_long_string_different_encoding UL"; break;
5419       case UU: stubname = "compare_long_string_same_encoding UU";      break;
5420       default: ShouldNotReachHere();
5421     }
5422 
5423     StubCodeMark mark(this, "StubRoutines", stubname);
5424 
5425     __ mov(idx, 0);
5426     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5427 
5428     if (mode == LL) {
5429       __ sve_cntb(vec_len);
5430     } else {
5431       __ sve_cnth(vec_len);
5432     }
5433 
5434     __ sub(rscratch1, cnt, vec_len);
5435 
5436     __ bind(LOOP);
5437 
5438       // main loop
5439       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5440       __ add(idx, idx, vec_len);
5441       // Compare strings.
5442       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5443       __ br(__ NE, MISMATCH);
5444       __ cmp(idx, rscratch1);
5445       __ br(__ LT, LOOP);
5446 
5447     // post loop, last iteration
5448     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
5449 
5450     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
5451     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
5452     __ br(__ EQ, DONE);
5453 
5454     __ bind(MISMATCH);
5455 
5456     // Crop the vector to find its location.
5457     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
5458     // Extract the first different characters of each string.
5459     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
5460     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
5461 
5462     // Compute the difference of the first different characters.
5463     __ sub(result, rscratch1, rscratch2);
5464 
5465     __ bind(DONE);
5466     __ ret(lr);
5467 #undef LOAD_PAIR
5468     return entry;
5469   }
5470 
5471   void generate_compare_long_strings() {
5472     if (UseSVE == 0) {
5473       StubRoutines::aarch64::_compare_long_string_LL
5474           = generate_compare_long_string_same_encoding(true);
5475       StubRoutines::aarch64::_compare_long_string_UU
5476           = generate_compare_long_string_same_encoding(false);
5477       StubRoutines::aarch64::_compare_long_string_LU
5478           = generate_compare_long_string_different_encoding(true);
5479       StubRoutines::aarch64::_compare_long_string_UL
5480           = generate_compare_long_string_different_encoding(false);
5481     } else {
5482       StubRoutines::aarch64::_compare_long_string_LL
5483           = generate_compare_long_string_sve(LL);
5484       StubRoutines::aarch64::_compare_long_string_UU
5485           = generate_compare_long_string_sve(UU);
5486       StubRoutines::aarch64::_compare_long_string_LU
5487           = generate_compare_long_string_sve(LU);
5488       StubRoutines::aarch64::_compare_long_string_UL
5489           = generate_compare_long_string_sve(UL);
5490     }
5491   }
5492 
5493   // R0 = result
5494   // R1 = str2
5495   // R2 = cnt1
5496   // R3 = str1
5497   // R4 = cnt2
5498   // This generic linear code uses a few additional ideas that make it faster:
5499   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
5500   // in order to skip the initial load (helps on systems with a single load pipeline)
5501   // 2) we can use a "fast" algorithm for finding the first character, with fewer
5502   // branches (one branch per loaded register instead of one branch per symbol);
5503   // this is where constants like
5504   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
5505   // 3) after loading and analyzing the 1st register of the source string, it can be
5506   // reused to search for every occurrence of the 1st character, saving a few loads
5507   // compared with a simpler-but-slower implementation
5508   // 4) in order to avoid lots of push/pop operations, the code below heavily
5509   // reuses, re-initializes and compresses register values, which makes the code
5510   // larger and a bit less readable; however, most of the extra operations are
5511   // issued during loads or branches, so the penalty is minimal
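  // The zero-byte detection referred to in (2) is the usual SWAR trick; as a rough
  // C sketch for the Latin1 case (the 16-bit constants play the same role for UTF-16):
  //   uint64_t x   = chunk ^ first_char_replicated;  // a zero byte marks a match
  //   uint64_t hit = (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
  // 'hit' is non-zero iff some byte of 'chunk' equals the first pattern character,
  // and its lowest set 0x80 bit marks the first such byte; higher lanes may contain
  // false positives, which the full character-by-character comparison filters out.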
5512   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
5513     const char* stubName = str1_isL
5514         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
5515         : "indexof_linear_uu";
5516     __ align(CodeEntryAlignment);
5517     StubCodeMark mark(this, "StubRoutines", stubName);
5518     address entry = __ pc();
5519 
5520     int str1_chr_size = str1_isL ? 1 : 2;
5521     int str2_chr_size = str2_isL ? 1 : 2;
5522     int str1_chr_shift = str1_isL ? 0 : 1;
5523     int str2_chr_shift = str2_isL ? 0 : 1;
5524     bool isL = str1_isL && str2_isL;
5525     // parameters
5526     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
5527     // temporary registers
5528     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
5529     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
5530     // redefinitions
5531     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
5532 
5533     __ push(spilled_regs, sp);
5534     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
5535         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
5536         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
5537         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
5538         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
5539         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
5540     // Read whole register from str1. It is safe, because length >=8 here
5541     __ ldr(ch1, Address(str1));
5542     // Read whole register from str2. It is safe, because length >=8 here
5543     __ ldr(ch2, Address(str2));
5544     __ sub(cnt2, cnt2, cnt1);
5545     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
5546     if (str1_isL != str2_isL) {
5547       __ eor(v0, __ T16B, v0, v0);
5548     }
5549     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
5550     __ mul(first, first, tmp1);
5551     // check if we have less than one register's worth of characters left to check
5552     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
5553     if (str1_isL != str2_isL) {
5554       __ fmovd(v1, ch1);
5555     }
5556     __ br(__ LE, L_SMALL);
5557     __ eor(ch2, first, ch2);
5558     if (str1_isL != str2_isL) {
5559       __ zip1(v1, __ T16B, v1, v0);
5560     }
5561     __ sub(tmp2, ch2, tmp1);
5562     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5563     __ bics(tmp2, tmp2, ch2);
5564     if (str1_isL != str2_isL) {
5565       __ fmovd(ch1, v1);
5566     }
5567     __ br(__ NE, L_HAS_ZERO);
5568     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5569     __ add(result, result, wordSize/str2_chr_size);
5570     __ add(str2, str2, wordSize);
5571     __ br(__ LT, L_POST_LOOP);
5572     __ BIND(L_LOOP);
5573       __ ldr(ch2, Address(str2));
5574       __ eor(ch2, first, ch2);
5575       __ sub(tmp2, ch2, tmp1);
5576       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5577       __ bics(tmp2, tmp2, ch2);
5578       __ br(__ NE, L_HAS_ZERO);
5579     __ BIND(L_LOOP_PROCEED);
5580       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
5581       __ add(str2, str2, wordSize);
5582       __ add(result, result, wordSize/str2_chr_size);
5583       __ br(__ GE, L_LOOP);
5584     __ BIND(L_POST_LOOP);
5585       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
5586       __ br(__ LE, NOMATCH);
5587       __ ldr(ch2, Address(str2));
5588       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5589       __ eor(ch2, first, ch2);
5590       __ sub(tmp2, ch2, tmp1);
5591       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5592       __ mov(tmp4, -1); // all bits set
5593       __ b(L_SMALL_PROCEED);
5594     __ align(OptoLoopAlignment);
5595     __ BIND(L_SMALL);
5596       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
5597       __ eor(ch2, first, ch2);
5598       if (str1_isL != str2_isL) {
5599         __ zip1(v1, __ T16B, v1, v0);
5600       }
5601       __ sub(tmp2, ch2, tmp1);
5602       __ mov(tmp4, -1); // all bits set
5603       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
5604       if (str1_isL != str2_isL) {
5605         __ fmovd(ch1, v1); // move converted 4 symbols
5606       }
5607     __ BIND(L_SMALL_PROCEED);
5608       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
5609       __ bic(tmp2, tmp2, ch2);
5610       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
5611       __ rbit(tmp2, tmp2);
5612       __ br(__ EQ, NOMATCH);
5613     __ BIND(L_SMALL_HAS_ZERO_LOOP);
5614       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
5615       __ cmp(cnt1, u1(wordSize/str2_chr_size));
5616       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
5617       if (str2_isL) { // LL
5618         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5619         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5620         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5621         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5622         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5623       } else {
5624         __ mov(ch2, 0xE); // byte-index mask with the lowest bit cleared (keeps the offset char-aligned)
5625         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5626         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5627         __ lslv(tmp2, tmp2, tmp4);
5628         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5629         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5630         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5631         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5632       }
5633       __ cmp(ch1, ch2);
5634       __ mov(tmp4, wordSize/str2_chr_size);
5635       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5636     __ BIND(L_SMALL_CMP_LOOP);
5637       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5638                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5639       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5640                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5641       __ add(tmp4, tmp4, 1);
5642       __ cmp(tmp4, cnt1);
5643       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
5644       __ cmp(first, ch2);
5645       __ br(__ EQ, L_SMALL_CMP_LOOP);
5646     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
5647       __ cbz(tmp2, NOMATCH); // no more matches. exit
5648       __ clz(tmp4, tmp2);
5649       __ add(result, result, 1); // advance index
5650       __ add(str2, str2, str2_chr_size); // advance pointer
5651       __ b(L_SMALL_HAS_ZERO_LOOP);
5652     __ align(OptoLoopAlignment);
5653     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
5654       __ cmp(first, ch2);
5655       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5656       __ b(DONE);
5657     __ align(OptoLoopAlignment);
5658     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
5659       if (str2_isL) { // LL
5660         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
5661         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
5662         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
5663         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
5664         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5665       } else {
        __ mov(ch2, 0xE); // 0b1110: mask that clears the low bit (char-aligns the byte offset)
5667         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5668         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5669         __ lslv(tmp2, tmp2, tmp4);
5670         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5671         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5672         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
5673         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5674       }
5675       __ cmp(ch1, ch2);
5676       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
5677       __ b(DONE);
5678     __ align(OptoLoopAlignment);
5679     __ BIND(L_HAS_ZERO);
5680       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long; up to 4 cycles on some CPUs
      // Pack the two counters (cnt1 and cnt2) into one register. This is safe
      // because both counters are 32-bit and are not changed in this loop;
      // they are restored on exit, so cnt1 can be re-used inside the loop.
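      // A sketch of the packed layout (64-bit registers, so
      // BitsPerByte * wordSize / 2 == 32):
      //   cnt2 |= cnt1 << 32;
      // cnt1 is recovered later with a 32-bit right shift and the low half of
      // cnt2 with a 32-bit zero-extending move (see L_HAS_ZERO_LOOP_NOMATCH).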
5685       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
5686       __ sub(result, result, 1);
5687     __ BIND(L_HAS_ZERO_LOOP);
5688       __ mov(cnt1, wordSize/str2_chr_size);
5689       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5690       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
5691       if (str2_isL) {
5692         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5693         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5694         __ lslv(tmp2, tmp2, tmp4);
5695         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5696         __ add(tmp4, tmp4, 1);
5697         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5698         __ lsl(tmp2, tmp2, 1);
5699         __ mov(tmp4, wordSize/str2_chr_size);
5700       } else {
5701         __ mov(ch2, 0xE);
5702         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5703         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5704         __ lslv(tmp2, tmp2, tmp4);
5705         __ add(tmp4, tmp4, 1);
5706         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5707         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5708         __ lsl(tmp2, tmp2, 1);
5709         __ mov(tmp4, wordSize/str2_chr_size);
5710         __ sub(str2, str2, str2_chr_size);
5711       }
5712       __ cmp(ch1, ch2);
5713       __ mov(tmp4, wordSize/str2_chr_size);
5714       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5715     __ BIND(L_CMP_LOOP);
5716       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
5717                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
5718       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
5719                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
5720       __ add(tmp4, tmp4, 1);
5721       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
5722       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
5723       __ cmp(cnt1, ch2);
5724       __ br(__ EQ, L_CMP_LOOP);
5725     __ BIND(L_CMP_LOOP_NOMATCH);
      // no match at this candidate position
5727       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
5728       __ clz(tmp4, tmp2);
5729       __ add(str2, str2, str2_chr_size); // advance pointer
5730       __ b(L_HAS_ZERO_LOOP);
5731     __ align(OptoLoopAlignment);
5732     __ BIND(L_CMP_LOOP_LAST_CMP);
5733       __ cmp(cnt1, ch2);
5734       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5735       __ b(DONE);
5736     __ align(OptoLoopAlignment);
5737     __ BIND(L_CMP_LOOP_LAST_CMP2);
5738       if (str2_isL) {
5739         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
5740         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5741         __ lslv(tmp2, tmp2, tmp4);
5742         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5743         __ add(tmp4, tmp4, 1);
5744         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5745         __ lsl(tmp2, tmp2, 1);
5746       } else {
5747         __ mov(ch2, 0xE);
5748         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
5749         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
5750         __ lslv(tmp2, tmp2, tmp4);
5751         __ add(tmp4, tmp4, 1);
5752         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
5753         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
5754         __ lsl(tmp2, tmp2, 1);
5755         __ sub(str2, str2, str2_chr_size);
5756       }
5757       __ cmp(ch1, ch2);
5758       __ br(__ NE, L_CMP_LOOP_NOMATCH);
5759       __ b(DONE);
5760     __ align(OptoLoopAlignment);
5761     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. Until the L_HAS_ZERO block, result was a
      // multiple of wordSize/str2_chr_size. One byte octet was analyzed in
      // L_HAS_ZERO_LOOP, so result has been increased by at most
      // wordSize/str2_chr_size - 1 and its higher bits are unchanged.
      // L_LOOP_PROCEED will add the number of analyzed characters, so it is
      // enough to reset the lower bits of result here: 2 bits for UU/UL and
      // 3 bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Restore str2. result & 7 (LL) or result & 3 (UU/UL) is the number of
      // symbols already analyzed inside the current octet, so subtracting that
      // many characters moves str2 back to the octet's start address;
      // L_LOOP_PROCEED then advances it to the next octet.
5772       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
5773       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
5774       __ bfm(result, zr, 0, 2 - str2_chr_shift);
5775       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
5776       __ movw(cnt2, cnt2);
5777       __ b(L_LOOP_PROCEED);
5778     __ align(OptoLoopAlignment);
5779     __ BIND(NOMATCH);
5780       __ mov(result, -1);
5781     __ BIND(DONE);
5782       __ pop(spilled_regs, sp);
5783       __ ret(lr);
5784     return entry;
5785   }
5786 
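  // Instantiate the three encoding combinations of the linear IndexOf stub.
  // The two boolean arguments are str1_isL and str2_isL: ll means both strings
  // are Latin-1, uu means both are UTF-16, and ul means a Latin-1 str1 (the
  // pattern) is searched for in a UTF-16 str2.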
5787   void generate_string_indexof_stubs() {
5788     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
5789     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
5790     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
5791   }
5792 
5793   void inflate_and_store_2_fp_registers(bool generatePrfm,
5794       FloatRegister src1, FloatRegister src2) {
5795     Register dst = r1;
5796     __ zip1(v1, __ T16B, src1, v0);
5797     __ zip2(v2, __ T16B, src1, v0);
5798     if (generatePrfm) {
5799       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
5800     }
5801     __ zip1(v3, __ T16B, src2, v0);
5802     __ zip2(v4, __ T16B, src2, v0);
5803     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
5804   }
5805 
5806   // R0 = src
5807   // R1 = dst
5808   // R2 = len
5809   // R3 = len >> 3
5810   // V0 = 0
5811   // v1 = loaded 8 bytes
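  //
  // A scalar sketch of what this stub does (each Latin-1 byte is zero-extended
  // to a 16-bit char):
  //   for (size_t i = 0; i < len; i++) dst[i] = src[i];   // jbyte -> jchar
  // The zip1/zip2 instructions interleave the source bytes with the zero
  // register v0 to do this a full vector at a time.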
5812   address generate_large_byte_array_inflate() {
5813     __ align(CodeEntryAlignment);
5814     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
5815     address entry = __ pc();
5816     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
5817     Register src = r0, dst = r1, len = r2, octetCounter = r3;
5818     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
5819 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
5822     __ ldrd(v2, __ post(src, 8));
5823     __ sub(octetCounter, octetCounter, 2);
5824     __ zip1(v1, __ T16B, v1, v0);
5825     __ zip1(v2, __ T16B, v2, v0);
5826     __ st1(v1, v2, __ T16B, __ post(dst, 32));
5827     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5828     __ subs(rscratch1, octetCounter, large_loop_threshold);
5829     __ br(__ LE, LOOP_START);
5830     __ b(LOOP_PRFM_START);
5831     __ bind(LOOP_PRFM);
5832       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5833     __ bind(LOOP_PRFM_START);
5834       __ prfm(Address(src, SoftwarePrefetchHintDistance));
5835       __ sub(octetCounter, octetCounter, 8);
5836       __ subs(rscratch1, octetCounter, large_loop_threshold);
5837       inflate_and_store_2_fp_registers(true, v3, v4);
5838       inflate_and_store_2_fp_registers(true, v5, v6);
5839       __ br(__ GT, LOOP_PRFM);
5840       __ cmp(octetCounter, (u1)8);
5841       __ br(__ LT, DONE);
5842     __ bind(LOOP);
5843       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
5844       __ bind(LOOP_START);
5845       __ sub(octetCounter, octetCounter, 8);
5846       __ cmp(octetCounter, (u1)8);
5847       inflate_and_store_2_fp_registers(false, v3, v4);
5848       inflate_and_store_2_fp_registers(false, v5, v6);
5849       __ br(__ GE, LOOP);
5850     __ bind(DONE);
5851       __ ret(lr);
5852     return entry;
5853   }
5854 
5855   /**
5856    *  Arguments:
5857    *
5858    *  Input:
5859    *  c_rarg0   - current state address
5860    *  c_rarg1   - H key address
5861    *  c_rarg2   - data address
5862    *  c_rarg3   - number of blocks
5863    *
5864    *  Output:
5865    *  Updated state at c_rarg0
5866    */
5867   address generate_ghash_processBlocks() {
5868     // Bafflingly, GCM uses little-endian for the byte order, but
5869     // big-endian for the bit order.  For example, the polynomial 1 is
5870     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
5871     //
5872     // So, we must either reverse the bytes in each word and do
5873     // everything big-endian or reverse the bits in each byte and do
5874     // it little-endian.  On AArch64 it's more idiomatic to reverse
5875     // the bits in each byte (we have an instruction, RBIT, to do
5876     // that) and keep the data in little-endian bit order through the
5877     // calculation, bit-reversing the inputs and outputs.
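    // For example, after RBIT the GCM encoding of the polynomial 1
    // (80 00 .. 00) becomes the ordinary little-endian integer 1, so the usual
    // shifts and carry-less multiplies can be applied directly.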
5878 
5879     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5880     __ align(wordSize * 2);
5881     address p = __ pc();
5882     __ emit_int64(0x87);  // The low-order bits of the field
5883                           // polynomial (i.e. p = z^7+z^2+z+1)
5884                           // repeated in the low and high parts of a
5885                           // 128-bit vector
5886     __ emit_int64(0x87);
5887 
5888     __ align(CodeEntryAlignment);
5889     address start = __ pc();
5890 
5891     Register state   = c_rarg0;
5892     Register subkeyH = c_rarg1;
5893     Register data    = c_rarg2;
5894     Register blocks  = c_rarg3;
5895 
5896     FloatRegister vzr = v30;
5897     __ eor(vzr, __ T16B, vzr, vzr); // zero register
5898 
5899     __ ldrq(v24, p);    // The field polynomial
5900 
5901     __ ldrq(v0, Address(state));
5902     __ ldrq(v1, Address(subkeyH));
5903 
5904     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
5905     __ rbit(v0, __ T16B, v0);
5906     __ rev64(v1, __ T16B, v1);
5907     __ rbit(v1, __ T16B, v1);
5908 
    __ ext(v4, __ T16B, v1, v1, 0x08); // swap the 64-bit halves of subkeyH into v4
5910     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
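    // Karatsuba sketch (in GF(2), '+' is XOR): with A = A1:A0 and B = B1:B0,
    //   A*B = A1*B1*x^128 + ((A1+A0)*(B1+B0) + A1*B1 + A0*B0)*x^64 + A0*B0
    // so precomputing (A1+A0) here saves one 64x64 multiply per block.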
5911 
5912     {
5913       Label L_ghash_loop;
5914       __ bind(L_ghash_loop);
5915 
5916       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
5917                                                  // reversing each byte
5918       __ rbit(v2, __ T16B, v2);
5919       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
5920 
5921       // Multiply state in v2 by subkey in v1
5922       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
5923                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
5924                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
5925       // Reduce v7:v5 by the field polynomial
5926       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
5927 
5928       __ sub(blocks, blocks, 1);
5929       __ cbnz(blocks, L_ghash_loop);
5930     }
5931 
5932     // The bit-reversed result is at this point in v0
5933     __ rev64(v0, __ T16B, v0);
5934     __ rbit(v0, __ T16B, v0);
5935 
5936     __ st1(v0, __ T16B, state);
5937     __ ret(lr);
5938 
5939     return start;
5940   }
5941 
5942   address generate_ghash_processBlocks_wide() {
5943     address small = generate_ghash_processBlocks();
5944 
5945     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
5946     __ align(wordSize * 2);
5947     address p = __ pc();
5948     __ emit_int64(0x87);  // The low-order bits of the field
5949                           // polynomial (i.e. p = z^7+z^2+z+1)
5950                           // repeated in the low and high parts of a
5951                           // 128-bit vector
5952     __ emit_int64(0x87);
5953 
5954     __ align(CodeEntryAlignment);
5955     address start = __ pc();
5956 
5957     Register state   = c_rarg0;
5958     Register subkeyH = c_rarg1;
5959     Register data    = c_rarg2;
5960     Register blocks  = c_rarg3;
5961 
5962     const int unroll = 4;
5963 
5964     __ cmp(blocks, (unsigned char)(unroll * 2));
5965     __ br(__ LT, small);
5966 
5967     if (unroll > 1) {
      // Save the callee-saved SIMD registers (v8..v15) before entering the routine
5969       __ sub(sp, sp, 4 * 16);
5970       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
5971       __ sub(sp, sp, 4 * 16);
5972       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
5973     }
5974 
5975     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
5976 
5977     if (unroll > 1) {
5978       // And restore state
5979       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
5980       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
5981     }
5982 
5983     __ cmp(blocks, (unsigned char)0);
5984     __ br(__ GT, small);
5985 
5986     __ ret(lr);
5987 
5988     return start;
5989   }
5990 
5991   void generate_base64_encode_simdround(Register src, Register dst,
5992         FloatRegister codec, u8 size) {
5993 
5994     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
5995     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
5996     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
5997 
5998     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
5999 
6000     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
6001 
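    // The shifts/ORs below split each group of 3 input bytes into four 6-bit
    // indices; per byte lane this is roughly:
    //   ind0 =   in0 >> 2;
    //   ind1 = ((in0 & 0x03) << 4) | (in1 >> 4);
    //   ind2 = ((in1 & 0x0f) << 2) | (in2 >> 6);
    //   ind3 =   in2 & 0x3f;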
6002     __ ushr(ind0, arrangement, in0,  2);
6003 
6004     __ ushr(ind1, arrangement, in1,  2);
6005     __ shl(in0,   arrangement, in0,  6);
6006     __ orr(ind1,  arrangement, ind1, in0);
6007     __ ushr(ind1, arrangement, ind1, 2);
6008 
6009     __ ushr(ind2, arrangement, in2,  4);
6010     __ shl(in1,   arrangement, in1,  4);
6011     __ orr(ind2,  arrangement, in1,  ind2);
6012     __ ushr(ind2, arrangement, ind2, 2);
6013 
6014     __ shl(ind3,  arrangement, in2,  2);
6015     __ ushr(ind3, arrangement, ind3, 2);
6016 
6017     __ tbl(out0,  arrangement, codec,  4, ind0);
6018     __ tbl(out1,  arrangement, codec,  4, ind1);
6019     __ tbl(out2,  arrangement, codec,  4, ind2);
6020     __ tbl(out3,  arrangement, codec,  4, ind3);
6021 
6022     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6023   }
6024 
  /**
6026    *  Arguments:
6027    *
6028    *  Input:
6029    *  c_rarg0   - src_start
6030    *  c_rarg1   - src_offset
6031    *  c_rarg2   - src_length
6032    *  c_rarg3   - dest_start
6033    *  c_rarg4   - dest_offset
6034    *  c_rarg5   - isURL
6035    *
6036    */
6037   address generate_base64_encodeBlock() {
6038 
6039     static const char toBase64[64] = {
6040       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6041       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6042       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6043       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6044       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6045     };
6046 
6047     static const char toBase64URL[64] = {
6048       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6049       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6050       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6051       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6052       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6053     };
6054 
6055     __ align(CodeEntryAlignment);
6056     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6057     address start = __ pc();
6058 
6059     Register src   = c_rarg0;  // source array
6060     Register soff  = c_rarg1;  // source start offset
6061     Register send  = c_rarg2;  // source end offset
6062     Register dst   = c_rarg3;  // dest array
6063     Register doff  = c_rarg4;  // position for writing to dest array
6064     Register isURL = c_rarg5;  // Base64 or URL character set
6065 
6066     // c_rarg6 and c_rarg7 are free to use as temps
6067     Register codec  = c_rarg6;
6068     Register length = c_rarg7;
6069 
6070     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6071 
6072     __ add(src, src, soff);
6073     __ add(dst, dst, doff);
6074     __ sub(length, send, soff);
6075 
6076     // load the codec base address
6077     __ lea(codec, ExternalAddress((address) toBase64));
6078     __ cbz(isURL, ProcessData);
6079     __ lea(codec, ExternalAddress((address) toBase64URL));
6080 
6081     __ BIND(ProcessData);
6082 
    // too short to form a SIMD loop; fall back to the byte-by-byte path
6084     __ cmp(length, (u1)24);
6085     __ br(Assembler::LT, Process3B);
6086 
6087     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6088 
6089     __ BIND(Process48B);
6090     __ cmp(length, (u1)48);
6091     __ br(Assembler::LT, Process24B);
6092     generate_base64_encode_simdround(src, dst, v0, 16);
6093     __ sub(length, length, 48);
6094     __ b(Process48B);
6095 
6096     __ BIND(Process24B);
6097     __ cmp(length, (u1)24);
6098     __ br(Assembler::LT, SIMDExit);
6099     generate_base64_encode_simdround(src, dst, v0, 8);
6100     __ sub(length, length, 24);
6101 
6102     __ BIND(SIMDExit);
6103     __ cbz(length, Exit);
6104 
6105     __ BIND(Process3B);
6106     //  3 src bytes, 24 bits
6107     __ ldrb(r10, __ post(src, 1));
6108     __ ldrb(r11, __ post(src, 1));
6109     __ ldrb(r12, __ post(src, 1));
6110     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6111     __ orrw(r12, r12, r11, Assembler::LSL, 8);
6112     // codec index
6113     __ ubfmw(r15, r12, 18, 23);
6114     __ ubfmw(r14, r12, 12, 17);
6115     __ ubfmw(r13, r12, 6,  11);
6116     __ andw(r12,  r12, 63);
6117     // get the code based on the codec
6118     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6119     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6120     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6121     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6122     __ strb(r15, __ post(dst, 1));
6123     __ strb(r14, __ post(dst, 1));
6124     __ strb(r13, __ post(dst, 1));
6125     __ strb(r12, __ post(dst, 1));
6126     __ sub(length, length, 3);
6127     __ cbnz(length, Process3B);
6128 
6129     __ BIND(Exit);
6130     __ ret(lr);
6131 
6132     return start;
6133   }
6134 
6135   void generate_base64_decode_simdround(Register src, Register dst,
6136         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6137 
6138     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6139     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6140 
6141     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6142     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6143 
6144     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6145 
6146     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6147 
6148     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6149 
    // We need an unsigned saturating subtract so that all input values in the
    // range [0, 63] map to index 0 and thus yield 0 from the higher-half lookup.
6152     __ uqsubv(decH0, __ T16B, in0, v27);
6153     __ uqsubv(decH1, __ T16B, in1, v27);
6154     __ uqsubv(decH2, __ T16B, in2, v27);
6155     __ uqsubv(decH3, __ T16B, in3, v27);
6156 
6157     // lower half lookup
6158     __ tbl(decL0, arrangement, codecL, 4, in0);
6159     __ tbl(decL1, arrangement, codecL, 4, in1);
6160     __ tbl(decL2, arrangement, codecL, 4, in2);
6161     __ tbl(decL3, arrangement, codecL, 4, in3);
6162 
6163     // higher half lookup
6164     __ tbx(decH0, arrangement, codecH, 4, decH0);
6165     __ tbx(decH1, arrangement, codecH, 4, decH1);
6166     __ tbx(decH2, arrangement, codecH, 4, decH2);
6167     __ tbx(decH3, arrangement, codecH, 4, decH3);
6168 
6169     // combine lower and higher
6170     __ orr(decL0, arrangement, decL0, decH0);
6171     __ orr(decL1, arrangement, decL1, decH1);
6172     __ orr(decL2, arrangement, decL2, decH2);
6173     __ orr(decL3, arrangement, decL3, decH3);
6174 
    // check for illegal inputs: any value larger than 63 (the 6-bit maximum)
6176     __ cmhi(decH0, arrangement, decL0, v27);
6177     __ cmhi(decH1, arrangement, decL1, v27);
6178     __ cmhi(decH2, arrangement, decL2, v27);
6179     __ cmhi(decH3, arrangement, decL3, v27);
6180     __ orr(in0, arrangement, decH0, decH1);
6181     __ orr(in1, arrangement, decH2, decH3);
6182     __ orr(in2, arrangement, in0,   in1);
6183     __ umaxv(in3, arrangement, in2);
6184     __ umov(rscratch2, in3, __ B, 0);
6185 
6186     // get the data to output
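    // Per byte lane this packs four 6-bit values back into 3 bytes, roughly:
    //   out0 = (decL0 << 2) | (decL1 >> 4);
    //   out1 = (decL1 << 4) | (decL2 >> 2);
    //   out2 = (decL2 << 6) |  decL3;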
6187     __ shl(out0,  arrangement, decL0, 2);
6188     __ ushr(out1, arrangement, decL1, 4);
6189     __ orr(out0,  arrangement, out0,  out1);
6190     __ shl(out1,  arrangement, decL1, 4);
6191     __ ushr(out2, arrangement, decL2, 2);
6192     __ orr(out1,  arrangement, out1,  out2);
6193     __ shl(out2,  arrangement, decL2, 6);
6194     __ orr(out2,  arrangement, out2,  decL3);
6195 
6196     __ cbz(rscratch2, NoIllegalData);
6197 
6198     // handle illegal input
6199     __ umov(r10, in2, __ D, 0);
6200     if (size == 16) {
6201       __ cbnz(r10, ErrorInLowerHalf);
6202 
6203       // illegal input is in higher half, store the lower half now.
6204       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6205 
6206       __ umov(r10, in2,  __ D, 1);
6207       __ umov(r11, out0, __ D, 1);
6208       __ umov(r12, out1, __ D, 1);
6209       __ umov(r13, out2, __ D, 1);
6210       __ b(StoreLegalData);
6211 
6212       __ BIND(ErrorInLowerHalf);
6213     }
6214     __ umov(r11, out0, __ D, 0);
6215     __ umov(r12, out1, __ D, 0);
6216     __ umov(r13, out2, __ D, 0);
6217 
6218     __ BIND(StoreLegalData);
6219     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6220     __ strb(r11, __ post(dst, 1));
6221     __ strb(r12, __ post(dst, 1));
6222     __ strb(r13, __ post(dst, 1));
6223     __ lsr(r10, r10, 8);
6224     __ lsr(r11, r11, 8);
6225     __ lsr(r12, r12, 8);
6226     __ lsr(r13, r13, 8);
6227     __ b(StoreLegalData);
6228 
6229     __ BIND(NoIllegalData);
6230     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6231   }
6232 
6233 
  /**
6235    *  Arguments:
6236    *
6237    *  Input:
6238    *  c_rarg0   - src_start
6239    *  c_rarg1   - src_offset
6240    *  c_rarg2   - src_length
6241    *  c_rarg3   - dest_start
6242    *  c_rarg4   - dest_offset
6243    *  c_rarg5   - isURL
6244    *  c_rarg6   - isMIME
6245    *
6246    */
6247   address generate_base64_decodeBlock() {
6248 
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm
    // outlined on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords,
    // in the section titled "Base64 decoding".

    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used
    // in java.util.Base64, except that the trailing character '=' is also treated
    // as an illegal value in this intrinsic. That is, java.util.Base64.fromBase64['=']
    // is -2, while fromBase(URL)64ForNoSIMD['='] is 255 here.
6256     static const uint8_t fromBase64ForNoSIMD[256] = {
6257       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6258       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6259       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6260        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6261       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6262        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6263       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6264        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6265       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6266       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6267       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6268       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6269       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6270       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6271       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6272       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6273     };
6274 
6275     static const uint8_t fromBase64URLForNoSIMD[256] = {
6276       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6277       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6278       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6279        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6280       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6281        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6282       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6283        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6284       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6285       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6286       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6287       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6288       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6289       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6290       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6291       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6292     };
6293 
    // A legal Base64 code value is in the range [0, 127]. We need two table
    // lookups, with tbl and tbx, and combine their results to get the decoded
    // data. The 1st vector table lookup uses tbl: out-of-range indices are set
    // to 0 in the destination. The 2nd vector table lookup uses tbx:
    // out-of-range indices leave the destination unchanged. Inputs in [64, 126]
    // are mapped to table indices [65, 127] in the second lookup. The value at
    // index 64 is set to 0, so we know the decoded data was already obtained by
    // the 1st lookup.
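    // Worked example: '0' (0x30) is decoded by the 1st lookup (codecL[0x30] == 52);
    // the saturating subtract gives index 0, the 2nd lookup returns the 0 stored at
    // table index 64, and the OR keeps 52. 'A' (0x41) misses the 1st lookup (tbl
    // writes 0 for indices >= 64), maps to table index 0x42 in the 2nd lookup, and
    // decodes to 0.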
6301     static const uint8_t fromBase64ForSIMD[128] = {
6302       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6303       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6304       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6305        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6306         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6307        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6308       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6309        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6310     };
6311 
6312     static const uint8_t fromBase64URLForSIMD[128] = {
6313       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6314       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6315       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6316        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6317         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6318        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6319        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6320        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6321     };
6322 
6323     __ align(CodeEntryAlignment);
6324     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6325     address start = __ pc();
6326 
6327     Register src    = c_rarg0;  // source array
6328     Register soff   = c_rarg1;  // source start offset
6329     Register send   = c_rarg2;  // source end offset
6330     Register dst    = c_rarg3;  // dest array
6331     Register doff   = c_rarg4;  // position for writing to dest array
6332     Register isURL  = c_rarg5;  // Base64 or URL character set
6333     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
6334 
6335     Register length = send;    // reuse send as length of source data to process
6336 
6337     Register simd_codec   = c_rarg6;
6338     Register nosimd_codec = c_rarg7;
6339 
6340     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
6341 
6342     __ enter();
6343 
6344     __ add(src, src, soff);
6345     __ add(dst, dst, doff);
6346 
6347     __ mov(doff, dst);
6348 
6349     __ sub(length, send, soff);
6350     __ bfm(length, zr, 0, 1);
6351 
6352     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
6353     __ cbz(isURL, ProcessData);
6354     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
6355 
6356     __ BIND(ProcessData);
6357     __ mov(rscratch1, length);
6358     __ cmp(length, (u1)144); // 144 = 80 + 64
6359     __ br(Assembler::LT, Process4B);
6360 
6361     // In the MIME case, the line length cannot be more than 76
6362     // bytes (see RFC 2045). This is too short a block for SIMD
6363     // to be worthwhile, so we use non-SIMD here.
6364     __ movw(rscratch1, 79);
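    // Starting the down-counter at 79 makes the loop below consume exactly 80
    // bytes (20 iterations of 4) before handing the rest to the SIMD path; it
    // then ends at -1 rather than 0, which is how the exit test after the loop
    // tells the two cases apart.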
6365 
6366     __ BIND(Process4B);
6367     __ ldrw(r14, __ post(src, 4));
6368     __ ubfxw(r10, r14, 0,  8);
6369     __ ubfxw(r11, r14, 8,  8);
6370     __ ubfxw(r12, r14, 16, 8);
6371     __ ubfxw(r13, r14, 24, 8);
    // look up the decoded 6-bit values
6373     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
6374     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
6375     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
6376     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
6377     // error detection, 255u indicates an illegal input
6378     __ orrw(r14, r10, r11);
6379     __ orrw(r15, r12, r13);
6380     __ orrw(r14, r14, r15);
6381     __ tbnz(r14, 7, Exit);
6382     // recover the data
6383     __ lslw(r14, r10, 10);
6384     __ bfiw(r14, r11, 4, 6);
6385     __ bfmw(r14, r12, 2, 5);
6386     __ rev16w(r14, r14);
6387     __ bfiw(r13, r12, 6, 2);
6388     __ strh(r14, __ post(dst, 2));
6389     __ strb(r13, __ post(dst, 1));
6390     // non-simd loop
6391     __ subsw(rscratch1, rscratch1, 4);
6392     __ br(Assembler::GT, Process4B);
6393 
    // rscratch1 == -1 here if we arrived via the 80-byte pre-processing above;
    // otherwise rscratch1 == 0.
6396     __ cbzw(rscratch1, Exit);
6397     __ sub(length, length, 80);
6398 
6399     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
6400     __ cbz(isURL, SIMDEnter);
6401     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
6402 
6403     __ BIND(SIMDEnter);
6404     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
6405     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
6406     __ mov(rscratch1, 63);
6407     __ dup(v27, __ T16B, rscratch1);
6408 
6409     __ BIND(Process64B);
6410     __ cmp(length, (u1)64);
6411     __ br(Assembler::LT, Process32B);
6412     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
6413     __ sub(length, length, 64);
6414     __ b(Process64B);
6415 
6416     __ BIND(Process32B);
6417     __ cmp(length, (u1)32);
6418     __ br(Assembler::LT, SIMDExit);
6419     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
6420     __ sub(length, length, 32);
6421     __ b(Process32B);
6422 
6423     __ BIND(SIMDExit);
6424     __ cbz(length, Exit);
6425     __ movw(rscratch1, length);
6426     __ b(Process4B);
6427 
6428     __ BIND(Exit);
6429     __ sub(c_rarg0, dst, doff);
6430 
6431     __ leave();
6432     __ ret(lr);
6433 
6434     return start;
6435   }
6436 
6437   // Support for spin waits.
6438   address generate_spin_wait() {
6439     __ align(CodeEntryAlignment);
6440     StubCodeMark mark(this, "StubRoutines", "spin_wait");
6441     address start = __ pc();
6442 
6443     __ spin_wait();
6444     __ ret(lr);
6445 
6446     return start;
6447   }
6448 
6449 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
6450 
6451   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
6452   //
6453   // If LSE is in use, generate LSE versions of all the stubs. The
6454   // non-LSE versions are in atomic_aarch64.S.
6455 
6456   // class AtomicStubMark records the entry point of a stub and the
6457   // stub pointer which will point to it. The stub pointer is set to
6458   // the entry point when ~AtomicStubMark() is called, which must be
6459   // after ICache::invalidate_range. This ensures safe publication of
6460   // the generated code.
6461   class AtomicStubMark {
6462     address _entry_point;
6463     aarch64_atomic_stub_t *_stub;
6464     MacroAssembler *_masm;
6465   public:
6466     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
6467       _masm = masm;
6468       __ align(32);
6469       _entry_point = __ pc();
6470       _stub = stub;
6471     }
6472     ~AtomicStubMark() {
6473       *_stub = (aarch64_atomic_stub_t)_entry_point;
6474     }
6475   };
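  // Typical use (see generate_atomic_entry_points below): construct an
  // AtomicStubMark just before emitting a stub's code; the destructors run when
  // that method returns, after ICache::invalidate_range, so the entry points
  // are published safely.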
6476 
6477   // NB: For memory_order_conservative we need a trailing membar after
6478   // LSE atomic operations but not a leading membar.
6479   //
6480   // We don't need a leading membar because a clause in the Arm ARM
6481   // says:
6482   //
6483   //   Barrier-ordered-before
6484   //
6485   //   Barrier instructions order prior Memory effects before subsequent
6486   //   Memory effects generated by the same Observer. A read or a write
  //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
  //   Observer if and only if RW1 appears in program order before RW2
  //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
6490   //   instruction with both Acquire and Release semantics.
6491   //
6492   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
6493   // and Release semantics, therefore we don't need a leading
6494   // barrier. However, there is no corresponding Barrier-ordered-after
6495   // relationship, therefore we need a trailing membar to prevent a
6496   // later store or load from being reordered with the store in an
6497   // atomic instruction.
6498   //
6499   // This was checked by using the herd7 consistency model simulator
6500   // (http://diy.inria.fr/) with this test case:
6501   //
6502   // AArch64 LseCas
6503   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
6504   // P0 | P1;
6505   // LDR W4, [X2] | MOV W3, #0;
6506   // DMB LD       | MOV W4, #1;
6507   // LDR W3, [X1] | CASAL W3, W4, [X1];
6508   //              | DMB ISH;
6509   //              | STR W4, [X2];
6510   // exists
6511   // (0:X3=0 /\ 0:X4=1)
6512   //
6513   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
6514   // with the store to x in P1. Without the DMB in P1 this may happen.
6515   //
6516   // At the time of writing we don't know of any AArch64 hardware that
6517   // reorders stores in this way, but the Reference Manual permits it.
6518 
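  // A C-like sketch of the stub emitted by gen_cas_entry (an illustration of
  // the generated code, not code that is compiled here):
  //   T cas(T* ptr, T compare_val, T exchange_val) {
  //     T prev = compare_val;
  //     lse_cas(prev, exchange_val, ptr);   // CAS/CASL/CASAL depending on ordering
  //     if (order == memory_order_conservative) trailing_membar();
  //     return prev;                        // old memory value, in r0/w0
  //   }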
6519   void gen_cas_entry(Assembler::operand_size size,
6520                      atomic_memory_order order) {
6521     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
6522       exchange_val = c_rarg2;
6523     bool acquire, release;
6524     switch (order) {
6525       case memory_order_relaxed:
6526         acquire = false;
6527         release = false;
6528         break;
6529       case memory_order_release:
6530         acquire = false;
6531         release = true;
6532         break;
6533       default:
6534         acquire = true;
6535         release = true;
6536         break;
6537     }
6538     __ mov(prev, compare_val);
6539     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
6540     if (order == memory_order_conservative) {
6541       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6542     }
6543     if (size == Assembler::xword) {
6544       __ mov(r0, prev);
6545     } else {
6546       __ movw(r0, prev);
6547     }
6548     __ ret(lr);
6549   }
6550 
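  // Fetch-and-add: returns the previous memory value in r0 (xword) or w0 (word).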
6551   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
6552     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6553     // If not relaxed, then default to conservative.  Relaxed is the only
6554     // case we use enough to be worth specializing.
6555     if (order == memory_order_relaxed) {
6556       __ ldadd(size, incr, prev, addr);
6557     } else {
6558       __ ldaddal(size, incr, prev, addr);
6559       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6560     }
6561     if (size == Assembler::xword) {
6562       __ mov(r0, prev);
6563     } else {
6564       __ movw(r0, prev);
6565     }
6566     __ ret(lr);
6567   }
6568 
6569   void gen_swpal_entry(Assembler::operand_size size) {
6570     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
6571     __ swpal(size, incr, prev, addr);
6572     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
6573     if (size == Assembler::xword) {
6574       __ mov(r0, prev);
6575     } else {
6576       __ movw(r0, prev);
6577     }
6578     __ ret(lr);
6579   }
6580 
6581   void generate_atomic_entry_points() {
6582     if (! UseLSE) {
6583       return;
6584     }
6585 
6586     __ align(CodeEntryAlignment);
6587     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
6588     address first_entry = __ pc();
6589 
6590     // ADD, memory_order_conservative
6591     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
6592     gen_ldadd_entry(Assembler::word, memory_order_conservative);
6593     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
6594     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
6595 
6596     // ADD, memory_order_relaxed
6597     AtomicStubMark mark_fetch_add_4_relaxed
6598       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
6599     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
6600     AtomicStubMark mark_fetch_add_8_relaxed
6601       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
6602     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
6603 
6604     // XCHG, memory_order_conservative
6605     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
6606     gen_swpal_entry(Assembler::word);
6607     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
6608     gen_swpal_entry(Assembler::xword);
6609 
6610     // CAS, memory_order_conservative
6611     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
6612     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
6613     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
6614     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
6615     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
6616     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
6617 
6618     // CAS, memory_order_relaxed
6619     AtomicStubMark mark_cmpxchg_1_relaxed
6620       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
6621     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
6622     AtomicStubMark mark_cmpxchg_4_relaxed
6623       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
6624     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
6625     AtomicStubMark mark_cmpxchg_8_relaxed
6626       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
6627     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
6628 
6629     AtomicStubMark mark_cmpxchg_4_release
6630       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
6631     gen_cas_entry(MacroAssembler::word, memory_order_release);
6632     AtomicStubMark mark_cmpxchg_8_release
6633       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
6634     gen_cas_entry(MacroAssembler::xword, memory_order_release);
6635 
6636     AtomicStubMark mark_cmpxchg_4_seq_cst
6637       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
6638     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
6639     AtomicStubMark mark_cmpxchg_8_seq_cst
6640       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
6641     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
6642 
6643     ICache::invalidate_range(first_entry, __ pc() - first_entry);
6644   }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
6646 
6647   address generate_cont_thaw(Continuation::thaw_kind kind) {
6648     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
6649     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
6650 
6651     address start = __ pc();
6652 
6653     if (return_barrier) {
6654       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
6655       __ mov(sp, rscratch1);
6656     }
6657     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6658 
6659     if (return_barrier) {
6660       // preserve possible return value from a method returning to the return barrier
6661       __ fmovd(rscratch1, v0);
6662       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6663     }
6664 
6665     __ movw(c_rarg1, (return_barrier ? 1 : 0));
6666     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
6667     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
6668 
6669     if (return_barrier) {
6670       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6671       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6672       __ fmovd(v0, rscratch1);
6673     }
6674     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
6675 
6676 
6677     Label thaw_success;
6678     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
6679     __ cbnz(rscratch2, thaw_success);
6680     __ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
6681     __ br(rscratch1);
6682     __ bind(thaw_success);
6683 
6684     // make room for the thawed frames
6685     __ sub(rscratch1, sp, rscratch2);
6686     __ andr(rscratch1, rscratch1, -16); // align
6687     __ mov(sp, rscratch1);
6688 
6689     if (return_barrier) {
6690       // save original return value -- again
6691       __ fmovd(rscratch1, v0);
6692       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
6693     }
6694 
6695     // If we want, we can templatize thaw by kind, and have three different entries
6696     __ movw(c_rarg1, (uint32_t)kind);
6697 
6698     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
6699     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
6700 
6701     if (return_barrier) {
6702       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
6703       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
6704       __ fmovd(v0, rscratch1);
6705     } else {
6706       __ mov(r0, zr); // return 0 (success) from doYield
6707     }
6708 
    // we're now on the yield frame (which is at a higher address than us because sp has been pushed down)
6710     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
6711     __ mov(rfp, sp);
6712 
6713     if (return_barrier_exception) {
6714       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
6715       __ verify_oop(r0);
      __ mov(r19, r0); // save return value containing the exception oop in callee-saved R19
6717 
6718       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
6719 
6720       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
6721       // __ reinitialize_ptrue();
6722 
6723       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
6724 
6725       __ mov(r1, r0); // the exception handler
      __ mov(r0, r19); // restore return value containing the exception oop
6727       __ verify_oop(r0);
6728 
6729       __ leave();
6730       __ mov(r3, lr);
6731       __ br(r1); // the exception handler
6732     } else {
6733       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
6734       __ leave();
6735       __ ret(lr);
6736     }
6737 
6738     return start;
6739   }
6740 
6741   address generate_cont_thaw() {
6742     if (!Continuations::enabled()) return nullptr;
6743 
6744     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
6745     address start = __ pc();
6746     generate_cont_thaw(Continuation::thaw_top);
6747     return start;
6748   }
6749 
6750   address generate_cont_returnBarrier() {
6751     if (!Continuations::enabled()) return nullptr;
6752 
6753     // TODO: will probably need multiple return barriers depending on return type
6754     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
6755     address start = __ pc();
6756 
6757     generate_cont_thaw(Continuation::thaw_return_barrier);
6758 
6759     return start;
6760   }
6761 
6762   address generate_cont_returnBarrier_exception() {
6763     if (!Continuations::enabled()) return nullptr;
6764 
6765     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
6766     address start = __ pc();
6767 
6768     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
6769 
6770     return start;
6771   }
6772 
6773 #if INCLUDE_JFR
6774 
6775   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
6776     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6777     __ mov(c_rarg0, thread);
6778   }
6779 
6780   // The handle is dereferenced through a load barrier.
6781   static void jfr_epilogue(MacroAssembler* _masm) {
6782     __ reset_last_Java_frame(true);
6783     Label null_jobject;
6784     __ cbz(r0, null_jobject);
6785     DecoratorSet decorators = ACCESS_READ | IN_NATIVE;
6786     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
6787     bs->load_at(_masm, decorators, T_OBJECT, r0, Address(r0, 0), rscratch1, rscratch2);
6788     __ bind(null_jobject);
6789   }
6790 
6791   // For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
6792   // It returns a jobject handle to the event writer.
6793   // The handle is dereferenced and the return value is the event writer oop.
6794   static RuntimeStub* generate_jfr_write_checkpoint() {
6795     enum layout {
6796       rbp_off,
6797       rbpH_off,
6798       return_off,
6799       return_off2,
6800       framesize // inclusive of return address
6801     };
6802 
6803     int insts_size = 512;
6804     int locs_size = 64;
6805     CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
6806     OopMapSet* oop_maps = new OopMapSet();
6807     MacroAssembler* masm = new MacroAssembler(&code);
6808     MacroAssembler* _masm = masm;
6809 
6810     address start = __ pc();
6811     __ enter();
6812     int frame_complete = __ pc() - start;
6813     address the_pc = __ pc();
6814     jfr_prologue(the_pc, _masm, rthread);
6815     __ call_VM_leaf(CAST_FROM_FN_PTR(address, JfrIntrinsicSupport::write_checkpoint), 1);
6816     jfr_epilogue(_masm);
6817     __ leave();
6818     __ ret(lr);
6819 
6820     OopMap* map = new OopMap(framesize, 1); // rfp
6821     oop_maps->add_gc_map(the_pc - start, map);
6822 
6823     RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
6824       RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
6825                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6826                                     oop_maps, false);
6827     return stub;
6828   }
6829 
6830 #endif // INCLUDE_JFR
6831 
6832   // Continuation point for throwing of implicit exceptions that are
6833   // not handled in the current activation. Fabricates an exception
6834   // oop and initiates normal exception dispatching in this
6835   // frame. Since we need to preserve callee-saved values (currently
6836   // only for C2, but done for C1 as well) we need a callee-saved oop
6837   // map and therefore have to make these stubs into RuntimeStubs
6838   // rather than BufferBlobs.  If the compiler needs all registers to
6839   // be preserved between the fault point and the exception handler
6840   // then it must assume responsibility for that in
6841   // AbstractCompiler::continuation_for_implicit_null_exception or
6842   // continuation_for_implicit_division_by_zero_exception. All other
6843   // implicit exceptions (e.g., NullPointerException or
6844   // AbstractMethodError on entry) are either at call sites or
6845   // otherwise assume that stack unwinding will be initiated, so
6846   // caller saved registers were assumed volatile in the compiler.
6847 
6848 #undef __
6849 #define __ masm->
6850 
6851   address generate_throw_exception(const char* name,
6852                                    address runtime_entry,
6853                                    Register arg1 = noreg,
6854                                    Register arg2 = noreg) {
6855     // Information about frame layout at time of blocking runtime call.
6856     // Note that we only have to preserve callee-saved registers since
6857     // the compilers are responsible for supplying a continuation point
6858     // if they expect all registers to be preserved.
6859     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
6860     enum layout {
6861       rfp_off = 0,
6862       rfp_off2,
6863       return_off,
6864       return_off2,
6865       framesize // inclusive of return address
6866     };
6867 
6868     int insts_size = 512;
6869     int locs_size  = 64;
6870 
6871     CodeBuffer code(name, insts_size, locs_size);
6872     OopMapSet* oop_maps  = new OopMapSet();
6873     MacroAssembler* masm = new MacroAssembler(&code);
6874 
6875     address start = __ pc();
6876 
6877     // This is an inlined and slightly modified version of call_VM
6878     // which has the ability to fetch the return PC out of
6879     // thread-local storage and also sets up last_Java_sp slightly
6880     // differently than the real call_VM
6881 
6882     __ enter(); // Save FP and LR before call
6883 
6884     assert(is_even(framesize/2), "sp not 16-byte aligned");
6885 
6886     // lr and fp are already in place
6887     __ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
6888 
6889     int frame_complete = __ pc() - start;
6890 
6891     // Set up last_Java_sp and last_Java_fp
6892     address the_pc = __ pc();
6893     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
6894 
6895     // Call runtime
6896     if (arg1 != noreg) {
6897       assert(arg2 != c_rarg1, "clobbered");
6898       __ mov(c_rarg1, arg1);
6899     }
6900     if (arg2 != noreg) {
6901       __ mov(c_rarg2, arg2);
6902     }
6903     __ mov(c_rarg0, rthread);
6904     BLOCK_COMMENT("call runtime_entry");
6905     __ mov(rscratch1, runtime_entry);
6906     __ blr(rscratch1);
6907 
6908     // Generate oop map
6909     OopMap* map = new OopMap(framesize, 0);
6910 
6911     oop_maps->add_gc_map(the_pc - start, map);
6912 
6913     __ reset_last_Java_frame(true);
6914 
6915     // Reinitialize the ptrue predicate register, in case the external runtime
6916     // call clobbers ptrue reg, as we may return to SVE compiled code.
6917     __ reinitialize_ptrue();
6918 
6919     __ leave();
6920 
6921     // check for pending exceptions
6922 #ifdef ASSERT
6923     Label L;
6924     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
6925     __ cbnz(rscratch1, L);
6926     __ should_not_reach_here();
6927     __ bind(L);
6928 #endif // ASSERT
6929     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6930 
6931     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6932     RuntimeStub* stub =
6933       RuntimeStub::new_runtime_stub(name,
6934                                     &code,
6935                                     frame_complete,
6936                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6937                                     oop_maps, false);
6938     return stub->entry_point();
6939   }
6940 
6941   class MontgomeryMultiplyGenerator : public MacroAssembler {
6942 
6943     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
6944       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
6945 
6946     RegSet _toSave;
6947     bool _squaring;
6948 
6949   public:
6950     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
6951       : MacroAssembler(as->code()), _squaring(squaring) {
6952 
6953       // Register allocation
6954 
6955       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
6956       Pa_base = *regs;       // Argument registers
6957       if (squaring)
6958         Pb_base = Pa_base;
6959       else
6960         Pb_base = *++regs;
6961       Pn_base = *++regs;
6962       Rlen= *++regs;
6963       inv = *++regs;
6964       Pm_base = *++regs;
6965 
6966                           // Working registers:
6967       Ra =  *++regs;        // The current digit of a, b, n, and m.
6968       Rb =  *++regs;
6969       Rm =  *++regs;
6970       Rn =  *++regs;
6971 
6972       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
6973       Pb =  *++regs;
6974       Pm =  *++regs;
6975       Pn =  *++regs;
6976 
6977       t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
6979       t2 =  *++regs;
6980 
6981       Ri =  *++regs;        // Inner and outer loop indexes.
6982       Rj =  *++regs;
6983 
6984       Rhi_ab = *++regs;     // Product registers: low and high parts
6985       Rlo_ab = *++regs;     // of a*b and m*n.
6986       Rhi_mn = *++regs;
6987       Rlo_mn = *++regs;
6988 
6989       // r19 and up are callee-saved.
6990       _toSave = RegSet::range(r19, *regs) + Pm_base;
6991     }
6992 
6993   private:
6994     void save_regs() {
6995       push(_toSave, sp);
6996     }
6997 
6998     void restore_regs() {
6999       pop(_toSave, sp);
7000     }
7001 
7002     template <typename T>
7003     void unroll_2(Register count, T block) {
7004       Label loop, end, odd;
7005       tbnz(count, 0, odd);
7006       cbz(count, end);
7007       align(16);
7008       bind(loop);
7009       (this->*block)();
7010       bind(odd);
7011       (this->*block)();
7012       subs(count, count, 2);
7013       br(Assembler::GT, loop);
7014       bind(end);
7015     }
7016 
7017     template <typename T>
7018     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7019       Label loop, end, odd;
7020       tbnz(count, 0, odd);
7021       cbz(count, end);
7022       align(16);
7023       bind(loop);
7024       (this->*block)(d, s, tmp);
7025       bind(odd);
7026       (this->*block)(d, s, tmp);
7027       subs(count, count, 2);
7028       br(Assembler::GT, loop);
7029       bind(end);
7030     }
7031 
7032     void pre1(RegisterOrConstant i) {
7033       block_comment("pre1");
7034       // Pa = Pa_base;
7035       // Pb = Pb_base + i;
7036       // Pm = Pm_base;
7037       // Pn = Pn_base + i;
7038       // Ra = *Pa;
7039       // Rb = *Pb;
7040       // Rm = *Pm;
7041       // Rn = *Pn;
7042       ldr(Ra, Address(Pa_base));
7043       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7044       ldr(Rm, Address(Pm_base));
7045       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7046       lea(Pa, Address(Pa_base));
7047       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7048       lea(Pm, Address(Pm_base));
7049       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7050 
7051       // Zero the m*n result.
7052       mov(Rhi_mn, zr);
7053       mov(Rlo_mn, zr);
7054     }
7055 
7056     // The core multiply-accumulate step of a Montgomery
7057     // multiplication.  The idea is to schedule operations as a
7058     // pipeline so that instructions with long latencies (loads and
7059     // multiplies) have time to complete before their results are
    // used.  In-order implementations of the architecture benefit the
    // most, but out-of-order ones also gain.
7062     void step() {
7063       block_comment("step");
7064       // MACC(Ra, Rb, t0, t1, t2);
7065       // Ra = *++Pa;
7066       // Rb = *--Pb;
7067       umulh(Rhi_ab, Ra, Rb);
7068       mul(Rlo_ab, Ra, Rb);
7069       ldr(Ra, pre(Pa, wordSize));
7070       ldr(Rb, pre(Pb, -wordSize));
7071       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7072                                        // previous iteration.
7073       // MACC(Rm, Rn, t0, t1, t2);
7074       // Rm = *++Pm;
7075       // Rn = *--Pn;
7076       umulh(Rhi_mn, Rm, Rn);
7077       mul(Rlo_mn, Rm, Rn);
7078       ldr(Rm, pre(Pm, wordSize));
7079       ldr(Rn, pre(Pn, -wordSize));
7080       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7081     }
7082 
7083     void post1() {
7084       block_comment("post1");
7085 
7086       // MACC(Ra, Rb, t0, t1, t2);
7087       // Ra = *++Pa;
7088       // Rb = *--Pb;
7089       umulh(Rhi_ab, Ra, Rb);
7090       mul(Rlo_ab, Ra, Rb);
7091       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7092       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7093 
7094       // *Pm = Rm = t0 * inv;
7095       mul(Rm, t0, inv);
7096       str(Rm, Address(Pm));
7097 
7098       // MACC(Rm, Rn, t0, t1, t2);
7099       // t0 = t1; t1 = t2; t2 = 0;
7100       umulh(Rhi_mn, Rm, Rn);
7101 
7102 #ifndef PRODUCT
7103       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7104       {
7105         mul(Rlo_mn, Rm, Rn);
7106         add(Rlo_mn, t0, Rlo_mn);
7107         Label ok;
7108         cbz(Rlo_mn, ok); {
7109           stop("broken Montgomery multiply");
7110         } bind(ok);
7111       }
7112 #endif
7113       // We have very carefully set things up so that
7114       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7115       // the lower half of Rm * Rn because we know the result already:
7116       // it must be -t0.  t0 + (-t0) must generate a carry iff
7117       // t0 != 0.  So, rather than do a mul and an adds we just set
7118       // the carry flag iff t0 is nonzero.
7119       //
7120       // mul(Rlo_mn, Rm, Rn);
7121       // adds(zr, t0, Rlo_mn);
7122       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7123       adcs(t0, t1, Rhi_mn);
7124       adc(t1, t2, zr);
7125       mov(t2, zr);
7126     }
7127 
7128     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7129       block_comment("pre2");
7130       // Pa = Pa_base + i-len;
7131       // Pb = Pb_base + len;
7132       // Pm = Pm_base + i-len;
7133       // Pn = Pn_base + len;
7134 
7135       if (i.is_register()) {
7136         sub(Rj, i.as_register(), len);
7137       } else {
7138         mov(Rj, i.as_constant());
7139         sub(Rj, Rj, len);
7140       }
7141       // Rj == i-len
7142 
7143       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7144       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7145       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7146       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7147 
7148       // Ra = *++Pa;
7149       // Rb = *--Pb;
7150       // Rm = *++Pm;
7151       // Rn = *--Pn;
7152       ldr(Ra, pre(Pa, wordSize));
7153       ldr(Rb, pre(Pb, -wordSize));
7154       ldr(Rm, pre(Pm, wordSize));
7155       ldr(Rn, pre(Pn, -wordSize));
7156 
7157       mov(Rhi_mn, zr);
7158       mov(Rlo_mn, zr);
7159     }
7160 
7161     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7162       block_comment("post2");
7163       if (i.is_constant()) {
7164         mov(Rj, i.as_constant()-len.as_constant());
7165       } else {
7166         sub(Rj, i.as_register(), len);
7167       }
7168 
7169       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7170 
7171       // As soon as we know the least significant digit of our result,
7172       // store it.
7173       // Pm_base[i-len] = t0;
7174       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7175 
7176       // t0 = t1; t1 = t2; t2 = 0;
7177       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7178       adc(t1, t2, zr);
7179       mov(t2, zr);
7180     }
7181 
7182     // A carry in t0 after Montgomery multiplication means that we
7183     // should subtract multiples of n from our result in m.  We'll
7184     // keep doing that until there is no carry.
7185     void normalize(RegisterOrConstant len) {
7186       block_comment("normalize");
7187       // while (t0)
7188       //   t0 = sub(Pm_base, Pn_base, t0, len);
7189       Label loop, post, again;
7190       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7191       cbz(t0, post); {
7192         bind(again); {
7193           mov(i, zr);
7194           mov(cnt, len);
7195           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7196           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7197           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7198           align(16);
7199           bind(loop); {
7200             sbcs(Rm, Rm, Rn);
7201             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7202             add(i, i, 1);
7203             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7204             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7205             sub(cnt, cnt, 1);
7206           } cbnz(cnt, loop);
7207           sbc(t0, t0, zr);
7208         } cbnz(t0, again);
7209       } bind(post);
7210     }
7211 
7212     // Move memory at s to d, reversing words.
    //    Increments d to the end of the copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at the start
7217     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7218       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7219       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7220 
7221       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7222       mov(tmp1, len);
7223       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7224       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7225     }
7226     // where
7227     void reverse1(Register d, Register s, Register tmp) {
7228       ldr(tmp, pre(s, -wordSize));
7229       ror(tmp, tmp, 32);
7230       str(tmp, post(d, wordSize));
7231     }
7232 
7233     void step_squaring() {
      // An extra ACC: accumulate the a*b product a second time, since
      // each cross term of a square appears twice (MACC2 in the C sketch).
7235       step();
7236       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7237     }
7238 
7239     void last_squaring(RegisterOrConstant i) {
7240       Label dont;
7241       // if ((i & 1) == 0) {
7242       tbnz(i.as_register(), 0, dont); {
7243         // MACC(Ra, Rb, t0, t1, t2);
7244         // Ra = *++Pa;
7245         // Rb = *--Pb;
7246         umulh(Rhi_ab, Ra, Rb);
7247         mul(Rlo_ab, Ra, Rb);
7248         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7249       } bind(dont);
7250     }
7251 
7252     void extra_step_squaring() {
7253       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7254 
7255       // MACC(Rm, Rn, t0, t1, t2);
7256       // Rm = *++Pm;
7257       // Rn = *--Pn;
7258       umulh(Rhi_mn, Rm, Rn);
7259       mul(Rlo_mn, Rm, Rn);
7260       ldr(Rm, pre(Pm, wordSize));
7261       ldr(Rn, pre(Pn, -wordSize));
7262     }
7263 
7264     void post1_squaring() {
7265       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7266 
7267       // *Pm = Rm = t0 * inv;
7268       mul(Rm, t0, inv);
7269       str(Rm, Address(Pm));
7270 
7271       // MACC(Rm, Rn, t0, t1, t2);
7272       // t0 = t1; t1 = t2; t2 = 0;
7273       umulh(Rhi_mn, Rm, Rn);
7274 
7275 #ifndef PRODUCT
7276       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7277       {
7278         mul(Rlo_mn, Rm, Rn);
7279         add(Rlo_mn, t0, Rlo_mn);
7280         Label ok;
7281         cbz(Rlo_mn, ok); {
7282           stop("broken Montgomery multiply");
7283         } bind(ok);
7284       }
7285 #endif
7286       // We have very carefully set things up so that
7287       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7288       // the lower half of Rm * Rn because we know the result already:
7289       // it must be -t0.  t0 + (-t0) must generate a carry iff
7290       // t0 != 0.  So, rather than do a mul and an adds we just set
7291       // the carry flag iff t0 is nonzero.
7292       //
7293       // mul(Rlo_mn, Rm, Rn);
7294       // adds(zr, t0, Rlo_mn);
7295       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7296       adcs(t0, t1, Rhi_mn);
7297       adc(t1, t2, zr);
7298       mov(t2, zr);
7299     }
7300 
7301     void acc(Register Rhi, Register Rlo,
7302              Register t0, Register t1, Register t2) {
7303       adds(t0, t0, Rlo);
7304       adcs(t1, t1, Rhi);
7305       adc(t2, t2, zr);
7306     }
7307 
7308   public:
7309     /**
7310      * Fast Montgomery multiplication.  The derivation of the
7311      * algorithm is in A Cryptographic Library for the Motorola
7312      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
7313      *
7314      * Arguments:
7315      *
7316      * Inputs for multiplication:
7317      *   c_rarg0   - int array elements a
7318      *   c_rarg1   - int array elements b
7319      *   c_rarg2   - int array elements n (the modulus)
7320      *   c_rarg3   - int length
7321      *   c_rarg4   - int inv
7322      *   c_rarg5   - int array elements m (the result)
7323      *
7324      * Inputs for squaring:
7325      *   c_rarg0   - int array elements a
7326      *   c_rarg1   - int array elements n (the modulus)
7327      *   c_rarg2   - int length
7328      *   c_rarg3   - int inv
7329      *   c_rarg4   - int array elements m (the result)
7330      *
7331      */
7332     address generate_multiply() {
7333       Label argh, nothing;
7334       bind(argh);
7335       stop("MontgomeryMultiply total_allocation must be <= 8192");
7336 
7337       align(CodeEntryAlignment);
7338       address entry = pc();
7339 
7340       cbzw(Rlen, nothing);
7341 
7342       enter();
7343 
7344       // Make room.
7345       cmpw(Rlen, 512);
7346       br(Assembler::HI, argh);
7347       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7348       andr(sp, Ra, -2 * wordSize);
7349 
7350       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7351 
7352       {
7353         // Copy input args, reversing as we go.  We use Ra as a
7354         // temporary variable.
7355         reverse(Ra, Pa_base, Rlen, t0, t1);
7356         if (!_squaring)
7357           reverse(Ra, Pb_base, Rlen, t0, t1);
7358         reverse(Ra, Pn_base, Rlen, t0, t1);
7359       }
7360 
      // Push all callee-saved registers, and also Pm_base which we'll need
      // at the end.
7363       save_regs();
7364 
7365 #ifndef PRODUCT
7366       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
7367       {
7368         ldr(Rn, Address(Pn_base, 0));
7369         mul(Rlo_mn, Rn, inv);
7370         subs(zr, Rlo_mn, -1);
7371         Label ok;
7372         br(EQ, ok); {
7373           stop("broken inverse in Montgomery multiply");
7374         } bind(ok);
7375       }
7376 #endif
7377 
7378       mov(Pm_base, Ra);
7379 
7380       mov(t0, zr);
7381       mov(t1, zr);
7382       mov(t2, zr);
7383 
7384       block_comment("for (int i = 0; i < len; i++) {");
7385       mov(Ri, zr); {
7386         Label loop, end;
7387         cmpw(Ri, Rlen);
7388         br(Assembler::GE, end);
7389 
7390         bind(loop);
7391         pre1(Ri);
7392 
7393         block_comment("  for (j = i; j; j--) {"); {
7394           movw(Rj, Ri);
7395           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7396         } block_comment("  } // j");
7397 
7398         post1();
7399         addw(Ri, Ri, 1);
7400         cmpw(Ri, Rlen);
7401         br(Assembler::LT, loop);
7402         bind(end);
7403         block_comment("} // i");
7404       }
7405 
7406       block_comment("for (int i = len; i < 2*len; i++) {");
7407       mov(Ri, Rlen); {
7408         Label loop, end;
7409         cmpw(Ri, Rlen, Assembler::LSL, 1);
7410         br(Assembler::GE, end);
7411 
7412         bind(loop);
7413         pre2(Ri, Rlen);
7414 
7415         block_comment("  for (j = len*2-i-1; j; j--) {"); {
7416           lslw(Rj, Rlen, 1);
7417           subw(Rj, Rj, Ri);
7418           subw(Rj, Rj, 1);
7419           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
7420         } block_comment("  } // j");
7421 
7422         post2(Ri, Rlen);
7423         addw(Ri, Ri, 1);
7424         cmpw(Ri, Rlen, Assembler::LSL, 1);
7425         br(Assembler::LT, loop);
7426         bind(end);
7427       }
7428       block_comment("} // i");
7429 
7430       normalize(Rlen);
7431 
7432       mov(Ra, Pm_base);  // Save Pm_base in Ra
7433       restore_regs();  // Restore caller's Pm_base
7434 
7435       // Copy our result into caller's Pm_base
7436       reverse(Pm_base, Ra, Rlen, t0, t1);
7437 
7438       leave();
7439       bind(nothing);
7440       ret(lr);
7441 
7442       return entry;
7443     }
7444     // In C, approximately:
7445 
7446     // void
7447     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
7448     //                     julong Pn_base[], julong Pm_base[],
7449     //                     julong inv, int len) {
7450     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7451     //   julong *Pa, *Pb, *Pn, *Pm;
7452     //   julong Ra, Rb, Rn, Rm;
7453 
7454     //   int i;
7455 
7456     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7457 
7458     //   for (i = 0; i < len; i++) {
7459     //     int j;
7460 
7461     //     Pa = Pa_base;
7462     //     Pb = Pb_base + i;
7463     //     Pm = Pm_base;
7464     //     Pn = Pn_base + i;
7465 
7466     //     Ra = *Pa;
7467     //     Rb = *Pb;
7468     //     Rm = *Pm;
7469     //     Rn = *Pn;
7470 
7471     //     int iters = i;
7472     //     for (j = 0; iters--; j++) {
7473     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7474     //       MACC(Ra, Rb, t0, t1, t2);
7475     //       Ra = *++Pa;
7476     //       Rb = *--Pb;
7477     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7478     //       MACC(Rm, Rn, t0, t1, t2);
7479     //       Rm = *++Pm;
7480     //       Rn = *--Pn;
7481     //     }
7482 
7483     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
7484     //     MACC(Ra, Rb, t0, t1, t2);
7485     //     *Pm = Rm = t0 * inv;
7486     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7487     //     MACC(Rm, Rn, t0, t1, t2);
7488 
7489     //     assert(t0 == 0, "broken Montgomery multiply");
7490 
7491     //     t0 = t1; t1 = t2; t2 = 0;
7492     //   }
7493 
7494     //   for (i = len; i < 2*len; i++) {
7495     //     int j;
7496 
7497     //     Pa = Pa_base + i-len;
7498     //     Pb = Pb_base + len;
7499     //     Pm = Pm_base + i-len;
7500     //     Pn = Pn_base + len;
7501 
7502     //     Ra = *++Pa;
7503     //     Rb = *--Pb;
7504     //     Rm = *++Pm;
7505     //     Rn = *--Pn;
7506 
7507     //     int iters = len*2-i-1;
7508     //     for (j = i-len+1; iters--; j++) {
7509     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
7510     //       MACC(Ra, Rb, t0, t1, t2);
7511     //       Ra = *++Pa;
7512     //       Rb = *--Pb;
7513     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7514     //       MACC(Rm, Rn, t0, t1, t2);
7515     //       Rm = *++Pm;
7516     //       Rn = *--Pn;
7517     //     }
7518 
7519     //     Pm_base[i-len] = t0;
7520     //     t0 = t1; t1 = t2; t2 = 0;
7521     //   }
7522 
7523     //   while (t0)
7524     //     t0 = sub(Pm_base, Pn_base, t0, len);
7525     // }
7526 
7527     /**
7528      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
7529      * multiplies than Montgomery multiplication so it should be up to
7530      * 25% faster.  However, its loop control is more complex and it
7531      * may actually run slower on some machines.
7532      *
7533      * Arguments:
7534      *
7535      * Inputs:
7536      *   c_rarg0   - int array elements a
7537      *   c_rarg1   - int array elements n (the modulus)
7538      *   c_rarg2   - int length
7539      *   c_rarg3   - int inv
7540      *   c_rarg4   - int array elements m (the result)
7541      *
7542      */
7543     address generate_square() {
7544       Label argh;
7545       bind(argh);
7546       stop("MontgomeryMultiply total_allocation must be <= 8192");
7547 
7548       align(CodeEntryAlignment);
7549       address entry = pc();
7550 
7551       enter();
7552 
7553       // Make room.
7554       cmpw(Rlen, 512);
7555       br(Assembler::HI, argh);
7556       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
7557       andr(sp, Ra, -2 * wordSize);
7558 
7559       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
7560 
7561       {
7562         // Copy input args, reversing as we go.  We use Ra as a
7563         // temporary variable.
7564         reverse(Ra, Pa_base, Rlen, t0, t1);
7565         reverse(Ra, Pn_base, Rlen, t0, t1);
7566       }
7567 
      // Push all callee-saved registers, and also Pm_base which we'll need
      // at the end.
7570       save_regs();
7571 
7572       mov(Pm_base, Ra);
7573 
7574       mov(t0, zr);
7575       mov(t1, zr);
7576       mov(t2, zr);
7577 
7578       block_comment("for (int i = 0; i < len; i++) {");
7579       mov(Ri, zr); {
7580         Label loop, end;
7581         bind(loop);
7582         cmp(Ri, Rlen);
7583         br(Assembler::GE, end);
7584 
7585         pre1(Ri);
7586 
7587         block_comment("for (j = (i+1)/2; j; j--) {"); {
7588           add(Rj, Ri, 1);
7589           lsr(Rj, Rj, 1);
7590           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7591         } block_comment("  } // j");
7592 
7593         last_squaring(Ri);
7594 
7595         block_comment("  for (j = i/2; j; j--) {"); {
7596           lsr(Rj, Ri, 1);
7597           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7598         } block_comment("  } // j");
7599 
7600         post1_squaring();
7601         add(Ri, Ri, 1);
7602         cmp(Ri, Rlen);
7603         br(Assembler::LT, loop);
7604 
7605         bind(end);
7606         block_comment("} // i");
7607       }
7608 
7609       block_comment("for (int i = len; i < 2*len; i++) {");
7610       mov(Ri, Rlen); {
7611         Label loop, end;
7612         bind(loop);
7613         cmp(Ri, Rlen, Assembler::LSL, 1);
7614         br(Assembler::GE, end);
7615 
7616         pre2(Ri, Rlen);
7617 
7618         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
7619           lsl(Rj, Rlen, 1);
7620           sub(Rj, Rj, Ri);
7621           sub(Rj, Rj, 1);
7622           lsr(Rj, Rj, 1);
7623           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
7624         } block_comment("  } // j");
7625 
7626         last_squaring(Ri);
7627 
7628         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
7629           lsl(Rj, Rlen, 1);
7630           sub(Rj, Rj, Ri);
7631           lsr(Rj, Rj, 1);
7632           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
7633         } block_comment("  } // j");
7634 
7635         post2(Ri, Rlen);
7636         add(Ri, Ri, 1);
7637         cmp(Ri, Rlen, Assembler::LSL, 1);
7638 
7639         br(Assembler::LT, loop);
7640         bind(end);
7641         block_comment("} // i");
7642       }
7643 
7644       normalize(Rlen);
7645 
7646       mov(Ra, Pm_base);  // Save Pm_base in Ra
7647       restore_regs();  // Restore caller's Pm_base
7648 
7649       // Copy our result into caller's Pm_base
7650       reverse(Pm_base, Ra, Rlen, t0, t1);
7651 
7652       leave();
7653       ret(lr);
7654 
7655       return entry;
7656     }
7657     // In C, approximately:
7658 
7659     // void
7660     // montgomery_square(julong Pa_base[], julong Pn_base[],
7661     //                   julong Pm_base[], julong inv, int len) {
7662     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
7663     //   julong *Pa, *Pb, *Pn, *Pm;
7664     //   julong Ra, Rb, Rn, Rm;
7665 
7666     //   int i;
7667 
7668     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
7669 
7670     //   for (i = 0; i < len; i++) {
7671     //     int j;
7672 
7673     //     Pa = Pa_base;
7674     //     Pb = Pa_base + i;
7675     //     Pm = Pm_base;
7676     //     Pn = Pn_base + i;
7677 
7678     //     Ra = *Pa;
7679     //     Rb = *Pb;
7680     //     Rm = *Pm;
7681     //     Rn = *Pn;
7682 
7683     //     int iters = (i+1)/2;
7684     //     for (j = 0; iters--; j++) {
7685     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7686     //       MACC2(Ra, Rb, t0, t1, t2);
7687     //       Ra = *++Pa;
7688     //       Rb = *--Pb;
7689     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7690     //       MACC(Rm, Rn, t0, t1, t2);
7691     //       Rm = *++Pm;
7692     //       Rn = *--Pn;
7693     //     }
7694     //     if ((i & 1) == 0) {
7695     //       assert(Ra == Pa_base[j], "must be");
7696     //       MACC(Ra, Ra, t0, t1, t2);
7697     //     }
7698     //     iters = i/2;
7699     //     assert(iters == i-j, "must be");
7700     //     for (; iters--; j++) {
7701     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7702     //       MACC(Rm, Rn, t0, t1, t2);
7703     //       Rm = *++Pm;
7704     //       Rn = *--Pn;
7705     //     }
7706 
7707     //     *Pm = Rm = t0 * inv;
7708     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
7709     //     MACC(Rm, Rn, t0, t1, t2);
7710 
7711     //     assert(t0 == 0, "broken Montgomery multiply");
7712 
7713     //     t0 = t1; t1 = t2; t2 = 0;
7714     //   }
7715 
7716     //   for (i = len; i < 2*len; i++) {
7717     //     int start = i-len+1;
7718     //     int end = start + (len - start)/2;
7719     //     int j;
7720 
7721     //     Pa = Pa_base + i-len;
7722     //     Pb = Pa_base + len;
7723     //     Pm = Pm_base + i-len;
7724     //     Pn = Pn_base + len;
7725 
7726     //     Ra = *++Pa;
7727     //     Rb = *--Pb;
7728     //     Rm = *++Pm;
7729     //     Rn = *--Pn;
7730 
7731     //     int iters = (2*len-i-1)/2;
7732     //     assert(iters == end-start, "must be");
7733     //     for (j = start; iters--; j++) {
7734     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
7735     //       MACC2(Ra, Rb, t0, t1, t2);
7736     //       Ra = *++Pa;
7737     //       Rb = *--Pb;
7738     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7739     //       MACC(Rm, Rn, t0, t1, t2);
7740     //       Rm = *++Pm;
7741     //       Rn = *--Pn;
7742     //     }
7743     //     if ((i & 1) == 0) {
7744     //       assert(Ra == Pa_base[j], "must be");
7745     //       MACC(Ra, Ra, t0, t1, t2);
7746     //     }
7747     //     iters =  (2*len-i)/2;
7748     //     assert(iters == len-j, "must be");
7749     //     for (; iters--; j++) {
7750     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
7751     //       MACC(Rm, Rn, t0, t1, t2);
7752     //       Rm = *++Pm;
7753     //       Rn = *--Pn;
7754     //     }
7755     //     Pm_base[i-len] = t0;
7756     //     t0 = t1; t1 = t2; t2 = 0;
7757     //   }
7758 
7759     //   while (t0)
7760     //     t0 = sub(Pm_base, Pn_base, t0, len);
7761     // }
7762   };
7763 
7764 
7765   // Initialization
7766   void generate_initial() {
    // Generate the initial stubs and initialize the entry points.

    // Entry points that exist on all platforms.  Note: This is code
    // that could be shared among different platforms - however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure.  See also the comment
    // in stubRoutines.hpp.
7774 
7775     StubRoutines::_forward_exception_entry = generate_forward_exception();
7776 
7777     StubRoutines::_call_stub_entry =
7778       generate_call_stub(StubRoutines::_call_stub_return_address);
7779 
    // This entry is referenced by megamorphic calls.
7781     StubRoutines::_catch_exception_entry = generate_catch_exception();
7782 
7783     // Build this early so it's available for the interpreter.
7784     StubRoutines::_throw_StackOverflowError_entry =
7785       generate_throw_exception("StackOverflowError throw_exception",
7786                                CAST_FROM_FN_PTR(address,
7787                                                 SharedRuntime::throw_StackOverflowError));
7788     StubRoutines::_throw_delayed_StackOverflowError_entry =
7789       generate_throw_exception("delayed StackOverflowError throw_exception",
7790                                CAST_FROM_FN_PTR(address,
7791                                                 SharedRuntime::throw_delayed_StackOverflowError));
7792     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
7794       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
7795       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7796     }
7797 
7798     if (UseCRC32CIntrinsics) {
7799       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
7800     }
7801 
7802     // Disabled until JDK-8210858 is fixed
7803     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7804     //   StubRoutines::_dlog = generate_dlog();
7805     // }
7806 
7807     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7808       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
7809     }
7810 
7811     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7812       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
7813     }
7814   }
7815 
7816   void generate_phase1() {
7817     // Continuation stubs:
7818     StubRoutines::_cont_thaw          = generate_cont_thaw();
7819     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
7820     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
7821 
7822     JFR_ONLY(StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();)
7823     JFR_ONLY(StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();)
7824   }
7825 
7826   void generate_all() {
7827     // support for verify_oop (must happen after universe_init)
7828     if (VerifyOops) {
7829       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
7830     }
7831     StubRoutines::_throw_AbstractMethodError_entry =
7832       generate_throw_exception("AbstractMethodError throw_exception",
7833                                CAST_FROM_FN_PTR(address,
7834                                                 SharedRuntime::
7835                                                 throw_AbstractMethodError));
7836 
7837     StubRoutines::_throw_IncompatibleClassChangeError_entry =
7838       generate_throw_exception("IncompatibleClassChangeError throw_exception",
7839                                CAST_FROM_FN_PTR(address,
7840                                                 SharedRuntime::
7841                                                 throw_IncompatibleClassChangeError));
7842 
7843     StubRoutines::_throw_NullPointerException_at_call_entry =
7844       generate_throw_exception("NullPointerException at call throw_exception",
7845                                CAST_FROM_FN_PTR(address,
7846                                                 SharedRuntime::
7847                                                 throw_NullPointerException_at_call));
7848 
7849     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
7850 
7851     // arraycopy stubs used by compilers
7852     generate_arraycopy_stubs();
7853 
7854     // countPositives stub for large arrays.
7855     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
7856 
7857     // array equals stub for large arrays.
7858     if (!UseSimpleArrayEquals) {
7859       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
7860     }
7861 
7862     generate_compare_long_strings();
7863 
7864     generate_string_indexof_stubs();
7865 
7866     // byte_array_inflate stub for large arrays.
7867     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
7868 
7869     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7870     if (bs_nm != NULL) {
7871       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
7872     }
7873 #ifdef COMPILER2
7874     if (UseMultiplyToLenIntrinsic) {
7875       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7876     }
7877 
7878     if (UseSquareToLenIntrinsic) {
7879       StubRoutines::_squareToLen = generate_squareToLen();
7880     }
7881 
7882     if (UseMulAddIntrinsic) {
7883       StubRoutines::_mulAdd = generate_mulAdd();
7884     }
7885 
7886     if (UseSIMDForBigIntegerShiftIntrinsics) {
7887       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7888       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
7889     }
7890 
7891     if (UseMontgomeryMultiplyIntrinsic) {
7892       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
7893       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7894       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7895     }
7896 
7897     if (UseMontgomerySquareIntrinsic) {
7898       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
7899       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7900       // We use generate_multiply() rather than generate_square()
7901       // because it's faster for the sizes of modulus we care about.
7902       StubRoutines::_montgomerySquare = g.generate_multiply();
7903     }
7904 #endif // COMPILER2
7905 
7906     if (UseBASE64Intrinsics) {
7907         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7908         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7909     }
7910 
7911     // data cache line writeback
7912     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7913     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7914 
7915     if (UseAESIntrinsics) {
7916       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7917       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7918       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7919       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7920       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7921     }
7922     if (UseGHASHIntrinsics) {
7923       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7924       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
7925     }
7926     if (UseAESIntrinsics && UseGHASHIntrinsics) {
7927       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7928     }
7929 
7930     if (UseMD5Intrinsics) {
7931       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
7932       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
7933     }
7934     if (UseSHA1Intrinsics) {
7935       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
7936       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
7937     }
7938     if (UseSHA256Intrinsics) {
7939       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
7940       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
7941     }
7942     if (UseSHA512Intrinsics) {
7943       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
7944       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
7945     }
7946     if (UseSHA3Intrinsics) {
7947       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
7948       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
7949     }
7950 
7951     // generate Adler32 intrinsics code
7952     if (UseAdler32Intrinsics) {
7953       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7954     }
7955 
7956     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
7957 
7958 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
7959 
7960     generate_atomic_entry_points();
7961 
7962 #endif // LINUX
7963 
7964     StubRoutines::aarch64::set_completed();
7965   }
7966 
7967  public:
7968   StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
7969     if (phase == 0) {
7970       generate_initial();
7971     } else if (phase == 1) {
7972       generate_phase1(); // stubs that must be available for the interpreter
7973     } else {
7974       generate_all();
7975     }
7976   }
7977 }; // end class declaration
7978 
7979 #define UCM_TABLE_MAX_ENTRIES 8
7980 void StubGenerator_generate(CodeBuffer* code, int phase) {
7981   if (UnsafeCopyMemory::_table == NULL) {
7982     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7983   }
7984   StubGenerator g(code, phase);
7985 }
7986 
7987 
7988 #if defined (LINUX)
7989 
7990 // Define pointers to atomic stubs and initialize them to point to the
7991 // code in atomic_aarch64.S.
7992 
7993 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
7994   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
7995     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
7996   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
7997     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
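
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to (approximately):
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;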
7998 
7999 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8000 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8001 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8002 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8003 DEFAULT_ATOMIC_OP(xchg, 4, )
8004 DEFAULT_ATOMIC_OP(xchg, 8, )
8005 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8006 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8007 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8008 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8009 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8010 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8011 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
8012 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
8013 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
8014 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
8015 
8016 #undef DEFAULT_ATOMIC_OP
8017 
8018 #endif // LINUX
8019 
8020 
8021 #undef __
8022 #define __ masm->
8023 
8024 // on exit, sp points to the ContinuationEntry
8025 OopMap* continuation_enter_setup(MacroAssembler* masm, int& stack_slots) {
8026   assert(ContinuationEntry::size() % VMRegImpl::stack_slot_size == 0, "");
8027   assert(in_bytes(ContinuationEntry::cont_offset())  % VMRegImpl::stack_slot_size == 0, "");
8028   assert(in_bytes(ContinuationEntry::chunk_offset()) % VMRegImpl::stack_slot_size == 0, "");
8029 
8030   stack_slots += (int)ContinuationEntry::size()/wordSize;
8031   __ sub(sp, sp, (int)ContinuationEntry::size()); // place Continuation metadata
8032 
8033   OopMap* map = new OopMap(((int)ContinuationEntry::size() + wordSize)/ VMRegImpl::stack_slot_size, 0 /* arg_slots*/);
8034   ContinuationEntry::setup_oopmap(map);
8035 
8036   __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
8037   __ str(rscratch1, Address(sp, ContinuationEntry::parent_offset()));
8038   __ mov(rscratch1, sp); // we can't use sp as the source in str
8039   __ str(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
8040 
8041   return map;
8042 }
8043 
8044 // on entry c_rarg1 points to the continuation
8045 //          sp points to ContinuationEntry
8046 //          c_rarg3 -- isVirtualThread
8047 void fill_continuation_entry(MacroAssembler* masm) {
8048 #ifdef ASSERT
8049   __ movw(rscratch1, ContinuationEntry::cookie_value());
8050   __ strw(rscratch1, Address(sp, ContinuationEntry::cookie_offset()));
8051 #endif
8052 
8053   __ str (c_rarg1, Address(sp, ContinuationEntry::cont_offset()));
8054   __ strw(c_rarg3, Address(sp, ContinuationEntry::flags_offset()));
8055   __ str (zr,      Address(sp, ContinuationEntry::chunk_offset()));
8056   __ strw(zr,      Address(sp, ContinuationEntry::argsize_offset()));
8057   __ strw(zr,      Address(sp, ContinuationEntry::pin_count_offset()));
8058 
8059   __ ldr(rscratch1, Address(rthread, JavaThread::cont_fastpath_offset()));
8060   __ str(rscratch1, Address(sp, ContinuationEntry::parent_cont_fastpath_offset()));
8061   __ ldr(rscratch1, Address(rthread, JavaThread::held_monitor_count_offset()));
8062   __ str(rscratch1, Address(sp, ContinuationEntry::parent_held_monitor_count_offset()));
8063 
8064   __ str(zr, Address(rthread, JavaThread::cont_fastpath_offset()));
8065   __ str(zr, Address(rthread, JavaThread::held_monitor_count_offset()));
8066 }
8067 
8068 // on entry, sp points to the ContinuationEntry
8069 // on exit, rfp points to the spilled rfp in the entry frame
8070 void continuation_enter_cleanup(MacroAssembler* masm) {
8071 #ifndef PRODUCT
8072   Label OK;
8073   __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
8074   __ cmp(sp, rscratch1);
8075   __ br(Assembler::EQ, OK);
8076   __ stop("incorrect sp1");
8077   __ bind(OK);
8078 #endif
8079 
8080   __ ldr(rscratch1, Address(sp, ContinuationEntry::parent_cont_fastpath_offset()));
8081   __ str(rscratch1, Address(rthread, JavaThread::cont_fastpath_offset()));
8082   __ ldr(rscratch1, Address(sp, ContinuationEntry::parent_held_monitor_count_offset()));
8083   __ str(rscratch1, Address(rthread, JavaThread::held_monitor_count_offset()));
8084 
8085   __ ldr(rscratch2, Address(sp, ContinuationEntry::parent_offset()));
8086   __ str(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
8087   __ add(rfp, sp, (int)ContinuationEntry::size());
8088 }
8089 
8090 #undef __